mempalace-code 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mempalace/README.md +40 -0
- mempalace/__init__.py +6 -0
- mempalace/__main__.py +5 -0
- mempalace/cli.py +811 -0
- mempalace/config.py +149 -0
- mempalace/convo_miner.py +415 -0
- mempalace/dialect.py +1075 -0
- mempalace/entity_detector.py +853 -0
- mempalace/entity_registry.py +639 -0
- mempalace/export.py +378 -0
- mempalace/general_extractor.py +521 -0
- mempalace/knowledge_graph.py +410 -0
- mempalace/layers.py +515 -0
- mempalace/mcp_server.py +873 -0
- mempalace/migrate.py +153 -0
- mempalace/miner.py +1285 -0
- mempalace/normalize.py +328 -0
- mempalace/onboarding.py +489 -0
- mempalace/palace_graph.py +225 -0
- mempalace/py.typed +0 -0
- mempalace/room_detector_local.py +310 -0
- mempalace/searcher.py +305 -0
- mempalace/spellcheck.py +269 -0
- mempalace/split_mega_files.py +309 -0
- mempalace/storage.py +807 -0
- mempalace/version.py +3 -0
- mempalace_code-1.0.0.dist-info/METADATA +489 -0
- mempalace_code-1.0.0.dist-info/RECORD +32 -0
- mempalace_code-1.0.0.dist-info/WHEEL +4 -0
- mempalace_code-1.0.0.dist-info/entry_points.txt +2 -0
- mempalace_code-1.0.0.dist-info/licenses/LICENSE +192 -0
- mempalace_code-1.0.0.dist-info/licenses/NOTICE +17 -0
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
general_extractor.py — Extract 5 types of memories from text.
|
|
4
|
+
|
|
5
|
+
Types:
|
|
6
|
+
1. DECISIONS — "we went with X because Y", choices made
|
|
7
|
+
2. PREFERENCES — "always use X", "never do Y", "I prefer Z"
|
|
8
|
+
3. MILESTONES — breakthroughs, things that finally worked
|
|
9
|
+
4. PROBLEMS — what broke, what fixed it, root causes
|
|
10
|
+
5. EMOTIONAL — feelings, vulnerability, relationships
|
|
11
|
+
|
|
12
|
+
No LLM required. Pure keyword/pattern heuristics.
|
|
13
|
+
No external dependencies on palace.py, dialect.py, or layers.py.
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
from general_extractor import extract_memories
|
|
17
|
+
|
|
18
|
+
chunks = extract_memories(text)
|
|
19
|
+
# [{"content": "...", "memory_type": "decision", "chunk_index": 0}, ...]
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from typing import List, Dict, Tuple
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# =============================================================================
|
|
27
|
+
# MARKER SETS — One per memory type
|
|
28
|
+
# =============================================================================
|
|
29
|
+
|
|
30
|
+
# Each marker is a regex fragment searched against lowercased prose (see
# _score_markers); every hit adds one point toward that memory type.

# Phrases signalling that a choice was made (and often why).
DECISION_MARKERS = [
    r"\blet'?s (use|go with|try|pick|choose|switch to)\b",
    r"\bwe (should|decided|chose|went with|picked|settled on)\b",
    r"\bi'?m going (to|with)\b",
    r"\bbetter (to|than|approach|option|choice)\b",
    r"\binstead of\b",
    r"\brather than\b",
    r"\bthe reason (is|was|being)\b",
    r"\bbecause\b",
    r"\btrade-?off\b",
    r"\bpros and cons\b",
    r"\bover\b.*\bbecause\b",
    # Tech-design vocabulary that usually accompanies a decision.
    r"\barchitecture\b",
    r"\bapproach\b",
    r"\bstrategy\b",
    r"\bpattern\b",
    r"\bstack\b",
    r"\bframework\b",
    r"\binfrastructure\b",
    r"\bset (it |this )?to\b",
    r"\bconfigure\b",
    r"\bdefault\b",
]

# Statements of personal/team rules, likes, dislikes, and style conventions.
PREFERENCE_MARKERS = [
    r"\bi prefer\b",
    r"\balways use\b",
    r"\bnever use\b",
    r"\bdon'?t (ever |like to )?(use|do|mock|stub|import)\b",
    r"\bi like (to|when|how)\b",
    r"\bi hate (when|how|it when)\b",
    r"\bplease (always|never|don'?t)\b",
    r"\bmy (rule|preference|style|convention) is\b",
    r"\bwe (always|never)\b",
    r"\bfunctional\b.*\bstyle\b",
    r"\bimperative\b",
    r"\bsnake_?case\b",
    r"\bcamel_?case\b",
    r"\btabs\b.*\bspaces\b",
    r"\bspaces\b.*\btabs\b",
    r"\buse\b.*\binstead of\b",
]

# Breakthroughs, firsts, shipped work, and measurable wins.
MILESTONE_MARKERS = [
    r"\bit works\b",
    r"\bit worked\b",
    r"\bgot it working\b",
    r"\bfixed\b",
    r"\bsolved\b",
    r"\bbreakthrough\b",
    r"\bfigured (it )?out\b",
    r"\bnailed it\b",
    r"\bcracked (it|the)\b",
    r"\bfinally\b",
    r"\bfirst time\b",
    r"\bfirst ever\b",
    r"\bnever (done|been|had) before\b",
    r"\bdiscovered\b",
    r"\brealized\b",
    r"\bfound (out|that)\b",
    r"\bturns out\b",
    r"\bthe key (is|was|insight)\b",
    r"\bthe trick (is|was)\b",
    r"\bnow i (understand|see|get it)\b",
    r"\bbuilt\b",
    r"\bcreated\b",
    r"\bimplemented\b",
    r"\bshipped\b",
    r"\blaunched\b",
    r"\bdeployed\b",
    r"\breleased\b",
    r"\bprototype\b",
    r"\bproof of concept\b",
    r"\bdemo\b",
    # Release/version identifiers and quantified improvements.
    r"\bversion \d",
    r"\bv\d+\.\d+",
    r"\d+x (compression|faster|slower|better|improvement|reduction)",
    r"\d+% (reduction|improvement|faster|better|smaller)",
]

# Failures, root causes, and the fixes/workarounds that addressed them.
PROBLEM_MARKERS = [
    r"\b(bug|error|crash|fail|broke|broken|issue|problem)\b",
    r"\bdoesn'?t work\b",
    r"\bnot working\b",
    r"\bwon'?t\b.*\bwork\b",
    r"\bkeeps? (failing|crashing|breaking|erroring)\b",
    r"\broot cause\b",
    r"\bthe (problem|issue|bug) (is|was)\b",
    r"\bturns out\b.*\b(was|because|due to)\b",
    r"\bthe fix (is|was)\b",
    r"\bworkaround\b",
    r"\bthat'?s why\b",
    r"\bthe reason it\b",
    r"\bfixed (it |the |by )\b",
    r"\bsolution (is|was)\b",
    r"\bresolved\b",
    r"\bpatched\b",
    r"\bthe answer (is|was)\b",
    r"\b(had|need) to\b.*\binstead\b",
]

# Feeling words and first-person emotional phrasings.
EMOTION_MARKERS = [
    r"\blove\b",
    r"\bscared\b",
    r"\bafraid\b",
    r"\bproud\b",
    r"\bhurt\b",
    r"\bhappy\b",
    r"\bsad\b",
    r"\bcry\b",
    r"\bcrying\b",
    r"\bmiss\b",
    r"\bsorry\b",
    r"\bgrateful\b",
    r"\bangry\b",
    r"\bworried\b",
    r"\blonely\b",
    r"\bbeautiful\b",
    r"\bamazing\b",
    r"\bwonderful\b",
    r"i feel",
    r"i'm scared",
    r"i love you",
    r"i'm sorry",
    r"i can't",
    r"i wish",
    r"i miss",
    r"i need",
    r"never told anyone",
    r"nobody knows",
    # *action text* — roleplay/emphasis asterisks often mark emotional beats.
    r"\*[^*]+\*",
]

# memory_type -> marker list; iterated by extract_memories when scoring.
ALL_MARKERS = {
    "decision": DECISION_MARKERS,
    "preference": PREFERENCE_MARKERS,
    "milestone": MILESTONE_MARKERS,
    "problem": PROBLEM_MARKERS,
    "emotional": EMOTION_MARKERS,
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# =============================================================================
|
|
173
|
+
# SENTIMENT — for disambiguation
|
|
174
|
+
# =============================================================================
|
|
175
|
+
|
|
176
|
+
POSITIVE_WORDS = {
|
|
177
|
+
"pride",
|
|
178
|
+
"proud",
|
|
179
|
+
"joy",
|
|
180
|
+
"happy",
|
|
181
|
+
"love",
|
|
182
|
+
"loving",
|
|
183
|
+
"beautiful",
|
|
184
|
+
"amazing",
|
|
185
|
+
"wonderful",
|
|
186
|
+
"incredible",
|
|
187
|
+
"fantastic",
|
|
188
|
+
"brilliant",
|
|
189
|
+
"perfect",
|
|
190
|
+
"excited",
|
|
191
|
+
"thrilled",
|
|
192
|
+
"grateful",
|
|
193
|
+
"warm",
|
|
194
|
+
"breakthrough",
|
|
195
|
+
"success",
|
|
196
|
+
"works",
|
|
197
|
+
"working",
|
|
198
|
+
"solved",
|
|
199
|
+
"fixed",
|
|
200
|
+
"nailed",
|
|
201
|
+
"heart",
|
|
202
|
+
"hug",
|
|
203
|
+
"precious",
|
|
204
|
+
"adore",
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
NEGATIVE_WORDS = {
|
|
208
|
+
"bug",
|
|
209
|
+
"error",
|
|
210
|
+
"crash",
|
|
211
|
+
"crashing",
|
|
212
|
+
"crashed",
|
|
213
|
+
"fail",
|
|
214
|
+
"failed",
|
|
215
|
+
"failing",
|
|
216
|
+
"failure",
|
|
217
|
+
"broken",
|
|
218
|
+
"broke",
|
|
219
|
+
"breaking",
|
|
220
|
+
"breaks",
|
|
221
|
+
"issue",
|
|
222
|
+
"problem",
|
|
223
|
+
"wrong",
|
|
224
|
+
"stuck",
|
|
225
|
+
"blocked",
|
|
226
|
+
"unable",
|
|
227
|
+
"impossible",
|
|
228
|
+
"missing",
|
|
229
|
+
"terrible",
|
|
230
|
+
"horrible",
|
|
231
|
+
"awful",
|
|
232
|
+
"worse",
|
|
233
|
+
"worst",
|
|
234
|
+
"panic",
|
|
235
|
+
"disaster",
|
|
236
|
+
"mess",
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _get_sentiment(text: str) -> str:
|
|
241
|
+
"""Quick sentiment: 'positive', 'negative', or 'neutral'."""
|
|
242
|
+
words = set(w.lower() for w in re.findall(r"\b\w+\b", text))
|
|
243
|
+
pos = len(words & POSITIVE_WORDS)
|
|
244
|
+
neg = len(words & NEGATIVE_WORDS)
|
|
245
|
+
if pos > neg:
|
|
246
|
+
return "positive"
|
|
247
|
+
elif neg > pos:
|
|
248
|
+
return "negative"
|
|
249
|
+
return "neutral"
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _has_resolution(text: str) -> bool:
|
|
253
|
+
"""Check if text describes a RESOLVED problem."""
|
|
254
|
+
text_lower = text.lower()
|
|
255
|
+
patterns = [
|
|
256
|
+
r"\bfixed\b",
|
|
257
|
+
r"\bsolved\b",
|
|
258
|
+
r"\bresolved\b",
|
|
259
|
+
r"\bpatched\b",
|
|
260
|
+
r"\bgot it working\b",
|
|
261
|
+
r"\bit works\b",
|
|
262
|
+
r"\bnailed it\b",
|
|
263
|
+
r"\bfigured (it )?out\b",
|
|
264
|
+
r"\bthe (fix|answer|solution)\b",
|
|
265
|
+
]
|
|
266
|
+
return any(re.search(p, text_lower) for p in patterns)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _disambiguate(memory_type: str, text: str, scores: Dict[str, float]) -> str:
    """Correct common misclassifications using sentiment and resolution cues.

    Only 'problem' classifications are ever reassigned; anything else passes
    through untouched.
    """
    sentiment = _get_sentiment(text)

    if memory_type == "problem":
        # A problem that was actually resolved reads as a win, not a problem.
        if _has_resolution(text):
            if sentiment == "positive" and scores.get("emotional", 0) > 0:
                return "emotional"
            return "milestone"
        # Upbeat "problem" text is more likely a milestone or emotional beat.
        if sentiment == "positive":
            if scores.get("milestone", 0) > 0:
                return "milestone"
            if scores.get("emotional", 0) > 0:
                return "emotional"

    return memory_type
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# =============================================================================
|
|
290
|
+
# CODE LINE FILTERING
|
|
291
|
+
# =============================================================================
|
|
292
|
+
|
|
293
|
+
_CODE_LINE_PATTERNS = [
|
|
294
|
+
re.compile(r"^\s*[\$#]\s"),
|
|
295
|
+
re.compile(
|
|
296
|
+
r"^\s*(cd|source|echo|export|pip|npm|git|python|bash|curl|wget|mkdir|rm|cp|mv|ls|cat|grep|find|chmod|sudo|brew|docker)\s"
|
|
297
|
+
),
|
|
298
|
+
re.compile(r"^\s*```"),
|
|
299
|
+
re.compile(r"^\s*(import|from|def|class|function|const|let|var|return)\s"),
|
|
300
|
+
re.compile(r"^\s*[A-Z_]{2,}="),
|
|
301
|
+
re.compile(r"^\s*\|"),
|
|
302
|
+
re.compile(r"^\s*[-]{2,}"),
|
|
303
|
+
re.compile(r"^\s*[{}\[\]]\s*$"),
|
|
304
|
+
re.compile(r"^\s*(if|for|while|try|except|elif|else:)\b"),
|
|
305
|
+
re.compile(r"^\s*\w+\.\w+\("),
|
|
306
|
+
re.compile(r"^\s*\w+ = \w+\.\w+"),
|
|
307
|
+
]
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _is_code_line(line: str) -> bool:
|
|
311
|
+
stripped = line.strip()
|
|
312
|
+
if not stripped:
|
|
313
|
+
return False
|
|
314
|
+
for pattern in _CODE_LINE_PATTERNS:
|
|
315
|
+
if pattern.match(stripped):
|
|
316
|
+
return True
|
|
317
|
+
alpha_ratio = sum(1 for c in stripped if c.isalpha()) / max(len(stripped), 1)
|
|
318
|
+
if alpha_ratio < 0.4 and len(stripped) > 10:
|
|
319
|
+
return True
|
|
320
|
+
return False
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _extract_prose(text: str) -> str:
    """Return only the prose lines of *text* for classification scoring.

    Fenced ``` blocks and lines that look like code are dropped. If nothing
    survives the filtering, the original text is returned unchanged.
    """
    kept = []
    inside_fence = False
    for raw in text.split("\n"):
        if raw.strip().startswith("```"):
            # Fence markers toggle code mode and are themselves discarded.
            inside_fence = not inside_fence
            continue
        if inside_fence or _is_code_line(raw):
            continue
        kept.append(raw)
    prose = "\n".join(kept).strip()
    return prose or text
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
# =============================================================================
|
|
341
|
+
# SCORING
|
|
342
|
+
# =============================================================================
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _score_markers(text: str, markers: List[str]) -> Tuple[float, List[str]]:
|
|
346
|
+
"""Score text against regex markers. Returns (score, matched_keywords)."""
|
|
347
|
+
text_lower = text.lower()
|
|
348
|
+
score = 0.0
|
|
349
|
+
keywords = []
|
|
350
|
+
for marker in markers:
|
|
351
|
+
matches = re.findall(marker, text_lower)
|
|
352
|
+
if matches:
|
|
353
|
+
score += len(matches)
|
|
354
|
+
keywords.extend(m if isinstance(m, str) else m[0] if m else marker for m in matches)
|
|
355
|
+
return score, list(set(keywords))
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# =============================================================================
|
|
359
|
+
# MAIN EXTRACTION
|
|
360
|
+
# =============================================================================
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def extract_memories(text: str, min_confidence: float = 0.3) -> List[Dict]:
    """
    Extract memories from a text string.

    Args:
        text: The text to extract from (any format).
        min_confidence: Minimum confidence threshold (0.0-1.0).

    Returns:
        List of dicts: {"content": str, "memory_type": str, "chunk_index": int}
    """
    results: List[Dict] = []

    # Segments come from speaker-turn or paragraph boundaries.
    for segment in _split_into_segments(text):
        if len(segment.strip()) < 20:
            # Too short to hold a meaningful memory.
            continue

        # Score only the prose portion; code lines would skew the counts.
        prose = _extract_prose(segment)

        type_scores: Dict[str, float] = {}
        for mem_type, markers in ALL_MARKERS.items():
            hits, _ = _score_markers(prose, markers)
            if hits > 0:
                type_scores[mem_type] = hits

        if not type_scores:
            continue

        # Longer segments earn a small bonus toward confidence.
        size = len(segment)
        bonus = 2 if size > 500 else (1 if size > 200 else 0)

        best_type = max(type_scores, key=type_scores.get)
        best_score = type_scores[best_type] + bonus

        # Reassign misclassified "problem" segments via sentiment/resolution.
        best_type = _disambiguate(best_type, prose, type_scores)

        confidence = min(1.0, best_score / 5.0)
        if confidence < min_confidence:
            continue

        results.append(
            {
                "content": segment.strip(),
                "memory_type": best_type,
                "chunk_index": len(results),
            }
        )

    return results
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _split_into_segments(text: str) -> List[str]:
    """
    Split text into segments suitable for memory extraction.

    Prefers speaker-turn boundaries ("> ", "Human:", "Assistant:", ...),
    then blank-line paragraphs, then fixed-size line groups for one giant
    unbroken block.
    """
    lines = text.split("\n")

    turn_patterns = [
        re.compile(r"^>\s"),  # > quoted user turn
        re.compile(r"^(Human|User|Q)\s*:", re.I),  # Human: / User:
        re.compile(r"^(Assistant|AI|A|Claude|ChatGPT)\s*:", re.I),
    ]

    # Count lines that begin a speaker turn (at most one count per line).
    turns_seen = sum(
        1
        for line in lines
        if any(pat.match(line.strip()) for pat in turn_patterns)
    )
    if turns_seen >= 3:
        return _split_by_turns(lines, turn_patterns)

    # Fallback: blank-line paragraph splitting.
    paragraphs = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]

    # A single giant block with many lines gets carved into 25-line groups.
    if len(paragraphs) <= 1 and len(lines) > 20:
        groups = []
        for start in range(0, len(lines), 25):
            piece = "\n".join(lines[start : start + 25]).strip()
            if piece:
                groups.append(piece)
        return groups

    return paragraphs
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _split_by_turns(lines: List[str], turn_patterns: List[re.Pattern]) -> List[str]:
|
|
468
|
+
"""Split lines into segments at each speaker turn boundary."""
|
|
469
|
+
segments = []
|
|
470
|
+
current = []
|
|
471
|
+
|
|
472
|
+
for line in lines:
|
|
473
|
+
stripped = line.strip()
|
|
474
|
+
is_turn = any(pat.match(stripped) for pat in turn_patterns)
|
|
475
|
+
|
|
476
|
+
if is_turn and current:
|
|
477
|
+
segments.append("\n".join(current))
|
|
478
|
+
current = [line]
|
|
479
|
+
else:
|
|
480
|
+
current.append(line)
|
|
481
|
+
|
|
482
|
+
if current:
|
|
483
|
+
segments.append("\n".join(current))
|
|
484
|
+
|
|
485
|
+
return segments
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
# =============================================================================
|
|
489
|
+
# CLI
|
|
490
|
+
# =============================================================================
|
|
491
|
+
|
|
492
|
+
# Minimal CLI: read a file, extract memories, print a per-type summary and
# a short preview of the first ten memories.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python general_extractor.py <file>")
        print()
        print("Extracts decisions, preferences, milestones, problems, and")
        print("emotional moments from any text file.")
        sys.exit(1)

    filepath = sys.argv[1]
    # errors="replace" so undecodable bytes never abort the run.
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()

    memories = extract_memories(text)

    # Summary
    from collections import Counter

    type_counts = Counter(m["memory_type"] for m in memories)
    print(f"Extracted {len(memories)} memories:")
    for mtype in ["decision", "preference", "milestone", "problem", "emotional"]:
        count = type_counts.get(mtype, 0)
        if count:
            print(f"  {mtype:12} {count}")

    print()
    # Preview: first 80 chars of each memory, newlines flattened.
    for m in memories[:10]:
        preview = m["content"][:80].replace("\n", " ")
        print(f"  [{m['memory_type']:10}] {preview}...")