keepsake-memory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keepsake/__init__.py +558 -0
- keepsake/attention.py +146 -0
- keepsake/consolidator.py +395 -0
- keepsake/embedder.py +155 -0
- keepsake/emotion.py +136 -0
- keepsake/forgetter.py +262 -0
- keepsake/py.typed +0 -0
- keepsake/splitter.py +436 -0
- keepsake/storage.py +1360 -0
- keepsake_memory-1.0.0.dist-info/METADATA +424 -0
- keepsake_memory-1.0.0.dist-info/RECORD +14 -0
- keepsake_memory-1.0.0.dist-info/WHEEL +5 -0
- keepsake_memory-1.0.0.dist-info/licenses/LICENSE +21 -0
- keepsake_memory-1.0.0.dist-info/top_level.txt +1 -0
keepsake/emotion.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""情绪烈度分析 — 检测用户表达中的情绪强度,不判断正负。
|
|
2
|
+
|
|
3
|
+
输入一段文本,返回情绪烈度值 (0.0~2.0),越高表示用户越激动。
|
|
4
|
+
|
|
5
|
+
检测维度:
|
|
6
|
+
- 标点密度: !! 和 ?? 的数量
|
|
7
|
+
- 中文程度副词: 太、非常、极其、到底、完全、真的
|
|
8
|
+
- 重复字符: 啊啊啊、对对对、真的真的
|
|
9
|
+
- 反复问: 连续两个以上问句
|
|
10
|
+
- 英文全大写词 (English only)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from typing import Tuple
|
|
17
|
+
|
|
18
|
+
# Chinese intensity adverbs (single characters plus common compounds).
_INTENSITY_ADVERBS = frozenset({
    "太", "超", "极", "巨", "贼", "老", "特", "爆",
    "非常", "极其", "超级", "格外", "分外", "过于",
    "无比", "绝顶", "十分", "相当", "特别",
    "完全", "根本", "彻底", "绝对",
    "真的", "真是", "简直", "实在",
    "到底", "究竟", "明明",
})

# Longest-first alternation: a compound such as "超级" is consumed as one
# adverb instead of also being counted through its one-character prefix "超".
_RE_INTENSITY_ADVERB = re.compile(
    "|".join(re.escape(adv) for adv in sorted(_INTENSITY_ADVERBS, key=len, reverse=True))
)

# A character repeated three or more times in a row.
_RE_REPEATED_CHAR = re.compile(r"(.)\1{2,}")

# A 2-4 character chunk immediately repeated at least once.
_RE_REPEATED_WORD = re.compile(r"(.{2,4})\1{1,}")

# Punctuation clusters. Both the ASCII and the full-width forms are accepted;
# the full-width code points are written as escapes so they survive any
# Unicode normalisation of this file.
_RE_EXCLAMATION_CLUSTER = re.compile(r"[!\uff01]{2,}")
_RE_QUESTION_CLUSTER = re.compile(r"[?\uff1f]{2,}")
_RE_MIXED_PUNCT = re.compile(r"[!?\uff01\uff1f]{2,}")

_EXCLAMATION_MARKS = ("!", "\uff01")
_QUESTION_MARKS = ("?", "\uff1f")

# Upper-case English words of at least three letters.
_RE_CAPS_WORD = re.compile(r"\b[A-Z]{3,}\b")

# Negation markers used for the polarity label.
_NEGATORS = frozenset({"不", "没", "别", "勿", "无", "not", "no", "never"})

# Chinese negators other than "不" (which needs the "不错" special case below).
_CJK_NEGATORS = frozenset(n for n in _NEGATORS if not n.isascii() and n != "不")

# English negators must stand alone: a plain substring test would find "no"
# inside "know" or "now". Letter lookarounds are used instead of \b because
# \b sees CJK characters as word characters and would miss "我说no".
_RE_LATIN_NEGATOR = re.compile(
    r"(?<![a-z])(?:"
    + "|".join(
        re.escape(n)
        for n in sorted((n for n in _NEGATORS if n.isascii()), key=len, reverse=True)
    )
    + r")(?![a-z])"
)

# Keyword lists that only influence the polarity label, never the intensity.
_POSITIVE_KEYWORDS = frozenset({
    "喜欢", "不错", "满意", "nice", "good", "great",
    "太棒", "爱了", "绝了", "牛逼", "推荐", "值得",
})
_NEGATIVE_KEYWORDS = frozenset({
    "垃圾", "恶心", "讨厌", "烂", "烦", "失望", "差",
    "bad", "terrible", "hate", "useless", "废物",
})


def analyze_emotion(text: str) -> Tuple[float, str]:
    """Score the emotional intensity of *text* and attach a polarity label.

    Intensity is the sum of independently capped signals, clipped to
    ``0.0 .. 2.0`` and rounded to four decimals:

    * repeated-punctuation clusters (``!!``, ``??``, ``?!``): 0.15 each,
      capped at 0.6, plus 0.2 for every cluster mixing ``!`` and ``?``;
    * distinct intensity adverbs: 0.2 each, capped at 0.6;
    * runs of a repeated character: 0.15 each, capped at 0.3;
    * immediately repeated 2-4 character chunks: 0.2 each, capped at 0.4;
    * all-caps English words: 0.15 each, capped at 0.3;
    * two or more question marks overall: 0.1 per extra mark, capped at 0.3.

    Args:
        text: The user utterance. Empty or whitespace-only input is neutral.

    Returns:
        ``(intensity, label)`` where ``label`` is one of ``"strong_positive"``,
        ``"positive"``, ``"negative"``, ``"strong_negative"`` or ``"neutral"``.
        The ``strong_`` variants are used when ``intensity >= 1.2``.
    """
    if not text or not text.strip():
        return 0.0, "neutral"

    raw = text.strip()
    intensity = 0.0

    # Punctuation clusters. The mixed pattern already matches every pure
    # "!!" / "??" cluster, so it is the single source of the cluster count;
    # adding the pure-cluster matches on top would count them twice.
    clusters = _RE_MIXED_PUNCT.findall(raw)
    intensity += min(len(clusters) * 0.15, 0.6)
    for cluster in clusters:
        has_exclamation = any(mark in cluster for mark in _EXCLAMATION_MARKS)
        has_question = any(mark in cluster for mark in _QUESTION_MARKS)
        if has_exclamation and has_question:
            intensity += 0.2

    # Distinct intensity adverbs (each adverb counts once however often it occurs).
    adverb_hits = len(set(_RE_INTENSITY_ADVERB.findall(raw)))
    intensity += min(adverb_hits * 0.2, 0.6)

    # Stretched characters and repeated chunks.
    intensity += min(len(_RE_REPEATED_CHAR.findall(raw)) * 0.15, 0.3)
    intensity += min(len(_RE_REPEATED_WORD.findall(raw)) * 0.2, 0.4)

    # Shouted English words.
    intensity += min(len(_RE_CAPS_WORD.findall(raw)) * 0.15, 0.3)

    # Several questions in one utterance.
    question_marks = sum(raw.count(mark) for mark in _QUESTION_MARKS)
    if question_marks >= 2:
        intensity += min((question_marks - 1) * 0.1, 0.3)

    intensity = max(0.0, min(2.0, intensity))

    # Polarity: keyword presence only picks the label, it never changes intensity.
    lowered = raw.lower()
    has_positive = any(kw in lowered for kw in _POSITIVE_KEYWORDS)
    has_negative = any(kw in lowered for kw in _NEGATIVE_KEYWORDS)

    # "不" inside the positive word "不错" is not a negation, but any other
    # "不" in the same text still is, so only the "不错" occurrences are masked.
    negated = (
        "不" in lowered.replace("不错", "")
        or any(neg in lowered for neg in _CJK_NEGATORS)
        or _RE_LATIN_NEGATOR.search(lowered) is not None
    )

    if has_positive and not negated:
        label = "strong_positive" if intensity >= 1.2 else "positive"
    elif has_negative or negated:
        label = "strong_negative" if intensity >= 1.2 else "negative"
    else:
        label = "neutral"

    return round(intensity, 4), label
|
keepsake/forgetter.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""选择性遗忘 — 主动清理低价值碎片。
|
|
2
|
+
|
|
3
|
+
价值判断维度:
|
|
4
|
+
1. 年龄: 创建 > max_age_days 的碎片
|
|
5
|
+
2. 反馈: feedback_score 为负或为零
|
|
6
|
+
3. 情绪烈度: intensity 低(用户不激动的内容)
|
|
7
|
+
4. 注意力: 从未被命中过高注意力话题
|
|
8
|
+
5. 召回率: 从未被检索召回过(如果有关联字段追踪)
|
|
9
|
+
|
|
10
|
+
只有多个维度同时低,才会被遗忘。防止误删有用信息。
|
|
11
|
+
|
|
12
|
+
配置参数:
|
|
13
|
+
- max_age_days: 最大保留天数(默认 30)
|
|
14
|
+
- min_feedback_score: 最低反馈分(低于此值可遗忘,默认 0)
|
|
15
|
+
- batch_size: 每轮扫描数(默认 200)
|
|
16
|
+
- dry_run: 仅统计不删除(默认 True,安全模式)
|
|
17
|
+
- min_intensity: 最低情绪烈度(低于此值且其他维度也低才删,默认 0.3)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from typing import Any, Dict, List, Optional
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)

# Default tuning parameters.
DEFAULT_MAX_AGE_DAYS = 30
DEFAULT_MIN_FEEDBACK_SCORE = 0
DEFAULT_BATCH_SIZE = 200
DEFAULT_DRY_RUN = True
DEFAULT_MIN_INTENSITY = 0.3

# Hash fields fetched for every fragment; the positions are relied on below.
_FRAGMENT_FIELDS = ["created", "feedback_score", "sentiment_score",
                    "fragment_type", "source", "category", "content"]

# Fragments whose attention weight exceeds this value are always kept.
_ATTENTION_KEEP_THRESHOLD = 1.1

# Outcomes of classifying a single fragment.
_VERDICT_PROTECTED = "protected"
_VERDICT_KEEP = "keep"
_VERDICT_FORGET = "forget"


def _to_text(value: Any) -> str:
    """Return a hash value as ``str``; ``None`` becomes the empty string."""
    if value is None:
        return ""
    return value.decode("utf-8") if isinstance(value, bytes) else str(value)


class Forgetter:
    """Selective forgetting engine that removes low-value memories.

    A fragment (``memory:frag:*``) is forgotten only when it is unprotected,
    older than ``max_age_days``, below ``min_intensity`` and not matched by a
    high-attention topic. Full memories (``memory:full:*``) are judged by age
    alone, using ``full_max_age_days``.
    """

    def __init__(
        self,
        storage: Any,
        max_age_days: int = DEFAULT_MAX_AGE_DAYS,
        min_feedback_score: int = DEFAULT_MIN_FEEDBACK_SCORE,
        batch_size: int = DEFAULT_BATCH_SIZE,
        dry_run: bool = DEFAULT_DRY_RUN,
        min_intensity: float = DEFAULT_MIN_INTENSITY,
        full_max_age_days: int = 60,
    ):
        """Store the configuration.

        Args:
            storage: Object providing ``_get_client()`` and ``match_attention()``.
            max_age_days: Fragments created more recently than this are kept.
            min_feedback_score: Fragments scoring above this are protected.
            batch_size: ``COUNT`` hint passed to every ``SCAN`` call.
            dry_run: When true, ``forget()`` only reports what it would delete.
            min_intensity: Fragments at or above this intensity are kept.
            full_max_age_days: Age limit applied to ``memory:full:*`` keys.
        """
        self._storage = storage
        self._max_age_days = max_age_days
        self._min_feedback_score = min_feedback_score
        self._batch_size = batch_size
        self._dry_run = dry_run
        self._min_intensity = min_intensity
        self._full_max_age_days = full_max_age_days

    def forget(self, force: bool = False) -> Dict[str, Any]:
        """Run one forgetting pass.

        Args:
            force: When true, delete even if the instance is in dry-run mode.

        Returns:
            A stats dict with ``scanned``, ``candidates``, ``deleted``,
            ``skipped_protected`` and ``dry_run``; or an error dict when the
            storage has no client available.
        """
        client = self._storage._get_client()
        if not client:
            return {"status": "error", "reason": "Redis not available"}

        dry_run = self._dry_run and not force
        stats: Dict[str, Any] = {
            "scanned": 0,
            "candidates": 0,
            "deleted": 0,
            "skipped_protected": 0,
            "dry_run": dry_run,
        }

        forgettable = self._find_forgettable(client, stats)
        # Full memories are scanned separately and judged by age only.
        forgettable.extend(self._find_forgettable_full(client, stats))
        stats["candidates"] = len(forgettable)

        if not forgettable:
            return stats

        if dry_run:
            logger.info(
                "forgetter: [DRY RUN] would delete %d fragments (skipped %d protected)",
                len(forgettable), stats["skipped_protected"],
            )
            return stats

        deleted = 0
        for key in forgettable:
            # Best effort: one failing key must not abort the whole pass.
            try:
                client.delete(key)
                deleted += 1
            except Exception as e:
                logger.debug("forgetter: delete %s failed: %s", key, e)

        stats["deleted"] = deleted
        logger.info(
            "forgetter: deleted %d/%d forgettable fragments",
            deleted, len(forgettable),
        )
        return stats

    def _find_forgettable(
        self,
        client,
        stats: Dict[str, Any],
    ) -> List[str]:
        """Scan ``memory:frag:*`` and return the keys that may be forgotten.

        Updates ``stats["scanned"]`` per key and sets
        ``stats["skipped_protected"]`` to the number of protected fragments.
        """
        cutoff_ts = datetime.now(timezone.utc).timestamp() - self._max_age_days * 86400
        forgettable_keys: List[str] = []
        protected = 0

        cursor = 0
        while True:
            cursor, keys = client.scan(
                cursor=cursor,
                match="memory:frag:*",
                count=self._batch_size,
            )

            if keys:
                # One pipelined HMGET per batch instead of a round trip per key.
                pipe = client.pipeline()
                for key_b in keys:
                    pipe.hmget(key_b, _FRAGMENT_FIELDS)

                for key_b, fields in zip(keys, pipe.execute()):
                    key = _to_text(key_b)
                    stats["scanned"] += 1

                    if not fields or not any(fields):
                        continue  # key vanished or hash is empty

                    verdict = self._classify_fragment(key, fields, cutoff_ts)
                    if verdict == _VERDICT_PROTECTED:
                        protected += 1
                    elif verdict == _VERDICT_FORGET:
                        forgettable_keys.append(key)

            if cursor == 0:
                break

        stats["skipped_protected"] = protected
        return forgettable_keys

    def _classify_fragment(self, key: str, fields, cutoff_ts: float) -> str:
        """Decide whether one fragment is protected, kept or forgettable.

        ``fields`` are the HMGET values in ``_FRAGMENT_FIELDS`` order.
        """
        created_str = _to_text(fields[0])
        feedback = self._parse_float(_to_text(fields[1]), 0)
        intensity = self._parse_float(_to_text(fields[2]), 0)
        frag_type = _to_text(fields[3])
        source = _to_text(fields[4])
        content = _to_text(fields[6])

        # Protection rules: consolidated fragments, hermes_agent fragments
        # without negative feedback, and anything above the feedback threshold.
        if frag_type == "consolidated":
            return _VERDICT_PROTECTED
        if source == "hermes_agent" and feedback >= 0:
            return _VERDICT_PROTECTED
        if feedback > self._min_feedback_score:
            return _VERDICT_PROTECTED

        # Age gate. A fragment whose creation time is missing or unparseable
        # cannot be shown to be old, so it is kept rather than deleted.
        created_ts = self._parse_iso_timestamp(created_str)
        if created_ts is None:
            logger.debug("forgetter: keep %s, unknown creation time %r", key, created_str)
            return _VERDICT_KEEP
        if created_ts > cutoff_ts:
            return _VERDICT_KEEP

        if intensity >= self._min_intensity:
            return _VERDICT_KEEP

        if content:
            # Best effort: an attention lookup failure does not protect the fragment.
            try:
                attn_w = self._storage.match_attention(content)
                if attn_w and attn_w > _ATTENTION_KEEP_THRESHOLD:
                    return _VERDICT_KEEP
            except Exception as e:
                logger.debug("forgetter: match_attention failed for %s: %s", key, e)

        return _VERDICT_FORGET

    @staticmethod
    def _parse_iso_timestamp(value: str) -> Optional[float]:
        """Convert an ISO-8601 string to a POSIX timestamp, or ``None``.

        A trailing ``Z`` is rewritten to ``+00:00`` because
        ``datetime.fromisoformat`` only accepts it from Python 3.11 on.
        Naive values are interpreted in local time by ``datetime.timestamp``.
        """
        if not value:
            return None
        if value[-1] in ("Z", "z"):
            value = value[:-1] + "+00:00"
        try:
            return datetime.fromisoformat(value).timestamp()
        except (ValueError, TypeError, OverflowError, OSError):
            return None

    @staticmethod
    def _parse_float(val, default: float = 0.0) -> float:
        """Convert *val* to ``float``, returning *default* when it is empty or invalid."""
        if val is None or val == "":
            return default
        try:
            return float(val)
        except (ValueError, TypeError):
            return default

    def _find_forgettable_full(
        self,
        client,
        stats: Dict[str, Any],
    ) -> List[str]:
        """Scan ``memory:full:*`` and return keys not touched within the age limit.

        The reference time is ``last_accessed`` when set, otherwise ``created``;
        both are read as numeric epoch seconds. Keys without a usable value
        are skipped. Updates ``stats["scanned"]`` per key.
        """
        cutoff_ts = datetime.now(timezone.utc).timestamp() - self._full_max_age_days * 86400
        forgettable_keys: List[str] = []

        cursor = 0
        while True:
            cursor, keys = client.scan(
                cursor=cursor,
                match="memory:full:*",
                count=self._batch_size,
            )

            if keys:
                # Pipelined HMGET replaces up to two HGET round trips per key.
                pipe = client.pipeline()
                for key_b in keys:
                    pipe.hmget(key_b, ["last_accessed", "created"])
                # Per-command errors come back as exception objects so a
                # single bad key is skipped instead of aborting the scan.
                results = pipe.execute(raise_on_error=False)

                for key_b, fields in zip(keys, results):
                    key = _to_text(key_b)
                    stats["scanned"] += 1

                    if isinstance(fields, Exception):
                        logger.debug("forgetter: skip full memory key %s: %s", key, fields)
                        continue

                    try:
                        last_accessed, created = fields
                        stamp = last_accessed or created
                        if not stamp:
                            continue
                        stamp_ts = float(_to_text(stamp))
                    except (ValueError, TypeError) as e:
                        logger.debug("forgetter: skip full memory key %s: %s", key, e)
                        continue

                    if stamp_ts > cutoff_ts:
                        continue  # recently accessed or created, keep
                    forgettable_keys.append(key)

            if cursor == 0:
                break

        return forgettable_keys
|
keepsake/py.typed
ADDED
|
File without changes
|