keepsake-memory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keepsake/__init__.py +558 -0
- keepsake/attention.py +146 -0
- keepsake/consolidator.py +395 -0
- keepsake/embedder.py +155 -0
- keepsake/emotion.py +136 -0
- keepsake/forgetter.py +262 -0
- keepsake/py.typed +0 -0
- keepsake/splitter.py +436 -0
- keepsake/storage.py +1360 -0
- keepsake_memory-1.0.0.dist-info/METADATA +424 -0
- keepsake_memory-1.0.0.dist-info/RECORD +14 -0
- keepsake_memory-1.0.0.dist-info/WHEEL +5 -0
- keepsake_memory-1.0.0.dist-info/licenses/LICENSE +21 -0
- keepsake_memory-1.0.0.dist-info/top_level.txt +1 -0
keepsake/splitter.py
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
"""文本切分工具 + 情感分析 — 按语义完整性切分文本,检测情感倾向。"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, List, Tuple
|
|
9
|
+
|
|
10
|
+
import jieba
|
|
11
|
+
|
|
12
|
+
# 自定义领域词典路径(由 discover_synonyms 自动生成)
|
|
13
|
+
_DOMAIN_DICT = Path.home() / '.config' / 'keepsake' / 'jieba_dict.txt'
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def init_domain_dict() -> None:
    """Load (or reload) the custom domain dictionary into jieba.

    The module calls this once at import time.  According to the original
    notes, the plugin's ``initialize()`` calls it again so that a dictionary
    file rewritten by ``discover_synonyms`` is picked up after ``/new``.
    Repeated calls are safe because ``jieba.load_userdict`` only adds entries.

    Loading is best-effort: a missing path, a path that is not a regular
    file, or a file that cannot be read/decoded never raises.  Failures are
    reported through ``logging`` so that importing this module cannot be
    broken by a bad dictionary file.
    """
    # is_file() (rather than exists()) also rejects a directory at that path.
    if not _DOMAIN_DICT.is_file():
        return
    try:
        jieba.load_userdict(str(_DOMAIN_DICT))
    except (OSError, ValueError) as exc:
        # OSError: unreadable file; ValueError: jieba rejects non-UTF-8 input.
        import logging

        logging.getLogger(__name__).warning(
            "could not load jieba domain dictionary %s: %s", _DOMAIN_DICT, exc
        )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# 首次 import 时自动加载
|
|
28
|
+
init_domain_dict()
|
|
29
|
+
|
|
30
|
+
NEWLINE = "\n"  # separator split_text() uses when joining sentences and merging short fragments
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# 常见英文缩写(不在此边界断句)
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
_ABBREVIATIONS = frozenset({
|
|
37
|
+
"mr", "mrs", "ms", "dr", "prof", "sr", "jr", "st", "vs", "etc", "dept",
|
|
38
|
+
"inc", "ltd", "co", "corp", "capt", "gen", "sgt", "lt", "maj", "col",
|
|
39
|
+
"gov", "rep", "sen", "pres", "vice", "pres", "hon", "esq", "phd", "md",
|
|
40
|
+
"ave", "blvd", "rd", "ct", "dr", "est", "inst", "univ",
|
|
41
|
+
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
|
|
42
|
+
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
|
|
43
|
+
"jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
|
|
44
|
+
"al", "fig", "vol", "no", "pp", "ex",
|
|
45
|
+
})
|
|
46
|
+
|
|
47
|
+
_STOP_WORDS = {
|
|
48
|
+
"这个", "那个", "什么", "怎么", "为什么", "可以", "没有",
|
|
49
|
+
"但是", "如果", "因为", "所以", "而且", "然后", "还是",
|
|
50
|
+
"就是", "不是", "一个", "我们", "你们", "他们", "已经",
|
|
51
|
+
"可以", "可能", "应该", "需要", "这样", "那样", "这里",
|
|
52
|
+
"那里", "这个", "这些", "那些", "之后", "之前", "时候",
|
|
53
|
+
"the", "this", "that", "what", "why", "how", "and",
|
|
54
|
+
"but", "for", "with", "not", "are", "was", "had",
|
|
55
|
+
"its", "has", "all", "can", "use", "get", "set",
|
|
56
|
+
"的", "了", "在", "是", "我", "有", "和", "就", "不",
|
|
57
|
+
"人", "都", "一", "一个", "上", "也", "很", "到", "说",
|
|
58
|
+
"要", "去", "你", "会", "着", "没有", "看", "好", "自己",
|
|
59
|
+
"这",
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# 情感关键词表(支持中英文)
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
_POSITIVE_KEYWORDS: Dict[str, float] = {
|
|
68
|
+
# 强烈正面 (×1.5)
|
|
69
|
+
"太棒了": 1.5, "太好了": 1.5, "非常满意": 1.5, "爱了": 1.5,
|
|
70
|
+
"绝了": 1.5, "牛逼": 1.5, "太强了": 1.5,
|
|
71
|
+
# 中等正面 (×1.0)
|
|
72
|
+
"喜欢": 1.0, "不错": 1.0, "好用": 1.0, "满意": 1.0,
|
|
73
|
+
"推荐": 1.0, "值得": 1.0, "好使": 1.0, "方便": 0.8,
|
|
74
|
+
"清晰": 0.8, "直观": 0.8, "nice": 0.8, "good": 0.8,
|
|
75
|
+
"great": 1.0, "awesome": 1.2, "excellent": 1.3,
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
_NEGATIVE_KEYWORDS: Dict[str, float] = {
|
|
79
|
+
# 强烈负面 (×1.5)
|
|
80
|
+
"太差了": 1.5, "垃圾": 1.5, "恶心": 1.5, "废物": 1.5,
|
|
81
|
+
"烂": 1.3, "垃圾东西": 1.5,
|
|
82
|
+
# 中等负面 (×1.0)
|
|
83
|
+
"讨厌": 1.0, "不好用": 1.0, "不满意": 1.0, "糟糕": 1.0,
|
|
84
|
+
"烦": 0.8, "失望": 1.0, "差评": 1.0, "不行": 0.8,
|
|
85
|
+
"bad": 1.0, "terrible": 1.3, "hate": 1.2, "useless": 1.0,
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
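# Negators — when one appears immediately before a sentiment keyword, the keyword's polarity is flipped and it is scored at half weight.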
|
|
89
|
+
_NEGATORS = {"不", "没", "别", "勿", "无", "not", "no", "n't", "never"}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _is_negated(text_lower: str, idx: int) -> bool:
    """Return True when a negator directly precedes the keyword at ``idx``."""
    # Chinese negators are single characters: plain substring search in the
    # four characters before the keyword.
    window = text_lower[max(0, idx - 4):idx]
    if any(neg in window for neg in _NEGATORS if not neg.isascii()):
        return True
    # English negators must be the whole preceding word.  A substring test
    # fires inside "know"/"now"/"note", and a 4-character window can never
    # contain "never".
    tail = re.search(r"([a-z']+)\s*$", text_lower[max(0, idx - 16):idx])
    if tail is None:
        return False
    word = tail.group(1)
    return word.endswith("n't") or word in _NEGATORS


def analyze_sentiment(text: str) -> Tuple[float, str]:
    """Estimate the sentiment of ``text`` from keyword tables.

    Args:
        text: Text to analyse (Chinese and/or English).

    Returns:
        ``(score, label)`` where ``score`` lies in ``[-1.0, 1.0]`` (rounded
        to 4 decimals; negative means negative sentiment) and ``label`` is
        ``"positive"``, ``"negative"`` or ``"neutral"``.

    Algorithm:
        1. For every positive/negative keyword, look at its first occurrence
           in the lower-cased text.  A keyword preceded by a negator counts
           toward the opposite polarity at half its intensity.
        2. ``score = (pos - neg) / (pos + neg)``, clamped to ``[-1, 1]``.
        3. ``|score| <= 0.15`` is reported as neutral.
    """
    if not text or not text.strip():
        return 0.0, "neutral"

    text_lower = text.lower().strip()
    pos_score = 0.0
    neg_score = 0.0

    # Positive table first, then negative, so accumulation order is stable.
    for table, table_is_positive in (
        (_POSITIVE_KEYWORDS, True),
        (_NEGATIVE_KEYWORDS, False),
    ):
        for kw, intensity in table.items():
            idx = text_lower.find(kw.lower())
            if idx == -1:
                continue
            if _is_negated(text_lower, idx):
                weight = intensity * 0.5
                counts_as_positive = not table_is_positive
            else:
                weight = intensity
                counts_as_positive = table_is_positive
            if counts_as_positive:
                pos_score += weight
            else:
                neg_score += weight

    total = pos_score + neg_score
    if total == 0:
        return 0.0, "neutral"

    # The 0.1 floor guards against a tiny denominator should the tables ever
    # contain very small weights.
    score = (pos_score - neg_score) / max(total, 0.1)
    score = max(-1.0, min(1.0, score))

    if score > 0.15:
        label = "positive"
    elif score < -0.15:
        label = "negative"
    else:
        label = "neutral"

    return round(score, 4), label
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def extract_keywords(text: str, max_keywords: int = 5) -> List[str]:
    """Extract the most frequent keywords from ``text``.

    The text is segmented with jieba.  Tokens are kept when they have at
    least two characters, are not stop words, are not pure digits, and do
    not consist of one repeated character (e.g. "哈哈").  ASCII tokens are
    lower-cased first, so the stop-word filter is case-insensitive and
    "Python"/"python" count as the same keyword.

    English words of three or more letters found by a regex are added only
    when jieba did not already produce them, so an English word is not
    counted twice.

    Args:
        text: Text to analyse.
        max_keywords: Maximum number of keywords to return.

    Returns:
        Keywords ordered by descending frequency; ties keep first-seen order.
    """
    if not text or not text.strip():
        return []

    freq: Counter = Counter()

    # 1. jieba segmentation (covers Chinese and most English tokens).
    for token in jieba.lcut(text):
        word = token.lower() if token.isascii() else token
        if (
            len(word) >= 2
            and word not in _STOP_WORDS
            and not word.isdigit()
            and len(set(word)) > 1
        ):
            freq[word] += 1

    # 2. Regex pass as a supplement for English words jieba split differently.
    english = Counter(re.findall(r"\b[a-zA-Z]{3,}\b", text.lower().strip()))
    for word, count in english.items():
        if word not in freq and word not in _STOP_WORDS:
            freq[word] = count

    # 3. most_common() sorts by count and keeps insertion order among ties.
    return [word for word, _ in freq.most_common()[:max_keywords]]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
# 句子切分
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
|
|
192
|
+
# 预编译边界正则 — 同时匹配中文和英文句末标点
|
|
193
|
+
# 中文:。!? 后直接切(不跟在实际可能出现的引号后)
|
|
194
|
+
# 英文:.!?后跟空白、大写字母、标点或行尾才切
|
|
195
|
+
# 保护场景在后处理 _split_sentences 中处理
|
|
196
|
+
_SENTENCE_SPLIT_RE = re.compile(
|
|
197
|
+
r'(?<=[。!?])' # 中文句末标点
|
|
198
|
+
r'|'
|
|
199
|
+
r'(?<=[!?])' # 中英文通用叹号/问号
|
|
200
|
+
r'|'
|
|
201
|
+
# 英文 .!? 后跟空白字符或文本结束才切
|
|
202
|
+
r'(?<=[.!?])(?=\s|$)' # 仅空白或行尾
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _split_sentences(text: str) -> List[str]:
    """Split a paragraph into sentence-level pieces.

    Steps:
        1. Runs of three or more dots, and repeated "…", are normalised to a
           single "…" so they are not treated as sentence ends.
        2. The text is cut at the boundaries of ``_SENTENCE_SPLIT_RE``.
        3. A piece is merged back into the previous one when the previous
           piece ends in an abbreviation ("Mr.", "U.S.") or a digit followed
           by a dot, or when the piece itself is at most 3 characters long
           and does not start with an upper-case letter (e.g. a stray
           closing quote or a second "!").

    When pieces are merged, whitespace that originally separated them is
    kept as a single space, so "Mr. Smith" does not become "Mr.Smith".

    Returns:
        Stripped, non-empty sentence strings in their original order.
    """
    # 1. Normalise ellipses.
    text = re.sub(r'\.{3,}', '…', text)
    text = re.sub(r'…{2,}', '…', text)

    merged: List[str] = []
    # 2. Cut, then 3. repair wrong cuts while walking the raw pieces.
    for raw in _SENTENCE_SPLIT_RE.split(text):
        piece = raw.strip()
        if not piece:
            continue

        if merged:
            previous = merged[-1]
            after_abbreviation = (
                _looks_like_abbreviation(previous)
                or _ends_with_digit_dot(previous)
            )
            tiny_fragment = len(piece) <= 3 and not piece[0].isupper()
            if after_abbreviation or tiny_fragment:
                # The split is zero-width, so any separating whitespace sits
                # at the start of the raw piece; keep it as one space.
                glue = " " if raw[0].isspace() else ""
                merged[-1] = previous + glue + piece
                continue

        merged.append(piece)

    return merged
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _looks_like_abbreviation(text: str) -> bool:
|
|
249
|
+
"""检查文本末尾是否像缩写(Mr.、Dr.、U.S.A.、etc.)。"""
|
|
250
|
+
# 去掉末尾空白
|
|
251
|
+
text = text.rstrip()
|
|
252
|
+
m = re.search(r'\b([A-Za-z]{1,5})\.$', text)
|
|
253
|
+
if not m:
|
|
254
|
+
return False
|
|
255
|
+
word = m.group(1).lower()
|
|
256
|
+
# 在缩写列表中,或是 1-2 个全大写字母(如 U.S. -> U 和 S 各一段)
|
|
257
|
+
return word in _ABBREVIATIONS or (word.isupper() and len(word) <= 2)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _ends_with_digit_dot(text: str) -> bool:
|
|
261
|
+
"""检查文本是否以数字+句点结尾(如 '3.14' 中的 '3.')。"""
|
|
262
|
+
return bool(re.search(r'\d\.$', text.rstrip()))
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
# 主切分入口
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def split_text(text: str, max_chars: int = 500) -> List[str]:
    """Split ``text`` into segments while keeping sentences intact.

    Strategy:
        1. Split into paragraphs at blank lines.
        2. A paragraph longer than ``max_chars`` is cut into sentences with
           ``_split_sentences``; consecutive sentences are packed into
           chunks, one sentence per line.
        3. A segment shorter than 10 characters is appended to the segment
           before it; any segment still shorter than 10 characters
           afterwards is discarded.

    Args:
        text: Text to split.
        max_chars: Soft size limit of a segment.  A single sentence longer
            than this is kept whole rather than cut mid-sentence.

    Returns:
        The list of segments, in original order.
    """
    if not text or not text.strip():
        return []

    segments: List[str] = []

    for para in re.split(r"\n\s*\n", text.strip()):
        para = para.strip()
        if not para:
            continue

        if len(para) <= max_chars:
            segments.append(para)
            continue

        # Over-long paragraph: pack whole sentences into chunks.
        chunk = ""
        for sentence in _split_sentences(para):
            sentence = sentence.strip()
            if not sentence:
                continue
            if chunk and len(chunk) + len(sentence) > max_chars:
                segments.append(chunk.strip())
                chunk = ""
            # Always terminate with the separator, so the sentence that
            # starts a new chunk is not glued to the one that follows it.
            chunk += sentence + NEWLINE
        if chunk.strip():
            segments.append(chunk.strip())

    # Fold fragments shorter than 10 characters into the preceding segment.
    final: List[str] = []
    for seg in segments:
        if len(seg) < 10 and final:
            final[-1] += NEWLINE + seg
        else:
            final.append(seg)

    return [seg for seg in final if len(seg) >= 10]
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# ---------------------------------------------------------------------------
|
|
327
|
+
# 实体提取 — 用于实体关系图
|
|
328
|
+
# ---------------------------------------------------------------------------
|
|
329
|
+
|
|
330
|
+
# 大写缩写/英文实体: BTC, ETH, ZG, MACD 等
|
|
331
|
+
# One upper-case letter followed by at least one upper-case letter or digit,
# optionally continued by "/"-separated upper-case/digit groups.
_ENTITY_ENGLISH_RE = re.compile(r"[A-Z][A-Z0-9]{1,}(?:/[A-Z0-9]+)*") # e.g. BTC, ETH, ZG, ZD
|
|
332
|
+
|
|
333
|
+
# 中文平台/项目名常见词缀
|
|
334
|
+
_ENTITY_CHN_SUFFIXES = {"公司", "平台", "集团", "科技", "网络", "学院", "大学", "社区", "基金", "项目", "团队", "部门"}
|
|
335
|
+
|
|
336
|
+
# 已知高频实体白名单(本领域常见名词)
|
|
337
|
+
_ENTITY_KNOWN = frozenset({
|
|
338
|
+
"缠论", "中枢", "三买", "三卖", "顶背驰", "底背驰", "金叉", "死叉",
|
|
339
|
+
"以太坊", "比特币", "知乎", "B站", "抖音", "微博", "公众号", "视频号",
|
|
340
|
+
"微信", "QQ", "Telegram", "币安", "Bitget", "Binance",
|
|
341
|
+
"小红书", "TradeApi", "Hermes",
|
|
342
|
+
})
|
|
343
|
+
|
|
344
|
+
# 价格数字: 5-6位数(常见crypto价格)
|
|
345
|
+
# Stand-alone 5- or 6-digit numbers (no digit directly before or after) whose
# first digit is 6-9, e.g. 63000.
# NOTE(review): the leading [6-9] excludes values such as 45000 although the
# accompanying note speaks of 5-6 digit prices in general — confirm the range.
_ENTITY_PRICE_RE = re.compile(r"(?<!\d)([6-9]\d{4,5})(?!\d)")
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def extract_entities(text: str) -> list[str]:
    """Extract candidate entity names from ``text`` without an LLM.

    Sources, applied in this order (each name is reported once, compared
    case-insensitively):
        0. Whitelisted names from ``_ENTITY_KNOWN``, by position in the text.
        1. jieba part-of-speech tags for person/organisation/place/proper
           nouns, plus English tokens (reported upper-cased).
        2. Upper-case abbreviations matched by ``_ENTITY_ENGLISH_RE``.
        3. A fixed list of technical indicator names, matched as whole
           alphabetic words (so "idea" does not yield "DEA").
        4. The word directly preceding an organisation suffix such as
           "公司", by position of the suffix in the text.
        5. Price-like numbers matched by ``_ENTITY_PRICE_RE``.

    jieba steps are best-effort: a failure is logged and the remaining
    steps still run.

    Returns:
        De-duplicated entity names.
    """
    if not text or not text.strip():
        return []

    import logging

    log = logging.getLogger(__name__)
    entities: list[str] = []
    seen: set[str] = set()

    def remember(name: str, key: str) -> None:
        # Record ``name`` unless an entity with the same key is already known.
        if key not in seen:
            seen.add(key)
            entities.append(name)

    text_lower = text.lower()

    # 0. Whitelist. Sorting by first position makes the order deterministic;
    #    iterating the frozenset directly is hash-dependent.
    known_hits = []
    for known in _ENTITY_KNOWN:
        pos = text_lower.find(known.lower())
        if pos != -1:
            known_hits.append((pos, known))
    for _, known in sorted(known_hits):
        remember(known, known.lower())

    # 1. jieba part-of-speech tagging. The submodule must be imported
    #    explicitly: a bare ``import jieba`` does not expose ``jieba.posseg``.
    try:
        import jieba.posseg as pseg

        tagged = [(word, flag) for word, flag in pseg.lcut(text)]
    except Exception as exc:  # best-effort: keep the regex-based steps working
        log.warning("jieba POS tagging unavailable, skipping: %s", exc)
        tagged = []

    proper_noun_flags = ("nr", "nr1", "nr2", "nrj", "nrf", "nt", "ns", "nsf", "nz")
    for word, flag in tagged:
        word = word.strip()
        if len(word) < 2:
            continue
        if flag in proper_noun_flags:
            remember(word, word.lower())
        elif flag == "eng":
            remember(word.upper(), word.lower())

    # 2. Upper-case abbreviations.
    for match in _ENTITY_ENGLISH_RE.finditer(text):
        token = match.group()
        remember(token, token.lower())

    # 3. Technical terms. Only letters are excluded as neighbours, so
    #    "ema20" still matches EMA while "cinema" does not.
    for term in ("ZG", "ZD", "MACD", "DIF", "DEA", "HIST", "RSI", "OBV", "EMA", "SMA", "BOLL",
                 "buy1", "sell1", "buy2", "sell2"):
        pattern = r"(?<![a-z])" + re.escape(term.lower()) + r"(?![a-z])"
        if re.search(pattern, text_lower):
            remember(term.upper(), term.lower())

    # 4. Name + suffix, e.g. "小米" in "小米公司". Only the first occurrence
    #    of each suffix is used; suffixes are visited in text order.
    for idx, _suffix in sorted((text.find(s), s) for s in _ENTITY_CHN_SUFFIXES):
        if idx < 1:
            continue
        prefix = text[max(0, idx - 12):idx]
        try:
            words = jieba.lcut(prefix)
        except Exception as exc:  # best-effort, as in step 1
            log.warning("jieba segmentation failed for %r: %s", prefix, exc)
            continue
        # Take the last meaningful word before the suffix.
        for word in reversed(words):
            word = word.strip()
            if len(word) >= 2 and word not in _STOP_WORDS:
                remember(word, word.lower())
                break

    # 5. Price-like numbers.
    for match in _ENTITY_PRICE_RE.finditer(text):
        token = match.group()
        remember(token, token)

    return entities
|