neural_memory-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neural_memory/__init__.py +38 -0
- neural_memory/cli/__init__.py +15 -0
- neural_memory/cli/__main__.py +6 -0
- neural_memory/cli/config.py +176 -0
- neural_memory/cli/main.py +2702 -0
- neural_memory/cli/storage.py +169 -0
- neural_memory/cli/tui.py +471 -0
- neural_memory/core/__init__.py +52 -0
- neural_memory/core/brain.py +301 -0
- neural_memory/core/brain_mode.py +273 -0
- neural_memory/core/fiber.py +236 -0
- neural_memory/core/memory_types.py +331 -0
- neural_memory/core/neuron.py +168 -0
- neural_memory/core/project.py +257 -0
- neural_memory/core/synapse.py +215 -0
- neural_memory/engine/__init__.py +15 -0
- neural_memory/engine/activation.py +335 -0
- neural_memory/engine/encoder.py +391 -0
- neural_memory/engine/retrieval.py +440 -0
- neural_memory/extraction/__init__.py +42 -0
- neural_memory/extraction/entities.py +547 -0
- neural_memory/extraction/parser.py +337 -0
- neural_memory/extraction/router.py +396 -0
- neural_memory/extraction/temporal.py +428 -0
- neural_memory/mcp/__init__.py +9 -0
- neural_memory/mcp/__main__.py +6 -0
- neural_memory/mcp/server.py +621 -0
- neural_memory/py.typed +0 -0
- neural_memory/safety/__init__.py +31 -0
- neural_memory/safety/freshness.py +238 -0
- neural_memory/safety/sensitive.py +304 -0
- neural_memory/server/__init__.py +5 -0
- neural_memory/server/app.py +99 -0
- neural_memory/server/dependencies.py +33 -0
- neural_memory/server/models.py +138 -0
- neural_memory/server/routes/__init__.py +7 -0
- neural_memory/server/routes/brain.py +221 -0
- neural_memory/server/routes/memory.py +169 -0
- neural_memory/server/routes/sync.py +387 -0
- neural_memory/storage/__init__.py +17 -0
- neural_memory/storage/base.py +441 -0
- neural_memory/storage/factory.py +329 -0
- neural_memory/storage/memory_store.py +896 -0
- neural_memory/storage/shared_store.py +650 -0
- neural_memory/storage/sqlite_store.py +1613 -0
- neural_memory/sync/__init__.py +5 -0
- neural_memory/sync/client.py +435 -0
- neural_memory/unified_config.py +315 -0
- neural_memory/utils/__init__.py +5 -0
- neural_memory/utils/config.py +98 -0
- neural_memory-0.1.0.dist-info/METADATA +314 -0
- neural_memory-0.1.0.dist-info/RECORD +55 -0
- neural_memory-0.1.0.dist-info/WHEEL +4 -0
- neural_memory-0.1.0.dist-info/entry_points.txt +4 -0
- neural_memory-0.1.0.dist-info/licenses/LICENSE +21 -0
neural_memory/extraction/entities.py
@@ -0,0 +1,547 @@
"""Entity extraction from text."""

from __future__ import annotations

import re
from dataclasses import dataclass
from enum import StrEnum


class EntityType(StrEnum):
    """Types of named entities."""

    PERSON = "person"
    LOCATION = "location"
    ORGANIZATION = "organization"
    PRODUCT = "product"
    EVENT = "event"
    UNKNOWN = "unknown"


@dataclass
class Entity:
    """
    A named entity extracted from text.

    Attributes:
        text: The original text of the entity
        type: The entity type
        start: Start character position in source text
        end: End character position in source text
        confidence: Extraction confidence (0.0 - 1.0)
    """

    text: str
    type: EntityType
    start: int
    end: int
    confidence: float = 1.0
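
# Editorial note (not part of the package source): because EntityType is a
# StrEnum, members behave as their string values, e.g.
#
#   assert EntityType.PERSON == "person"
#   assert f"{EntityType.PERSON}" == "person"  # the dedup key in extract()
#                                              # relies on this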


class EntityExtractor:
    """
    Entity extractor using pattern matching.

    For production use, consider using spaCy or underthesea
    for better entity recognition. This provides basic
    rule-based extraction as a fallback.
    """

    # Common Vietnamese person name prefixes
    VI_PERSON_PREFIXES = {
        "anh", "chị", "em", "bạn", "cô", "chú", "bác", "ông", "bà",
        "thầy", "cô giáo", "mr", "mrs", "ms", "miss",
    }

    # Common location indicators
    LOCATION_INDICATORS = {
        # Vietnamese
        "ở", "tại", "đến", "từ", "quán", "cafe", "cà phê",
        "nhà hàng", "công ty", "văn phòng",
        # English
        "at", "in", "to", "from", "restaurant", "office",
        "building", "hotel", "shop", "store",
    }

    # Pattern for capitalized words (potential entities)
    CAPITALIZED_PATTERN = re.compile(r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b")

    # Pattern for Vietnamese names (words after person prefixes)
    VI_NAME_PATTERN = re.compile(
        r"\b(?:anh|chị|em|bạn|cô|chú|bác|ông|bà)\s+([A-ZÀ-Ỹ][a-zà-ỹ]+(?:\s+[A-ZÀ-Ỹ][a-zà-ỹ]+)*)",
        re.IGNORECASE,
    )
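
    # Editorial note (not part of the package source): VI_NAME_PATTERN
    # hardcodes a subset of VI_PERSON_PREFIXES rather than deriving its
    # alternation from the set; e.g. in "gặp anh Minh" it captures
    # group(1) == "Minh".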

    def __init__(self, use_nlp: bool = False) -> None:
        """
        Initialize the extractor.

        Args:
            use_nlp: If True, try to load spaCy/underthesea models when available
        """
        self._use_nlp = use_nlp
        self._nlp_en = None
        self._nlp_vi = None

        if use_nlp:
            self._init_nlp()

    def _init_nlp(self) -> None:
        """Initialize NLP models if available."""
        # Try to load spaCy for English
        try:
            import spacy

            self._nlp_en = spacy.load("en_core_web_sm")
        except (ImportError, OSError):
            pass

        # Try to load underthesea for Vietnamese
        try:
            import underthesea

            self._nlp_vi = underthesea
        except ImportError:
            pass

    def extract(
        self,
        text: str,
        language: str = "auto",
    ) -> list[Entity]:
        """
        Extract entities from text.

        Args:
            text: The text to extract from
            language: "vi", "en", or "auto"

        Returns:
            List of Entity objects
        """
        entities: list[Entity] = []

        # Try NLP-based extraction first
        if self._use_nlp:
            nlp_entities = self._extract_with_nlp(text, language)
            if nlp_entities:
                return nlp_entities

        # Fall back to pattern-based extraction
        entities.extend(self._extract_vietnamese_names(text))
        entities.extend(self._extract_capitalized_words(text, entities))
        entities.extend(self._extract_locations(text, entities))

        # Remove duplicates
        seen: set[str] = set()
        unique: list[Entity] = []
        for entity in entities:
            key = f"{entity.text.lower()}:{entity.type}"
            if key not in seen:
                seen.add(key)
                unique.append(entity)

        return unique
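
    # Editorial usage sketch (not part of the package source): with the
    # pattern fallback, a mixed Vietnamese sentence yields both candidate
    # types:
    #
    #   extractor = EntityExtractor()
    #   extractor.extract("Gặp anh Minh ở Hà Nội")
    #   # -> [Entity("Minh", PERSON, ...), Entity("Hà Nội", LOCATION, ...)]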

    def _extract_with_nlp(
        self,
        text: str,
        language: str,
    ) -> list[Entity] | None:
        """Try to extract using NLP models."""
        if language in ("en", "auto") and self._nlp_en:
            doc = self._nlp_en(text)
            entities = []
            for ent in doc.ents:
                entity_type = self._map_spacy_type(ent.label_)
                if entity_type:
                    entities.append(
                        Entity(
                            text=ent.text,
                            type=entity_type,
                            start=ent.start_char,
                            end=ent.end_char,
                            confidence=0.9,
                        )
                    )
            if entities:
                return entities

        if language in ("vi", "auto") and self._nlp_vi:
            try:
                ner_results = self._nlp_vi.ner(text)
                entities = []
                for word, tag in ner_results:
                    if tag.startswith("B-") or tag.startswith("I-"):
                        entity_type = self._map_underthesea_type(tag[2:])
                        if entity_type:
                            # Find position in text
                            start = text.find(word)
                            if start >= 0:
                                entities.append(
                                    Entity(
                                        text=word,
                                        type=entity_type,
                                        start=start,
                                        end=start + len(word),
                                        confidence=0.85,
                                    )
                                )
                if entities:
                    return entities
            except Exception:
                pass

        return None
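
    # Editorial caveat (not part of the package source): the Vietnamese
    # branch emits one Entity per B-/I- token instead of merging multi-token
    # spans, and text.find(word) anchors each token at its first occurrence,
    # so a repeated word is assigned the earlier span.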

    def _map_spacy_type(self, label: str) -> EntityType | None:
        """Map spaCy NER label to EntityType."""
        mapping = {
            "PERSON": EntityType.PERSON,
            "PER": EntityType.PERSON,
            "GPE": EntityType.LOCATION,
            "LOC": EntityType.LOCATION,
            "FAC": EntityType.LOCATION,
            "ORG": EntityType.ORGANIZATION,
            "PRODUCT": EntityType.PRODUCT,
            "EVENT": EntityType.EVENT,
        }
        return mapping.get(label)

    def _map_underthesea_type(self, label: str) -> EntityType | None:
        """Map underthesea NER label to EntityType."""
        mapping = {
            "PER": EntityType.PERSON,
            "LOC": EntityType.LOCATION,
            "ORG": EntityType.ORGANIZATION,
        }
        return mapping.get(label)

    def _extract_vietnamese_names(self, text: str) -> list[Entity]:
        """Extract Vietnamese person names."""
        entities = []

        for match in self.VI_NAME_PATTERN.finditer(text):
            name = match.group(1)
            entities.append(
                Entity(
                    text=name,
                    type=EntityType.PERSON,
                    start=match.start(1),
                    end=match.end(1),
                    confidence=0.8,
                )
            )

        return entities

    def _extract_capitalized_words(
        self,
        text: str,
        existing: list[Entity],
    ) -> list[Entity]:
        """Extract capitalized words as potential entities."""
        entities = []
        existing_spans = {(e.start, e.end) for e in existing}

        for match in self.CAPITALIZED_PATTERN.finditer(text):
            # Skip if already extracted
            if (match.start(), match.end()) in existing_spans:
                continue

            word = match.group(1)

            # Skip common words
            if word.lower() in {"the", "a", "an", "i", "my", "we", "they"}:
                continue

            # A capital at sentence start may be incidental; keep only
            # longer candidates there, since they still look like proper nouns
            if match.start() == 0 or text[match.start() - 1] in ".!?\n":
                if len(word.split()) == 1 and len(word) < 4:
                    continue

            entities.append(
                Entity(
                    text=word,
                    type=EntityType.UNKNOWN,
                    start=match.start(),
                    end=match.end(),
                    confidence=0.5,
                )
            )

        return entities

    def _extract_locations(
        self,
        text: str,
        existing: list[Entity],
    ) -> list[Entity]:
        """Extract locations based on context indicators."""
        entities = []
        existing_texts = {e.text.lower() for e in existing}

        # Find words after location indicators
        for indicator in self.LOCATION_INDICATORS:
            pattern = re.compile(
                rf"\b{re.escape(indicator)}\s+([A-ZÀ-Ỹ][a-zà-ỹA-ZÀ-Ỹ\s]+?)(?:[,.]|\s+(?:với|và|to|with|for)|$)",
                re.IGNORECASE,
            )

            for match in pattern.finditer(text):
                location = match.group(1).strip()

                if location.lower() in existing_texts:
                    continue

                if len(location) < 2:
                    continue

                entities.append(
                    Entity(
                        text=location,
                        type=EntityType.LOCATION,
                        start=match.start(1),
                        end=match.start(1) + len(location),
                        confidence=0.7,
                    )
                )

        return entities
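
# Editorial sketch (not part of the package source): the NLP path is opt-in
# and degrades to the pattern rules when the optional dependencies are
# missing. Assuming spaCy and its en_core_web_sm model are installed:
#
#   extractor = EntityExtractor(use_nlp=True)
#   extractor.extract("Alice met Bob at Google", language="en")
#   # -> PERSON/ORGANIZATION entities from spaCy, confidence 0.9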


def extract_keywords(text: str, min_length: int = 2) -> list[str]:
    """
    Extract keywords from text (simple word extraction).

    This is a basic keyword extractor. For better results,
    consider using TF-IDF or other NLP techniques.

    Args:
        text: The text to extract from
        min_length: Minimum word length

    Returns:
        List of keywords
    """
    # Common stop words (English + Vietnamese)
    stop_words = {
        # English
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "need", "dare",
        "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
        "from", "as", "into", "through", "during", "before", "after",
        "above", "below", "between", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all",
        "each", "few", "more", "most", "other", "some", "such", "no", "nor",
        "not", "only", "own", "same", "so", "than", "too", "very", "just",
        "and", "but", "if", "or", "because", "until", "while", "this",
        "that", "these", "those", "i", "me", "my", "myself", "we", "our",
        "ours", "ourselves", "you", "your", "yours", "yourself", "he",
        "him", "his", "himself", "she", "her", "hers", "herself", "it",
        "its", "itself", "they", "them", "their", "theirs", "what",
        "which", "who", "whom",
        # Vietnamese
        "và", "của", "là", "có", "được", "cho", "với", "này", "trong",
        "để", "các", "những", "một", "đã", "tôi", "bạn", "anh", "chị",
        "em", "ở", "tại", "khi", "thì", "mà", "nếu", "vì", "cũng", "như",
        "từ", "đến", "lại", "ra", "vào", "lên", "xuống", "rồi", "sẽ",
        "đang", "vẫn", "còn", "chỉ", "rất", "quá", "làm", "gì", "sao",
        "nào", "đâu", "ai", "bao", "nhiêu",
    }

    # Tokenize (simple split)
    words = re.findall(r"\b[a-zA-ZÀ-ỹ]+\b", text.lower())

    # Filter
    keywords = [w for w in words if len(w) >= min_length and w not in stop_words]

    # Remove duplicates while preserving order
    seen: set[str] = set()
    unique: list[str] = []
    for word in keywords:
        if word not in seen:
            seen.add(word)
            unique.append(word)

    return unique
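
# Editorial sketch (not part of the package source): extract_keywords
# lowercases, strips stop words, and de-duplicates while preserving order:
#
#   extract_keywords("the quick brown fox and the lazy dog")
#   # -> ["quick", "brown", "fox", "lazy", "dog"]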