MemoryOS 0.2.0-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MemoryOS has been flagged as potentially problematic.
- {memoryos-0.2.0.dist-info → memoryos-0.2.2.dist-info}/METADATA +67 -26
- memoryos-0.2.2.dist-info/RECORD +169 -0
- memoryos-0.2.2.dist-info/entry_points.txt +3 -0
- memos/__init__.py +1 -1
- memos/api/config.py +562 -0
- memos/api/context/context.py +147 -0
- memos/api/context/dependencies.py +90 -0
- memos/api/exceptions.py +28 -0
- memos/api/mcp_serve.py +502 -0
- memos/api/product_api.py +35 -0
- memos/api/product_models.py +163 -0
- memos/api/routers/__init__.py +1 -0
- memos/api/routers/product_router.py +386 -0
- memos/chunkers/sentence_chunker.py +8 -2
- memos/cli.py +113 -0
- memos/configs/embedder.py +27 -0
- memos/configs/graph_db.py +132 -3
- memos/configs/internet_retriever.py +6 -0
- memos/configs/llm.py +47 -0
- memos/configs/mem_cube.py +1 -1
- memos/configs/mem_os.py +5 -0
- memos/configs/mem_reader.py +9 -0
- memos/configs/mem_scheduler.py +107 -7
- memos/configs/mem_user.py +58 -0
- memos/configs/memory.py +5 -4
- memos/dependency.py +52 -0
- memos/embedders/ark.py +92 -0
- memos/embedders/factory.py +4 -0
- memos/embedders/sentence_transformer.py +8 -2
- memos/embedders/universal_api.py +32 -0
- memos/graph_dbs/base.py +11 -3
- memos/graph_dbs/factory.py +4 -0
- memos/graph_dbs/nebular.py +1364 -0
- memos/graph_dbs/neo4j.py +333 -124
- memos/graph_dbs/neo4j_community.py +300 -0
- memos/llms/base.py +9 -0
- memos/llms/deepseek.py +54 -0
- memos/llms/factory.py +10 -1
- memos/llms/hf.py +170 -13
- memos/llms/hf_singleton.py +114 -0
- memos/llms/ollama.py +4 -0
- memos/llms/openai.py +67 -1
- memos/llms/qwen.py +63 -0
- memos/llms/vllm.py +153 -0
- memos/log.py +1 -1
- memos/mem_cube/general.py +77 -16
- memos/mem_cube/utils.py +109 -0
- memos/mem_os/core.py +251 -51
- memos/mem_os/main.py +94 -12
- memos/mem_os/product.py +1220 -43
- memos/mem_os/utils/default_config.py +352 -0
- memos/mem_os/utils/format_utils.py +1401 -0
- memos/mem_reader/simple_struct.py +18 -10
- memos/mem_scheduler/base_scheduler.py +441 -40
- memos/mem_scheduler/general_scheduler.py +249 -248
- memos/mem_scheduler/modules/base.py +14 -5
- memos/mem_scheduler/modules/dispatcher.py +67 -4
- memos/mem_scheduler/modules/misc.py +104 -0
- memos/mem_scheduler/modules/monitor.py +240 -50
- memos/mem_scheduler/modules/rabbitmq_service.py +319 -0
- memos/mem_scheduler/modules/redis_service.py +32 -22
- memos/mem_scheduler/modules/retriever.py +167 -23
- memos/mem_scheduler/modules/scheduler_logger.py +255 -0
- memos/mem_scheduler/mos_for_test_scheduler.py +140 -0
- memos/mem_scheduler/schemas/__init__.py +0 -0
- memos/mem_scheduler/schemas/general_schemas.py +43 -0
- memos/mem_scheduler/{modules/schemas.py → schemas/message_schemas.py} +63 -61
- memos/mem_scheduler/schemas/monitor_schemas.py +329 -0
- memos/mem_scheduler/utils/__init__.py +0 -0
- memos/mem_scheduler/utils/filter_utils.py +176 -0
- memos/mem_scheduler/utils/misc_utils.py +61 -0
- memos/mem_user/factory.py +94 -0
- memos/mem_user/mysql_persistent_user_manager.py +271 -0
- memos/mem_user/mysql_user_manager.py +500 -0
- memos/mem_user/persistent_factory.py +96 -0
- memos/mem_user/persistent_user_manager.py +260 -0
- memos/mem_user/user_manager.py +4 -4
- memos/memories/activation/item.py +29 -0
- memos/memories/activation/kv.py +10 -3
- memos/memories/activation/vllmkv.py +219 -0
- memos/memories/factory.py +2 -0
- memos/memories/textual/base.py +1 -1
- memos/memories/textual/general.py +43 -97
- memos/memories/textual/item.py +5 -33
- memos/memories/textual/tree.py +22 -12
- memos/memories/textual/tree_text_memory/organize/conflict.py +9 -5
- memos/memories/textual/tree_text_memory/organize/manager.py +26 -18
- memos/memories/textual/tree_text_memory/organize/redundancy.py +25 -44
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +50 -48
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +81 -56
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +6 -3
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +2 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +0 -1
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +2 -2
- memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +2 -0
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +52 -28
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +42 -15
- memos/memories/textual/tree_text_memory/retrieve/utils.py +11 -7
- memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +62 -58
- memos/memos_tools/dinding_report_bot.py +422 -0
- memos/memos_tools/notification_service.py +44 -0
- memos/memos_tools/notification_utils.py +96 -0
- memos/parsers/markitdown.py +8 -2
- memos/settings.py +3 -1
- memos/templates/mem_reader_prompts.py +66 -23
- memos/templates/mem_scheduler_prompts.py +126 -43
- memos/templates/mos_prompts.py +87 -0
- memos/templates/tree_reorganize_prompts.py +85 -30
- memos/vec_dbs/base.py +12 -0
- memos/vec_dbs/qdrant.py +46 -20
- memoryos-0.2.0.dist-info/RECORD +0 -128
- memos/mem_scheduler/utils.py +0 -26
- {memoryos-0.2.0.dist-info → memoryos-0.2.2.dist-info}/LICENSE +0 -0
- {memoryos-0.2.0.dist-info → memoryos-0.2.2.dist-info}/WHEEL +0 -0

--- /dev/null
+++ b/memos/mem_scheduler/schemas/monitor_schemas.py
@@ -0,0 +1,329 @@
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+from typing import ClassVar
+from uuid import uuid4
+
+from pydantic import BaseModel, Field, computed_field, field_validator
+
+from memos.log import get_logger
+from memos.mem_scheduler.modules.misc import AutoDroppingQueue, DictConversionMixin
+from memos.mem_scheduler.schemas.general_schemas import (
+    DEFAULT_MAX_QUERY_KEY_WORDS,
+    DEFAULT_WEIGHT_VECTOR_FOR_RANKING,
+    NOT_INITIALIZED,
+)
+from memos.mem_scheduler.utils.filter_utils import transform_name_to_key
+from memos.memories.textual.tree import TextualMemoryItem
+
+
+logger = get_logger(__name__)
+
+FILE_PATH = Path(__file__).absolute()
+BASE_DIR = FILE_PATH.parent.parent.parent.parent.parent
+
+
+# ============== Queries ==============
+class QueryMonitorItem(BaseModel, DictConversionMixin):
+    item_id: str = Field(
+        description="Unique identifier for the query item", default_factory=lambda: str(uuid4())
+    )
+    query_text: str = Field(
+        ...,
+        description="The actual user query text content",
+        min_length=1,
+    )
+    keywords: list[str] | None = Field(
+        default=None,
+        min_length=1,  # If provided, shouldn't be empty
+        description="Semantic keywords extracted from the query text",
+    )
+    max_keywords: ClassVar[int] = DEFAULT_MAX_QUERY_KEY_WORDS
+
+    timestamp: datetime = Field(
+        default_factory=datetime.now, description="Timestamp indicating when query was submitted"
+    )
+
+    @field_validator("keywords", mode="before")
+    @classmethod
+    def validate_keywords(cls, v, values):
+        if v is None:
+            return None
+
+        if not isinstance(v, list):
+            raise ValueError("Keywords must be a list")
+
+        if len(v) > cls.max_keywords:
+            logger.warning(
+                f"Keywords list truncated from {len(v)} to {cls.max_keywords} items. "
+                f"Configure max_keywords class attribute to adjust this limit."
+            )
+            return v[: cls.max_keywords]
+        return v
+
+    @classmethod
+    def with_max_keywords(cls, limit: int):
+        """Create a new class with custom keywords limit."""
+        if not isinstance(limit, int) or limit <= 0:
+            raise ValueError("Max keywords limit must be positive integer")
+
+        return type(f"{cls.__name__}_MaxKeywords{limit}", (cls,), {"max_keywords": limit})
+
+
+class QueryMonitorQueue(AutoDroppingQueue[QueryMonitorItem]):
+    """
+    A thread-safe queue for monitoring queries with timestamp and keyword tracking.
+    Each item is expected to be a dictionary containing:
+    """
+
+    def put(self, item: QueryMonitorItem, block: bool = True, timeout: float | None = None) -> None:
+        """
+        Add a query item to the queue. Ensures the item is of correct type.
+
+        Args:
+            item: A QueryMonitorItem instance
+        """
+        if not isinstance(item, QueryMonitorItem):
+            raise ValueError("Item must be an instance of QueryMonitorItem")
+        super().put(item, block, timeout)
+
+    def get_queries_by_timestamp(
+        self, start_time: datetime, end_time: datetime
+    ) -> list[QueryMonitorItem]:
+        """
+        Retrieve queries added between the specified time range.
+        """
+        with self.mutex:
+            return [item for item in self.queue if start_time <= item.timestamp <= end_time]
+
+    def get_keywords_collections(self) -> Counter:
+        """
+        Generate a Counter containing keyword frequencies across all queries.
+
+        Returns:
+            Counter object with keyword counts
+        """
+        with self.mutex:
+            all_keywords = [kw for item in self.queue for kw in item.keywords]
+            return Counter(all_keywords)
+
+    def get_queries_with_timesort(self, reverse: bool = True) -> list[str]:
+        """
+        Retrieve all queries sorted by timestamp.
+
+        Args:
+            reverse: If True, sort in descending order (newest first),
+                otherwise sort in ascending order (oldest first)
+
+        Returns:
+            List of query items sorted by timestamp
+        """
+        with self.mutex:
+            return [
+                monitor.query_text
+                for monitor in sorted(self.queue, key=lambda x: x.timestamp, reverse=reverse)
+            ]
+
+
+# ============== Memories ==============
+class MemoryMonitorItem(BaseModel, DictConversionMixin):
+    item_id: str = Field(
+        description="Unique identifier for the memory item", default_factory=lambda: str(uuid4())
+    )
+    memory_text: str = Field(
+        ...,
+        description="The actual content of the memory",
+        min_length=1,
+    )
+    tree_memory_item: TextualMemoryItem | None = Field(
+        default=None, description="Optional textual memory item"
+    )
+    tree_memory_item_mapping_key: str = Field(
+        description="Key generated from memory_text using transform_name_to_key",
+    )
+    keywords_score: float = Field(
+        default=NOT_INITIALIZED,
+        description="The score generate by counting keywords in queries",
+        ge=NOT_INITIALIZED,  # Minimum value of 0
+    )
+    sorting_score: float = Field(
+        default=NOT_INITIALIZED,
+        description="The score generate from rerank process",
+        ge=NOT_INITIALIZED,  # Minimum value of 0
+    )
+    importance_score: float = Field(
+        default=NOT_INITIALIZED,
+        description="Numerical score representing the memory's importance",
+        ge=NOT_INITIALIZED,  # Minimum value of 0
+    )
+    recording_count: int = Field(
+        default=1,
+        description="How many times this memory has been recorded",
+        ge=1,  # Greater than or equal to 1
+    )
+
+    @field_validator("tree_memory_item_mapping_key", mode="before")
+    def generate_mapping_key(cls, v, values):  # noqa: N805
+        if v is None and "memory_text" in values:
+            return transform_name_to_key(values["memory_text"])
+        return v
+
+    def get_importance_score(self, weight_vector: list[float] | None = None) -> float:
+        """
+        Calculate the effective score for the memory item.
+
+        Returns:
+            float: The importance_score if it has been initialized (>=0),
+                otherwise the recording_count converted to float.
+
+        Note:
+            This method provides a unified way to retrieve a comparable score
+            for memory items, regardless of whether their importance has been explicitly set.
+        """
+        if weight_vector is None:
+            logger.warning("weight_vector of get_importance_score is None.")
+            weight_vector = DEFAULT_WEIGHT_VECTOR_FOR_RANKING
+        assert sum(weight_vector) == 1
+        normalized_keywords_score = min(self.keywords_score * weight_vector[1], 5)
+        normalized_recording_count_score = min(self.recording_count * weight_vector[2], 2)
+        self.importance_score = (
+            self.sorting_score * weight_vector[0]
+            + normalized_keywords_score
+            + normalized_recording_count_score
+        )
+        return self.importance_score
+
+
+class MemoryMonitorManager(BaseModel, DictConversionMixin):
+    user_id: str = Field(..., description="Required user identifier", min_length=1)
+    mem_cube_id: str = Field(..., description="Required memory cube identifier", min_length=1)
+    memories: list[MemoryMonitorItem] = Field(
+        default_factory=list, description="Collection of memory items"
+    )
+    max_capacity: int | None = Field(
+        default=None, description="Maximum number of memories allowed (None for unlimited)", ge=1
+    )
+
+    @computed_field
+    @property
+    def memory_size(self) -> int:
+        """Automatically calculated count of memory items."""
+        return len(self.memories)
+
+    @property
+    def memories_mapping_dict(self) -> dict[str, MemoryMonitorItem]:
+        """
+        Generate a mapping dictionary for the memories in MemoryMonitorManager,
+        using tree_memory_item_mapping_key as the key and MemoryMonitorItem as the value.
+
+        Returns:
+            Dict[str, MemoryMonitorItem]: A dictionary where keys are
+                tree_memory_item_mapping_key values from MemoryMonitorItem,
+                and values are the corresponding MemoryMonitorItem objects.
+        """
+        mapping_dict = {
+            mem_item.tree_memory_item_mapping_key: mem_item for mem_item in self.memories
+        }
+
+        logger.debug(
+            f"Generated memories mapping dict for user_id={self.user_id}, "
+            f"mem_cube_id={self.mem_cube_id}, "
+            f"total_items={len(mapping_dict)}, "
+            f"source_memory_count={len(self.memories)}"
+        )
+        return mapping_dict
+
+    def get_sorted_mem_monitors(self, reverse=True) -> list[MemoryMonitorItem]:
+        """
+        Retrieve memory monitors sorted by their ranking score in descending order.
+
+        Returns:
+            list[MemoryMonitorItem]: Sorted list of memory monitor items.
+        """
+        return sorted(
+            self.memories,
+            key=lambda item: item.get_importance_score(
+                weight_vector=DEFAULT_WEIGHT_VECTOR_FOR_RANKING
+            ),
+            reverse=reverse,
+        )
+
+    def update_memories(
+        self, new_memory_monitors: list[MemoryMonitorItem], partial_retention_number: int
+    ) -> MemoryMonitorItem:
+        """
+        Update memories based on monitor_working_memories.
+        """
+
+        # Validate partial_retention_number
+        if partial_retention_number < 0:
+            raise ValueError("partial_retention_number must be non-negative")
+
+        # Step 1: Update existing memories or add new ones
+        added_count = 0
+        memories_mapping_dict = self.memories_mapping_dict
+        new_mem_set = set()
+        for memory_monitor in new_memory_monitors:
+            if memory_monitor.tree_memory_item_mapping_key in memories_mapping_dict:
+                # Update existing memory
+                item: MemoryMonitorItem = memories_mapping_dict[
+                    memory_monitor.tree_memory_item_mapping_key
+                ]
+                item.recording_count += 1
+                item.keywords_score = memory_monitor.keywords_score
+                item.sorting_score = memory_monitor.sorting_score
+            else:
+                # Add new memory
+                self.memories.append(memory_monitor)
+                added_count += 1
+
+            new_mem_set.add(memory_monitor.tree_memory_item_mapping_key)
+
+        # Step 2: Identify memories to remove
+        old_mem_monitor_list = []
+        for mem_monitor in self.memories:
+            if mem_monitor.tree_memory_item_mapping_key not in new_mem_set:
+                old_mem_monitor_list.append(mem_monitor)
+
+        # Sort memories by recording_count in descending order
+        sorted_old_mem_monitors = sorted(
+            old_mem_monitor_list,
+            key=lambda item: item.get_importance_score(
+                weight_vector=DEFAULT_WEIGHT_VECTOR_FOR_RANKING
+            ),
+            reverse=True,
+        )
+
+        # Keep the top N old memories
+        memories_to_remove = sorted_old_mem_monitors[partial_retention_number:]
+        memories_to_change_score = sorted_old_mem_monitors[:partial_retention_number]
+
+        # Step 3: Remove identified memories and change the scores of left old memories
+        for memory in memories_to_remove:
+            self.memories.remove(memory)
+
+        for memory in memories_to_change_score:
+            memory.sorting_score = 0
+            memory.recording_count = 0
+            memory.keywords_score = 0
+
+        # Step 4: Enforce max_capacity if set
+        sorted_memories = sorted(
+            self.memories,
+            key=lambda item: item.get_importance_score(
+                weight_vector=DEFAULT_WEIGHT_VECTOR_FOR_RANKING
+            ),
+            reverse=True,
+        )
+        # Keep only the top max_capacity memories
+        self.memories = sorted_memories[: self.max_capacity]
+
+        # Log the update result
+        logger.info(
+            f"Updated monitor manager for user {self.user_id}, mem_cube {self.mem_cube_id}: "
+            f"Total memories: {len(self.memories)}, "
+            f"Added/Updated: {added_count}, "
+            f"Removed: {len(memories_to_remove)} (excluding top {partial_retention_number} by recording_count)"
+        )

        return self.memories
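
For orientation, here is a usage sketch of the new monitor schemas. It is illustrative only and not part of the release; the `maxsize` argument assumes `AutoDroppingQueue` keeps the standard `queue.Queue` constructor, which is consistent with the `self.mutex`/`self.queue` usage above but is not confirmed by this diff.

```python
# Illustrative sketch, not from the package diff.
from datetime import datetime, timedelta

from memos.mem_scheduler.schemas.monitor_schemas import (
    QueryMonitorItem,
    QueryMonitorQueue,
)

queue = QueryMonitorQueue(maxsize=100)  # assumed queue.Queue-style constructor
queue.put(QueryMonitorItem(query_text="Where did I park the car?", keywords=["park", "car"]))
queue.put(QueryMonitorItem(query_text="Remind me about parking", keywords=["park", "remind"]))

# Keyword frequencies across all buffered queries
print(queue.get_keywords_collections())  # Counter({'park': 2, 'car': 1, 'remind': 1})

# Queries from the last hour, then all query texts newest-first
recent = queue.get_queries_by_timestamp(
    start_time=datetime.now() - timedelta(hours=1), end_time=datetime.now()
)
print([item.query_text for item in recent])
print(queue.get_queries_with_timesort(reverse=True))
```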

--- /dev/null
+++ b/memos/mem_scheduler/utils/filter_utils.py
@@ -0,0 +1,176 @@
+import re
+
+from memos.dependency import require_python_package
+from memos.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def transform_name_to_key(name):
+    """
+    Normalize text by removing all punctuation marks, keeping only letters, numbers, and word characters.
+
+    Args:
+        name (str): Input text to be processed
+
+    Returns:
+        str: Processed text with all punctuation removed
+    """
+    # Match all characters that are NOT:
+    # \w - word characters (letters, digits, underscore)
+    # \u4e00-\u9fff - Chinese/Japanese/Korean characters
+    # \s - whitespace
+    pattern = r"[^\w\u4e00-\u9fff\s]"
+
+    # Substitute all matched punctuation marks with empty string
+    # re.UNICODE flag ensures proper handling of Unicode characters
+    normalized = re.sub(pattern, "", name, flags=re.UNICODE)
+
+    # Optional: Collapse multiple whitespaces into single space
+    normalized = "_".join(normalized.split())
+
+    normalized = normalized.lower()
+
+    return normalized
+
+
+def is_all_english(input_string: str) -> bool:
+    """Determine if the string consists entirely of English characters (including spaces)"""
+    return all(char.isascii() or char.isspace() for char in input_string)
+
+
+def is_all_chinese(input_string: str) -> bool:
+    """Determine if the string consists entirely of Chinese characters (including Chinese punctuation and spaces)"""
+    return all(
+        ("\u4e00" <= char <= "\u9fff")  # Basic Chinese characters
+        or ("\u3400" <= char <= "\u4dbf")  # Extension A
+        or ("\u20000" <= char <= "\u2a6df")  # Extension B
+        or ("\u2a700" <= char <= "\u2b73f")  # Extension C
+        or ("\u2b740" <= char <= "\u2b81f")  # Extension D
+        or ("\u2b820" <= char <= "\u2ceaf")  # Extension E
+        or ("\u2f800" <= char <= "\u2fa1f")  # Extension F
+        or char.isspace()  # Spaces
+        for char in input_string
+    )
+
+
+@require_python_package(
+    import_name="sklearn",
+    install_command="pip install scikit-learn",
+    install_link="https://scikit-learn.org/stable/install.html",
+)
+def filter_similar_memories(
+    text_memories: list[str], similarity_threshold: float = 0.75
+) -> list[str]:
+    """
+    Filters out low-quality or duplicate memories based on text similarity.
+
+    Args:
+        text_memories: List of text memories to filter
+        similarity_threshold: Threshold for considering memories duplicates (0.0-1.0)
+            Higher values mean stricter filtering
+
+    Returns:
+        List of filtered memories with duplicates removed
+    """
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.metrics.pairwise import cosine_similarity
+
+    if not text_memories:
+        logger.warning("Received empty memories list - nothing to filter")
+        return []
+
+    for idx in range(len(text_memories)):
+        if not isinstance(text_memories[idx], str):
+            logger.error(
+                f"{text_memories[idx]} in memories is not a string,"
+                f" and now has been transformed to be a string."
+            )
+            text_memories[idx] = str(text_memories[idx])
+
+    try:
+        # Step 1: Vectorize texts using TF-IDF
+        vectorizer = TfidfVectorizer()
+        tfidf_matrix = vectorizer.fit_transform(text_memories)
+
+        # Step 2: Calculate pairwise similarity matrix
+        similarity_matrix = cosine_similarity(tfidf_matrix)
+
+        # Step 3: Identify duplicates
+        to_keep = set(range(len(text_memories)))  # Start with all indices
+        for i in range(len(similarity_matrix)):
+            if i not in to_keep:
+                continue  # Already marked for removal
+
+            # Find all similar items to this one (excluding self and already removed)
+            similar_indices = [
+                j
+                for j in range(i + 1, len(similarity_matrix))
+                if similarity_matrix[i][j] >= similarity_threshold and j in to_keep
+            ]
+            similar_indices = set(similar_indices)
+
+            # Remove all similar items (keeping the first one - i)
+            to_keep -= similar_indices
+
+        # Return filtered memories
+        filtered_memories = [text_memories[i] for i in sorted(to_keep)]
+        logger.debug(f"filtered_memories: {filtered_memories}")
+        return filtered_memories
+
+    except Exception as e:
+        logger.error(f"Error filtering memories: {e!s}")
+        return text_memories  # Return original list if error occurs
+
+
+def filter_too_short_memories(
+    text_memories: list[str], min_length_threshold: int = 20
+) -> list[str]:
+    """
+    Filters out text memories that fall below the minimum length requirement.
+    Handles both English (word count) and Chinese (character count) differently.
+
+    Args:
+        text_memories: List of text memories to be filtered
+        min_length_threshold: Minimum length required to keep a memory.
+            For English: word count, for Chinese: character count.
+
+    Returns:
+        List of filtered memories meeting the length requirement
+    """
+    if not text_memories:
+        logger.debug("Empty memories list received in short memory filter")
+        return []
+
+    filtered_memories = []
+    removed_count = 0
+
+    for memory in text_memories:
+        stripped_memory = memory.strip()
+        if not stripped_memory:  # Skip empty/whitespace memories
+            removed_count += 1
+            continue
+
+        # Determine measurement method based on language
+        if is_all_english(stripped_memory):
+            length = len(stripped_memory.split())  # Word count for English
+        elif is_all_chinese(stripped_memory):
+            length = len(stripped_memory)  # Character count for Chinese
+        else:
+            logger.debug(f"Mixed-language memory, using character count: {stripped_memory[:50]}...")
+            length = len(stripped_memory)  # Default to character count
+
+        if length >= min_length_threshold:
+            filtered_memories.append(memory)
+        else:
+            removed_count += 1
+
+    if removed_count > 0:
+        logger.info(
+            f"Filtered out {removed_count} short memories "
+            f"(below {min_length_threshold} units). "
+            f"Total remaining: {len(filtered_memories)}"
+        )
+
+    return filtered_memories
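
A short sketch of how the new filter helpers compose (illustrative, not part of the release). Note that `filter_similar_memories` imports scikit-learn lazily, so scikit-learn must be installed when it is called:

```python
# Illustrative sketch, not from the package diff.
from memos.mem_scheduler.utils.filter_utils import (
    filter_similar_memories,
    filter_too_short_memories,
    transform_name_to_key,
)

print(transform_name_to_key("Hello, World!"))  # "hello_world"

memories = [
    "The user parked the car in garage B on level 2.",
    "The user parked the car in garage B, level 2!",  # near-duplicate
    "Too short.",
]
# TF-IDF cosine similarity; the near-duplicate is likely dropped at 0.75
deduped = filter_similar_memories(memories, similarity_threshold=0.75)
# All-English memories are measured in words, so "Too short." (2 words) is dropped
kept = filter_too_short_memories(deduped, min_length_threshold=5)
print(kept)
```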

--- /dev/null
+++ b/memos/mem_scheduler/utils/misc_utils.py
@@ -0,0 +1,61 @@
+import json
+
+from functools import wraps
+from pathlib import Path
+
+import yaml
+
+from memos.log import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def extract_json_dict(text: str):
+    text = text.strip()
+    patterns_to_remove = ["json```", "```python", "```json", "latex```", "```latex", "```"]
+    for pattern in patterns_to_remove:
+        text = text.replace(pattern, "")
+    res = json.loads(text.strip())
+    return res
+
+
+def parse_yaml(yaml_file: str | Path):
+    yaml_path = Path(yaml_file)
+    if not yaml_path.is_file():
+        raise FileNotFoundError(f"No such file: {yaml_file}")
+
+    with yaml_path.open("r", encoding="utf-8") as fr:
+        data = yaml.safe_load(fr)
+
+    return data
+
+
+def log_exceptions(logger=logger):
+    """
+    Exception-catching decorator that automatically logs errors (including stack traces)
+
+    Args:
+        logger: Optional logger object (default: module-level logger)
+
+    Example:
+        @log_exceptions()
+        def risky_function():
+            raise ValueError("Oops!")
+
+        @log_exceptions(logger=custom_logger)
+        def another_risky_function():
+            might_fail()
+    """
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Error in {func.__name__}: {e}", exc_info=True)
+
+        return wrapper
+
+    return decorator
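
Two behaviors of these helpers are easy to miss: `extract_json_dict` raises `json.JSONDecodeError` if the cleaned text is not valid JSON, and the `log_exceptions` wrapper logs but does not re-raise, so a decorated function returns `None` on failure. An illustrative sketch (not part of the release):

```python
# Illustrative sketch, not from the package diff.
from memos.mem_scheduler.utils.misc_utils import extract_json_dict, log_exceptions

# Strips markdown code fences before parsing; plain JSON passes through unchanged
print(extract_json_dict('{"answer": 42}'))  # {'answer': 42}

@log_exceptions()
def risky_division(x: float, y: float) -> float:
    return x / y

result = risky_division(1, 0)  # logs ZeroDivisionError with traceback
print(result)  # None: the wrapper swallows the exception
```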

--- /dev/null
+++ b/memos/mem_user/factory.py
@@ -0,0 +1,94 @@
+from typing import Any, ClassVar
+
+from memos.configs.mem_user import UserManagerConfigFactory
+from memos.mem_user.mysql_user_manager import MySQLUserManager
+from memos.mem_user.user_manager import UserManager
+
+
+class UserManagerFactory:
+    """Factory class for creating user manager instances."""
+
+    backend_to_class: ClassVar[dict[str, Any]] = {
+        "sqlite": UserManager,
+        "mysql": MySQLUserManager,
+    }
+
+    @classmethod
+    def from_config(
+        cls, config_factory: UserManagerConfigFactory
+    ) -> UserManager | MySQLUserManager:
+        """Create a user manager instance from configuration.
+
+        Args:
+            config_factory: Configuration factory containing backend and config
+
+        Returns:
+            User manager instance
+
+        Raises:
+            ValueError: If backend is not supported
+        """
+        backend = config_factory.backend
+        if backend not in cls.backend_to_class:
+            raise ValueError(f"Invalid user manager backend: {backend}")
+
+        user_manager_class = cls.backend_to_class[backend]
+        config = config_factory.config
+
+        # Use model_dump() to convert Pydantic model to dict and unpack as kwargs
+        return user_manager_class(**config.model_dump())
+
+    @classmethod
+    def create_sqlite(cls, db_path: str | None = None, user_id: str = "root") -> UserManager:
+        """Create SQLite user manager with default configuration.
+
+        Args:
+            db_path: Path to SQLite database file
+            user_id: Default user ID for initialization
+
+        Returns:
+            SQLite user manager instance
+        """
+        config_factory = UserManagerConfigFactory(
+            backend="sqlite", config={"db_path": db_path, "user_id": user_id}
+        )
+        return cls.from_config(config_factory)
+
+    @classmethod
+    def create_mysql(
+        cls,
+        user_id: str = "root",
+        host: str = "localhost",
+        port: int = 3306,
+        username: str = "root",
+        password: str = "",
+        database: str = "memos_users",
+        charset: str = "utf8mb4",
+    ) -> MySQLUserManager:
+        """Create MySQL user manager with specified configuration.
+
+        Args:
+            user_id: Default user ID for initialization
+            host: MySQL server host
+            port: MySQL server port
+            username: MySQL username
+            password: MySQL password
+            database: MySQL database name
+            charset: MySQL charset
+
+        Returns:
+            MySQL user manager instance
+        """
+        config_factory = UserManagerConfigFactory(
+            backend="mysql",
+            config={
+                "user_id": user_id,
+                "host": host,
+                "port": port,
+                "username": username,
+                "password": password,
+                "database": database,
+                "charset": charset,
+            },
+        )
+        return cls.from_config(config_factory)
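
A usage sketch for the new factory (illustrative, not part of the release; the database path is a placeholder, and `create_mysql` additionally needs a reachable MySQL server):

```python
# Illustrative sketch, not from the package diff.
from memos.configs.mem_user import UserManagerConfigFactory
from memos.mem_user.factory import UserManagerFactory

# Convenience constructor for the SQLite backend
sqlite_manager = UserManagerFactory.create_sqlite(db_path="/tmp/memos_users.db")

# Equivalent construction through the config factory
config_factory = UserManagerConfigFactory(
    backend="sqlite", config={"db_path": "/tmp/memos_users.db", "user_id": "root"}
)
same_kind_of_manager = UserManagerFactory.from_config(config_factory)
```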