MemoryOS 0.2.2-py3-none-any.whl → 1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MemoryOS might be problematic.
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/METADATA +6 -1
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/RECORD +61 -55
- memos/__init__.py +1 -1
- memos/api/config.py +6 -8
- memos/api/context/context.py +1 -1
- memos/api/context/dependencies.py +11 -0
- memos/configs/internet_retriever.py +13 -0
- memos/configs/mem_scheduler.py +38 -16
- memos/graph_dbs/base.py +30 -3
- memos/graph_dbs/nebular.py +442 -194
- memos/graph_dbs/neo4j.py +14 -5
- memos/log.py +5 -0
- memos/mem_os/core.py +19 -9
- memos/mem_os/main.py +1 -1
- memos/mem_os/product.py +6 -69
- memos/mem_os/utils/default_config.py +1 -1
- memos/mem_os/utils/format_utils.py +11 -47
- memos/mem_os/utils/reference_utils.py +133 -0
- memos/mem_scheduler/base_scheduler.py +58 -55
- memos/mem_scheduler/{modules → general_modules}/base.py +1 -2
- memos/mem_scheduler/{modules → general_modules}/dispatcher.py +54 -15
- memos/mem_scheduler/{modules → general_modules}/rabbitmq_service.py +4 -4
- memos/mem_scheduler/{modules → general_modules}/redis_service.py +1 -1
- memos/mem_scheduler/{modules → general_modules}/retriever.py +19 -5
- memos/mem_scheduler/{modules → general_modules}/scheduler_logger.py +10 -4
- memos/mem_scheduler/general_scheduler.py +110 -67
- memos/mem_scheduler/monitors/__init__.py +0 -0
- memos/mem_scheduler/monitors/dispatcher_monitor.py +305 -0
- memos/mem_scheduler/{modules/monitor.py → monitors/general_monitor.py} +57 -19
- memos/mem_scheduler/mos_for_test_scheduler.py +7 -1
- memos/mem_scheduler/schemas/general_schemas.py +3 -2
- memos/mem_scheduler/schemas/message_schemas.py +2 -1
- memos/mem_scheduler/schemas/monitor_schemas.py +10 -2
- memos/mem_scheduler/utils/misc_utils.py +43 -2
- memos/memories/activation/item.py +1 -1
- memos/memories/activation/kv.py +20 -8
- memos/memories/textual/base.py +1 -1
- memos/memories/textual/general.py +1 -1
- memos/memories/textual/tree_text_memory/organize/{conflict.py → handler.py} +30 -48
- memos/memories/textual/tree_text_memory/organize/manager.py +8 -96
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +2 -0
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +102 -140
- memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +229 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +9 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +15 -8
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +1 -1
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +177 -125
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +7 -2
- memos/memories/textual/tree_text_memory/retrieve/utils.py +1 -1
- memos/memos_tools/lockfree_dict.py +120 -0
- memos/memos_tools/thread_safe_dict.py +288 -0
- memos/templates/mem_reader_prompts.py +2 -0
- memos/templates/mem_scheduler_prompts.py +23 -10
- memos/templates/mos_prompts.py +40 -11
- memos/templates/tree_reorganize_prompts.py +24 -17
- memos/utils.py +19 -0
- memos/memories/textual/tree_text_memory/organize/redundancy.py +0 -193
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/LICENSE +0 -0
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/WHEEL +0 -0
- {memoryos-0.2.2.dist-info → memoryos-1.0.0.dist-info}/entry_points.txt +0 -0
- /memos/mem_scheduler/{modules → general_modules}/__init__.py +0 -0
- /memos/mem_scheduler/{modules → general_modules}/misc.py +0 -0
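Taken together, the 0.2.2 → 1.0.0 jump is largely a reorganization of the scheduler package: `mem_scheduler/modules` becomes `mem_scheduler/general_modules`, monitoring code moves into a new `monitors` package (`modules/monitor.py` → `monitors/general_monitor.py`, plus a brand-new `dispatcher_monitor.py`), the tree memory's `organize/conflict.py` is renamed to `handler.py` while `redundancy.py` is dropped, a Bocha search backend (`bochasearch.py`) joins the internet retrievers, and new `lockfree_dict.py`/`thread_safe_dict.py` utilities land in `memos_tools`. The hunks below cover `memos/mem_scheduler/general_scheduler.py` and the new dispatcher monitor.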
memos/mem_scheduler/general_scheduler.py

```diff
@@ -9,11 +9,13 @@ from memos.mem_scheduler.schemas.general_schemas import (
     ANSWER_LABEL,
     DEFAULT_MAX_QUERY_KEY_WORDS,
     QUERY_LABEL,
+    WORKING_MEMORY_TYPE,
     MemCubeID,
     UserID,
 )
 from memos.mem_scheduler.schemas.message_schemas import ScheduleMessageItem
 from memos.mem_scheduler.schemas.monitor_schemas import QueryMonitorItem
+from memos.mem_scheduler.utils.filter_utils import is_all_chinese, is_all_english
 from memos.memories.textual.tree import TextualMemoryItem, TreeTextMemory
```
```diff
@@ -35,11 +37,12 @@ class GeneralScheduler(BaseScheduler):
 
     # for evaluation
     def search_for_eval(
-        self,
+        self, query: str, user_id: UserID | str, top_k: int, scheduler_flag: bool = True
+    ) -> (list[str], bool):
+        self.monitor.register_query_monitor_if_not_exists(
+            user_id=user_id, mem_cube_id=self.current_mem_cube_id
+        )
+
         query_keywords = self.monitor.extract_query_keywords(query=query)
         logger.info(f'Extract keywords "{query_keywords}" from query "{query}"')
 
```
```diff
@@ -48,35 +51,61 @@ class GeneralScheduler(BaseScheduler):
             keywords=query_keywords,
             max_keywords=DEFAULT_MAX_QUERY_KEY_WORDS,
         )
-        self.monitor.query_monitors.
-        )
+        query_monitor = self.monitor.query_monitors[user_id][self.current_mem_cube_id]
+        query_monitor.put(item=item)
+        logger.debug(f"Queries in monitor are {query_monitor.get_queries_with_timesort()}.")
 
         queries = [query]
 
         # recall
-        logger.info(f"Processed {queries} and get {len(new_candidates)} new candidate memories.")
-
-        # rerank
-        new_order_working_memory = self.replace_working_memory(
-            user_id=user_id,
-            mem_cube_id=self.current_mem_cube_id,
-            mem_cube=self.current_mem_cube,
-            original_memory=cur_working_memory,
-            new_memory=new_candidates,
+        mem_cube = self.current_mem_cube
+        text_mem_base = mem_cube.text_mem
+
+        cur_working_memory: list[TextualMemoryItem] = text_mem_base.get_working_memory()
+        text_working_memory: list[str] = [w_m.memory for w_m in cur_working_memory]
+        intent_result = self.monitor.detect_intent(
+            q_list=queries, text_working_memory=text_working_memory
         )
-        new_order_working_memory = new_order_working_memory[:top_k]
-        logger.info(f"size of new_order_working_memory: {len(new_order_working_memory)}")
 
-
+        if not scheduler_flag:
+            return text_working_memory, intent_result["trigger_retrieval"]
+        else:
+            if intent_result["trigger_retrieval"]:
+                missing_evidences = intent_result["missing_evidences"]
+                num_evidence = len(missing_evidences)
+                k_per_evidence = max(1, top_k // max(1, num_evidence))
+                new_candidates = []
+                for item in missing_evidences:
+                    logger.info(f"missing_evidences: {item}")
+                    results: list[TextualMemoryItem] = self.retriever.search(
+                        query=item,
+                        mem_cube=mem_cube,
+                        top_k=k_per_evidence,
+                        method=self.search_method,
+                    )
+                    logger.info(
+                        f"search results for {missing_evidences}: {[one.memory for one in results]}"
+                    )
+                    new_candidates.extend(results)
+                print(
+                    f"missing_evidences: {missing_evidences} and get {len(new_candidates)} new candidate memories."
+                )
+            else:
+                new_candidates = []
+                print(f"intent_result: {intent_result}. not triggered")
+
+            # rerank
+            new_order_working_memory = self.replace_working_memory(
+                user_id=user_id,
+                mem_cube_id=self.current_mem_cube_id,
+                mem_cube=self.current_mem_cube,
+                original_memory=cur_working_memory,
+                new_memory=new_candidates,
+            )
+            new_order_working_memory = new_order_working_memory[:top_k]
+            logger.info(f"size of new_order_working_memory: {len(new_order_working_memory)}")
+
+            return [m.memory for m in new_order_working_memory], intent_result["trigger_retrieval"]
 
     def _query_message_consumer(self, messages: list[ScheduleMessageItem]) -> None:
         """
```
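The reworked `search_for_eval` is the most visible behavioral change: with `scheduler_flag=False` it now short-circuits and returns the current text working memory together with the intent monitor's `trigger_retrieval` verdict, while the `True` path runs the full recall → rerank → working-memory replacement pipeline. A minimal sketch of how an evaluation harness might exercise both paths (the harness function is hypothetical and assumes a `GeneralScheduler` whose `current_mem_cube`/`current_mem_cube_id` are already set):

```python
# Hypothetical evaluation harness; `scheduler` is a configured GeneralScheduler
# whose current_mem_cube / current_mem_cube_id have already been assigned.
def eval_query(scheduler, query: str, user_id: str, top_k: int = 10) -> list[str]:
    # Cheap baseline: peek at working memory and the intent verdict only.
    baseline, triggered = scheduler.search_for_eval(
        query=query, user_id=user_id, top_k=top_k, scheduler_flag=False
    )
    if not triggered:
        return baseline  # the monitor found no missing evidence to retrieve

    # Full path: retrieve missing evidence, rerank, rewrite working memory.
    reranked, _ = scheduler.search_for_eval(
        query=query, user_id=user_id, top_k=top_k, scheduler_flag=True
    )
    return reranked
```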
```diff
@@ -105,18 +134,42 @@ class GeneralScheduler(BaseScheduler):
 
         # update query monitors
         for msg in messages:
+            self.monitor.register_query_monitor_if_not_exists(
+                user_id=user_id, mem_cube_id=mem_cube_id
+            )
+
             query = msg.content
             query_keywords = self.monitor.extract_query_keywords(query=query)
             logger.info(f'Extract keywords "{query_keywords}" from query "{query}"')
 
+            if len(query_keywords) == 0:
+                stripped_query = query.strip()
+                # Determine measurement method based on language
+                if is_all_english(stripped_query):
+                    words = stripped_query.split()  # Word count for English
+                elif is_all_chinese(stripped_query):
+                    words = stripped_query  # Character count for Chinese
+                else:
+                    logger.debug(
+                        f"Mixed-language memory, using character count: {stripped_query[:50]}..."
+                    )
+                    words = stripped_query  # Default to character count
+
+                query_keywords = list(set(words[:20]))
+                logger.error(
+                    f"Keyword extraction failed for query. Using fallback keywords: {query_keywords[:10]}... (truncated)"
+                )
+
             item = QueryMonitorItem(
                 query_text=query,
                 keywords=query_keywords,
                 max_keywords=DEFAULT_MAX_QUERY_KEY_WORDS,
             )
+
+            self.monitor.query_monitors[user_id][mem_cube_id].put(item=item)
             logger.debug(
-                f"Queries in monitor are
+                f"Queries in monitor are "
+                f"{self.monitor.query_monitors[user_id][mem_cube_id].get_queries_with_timesort()}."
             )
 
         queries = [msg.content for msg in messages]
```
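New in `_query_message_consumer` is a fallback for empty keyword extraction: rather than registering a query with no keywords, the scheduler derives up to 20 pseudo-keywords from the raw query, splitting on whitespace for all-English text and falling back to individual characters for Chinese or mixed-language text. A standalone sketch of that rule (the two predicates below are stand-ins for `is_all_english`/`is_all_chinese` from `filter_utils`, whose implementations are not part of this diff):

```python
def fallback_keywords(query: str, limit: int = 20) -> list[str]:
    """Mirror of the diff's fallback: word units for English, characters otherwise."""
    stripped = query.strip()
    # Stand-ins for filter_utils.is_all_english / is_all_chinese (not shown here).
    is_english = all(c.isascii() for c in stripped)
    is_chinese = all("\u4e00" <= c <= "\u9fff" for c in stripped)
    if is_english:
        units = stripped.split()   # word units for English
    elif is_chinese:
        units = list(stripped)     # character units for Chinese
    else:
        units = list(stripped)     # mixed language: default to characters
    return list(set(units[:limit]))  # dedupe, like list(set(words[:20])) in the diff

print(fallback_keywords("tell me about my project deadlines"))
print(fallback_keywords("提醒我项目的截止日期"))
```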
```diff
@@ -143,6 +196,20 @@ class GeneralScheduler(BaseScheduler):
         )
         logger.info(f"size of new_order_working_memory: {len(new_order_working_memory)}")
 
+        # update activation memories
+        logger.info(
+            f"Activation memory update {'enabled' if self.enable_activation_memory else 'disabled'} "
+            f"(interval: {self.monitor.act_mem_update_interval}s)"
+        )
+        if self.enable_activation_memory:
+            self.update_activation_memory_periodically(
+                interval_seconds=self.monitor.act_mem_update_interval,
+                label=QUERY_LABEL,
+                user_id=user_id,
+                mem_cube_id=mem_cube_id,
+                mem_cube=messages[0].mem_cube,
+            )
+
     def _answer_message_consumer(self, messages: list[ScheduleMessageItem]) -> None:
         """
         Process and handle answer trigger messages from the queue.
```
```diff
@@ -165,26 +232,6 @@ class GeneralScheduler(BaseScheduler):
         # for status update
         self._set_current_context_from_message(msg=messages[0])
 
-        # update activation memories
-        if self.enable_act_memory_update:
-            if (
-                len(self.monitor.working_memory_monitors[user_id][mem_cube_id].memories)
-                == 0
-            ):
-                self.initialize_working_memory_monitors(
-                    user_id=user_id,
-                    mem_cube_id=mem_cube_id,
-                    mem_cube=messages[0].mem_cube,
-                )
-
-            self.update_activation_memory_periodically(
-                interval_seconds=self.monitor.act_mem_update_interval,
-                label=ANSWER_LABEL,
-                user_id=user_id,
-                mem_cube_id=mem_cube_id,
-                mem_cube=messages[0].mem_cube,
-            )
-
     def _add_message_consumer(self, messages: list[ScheduleMessageItem]) -> None:
         logger.info(f"Messages {messages} assigned to {ADD_LABEL} handler.")
         # Process the query in a session turn
```
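Together with the block added to `_query_message_consumer` above, this removal from the answer path (and the matching one in `_add_message_consumer` below) consolidates periodic activation-memory updates onto the query path alone, gated by the renamed `enable_activation_memory` flag (formerly `enable_act_memory_update`); the `initialize_working_memory_monitors` bootstrap disappears from the hot path with it.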
```diff
@@ -215,6 +262,9 @@ class GeneralScheduler(BaseScheduler):
                 mem_type = mem_item.metadata.memory_type
                 mem_content = mem_item.memory
 
+                if mem_type == WORKING_MEMORY_TYPE:
+                    continue
+
                 self.log_adding_memory(
                     memory=mem_content,
                     memory_type=mem_type,
```
```diff
@@ -224,15 +274,6 @@ class GeneralScheduler(BaseScheduler):
                     log_func_callback=self._submit_web_logs,
                 )
 
-            # update activation memories
-            if self.enable_act_memory_update:
-                self.update_activation_memory_periodically(
-                    interval_seconds=self.monitor.act_mem_update_interval,
-                    label=ADD_LABEL,
-                    user_id=user_id,
-                    mem_cube_id=mem_cube_id,
-                    mem_cube=messages[0].mem_cube,
-                )
         except Exception as e:
             logger.error(f"Error: {e}", exc_info=True)
 
```
```diff
@@ -289,18 +330,20 @@ class GeneralScheduler(BaseScheduler):
             new_candidates = []
             for item in missing_evidences:
                 logger.info(f"missing_evidences: {item}")
+                info = {
+                    "user_id": user_id,
+                    "session_id": "",
+                }
+
                 results: list[TextualMemoryItem] = self.retriever.search(
-                    query=item,
+                    query=item,
+                    mem_cube=mem_cube,
+                    top_k=k_per_evidence,
+                    method=self.search_method,
+                    info=info,
                 )
                 logger.info(
                     f"search results for {missing_evidences}: {[one.memory for one in results]}"
                 )
                 new_candidates.extend(results)
-
-        if len(new_candidates) == 0:
-            logger.warning(
-                f"As new_candidates is empty, new_candidates is set same to working_memory.\n"
-                f"time_trigger_flag: {time_trigger_flag}; intent_result: {intent_result}"
-            )
-            new_candidates = cur_working_memory
         return cur_working_memory, new_candidates
```
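The substantive change in this last hunk is that `self.retriever.search` now receives an `info` payload carrying the `user_id` and a (for now empty) `session_id`, and the old safety net that silently reused the current working memory when retrieval came back empty is removed, so an empty result now propagates an empty `new_candidates` list to the caller.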
memos/mem_scheduler/monitors/dispatcher_monitor.py (new file, +305 lines)

```python
import threading
import time

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from time import perf_counter

from memos.configs.mem_scheduler import BaseSchedulerConfig
from memos.log import get_logger
from memos.mem_scheduler.general_modules.base import BaseSchedulerModule
from memos.mem_scheduler.general_modules.dispatcher import SchedulerDispatcher


logger = get_logger(__name__)


class SchedulerDispatcherMonitor(BaseSchedulerModule):
    """Monitors and manages scheduling operations with LLM integration."""

    def __init__(self, config: BaseSchedulerConfig):
        super().__init__()
        self.config: BaseSchedulerConfig = config

        self.check_interval = self.config.get("dispatcher_monitor_check_interval", 60)
        self.max_failures = self.config.get("dispatcher_monitor_max_failures", 2)

        # Registry of monitored thread pools
        self._pools: dict[str, dict] = {}
        self._pool_lock = threading.Lock()

        # thread pool monitor
        self._monitor_thread: threading.Thread | None = None
        self._running = False
        self._restart_in_progress = False

        # modules with thread pool
        self.dispatcher: SchedulerDispatcher | None = None
        self.dispatcher_pool_name = "dispatcher"

    def initialize(self, dispatcher: SchedulerDispatcher):
        self.dispatcher = dispatcher
        self.register_pool(
            name=self.dispatcher_pool_name,
            executor=self.dispatcher.dispatcher_executor,
            max_workers=self.dispatcher.max_workers,
            restart_on_failure=True,
        )

    def register_pool(
        self,
        name: str,
        executor: ThreadPoolExecutor,
        max_workers: int,
        restart_on_failure: bool = True,
    ) -> bool:
        """
        Register a thread pool for monitoring.

        Args:
            name: Unique identifier for the pool
            executor: ThreadPoolExecutor instance to monitor
            max_workers: Expected maximum worker count
            restart_on_failure: Whether to restart if pool fails

        Returns:
            bool: True if registration succeeded, False if pool already registered
        """
        with self._pool_lock:
            if name in self._pools:
                logger.warning(f"Thread pool '{name}' is already registered")
                return False

            self._pools[name] = {
                "executor": executor,
                "max_workers": max_workers,
                "restart": restart_on_failure,
                "failure_count": 0,
                "last_active": datetime.utcnow(),
                "healthy": True,
            }
            logger.info(f"Registered thread pool '{name}' for monitoring")
            return True

    def unregister_pool(self, name: str) -> bool:
        """
        Remove a thread pool from monitoring.

        Args:
            name: Identifier of the pool to remove

        Returns:
            bool: True if removal succeeded, False if pool not found
        """
        with self._pool_lock:
            if name not in self._pools:
                logger.warning(f"Thread pool '{name}' not found in registry")
                return False

            del self._pools[name]
            logger.info(f"Unregistered thread pool '{name}'")
            return True

    def _monitor_loop(self) -> None:
        """Main monitoring loop that periodically checks all registered pools."""
        logger.info(f"Starting monitor loop with {self.check_interval} second interval")

        while self._running:
            time.sleep(self.check_interval)
            try:
                self._check_pools_health()
            except Exception as e:
                logger.error(f"Error during health check: {e!s}", exc_info=True)

        logger.debug("Monitor loop exiting")

    def start(self) -> bool:
        """
        Start the monitoring thread.

        Returns:
            bool: True if monitor started successfully, False if already running
        """
        if self._running:
            logger.warning("Dispatcher Monitor is already running")
            return False

        self._running = True
        self._monitor_thread = threading.Thread(
            target=self._monitor_loop, name="threadpool_monitor", daemon=True
        )
        self._monitor_thread.start()
        logger.info("Dispatcher Monitor monitor started")
        return True

    def stop(self) -> None:
        """
        Stop the monitoring thread and clean up all managed thread pools.
        Ensures proper shutdown of all monitored executors.
        """
        if not self._running:
            return

        # Stop the monitoring loop
        self._running = False
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=5)

        # Shutdown all registered pools
        with self._pool_lock:
            for name, pool_info in self._pools.items():
                executor = pool_info["executor"]
                if not executor._shutdown:  # pylint: disable=protected-access
                    try:
                        logger.info(f"Shutting down thread pool '{name}'")
                        executor.shutdown(wait=True, cancel_futures=True)
                        logger.info(f"Successfully shut down thread pool '{name}'")
                    except Exception as e:
                        logger.error(f"Error shutting down pool '{name}': {e!s}", exc_info=True)

            # Clear the pool registry
            self._pools.clear()

        logger.info("Thread pool monitor and all pools stopped")

    def _check_pools_health(self) -> None:
        """Check health of all registered thread pools."""
        for name, pool_info in list(self._pools.items()):
            is_healthy, reason = self._check_pool_health(
                pool_info=pool_info,
                stuck_max_interval=4,
            )
            logger.info(f"Pool '{name}'. is_healthy: {is_healthy}. pool_info: {pool_info}")
            with self._pool_lock:
                if is_healthy:
                    pool_info["failure_count"] = 0
                    pool_info["healthy"] = True
                    return
                else:
                    pool_info["failure_count"] += 1
                    pool_info["healthy"] = False
                    logger.warning(
                        f"Pool '{name}' unhealthy ({pool_info['failure_count']}/{self.max_failures}): {reason}"
                    )

            if (
                pool_info["failure_count"] >= self.max_failures
                and pool_info["restart"]
                and not self._restart_in_progress
            ):
                self._restart_pool(name, pool_info)

    def _check_pool_health(self, pool_info: dict, stuck_max_interval=4) -> tuple[bool, str]:
        """
        Check health of a single thread pool.

        Args:
            pool_info: Dictionary containing pool configuration

        Returns:
            Tuple: (is_healthy, reason) where reason explains failure if not healthy
        """
        executor = pool_info["executor"]

        # Check if executor is shutdown
        if executor._shutdown:  # pylint: disable=protected-access
            return False, "Executor is shutdown"

        # Check thread activity
        active_threads = sum(
            1
            for t in threading.enumerate()
            if t.name.startswith(executor._thread_name_prefix)  # pylint: disable=protected-access
        )

        # Check if no threads are active but should be
        if active_threads == 0 and pool_info["max_workers"] > 0:
            return False, "No active worker threads"

        # Check if threads are stuck (no activity for 2 intervals)
        time_delta = (datetime.utcnow() - pool_info["last_active"]).total_seconds()
        if time_delta >= self.check_interval * stuck_max_interval:
            return False, "No recent activity"

        # If we got here, pool appears healthy
        pool_info["last_active"] = datetime.utcnow()
        return True, ""

    def _restart_pool(self, name: str, pool_info: dict) -> None:
        """
        Attempt to restart a failed thread pool.

        Args:
            name: Name of the pool to restart
            pool_info: Dictionary containing pool configuration
        """
        if self._restart_in_progress:
            return

        self._restart_in_progress = True
        logger.warning(f"Attempting to restart thread pool '{name}'")

        try:
            old_executor = pool_info["executor"]
            self.dispatcher.shutdown()

            # Create new executor with same parameters
            new_executor = ThreadPoolExecutor(
                max_workers=pool_info["max_workers"],
                thread_name_prefix=self.dispatcher.thread_name_prefix,  # pylint: disable=protected-access
            )
            self.unregister_pool(name=self.dispatcher_pool_name)
            self.dispatcher.dispatcher_executor = new_executor
            self.register_pool(
                name=self.dispatcher_pool_name,
                executor=self.dispatcher.dispatcher_executor,
                max_workers=self.dispatcher.max_workers,
                restart_on_failure=True,
            )

            # Replace in registry
            start_time = perf_counter()
            with self._pool_lock:
                pool_info["executor"] = new_executor
                pool_info["failure_count"] = 0
                pool_info["healthy"] = True
                pool_info["last_active"] = datetime.utcnow()

            elapsed_time = perf_counter() - start_time
            if elapsed_time > 1:
                logger.warning(f"Long lock wait: {elapsed_time:.3f}s")

            # Shutdown old executor
            try:
                old_executor.shutdown(wait=False)
            except Exception as e:
                logger.error(f"Error shutting down old executor: {e!s}", exc_info=True)

            logger.info(f"Successfully restarted thread pool '{name}'")
        except Exception as e:
            logger.error(f"Failed to restart pool '{name}': {e!s}", exc_info=True)
        finally:
            self._restart_in_progress = False

    def get_status(self, name: str | None = None) -> dict:
        """
        Get status of monitored pools.

        Args:
            name: Optional specific pool name to check

        Returns:
            Dictionary of status information
        """
        with self._pool_lock:
            if name:
                return {name: self._pools.get(name, {}).copy()}
            return {k: v.copy() for k, v in self._pools.items()}

    def __enter__(self):
        """Context manager entry point."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit point."""
        self.stop()
```