MemoryOS 2.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. memoryos-2.0.3.dist-info/METADATA +418 -0
  2. memoryos-2.0.3.dist-info/RECORD +315 -0
  3. memoryos-2.0.3.dist-info/WHEEL +4 -0
  4. memoryos-2.0.3.dist-info/entry_points.txt +3 -0
  5. memoryos-2.0.3.dist-info/licenses/LICENSE +201 -0
  6. memos/__init__.py +20 -0
  7. memos/api/client.py +571 -0
  8. memos/api/config.py +1018 -0
  9. memos/api/context/dependencies.py +50 -0
  10. memos/api/exceptions.py +53 -0
  11. memos/api/handlers/__init__.py +62 -0
  12. memos/api/handlers/add_handler.py +158 -0
  13. memos/api/handlers/base_handler.py +194 -0
  14. memos/api/handlers/chat_handler.py +1401 -0
  15. memos/api/handlers/component_init.py +388 -0
  16. memos/api/handlers/config_builders.py +190 -0
  17. memos/api/handlers/feedback_handler.py +93 -0
  18. memos/api/handlers/formatters_handler.py +237 -0
  19. memos/api/handlers/memory_handler.py +316 -0
  20. memos/api/handlers/scheduler_handler.py +497 -0
  21. memos/api/handlers/search_handler.py +222 -0
  22. memos/api/handlers/suggestion_handler.py +117 -0
  23. memos/api/mcp_serve.py +614 -0
  24. memos/api/middleware/request_context.py +101 -0
  25. memos/api/product_api.py +38 -0
  26. memos/api/product_models.py +1206 -0
  27. memos/api/routers/__init__.py +1 -0
  28. memos/api/routers/product_router.py +477 -0
  29. memos/api/routers/server_router.py +394 -0
  30. memos/api/server_api.py +44 -0
  31. memos/api/start_api.py +433 -0
  32. memos/chunkers/__init__.py +4 -0
  33. memos/chunkers/base.py +24 -0
  34. memos/chunkers/charactertext_chunker.py +41 -0
  35. memos/chunkers/factory.py +24 -0
  36. memos/chunkers/markdown_chunker.py +62 -0
  37. memos/chunkers/sentence_chunker.py +54 -0
  38. memos/chunkers/simple_chunker.py +50 -0
  39. memos/cli.py +113 -0
  40. memos/configs/__init__.py +0 -0
  41. memos/configs/base.py +82 -0
  42. memos/configs/chunker.py +59 -0
  43. memos/configs/embedder.py +88 -0
  44. memos/configs/graph_db.py +236 -0
  45. memos/configs/internet_retriever.py +100 -0
  46. memos/configs/llm.py +151 -0
  47. memos/configs/mem_agent.py +54 -0
  48. memos/configs/mem_chat.py +81 -0
  49. memos/configs/mem_cube.py +105 -0
  50. memos/configs/mem_os.py +83 -0
  51. memos/configs/mem_reader.py +91 -0
  52. memos/configs/mem_scheduler.py +385 -0
  53. memos/configs/mem_user.py +70 -0
  54. memos/configs/memory.py +324 -0
  55. memos/configs/parser.py +38 -0
  56. memos/configs/reranker.py +18 -0
  57. memos/configs/utils.py +8 -0
  58. memos/configs/vec_db.py +80 -0
  59. memos/context/context.py +355 -0
  60. memos/dependency.py +52 -0
  61. memos/deprecation.py +262 -0
  62. memos/embedders/__init__.py +0 -0
  63. memos/embedders/ark.py +95 -0
  64. memos/embedders/base.py +106 -0
  65. memos/embedders/factory.py +29 -0
  66. memos/embedders/ollama.py +77 -0
  67. memos/embedders/sentence_transformer.py +49 -0
  68. memos/embedders/universal_api.py +51 -0
  69. memos/exceptions.py +30 -0
  70. memos/graph_dbs/__init__.py +0 -0
  71. memos/graph_dbs/base.py +274 -0
  72. memos/graph_dbs/factory.py +27 -0
  73. memos/graph_dbs/item.py +46 -0
  74. memos/graph_dbs/nebular.py +1794 -0
  75. memos/graph_dbs/neo4j.py +1942 -0
  76. memos/graph_dbs/neo4j_community.py +1058 -0
  77. memos/graph_dbs/polardb.py +5446 -0
  78. memos/hello_world.py +97 -0
  79. memos/llms/__init__.py +0 -0
  80. memos/llms/base.py +25 -0
  81. memos/llms/deepseek.py +13 -0
  82. memos/llms/factory.py +38 -0
  83. memos/llms/hf.py +443 -0
  84. memos/llms/hf_singleton.py +114 -0
  85. memos/llms/ollama.py +135 -0
  86. memos/llms/openai.py +222 -0
  87. memos/llms/openai_new.py +198 -0
  88. memos/llms/qwen.py +13 -0
  89. memos/llms/utils.py +14 -0
  90. memos/llms/vllm.py +218 -0
  91. memos/log.py +237 -0
  92. memos/mem_agent/base.py +19 -0
  93. memos/mem_agent/deepsearch_agent.py +391 -0
  94. memos/mem_agent/factory.py +36 -0
  95. memos/mem_chat/__init__.py +0 -0
  96. memos/mem_chat/base.py +30 -0
  97. memos/mem_chat/factory.py +21 -0
  98. memos/mem_chat/simple.py +200 -0
  99. memos/mem_cube/__init__.py +0 -0
  100. memos/mem_cube/base.py +30 -0
  101. memos/mem_cube/general.py +240 -0
  102. memos/mem_cube/navie.py +172 -0
  103. memos/mem_cube/utils.py +169 -0
  104. memos/mem_feedback/base.py +15 -0
  105. memos/mem_feedback/feedback.py +1192 -0
  106. memos/mem_feedback/simple_feedback.py +40 -0
  107. memos/mem_feedback/utils.py +230 -0
  108. memos/mem_os/client.py +5 -0
  109. memos/mem_os/core.py +1203 -0
  110. memos/mem_os/main.py +582 -0
  111. memos/mem_os/product.py +1608 -0
  112. memos/mem_os/product_server.py +455 -0
  113. memos/mem_os/utils/default_config.py +359 -0
  114. memos/mem_os/utils/format_utils.py +1403 -0
  115. memos/mem_os/utils/reference_utils.py +162 -0
  116. memos/mem_reader/__init__.py +0 -0
  117. memos/mem_reader/base.py +47 -0
  118. memos/mem_reader/factory.py +53 -0
  119. memos/mem_reader/memory.py +298 -0
  120. memos/mem_reader/multi_modal_struct.py +965 -0
  121. memos/mem_reader/read_multi_modal/__init__.py +43 -0
  122. memos/mem_reader/read_multi_modal/assistant_parser.py +311 -0
  123. memos/mem_reader/read_multi_modal/base.py +273 -0
  124. memos/mem_reader/read_multi_modal/file_content_parser.py +826 -0
  125. memos/mem_reader/read_multi_modal/image_parser.py +359 -0
  126. memos/mem_reader/read_multi_modal/multi_modal_parser.py +252 -0
  127. memos/mem_reader/read_multi_modal/string_parser.py +139 -0
  128. memos/mem_reader/read_multi_modal/system_parser.py +327 -0
  129. memos/mem_reader/read_multi_modal/text_content_parser.py +131 -0
  130. memos/mem_reader/read_multi_modal/tool_parser.py +210 -0
  131. memos/mem_reader/read_multi_modal/user_parser.py +218 -0
  132. memos/mem_reader/read_multi_modal/utils.py +358 -0
  133. memos/mem_reader/simple_struct.py +912 -0
  134. memos/mem_reader/strategy_struct.py +163 -0
  135. memos/mem_reader/utils.py +157 -0
  136. memos/mem_scheduler/__init__.py +0 -0
  137. memos/mem_scheduler/analyzer/__init__.py +0 -0
  138. memos/mem_scheduler/analyzer/api_analyzer.py +714 -0
  139. memos/mem_scheduler/analyzer/eval_analyzer.py +219 -0
  140. memos/mem_scheduler/analyzer/mos_for_test_scheduler.py +571 -0
  141. memos/mem_scheduler/analyzer/scheduler_for_eval.py +280 -0
  142. memos/mem_scheduler/base_scheduler.py +1319 -0
  143. memos/mem_scheduler/general_modules/__init__.py +0 -0
  144. memos/mem_scheduler/general_modules/api_misc.py +137 -0
  145. memos/mem_scheduler/general_modules/base.py +80 -0
  146. memos/mem_scheduler/general_modules/init_components_for_scheduler.py +425 -0
  147. memos/mem_scheduler/general_modules/misc.py +313 -0
  148. memos/mem_scheduler/general_modules/scheduler_logger.py +389 -0
  149. memos/mem_scheduler/general_modules/task_threads.py +315 -0
  150. memos/mem_scheduler/general_scheduler.py +1495 -0
  151. memos/mem_scheduler/memory_manage_modules/__init__.py +5 -0
  152. memos/mem_scheduler/memory_manage_modules/memory_filter.py +306 -0
  153. memos/mem_scheduler/memory_manage_modules/retriever.py +547 -0
  154. memos/mem_scheduler/monitors/__init__.py +0 -0
  155. memos/mem_scheduler/monitors/dispatcher_monitor.py +366 -0
  156. memos/mem_scheduler/monitors/general_monitor.py +394 -0
  157. memos/mem_scheduler/monitors/task_schedule_monitor.py +254 -0
  158. memos/mem_scheduler/optimized_scheduler.py +410 -0
  159. memos/mem_scheduler/orm_modules/__init__.py +0 -0
  160. memos/mem_scheduler/orm_modules/api_redis_model.py +518 -0
  161. memos/mem_scheduler/orm_modules/base_model.py +729 -0
  162. memos/mem_scheduler/orm_modules/monitor_models.py +261 -0
  163. memos/mem_scheduler/orm_modules/redis_model.py +699 -0
  164. memos/mem_scheduler/scheduler_factory.py +23 -0
  165. memos/mem_scheduler/schemas/__init__.py +0 -0
  166. memos/mem_scheduler/schemas/analyzer_schemas.py +52 -0
  167. memos/mem_scheduler/schemas/api_schemas.py +233 -0
  168. memos/mem_scheduler/schemas/general_schemas.py +55 -0
  169. memos/mem_scheduler/schemas/message_schemas.py +173 -0
  170. memos/mem_scheduler/schemas/monitor_schemas.py +406 -0
  171. memos/mem_scheduler/schemas/task_schemas.py +132 -0
  172. memos/mem_scheduler/task_schedule_modules/__init__.py +0 -0
  173. memos/mem_scheduler/task_schedule_modules/dispatcher.py +740 -0
  174. memos/mem_scheduler/task_schedule_modules/local_queue.py +247 -0
  175. memos/mem_scheduler/task_schedule_modules/orchestrator.py +74 -0
  176. memos/mem_scheduler/task_schedule_modules/redis_queue.py +1385 -0
  177. memos/mem_scheduler/task_schedule_modules/task_queue.py +162 -0
  178. memos/mem_scheduler/utils/__init__.py +0 -0
  179. memos/mem_scheduler/utils/api_utils.py +77 -0
  180. memos/mem_scheduler/utils/config_utils.py +100 -0
  181. memos/mem_scheduler/utils/db_utils.py +50 -0
  182. memos/mem_scheduler/utils/filter_utils.py +176 -0
  183. memos/mem_scheduler/utils/metrics.py +125 -0
  184. memos/mem_scheduler/utils/misc_utils.py +290 -0
  185. memos/mem_scheduler/utils/monitor_event_utils.py +67 -0
  186. memos/mem_scheduler/utils/status_tracker.py +229 -0
  187. memos/mem_scheduler/webservice_modules/__init__.py +0 -0
  188. memos/mem_scheduler/webservice_modules/rabbitmq_service.py +485 -0
  189. memos/mem_scheduler/webservice_modules/redis_service.py +380 -0
  190. memos/mem_user/factory.py +94 -0
  191. memos/mem_user/mysql_persistent_user_manager.py +271 -0
  192. memos/mem_user/mysql_user_manager.py +502 -0
  193. memos/mem_user/persistent_factory.py +98 -0
  194. memos/mem_user/persistent_user_manager.py +260 -0
  195. memos/mem_user/redis_persistent_user_manager.py +225 -0
  196. memos/mem_user/user_manager.py +488 -0
  197. memos/memories/__init__.py +0 -0
  198. memos/memories/activation/__init__.py +0 -0
  199. memos/memories/activation/base.py +42 -0
  200. memos/memories/activation/item.py +56 -0
  201. memos/memories/activation/kv.py +292 -0
  202. memos/memories/activation/vllmkv.py +219 -0
  203. memos/memories/base.py +19 -0
  204. memos/memories/factory.py +42 -0
  205. memos/memories/parametric/__init__.py +0 -0
  206. memos/memories/parametric/base.py +19 -0
  207. memos/memories/parametric/item.py +11 -0
  208. memos/memories/parametric/lora.py +41 -0
  209. memos/memories/textual/__init__.py +0 -0
  210. memos/memories/textual/base.py +92 -0
  211. memos/memories/textual/general.py +236 -0
  212. memos/memories/textual/item.py +304 -0
  213. memos/memories/textual/naive.py +187 -0
  214. memos/memories/textual/prefer_text_memory/__init__.py +0 -0
  215. memos/memories/textual/prefer_text_memory/adder.py +504 -0
  216. memos/memories/textual/prefer_text_memory/config.py +106 -0
  217. memos/memories/textual/prefer_text_memory/extractor.py +221 -0
  218. memos/memories/textual/prefer_text_memory/factory.py +85 -0
  219. memos/memories/textual/prefer_text_memory/retrievers.py +177 -0
  220. memos/memories/textual/prefer_text_memory/spliter.py +132 -0
  221. memos/memories/textual/prefer_text_memory/utils.py +93 -0
  222. memos/memories/textual/preference.py +344 -0
  223. memos/memories/textual/simple_preference.py +161 -0
  224. memos/memories/textual/simple_tree.py +69 -0
  225. memos/memories/textual/tree.py +459 -0
  226. memos/memories/textual/tree_text_memory/__init__.py +0 -0
  227. memos/memories/textual/tree_text_memory/organize/__init__.py +0 -0
  228. memos/memories/textual/tree_text_memory/organize/handler.py +184 -0
  229. memos/memories/textual/tree_text_memory/organize/manager.py +518 -0
  230. memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +238 -0
  231. memos/memories/textual/tree_text_memory/organize/reorganizer.py +622 -0
  232. memos/memories/textual/tree_text_memory/retrieve/__init__.py +0 -0
  233. memos/memories/textual/tree_text_memory/retrieve/advanced_searcher.py +364 -0
  234. memos/memories/textual/tree_text_memory/retrieve/bm25_util.py +186 -0
  235. memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +419 -0
  236. memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +270 -0
  237. memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +102 -0
  238. memos/memories/textual/tree_text_memory/retrieve/reasoner.py +61 -0
  239. memos/memories/textual/tree_text_memory/retrieve/recall.py +497 -0
  240. memos/memories/textual/tree_text_memory/retrieve/reranker.py +111 -0
  241. memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +16 -0
  242. memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py +472 -0
  243. memos/memories/textual/tree_text_memory/retrieve/searcher.py +848 -0
  244. memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +135 -0
  245. memos/memories/textual/tree_text_memory/retrieve/utils.py +54 -0
  246. memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +387 -0
  247. memos/memos_tools/dinding_report_bot.py +453 -0
  248. memos/memos_tools/lockfree_dict.py +120 -0
  249. memos/memos_tools/notification_service.py +44 -0
  250. memos/memos_tools/notification_utils.py +142 -0
  251. memos/memos_tools/singleton.py +174 -0
  252. memos/memos_tools/thread_safe_dict.py +310 -0
  253. memos/memos_tools/thread_safe_dict_segment.py +382 -0
  254. memos/multi_mem_cube/__init__.py +0 -0
  255. memos/multi_mem_cube/composite_cube.py +86 -0
  256. memos/multi_mem_cube/single_cube.py +874 -0
  257. memos/multi_mem_cube/views.py +54 -0
  258. memos/parsers/__init__.py +0 -0
  259. memos/parsers/base.py +15 -0
  260. memos/parsers/factory.py +21 -0
  261. memos/parsers/markitdown.py +28 -0
  262. memos/reranker/__init__.py +4 -0
  263. memos/reranker/base.py +25 -0
  264. memos/reranker/concat.py +103 -0
  265. memos/reranker/cosine_local.py +102 -0
  266. memos/reranker/factory.py +72 -0
  267. memos/reranker/http_bge.py +324 -0
  268. memos/reranker/http_bge_strategy.py +327 -0
  269. memos/reranker/noop.py +19 -0
  270. memos/reranker/strategies/__init__.py +4 -0
  271. memos/reranker/strategies/base.py +61 -0
  272. memos/reranker/strategies/concat_background.py +94 -0
  273. memos/reranker/strategies/concat_docsource.py +110 -0
  274. memos/reranker/strategies/dialogue_common.py +109 -0
  275. memos/reranker/strategies/factory.py +31 -0
  276. memos/reranker/strategies/single_turn.py +107 -0
  277. memos/reranker/strategies/singleturn_outmem.py +98 -0
  278. memos/settings.py +10 -0
  279. memos/templates/__init__.py +0 -0
  280. memos/templates/advanced_search_prompts.py +211 -0
  281. memos/templates/cloud_service_prompt.py +107 -0
  282. memos/templates/instruction_completion.py +66 -0
  283. memos/templates/mem_agent_prompts.py +85 -0
  284. memos/templates/mem_feedback_prompts.py +822 -0
  285. memos/templates/mem_reader_prompts.py +1096 -0
  286. memos/templates/mem_reader_strategy_prompts.py +238 -0
  287. memos/templates/mem_scheduler_prompts.py +626 -0
  288. memos/templates/mem_search_prompts.py +93 -0
  289. memos/templates/mos_prompts.py +403 -0
  290. memos/templates/prefer_complete_prompt.py +735 -0
  291. memos/templates/tool_mem_prompts.py +139 -0
  292. memos/templates/tree_reorganize_prompts.py +230 -0
  293. memos/types/__init__.py +34 -0
  294. memos/types/general_types.py +151 -0
  295. memos/types/openai_chat_completion_types/__init__.py +15 -0
  296. memos/types/openai_chat_completion_types/chat_completion_assistant_message_param.py +56 -0
  297. memos/types/openai_chat_completion_types/chat_completion_content_part_image_param.py +27 -0
  298. memos/types/openai_chat_completion_types/chat_completion_content_part_input_audio_param.py +23 -0
  299. memos/types/openai_chat_completion_types/chat_completion_content_part_param.py +43 -0
  300. memos/types/openai_chat_completion_types/chat_completion_content_part_refusal_param.py +16 -0
  301. memos/types/openai_chat_completion_types/chat_completion_content_part_text_param.py +16 -0
  302. memos/types/openai_chat_completion_types/chat_completion_message_custom_tool_call_param.py +27 -0
  303. memos/types/openai_chat_completion_types/chat_completion_message_function_tool_call_param.py +32 -0
  304. memos/types/openai_chat_completion_types/chat_completion_message_param.py +18 -0
  305. memos/types/openai_chat_completion_types/chat_completion_message_tool_call_union_param.py +15 -0
  306. memos/types/openai_chat_completion_types/chat_completion_system_message_param.py +36 -0
  307. memos/types/openai_chat_completion_types/chat_completion_tool_message_param.py +30 -0
  308. memos/types/openai_chat_completion_types/chat_completion_user_message_param.py +34 -0
  309. memos/utils.py +123 -0
  310. memos/vec_dbs/__init__.py +0 -0
  311. memos/vec_dbs/base.py +117 -0
  312. memos/vec_dbs/factory.py +23 -0
  313. memos/vec_dbs/item.py +50 -0
  314. memos/vec_dbs/milvus.py +654 -0
  315. memos/vec_dbs/qdrant.py +355 -0
@@ -0,0 +1,912 @@
1
+ import concurrent.futures
2
+ import copy
3
+ import json
4
+ import os
5
+ import traceback
6
+
7
+ from abc import ABC
8
+ from typing import TYPE_CHECKING, Any, TypeAlias
9
+
10
+ from tqdm import tqdm
11
+
12
+ from memos import log
13
+ from memos.chunkers import ChunkerFactory
14
+ from memos.configs.mem_reader import SimpleStructMemReaderConfig
15
+ from memos.context.context import ContextThreadPoolExecutor
16
+ from memos.embedders.factory import EmbedderFactory
17
+ from memos.llms.factory import LLMFactory
18
+ from memos.mem_reader.base import BaseMemReader
19
+
20
+
21
+ if TYPE_CHECKING:
22
+ from memos.graph_dbs.base import BaseGraphDB
23
+ from memos.mem_reader.read_multi_modal import coerce_scene_data, detect_lang
24
+ from memos.mem_reader.utils import (
25
+ count_tokens_text,
26
+ derive_key,
27
+ parse_json_result,
28
+ parse_keep_filter_response,
29
+ parse_rewritten_response,
30
+ )
31
+ from memos.memories.textual.item import (
32
+ SourceMessage,
33
+ TextualMemoryItem,
34
+ TreeNodeTextualMemoryMetadata,
35
+ )
36
+ from memos.templates.mem_reader_prompts import (
37
+ CUSTOM_TAGS_INSTRUCTION,
38
+ CUSTOM_TAGS_INSTRUCTION_ZH,
39
+ GENERAL_STRUCT_STRING_READER_PROMPT,
40
+ GENERAL_STRUCT_STRING_READER_PROMPT_ZH,
41
+ PROMPT_MAPPING,
42
+ SIMPLE_STRUCT_DOC_READER_PROMPT,
43
+ SIMPLE_STRUCT_DOC_READER_PROMPT_ZH,
44
+ SIMPLE_STRUCT_MEM_READER_EXAMPLE,
45
+ SIMPLE_STRUCT_MEM_READER_EXAMPLE_ZH,
46
+ SIMPLE_STRUCT_MEM_READER_PROMPT,
47
+ SIMPLE_STRUCT_MEM_READER_PROMPT_ZH,
48
+ )
49
+ from memos.types import MessagesType
50
+ from memos.types.openai_chat_completion_types import (
51
+ ChatCompletionAssistantMessageParam,
52
+ ChatCompletionContentPartTextParam,
53
+ ChatCompletionSystemMessageParam,
54
+ ChatCompletionToolMessageParam,
55
+ ChatCompletionUserMessageParam,
56
+ File,
57
+ )
58
+ from memos.utils import timed
59
+
60
+
61
class ParserFactory:
    """Placeholder factory required by the test suite; it builds nothing."""

    @staticmethod
    def from_config(_config):
        """Ignore ``_config`` and return ``None``."""
        return None
67
+
68
+
69
# Message-parameter types grouped into one tuple.
ChatMessageClasses = (
    ChatCompletionSystemMessageParam,
    ChatCompletionUserMessageParam,
    ChatCompletionAssistantMessageParam,
    ChatCompletionToolMessageParam,
)

# Content-part types grouped into one tuple.
RawContentClasses = (ChatCompletionContentPartTextParam, File)

MessageDict: TypeAlias = dict[str, Any]  # (Deprecated) not supported in the future
SceneDataInput: TypeAlias = (
    list[list[MessageDict]]  # (Deprecated) legacy chat example: scenes -> messages
    | list[str]  # (Deprecated) legacy doc example: list of paths / pure text
    | list[MessagesType]  # new: list of scenes (each scene is MessagesType)
)


logger = log.get_logger(__name__)

# Prompt templates keyed first by scenario, then by language code
# ("en" / "zh"); the chat scenario also carries the example text per language.
PROMPT_DICT = {
    "chat": {
        "en": SIMPLE_STRUCT_MEM_READER_PROMPT,
        "zh": SIMPLE_STRUCT_MEM_READER_PROMPT_ZH,
        "en_example": SIMPLE_STRUCT_MEM_READER_EXAMPLE,
        "zh_example": SIMPLE_STRUCT_MEM_READER_EXAMPLE_ZH,
    },
    "doc": {
        "en": SIMPLE_STRUCT_DOC_READER_PROMPT,
        "zh": SIMPLE_STRUCT_DOC_READER_PROMPT_ZH,
    },
    "general_string": {
        "en": GENERAL_STRUCT_STRING_READER_PROMPT,
        "zh": GENERAL_STRUCT_STRING_READER_PROMPT_ZH,
    },
    "custom_tags": {
        "en": CUSTOM_TAGS_INSTRUCTION,
        "zh": CUSTOM_TAGS_INSTRUCTION_ZH,
    },
}
100
+
101
+
102
def _build_node(idx, message, info, source_info, llm, parse_json_result, embedder):
    """Generate, parse and wrap one memory node; return ``None`` on any failure.

    The function runs three guarded stages: ``llm.generate(message)``,
    ``parse_json_result`` on the generated text, and construction of a
    ``TextualMemoryItem`` whose embedding comes from ``embedder.embed``.
    Each stage logs its own failure and returns ``None`` instead of raising.

    Args:
        idx: Not used inside this function.
        message: Input handed to ``llm.generate``.
        info: Mapping that is copied; ``user_id`` and ``session_id`` are popped
            from the copy and the remainder is stored as the metadata ``info``.
        source_info: Stored as the metadata ``sources``.
        llm: Object exposing ``generate``.
        parse_json_result: Callable applied to the generated text.
        embedder: Object exposing ``embed``; it is called with a one-item list.

    Returns:
        A ``TextualMemoryItem`` with memory type ``"LongTermMemory"``, or
        ``None`` when generation, parsing or node construction fails or
        produces an empty result.
    """
    # Stage 1: text generation.
    try:
        raw = llm.generate(message)
        if not raw:
            logger.warning(f"[LLM] Empty generation for input: {message}")
            return None
    except Exception as e:
        logger.error(f"[LLM] Exception during generation: {e}")
        return None

    # Stage 2: structured parsing of the generated text.
    try:
        parsed = parse_json_result(raw)
        if not parsed:
            logger.warning(f"[Parse] Failed to parse result: {raw}")
            return None
    except Exception as e:
        logger.error(f"[Parse] Exception during JSON parsing: {e}")
        return None

    # Stage 3: assemble the memory node.
    try:
        text = parsed.get("value", "").strip()
        if not text:
            logger.warning("[BuildNode] value is empty")
            return None

        candidate_tags = parsed.get("tags", [])
        # Anything that is not a list is discarded rather than coerced.
        tags = candidate_tags if isinstance(candidate_tags, list) else []

        key = parsed.get("key", None)
        embedding = embedder.embed([text])[0]

        # Work on a copy so the caller's ``info`` keeps its identifiers.
        extras = info.copy()
        user_id = extras.pop("user_id", "")
        session_id = extras.pop("session_id", "")

        metadata = TreeNodeTextualMemoryMetadata(
            user_id=user_id,
            session_id=session_id,
            memory_type="LongTermMemory",
            status="activated",
            tags=tags,
            key=key,
            embedding=embedding,
            usage=[],
            sources=source_info,
            background="",
            confidence=0.99,
            type="fact",
            info=extras,
        )
        return TextualMemoryItem(memory=text, metadata=metadata)
    except Exception as e:
        logger.error(f"[BuildNode] Error building node: {e}")
        return None
162
+
163
+
164
+ class SimpleStructMemReader(BaseMemReader, ABC):
165
+ """Naive implementation of MemReader."""
166
+
167
def __init__(self, config: SimpleStructMemReaderConfig):
    """
    Initialize the SimpleStructMemReader from its configuration.

    Args:
        config: Configuration object for the reader; its ``llm``, ``embedder``
            and ``chunker`` attributes are passed to the matching factories.
    """
    self.config = config

    # Components built from the corresponding configuration sections.
    self.llm = LLMFactory.from_config(config.llm)
    self.embedder = EmbedderFactory.from_config(config.embedder)
    self.chunker = ChunkerFactory.from_config(config.chunker)

    self.memory_max_length = 8000

    # Chat windows are sized in tokens; 1024 is used when the config does not
    # define ``chat_window_max_tokens``.
    self.chat_window_max_tokens = getattr(config, "chat_window_max_tokens", 1024)
    self._count_tokens = count_tokens_text

    self.searcher = None
    # graph_db starts as None and can be set later via set_graph_db for
    # recall operations.
    self.graph_db = None
186
+
187
def set_graph_db(self, graph_db: "BaseGraphDB | None") -> None:
    """Store ``graph_db`` on the reader; passing ``None`` clears it."""
    self.graph_db = graph_db
189
+
190
def _make_memory_item(
    self,
    value: str,
    info: dict,
    memory_type: str,
    tags: list[str] | None = None,
    key: str | None = None,
    sources: list | None = None,
    background: str = "",
    type_: str = "fact",
    confidence: float = 0.99,
    **kwargs,
) -> TextualMemoryItem:
    """Build a ``TextualMemoryItem`` for ``value`` with tree-node metadata.

    Args:
        value: Memory text; it is also the text that gets embedded.
        info: Mapping that is copied; ``user_id`` and ``session_id`` are popped
            from the copy (defaulting to ``""``) and the rest becomes the
            metadata ``info``.
        memory_type: Stored as the metadata ``memory_type``.
        tags: Metadata tags; a falsy value becomes ``[]``.
        key: Metadata key; when ``None`` it is computed with ``derive_key(value)``.
        sources: Metadata sources; a falsy value becomes ``[]``.
        background: Stored as the metadata ``background``.
        type_: Stored as the metadata ``type``.
        confidence: Stored as the metadata ``confidence``.
        **kwargs: Forwarded unchanged to ``TreeNodeTextualMemoryMetadata``.

    Returns:
        The constructed memory item, with status ``"activated"`` and an empty
        ``usage`` list.
    """
    # Copy first so the caller's dict keeps its identifiers.
    extras = info.copy()
    owner_id = extras.pop("user_id", "")
    conversation_id = extras.pop("session_id", "")

    metadata = TreeNodeTextualMemoryMetadata(
        user_id=owner_id,
        session_id=conversation_id,
        memory_type=memory_type,
        status="activated",
        tags=tags or [],
        key=derive_key(value) if key is None else key,
        embedding=self.embedder.embed([value])[0],
        usage=[],
        sources=sources or [],
        background=background,
        confidence=confidence,
        type=type_,
        info=extras,
        **kwargs,
    )
    return TextualMemoryItem(memory=value, metadata=metadata)
226
+
227
+ def _safe_generate(self, messages: list[dict]) -> str | None:
228
+ try:
229
+ return self.llm.generate(messages)
230
+ except Exception:
231
+ logger.exception("[LLM] Generation failed")
232
+ return None
233
+
234
+ def _safe_parse(self, text: str | None) -> dict | None:
235
+ if not text:
236
+ return None
237
+ try:
238
+ return parse_json_result(text)
239
+ except Exception:
240
+ logger.warning("[LLM] JSON parse failed")
241
+ return None
242
+
243
def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict:
    """Ask the LLM to extract memories from ``mem_str`` and return the parsed dict.

    The chat prompt for the detected language is filled with the conversation
    text and, when ``custom_tags`` is truthy, the custom-tags instruction. The
    example section is removed when ``self.config.remove_prompt_example`` is set.

    Args:
        mem_str: Conversation text to extract memories from.
        custom_tags: Optional tags rendered into the custom-tags instruction.

    Returns:
        The parsed LLM response when generation and parsing both succeed.
        Otherwise a fallback dict holding a single ``UserMemory`` entry whose
        value is ``mem_str`` itself, under the ``"memory list"`` key, plus a
        ``"summary"`` equal to ``mem_str``.
    """
    lang = detect_lang(mem_str)
    template = PROMPT_DICT["chat"][lang]
    examples = PROMPT_DICT["chat"][f"{lang}_example"]
    prompt = template.replace("${conversation}", mem_str)

    custom_tags_prompt = (
        PROMPT_DICT["custom_tags"][lang].replace("{custom_tags}", str(custom_tags))
        if custom_tags
        else ""
    )
    prompt = prompt.replace("${custom_tags_prompt}", custom_tags_prompt)

    if self.config.remove_prompt_example:
        prompt = prompt.replace(examples, "")
    messages = [{"role": "user", "content": prompt}]

    response_text = self._safe_generate(messages)
    response_json = self._safe_parse(response_text)
    if response_json:
        return response_json

    # Fallback: keep the raw text as one memory. The key must be
    # "memory list" (with a space) because that is what the consumers of this
    # method read; with "memory_list" the fallback entry was silently dropped.
    return {
        "memory list": [
            {
                "key": mem_str[:10],
                "memory_type": "UserMemory",
                "value": mem_str,
                "tags": [],
            }
        ],
        "summary": mem_str,
    }
277
+
278
+ def _iter_chat_windows(self, scene_data_info, max_tokens=None, overlap=200):
279
+ """
280
+ use token counter to get a slide window generator
281
+ """
282
+ max_tokens = max_tokens or self.chat_window_max_tokens
283
+ buf, sources, start_idx = [], [], 0
284
+ cur_text = ""
285
+ for idx, item in enumerate(scene_data_info):
286
+ role = item.get("role", "")
287
+ content = item.get("content", "")
288
+ chat_time = item.get("chat_time", None)
289
+ parts = []
290
+ if role and str(role).lower() != "mix":
291
+ parts.append(f"{role}: ")
292
+ if chat_time:
293
+ parts.append(f"[{chat_time}]: ")
294
+ prefix = "".join(parts)
295
+ line = f"{prefix}{content}\n"
296
+
297
+ if self._count_tokens(cur_text + line) > max_tokens and cur_text:
298
+ text = "".join(buf)
299
+ yield {"text": text, "sources": sources.copy(), "start_idx": start_idx}
300
+ while buf and self._count_tokens("".join(buf)) > overlap:
301
+ buf.pop(0)
302
+ sources.pop(0)
303
+ start_idx = idx
304
+ cur_text = "".join(buf)
305
+
306
+ buf.append(line)
307
+ sources.append(
308
+ {
309
+ "type": "chat",
310
+ "index": idx,
311
+ "role": role,
312
+ "chat_time": chat_time,
313
+ "content": content,
314
+ }
315
+ )
316
+ cur_text = "".join(buf)
317
+
318
+ if buf:
319
+ yield {"text": "".join(buf), "sources": sources.copy(), "start_idx": start_idx}
320
+
321
@timed
def _process_chat_data(self, scene_data_info, info, **kwargs):
    """Convert chat messages into memory items, window by window.

    Args:
        scene_data_info: Chat messages, split into windows by
            ``_iter_chat_windows``.
        info: Metadata mapping passed to ``_make_memory_item``. Its
            ``custom_tags`` entry, if present, is popped from this dict (the
            caller's object is modified) and is only used in fine mode.
        **kwargs: ``mode`` selects ``"fast"`` or fine processing (any other
            value, default ``"fine"``).

    Returns:
        A list of memory items. In fast mode there is at most one item per
        window, in window order, each tagged ``"mode:fast"``. In fine mode
        there is one item per entry of the LLM response's ``"memory list"``.
    """
    mode = kwargs.get("mode", "fine")
    windows = list(self._iter_chat_windows(scene_data_info))
    # must pop here, avoid add to info, only used in sync fine mode
    custom_tags = info.pop("custom_tags", None)

    if mode != "fast":
        logger.debug("Using unified Fine Mode")
        fine_nodes = []
        for window in windows:
            response = self._get_llm_response(window["text"], custom_tags)
            for entry in response.get("memory list", []):
                try:
                    # Map the Chinese labels onto the canonical type names.
                    memory_type = (
                        entry.get("memory_type", "LongTermMemory")
                        .replace("长期记忆", "LongTermMemory")
                        .replace("用户记忆", "UserMemory")
                    )
                    fine_nodes.append(
                        self._make_memory_item(
                            value=entry.get("value", ""),
                            info=info,
                            memory_type=memory_type,
                            tags=entry.get("tags", []),
                            key=entry.get("key", ""),
                            sources=window["sources"],
                            background=response.get("summary", ""),
                        )
                    )
                except Exception as e:
                    logger.error(f"[ChatFine] parse error: {e}")
        return fine_nodes

    logger.debug("Using unified Fast Mode")

    def _build_fast_node(w):
        # A window whose only role is "user" becomes UserMemory.
        seen_roles = {s.get("role", "") for s in w["sources"] if s.get("role")}
        mem_type = "UserMemory" if seen_roles == {"user"} else "LongTermMemory"
        return self._make_memory_item(
            value=w["text"],
            info=info,
            memory_type=mem_type,
            tags=["mode:fast"],
            sources=w["sources"],
        )

    with ContextThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(_build_fast_node, w): pos for pos, w in enumerate(windows)}
        # Slots keep window order even though futures complete out of order.
        ordered = [None] * len(pending)
        for finished in concurrent.futures.as_completed(pending):
            pos = pending[finished]
            try:
                node = finished.result()
                if node:
                    ordered[pos] = node
            except Exception as e:
                logger.error(f"[ChatFast] error: {e}")
    return [node for node in ordered if node]
379
+
380
def _process_transfer_chat_data(
    self, raw_node: TextualMemoryItem, custom_tags: list[str] | None = None, **kwargs
):
    """Re-extract memories from an existing node's text via the LLM.

    Args:
        raw_node: Source node; its ``memory`` text is sent to the LLM and its
            metadata (``info``, ``user_id``, ``session_id``, ``sources``) is
            carried over to every produced item.
        custom_tags: Optional tags forwarded to ``_get_llm_response``.
        **kwargs: Accepted but not used here.

    Returns:
        A list with one memory item per usable entry of the response's
        ``"memory list"``; entries that fail are logged and skipped.
    """
    response_json = self._get_llm_response(raw_node.memory, custom_tags)

    produced = []
    for entry in response_json.get("memory list", []):
        try:
            # Map the Chinese labels, then fall back to LongTermMemory for
            # anything outside the two supported types.
            memory_type = (
                entry.get("memory_type", "LongTermMemory")
                .replace("长期记忆", "LongTermMemory")
                .replace("用户记忆", "UserMemory")
            )
            if memory_type not in ["LongTermMemory", "UserMemory"]:
                memory_type = "LongTermMemory"

            inherited_info = {
                **(raw_node.metadata.info or {}),
                "user_id": raw_node.metadata.user_id,
                "session_id": raw_node.metadata.session_id,
            }
            entry_tags = entry.get("tags", [])
            if not isinstance(entry_tags, list):
                entry_tags = []

            produced.append(
                self._make_memory_item(
                    value=entry.get("value", ""),
                    info=inherited_info,
                    memory_type=memory_type,
                    tags=entry_tags,
                    key=entry.get("key", ""),
                    sources=raw_node.metadata.sources,
                    background=response_json.get("summary", ""),
                    type_="fact",
                    confidence=0.99,
                )
            )
        except Exception as e:
            logger.error(f"[ChatReader] Error parsing memory item: {e}")

    return produced
418
+
419
def get_memory(
    self,
    scene_data: SceneDataInput,
    type: str,
    info: dict[str, Any],
    mode: str = "fine",
    user_name: str | None = None,
) -> list[list[TextualMemoryItem]]:
    """
    Validate the inputs, normalize ``scene_data`` and read memories from it.

    Args:
        scene_data: List of dialogue information or document paths.
        type: (Deprecated) not supported in the future. Type of scene_data:
            ['doc', 'chat']. Passed to ``coerce_scene_data`` and ``_read_memory``.
        info: Dictionary containing ``user_id`` and ``session_id``, both
            strings, e.g. {"user_id": "1111", "session_id": "2222"}.
        mode: mem-reader mode, forwarded to ``_read_memory``; "fast" for quick
            processing, "fine" for better understanding via calling the LLM.
        user_name: Forwarded to ``_read_memory``; the user_name would be
            inserted later into the database and may be used in recall.
    Returns:
        Whatever ``_read_memory`` returns for the normalized scene data,
        annotated as list[list[TextualMemoryItem]].
    Raises:
        ValueError: If scene_data is empty, info is not a dict, a required
            field is missing, or a required field is not a string.
    """
    if not scene_data:
        raise ValueError("scene_data is empty")

    # Validate info dictionary format
    if not isinstance(info, dict):
        raise ValueError("info must be a dictionary")

    required_fields = {"user_id", "session_id"}
    missing_fields = required_fields - set(info)
    if missing_fields:
        raise ValueError(f"info dictionary is missing required fields: {missing_fields}")

    if any(not isinstance(info[field], str) for field in required_fields):
        raise ValueError("user_id and session_id must be strings")

    # Backward compatibility: after coercion only the standard scene_data
    # type (MessagesType) has to be handled downstream.
    normalized = coerce_scene_data(scene_data, type)
    return self._read_memory(normalized, type, info, mode, user_name=user_name)
470
+
471
def rewrite_memories(
    self, messages: list[dict], memory_list: list[TextualMemoryItem], user_only: bool = True
) -> list[TextualMemoryItem]:
    """
    Ask the LLM to rewrite extracted memories against the source messages.

    Args:
        messages: Chat messages (dicts with ``role`` / ``content``).
        memory_list: Memory items whose ``memory`` text may be rewritten.
            Items are modified in place when a rewrite is applied.
        user_only: When True, assistant messages are excluded from the prompt
            and the ``rewrite_user_only`` template is used; otherwise the
            ``rewrite`` template is used with all messages.

    Returns:
        The rewritten list when the LLM answer is parsed successfully,
        otherwise ``memory_list`` unchanged. ``memory_list`` is also returned
        directly when there are too few messages (fewer than 1 after filtering
        in user-only mode, fewer than 2 otherwise) or when the LLM call or
        parsing raises.
    """
    if user_only:
        template_key = "rewrite_user_only"
        filtered_messages = [m for m in messages if m.get("role") != "assistant"]
        min_messages = 1
    else:
        template_key = "rewrite"
        filtered_messages = messages
        min_messages = 2

    # Nothing to compare the memories against: skip the LLM call entirely.
    if len(filtered_messages) < min_messages:
        return memory_list

    template = PROMPT_MAPPING[template_key]

    # Use .get so a message without role/content cannot raise KeyError here,
    # outside the try block below.
    prompt_args = {
        "messages_inline": "\n".join(
            f"- [{message.get('role', '')}]: {message.get('content', '')}"
            for message in filtered_messages
        ),
        "memories_inline": json.dumps(
            {idx: mem.memory for idx, mem in enumerate(memory_list)},
            ensure_ascii=False,
            indent=2,
        ),
    }
    prompt = template.format(**prompt_args)

    # Run the rewrite filter and parse the output
    try:
        raw = self.llm.generate([{"role": "user", "content": prompt}])
        success, parsed = parse_rewritten_response(raw)
        logger.info(
            f"[rewrite_memories] Rewrite filter parsed successfully: {success};prompt: {prompt}"
        )
        if success:
            logger.info(f"Rewrite filter result: {parsed}")

            # NOTE(review): only indices present in `parsed` are returned, in
            # the order `parsed` yields them; memories the LLM did not mention
            # are dropped. Confirm this is intended.
            new_memory_list = []
            for mem_idx, content in parsed.items():
                if mem_idx < 0 or mem_idx >= len(memory_list):
                    logger.warning(
                        f"[rewrite_memories] Invalid memory index {mem_idx} for memory_list {len(memory_list)}, skipping."
                    )
                    continue

                need_rewrite = content.get("need_rewrite", False)
                rewritten_text = content.get("rewritten", "")
                reason = content.get("reason", "")
                original_text = memory_list[mem_idx].memory

                # Replace memory text with rewritten content when rewrite is needed
                if need_rewrite and isinstance(rewritten_text, str):
                    logger.info(
                        f"[rewrite_memories] index={mem_idx}, need_rewrite={need_rewrite}, rewritten='{rewritten_text}', reason='{reason}', original memory='{original_text}', action='replace_text'"
                    )
                    # A blank rewrite removes the memory from the result.
                    if len(rewritten_text.strip()) != 0:
                        memory_list[mem_idx].memory = rewritten_text
                        new_memory_list.append(memory_list[mem_idx])
                else:
                    new_memory_list.append(memory_list[mem_idx])
            return new_memory_list
        else:
            logger.warning("Rewrite filter parsing failed or returned empty result.")
    except Exception as e:
        # exc_info attaches the exception traceback; stack_info would only
        # record the stack of this logging call.
        logger.error(f"Rewrite filter execution error: {e}", exc_info=True)

    return memory_list
538
+
539
def filter_hallucination_in_memories(
    self, messages: list[dict], memory_list: list[TextualMemoryItem]
) -> list[TextualMemoryItem]:
    """
    Ask the LLM which extracted memories are supported by the messages.

    Args:
        messages: Chat messages (dicts with ``role`` / ``content``).
        memory_list: Candidate memory items; only their ``memory`` text is
            sent to the LLM.

    Returns:
        The memories the filter decided to keep. A memory without a verdict
        is kept. ``memory_list`` is returned unchanged when fewer than two
        messages are given, when the answer cannot be parsed, or when the LLM
        call or parsing raises.
    """
    # Too little conversation to judge against: skip the LLM call entirely.
    if len(messages) < 2:
        return memory_list

    template = PROMPT_MAPPING["hallucination_filter"]

    # Use .get so a message without role/content cannot raise KeyError here,
    # outside the try block below.
    prompt_args = {
        "messages_inline": "\n".join(
            f"- [{message.get('role', '')}]: {message.get('content', '')}"
            for message in messages
        ),
        "memories_inline": json.dumps(
            {idx: mem.memory for idx, mem in enumerate(memory_list)},
            ensure_ascii=False,
            indent=2,
        ),
    }
    prompt = template.format(**prompt_args)

    # Run the filter and parse the output
    try:
        raw = self.llm.generate([{"role": "user", "content": prompt}])
        success, parsed = parse_keep_filter_response(raw)
        logger.info(
            f"[filter_hallucination_in_memories] Hallucination filter parsed successfully: {success};prompt: {prompt}"
        )
        if success:
            logger.info(f"Hallucination filter result: {parsed}")

            filtered_list = []
            for mem_idx, mem in enumerate(memory_list):
                content = parsed.get(mem_idx)
                if not content:
                    logger.warning(f"No verdict for memory {mem_idx}, keeping it.")
                    filtered_list.append(mem)
                    continue

                keep = content.get("keep", True)
                reason = content.get("reason", "")

                if keep:
                    filtered_list.append(mem)
                else:
                    logger.info(
                        f"[filter_hallucination_in_memories] Dropping memory index={mem_idx}, reason='{reason}', memory='{mem.memory}'"
                    )

            return filtered_list
        else:
            logger.warning("Hallucination filter parsing failed or returned empty result.")
    except Exception as e:
        # exc_info attaches the exception traceback; stack_info would only
        # record the stack of this logging call.
        logger.error(f"Hallucination filter execution error: {e}", exc_info=True)

    return memory_list
593
+
594
def _read_memory(
    self,
    messages: list[MessagesType],
    type: str,
    info: dict[str, Any],
    mode: str = "fine",
    **kwargs,
) -> list[list[TextualMemoryItem]]:
    """
    Split the scenes into work items, process them concurrently and
    optionally run the hallucination filter over every resulting group.

    Expected ``messages`` layouts:

    1. raw file:
        [
            [
                {"type": "file", "file": "str"}
            ],
            [
                {"type": "file", "file": "str"}
            ],...
        ]
    2. text chat:
        scene_data = [
            [ {role: user, ...}, {role: assistant, ...}, ... ],
            [ {role: user, ...}, {role: assistant, ...}, ... ],
            [ ... ]
        ]

    Args:
        messages: Normalized scenes (see above).
        type: 'chat' selects ``_process_chat_data``; any other value selects
            ``_process_doc_data``.
        info: Passed to every processing task.
        mode: Passed to every processing task as ``mode=``.
        **kwargs: Accepted for call compatibility (``get_memory`` passes
            ``user_name``); not used by this method.

    Returns:
        One entry per successfully processed work item, in the order the
        work items were produced by ``get_scene_data_info``. Failed tasks are
        logged and skipped.
    """
    list_scene_data_info = self.get_scene_data_info(messages, type)

    # Anything that is not "chat" goes through the doc pipeline.
    processing_func = self._process_chat_data if type == "chat" else self._process_doc_data

    memory_list = []
    # Process work items concurrently with context propagation
    with ContextThreadPoolExecutor() as executor:
        futures = [
            executor.submit(processing_func, scene_data_info, info, mode=mode)
            for scene_data_info in list_scene_data_info
        ]
        # Collect in submission order so the returned groups follow the input
        # order instead of whichever task happens to finish first.
        for future in futures:
            try:
                res_memory = future.result()
                if res_memory is not None:
                    memory_list.append(res_memory)
            except Exception as e:
                logger.error(f"Task failed with exception: {e}")
                logger.error(traceback.format_exc())

    if os.getenv("SIMPLE_STRUCT_ADD_FILTER", "false") == "true":
        # Build inputs. Only dict messages from list-shaped scenes are used:
        # extending with a str scene would add its individual characters.
        combined_messages = []
        for group_messages in messages:
            if not isinstance(group_messages, list):
                continue
            combined_messages.extend(
                message for message in group_messages if isinstance(message, dict)
            )

        for group_id, memory_group in enumerate(memory_list):
            try:
                # Filter a deep copy so the stored group is only replaced when
                # the filter actually changed something.
                original_memory_group = copy.deepcopy(memory_group)
                serialized_origin_memories = json.dumps(
                    [one.memory for one in original_memory_group], indent=2
                )
                revised_memory_list = self.filter_hallucination_in_memories(
                    messages=combined_messages,
                    memory_list=original_memory_group,
                )
                serialized_revised_memories = json.dumps(
                    [one.memory for one in revised_memory_list], indent=2
                )
                if serialized_origin_memories != serialized_revised_memories:
                    memory_list[group_id] = revised_memory_list
                    logger.info(
                        f"[SIMPLE_STRUCT_ADD_FILTER] Modified the list for group_id={group_id}: "
                        f"\noriginal={serialized_origin_memories},"
                        f"\nrevised={serialized_revised_memories}"
                    )

            except Exception as e:
                group_serialized = [
                    one.memory if hasattr(one, "memory") else str(one)
                    for one in memory_group
                ]
                logger.error(
                    f"There is an exception while filtering group_id={group_id}: {e}\n"
                    f"messages: {combined_messages}\n"
                    f"memory_list(serialized): {group_serialized}",
                    exc_info=True,
                )
    return memory_list
683
+
684
def fine_transfer_simple_mem(
    self,
    input_memories: list[TextualMemoryItem],
    type: str,
    custom_tags: list[str] | None = None,
    **kwargs,
) -> list[list[TextualMemoryItem]]:
    """
    Run the transfer processing function over every input memory concurrently.

    Args:
        input_memories: Memory items to process; an empty value returns [].
        type: 'chat' selects ``_process_transfer_chat_data``; any other value
            selects ``_process_transfer_doc_data``.
        custom_tags: Passed positionally to the processing function.
        **kwargs: Forwarded to the processing function.

    Returns:
        The non-None task results, in the order of ``input_memories``. Failed
        tasks are logged and skipped.
    """
    if not input_memories:
        return []

    # Anything that is not "chat" goes through the doc pipeline.
    if type == "chat":
        processing_func = self._process_transfer_chat_data
    else:
        processing_func = self._process_transfer_doc_data

    memory_list = []
    # Process items concurrently with context propagation
    with ContextThreadPoolExecutor() as executor:
        futures = [
            executor.submit(processing_func, scene_data_info, custom_tags, **kwargs)
            for scene_data_info in input_memories
        ]
        # Collect in submission order so the output follows the input order
        # instead of whichever task happens to finish first.
        for future in futures:
            try:
                res_memory = future.result()
                if res_memory is not None:
                    memory_list.append(res_memory)
            except Exception as e:
                logger.error(f"Task failed with exception: {e}")
                logger.error(traceback.format_exc())
    return memory_list
718
+
719
def get_scene_data_info(self, scene_data: list, type: str) -> list[list[Any]]:
    """
    Convert normalized MessagesType scenes into the work items this reader
    can handle.

    For chat scenes:
    - skip scenes that are not lists (e.g. `str` scenes)
    - drop non-dict messages
    - keep only roles in {user, assistant, system} (role is stripped and
      lower-cased first)
    - drop messages whose `content` is not a `str` or is empty
    - keep only `role`, `content` and `chat_time` of each message
    - split each scene into windows of at most 10 messages, where every
      window after the first starts with a copy of the previous window's
      last 2 messages

    For doc scenes `scene_data` is returned as is; doc handling is done in
    `_process_doc_data`.

    Any other `type` yields an empty list.
    NOTE(review): `_read_memory` routes unknown types to the doc pipeline,
    but with an empty list here nothing is processed - confirm intent.
    """
    if type == "doc":
        return scene_data

    results: list[list[Any]] = []
    if type != "chat":
        return results

    allowed_roles = {"user", "assistant", "system"}
    for items in scene_data:
        if isinstance(items, str):
            logger.warning(
                "SimpleStruct MemReader does not support "
                "str message data now, your messages "
                f"contains {items}, skipping"
            )
            continue
        if not isinstance(items, list):
            logger.warning(
                "SimpleStruct MemReader expects message as "
                f"list[dict], your messages contains "
                f"{items}, skipping"
            )
            continue

        # Filter the messages of this scene
        result = []
        for item in items:
            if not isinstance(item, dict):
                logger.warning(
                    "SimpleStruct MemReader expects message as "
                    f"list[dict], your messages contains "
                    f"{item}, skipping"
                )
                continue

            role = item.get("role") or ""
            role = role if isinstance(role, str) else str(role)
            role = role.strip().lower()
            if role not in allowed_roles:
                logger.warning(
                    f"SimpleStruct MemReader expects message with "
                    f"role in {allowed_roles}, your messages contains "
                    f"role {role}, skipping"
                )
                continue

            content = item.get("content", "")
            if not isinstance(content, str):
                logger.warning(
                    f"SimpleStruct MemReader expects message content "
                    f"with str, your messages content "
                    f"is {content!s}, skipping"
                )
                continue
            if not content:
                continue

            result.append(
                {
                    "role": role,
                    "content": content,
                    "chat_time": item.get("chat_time", ""),
                }
            )
        if not result:
            continue

        window = []
        for i, item in enumerate(result):
            window.append(item)
            if len(window) >= 10:
                results.append(window)
                # Seed the next window with the last two messages, unless
                # this was the final message of the scene.
                window = copy.deepcopy(window[-2:]) if i + 1 < len(result) else []

        if window:
            results.append(window)
    return results
805
+
806
+ def _process_doc_data(self, scene_data_info, info, **kwargs):
807
+ """
808
+ Process doc data after being normalized to new RawMessageList format.
809
+
810
+ scene_data_info format (length always == 1):
811
+ [
812
+ {"type": "file", "file": {"filename": "...", "file_data": "..."}}
813
+ ]
814
+ OR
815
+ [
816
+ {"type": "text", "text": "..."}
817
+ ]
818
+
819
+ Behavior:
820
+ - Merge all text/file_data into a single "full text"
821
+ - Chunk the text
822
+ - Build prompts
823
+ - Send to LLM
824
+ - Parse results and build memory nodes
825
+ """
826
+ mode = kwargs.get("mode", "fine")
827
+ if mode == "fast":
828
+ raise NotImplementedError
829
+
830
+ custom_tags = info.pop("custom_tags", None)
831
+
832
+ if not scene_data_info or len(scene_data_info) != 1:
833
+ logger.error(
834
+ "[DocReader] scene_data_info must contain exactly 1 item after normalization"
835
+ )
836
+ return []
837
+
838
+ item = scene_data_info[0]
839
+ text_content = ""
840
+ source_info_list = []
841
+
842
+ # Determine content and source metadata
843
+ if item.get("type") == "file":
844
+ f = item["file"]
845
+ filename = f.get("filename") or "document"
846
+ file_data = f.get("file_data") or ""
847
+
848
+ text_content = file_data
849
+ source_dict = {
850
+ "type": "doc",
851
+ "doc_path": filename,
852
+ }
853
+ source_info_list = [SourceMessage(**source_dict)]
854
+
855
+ elif item.get("type") == "text":
856
+ text_content = item.get("text", "")
857
+ source_info_list = [SourceMessage(type="doc", doc_path="inline-text")]
858
+
859
+ text_content = (text_content or "").strip()
860
+ if not text_content:
861
+ logger.warning("[DocReader] Empty document text after normalization.")
862
+ return []
863
+
864
+ chunks = self.chunker.chunk(text_content)
865
+ messages = []
866
+ for chunk in chunks:
867
+ lang = detect_lang(chunk.text)
868
+ template = PROMPT_DICT["doc"][lang]
869
+ prompt = template.replace("{chunk_text}", chunk.text)
870
+ custom_tags_prompt = (
871
+ PROMPT_DICT["custom_tags"][lang].replace("{custom_tags}", str(custom_tags))
872
+ if custom_tags
873
+ else ""
874
+ )
875
+ prompt = prompt.replace("{custom_tags_prompt}", custom_tags_prompt)
876
+ message = [{"role": "user", "content": prompt}]
877
+ messages.append(message)
878
+
879
+ doc_nodes = []
880
+
881
+ with ContextThreadPoolExecutor(max_workers=50) as executor:
882
+ futures = {
883
+ executor.submit(
884
+ _build_node,
885
+ idx,
886
+ msg,
887
+ info,
888
+ source_info_list,
889
+ self.llm,
890
+ parse_json_result,
891
+ self.embedder,
892
+ ): idx
893
+ for idx, msg in enumerate(messages)
894
+ }
895
+ total = len(futures)
896
+
897
+ for future in tqdm(
898
+ concurrent.futures.as_completed(futures), total=total, desc="Processing"
899
+ ):
900
+ try:
901
+ node = future.result()
902
+ if node:
903
+ doc_nodes.append(node)
904
+ except Exception as e:
905
+ tqdm.write(f"[ERROR] {e}")
906
+ logger.error(f"[DocReader] Future task failed: {e}")
907
+ return doc_nodes
908
+
909
+ def _process_transfer_doc_data(
910
+ self, raw_node: TextualMemoryItem, custom_tags: list[str] | None = None, **kwargs
911
+ ):
912
+ raise NotImplementedError