MemoryOS 2.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. memoryos-2.0.3.dist-info/METADATA +418 -0
  2. memoryos-2.0.3.dist-info/RECORD +315 -0
  3. memoryos-2.0.3.dist-info/WHEEL +4 -0
  4. memoryos-2.0.3.dist-info/entry_points.txt +3 -0
  5. memoryos-2.0.3.dist-info/licenses/LICENSE +201 -0
  6. memos/__init__.py +20 -0
  7. memos/api/client.py +571 -0
  8. memos/api/config.py +1018 -0
  9. memos/api/context/dependencies.py +50 -0
  10. memos/api/exceptions.py +53 -0
  11. memos/api/handlers/__init__.py +62 -0
  12. memos/api/handlers/add_handler.py +158 -0
  13. memos/api/handlers/base_handler.py +194 -0
  14. memos/api/handlers/chat_handler.py +1401 -0
  15. memos/api/handlers/component_init.py +388 -0
  16. memos/api/handlers/config_builders.py +190 -0
  17. memos/api/handlers/feedback_handler.py +93 -0
  18. memos/api/handlers/formatters_handler.py +237 -0
  19. memos/api/handlers/memory_handler.py +316 -0
  20. memos/api/handlers/scheduler_handler.py +497 -0
  21. memos/api/handlers/search_handler.py +222 -0
  22. memos/api/handlers/suggestion_handler.py +117 -0
  23. memos/api/mcp_serve.py +614 -0
  24. memos/api/middleware/request_context.py +101 -0
  25. memos/api/product_api.py +38 -0
  26. memos/api/product_models.py +1206 -0
  27. memos/api/routers/__init__.py +1 -0
  28. memos/api/routers/product_router.py +477 -0
  29. memos/api/routers/server_router.py +394 -0
  30. memos/api/server_api.py +44 -0
  31. memos/api/start_api.py +433 -0
  32. memos/chunkers/__init__.py +4 -0
  33. memos/chunkers/base.py +24 -0
  34. memos/chunkers/charactertext_chunker.py +41 -0
  35. memos/chunkers/factory.py +24 -0
  36. memos/chunkers/markdown_chunker.py +62 -0
  37. memos/chunkers/sentence_chunker.py +54 -0
  38. memos/chunkers/simple_chunker.py +50 -0
  39. memos/cli.py +113 -0
  40. memos/configs/__init__.py +0 -0
  41. memos/configs/base.py +82 -0
  42. memos/configs/chunker.py +59 -0
  43. memos/configs/embedder.py +88 -0
  44. memos/configs/graph_db.py +236 -0
  45. memos/configs/internet_retriever.py +100 -0
  46. memos/configs/llm.py +151 -0
  47. memos/configs/mem_agent.py +54 -0
  48. memos/configs/mem_chat.py +81 -0
  49. memos/configs/mem_cube.py +105 -0
  50. memos/configs/mem_os.py +83 -0
  51. memos/configs/mem_reader.py +91 -0
  52. memos/configs/mem_scheduler.py +385 -0
  53. memos/configs/mem_user.py +70 -0
  54. memos/configs/memory.py +324 -0
  55. memos/configs/parser.py +38 -0
  56. memos/configs/reranker.py +18 -0
  57. memos/configs/utils.py +8 -0
  58. memos/configs/vec_db.py +80 -0
  59. memos/context/context.py +355 -0
  60. memos/dependency.py +52 -0
  61. memos/deprecation.py +262 -0
  62. memos/embedders/__init__.py +0 -0
  63. memos/embedders/ark.py +95 -0
  64. memos/embedders/base.py +106 -0
  65. memos/embedders/factory.py +29 -0
  66. memos/embedders/ollama.py +77 -0
  67. memos/embedders/sentence_transformer.py +49 -0
  68. memos/embedders/universal_api.py +51 -0
  69. memos/exceptions.py +30 -0
  70. memos/graph_dbs/__init__.py +0 -0
  71. memos/graph_dbs/base.py +274 -0
  72. memos/graph_dbs/factory.py +27 -0
  73. memos/graph_dbs/item.py +46 -0
  74. memos/graph_dbs/nebular.py +1794 -0
  75. memos/graph_dbs/neo4j.py +1942 -0
  76. memos/graph_dbs/neo4j_community.py +1058 -0
  77. memos/graph_dbs/polardb.py +5446 -0
  78. memos/hello_world.py +97 -0
  79. memos/llms/__init__.py +0 -0
  80. memos/llms/base.py +25 -0
  81. memos/llms/deepseek.py +13 -0
  82. memos/llms/factory.py +38 -0
  83. memos/llms/hf.py +443 -0
  84. memos/llms/hf_singleton.py +114 -0
  85. memos/llms/ollama.py +135 -0
  86. memos/llms/openai.py +222 -0
  87. memos/llms/openai_new.py +198 -0
  88. memos/llms/qwen.py +13 -0
  89. memos/llms/utils.py +14 -0
  90. memos/llms/vllm.py +218 -0
  91. memos/log.py +237 -0
  92. memos/mem_agent/base.py +19 -0
  93. memos/mem_agent/deepsearch_agent.py +391 -0
  94. memos/mem_agent/factory.py +36 -0
  95. memos/mem_chat/__init__.py +0 -0
  96. memos/mem_chat/base.py +30 -0
  97. memos/mem_chat/factory.py +21 -0
  98. memos/mem_chat/simple.py +200 -0
  99. memos/mem_cube/__init__.py +0 -0
  100. memos/mem_cube/base.py +30 -0
  101. memos/mem_cube/general.py +240 -0
  102. memos/mem_cube/navie.py +172 -0
  103. memos/mem_cube/utils.py +169 -0
  104. memos/mem_feedback/base.py +15 -0
  105. memos/mem_feedback/feedback.py +1192 -0
  106. memos/mem_feedback/simple_feedback.py +40 -0
  107. memos/mem_feedback/utils.py +230 -0
  108. memos/mem_os/client.py +5 -0
  109. memos/mem_os/core.py +1203 -0
  110. memos/mem_os/main.py +582 -0
  111. memos/mem_os/product.py +1608 -0
  112. memos/mem_os/product_server.py +455 -0
  113. memos/mem_os/utils/default_config.py +359 -0
  114. memos/mem_os/utils/format_utils.py +1403 -0
  115. memos/mem_os/utils/reference_utils.py +162 -0
  116. memos/mem_reader/__init__.py +0 -0
  117. memos/mem_reader/base.py +47 -0
  118. memos/mem_reader/factory.py +53 -0
  119. memos/mem_reader/memory.py +298 -0
  120. memos/mem_reader/multi_modal_struct.py +965 -0
  121. memos/mem_reader/read_multi_modal/__init__.py +43 -0
  122. memos/mem_reader/read_multi_modal/assistant_parser.py +311 -0
  123. memos/mem_reader/read_multi_modal/base.py +273 -0
  124. memos/mem_reader/read_multi_modal/file_content_parser.py +826 -0
  125. memos/mem_reader/read_multi_modal/image_parser.py +359 -0
  126. memos/mem_reader/read_multi_modal/multi_modal_parser.py +252 -0
  127. memos/mem_reader/read_multi_modal/string_parser.py +139 -0
  128. memos/mem_reader/read_multi_modal/system_parser.py +327 -0
  129. memos/mem_reader/read_multi_modal/text_content_parser.py +131 -0
  130. memos/mem_reader/read_multi_modal/tool_parser.py +210 -0
  131. memos/mem_reader/read_multi_modal/user_parser.py +218 -0
  132. memos/mem_reader/read_multi_modal/utils.py +358 -0
  133. memos/mem_reader/simple_struct.py +912 -0
  134. memos/mem_reader/strategy_struct.py +163 -0
  135. memos/mem_reader/utils.py +157 -0
  136. memos/mem_scheduler/__init__.py +0 -0
  137. memos/mem_scheduler/analyzer/__init__.py +0 -0
  138. memos/mem_scheduler/analyzer/api_analyzer.py +714 -0
  139. memos/mem_scheduler/analyzer/eval_analyzer.py +219 -0
  140. memos/mem_scheduler/analyzer/mos_for_test_scheduler.py +571 -0
  141. memos/mem_scheduler/analyzer/scheduler_for_eval.py +280 -0
  142. memos/mem_scheduler/base_scheduler.py +1319 -0
  143. memos/mem_scheduler/general_modules/__init__.py +0 -0
  144. memos/mem_scheduler/general_modules/api_misc.py +137 -0
  145. memos/mem_scheduler/general_modules/base.py +80 -0
  146. memos/mem_scheduler/general_modules/init_components_for_scheduler.py +425 -0
  147. memos/mem_scheduler/general_modules/misc.py +313 -0
  148. memos/mem_scheduler/general_modules/scheduler_logger.py +389 -0
  149. memos/mem_scheduler/general_modules/task_threads.py +315 -0
  150. memos/mem_scheduler/general_scheduler.py +1495 -0
  151. memos/mem_scheduler/memory_manage_modules/__init__.py +5 -0
  152. memos/mem_scheduler/memory_manage_modules/memory_filter.py +306 -0
  153. memos/mem_scheduler/memory_manage_modules/retriever.py +547 -0
  154. memos/mem_scheduler/monitors/__init__.py +0 -0
  155. memos/mem_scheduler/monitors/dispatcher_monitor.py +366 -0
  156. memos/mem_scheduler/monitors/general_monitor.py +394 -0
  157. memos/mem_scheduler/monitors/task_schedule_monitor.py +254 -0
  158. memos/mem_scheduler/optimized_scheduler.py +410 -0
  159. memos/mem_scheduler/orm_modules/__init__.py +0 -0
  160. memos/mem_scheduler/orm_modules/api_redis_model.py +518 -0
  161. memos/mem_scheduler/orm_modules/base_model.py +729 -0
  162. memos/mem_scheduler/orm_modules/monitor_models.py +261 -0
  163. memos/mem_scheduler/orm_modules/redis_model.py +699 -0
  164. memos/mem_scheduler/scheduler_factory.py +23 -0
  165. memos/mem_scheduler/schemas/__init__.py +0 -0
  166. memos/mem_scheduler/schemas/analyzer_schemas.py +52 -0
  167. memos/mem_scheduler/schemas/api_schemas.py +233 -0
  168. memos/mem_scheduler/schemas/general_schemas.py +55 -0
  169. memos/mem_scheduler/schemas/message_schemas.py +173 -0
  170. memos/mem_scheduler/schemas/monitor_schemas.py +406 -0
  171. memos/mem_scheduler/schemas/task_schemas.py +132 -0
  172. memos/mem_scheduler/task_schedule_modules/__init__.py +0 -0
  173. memos/mem_scheduler/task_schedule_modules/dispatcher.py +740 -0
  174. memos/mem_scheduler/task_schedule_modules/local_queue.py +247 -0
  175. memos/mem_scheduler/task_schedule_modules/orchestrator.py +74 -0
  176. memos/mem_scheduler/task_schedule_modules/redis_queue.py +1385 -0
  177. memos/mem_scheduler/task_schedule_modules/task_queue.py +162 -0
  178. memos/mem_scheduler/utils/__init__.py +0 -0
  179. memos/mem_scheduler/utils/api_utils.py +77 -0
  180. memos/mem_scheduler/utils/config_utils.py +100 -0
  181. memos/mem_scheduler/utils/db_utils.py +50 -0
  182. memos/mem_scheduler/utils/filter_utils.py +176 -0
  183. memos/mem_scheduler/utils/metrics.py +125 -0
  184. memos/mem_scheduler/utils/misc_utils.py +290 -0
  185. memos/mem_scheduler/utils/monitor_event_utils.py +67 -0
  186. memos/mem_scheduler/utils/status_tracker.py +229 -0
  187. memos/mem_scheduler/webservice_modules/__init__.py +0 -0
  188. memos/mem_scheduler/webservice_modules/rabbitmq_service.py +485 -0
  189. memos/mem_scheduler/webservice_modules/redis_service.py +380 -0
  190. memos/mem_user/factory.py +94 -0
  191. memos/mem_user/mysql_persistent_user_manager.py +271 -0
  192. memos/mem_user/mysql_user_manager.py +502 -0
  193. memos/mem_user/persistent_factory.py +98 -0
  194. memos/mem_user/persistent_user_manager.py +260 -0
  195. memos/mem_user/redis_persistent_user_manager.py +225 -0
  196. memos/mem_user/user_manager.py +488 -0
  197. memos/memories/__init__.py +0 -0
  198. memos/memories/activation/__init__.py +0 -0
  199. memos/memories/activation/base.py +42 -0
  200. memos/memories/activation/item.py +56 -0
  201. memos/memories/activation/kv.py +292 -0
  202. memos/memories/activation/vllmkv.py +219 -0
  203. memos/memories/base.py +19 -0
  204. memos/memories/factory.py +42 -0
  205. memos/memories/parametric/__init__.py +0 -0
  206. memos/memories/parametric/base.py +19 -0
  207. memos/memories/parametric/item.py +11 -0
  208. memos/memories/parametric/lora.py +41 -0
  209. memos/memories/textual/__init__.py +0 -0
  210. memos/memories/textual/base.py +92 -0
  211. memos/memories/textual/general.py +236 -0
  212. memos/memories/textual/item.py +304 -0
  213. memos/memories/textual/naive.py +187 -0
  214. memos/memories/textual/prefer_text_memory/__init__.py +0 -0
  215. memos/memories/textual/prefer_text_memory/adder.py +504 -0
  216. memos/memories/textual/prefer_text_memory/config.py +106 -0
  217. memos/memories/textual/prefer_text_memory/extractor.py +221 -0
  218. memos/memories/textual/prefer_text_memory/factory.py +85 -0
  219. memos/memories/textual/prefer_text_memory/retrievers.py +177 -0
  220. memos/memories/textual/prefer_text_memory/spliter.py +132 -0
  221. memos/memories/textual/prefer_text_memory/utils.py +93 -0
  222. memos/memories/textual/preference.py +344 -0
  223. memos/memories/textual/simple_preference.py +161 -0
  224. memos/memories/textual/simple_tree.py +69 -0
  225. memos/memories/textual/tree.py +459 -0
  226. memos/memories/textual/tree_text_memory/__init__.py +0 -0
  227. memos/memories/textual/tree_text_memory/organize/__init__.py +0 -0
  228. memos/memories/textual/tree_text_memory/organize/handler.py +184 -0
  229. memos/memories/textual/tree_text_memory/organize/manager.py +518 -0
  230. memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +238 -0
  231. memos/memories/textual/tree_text_memory/organize/reorganizer.py +622 -0
  232. memos/memories/textual/tree_text_memory/retrieve/__init__.py +0 -0
  233. memos/memories/textual/tree_text_memory/retrieve/advanced_searcher.py +364 -0
  234. memos/memories/textual/tree_text_memory/retrieve/bm25_util.py +186 -0
  235. memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +419 -0
  236. memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +270 -0
  237. memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +102 -0
  238. memos/memories/textual/tree_text_memory/retrieve/reasoner.py +61 -0
  239. memos/memories/textual/tree_text_memory/retrieve/recall.py +497 -0
  240. memos/memories/textual/tree_text_memory/retrieve/reranker.py +111 -0
  241. memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +16 -0
  242. memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py +472 -0
  243. memos/memories/textual/tree_text_memory/retrieve/searcher.py +848 -0
  244. memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +135 -0
  245. memos/memories/textual/tree_text_memory/retrieve/utils.py +54 -0
  246. memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +387 -0
  247. memos/memos_tools/dinding_report_bot.py +453 -0
  248. memos/memos_tools/lockfree_dict.py +120 -0
  249. memos/memos_tools/notification_service.py +44 -0
  250. memos/memos_tools/notification_utils.py +142 -0
  251. memos/memos_tools/singleton.py +174 -0
  252. memos/memos_tools/thread_safe_dict.py +310 -0
  253. memos/memos_tools/thread_safe_dict_segment.py +382 -0
  254. memos/multi_mem_cube/__init__.py +0 -0
  255. memos/multi_mem_cube/composite_cube.py +86 -0
  256. memos/multi_mem_cube/single_cube.py +874 -0
  257. memos/multi_mem_cube/views.py +54 -0
  258. memos/parsers/__init__.py +0 -0
  259. memos/parsers/base.py +15 -0
  260. memos/parsers/factory.py +21 -0
  261. memos/parsers/markitdown.py +28 -0
  262. memos/reranker/__init__.py +4 -0
  263. memos/reranker/base.py +25 -0
  264. memos/reranker/concat.py +103 -0
  265. memos/reranker/cosine_local.py +102 -0
  266. memos/reranker/factory.py +72 -0
  267. memos/reranker/http_bge.py +324 -0
  268. memos/reranker/http_bge_strategy.py +327 -0
  269. memos/reranker/noop.py +19 -0
  270. memos/reranker/strategies/__init__.py +4 -0
  271. memos/reranker/strategies/base.py +61 -0
  272. memos/reranker/strategies/concat_background.py +94 -0
  273. memos/reranker/strategies/concat_docsource.py +110 -0
  274. memos/reranker/strategies/dialogue_common.py +109 -0
  275. memos/reranker/strategies/factory.py +31 -0
  276. memos/reranker/strategies/single_turn.py +107 -0
  277. memos/reranker/strategies/singleturn_outmem.py +98 -0
  278. memos/settings.py +10 -0
  279. memos/templates/__init__.py +0 -0
  280. memos/templates/advanced_search_prompts.py +211 -0
  281. memos/templates/cloud_service_prompt.py +107 -0
  282. memos/templates/instruction_completion.py +66 -0
  283. memos/templates/mem_agent_prompts.py +85 -0
  284. memos/templates/mem_feedback_prompts.py +822 -0
  285. memos/templates/mem_reader_prompts.py +1096 -0
  286. memos/templates/mem_reader_strategy_prompts.py +238 -0
  287. memos/templates/mem_scheduler_prompts.py +626 -0
  288. memos/templates/mem_search_prompts.py +93 -0
  289. memos/templates/mos_prompts.py +403 -0
  290. memos/templates/prefer_complete_prompt.py +735 -0
  291. memos/templates/tool_mem_prompts.py +139 -0
  292. memos/templates/tree_reorganize_prompts.py +230 -0
  293. memos/types/__init__.py +34 -0
  294. memos/types/general_types.py +151 -0
  295. memos/types/openai_chat_completion_types/__init__.py +15 -0
  296. memos/types/openai_chat_completion_types/chat_completion_assistant_message_param.py +56 -0
  297. memos/types/openai_chat_completion_types/chat_completion_content_part_image_param.py +27 -0
  298. memos/types/openai_chat_completion_types/chat_completion_content_part_input_audio_param.py +23 -0
  299. memos/types/openai_chat_completion_types/chat_completion_content_part_param.py +43 -0
  300. memos/types/openai_chat_completion_types/chat_completion_content_part_refusal_param.py +16 -0
  301. memos/types/openai_chat_completion_types/chat_completion_content_part_text_param.py +16 -0
  302. memos/types/openai_chat_completion_types/chat_completion_message_custom_tool_call_param.py +27 -0
  303. memos/types/openai_chat_completion_types/chat_completion_message_function_tool_call_param.py +32 -0
  304. memos/types/openai_chat_completion_types/chat_completion_message_param.py +18 -0
  305. memos/types/openai_chat_completion_types/chat_completion_message_tool_call_union_param.py +15 -0
  306. memos/types/openai_chat_completion_types/chat_completion_system_message_param.py +36 -0
  307. memos/types/openai_chat_completion_types/chat_completion_tool_message_param.py +30 -0
  308. memos/types/openai_chat_completion_types/chat_completion_user_message_param.py +34 -0
  309. memos/utils.py +123 -0
  310. memos/vec_dbs/__init__.py +0 -0
  311. memos/vec_dbs/base.py +117 -0
  312. memos/vec_dbs/factory.py +23 -0
  313. memos/vec_dbs/item.py +50 -0
  314. memos/vec_dbs/milvus.py +654 -0
  315. memos/vec_dbs/qdrant.py +355 -0
@@ -0,0 +1,358 @@
1
+ """Utility functions for message parsing."""
2
+
3
+ import json
4
+ import os
5
+ import re
6
+
7
+ from datetime import datetime
8
+ from typing import Any, TypeAlias
9
+ from urllib.parse import urlparse
10
+
11
+ from memos import log
12
+ from memos.configs.parser import ParserConfigFactory
13
+ from memos.parsers.factory import ParserFactory
14
+ from memos.types import MessagesType
15
+ from memos.types.openai_chat_completion_types import (
16
+ ChatCompletionAssistantMessageParam,
17
+ ChatCompletionContentPartTextParam,
18
+ ChatCompletionSystemMessageParam,
19
+ ChatCompletionToolMessageParam,
20
+ ChatCompletionUserMessageParam,
21
+ File,
22
+ )
23
+
24
+
25
# The four chat-message param types (system, user, assistant, tool) grouped in one tuple.
ChatMessageClasses = (
    ChatCompletionSystemMessageParam,
    ChatCompletionUserMessageParam,
    ChatCompletionAssistantMessageParam,
    ChatCompletionToolMessageParam,
)

# Text content part and File grouped in one tuple.
# NOTE(review): neither tuple is used in this file; presumably consumed by the parsers — confirm.
RawContentClasses = (ChatCompletionContentPartTextParam, File)
MessageDict: TypeAlias = dict[str, Any]  # (Deprecated) not supported in the future
# Every input shape accepted by coerce_scene_data().
SceneDataInput: TypeAlias = (
    list[list[MessageDict]]  # (Deprecated) legacy chat example: scenes -> messages
    | list[str]  # (Deprecated) legacy doc example: list of paths / pure text
    | list[MessagesType]  # new: list of scenes (each scene is MessagesType)
)


logger = log.get_logger(__name__)
# Case-insensitive match for a string that ENDS with one of the listed
# document / image / audio file extensions.
FILE_EXT_RE = re.compile(
    r"\.(pdf|docx?|pptx?|xlsx?|txt|md|html?|json|csv|png|jpe?g|webp|wav|mp3|m4a)$",
    re.I,
)
46
+
47
+
48
# Matches either a valid JSON escape sequence (captured in group 1) or a lone backslash.
_JSON_BACKSLASH_RE = re.compile(r'\\(u[0-9a-fA-F]{4}|["\\/bfnrt])|\\')


def _close_truncated_json(text: str) -> str:
    """Append the closers a truncated JSON text is missing, innermost first.

    Brackets inside string literals are ignored. A string cut off mid-way is
    terminated, and a dangling trailing comma is dropped before closing.
    Text that is already balanced is returned unchanged.
    """
    closers = {"{": "}", "[": "]"}
    open_stack: list[str] = []
    in_string = False
    escaped = False

    for ch in text:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch in closers:
            open_stack.append(ch)
        elif ch in "}]" and open_stack and closers[open_stack[-1]] == ch:
            open_stack.pop()

    if not in_string and not open_stack:
        return text

    if in_string:
        # A trailing lone backslash would escape the quote we are about to add.
        if escaped:
            text = text[:-1]
        text += '"'
    else:
        text = text.rstrip().rstrip(",")

    return text + "".join(closers[ch] for ch in reversed(open_stack))


def _escape_stray_backslashes(text: str) -> str:
    """Double only the backslashes that do not start a valid JSON escape."""
    return _JSON_BACKSLASH_RE.sub(lambda m: m.group(0) if m.group(1) else "\\\\", text)


def parse_json_result(response_text: str) -> dict:
    """
    Parse JSON result from LLM response.

    Handles various formats including:
    - JSON wrapped in markdown code blocks
    - Raw JSON, optionally surrounded by extra prose
    - Incomplete JSON (missing closers are appended in nesting order)
    - JSON containing invalid backslash escapes

    Args:
        response_text: Raw response text from LLM (None is treated as empty)

    Returns:
        Parsed dictionary or empty dict if parsing fails
    """
    s = (response_text or "").strip()

    # Prefer the body of the first fenced block; otherwise just drop stray fences.
    m = re.search(r"```(?:json)?\s*([\s\S]*?)```", s, flags=re.I)
    s = (m.group(1) if m else s.replace("```", "")).strip()

    # Everything before the first "{" is treated as leading chatter.
    i = s.find("{")
    if i == -1:
        return {}
    s = s[i:].strip()

    try:
        return json.loads(s)
    except json.JSONDecodeError:
        pass

    # Trailing chatter: retry with everything after the last closer removed.
    j = max(s.rfind("}"), s.rfind("]"))
    if j != -1:
        try:
            return json.loads(s[: j + 1])
        except json.JSONDecodeError:
            pass

    # Truncated output: close open strings/containers in the right order.
    t = _close_truncated_json(s)
    try:
        return json.loads(t)
    except json.JSONDecodeError as e:
        if "Invalid \\escape" in str(e):
            # Try the legacy repair first (double every backslash) so inputs that
            # used to parse keep producing the same result. If that breaks valid
            # escapes such as \" fall back to doubling only the stray backslashes.
            for candidate in (t.replace("\\", "\\\\"), _escape_stray_backslashes(t)):
                try:
                    return json.loads(candidate)
                except json.JSONDecodeError:
                    continue
        logger.warning(f"[JSONParse] Failed to decode JSON: {e}\nRaw: {response_text}")
        return {}
102
+
103
+
104
# Default configuration for parser and text splitter
DEFAULT_PARSER_CONFIG = {
    "backend": "markitdown",
    "config": {},
}

# Chunking defaults, overridable through environment variables. int() raises
# ValueError at import time if a variable is set to a non-integer string.
DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1280"))
DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200"))


# Initialize parser instance
# Built once at import time. Any failure is logged and leaves `file_parser`
# as None, so get_parser() may return None.
file_parser = None
try:
    parser_config = ParserConfigFactory.model_validate(DEFAULT_PARSER_CONFIG)
    file_parser = ParserFactory.from_config(parser_config)
    logger.debug("[FileContentParser] Initialized parser instance")
except Exception as e:
    logger.error(f"[FileContentParser] Failed to create parser: {e}")
    file_parser = None

# Module-level splitters, also built once at import time. `text_splitter` is
# bound only inside the try/except below (to a chunker on success, to None on failure).
markdown_text_splitter = None

try:
    from memos.chunkers.charactertext_chunker import CharacterTextChunker
    from memos.chunkers.markdown_chunker import MarkdownChunker

    markdown_text_splitter = MarkdownChunker(
        chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP, recursive=True
    )
    text_splitter = CharacterTextChunker(
        chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
    )
    logger.info("[FileContentParser] Initialized text splitter instances by lancga")
except Exception as e:
    logger.warning(
        f"[FileContentParser] Failed to create text splitter: {e} will use simple splitter fallback"
    )
    # SimpleTextSplitter is imported only on this failure path. get_text_splitter()
    # references it only when `text_splitter` is None, which happens only here.
    from memos.chunkers.simple_chunker import SimpleTextSplitter

    # Reset both so a partially successful try block does not leave one splitter set.
    markdown_text_splitter = None
    text_splitter = None
145
+
146
+
147
def get_parser() -> Any:
    """
    Return the shared, module-level file parser.

    The parser is created once when this module is imported; it is not
    rebuilt here.

    Returns:
        The parser built by ParserFactory, or None when that import-time
        initialization failed.
    """
    shared_parser = file_parser
    return shared_parser
155
+
156
+
157
def get_text_splitter(
    chunk_size: int | None = None, chunk_overlap: int | None = None, is_markdown: bool = False
) -> Any:
    """
    Get a text splitter instance.

    The module-level splitters are returned as-is when they exist; they were
    built with DEFAULT_CHUNK_SIZE / DEFAULT_CHUNK_OVERLAP, so `chunk_size` and
    `chunk_overlap` only take effect on the simple-splitter fallback path.

    Args:
        chunk_size: Maximum size of chunks when splitting text (used for simple splitter
            fallback). None or 0 selects DEFAULT_CHUNK_SIZE.
        chunk_overlap: Overlap between chunks when splitting text (used for simple splitter
            fallback). None selects DEFAULT_CHUNK_OVERLAP; an explicit 0 is honored.
        is_markdown: Prefer the markdown-aware splitter when it is available.

    Returns:
        The shared MarkdownChunker (markdown requested and available), else the shared
        CharacterTextChunker, else a new SimpleTextSplitter.
    """
    if is_markdown and markdown_text_splitter is not None:
        return markdown_text_splitter
    if text_splitter is not None:
        return text_splitter

    actual_chunk_size = chunk_size or DEFAULT_CHUNK_SIZE
    # 0 is a legitimate overlap ("no overlap"), so only None falls back to the default.
    actual_chunk_overlap = DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    return SimpleTextSplitter(actual_chunk_size, actual_chunk_overlap)
178
+
179
+
180
def extract_role(message: dict[str, Any]) -> str:
    """Return the message's "role" value, or an empty string when the key is absent."""
    role = message.get("role", "")
    return role
183
+
184
+
185
+ def _is_message_list(obj):
186
+ """
187
+ Detect whether `obj` is a MessageList (OpenAI ChatCompletionMessageParam list).
188
+ Criteria:
189
+ - Must be a list
190
+ - Each element must be a dict with keys: role, content
191
+ """
192
+ if not isinstance(obj, list):
193
+ return False
194
+
195
+ for item in obj:
196
+ if not isinstance(item, dict):
197
+ return False
198
+ if "role" not in item or "content" not in item:
199
+ return False
200
+ return True
201
+
202
+
203
+ def coerce_scene_data(scene_data: SceneDataInput, scene_type: str) -> list[MessagesType]:
204
+ """
205
+ Normalize ANY allowed SceneDataInput into: list[MessagesType].
206
+ Supports:
207
+ - Already normalized scene_data → passthrough
208
+ - doc: legacy list[str] → automatically detect:
209
+ * local file path → read & parse into text
210
+ * remote URL/path → keep as file part
211
+ * pure text → text part
212
+ - chat:
213
+ * Passthrough normalization
214
+ * Auto-inject chat_time into each message group
215
+ - fallback: wrap unknown → [str(scene_data)]
216
+ """
217
+ if not scene_data:
218
+ return []
219
+ head = scene_data[0]
220
+
221
+ if scene_type != "doc":
222
+ normalized = scene_data if isinstance(head, str | list) else [str(scene_data)]
223
+
224
+ complete_scene_data = []
225
+ for items in normalized:
226
+ if not items:
227
+ continue
228
+
229
+ # Keep string as-is (MessagesType supports str)
230
+ if isinstance(items, str):
231
+ complete_scene_data.append(items)
232
+ continue
233
+
234
+ # ONLY add chat_time if it's a MessageList
235
+ if not _is_message_list(items):
236
+ complete_scene_data.append(items)
237
+ continue
238
+
239
+ # Detect existing chat_time
240
+ chat_time_value = None
241
+ for item in items:
242
+ if isinstance(item, dict) and "chat_time" in item:
243
+ chat_time_value = item["chat_time"]
244
+ break
245
+
246
+ # Default timestamp
247
+ if chat_time_value is None:
248
+ session_date = datetime.now()
249
+ date_format = "%I:%M %p on %d %B, %Y"
250
+ chat_time_value = session_date.strftime(date_format)
251
+
252
+ # Inject chat_time
253
+ for m in items:
254
+ if isinstance(m, dict) and "chat_time" not in m:
255
+ m["chat_time"] = chat_time_value
256
+
257
+ complete_scene_data.append(items)
258
+
259
+ return complete_scene_data
260
+
261
+ # doc: list[str] -> RawMessageList
262
+ if scene_type == "doc" and isinstance(head, str):
263
+ raw_items = []
264
+
265
+ # prepare parser
266
+ parser_config = ParserConfigFactory.model_validate(
267
+ {
268
+ "backend": "markitdown",
269
+ "config": {},
270
+ }
271
+ )
272
+ parser = ParserFactory.from_config(parser_config)
273
+
274
+ for s in scene_data:
275
+ s = (s or "").strip()
276
+ if not s:
277
+ continue
278
+
279
+ parsed = urlparse(s)
280
+ looks_like_url = parsed.scheme in {"http", "https", "oss", "s3", "gs", "cos"}
281
+ looks_like_path = ("/" in s) or ("\\" in s)
282
+ looks_like_file = bool(FILE_EXT_RE.search(s)) or looks_like_url or looks_like_path
283
+
284
+ # Case A: Local filesystem path
285
+ if os.path.exists(s):
286
+ filename = os.path.basename(s) or "document"
287
+ try:
288
+ # parse local file into text
289
+ parsed_text = parser.parse(s)
290
+ raw_items.append(
291
+ [
292
+ {
293
+ "type": "file",
294
+ "file": {
295
+ "filename": filename or "document",
296
+ "file_data": parsed_text,
297
+ },
298
+ }
299
+ ]
300
+ )
301
+ except Exception as e:
302
+ logger.error(f"[SceneParser] Error parsing {s}: {e}")
303
+ continue
304
+
305
+ # Case B: URL or non-local file path
306
+ if looks_like_file:
307
+ if looks_like_url:
308
+ filename = os.path.basename(parsed.path)
309
+ else:
310
+ # Windows absolute path detection
311
+ if "\\" in s and re.match(r"^[A-Za-z]:", s):
312
+ parts = [p for p in s.split("\\") if p]
313
+ filename = parts[-1] if parts else os.path.basename(s)
314
+ else:
315
+ filename = os.path.basename(s)
316
+ raw_items.append(
317
+ [{"type": "file", "file": {"filename": filename or "document", "file_data": s}}]
318
+ )
319
+ continue
320
+
321
+ # Case C: Pure text
322
+ raw_items.append([{"type": "text", "text": s}])
323
+
324
+ return raw_items
325
+
326
+ # fallback
327
+ return [str(scene_data)]
328
+
329
+
330
def detect_lang(text):
    """
    Classify text as Chinese or English.

    Speaker labels ("user:", "assistant:", "query:", "answer:") and bracketed
    numeric timestamps are removed first. The text counts as Chinese when CJK
    ideographs make up more than 30% of the remaining word characters
    (whitespace, digits and punctuation excluded).

    Args:
        text: Text to analyze

    Returns:
        "zh" for Chinese, "en" for English (also for empty / non-string input
        and on any error)
    """
    try:
        if not isinstance(text, str) or not text:
            return "en"

        # remove role and timestamp
        stripped = re.sub(
            r"\b(user|assistant|query|answer)\s*:", "", text, flags=re.IGNORECASE
        )
        stripped = re.sub(r"\[[\d\-:\s]+\]", "", stripped)

        # count chinese characters
        han_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]"
        han_count = len(re.findall(han_pattern, stripped))

        significant = re.sub(r"[\s\d\W]", "", stripped)
        if not significant:
            return "en"
        return "zh" if han_count / len(significant) > 0.3 else "en"
    except Exception:
        return "en"