MemoryOS 2.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. memoryos-2.0.3.dist-info/METADATA +418 -0
  2. memoryos-2.0.3.dist-info/RECORD +315 -0
  3. memoryos-2.0.3.dist-info/WHEEL +4 -0
  4. memoryos-2.0.3.dist-info/entry_points.txt +3 -0
  5. memoryos-2.0.3.dist-info/licenses/LICENSE +201 -0
  6. memos/__init__.py +20 -0
  7. memos/api/client.py +571 -0
  8. memos/api/config.py +1018 -0
  9. memos/api/context/dependencies.py +50 -0
  10. memos/api/exceptions.py +53 -0
  11. memos/api/handlers/__init__.py +62 -0
  12. memos/api/handlers/add_handler.py +158 -0
  13. memos/api/handlers/base_handler.py +194 -0
  14. memos/api/handlers/chat_handler.py +1401 -0
  15. memos/api/handlers/component_init.py +388 -0
  16. memos/api/handlers/config_builders.py +190 -0
  17. memos/api/handlers/feedback_handler.py +93 -0
  18. memos/api/handlers/formatters_handler.py +237 -0
  19. memos/api/handlers/memory_handler.py +316 -0
  20. memos/api/handlers/scheduler_handler.py +497 -0
  21. memos/api/handlers/search_handler.py +222 -0
  22. memos/api/handlers/suggestion_handler.py +117 -0
  23. memos/api/mcp_serve.py +614 -0
  24. memos/api/middleware/request_context.py +101 -0
  25. memos/api/product_api.py +38 -0
  26. memos/api/product_models.py +1206 -0
  27. memos/api/routers/__init__.py +1 -0
  28. memos/api/routers/product_router.py +477 -0
  29. memos/api/routers/server_router.py +394 -0
  30. memos/api/server_api.py +44 -0
  31. memos/api/start_api.py +433 -0
  32. memos/chunkers/__init__.py +4 -0
  33. memos/chunkers/base.py +24 -0
  34. memos/chunkers/charactertext_chunker.py +41 -0
  35. memos/chunkers/factory.py +24 -0
  36. memos/chunkers/markdown_chunker.py +62 -0
  37. memos/chunkers/sentence_chunker.py +54 -0
  38. memos/chunkers/simple_chunker.py +50 -0
  39. memos/cli.py +113 -0
  40. memos/configs/__init__.py +0 -0
  41. memos/configs/base.py +82 -0
  42. memos/configs/chunker.py +59 -0
  43. memos/configs/embedder.py +88 -0
  44. memos/configs/graph_db.py +236 -0
  45. memos/configs/internet_retriever.py +100 -0
  46. memos/configs/llm.py +151 -0
  47. memos/configs/mem_agent.py +54 -0
  48. memos/configs/mem_chat.py +81 -0
  49. memos/configs/mem_cube.py +105 -0
  50. memos/configs/mem_os.py +83 -0
  51. memos/configs/mem_reader.py +91 -0
  52. memos/configs/mem_scheduler.py +385 -0
  53. memos/configs/mem_user.py +70 -0
  54. memos/configs/memory.py +324 -0
  55. memos/configs/parser.py +38 -0
  56. memos/configs/reranker.py +18 -0
  57. memos/configs/utils.py +8 -0
  58. memos/configs/vec_db.py +80 -0
  59. memos/context/context.py +355 -0
  60. memos/dependency.py +52 -0
  61. memos/deprecation.py +262 -0
  62. memos/embedders/__init__.py +0 -0
  63. memos/embedders/ark.py +95 -0
  64. memos/embedders/base.py +106 -0
  65. memos/embedders/factory.py +29 -0
  66. memos/embedders/ollama.py +77 -0
  67. memos/embedders/sentence_transformer.py +49 -0
  68. memos/embedders/universal_api.py +51 -0
  69. memos/exceptions.py +30 -0
  70. memos/graph_dbs/__init__.py +0 -0
  71. memos/graph_dbs/base.py +274 -0
  72. memos/graph_dbs/factory.py +27 -0
  73. memos/graph_dbs/item.py +46 -0
  74. memos/graph_dbs/nebular.py +1794 -0
  75. memos/graph_dbs/neo4j.py +1942 -0
  76. memos/graph_dbs/neo4j_community.py +1058 -0
  77. memos/graph_dbs/polardb.py +5446 -0
  78. memos/hello_world.py +97 -0
  79. memos/llms/__init__.py +0 -0
  80. memos/llms/base.py +25 -0
  81. memos/llms/deepseek.py +13 -0
  82. memos/llms/factory.py +38 -0
  83. memos/llms/hf.py +443 -0
  84. memos/llms/hf_singleton.py +114 -0
  85. memos/llms/ollama.py +135 -0
  86. memos/llms/openai.py +222 -0
  87. memos/llms/openai_new.py +198 -0
  88. memos/llms/qwen.py +13 -0
  89. memos/llms/utils.py +14 -0
  90. memos/llms/vllm.py +218 -0
  91. memos/log.py +237 -0
  92. memos/mem_agent/base.py +19 -0
  93. memos/mem_agent/deepsearch_agent.py +391 -0
  94. memos/mem_agent/factory.py +36 -0
  95. memos/mem_chat/__init__.py +0 -0
  96. memos/mem_chat/base.py +30 -0
  97. memos/mem_chat/factory.py +21 -0
  98. memos/mem_chat/simple.py +200 -0
  99. memos/mem_cube/__init__.py +0 -0
  100. memos/mem_cube/base.py +30 -0
  101. memos/mem_cube/general.py +240 -0
  102. memos/mem_cube/navie.py +172 -0
  103. memos/mem_cube/utils.py +169 -0
  104. memos/mem_feedback/base.py +15 -0
  105. memos/mem_feedback/feedback.py +1192 -0
  106. memos/mem_feedback/simple_feedback.py +40 -0
  107. memos/mem_feedback/utils.py +230 -0
  108. memos/mem_os/client.py +5 -0
  109. memos/mem_os/core.py +1203 -0
  110. memos/mem_os/main.py +582 -0
  111. memos/mem_os/product.py +1608 -0
  112. memos/mem_os/product_server.py +455 -0
  113. memos/mem_os/utils/default_config.py +359 -0
  114. memos/mem_os/utils/format_utils.py +1403 -0
  115. memos/mem_os/utils/reference_utils.py +162 -0
  116. memos/mem_reader/__init__.py +0 -0
  117. memos/mem_reader/base.py +47 -0
  118. memos/mem_reader/factory.py +53 -0
  119. memos/mem_reader/memory.py +298 -0
  120. memos/mem_reader/multi_modal_struct.py +965 -0
  121. memos/mem_reader/read_multi_modal/__init__.py +43 -0
  122. memos/mem_reader/read_multi_modal/assistant_parser.py +311 -0
  123. memos/mem_reader/read_multi_modal/base.py +273 -0
  124. memos/mem_reader/read_multi_modal/file_content_parser.py +826 -0
  125. memos/mem_reader/read_multi_modal/image_parser.py +359 -0
  126. memos/mem_reader/read_multi_modal/multi_modal_parser.py +252 -0
  127. memos/mem_reader/read_multi_modal/string_parser.py +139 -0
  128. memos/mem_reader/read_multi_modal/system_parser.py +327 -0
  129. memos/mem_reader/read_multi_modal/text_content_parser.py +131 -0
  130. memos/mem_reader/read_multi_modal/tool_parser.py +210 -0
  131. memos/mem_reader/read_multi_modal/user_parser.py +218 -0
  132. memos/mem_reader/read_multi_modal/utils.py +358 -0
  133. memos/mem_reader/simple_struct.py +912 -0
  134. memos/mem_reader/strategy_struct.py +163 -0
  135. memos/mem_reader/utils.py +157 -0
  136. memos/mem_scheduler/__init__.py +0 -0
  137. memos/mem_scheduler/analyzer/__init__.py +0 -0
  138. memos/mem_scheduler/analyzer/api_analyzer.py +714 -0
  139. memos/mem_scheduler/analyzer/eval_analyzer.py +219 -0
  140. memos/mem_scheduler/analyzer/mos_for_test_scheduler.py +571 -0
  141. memos/mem_scheduler/analyzer/scheduler_for_eval.py +280 -0
  142. memos/mem_scheduler/base_scheduler.py +1319 -0
  143. memos/mem_scheduler/general_modules/__init__.py +0 -0
  144. memos/mem_scheduler/general_modules/api_misc.py +137 -0
  145. memos/mem_scheduler/general_modules/base.py +80 -0
  146. memos/mem_scheduler/general_modules/init_components_for_scheduler.py +425 -0
  147. memos/mem_scheduler/general_modules/misc.py +313 -0
  148. memos/mem_scheduler/general_modules/scheduler_logger.py +389 -0
  149. memos/mem_scheduler/general_modules/task_threads.py +315 -0
  150. memos/mem_scheduler/general_scheduler.py +1495 -0
  151. memos/mem_scheduler/memory_manage_modules/__init__.py +5 -0
  152. memos/mem_scheduler/memory_manage_modules/memory_filter.py +306 -0
  153. memos/mem_scheduler/memory_manage_modules/retriever.py +547 -0
  154. memos/mem_scheduler/monitors/__init__.py +0 -0
  155. memos/mem_scheduler/monitors/dispatcher_monitor.py +366 -0
  156. memos/mem_scheduler/monitors/general_monitor.py +394 -0
  157. memos/mem_scheduler/monitors/task_schedule_monitor.py +254 -0
  158. memos/mem_scheduler/optimized_scheduler.py +410 -0
  159. memos/mem_scheduler/orm_modules/__init__.py +0 -0
  160. memos/mem_scheduler/orm_modules/api_redis_model.py +518 -0
  161. memos/mem_scheduler/orm_modules/base_model.py +729 -0
  162. memos/mem_scheduler/orm_modules/monitor_models.py +261 -0
  163. memos/mem_scheduler/orm_modules/redis_model.py +699 -0
  164. memos/mem_scheduler/scheduler_factory.py +23 -0
  165. memos/mem_scheduler/schemas/__init__.py +0 -0
  166. memos/mem_scheduler/schemas/analyzer_schemas.py +52 -0
  167. memos/mem_scheduler/schemas/api_schemas.py +233 -0
  168. memos/mem_scheduler/schemas/general_schemas.py +55 -0
  169. memos/mem_scheduler/schemas/message_schemas.py +173 -0
  170. memos/mem_scheduler/schemas/monitor_schemas.py +406 -0
  171. memos/mem_scheduler/schemas/task_schemas.py +132 -0
  172. memos/mem_scheduler/task_schedule_modules/__init__.py +0 -0
  173. memos/mem_scheduler/task_schedule_modules/dispatcher.py +740 -0
  174. memos/mem_scheduler/task_schedule_modules/local_queue.py +247 -0
  175. memos/mem_scheduler/task_schedule_modules/orchestrator.py +74 -0
  176. memos/mem_scheduler/task_schedule_modules/redis_queue.py +1385 -0
  177. memos/mem_scheduler/task_schedule_modules/task_queue.py +162 -0
  178. memos/mem_scheduler/utils/__init__.py +0 -0
  179. memos/mem_scheduler/utils/api_utils.py +77 -0
  180. memos/mem_scheduler/utils/config_utils.py +100 -0
  181. memos/mem_scheduler/utils/db_utils.py +50 -0
  182. memos/mem_scheduler/utils/filter_utils.py +176 -0
  183. memos/mem_scheduler/utils/metrics.py +125 -0
  184. memos/mem_scheduler/utils/misc_utils.py +290 -0
  185. memos/mem_scheduler/utils/monitor_event_utils.py +67 -0
  186. memos/mem_scheduler/utils/status_tracker.py +229 -0
  187. memos/mem_scheduler/webservice_modules/__init__.py +0 -0
  188. memos/mem_scheduler/webservice_modules/rabbitmq_service.py +485 -0
  189. memos/mem_scheduler/webservice_modules/redis_service.py +380 -0
  190. memos/mem_user/factory.py +94 -0
  191. memos/mem_user/mysql_persistent_user_manager.py +271 -0
  192. memos/mem_user/mysql_user_manager.py +502 -0
  193. memos/mem_user/persistent_factory.py +98 -0
  194. memos/mem_user/persistent_user_manager.py +260 -0
  195. memos/mem_user/redis_persistent_user_manager.py +225 -0
  196. memos/mem_user/user_manager.py +488 -0
  197. memos/memories/__init__.py +0 -0
  198. memos/memories/activation/__init__.py +0 -0
  199. memos/memories/activation/base.py +42 -0
  200. memos/memories/activation/item.py +56 -0
  201. memos/memories/activation/kv.py +292 -0
  202. memos/memories/activation/vllmkv.py +219 -0
  203. memos/memories/base.py +19 -0
  204. memos/memories/factory.py +42 -0
  205. memos/memories/parametric/__init__.py +0 -0
  206. memos/memories/parametric/base.py +19 -0
  207. memos/memories/parametric/item.py +11 -0
  208. memos/memories/parametric/lora.py +41 -0
  209. memos/memories/textual/__init__.py +0 -0
  210. memos/memories/textual/base.py +92 -0
  211. memos/memories/textual/general.py +236 -0
  212. memos/memories/textual/item.py +304 -0
  213. memos/memories/textual/naive.py +187 -0
  214. memos/memories/textual/prefer_text_memory/__init__.py +0 -0
  215. memos/memories/textual/prefer_text_memory/adder.py +504 -0
  216. memos/memories/textual/prefer_text_memory/config.py +106 -0
  217. memos/memories/textual/prefer_text_memory/extractor.py +221 -0
  218. memos/memories/textual/prefer_text_memory/factory.py +85 -0
  219. memos/memories/textual/prefer_text_memory/retrievers.py +177 -0
  220. memos/memories/textual/prefer_text_memory/spliter.py +132 -0
  221. memos/memories/textual/prefer_text_memory/utils.py +93 -0
  222. memos/memories/textual/preference.py +344 -0
  223. memos/memories/textual/simple_preference.py +161 -0
  224. memos/memories/textual/simple_tree.py +69 -0
  225. memos/memories/textual/tree.py +459 -0
  226. memos/memories/textual/tree_text_memory/__init__.py +0 -0
  227. memos/memories/textual/tree_text_memory/organize/__init__.py +0 -0
  228. memos/memories/textual/tree_text_memory/organize/handler.py +184 -0
  229. memos/memories/textual/tree_text_memory/organize/manager.py +518 -0
  230. memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +238 -0
  231. memos/memories/textual/tree_text_memory/organize/reorganizer.py +622 -0
  232. memos/memories/textual/tree_text_memory/retrieve/__init__.py +0 -0
  233. memos/memories/textual/tree_text_memory/retrieve/advanced_searcher.py +364 -0
  234. memos/memories/textual/tree_text_memory/retrieve/bm25_util.py +186 -0
  235. memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +419 -0
  236. memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +270 -0
  237. memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +102 -0
  238. memos/memories/textual/tree_text_memory/retrieve/reasoner.py +61 -0
  239. memos/memories/textual/tree_text_memory/retrieve/recall.py +497 -0
  240. memos/memories/textual/tree_text_memory/retrieve/reranker.py +111 -0
  241. memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +16 -0
  242. memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py +472 -0
  243. memos/memories/textual/tree_text_memory/retrieve/searcher.py +848 -0
  244. memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +135 -0
  245. memos/memories/textual/tree_text_memory/retrieve/utils.py +54 -0
  246. memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +387 -0
  247. memos/memos_tools/dinding_report_bot.py +453 -0
  248. memos/memos_tools/lockfree_dict.py +120 -0
  249. memos/memos_tools/notification_service.py +44 -0
  250. memos/memos_tools/notification_utils.py +142 -0
  251. memos/memos_tools/singleton.py +174 -0
  252. memos/memos_tools/thread_safe_dict.py +310 -0
  253. memos/memos_tools/thread_safe_dict_segment.py +382 -0
  254. memos/multi_mem_cube/__init__.py +0 -0
  255. memos/multi_mem_cube/composite_cube.py +86 -0
  256. memos/multi_mem_cube/single_cube.py +874 -0
  257. memos/multi_mem_cube/views.py +54 -0
  258. memos/parsers/__init__.py +0 -0
  259. memos/parsers/base.py +15 -0
  260. memos/parsers/factory.py +21 -0
  261. memos/parsers/markitdown.py +28 -0
  262. memos/reranker/__init__.py +4 -0
  263. memos/reranker/base.py +25 -0
  264. memos/reranker/concat.py +103 -0
  265. memos/reranker/cosine_local.py +102 -0
  266. memos/reranker/factory.py +72 -0
  267. memos/reranker/http_bge.py +324 -0
  268. memos/reranker/http_bge_strategy.py +327 -0
  269. memos/reranker/noop.py +19 -0
  270. memos/reranker/strategies/__init__.py +4 -0
  271. memos/reranker/strategies/base.py +61 -0
  272. memos/reranker/strategies/concat_background.py +94 -0
  273. memos/reranker/strategies/concat_docsource.py +110 -0
  274. memos/reranker/strategies/dialogue_common.py +109 -0
  275. memos/reranker/strategies/factory.py +31 -0
  276. memos/reranker/strategies/single_turn.py +107 -0
  277. memos/reranker/strategies/singleturn_outmem.py +98 -0
  278. memos/settings.py +10 -0
  279. memos/templates/__init__.py +0 -0
  280. memos/templates/advanced_search_prompts.py +211 -0
  281. memos/templates/cloud_service_prompt.py +107 -0
  282. memos/templates/instruction_completion.py +66 -0
  283. memos/templates/mem_agent_prompts.py +85 -0
  284. memos/templates/mem_feedback_prompts.py +822 -0
  285. memos/templates/mem_reader_prompts.py +1096 -0
  286. memos/templates/mem_reader_strategy_prompts.py +238 -0
  287. memos/templates/mem_scheduler_prompts.py +626 -0
  288. memos/templates/mem_search_prompts.py +93 -0
  289. memos/templates/mos_prompts.py +403 -0
  290. memos/templates/prefer_complete_prompt.py +735 -0
  291. memos/templates/tool_mem_prompts.py +139 -0
  292. memos/templates/tree_reorganize_prompts.py +230 -0
  293. memos/types/__init__.py +34 -0
  294. memos/types/general_types.py +151 -0
  295. memos/types/openai_chat_completion_types/__init__.py +15 -0
  296. memos/types/openai_chat_completion_types/chat_completion_assistant_message_param.py +56 -0
  297. memos/types/openai_chat_completion_types/chat_completion_content_part_image_param.py +27 -0
  298. memos/types/openai_chat_completion_types/chat_completion_content_part_input_audio_param.py +23 -0
  299. memos/types/openai_chat_completion_types/chat_completion_content_part_param.py +43 -0
  300. memos/types/openai_chat_completion_types/chat_completion_content_part_refusal_param.py +16 -0
  301. memos/types/openai_chat_completion_types/chat_completion_content_part_text_param.py +16 -0
  302. memos/types/openai_chat_completion_types/chat_completion_message_custom_tool_call_param.py +27 -0
  303. memos/types/openai_chat_completion_types/chat_completion_message_function_tool_call_param.py +32 -0
  304. memos/types/openai_chat_completion_types/chat_completion_message_param.py +18 -0
  305. memos/types/openai_chat_completion_types/chat_completion_message_tool_call_union_param.py +15 -0
  306. memos/types/openai_chat_completion_types/chat_completion_system_message_param.py +36 -0
  307. memos/types/openai_chat_completion_types/chat_completion_tool_message_param.py +30 -0
  308. memos/types/openai_chat_completion_types/chat_completion_user_message_param.py +34 -0
  309. memos/utils.py +123 -0
  310. memos/vec_dbs/__init__.py +0 -0
  311. memos/vec_dbs/base.py +117 -0
  312. memos/vec_dbs/factory.py +23 -0
  313. memos/vec_dbs/item.py +50 -0
  314. memos/vec_dbs/milvus.py +654 -0
  315. memos/vec_dbs/qdrant.py +355 -0
@@ -0,0 +1,826 @@
1
+ """Parser for file content parts (RawMessageList)."""
2
+
3
+ import concurrent.futures
4
+ import os
5
+ import re
6
+ import tempfile
7
+
8
+ from typing import Any
9
+
10
+ from tqdm import tqdm
11
+
12
+ from memos.context.context import ContextThreadPoolExecutor
13
+ from memos.embedders.base import BaseEmbedder
14
+ from memos.llms.base import BaseLLM
15
+ from memos.log import get_logger
16
+ from memos.mem_reader.read_multi_modal.base import BaseMessageParser, _derive_key
17
+ from memos.mem_reader.read_multi_modal.image_parser import ImageParser
18
+ from memos.mem_reader.read_multi_modal.utils import (
19
+ detect_lang,
20
+ get_parser,
21
+ parse_json_result,
22
+ )
23
+ from memos.memories.textual.item import (
24
+ SourceMessage,
25
+ TextualMemoryItem,
26
+ TreeNodeTextualMemoryMetadata,
27
+ )
28
+ from memos.templates.mem_reader_prompts import (
29
+ CUSTOM_TAGS_INSTRUCTION,
30
+ CUSTOM_TAGS_INSTRUCTION_ZH,
31
+ SIMPLE_STRUCT_DOC_READER_PROMPT,
32
+ SIMPLE_STRUCT_DOC_READER_PROMPT_ZH,
33
+ )
34
+ from memos.types.openai_chat_completion_types import File
35
+
36
+
37
logger = get_logger(__name__)

# Prompt templates used for document processing (shared by simple_struct and
# file_content_parser). The outer key selects the prompt purpose ("doc" or
# "custom_tags"); the inner key selects the language ("en" or "zh").
DOC_PROMPT_DICT = {
    "doc": {
        "en": SIMPLE_STRUCT_DOC_READER_PROMPT,
        "zh": SIMPLE_STRUCT_DOC_READER_PROMPT_ZH,
    },
    "custom_tags": {
        "en": CUSTOM_TAGS_INSTRUCTION,
        "zh": CUSTOM_TAGS_INSTRUCTION_ZH,
    },
}
44
+
45
+
46
+ class FileContentParser(BaseMessageParser):
47
+ """Parser for file content parts."""
48
+
49
+ def _get_doc_llm_response(self, chunk_text: str, custom_tags: list[str] | None = None) -> dict:
50
+ """
51
+ Call LLM to extract memory from document chunk.
52
+ Uses doc prompts from DOC_PROMPT_DICT.
53
+
54
+ Args:
55
+ chunk_text: Text chunk to extract memory from
56
+ custom_tags: Optional list of custom tags for LLM extraction
57
+
58
+ Returns:
59
+ Parsed JSON response from LLM or empty dict if failed
60
+ """
61
+ if not self.llm:
62
+ logger.warning("[FileContentParser] LLM not available for fine mode")
63
+ return {}
64
+
65
+ lang = detect_lang(chunk_text)
66
+ template = DOC_PROMPT_DICT["doc"][lang]
67
+ prompt = template.replace("{chunk_text}", chunk_text)
68
+
69
+ custom_tags_prompt = (
70
+ DOC_PROMPT_DICT["custom_tags"][lang].replace("{custom_tags}", str(custom_tags))
71
+ if custom_tags
72
+ else ""
73
+ )
74
+ prompt = prompt.replace("{custom_tags_prompt}", custom_tags_prompt)
75
+
76
+ messages = [{"role": "user", "content": prompt}]
77
+ try:
78
+ response_text = self.llm.generate(messages)
79
+ response_json = parse_json_result(response_text)
80
+ except Exception as e:
81
+ logger.error(f"[FileContentParser] LLM generation error: {e}")
82
+ response_json = {}
83
+ return response_json
84
+
85
+ def _handle_url(self, url_str: str, filename: str) -> tuple[str, str | None, bool]:
86
+ """Download and parse file from URL."""
87
+ try:
88
+ from urllib.parse import urlparse
89
+
90
+ import requests
91
+
92
+ parsed_url = urlparse(url_str)
93
+ hostname = parsed_url.hostname or ""
94
+
95
+ response = requests.get(url_str, timeout=30)
96
+ response.raise_for_status()
97
+ response.encoding = "utf-8"
98
+
99
+ if not filename:
100
+ filename = os.path.basename(parsed_url.path) or "downloaded_file"
101
+
102
+ if hostname in self.direct_markdown_hostnames:
103
+ return response.text, None, True
104
+
105
+ file_ext = os.path.splitext(filename)[1].lower()
106
+ if file_ext in [".md", ".markdown", ".txt"]:
107
+ return response.text, None, True
108
+ with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_ext) as temp_file:
109
+ temp_file.write(response.content)
110
+ return "", temp_file.name, False
111
+ except Exception as e:
112
+ logger.error(f"[FileContentParser] URL processing error: {e}")
113
+ return f"[File URL download failed: {url_str}]", None
114
+
115
+ def _is_base64(self, data: str) -> bool:
116
+ """Quick heuristic to check base64-like string."""
117
+ return data.startswith("data:") or (
118
+ len(data) > 100
119
+ and all(
120
+ c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
121
+ for c in data[:100]
122
+ )
123
+ )
124
+
125
+ def _handle_base64(self, data: str) -> str:
126
+ """Base64 not implemented placeholder."""
127
+ logger.info("[FileContentParser] Base64 content detected but decoding is not implemented.")
128
+ return ""
129
+
130
+ def _handle_local(self, data: str) -> str:
131
+ """Base64 not implemented placeholder."""
132
+ logger.info("[FileContentParser] Local file paths are not supported in fine mode.")
133
+ return ""
134
+
135
+ def _process_single_image(
136
+ self, image_url: str, original_ref: str, info: dict[str, Any], **kwargs
137
+ ) -> tuple[str, str]:
138
+ """
139
+ Process a single image and return (original_ref, replacement_text).
140
+
141
+ Args:
142
+ image_url: URL of the image to process
143
+ original_ref: Original markdown image reference to replace
144
+ info: Dictionary containing user_id and session_id
145
+ **kwargs: Additional parameters for ImageParser
146
+
147
+ Returns:
148
+ Tuple of (original_ref, replacement_text)
149
+ """
150
+ try:
151
+ # Construct image message format for ImageParser
152
+ image_message = {
153
+ "type": "image_url",
154
+ "image_url": {
155
+ "url": image_url,
156
+ "detail": "auto",
157
+ },
158
+ }
159
+
160
+ # Process image using ImageParser
161
+ logger.debug(f"[FileContentParser] Processing image: {image_url}")
162
+ memory_items = self.image_parser.parse_fine(image_message, info, **kwargs)
163
+
164
+ # Extract text content from memory items (only strings as requested)
165
+ extracted_texts = []
166
+ for item in memory_items:
167
+ if hasattr(item, "memory") and item.memory:
168
+ extracted_texts.append(str(item.memory))
169
+
170
+ if extracted_texts:
171
+ # Combine all extracted texts
172
+ extracted_content = "\n".join(extracted_texts)
173
+ # Replace image with extracted content
174
+ return (
175
+ original_ref,
176
+ f"\n[Image Content from {image_url}]:\n{extracted_content}\n",
177
+ )
178
+ else:
179
+ # If no content extracted, keep original with a note
180
+ logger.warning(f"[FileContentParser] No content extracted from image: {image_url}")
181
+ return (
182
+ original_ref,
183
+ f"\n[Image: {image_url} - No content extracted]\n",
184
+ )
185
+
186
+ except Exception as e:
187
+ logger.error(f"[FileContentParser] Error processing image {image_url}: {e}")
188
+ # On error, keep original image reference
189
+ return (original_ref, original_ref)
190
+
191
+ def _extract_and_process_images(self, text: str, info: dict[str, Any], **kwargs) -> str:
192
+ """
193
+ Extract all images from markdown text and process them using ImageParser in parallel.
194
+ Replaces image references with extracted text content.
195
+
196
+ Args:
197
+ text: Markdown text containing image references
198
+ info: Dictionary containing user_id and session_id
199
+ **kwargs: Additional parameters for ImageParser
200
+
201
+ Returns:
202
+ Text with image references replaced by extracted content
203
+ """
204
+ if not text or not self.image_parser:
205
+ return text
206
+
207
+ # Pattern to match markdown images: ![](url) or ![alt](url)
208
+ image_pattern = r"!\[([^\]]*)\]\(([^)]+)\)"
209
+
210
+ # Find all image matches first
211
+ image_matches = list(re.finditer(image_pattern, text))
212
+ if not image_matches:
213
+ return text
214
+
215
+ logger.info(f"[FileContentParser] Found {len(image_matches)} images to process in parallel")
216
+
217
+ # Prepare tasks for parallel processing
218
+ tasks = []
219
+ for match in image_matches:
220
+ image_url = match.group(2)
221
+ original_ref = match.group(0)
222
+ tasks.append((image_url, original_ref))
223
+
224
+ # Process images in parallel
225
+ replacements = {}
226
+ max_workers = min(len(tasks), 10) # Limit concurrent image processing
227
+
228
+ with ContextThreadPoolExecutor(max_workers=max_workers) as executor:
229
+ futures = {
230
+ executor.submit(
231
+ self._process_single_image, image_url, original_ref, info, **kwargs
232
+ ): (image_url, original_ref)
233
+ for image_url, original_ref in tasks
234
+ }
235
+
236
+ # Collect results with progress tracking
237
+ for future in tqdm(
238
+ concurrent.futures.as_completed(futures),
239
+ total=len(futures),
240
+ desc="[FileContentParser] Processing images",
241
+ ):
242
+ try:
243
+ original_ref, replacement = future.result()
244
+ replacements[original_ref] = replacement
245
+ except Exception as e:
246
+ image_url, original_ref = futures[future]
247
+ logger.error(f"[FileContentParser] Future failed for image {image_url}: {e}")
248
+ # On error, keep original image reference
249
+ replacements[original_ref] = original_ref
250
+
251
+ # Replace all images in the text
252
+ processed_text = text
253
+ for original, replacement in replacements.items():
254
+ processed_text = processed_text.replace(original, replacement, 1)
255
+
256
+ # Count successfully extracted images
257
+ success_count = sum(
258
+ 1 for replacement in replacements.values() if "Image Content from" in replacement
259
+ )
260
+ logger.info(
261
+ f"[FileContentParser] Processed {len(image_matches)} images in parallel, "
262
+ f"extracted content for {success_count} images"
263
+ )
264
+ return processed_text
265
+
266
def __init__(
    self,
    embedder: BaseEmbedder,
    llm: BaseLLM | None = None,
    parser: Any | None = None,
    direct_markdown_hostnames: list[str] | None = None,
):
    """
    Set up the parser, its optional image parser and the direct-markdown host list.

    Args:
        embedder: Embedder for generating embeddings
        llm: Optional LLM for fine mode processing; an ImageParser is only
            created when an LLM is given
        parser: Optional parser for parsing file contents
        direct_markdown_hostnames: Hostnames whose responses are returned as
            markdown directly, without parsing. If None, the comma-separated
            FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES environment variable is used
            (blank entries are dropped; unset or empty yields an empty list).
    """
    super().__init__(embedder, llm)
    self.parser = parser
    # Images embedded in markdown can only be processed when an LLM is available.
    self.image_parser = ImageParser(embedder, llm) if llm else None

    if direct_markdown_hostnames is None:
        # Fall back to the environment; splitting an empty value yields no entries.
        raw_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "")
        direct_markdown_hostnames = [
            entry.strip() for entry in raw_hostnames.split(",") if entry.strip()
        ]
    self.direct_markdown_hostnames = direct_markdown_hostnames
301
+
302
def create_source(
    self,
    message: File,
    info: dict[str, Any],
    chunk_index: int | None = None,
    chunk_total: int | None = None,
    chunk_content: str | None = None,
    file_url_flag: bool = False,
) -> SourceMessage:
    """
    Build a SourceMessage describing a file content part.

    Args:
        message: File content part; a dict with a "file" entry, or any other
            value, which is then recorded via ``str(message)`` as the doc path
        info: Not used by this method
        chunk_index: Optional position of this chunk, stored when not None
        chunk_total: Optional number of chunks, stored when not None
        chunk_content: Optional chunk text; for dict messages a falsy value
            falls back to the file's "file_data"
        file_url_flag: When True (dict messages only), the file dict is kept in
            "file_info"; otherwise "file_info" is an empty dict

    Returns:
        The constructed SourceMessage
    """
    # Chunk ordering fields are only present when they were provided.
    chunk_fields = {
        key: value
        for key, value in (("chunk_index", chunk_index), ("chunk_total", chunk_total))
        if value is not None
    }

    if not isinstance(message, dict):
        fields = {"type": "file", "doc_path": str(message), **chunk_fields}
        if chunk_content is not None:
            fields["content"] = chunk_content
        return SourceMessage(**fields)

    file_info = message.get("file", {})
    return SourceMessage(
        type="file",
        doc_path=file_info.get("filename") or file_info.get("file_id", ""),
        content=chunk_content or file_info.get("file_data", ""),
        file_info=file_info if file_url_flag else {},
        **chunk_fields,
    )
334
+
335
def rebuild_from_source(
    self,
    source: SourceMessage,
) -> File:
    """Return a file content part whose "file" entry is ``source.file_info``."""
    file_part = {"type": "file"}
    file_part["file"] = source.file_info
    return file_part
345
+
346
+ def _parse_file(self, file_info: dict[str, Any]) -> str:
347
+ """
348
+ Parse file content.
349
+
350
+ Args:
351
+ file_info: File information dictionary
352
+
353
+ Returns:
354
+ Parsed text content
355
+ """
356
+ parser = self.parser or get_parser()
357
+ if not parser:
358
+ logger.warning("[FileContentParser] Parser not available")
359
+ return ""
360
+
361
+ file_path = file_info.get("path") or file_info.get("file_id", "")
362
+ filename = file_info.get("filename", "unknown")
363
+
364
+ if not file_path:
365
+ logger.warning("[FileContentParser] No file path or file_id provided")
366
+ return f"[File: {filename}]"
367
+
368
+ try:
369
+ if os.path.exists(file_path):
370
+ parsed_text = parser.parse(file_path)
371
+ return parsed_text
372
+ else:
373
+ logger.warning(f"[FileContentParser] File not found: {file_path}")
374
+ return f"[File: {filename}]"
375
+ except Exception as e:
376
+ logger.error(f"[FileContentParser] Error parsing file {file_path}: {e}")
377
+ return f"[File: {filename}]"
378
+
379
def parse_fast(
    self,
    message: File,
    info: dict[str, Any],
    **kwargs,
) -> list[TextualMemoryItem]:
    """
    Parse file content part in fast mode.

    Fast mode records the file information that is present in the message
    without decoding base64 payloads, fetching URLs, or running the document
    parser. Handles various file parameter scenarios:
    - file_data: base64 encoded data, URL, or plain text content
    - file_id: ID of an uploaded file
    - filename: name of the file

    Args:
        message: File content part to parse (dict with "type": "file" and "file": {...})
        info: Dictionary containing user_id and session_id. A falsy value
            (e.g. None) is treated as an empty dict, matching parse_fine.
        **kwargs: Additional parameters (not read by this method)

    Returns:
        List of TextualMemoryItem objects, one per non-empty chunk of the
        assembled content, or a single placeholder item if no chunk has
        content. An empty list is returned when ``message`` or its "file"
        entry is not a dict.
    """
    if not isinstance(message, dict):
        logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
        return []

    # Extract file information
    file_info = message.get("file", {})
    if not isinstance(file_info, dict):
        logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
        return []

    # Extract file parameters (all are optional)
    file_data = file_info.get("file_data", "")
    file_id = file_info.get("file_id", "")
    filename = file_info.get("filename", "")
    file_url_flag = False
    # Build content string based on available information
    content_parts = []

    # Priority 1: If file_data is provided, use it (could be base64, URL, or plain text)
    if file_data:
        # In fast mode, we don't decode base64 or fetch URLs, just record the reference
        if isinstance(file_data, str):
            # Heuristic: a "data:" prefix, or a long string whose first 100
            # characters are all in the base64 alphabet, is treated as encoded data.
            if file_data.startswith("data:") or (
                len(file_data) > 100
                and all(
                    c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
                    for c in file_data[:100]
                )
            ):
                content_parts.append(f"[File Data (base64/encoded): {len(file_data)} chars]")
            # Check if it looks like a URL
            elif file_data.startswith(("http://", "https://", "file://")):
                file_url_flag = True
                content_parts.append(f"[File URL: {file_data}]")
            else:
                # TODO: split into multiple memory items
                content_parts.append(file_data)
        else:
            content_parts.append(f"[File Data: {type(file_data).__name__}]")

    # Priority 2: If file_id is provided, reference it
    if file_id:
        content_parts.append(f"[File ID: {file_id}]")

    # Priority 3: If filename is provided, include it
    if filename:
        content_parts.append(f"[Filename: {filename}]")

    # If no content can be extracted, create a placeholder
    if not content_parts:
        content_parts.append("[File: unknown]")

    # Combine content parts
    content = " ".join(content_parts)

    # Split content into chunks
    content_chunks = self._split_text(content)

    # Extract info fields. Normalize a missing info mapping first so that
    # .copy() below cannot fail (parse_fine applies the same guard).
    if not info:
        info = {}
    info_ = info.copy()
    if file_id:
        info_.update({"file_id": file_id})
    user_id = info_.pop("user_id", "")
    session_id = info_.pop("session_id", "")

    # For file content parts, default to LongTermMemory
    # (since we don't have role information at this level)
    memory_type = "LongTermMemory"
    file_ids = [file_id] if file_id else []
    total_chunks = len(content_chunks)

    # Keep the original chunk index alongside each non-empty chunk so tags and
    # sources still refer to positions in the unfiltered chunk list.
    valid_chunks = [
        (chunk_idx, chunk_text)
        for chunk_idx, chunk_text in enumerate(content_chunks)
        if chunk_text.strip()
    ]

    # Embed only the chunks that become memory items; whitespace-only chunks
    # are skipped below, and the embedder is not called with an empty batch.
    content_chunk_embeddings = (
        self.embedder.embed([chunk_text for _, chunk_text in valid_chunks])
        if valid_chunks
        else []
    )

    # Create memory items for each chunk
    memory_items = []
    for position, (chunk_idx, chunk_text) in enumerate(valid_chunks):
        # Create source for this specific chunk with its index and content
        source = self.create_source(
            message,
            info,
            chunk_index=chunk_idx,
            chunk_total=total_chunks,
            chunk_content=chunk_text,
            file_url_flag=file_url_flag,
        )

        memory_item = TextualMemoryItem(
            memory=chunk_text,
            metadata=TreeNodeTextualMemoryMetadata(
                user_id=user_id,
                session_id=session_id,
                memory_type=memory_type,
                status="activated",
                tags=[
                    "mode:fast",
                    "multimodal:file",
                    f"chunk:{chunk_idx + 1}/{total_chunks}",
                ],
                key=_derive_key(chunk_text),
                embedding=content_chunk_embeddings[position],
                usage=[],
                sources=[source],
                background="",
                confidence=0.99,
                type="fact",
                info=info_,
                file_ids=file_ids,
            ),
        )
        memory_items.append(memory_item)

    # If no chunks were created, create a placeholder
    if not memory_items:
        # Create source for placeholder (no chunk index since there are no chunks)
        placeholder_source = self.create_source(
            message,
            info,
            chunk_index=None,
            chunk_total=0,
            chunk_content=content,
            file_url_flag=file_url_flag,
        )
        memory_item = TextualMemoryItem(
            memory=content,
            metadata=TreeNodeTextualMemoryMetadata(
                user_id=user_id,
                session_id=session_id,
                memory_type=memory_type,
                status="activated",
                tags=["mode:fast", "multimodal:file"],
                key=_derive_key(content),
                embedding=self.embedder.embed([content])[0],
                usage=[],
                sources=[placeholder_source],
                background="",
                confidence=0.99,
                type="fact",
                info=info_,
                file_ids=file_ids,
            ),
        )
        memory_items.append(memory_item)

    return memory_items
549
+
550
def parse_fine(
    self,
    message: File,
    info: dict[str, Any],
    **kwargs,
) -> list[TextualMemoryItem]:
    """
    Parse file content part in fine mode.
    Fine mode downloads and parses file content, especially for URLs.
    Then uses LLM to extract structured memories from each chunk.

    Handles various file parameter scenarios:
    - file_data: URL (http://, https://, or @http://), base64 encoded data, or plain text content
    - file_id: ID of an uploaded file
    - filename: name of the file

    Args:
        message: File content part to parse
        info: Dictionary containing user_id and session_id
        **kwargs: Additional parameters including:
            - custom_tags: Optional list of custom tags for LLM extraction
            - context_items: Optional list of TextualMemoryItem for context

    Returns:
        List of TextualMemoryItem objects. Empty when the message is malformed,
        no parser is available, or no text could be obtained from the file.
        When an LLM is used, items are appended in task completion order,
        not chunk order.
    """
    if not isinstance(message, dict):
        logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
        return []

    # Extract file information
    file_info = message.get("file", {})
    if not isinstance(file_info, dict):
        logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
        return []

    # Extract file parameters (all are optional)
    file_data = file_info.get("file_data", "")
    file_id = file_info.get("file_id", "")
    filename = file_info.get("filename", "")

    # Extract custom_tags from kwargs (for LLM extraction)
    custom_tags = kwargs.get("custom_tags")

    # Use parser from utils
    parser = self.parser or get_parser()
    if not parser:
        logger.warning("[FileContentParser] Parser not available")
        return []

    parsed_text = ""
    temp_file_path = None
    is_markdown = False

    try:
        # Priority 1: If file_data is provided, process it
        if file_data:
            if isinstance(file_data, str):
                # A leading "@" is accepted in front of URLs and stripped here.
                url_str = file_data[1:] if file_data.startswith("@") else file_data

                if url_str.startswith(("http://", "https://")):
                    parsed_text, temp_file_path, is_markdown = self._handle_url(
                        url_str, filename
                    )
                    if temp_file_path:
                        try:
                            # parser is known to be available (checked above)
                            parsed_text = parser.parse(temp_file_path)
                        except Exception as e:
                            logger.error(
                                f"[FileContentParser] Error parsing downloaded file: {e}"
                            )
                            parsed_text = f"[File parsing error: {e!s}]"

                elif os.path.exists(file_data):
                    parsed_text = self._handle_local(file_data)

                elif self._is_base64(file_data):
                    parsed_text = self._handle_base64(file_data)

                else:
                    # TODO: discuss the proper place for processing
                    # string file-data
                    return []
        # Priority 2: Only file_id was provided; nothing can be parsed here,
        # so just log it (parsed_text stays empty and [] is returned below).
        elif file_id:
            logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}")

    except Exception as e:
        logger.error(f"[FileContentParser] Error in parse_fine: {e}")

    finally:
        # Clean up temporary file
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.unlink(temp_file_path)
                logger.debug(f"[FileContentParser] Cleaned up temporary file: {temp_file_path}")
            except Exception as e:
                logger.warning(
                    f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}"
                )
    if not parsed_text:
        return []
    # Extract and process images from parsed_text
    if is_markdown and parsed_text and self.image_parser:
        parsed_text = self._extract_and_process_images(parsed_text, info, **kwargs)

    # Extract info fields
    if not info:
        info = {}
    info_ = info.copy()
    user_id = info_.pop("user_id", "")
    session_id = info_.pop("session_id", "")
    if file_id:
        info_["file_id"] = file_id
    file_ids = [file_id] if file_id else []
    # For file content parts, default to LongTermMemory
    memory_type = "LongTermMemory"

    # Split parsed text into chunks
    content_chunks = self._split_text(parsed_text, is_markdown)

    # Filter out empty chunks and create indexed list
    valid_chunks = [
        (idx, chunk_text) for idx, chunk_text in enumerate(content_chunks) if chunk_text.strip()
    ]
    # Size of the unfiltered chunk list. The helpers below read this by closure
    # at call time, so it must never be rebound: chunk indices refer to
    # positions in content_chunks, and the "chunk:i/N" denominator must match.
    total_chunks = len(content_chunks)

    # Helper function to create memory item (similar to SimpleStructMemReader._make_memory_item)
    def _make_memory_item(
        value: str,
        mem_type: str = memory_type,
        tags: list[str] | None = None,
        key: str | None = None,
        chunk_idx: int | None = None,
        chunk_content: str | None = None,
    ) -> TextualMemoryItem:
        """Construct memory item with common fields.

        Args:
            value: Memory content (chunk text)
            mem_type: Memory type
            tags: Tags for the memory item
            key: Key for the memory item; derived from value when None
            chunk_idx: Index of the chunk in the document (0-based)
            chunk_content: Raw chunk text recorded on the source
        """
        # Create source for this specific chunk with its index and content
        chunk_source = self.create_source(
            message,
            info,
            chunk_index=chunk_idx,
            chunk_total=total_chunks,
            chunk_content=chunk_content,
        )
        return TextualMemoryItem(
            memory=value,
            metadata=TreeNodeTextualMemoryMetadata(
                user_id=user_id,
                session_id=session_id,
                memory_type=mem_type,
                status="activated",
                tags=tags or [],
                key=key if key is not None else _derive_key(value),
                embedding=self.embedder.embed([value])[0],
                usage=[],
                sources=[chunk_source],
                background="",
                confidence=0.99,
                type="fact",
                info=info_,
                file_ids=file_ids,
            ),
        )

    # Helper function to create fallback item for a chunk
    def _make_fallback(
        chunk_idx: int, chunk_text: str, reason: str = "raw"
    ) -> TextualMemoryItem:
        """Create fallback memory item with raw chunk text."""
        return _make_memory_item(
            value=chunk_text,
            tags=[
                "mode:fine",
                "multimodal:file",
                f"fallback:{reason}",
                f"chunk:{chunk_idx + 1}/{total_chunks}",
            ],
            chunk_idx=chunk_idx,
            chunk_content=chunk_text,
        )

    # Handle empty chunks case
    if not valid_chunks:
        return [
            _make_memory_item(
                value=parsed_text or "[File: empty content]",
                tags=["mode:fine", "multimodal:file"],
                chunk_idx=None,
            )
        ]

    # If no LLM available, create memory items directly from chunks
    if not self.llm:
        return [_make_fallback(idx, text, "no_llm") for idx, text in valid_chunks]

    # Process single chunk with LLM extraction (worker function)
    def _process_chunk(chunk_idx: int, chunk_text: str) -> TextualMemoryItem:
        """Process chunk with LLM, fallback to raw on failure."""
        try:
            response_json = self._get_doc_llm_response(chunk_text, custom_tags)
            if response_json:
                value = response_json.get("value", "").strip()
                if value:
                    tags = response_json.get("tags", [])
                    tags = tags if isinstance(tags, list) else []
                    tags.extend(["mode:fine", "multimodal:file"])

                    # Only accept memory types this parser knows; otherwise use the default.
                    llm_mem_type = response_json.get("memory_type", memory_type)
                    if llm_mem_type not in ["LongTermMemory", "UserMemory"]:
                        llm_mem_type = memory_type

                    return _make_memory_item(
                        value=value,
                        mem_type=llm_mem_type,
                        tags=tags,
                        key=response_json.get("key"),
                        chunk_idx=chunk_idx,
                        chunk_content=chunk_text,
                    )
        except Exception as e:
            logger.error(f"[FileContentParser] LLM error for chunk {chunk_idx}: {e}")

        # Fallback to raw chunk
        logger.warning(f"[FileContentParser] Fallback to raw for chunk {chunk_idx}")
        return _make_fallback(chunk_idx, chunk_text)

    # Process chunks concurrently with progress bar
    memory_items = []
    chunk_map = dict(valid_chunks)
    # Separate name for the number of chunks actually submitted, so that
    # total_chunks (captured by the helpers above) keeps its meaning.
    valid_chunk_count = len(valid_chunks)

    logger.info(f"[FileContentParser] Processing {valid_chunk_count} chunks with LLM...")

    with ContextThreadPoolExecutor(max_workers=20) as executor:
        futures = {
            executor.submit(_process_chunk, idx, text): idx for idx, text in valid_chunks
        }

        # Use tqdm for progress bar (similar to simple_struct.py _process_doc_data)
        for future in tqdm(
            concurrent.futures.as_completed(futures),
            total=valid_chunk_count,
            desc="[FileContentParser] Processing chunks",
        ):
            chunk_idx = futures[future]
            try:
                node = future.result()
                if node:
                    memory_items.append(node)
            except Exception as e:
                tqdm.write(f"[ERROR] Chunk {chunk_idx} failed: {e}")
                logger.error(f"[FileContentParser] Future failed for chunk {chunk_idx}: {e}")
                # Create fallback for failed future
                if chunk_idx in chunk_map:
                    memory_items.append(
                        _make_fallback(chunk_idx, chunk_map[chunk_idx], "error")
                    )

    logger.info(
        f"[FileContentParser] Completed processing {len(memory_items)}/{valid_chunk_count} chunks"
    )

    return memory_items or [
        _make_memory_item(
            value=parsed_text or "[File: empty content]",
            tags=["mode:fine", "multimodal:file"],
            chunk_idx=None,
        )
    ]