MemoryOS 2.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memoryos-2.0.3.dist-info/METADATA +418 -0
- memoryos-2.0.3.dist-info/RECORD +315 -0
- memoryos-2.0.3.dist-info/WHEEL +4 -0
- memoryos-2.0.3.dist-info/entry_points.txt +3 -0
- memoryos-2.0.3.dist-info/licenses/LICENSE +201 -0
- memos/__init__.py +20 -0
- memos/api/client.py +571 -0
- memos/api/config.py +1018 -0
- memos/api/context/dependencies.py +50 -0
- memos/api/exceptions.py +53 -0
- memos/api/handlers/__init__.py +62 -0
- memos/api/handlers/add_handler.py +158 -0
- memos/api/handlers/base_handler.py +194 -0
- memos/api/handlers/chat_handler.py +1401 -0
- memos/api/handlers/component_init.py +388 -0
- memos/api/handlers/config_builders.py +190 -0
- memos/api/handlers/feedback_handler.py +93 -0
- memos/api/handlers/formatters_handler.py +237 -0
- memos/api/handlers/memory_handler.py +316 -0
- memos/api/handlers/scheduler_handler.py +497 -0
- memos/api/handlers/search_handler.py +222 -0
- memos/api/handlers/suggestion_handler.py +117 -0
- memos/api/mcp_serve.py +614 -0
- memos/api/middleware/request_context.py +101 -0
- memos/api/product_api.py +38 -0
- memos/api/product_models.py +1206 -0
- memos/api/routers/__init__.py +1 -0
- memos/api/routers/product_router.py +477 -0
- memos/api/routers/server_router.py +394 -0
- memos/api/server_api.py +44 -0
- memos/api/start_api.py +433 -0
- memos/chunkers/__init__.py +4 -0
- memos/chunkers/base.py +24 -0
- memos/chunkers/charactertext_chunker.py +41 -0
- memos/chunkers/factory.py +24 -0
- memos/chunkers/markdown_chunker.py +62 -0
- memos/chunkers/sentence_chunker.py +54 -0
- memos/chunkers/simple_chunker.py +50 -0
- memos/cli.py +113 -0
- memos/configs/__init__.py +0 -0
- memos/configs/base.py +82 -0
- memos/configs/chunker.py +59 -0
- memos/configs/embedder.py +88 -0
- memos/configs/graph_db.py +236 -0
- memos/configs/internet_retriever.py +100 -0
- memos/configs/llm.py +151 -0
- memos/configs/mem_agent.py +54 -0
- memos/configs/mem_chat.py +81 -0
- memos/configs/mem_cube.py +105 -0
- memos/configs/mem_os.py +83 -0
- memos/configs/mem_reader.py +91 -0
- memos/configs/mem_scheduler.py +385 -0
- memos/configs/mem_user.py +70 -0
- memos/configs/memory.py +324 -0
- memos/configs/parser.py +38 -0
- memos/configs/reranker.py +18 -0
- memos/configs/utils.py +8 -0
- memos/configs/vec_db.py +80 -0
- memos/context/context.py +355 -0
- memos/dependency.py +52 -0
- memos/deprecation.py +262 -0
- memos/embedders/__init__.py +0 -0
- memos/embedders/ark.py +95 -0
- memos/embedders/base.py +106 -0
- memos/embedders/factory.py +29 -0
- memos/embedders/ollama.py +77 -0
- memos/embedders/sentence_transformer.py +49 -0
- memos/embedders/universal_api.py +51 -0
- memos/exceptions.py +30 -0
- memos/graph_dbs/__init__.py +0 -0
- memos/graph_dbs/base.py +274 -0
- memos/graph_dbs/factory.py +27 -0
- memos/graph_dbs/item.py +46 -0
- memos/graph_dbs/nebular.py +1794 -0
- memos/graph_dbs/neo4j.py +1942 -0
- memos/graph_dbs/neo4j_community.py +1058 -0
- memos/graph_dbs/polardb.py +5446 -0
- memos/hello_world.py +97 -0
- memos/llms/__init__.py +0 -0
- memos/llms/base.py +25 -0
- memos/llms/deepseek.py +13 -0
- memos/llms/factory.py +38 -0
- memos/llms/hf.py +443 -0
- memos/llms/hf_singleton.py +114 -0
- memos/llms/ollama.py +135 -0
- memos/llms/openai.py +222 -0
- memos/llms/openai_new.py +198 -0
- memos/llms/qwen.py +13 -0
- memos/llms/utils.py +14 -0
- memos/llms/vllm.py +218 -0
- memos/log.py +237 -0
- memos/mem_agent/base.py +19 -0
- memos/mem_agent/deepsearch_agent.py +391 -0
- memos/mem_agent/factory.py +36 -0
- memos/mem_chat/__init__.py +0 -0
- memos/mem_chat/base.py +30 -0
- memos/mem_chat/factory.py +21 -0
- memos/mem_chat/simple.py +200 -0
- memos/mem_cube/__init__.py +0 -0
- memos/mem_cube/base.py +30 -0
- memos/mem_cube/general.py +240 -0
- memos/mem_cube/navie.py +172 -0
- memos/mem_cube/utils.py +169 -0
- memos/mem_feedback/base.py +15 -0
- memos/mem_feedback/feedback.py +1192 -0
- memos/mem_feedback/simple_feedback.py +40 -0
- memos/mem_feedback/utils.py +230 -0
- memos/mem_os/client.py +5 -0
- memos/mem_os/core.py +1203 -0
- memos/mem_os/main.py +582 -0
- memos/mem_os/product.py +1608 -0
- memos/mem_os/product_server.py +455 -0
- memos/mem_os/utils/default_config.py +359 -0
- memos/mem_os/utils/format_utils.py +1403 -0
- memos/mem_os/utils/reference_utils.py +162 -0
- memos/mem_reader/__init__.py +0 -0
- memos/mem_reader/base.py +47 -0
- memos/mem_reader/factory.py +53 -0
- memos/mem_reader/memory.py +298 -0
- memos/mem_reader/multi_modal_struct.py +965 -0
- memos/mem_reader/read_multi_modal/__init__.py +43 -0
- memos/mem_reader/read_multi_modal/assistant_parser.py +311 -0
- memos/mem_reader/read_multi_modal/base.py +273 -0
- memos/mem_reader/read_multi_modal/file_content_parser.py +826 -0
- memos/mem_reader/read_multi_modal/image_parser.py +359 -0
- memos/mem_reader/read_multi_modal/multi_modal_parser.py +252 -0
- memos/mem_reader/read_multi_modal/string_parser.py +139 -0
- memos/mem_reader/read_multi_modal/system_parser.py +327 -0
- memos/mem_reader/read_multi_modal/text_content_parser.py +131 -0
- memos/mem_reader/read_multi_modal/tool_parser.py +210 -0
- memos/mem_reader/read_multi_modal/user_parser.py +218 -0
- memos/mem_reader/read_multi_modal/utils.py +358 -0
- memos/mem_reader/simple_struct.py +912 -0
- memos/mem_reader/strategy_struct.py +163 -0
- memos/mem_reader/utils.py +157 -0
- memos/mem_scheduler/__init__.py +0 -0
- memos/mem_scheduler/analyzer/__init__.py +0 -0
- memos/mem_scheduler/analyzer/api_analyzer.py +714 -0
- memos/mem_scheduler/analyzer/eval_analyzer.py +219 -0
- memos/mem_scheduler/analyzer/mos_for_test_scheduler.py +571 -0
- memos/mem_scheduler/analyzer/scheduler_for_eval.py +280 -0
- memos/mem_scheduler/base_scheduler.py +1319 -0
- memos/mem_scheduler/general_modules/__init__.py +0 -0
- memos/mem_scheduler/general_modules/api_misc.py +137 -0
- memos/mem_scheduler/general_modules/base.py +80 -0
- memos/mem_scheduler/general_modules/init_components_for_scheduler.py +425 -0
- memos/mem_scheduler/general_modules/misc.py +313 -0
- memos/mem_scheduler/general_modules/scheduler_logger.py +389 -0
- memos/mem_scheduler/general_modules/task_threads.py +315 -0
- memos/mem_scheduler/general_scheduler.py +1495 -0
- memos/mem_scheduler/memory_manage_modules/__init__.py +5 -0
- memos/mem_scheduler/memory_manage_modules/memory_filter.py +306 -0
- memos/mem_scheduler/memory_manage_modules/retriever.py +547 -0
- memos/mem_scheduler/monitors/__init__.py +0 -0
- memos/mem_scheduler/monitors/dispatcher_monitor.py +366 -0
- memos/mem_scheduler/monitors/general_monitor.py +394 -0
- memos/mem_scheduler/monitors/task_schedule_monitor.py +254 -0
- memos/mem_scheduler/optimized_scheduler.py +410 -0
- memos/mem_scheduler/orm_modules/__init__.py +0 -0
- memos/mem_scheduler/orm_modules/api_redis_model.py +518 -0
- memos/mem_scheduler/orm_modules/base_model.py +729 -0
- memos/mem_scheduler/orm_modules/monitor_models.py +261 -0
- memos/mem_scheduler/orm_modules/redis_model.py +699 -0
- memos/mem_scheduler/scheduler_factory.py +23 -0
- memos/mem_scheduler/schemas/__init__.py +0 -0
- memos/mem_scheduler/schemas/analyzer_schemas.py +52 -0
- memos/mem_scheduler/schemas/api_schemas.py +233 -0
- memos/mem_scheduler/schemas/general_schemas.py +55 -0
- memos/mem_scheduler/schemas/message_schemas.py +173 -0
- memos/mem_scheduler/schemas/monitor_schemas.py +406 -0
- memos/mem_scheduler/schemas/task_schemas.py +132 -0
- memos/mem_scheduler/task_schedule_modules/__init__.py +0 -0
- memos/mem_scheduler/task_schedule_modules/dispatcher.py +740 -0
- memos/mem_scheduler/task_schedule_modules/local_queue.py +247 -0
- memos/mem_scheduler/task_schedule_modules/orchestrator.py +74 -0
- memos/mem_scheduler/task_schedule_modules/redis_queue.py +1385 -0
- memos/mem_scheduler/task_schedule_modules/task_queue.py +162 -0
- memos/mem_scheduler/utils/__init__.py +0 -0
- memos/mem_scheduler/utils/api_utils.py +77 -0
- memos/mem_scheduler/utils/config_utils.py +100 -0
- memos/mem_scheduler/utils/db_utils.py +50 -0
- memos/mem_scheduler/utils/filter_utils.py +176 -0
- memos/mem_scheduler/utils/metrics.py +125 -0
- memos/mem_scheduler/utils/misc_utils.py +290 -0
- memos/mem_scheduler/utils/monitor_event_utils.py +67 -0
- memos/mem_scheduler/utils/status_tracker.py +229 -0
- memos/mem_scheduler/webservice_modules/__init__.py +0 -0
- memos/mem_scheduler/webservice_modules/rabbitmq_service.py +485 -0
- memos/mem_scheduler/webservice_modules/redis_service.py +380 -0
- memos/mem_user/factory.py +94 -0
- memos/mem_user/mysql_persistent_user_manager.py +271 -0
- memos/mem_user/mysql_user_manager.py +502 -0
- memos/mem_user/persistent_factory.py +98 -0
- memos/mem_user/persistent_user_manager.py +260 -0
- memos/mem_user/redis_persistent_user_manager.py +225 -0
- memos/mem_user/user_manager.py +488 -0
- memos/memories/__init__.py +0 -0
- memos/memories/activation/__init__.py +0 -0
- memos/memories/activation/base.py +42 -0
- memos/memories/activation/item.py +56 -0
- memos/memories/activation/kv.py +292 -0
- memos/memories/activation/vllmkv.py +219 -0
- memos/memories/base.py +19 -0
- memos/memories/factory.py +42 -0
- memos/memories/parametric/__init__.py +0 -0
- memos/memories/parametric/base.py +19 -0
- memos/memories/parametric/item.py +11 -0
- memos/memories/parametric/lora.py +41 -0
- memos/memories/textual/__init__.py +0 -0
- memos/memories/textual/base.py +92 -0
- memos/memories/textual/general.py +236 -0
- memos/memories/textual/item.py +304 -0
- memos/memories/textual/naive.py +187 -0
- memos/memories/textual/prefer_text_memory/__init__.py +0 -0
- memos/memories/textual/prefer_text_memory/adder.py +504 -0
- memos/memories/textual/prefer_text_memory/config.py +106 -0
- memos/memories/textual/prefer_text_memory/extractor.py +221 -0
- memos/memories/textual/prefer_text_memory/factory.py +85 -0
- memos/memories/textual/prefer_text_memory/retrievers.py +177 -0
- memos/memories/textual/prefer_text_memory/spliter.py +132 -0
- memos/memories/textual/prefer_text_memory/utils.py +93 -0
- memos/memories/textual/preference.py +344 -0
- memos/memories/textual/simple_preference.py +161 -0
- memos/memories/textual/simple_tree.py +69 -0
- memos/memories/textual/tree.py +459 -0
- memos/memories/textual/tree_text_memory/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/organize/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/organize/handler.py +184 -0
- memos/memories/textual/tree_text_memory/organize/manager.py +518 -0
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +238 -0
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +622 -0
- memos/memories/textual/tree_text_memory/retrieve/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/retrieve/advanced_searcher.py +364 -0
- memos/memories/textual/tree_text_memory/retrieve/bm25_util.py +186 -0
- memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +419 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +270 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +102 -0
- memos/memories/textual/tree_text_memory/retrieve/reasoner.py +61 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +497 -0
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +111 -0
- memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +16 -0
- memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py +472 -0
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +848 -0
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +135 -0
- memos/memories/textual/tree_text_memory/retrieve/utils.py +54 -0
- memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +387 -0
- memos/memos_tools/dinding_report_bot.py +453 -0
- memos/memos_tools/lockfree_dict.py +120 -0
- memos/memos_tools/notification_service.py +44 -0
- memos/memos_tools/notification_utils.py +142 -0
- memos/memos_tools/singleton.py +174 -0
- memos/memos_tools/thread_safe_dict.py +310 -0
- memos/memos_tools/thread_safe_dict_segment.py +382 -0
- memos/multi_mem_cube/__init__.py +0 -0
- memos/multi_mem_cube/composite_cube.py +86 -0
- memos/multi_mem_cube/single_cube.py +874 -0
- memos/multi_mem_cube/views.py +54 -0
- memos/parsers/__init__.py +0 -0
- memos/parsers/base.py +15 -0
- memos/parsers/factory.py +21 -0
- memos/parsers/markitdown.py +28 -0
- memos/reranker/__init__.py +4 -0
- memos/reranker/base.py +25 -0
- memos/reranker/concat.py +103 -0
- memos/reranker/cosine_local.py +102 -0
- memos/reranker/factory.py +72 -0
- memos/reranker/http_bge.py +324 -0
- memos/reranker/http_bge_strategy.py +327 -0
- memos/reranker/noop.py +19 -0
- memos/reranker/strategies/__init__.py +4 -0
- memos/reranker/strategies/base.py +61 -0
- memos/reranker/strategies/concat_background.py +94 -0
- memos/reranker/strategies/concat_docsource.py +110 -0
- memos/reranker/strategies/dialogue_common.py +109 -0
- memos/reranker/strategies/factory.py +31 -0
- memos/reranker/strategies/single_turn.py +107 -0
- memos/reranker/strategies/singleturn_outmem.py +98 -0
- memos/settings.py +10 -0
- memos/templates/__init__.py +0 -0
- memos/templates/advanced_search_prompts.py +211 -0
- memos/templates/cloud_service_prompt.py +107 -0
- memos/templates/instruction_completion.py +66 -0
- memos/templates/mem_agent_prompts.py +85 -0
- memos/templates/mem_feedback_prompts.py +822 -0
- memos/templates/mem_reader_prompts.py +1096 -0
- memos/templates/mem_reader_strategy_prompts.py +238 -0
- memos/templates/mem_scheduler_prompts.py +626 -0
- memos/templates/mem_search_prompts.py +93 -0
- memos/templates/mos_prompts.py +403 -0
- memos/templates/prefer_complete_prompt.py +735 -0
- memos/templates/tool_mem_prompts.py +139 -0
- memos/templates/tree_reorganize_prompts.py +230 -0
- memos/types/__init__.py +34 -0
- memos/types/general_types.py +151 -0
- memos/types/openai_chat_completion_types/__init__.py +15 -0
- memos/types/openai_chat_completion_types/chat_completion_assistant_message_param.py +56 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_image_param.py +27 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_input_audio_param.py +23 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_param.py +43 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_refusal_param.py +16 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_text_param.py +16 -0
- memos/types/openai_chat_completion_types/chat_completion_message_custom_tool_call_param.py +27 -0
- memos/types/openai_chat_completion_types/chat_completion_message_function_tool_call_param.py +32 -0
- memos/types/openai_chat_completion_types/chat_completion_message_param.py +18 -0
- memos/types/openai_chat_completion_types/chat_completion_message_tool_call_union_param.py +15 -0
- memos/types/openai_chat_completion_types/chat_completion_system_message_param.py +36 -0
- memos/types/openai_chat_completion_types/chat_completion_tool_message_param.py +30 -0
- memos/types/openai_chat_completion_types/chat_completion_user_message_param.py +34 -0
- memos/utils.py +123 -0
- memos/vec_dbs/__init__.py +0 -0
- memos/vec_dbs/base.py +117 -0
- memos/vec_dbs/factory.py +23 -0
- memos/vec_dbs/item.py +50 -0
- memos/vec_dbs/milvus.py +654 -0
- memos/vec_dbs/qdrant.py +355 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"""Utility functions for message parsing."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any, TypeAlias
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
from memos import log
|
|
12
|
+
from memos.configs.parser import ParserConfigFactory
|
|
13
|
+
from memos.parsers.factory import ParserFactory
|
|
14
|
+
from memos.types import MessagesType
|
|
15
|
+
from memos.types.openai_chat_completion_types import (
|
|
16
|
+
ChatCompletionAssistantMessageParam,
|
|
17
|
+
ChatCompletionContentPartTextParam,
|
|
18
|
+
ChatCompletionSystemMessageParam,
|
|
19
|
+
ChatCompletionToolMessageParam,
|
|
20
|
+
ChatCompletionUserMessageParam,
|
|
21
|
+
File,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Tuple of the four chat-message param types (system, user, assistant, tool).
ChatMessageClasses = (
    ChatCompletionSystemMessageParam,
    ChatCompletionUserMessageParam,
    ChatCompletionAssistantMessageParam,
    ChatCompletionToolMessageParam,
)

# Tuple of the raw content-part types: text parts and files.
RawContentClasses = (ChatCompletionContentPartTextParam, File)
MessageDict: TypeAlias = dict[str, Any]  # (Deprecated) not supported in the future
# Every input shape that `coerce_scene_data` is annotated to accept.
SceneDataInput: TypeAlias = (
    list[list[MessageDict]]  # (Deprecated) legacy chat example: scenes -> messages
    | list[str]  # (Deprecated) legacy doc example: list of paths / pure text
    | list[MessagesType]  # new: list of scenes (each scene is MessagesType)
)


logger = log.get_logger(__name__)
# Case-insensitive match for a known document/image/audio extension at the very
# end of a string (anchored with `$`).
FILE_EXT_RE = re.compile(
    r"\.(pdf|docx?|pptx?|xlsx?|txt|md|html?|json|csv|png|jpe?g|webp|wav|mp3|m4a)$",
    re.I,
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_json_result(response_text: str) -> dict:
    """
    Parse a JSON object out of an LLM response.

    Handles various formats including:
    - JSON wrapped in markdown code blocks
    - Raw JSON, optionally preceded or followed by free text
    - Incomplete JSON (unclosed objects/arrays are closed in nesting order)
    - JSON containing invalid backslash escapes (e.g. Windows paths)

    Args:
        response_text: Raw response text from LLM

    Returns:
        Parsed dictionary or empty dict if parsing fails
    """
    text = (response_text or "").strip()

    # Prefer the content of the first fenced block; otherwise just drop stray fences.
    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)```", text, flags=re.I)
    text = (fenced.group(1) if fenced else text.replace("```", "")).strip()

    start = text.find("{")
    if start == -1:
        return {}
    text = text[start:].strip()

    decoder = json.JSONDecoder()

    def _decode(candidate: str) -> dict:
        # raw_decode parses the first complete JSON value and ignores whatever
        # follows it, so trailing commentary after the object is harmless.
        obj, _ = decoder.raw_decode(candidate)
        return obj

    def _close_open_containers(candidate: str) -> str:
        # Track open '{' / '[' outside of string literals and append the
        # matching closers in reverse (innermost-first) order.
        closers = []
        in_string = False
        escaped = False
        for ch in candidate:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "{":
                closers.append("}")
            elif ch == "[":
                closers.append("]")
            elif ch in "}]" and closers and closers[-1] == ch:
                closers.pop()
        return candidate + "".join(reversed(closers))

    def _escape_lone_backslashes(candidate: str) -> str:
        # Double only the backslashes that do not start a valid JSON escape,
        # leaving sequences such as \" and \n intact.
        return re.sub(
            r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})?',
            lambda m: m.group(0) if m.group(1) is not None else "\\\\",
            candidate,
        )

    try:
        return _decode(text)
    except json.JSONDecodeError:
        pass

    closed = _close_open_containers(text)
    try:
        return _decode(closed)
    except json.JSONDecodeError as e:
        if "Invalid \\escape" in str(e):
            # Try the historical blanket doubling first (keeps earlier results
            # stable), then the same on the closed text, then a targeted repair.
            repairs = (
                text.replace("\\", "\\\\"),
                closed.replace("\\", "\\\\"),
                _escape_lone_backslashes(closed),
            )
            for repaired in repairs:
                try:
                    return _decode(repaired)
                except json.JSONDecodeError:
                    continue
        logger.warning(f"[JSONParse] Failed to decode JSON: {e}\nRaw: {response_text}")
        return {}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Default configuration for parser and text splitter
DEFAULT_PARSER_CONFIG = {
    "backend": "markitdown",
    "config": {},
}

# Chunking defaults; each is read once at import time from its environment
# variable and falls back to the literal shown.
DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1280"))
DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200"))


# Initialize parser instance
# Best effort: any failure is logged and leaves `file_parser` as None.
file_parser = None
try:
    parser_config = ParserConfigFactory.model_validate(DEFAULT_PARSER_CONFIG)
    file_parser = ParserFactory.from_config(parser_config)
    logger.debug("[FileContentParser] Initialized parser instance")
except Exception as e:
    logger.error(f"[FileContentParser] Failed to create parser: {e}")
    file_parser = None

markdown_text_splitter = None

# Build the shared splitters. If either import or constructor fails, both
# globals are reset to None and SimpleTextSplitter is imported instead, which
# `get_text_splitter` instantiates on demand.
try:
    from memos.chunkers.charactertext_chunker import CharacterTextChunker
    from memos.chunkers.markdown_chunker import MarkdownChunker

    markdown_text_splitter = MarkdownChunker(
        chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP, recursive=True
    )
    text_splitter = CharacterTextChunker(
        chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
    )
    logger.info("[FileContentParser] Initialized text splitter instances by lancga")
except Exception as e:
    logger.warning(
        f"[FileContentParser] Failed to create text splitter: {e} will use simple splitter fallback"
    )
    # NOTE(review): SimpleTextSplitter is only bound on this failure path; the
    # fallback branch of get_text_splitter relies on that.
    from memos.chunkers.simple_chunker import SimpleTextSplitter

    markdown_text_splitter = None
    text_splitter = None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def get_parser() -> Any:
    """
    Get the shared module-level parser instance.

    Returns:
        The `file_parser` global created at import time via ParserFactory, or
        None if that initialization failed.
    """
    return file_parser
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_text_splitter(
    chunk_size: int | None = None, chunk_overlap: int | None = None, is_markdown: bool = False
) -> Any:
    """
    Return a text splitter, preferring the shared module-level instances.

    Args:
        chunk_size: Maximum chunk size; only used when the simple splitter fallback is built
        chunk_overlap: Overlap between chunks; only used when the simple splitter fallback is built
        is_markdown: When True, prefer the shared markdown splitter if it exists

    Returns:
        The shared markdown splitter (markdown requested and available), else the
        shared character splitter, else a new SimpleTextSplitter built from the
        given sizes (falsy sizes fall back to the module defaults).
    """
    markdown_available = is_markdown and markdown_text_splitter is not None
    shared = markdown_text_splitter if markdown_available else text_splitter
    if shared is not None:
        return shared

    # No shared splitter exists: build the simple fallback on demand.
    size = chunk_size or DEFAULT_CHUNK_SIZE
    overlap = chunk_overlap or DEFAULT_CHUNK_OVERLAP
    return SimpleTextSplitter(size, overlap)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def extract_role(message: dict[str, Any]) -> str:
    """Return the message's ``role`` value, or an empty string when the key is absent."""
    role = message.get("role", "")
    return role
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _is_message_list(obj):
|
|
186
|
+
"""
|
|
187
|
+
Detect whether `obj` is a MessageList (OpenAI ChatCompletionMessageParam list).
|
|
188
|
+
Criteria:
|
|
189
|
+
- Must be a list
|
|
190
|
+
- Each element must be a dict with keys: role, content
|
|
191
|
+
"""
|
|
192
|
+
if not isinstance(obj, list):
|
|
193
|
+
return False
|
|
194
|
+
|
|
195
|
+
for item in obj:
|
|
196
|
+
if not isinstance(item, dict):
|
|
197
|
+
return False
|
|
198
|
+
if "role" not in item or "content" not in item:
|
|
199
|
+
return False
|
|
200
|
+
return True
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def coerce_scene_data(scene_data: SceneDataInput, scene_type: str) -> list[MessagesType]:
|
|
204
|
+
"""
|
|
205
|
+
Normalize ANY allowed SceneDataInput into: list[MessagesType].
|
|
206
|
+
Supports:
|
|
207
|
+
- Already normalized scene_data → passthrough
|
|
208
|
+
- doc: legacy list[str] → automatically detect:
|
|
209
|
+
* local file path → read & parse into text
|
|
210
|
+
* remote URL/path → keep as file part
|
|
211
|
+
* pure text → text part
|
|
212
|
+
- chat:
|
|
213
|
+
* Passthrough normalization
|
|
214
|
+
* Auto-inject chat_time into each message group
|
|
215
|
+
- fallback: wrap unknown → [str(scene_data)]
|
|
216
|
+
"""
|
|
217
|
+
if not scene_data:
|
|
218
|
+
return []
|
|
219
|
+
head = scene_data[0]
|
|
220
|
+
|
|
221
|
+
if scene_type != "doc":
|
|
222
|
+
normalized = scene_data if isinstance(head, str | list) else [str(scene_data)]
|
|
223
|
+
|
|
224
|
+
complete_scene_data = []
|
|
225
|
+
for items in normalized:
|
|
226
|
+
if not items:
|
|
227
|
+
continue
|
|
228
|
+
|
|
229
|
+
# Keep string as-is (MessagesType supports str)
|
|
230
|
+
if isinstance(items, str):
|
|
231
|
+
complete_scene_data.append(items)
|
|
232
|
+
continue
|
|
233
|
+
|
|
234
|
+
# ONLY add chat_time if it's a MessageList
|
|
235
|
+
if not _is_message_list(items):
|
|
236
|
+
complete_scene_data.append(items)
|
|
237
|
+
continue
|
|
238
|
+
|
|
239
|
+
# Detect existing chat_time
|
|
240
|
+
chat_time_value = None
|
|
241
|
+
for item in items:
|
|
242
|
+
if isinstance(item, dict) and "chat_time" in item:
|
|
243
|
+
chat_time_value = item["chat_time"]
|
|
244
|
+
break
|
|
245
|
+
|
|
246
|
+
# Default timestamp
|
|
247
|
+
if chat_time_value is None:
|
|
248
|
+
session_date = datetime.now()
|
|
249
|
+
date_format = "%I:%M %p on %d %B, %Y"
|
|
250
|
+
chat_time_value = session_date.strftime(date_format)
|
|
251
|
+
|
|
252
|
+
# Inject chat_time
|
|
253
|
+
for m in items:
|
|
254
|
+
if isinstance(m, dict) and "chat_time" not in m:
|
|
255
|
+
m["chat_time"] = chat_time_value
|
|
256
|
+
|
|
257
|
+
complete_scene_data.append(items)
|
|
258
|
+
|
|
259
|
+
return complete_scene_data
|
|
260
|
+
|
|
261
|
+
# doc: list[str] -> RawMessageList
|
|
262
|
+
if scene_type == "doc" and isinstance(head, str):
|
|
263
|
+
raw_items = []
|
|
264
|
+
|
|
265
|
+
# prepare parser
|
|
266
|
+
parser_config = ParserConfigFactory.model_validate(
|
|
267
|
+
{
|
|
268
|
+
"backend": "markitdown",
|
|
269
|
+
"config": {},
|
|
270
|
+
}
|
|
271
|
+
)
|
|
272
|
+
parser = ParserFactory.from_config(parser_config)
|
|
273
|
+
|
|
274
|
+
for s in scene_data:
|
|
275
|
+
s = (s or "").strip()
|
|
276
|
+
if not s:
|
|
277
|
+
continue
|
|
278
|
+
|
|
279
|
+
parsed = urlparse(s)
|
|
280
|
+
looks_like_url = parsed.scheme in {"http", "https", "oss", "s3", "gs", "cos"}
|
|
281
|
+
looks_like_path = ("/" in s) or ("\\" in s)
|
|
282
|
+
looks_like_file = bool(FILE_EXT_RE.search(s)) or looks_like_url or looks_like_path
|
|
283
|
+
|
|
284
|
+
# Case A: Local filesystem path
|
|
285
|
+
if os.path.exists(s):
|
|
286
|
+
filename = os.path.basename(s) or "document"
|
|
287
|
+
try:
|
|
288
|
+
# parse local file into text
|
|
289
|
+
parsed_text = parser.parse(s)
|
|
290
|
+
raw_items.append(
|
|
291
|
+
[
|
|
292
|
+
{
|
|
293
|
+
"type": "file",
|
|
294
|
+
"file": {
|
|
295
|
+
"filename": filename or "document",
|
|
296
|
+
"file_data": parsed_text,
|
|
297
|
+
},
|
|
298
|
+
}
|
|
299
|
+
]
|
|
300
|
+
)
|
|
301
|
+
except Exception as e:
|
|
302
|
+
logger.error(f"[SceneParser] Error parsing {s}: {e}")
|
|
303
|
+
continue
|
|
304
|
+
|
|
305
|
+
# Case B: URL or non-local file path
|
|
306
|
+
if looks_like_file:
|
|
307
|
+
if looks_like_url:
|
|
308
|
+
filename = os.path.basename(parsed.path)
|
|
309
|
+
else:
|
|
310
|
+
# Windows absolute path detection
|
|
311
|
+
if "\\" in s and re.match(r"^[A-Za-z]:", s):
|
|
312
|
+
parts = [p for p in s.split("\\") if p]
|
|
313
|
+
filename = parts[-1] if parts else os.path.basename(s)
|
|
314
|
+
else:
|
|
315
|
+
filename = os.path.basename(s)
|
|
316
|
+
raw_items.append(
|
|
317
|
+
[{"type": "file", "file": {"filename": filename or "document", "file_data": s}}]
|
|
318
|
+
)
|
|
319
|
+
continue
|
|
320
|
+
|
|
321
|
+
# Case C: Pure text
|
|
322
|
+
raw_items.append([{"type": "text", "text": s}])
|
|
323
|
+
|
|
324
|
+
return raw_items
|
|
325
|
+
|
|
326
|
+
# fallback
|
|
327
|
+
return [str(scene_data)]
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def detect_lang(text):
    """
    Classify the given text as Chinese or English.

    Role prefixes (``user:``, ``assistant:``, ``query:``, ``answer:``) and
    bracketed timestamps are removed first. The text counts as Chinese when
    more than 30% of its remaining word characters (digits excluded) are CJK
    ideographs.

    Args:
        text: Text to analyze

    Returns:
        "zh" for Chinese, "en" for English (also for empty, non-string or
        otherwise unprocessable input)
    """
    try:
        if not isinstance(text, str) or not text:
            return "en"

        # remove role and timestamp
        stripped = re.sub(
            r"\b(user|assistant|query|answer)\s*:", "", text, flags=re.IGNORECASE
        )
        stripped = re.sub(r"\[[\d\-:\s]+\]", "", stripped)

        # count chinese characters
        han_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]"
        han_count = len(re.findall(han_pattern, stripped))

        # what is left after dropping whitespace, digits and non-word characters
        letters = re.sub(r"[\s\d\W]", "", stripped)
        if not letters:
            return "en"
        return "zh" if han_count / len(letters) > 0.3 else "en"
    except Exception:
        return "en"
|