memoryos-2.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memoryos-2.0.3.dist-info/METADATA +418 -0
- memoryos-2.0.3.dist-info/RECORD +315 -0
- memoryos-2.0.3.dist-info/WHEEL +4 -0
- memoryos-2.0.3.dist-info/entry_points.txt +3 -0
- memoryos-2.0.3.dist-info/licenses/LICENSE +201 -0
- memos/__init__.py +20 -0
- memos/api/client.py +571 -0
- memos/api/config.py +1018 -0
- memos/api/context/dependencies.py +50 -0
- memos/api/exceptions.py +53 -0
- memos/api/handlers/__init__.py +62 -0
- memos/api/handlers/add_handler.py +158 -0
- memos/api/handlers/base_handler.py +194 -0
- memos/api/handlers/chat_handler.py +1401 -0
- memos/api/handlers/component_init.py +388 -0
- memos/api/handlers/config_builders.py +190 -0
- memos/api/handlers/feedback_handler.py +93 -0
- memos/api/handlers/formatters_handler.py +237 -0
- memos/api/handlers/memory_handler.py +316 -0
- memos/api/handlers/scheduler_handler.py +497 -0
- memos/api/handlers/search_handler.py +222 -0
- memos/api/handlers/suggestion_handler.py +117 -0
- memos/api/mcp_serve.py +614 -0
- memos/api/middleware/request_context.py +101 -0
- memos/api/product_api.py +38 -0
- memos/api/product_models.py +1206 -0
- memos/api/routers/__init__.py +1 -0
- memos/api/routers/product_router.py +477 -0
- memos/api/routers/server_router.py +394 -0
- memos/api/server_api.py +44 -0
- memos/api/start_api.py +433 -0
- memos/chunkers/__init__.py +4 -0
- memos/chunkers/base.py +24 -0
- memos/chunkers/charactertext_chunker.py +41 -0
- memos/chunkers/factory.py +24 -0
- memos/chunkers/markdown_chunker.py +62 -0
- memos/chunkers/sentence_chunker.py +54 -0
- memos/chunkers/simple_chunker.py +50 -0
- memos/cli.py +113 -0
- memos/configs/__init__.py +0 -0
- memos/configs/base.py +82 -0
- memos/configs/chunker.py +59 -0
- memos/configs/embedder.py +88 -0
- memos/configs/graph_db.py +236 -0
- memos/configs/internet_retriever.py +100 -0
- memos/configs/llm.py +151 -0
- memos/configs/mem_agent.py +54 -0
- memos/configs/mem_chat.py +81 -0
- memos/configs/mem_cube.py +105 -0
- memos/configs/mem_os.py +83 -0
- memos/configs/mem_reader.py +91 -0
- memos/configs/mem_scheduler.py +385 -0
- memos/configs/mem_user.py +70 -0
- memos/configs/memory.py +324 -0
- memos/configs/parser.py +38 -0
- memos/configs/reranker.py +18 -0
- memos/configs/utils.py +8 -0
- memos/configs/vec_db.py +80 -0
- memos/context/context.py +355 -0
- memos/dependency.py +52 -0
- memos/deprecation.py +262 -0
- memos/embedders/__init__.py +0 -0
- memos/embedders/ark.py +95 -0
- memos/embedders/base.py +106 -0
- memos/embedders/factory.py +29 -0
- memos/embedders/ollama.py +77 -0
- memos/embedders/sentence_transformer.py +49 -0
- memos/embedders/universal_api.py +51 -0
- memos/exceptions.py +30 -0
- memos/graph_dbs/__init__.py +0 -0
- memos/graph_dbs/base.py +274 -0
- memos/graph_dbs/factory.py +27 -0
- memos/graph_dbs/item.py +46 -0
- memos/graph_dbs/nebular.py +1794 -0
- memos/graph_dbs/neo4j.py +1942 -0
- memos/graph_dbs/neo4j_community.py +1058 -0
- memos/graph_dbs/polardb.py +5446 -0
- memos/hello_world.py +97 -0
- memos/llms/__init__.py +0 -0
- memos/llms/base.py +25 -0
- memos/llms/deepseek.py +13 -0
- memos/llms/factory.py +38 -0
- memos/llms/hf.py +443 -0
- memos/llms/hf_singleton.py +114 -0
- memos/llms/ollama.py +135 -0
- memos/llms/openai.py +222 -0
- memos/llms/openai_new.py +198 -0
- memos/llms/qwen.py +13 -0
- memos/llms/utils.py +14 -0
- memos/llms/vllm.py +218 -0
- memos/log.py +237 -0
- memos/mem_agent/base.py +19 -0
- memos/mem_agent/deepsearch_agent.py +391 -0
- memos/mem_agent/factory.py +36 -0
- memos/mem_chat/__init__.py +0 -0
- memos/mem_chat/base.py +30 -0
- memos/mem_chat/factory.py +21 -0
- memos/mem_chat/simple.py +200 -0
- memos/mem_cube/__init__.py +0 -0
- memos/mem_cube/base.py +30 -0
- memos/mem_cube/general.py +240 -0
- memos/mem_cube/navie.py +172 -0
- memos/mem_cube/utils.py +169 -0
- memos/mem_feedback/base.py +15 -0
- memos/mem_feedback/feedback.py +1192 -0
- memos/mem_feedback/simple_feedback.py +40 -0
- memos/mem_feedback/utils.py +230 -0
- memos/mem_os/client.py +5 -0
- memos/mem_os/core.py +1203 -0
- memos/mem_os/main.py +582 -0
- memos/mem_os/product.py +1608 -0
- memos/mem_os/product_server.py +455 -0
- memos/mem_os/utils/default_config.py +359 -0
- memos/mem_os/utils/format_utils.py +1403 -0
- memos/mem_os/utils/reference_utils.py +162 -0
- memos/mem_reader/__init__.py +0 -0
- memos/mem_reader/base.py +47 -0
- memos/mem_reader/factory.py +53 -0
- memos/mem_reader/memory.py +298 -0
- memos/mem_reader/multi_modal_struct.py +965 -0
- memos/mem_reader/read_multi_modal/__init__.py +43 -0
- memos/mem_reader/read_multi_modal/assistant_parser.py +311 -0
- memos/mem_reader/read_multi_modal/base.py +273 -0
- memos/mem_reader/read_multi_modal/file_content_parser.py +826 -0
- memos/mem_reader/read_multi_modal/image_parser.py +359 -0
- memos/mem_reader/read_multi_modal/multi_modal_parser.py +252 -0
- memos/mem_reader/read_multi_modal/string_parser.py +139 -0
- memos/mem_reader/read_multi_modal/system_parser.py +327 -0
- memos/mem_reader/read_multi_modal/text_content_parser.py +131 -0
- memos/mem_reader/read_multi_modal/tool_parser.py +210 -0
- memos/mem_reader/read_multi_modal/user_parser.py +218 -0
- memos/mem_reader/read_multi_modal/utils.py +358 -0
- memos/mem_reader/simple_struct.py +912 -0
- memos/mem_reader/strategy_struct.py +163 -0
- memos/mem_reader/utils.py +157 -0
- memos/mem_scheduler/__init__.py +0 -0
- memos/mem_scheduler/analyzer/__init__.py +0 -0
- memos/mem_scheduler/analyzer/api_analyzer.py +714 -0
- memos/mem_scheduler/analyzer/eval_analyzer.py +219 -0
- memos/mem_scheduler/analyzer/mos_for_test_scheduler.py +571 -0
- memos/mem_scheduler/analyzer/scheduler_for_eval.py +280 -0
- memos/mem_scheduler/base_scheduler.py +1319 -0
- memos/mem_scheduler/general_modules/__init__.py +0 -0
- memos/mem_scheduler/general_modules/api_misc.py +137 -0
- memos/mem_scheduler/general_modules/base.py +80 -0
- memos/mem_scheduler/general_modules/init_components_for_scheduler.py +425 -0
- memos/mem_scheduler/general_modules/misc.py +313 -0
- memos/mem_scheduler/general_modules/scheduler_logger.py +389 -0
- memos/mem_scheduler/general_modules/task_threads.py +315 -0
- memos/mem_scheduler/general_scheduler.py +1495 -0
- memos/mem_scheduler/memory_manage_modules/__init__.py +5 -0
- memos/mem_scheduler/memory_manage_modules/memory_filter.py +306 -0
- memos/mem_scheduler/memory_manage_modules/retriever.py +547 -0
- memos/mem_scheduler/monitors/__init__.py +0 -0
- memos/mem_scheduler/monitors/dispatcher_monitor.py +366 -0
- memos/mem_scheduler/monitors/general_monitor.py +394 -0
- memos/mem_scheduler/monitors/task_schedule_monitor.py +254 -0
- memos/mem_scheduler/optimized_scheduler.py +410 -0
- memos/mem_scheduler/orm_modules/__init__.py +0 -0
- memos/mem_scheduler/orm_modules/api_redis_model.py +518 -0
- memos/mem_scheduler/orm_modules/base_model.py +729 -0
- memos/mem_scheduler/orm_modules/monitor_models.py +261 -0
- memos/mem_scheduler/orm_modules/redis_model.py +699 -0
- memos/mem_scheduler/scheduler_factory.py +23 -0
- memos/mem_scheduler/schemas/__init__.py +0 -0
- memos/mem_scheduler/schemas/analyzer_schemas.py +52 -0
- memos/mem_scheduler/schemas/api_schemas.py +233 -0
- memos/mem_scheduler/schemas/general_schemas.py +55 -0
- memos/mem_scheduler/schemas/message_schemas.py +173 -0
- memos/mem_scheduler/schemas/monitor_schemas.py +406 -0
- memos/mem_scheduler/schemas/task_schemas.py +132 -0
- memos/mem_scheduler/task_schedule_modules/__init__.py +0 -0
- memos/mem_scheduler/task_schedule_modules/dispatcher.py +740 -0
- memos/mem_scheduler/task_schedule_modules/local_queue.py +247 -0
- memos/mem_scheduler/task_schedule_modules/orchestrator.py +74 -0
- memos/mem_scheduler/task_schedule_modules/redis_queue.py +1385 -0
- memos/mem_scheduler/task_schedule_modules/task_queue.py +162 -0
- memos/mem_scheduler/utils/__init__.py +0 -0
- memos/mem_scheduler/utils/api_utils.py +77 -0
- memos/mem_scheduler/utils/config_utils.py +100 -0
- memos/mem_scheduler/utils/db_utils.py +50 -0
- memos/mem_scheduler/utils/filter_utils.py +176 -0
- memos/mem_scheduler/utils/metrics.py +125 -0
- memos/mem_scheduler/utils/misc_utils.py +290 -0
- memos/mem_scheduler/utils/monitor_event_utils.py +67 -0
- memos/mem_scheduler/utils/status_tracker.py +229 -0
- memos/mem_scheduler/webservice_modules/__init__.py +0 -0
- memos/mem_scheduler/webservice_modules/rabbitmq_service.py +485 -0
- memos/mem_scheduler/webservice_modules/redis_service.py +380 -0
- memos/mem_user/factory.py +94 -0
- memos/mem_user/mysql_persistent_user_manager.py +271 -0
- memos/mem_user/mysql_user_manager.py +502 -0
- memos/mem_user/persistent_factory.py +98 -0
- memos/mem_user/persistent_user_manager.py +260 -0
- memos/mem_user/redis_persistent_user_manager.py +225 -0
- memos/mem_user/user_manager.py +488 -0
- memos/memories/__init__.py +0 -0
- memos/memories/activation/__init__.py +0 -0
- memos/memories/activation/base.py +42 -0
- memos/memories/activation/item.py +56 -0
- memos/memories/activation/kv.py +292 -0
- memos/memories/activation/vllmkv.py +219 -0
- memos/memories/base.py +19 -0
- memos/memories/factory.py +42 -0
- memos/memories/parametric/__init__.py +0 -0
- memos/memories/parametric/base.py +19 -0
- memos/memories/parametric/item.py +11 -0
- memos/memories/parametric/lora.py +41 -0
- memos/memories/textual/__init__.py +0 -0
- memos/memories/textual/base.py +92 -0
- memos/memories/textual/general.py +236 -0
- memos/memories/textual/item.py +304 -0
- memos/memories/textual/naive.py +187 -0
- memos/memories/textual/prefer_text_memory/__init__.py +0 -0
- memos/memories/textual/prefer_text_memory/adder.py +504 -0
- memos/memories/textual/prefer_text_memory/config.py +106 -0
- memos/memories/textual/prefer_text_memory/extractor.py +221 -0
- memos/memories/textual/prefer_text_memory/factory.py +85 -0
- memos/memories/textual/prefer_text_memory/retrievers.py +177 -0
- memos/memories/textual/prefer_text_memory/spliter.py +132 -0
- memos/memories/textual/prefer_text_memory/utils.py +93 -0
- memos/memories/textual/preference.py +344 -0
- memos/memories/textual/simple_preference.py +161 -0
- memos/memories/textual/simple_tree.py +69 -0
- memos/memories/textual/tree.py +459 -0
- memos/memories/textual/tree_text_memory/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/organize/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/organize/handler.py +184 -0
- memos/memories/textual/tree_text_memory/organize/manager.py +518 -0
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +238 -0
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +622 -0
- memos/memories/textual/tree_text_memory/retrieve/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/retrieve/advanced_searcher.py +364 -0
- memos/memories/textual/tree_text_memory/retrieve/bm25_util.py +186 -0
- memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +419 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +270 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +102 -0
- memos/memories/textual/tree_text_memory/retrieve/reasoner.py +61 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +497 -0
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +111 -0
- memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +16 -0
- memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py +472 -0
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +848 -0
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +135 -0
- memos/memories/textual/tree_text_memory/retrieve/utils.py +54 -0
- memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +387 -0
- memos/memos_tools/dinding_report_bot.py +453 -0
- memos/memos_tools/lockfree_dict.py +120 -0
- memos/memos_tools/notification_service.py +44 -0
- memos/memos_tools/notification_utils.py +142 -0
- memos/memos_tools/singleton.py +174 -0
- memos/memos_tools/thread_safe_dict.py +310 -0
- memos/memos_tools/thread_safe_dict_segment.py +382 -0
- memos/multi_mem_cube/__init__.py +0 -0
- memos/multi_mem_cube/composite_cube.py +86 -0
- memos/multi_mem_cube/single_cube.py +874 -0
- memos/multi_mem_cube/views.py +54 -0
- memos/parsers/__init__.py +0 -0
- memos/parsers/base.py +15 -0
- memos/parsers/factory.py +21 -0
- memos/parsers/markitdown.py +28 -0
- memos/reranker/__init__.py +4 -0
- memos/reranker/base.py +25 -0
- memos/reranker/concat.py +103 -0
- memos/reranker/cosine_local.py +102 -0
- memos/reranker/factory.py +72 -0
- memos/reranker/http_bge.py +324 -0
- memos/reranker/http_bge_strategy.py +327 -0
- memos/reranker/noop.py +19 -0
- memos/reranker/strategies/__init__.py +4 -0
- memos/reranker/strategies/base.py +61 -0
- memos/reranker/strategies/concat_background.py +94 -0
- memos/reranker/strategies/concat_docsource.py +110 -0
- memos/reranker/strategies/dialogue_common.py +109 -0
- memos/reranker/strategies/factory.py +31 -0
- memos/reranker/strategies/single_turn.py +107 -0
- memos/reranker/strategies/singleturn_outmem.py +98 -0
- memos/settings.py +10 -0
- memos/templates/__init__.py +0 -0
- memos/templates/advanced_search_prompts.py +211 -0
- memos/templates/cloud_service_prompt.py +107 -0
- memos/templates/instruction_completion.py +66 -0
- memos/templates/mem_agent_prompts.py +85 -0
- memos/templates/mem_feedback_prompts.py +822 -0
- memos/templates/mem_reader_prompts.py +1096 -0
- memos/templates/mem_reader_strategy_prompts.py +238 -0
- memos/templates/mem_scheduler_prompts.py +626 -0
- memos/templates/mem_search_prompts.py +93 -0
- memos/templates/mos_prompts.py +403 -0
- memos/templates/prefer_complete_prompt.py +735 -0
- memos/templates/tool_mem_prompts.py +139 -0
- memos/templates/tree_reorganize_prompts.py +230 -0
- memos/types/__init__.py +34 -0
- memos/types/general_types.py +151 -0
- memos/types/openai_chat_completion_types/__init__.py +15 -0
- memos/types/openai_chat_completion_types/chat_completion_assistant_message_param.py +56 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_image_param.py +27 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_input_audio_param.py +23 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_param.py +43 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_refusal_param.py +16 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_text_param.py +16 -0
- memos/types/openai_chat_completion_types/chat_completion_message_custom_tool_call_param.py +27 -0
- memos/types/openai_chat_completion_types/chat_completion_message_function_tool_call_param.py +32 -0
- memos/types/openai_chat_completion_types/chat_completion_message_param.py +18 -0
- memos/types/openai_chat_completion_types/chat_completion_message_tool_call_union_param.py +15 -0
- memos/types/openai_chat_completion_types/chat_completion_system_message_param.py +36 -0
- memos/types/openai_chat_completion_types/chat_completion_tool_message_param.py +30 -0
- memos/types/openai_chat_completion_types/chat_completion_user_message_param.py +34 -0
- memos/utils.py +123 -0
- memos/vec_dbs/__init__.py +0 -0
- memos/vec_dbs/base.py +117 -0
- memos/vec_dbs/factory.py +23 -0
- memos/vec_dbs/item.py +50 -0
- memos/vec_dbs/milvus.py +654 -0
- memos/vec_dbs/qdrant.py +355 -0
@@ -0,0 +1,826 @@
"""Parser for file content parts (RawMessageList)."""

import concurrent.futures
import os
import re
import tempfile

from typing import Any

from tqdm import tqdm

from memos.context.context import ContextThreadPoolExecutor
from memos.embedders.base import BaseEmbedder
from memos.llms.base import BaseLLM
from memos.log import get_logger
from memos.mem_reader.read_multi_modal.base import BaseMessageParser, _derive_key
from memos.mem_reader.read_multi_modal.image_parser import ImageParser
from memos.mem_reader.read_multi_modal.utils import (
    detect_lang,
    get_parser,
    parse_json_result,
)
from memos.memories.textual.item import (
    SourceMessage,
    TextualMemoryItem,
    TreeNodeTextualMemoryMetadata,
)
from memos.templates.mem_reader_prompts import (
    CUSTOM_TAGS_INSTRUCTION,
    CUSTOM_TAGS_INSTRUCTION_ZH,
    SIMPLE_STRUCT_DOC_READER_PROMPT,
    SIMPLE_STRUCT_DOC_READER_PROMPT_ZH,
)
from memos.types.openai_chat_completion_types import File


logger = get_logger(__name__)

# Prompt dictionary for doc processing (shared by simple_struct and file_content_parser)
DOC_PROMPT_DICT = {
    "doc": {"en": SIMPLE_STRUCT_DOC_READER_PROMPT, "zh": SIMPLE_STRUCT_DOC_READER_PROMPT_ZH},
    "custom_tags": {"en": CUSTOM_TAGS_INSTRUCTION, "zh": CUSTOM_TAGS_INSTRUCTION_ZH},
}


class FileContentParser(BaseMessageParser):
    """Parser for file content parts."""

    def _get_doc_llm_response(self, chunk_text: str, custom_tags: list[str] | None = None) -> dict:
        """
        Call LLM to extract memory from document chunk.
        Uses doc prompts from DOC_PROMPT_DICT.

        Args:
            chunk_text: Text chunk to extract memory from
            custom_tags: Optional list of custom tags for LLM extraction

        Returns:
            Parsed JSON response from LLM or empty dict if failed
        """
        if not self.llm:
            logger.warning("[FileContentParser] LLM not available for fine mode")
            return {}

        lang = detect_lang(chunk_text)
        template = DOC_PROMPT_DICT["doc"][lang]
        prompt = template.replace("{chunk_text}", chunk_text)

        custom_tags_prompt = (
            DOC_PROMPT_DICT["custom_tags"][lang].replace("{custom_tags}", str(custom_tags))
            if custom_tags
            else ""
        )
        prompt = prompt.replace("{custom_tags_prompt}", custom_tags_prompt)

        messages = [{"role": "user", "content": prompt}]
        try:
            response_text = self.llm.generate(messages)
            response_json = parse_json_result(response_text)
        except Exception as e:
            logger.error(f"[FileContentParser] LLM generation error: {e}")
            response_json = {}
        return response_json

    def _handle_url(self, url_str: str, filename: str) -> tuple[str, str | None, bool]:
        """Download and parse file from URL."""
        try:
            from urllib.parse import urlparse

            import requests

            parsed_url = urlparse(url_str)
            hostname = parsed_url.hostname or ""

            response = requests.get(url_str, timeout=30)
            response.raise_for_status()
            response.encoding = "utf-8"

            if not filename:
                filename = os.path.basename(parsed_url.path) or "downloaded_file"

            if hostname in self.direct_markdown_hostnames:
                return response.text, None, True

            file_ext = os.path.splitext(filename)[1].lower()
            if file_ext in [".md", ".markdown", ".txt"]:
                return response.text, None, True
            with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_ext) as temp_file:
                temp_file.write(response.content)
                return "", temp_file.name, False
        except Exception as e:
            logger.error(f"[FileContentParser] URL processing error: {e}")
            return f"[File URL download failed: {url_str}]", None, False

    def _is_base64(self, data: str) -> bool:
        """Quick heuristic to check base64-like string."""
        return data.startswith("data:") or (
            len(data) > 100
            and all(
                c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
                for c in data[:100]
            )
        )

    def _handle_base64(self, data: str) -> str:
        """Base64 not implemented placeholder."""
        logger.info("[FileContentParser] Base64 content detected but decoding is not implemented.")
        return ""

    def _handle_local(self, data: str) -> str:
        """Local file path not supported placeholder."""
        logger.info("[FileContentParser] Local file paths are not supported in fine mode.")
        return ""

    def _process_single_image(
        self, image_url: str, original_ref: str, info: dict[str, Any], **kwargs
    ) -> tuple[str, str]:
        """
        Process a single image and return (original_ref, replacement_text).

        Args:
            image_url: URL of the image to process
            original_ref: Original markdown image reference to replace
            info: Dictionary containing user_id and session_id
            **kwargs: Additional parameters for ImageParser

        Returns:
            Tuple of (original_ref, replacement_text)
        """
        try:
            # Construct image message format for ImageParser
            image_message = {
                "type": "image_url",
                "image_url": {
                    "url": image_url,
                    "detail": "auto",
                },
            }

            # Process image using ImageParser
            logger.debug(f"[FileContentParser] Processing image: {image_url}")
            memory_items = self.image_parser.parse_fine(image_message, info, **kwargs)

            # Extract text content from memory items (only strings as requested)
            extracted_texts = []
            for item in memory_items:
                if hasattr(item, "memory") and item.memory:
                    extracted_texts.append(str(item.memory))

            if extracted_texts:
                # Combine all extracted texts
                extracted_content = "\n".join(extracted_texts)
                # Replace image with extracted content
                return (
                    original_ref,
                    f"\n[Image Content from {image_url}]:\n{extracted_content}\n",
                )
            else:
                # If no content extracted, keep original with a note
                logger.warning(f"[FileContentParser] No content extracted from image: {image_url}")
                return (
                    original_ref,
                    f"\n[Image: {image_url} - No content extracted]\n",
                )

        except Exception as e:
            logger.error(f"[FileContentParser] Error processing image {image_url}: {e}")
            # On error, keep original image reference
            return (original_ref, original_ref)

    def _extract_and_process_images(self, text: str, info: dict[str, Any], **kwargs) -> str:
        """
        Extract all images from markdown text and process them using ImageParser in parallel.
        Replaces image references with extracted text content.

        Args:
            text: Markdown text containing image references
            info: Dictionary containing user_id and session_id
            **kwargs: Additional parameters for ImageParser

        Returns:
            Text with image references replaced by extracted content
        """
        if not text or not self.image_parser:
            return text

        # Pattern to match markdown images: ![alt](url) or ![](url)
        image_pattern = r"!\[([^\]]*)\]\(([^)]+)\)"

        # Find all image matches first
        image_matches = list(re.finditer(image_pattern, text))
        if not image_matches:
            return text

        logger.info(f"[FileContentParser] Found {len(image_matches)} images to process in parallel")

        # Prepare tasks for parallel processing
        tasks = []
        for match in image_matches:
            image_url = match.group(2)
            original_ref = match.group(0)
            tasks.append((image_url, original_ref))

        # Process images in parallel
        replacements = {}
        max_workers = min(len(tasks), 10)  # Limit concurrent image processing

        with ContextThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(
                    self._process_single_image, image_url, original_ref, info, **kwargs
                ): (image_url, original_ref)
                for image_url, original_ref in tasks
            }

            # Collect results with progress tracking
            for future in tqdm(
                concurrent.futures.as_completed(futures),
                total=len(futures),
                desc="[FileContentParser] Processing images",
            ):
                try:
                    original_ref, replacement = future.result()
                    replacements[original_ref] = replacement
                except Exception as e:
                    image_url, original_ref = futures[future]
                    logger.error(f"[FileContentParser] Future failed for image {image_url}: {e}")
                    # On error, keep original image reference
                    replacements[original_ref] = original_ref

        # Replace all images in the text
        processed_text = text
        for original, replacement in replacements.items():
            processed_text = processed_text.replace(original, replacement, 1)

        # Count successfully extracted images
        success_count = sum(
            1 for replacement in replacements.values() if "Image Content from" in replacement
        )
        logger.info(
            f"[FileContentParser] Processed {len(image_matches)} images in parallel, "
            f"extracted content for {success_count} images"
        )
        return processed_text

    def __init__(
        self,
        embedder: BaseEmbedder,
        llm: BaseLLM | None = None,
        parser: Any | None = None,
        direct_markdown_hostnames: list[str] | None = None,
    ):
        """
        Initialize FileContentParser.

        Args:
            embedder: Embedder for generating embeddings
            llm: Optional LLM for fine mode processing
            parser: Optional parser for parsing file contents
            direct_markdown_hostnames: List of hostnames that should return markdown directly
                without parsing. If None, reads from FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES
                environment variable (comma-separated).
        """
        super().__init__(embedder, llm)
        self.parser = parser
        # Initialize ImageParser for processing images in markdown
        self.image_parser = ImageParser(embedder, llm) if llm else None

        # Get direct markdown hostnames from config or environment
        if direct_markdown_hostnames is not None:
            self.direct_markdown_hostnames = direct_markdown_hostnames
        else:
            env_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "")
            if env_hostnames:
                # Support comma-separated list
                self.direct_markdown_hostnames = [
                    h.strip() for h in env_hostnames.split(",") if h.strip()
                ]
            else:
                self.direct_markdown_hostnames = []

    def create_source(
        self,
        message: File,
        info: dict[str, Any],
        chunk_index: int | None = None,
        chunk_total: int | None = None,
        chunk_content: str | None = None,
        file_url_flag: bool = False,
    ) -> SourceMessage:
        """Create SourceMessage from file content part."""
        if isinstance(message, dict):
            file_info = message.get("file", {})
            source_dict = {
                "type": "file",
                "doc_path": file_info.get("filename") or file_info.get("file_id", ""),
                "content": chunk_content if chunk_content else file_info.get("file_data", ""),
                "file_info": file_info if file_url_flag else {},
            }
            # Add chunk ordering information if provided
            if chunk_index is not None:
                source_dict["chunk_index"] = chunk_index
            if chunk_total is not None:
                source_dict["chunk_total"] = chunk_total
            return SourceMessage(**source_dict)
        source_dict = {"type": "file", "doc_path": str(message)}
        if chunk_index is not None:
            source_dict["chunk_index"] = chunk_index
        if chunk_total is not None:
            source_dict["chunk_total"] = chunk_total
        if chunk_content is not None:
            source_dict["content"] = chunk_content
        return SourceMessage(**source_dict)

    def rebuild_from_source(
        self,
        source: SourceMessage,
    ) -> File:
        """Rebuild file content part from SourceMessage."""
        # Rebuild from source fields
        return {
            "type": "file",
            "file": source.file_info,
        }

    def _parse_file(self, file_info: dict[str, Any]) -> str:
        """
        Parse file content.

        Args:
            file_info: File information dictionary

        Returns:
            Parsed text content
        """
        parser = self.parser or get_parser()
        if not parser:
            logger.warning("[FileContentParser] Parser not available")
            return ""

        file_path = file_info.get("path") or file_info.get("file_id", "")
        filename = file_info.get("filename", "unknown")

        if not file_path:
            logger.warning("[FileContentParser] No file path or file_id provided")
            return f"[File: {filename}]"

        try:
            if os.path.exists(file_path):
                parsed_text = parser.parse(file_path)
                return parsed_text
            else:
                logger.warning(f"[FileContentParser] File not found: {file_path}")
                return f"[File: {filename}]"
        except Exception as e:
            logger.error(f"[FileContentParser] Error parsing file {file_path}: {e}")
            return f"[File: {filename}]"

    def parse_fast(
        self,
        message: File,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """
        Parse file content part in fast mode.

        Fast mode extracts file information and creates a memory item without parsing file content.
        Handles various file parameter scenarios:
        - file_data: base64 encoded data, URL, or plain text content
        - file_id: ID of an uploaded file
        - filename: name of the file

        Args:
            message: File content part to parse (dict with "type": "file" and "file": {...})
            info: Dictionary containing user_id and session_id
            **kwargs: Additional parameters

        Returns:
            List of TextualMemoryItem objects
        """
        if not isinstance(message, dict):
            logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
            return []

        # Extract file information
        file_info = message.get("file", {})
        if not isinstance(file_info, dict):
            logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
            return []

        # Extract file parameters (all are optional)
        file_data = file_info.get("file_data", "")
        file_id = file_info.get("file_id", "")
        filename = file_info.get("filename", "")
        file_url_flag = False
        # Build content string based on available information
        content_parts = []

        # Priority 1: If file_data is provided, use it (could be base64, URL, or plain text)
        if file_data:
            # In fast mode, we don't decode base64 or fetch URLs, just record the reference
            if isinstance(file_data, str):
                # Check if it looks like base64 (starts with data: or is long base64 string)
                if file_data.startswith("data:") or (
                    len(file_data) > 100
                    and all(
                        c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
                        for c in file_data[:100]
                    )
                ):
                    content_parts.append(f"[File Data (base64/encoded): {len(file_data)} chars]")
                # Check if it looks like a URL
                elif file_data.startswith(("http://", "https://", "file://")):
                    file_url_flag = True
                    content_parts.append(f"[File URL: {file_data}]")
                else:
                    # TODO: split into multiple memory items
                    content_parts.append(file_data)
            else:
                content_parts.append(f"[File Data: {type(file_data).__name__}]")

        # Priority 2: If file_id is provided, reference it
        if file_id:
            content_parts.append(f"[File ID: {file_id}]")

        # Priority 3: If filename is provided, include it
        if filename:
            content_parts.append(f"[Filename: {filename}]")

        # If no content can be extracted, create a placeholder
        if not content_parts:
            content_parts.append("[File: unknown]")

        # Combine content parts
        content = " ".join(content_parts)

        # Split content into chunks
        content_chunks = self._split_text(content)

        # Extract info fields
        info_ = info.copy()
        if file_id:
            info_.update({"file_id": file_id})
        user_id = info_.pop("user_id", "")
        session_id = info_.pop("session_id", "")

        # For file content parts, default to LongTermMemory
        # (since we don't have role information at this level)
        memory_type = "LongTermMemory"
        file_ids = [file_id] if file_id else []
        total_chunks = len(content_chunks)

        # Create memory items for each chunk
        content_chunk_embeddings = self.embedder.embed(content_chunks)
        memory_items = []
        for chunk_idx, chunk_text in enumerate(content_chunks):
            if not chunk_text.strip():
                continue

            # Create source for this specific chunk with its index and content
            source = self.create_source(
                message,
                info,
                chunk_index=chunk_idx,
                chunk_total=total_chunks,
                chunk_content=chunk_text,
                file_url_flag=file_url_flag,
            )

            memory_item = TextualMemoryItem(
                memory=chunk_text,
                metadata=TreeNodeTextualMemoryMetadata(
                    user_id=user_id,
                    session_id=session_id,
                    memory_type=memory_type,
                    status="activated",
                    tags=[
                        "mode:fast",
                        "multimodal:file",
                        f"chunk:{chunk_idx + 1}/{total_chunks}",
                    ],
                    key=_derive_key(chunk_text),
                    embedding=content_chunk_embeddings[chunk_idx],
                    usage=[],
                    sources=[source],
                    background="",
                    confidence=0.99,
                    type="fact",
                    info=info_,
                    file_ids=file_ids,
                ),
            )
            memory_items.append(memory_item)

        # If no chunks were created, create a placeholder
        if not memory_items:
            # Create source for placeholder (no chunk index since there are no chunks)
            placeholder_source = self.create_source(
                message,
                info,
                chunk_index=None,
                chunk_total=0,
                chunk_content=content,
                file_url_flag=file_url_flag,
            )
            memory_item = TextualMemoryItem(
                memory=content,
                metadata=TreeNodeTextualMemoryMetadata(
                    user_id=user_id,
                    session_id=session_id,
                    memory_type=memory_type,
                    status="activated",
                    tags=["mode:fast", "multimodal:file"],
                    key=_derive_key(content),
                    embedding=self.embedder.embed([content])[0],
                    usage=[],
                    sources=[placeholder_source],
                    background="",
                    confidence=0.99,
                    type="fact",
                    info=info_,
                    file_ids=file_ids,
                ),
            )
            memory_items.append(memory_item)

        return memory_items

    def parse_fine(
        self,
        message: File,
        info: dict[str, Any],
        **kwargs,
    ) -> list[TextualMemoryItem]:
        """
        Parse file content part in fine mode.
        Fine mode downloads and parses file content, especially for URLs.
        Then uses LLM to extract structured memories from each chunk.

        Handles various file parameter scenarios:
        - file_data: URL (http://, https://, or @http://), base64 encoded data, or plain text content
        - file_id: ID of an uploaded file
        - filename: name of the file

        Args:
            message: File content part to parse
            info: Dictionary containing user_id and session_id
            **kwargs: Additional parameters including:
                - custom_tags: Optional list of custom tags for LLM extraction
                - context_items: Optional list of TextualMemoryItem for context
        """
        if not isinstance(message, dict):
            logger.warning(f"[FileContentParser] Expected dict, got {type(message)}")
            return []

        # Extract file information
        file_info = message.get("file", {})
        if not isinstance(file_info, dict):
            logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}")
            return []

        # Extract file parameters (all are optional)
        file_data = file_info.get("file_data", "")
        file_id = file_info.get("file_id", "")
        filename = file_info.get("filename", "")

        # Extract custom_tags from kwargs (for LLM extraction)
        custom_tags = kwargs.get("custom_tags")

        # Use parser from utils
        parser = self.parser or get_parser()
        if not parser:
            logger.warning("[FileContentParser] Parser not available")
            return []

        parsed_text = ""
        temp_file_path = None
        is_markdown = False

        try:
            # Priority 1: If file_data is provided, process it
            if file_data:
                if isinstance(file_data, str):
                    url_str = file_data[1:] if file_data.startswith("@") else file_data

                    if url_str.startswith(("http://", "https://")):
                        parsed_text, temp_file_path, is_markdown = self._handle_url(
                            url_str, filename
                        )
                        if temp_file_path:
                            try:
                                # Use parser from utils
                                if parser:
                                    parsed_text = parser.parse(temp_file_path)
                            except Exception as e:
                                logger.error(
                                    f"[FileContentParser] Error parsing downloaded file: {e}"
                                )
                                parsed_text = f"[File parsing error: {e!s}]"

                    elif os.path.exists(file_data):
                        parsed_text = self._handle_local(file_data)

                    elif self._is_base64(file_data):
                        parsed_text = self._handle_base64(file_data)

                    else:
                        # TODO: discuss the proper place for processing
                        # string file-data
                        return []
            # Priority 2: If file_id is provided but no file_data, try to use file_id as path
            elif file_id:
                logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}")

        except Exception as e:
            logger.error(f"[FileContentParser] Error in parse_fine: {e}")

        finally:
            # Clean up temporary file
            if temp_file_path and os.path.exists(temp_file_path):
                try:
                    os.unlink(temp_file_path)
                    logger.debug(f"[FileContentParser] Cleaned up temporary file: {temp_file_path}")
                except Exception as e:
                    logger.warning(
                        f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}"
                    )
        if not parsed_text:
            return []
        # Extract and process images from parsed_text
        if is_markdown and parsed_text and self.image_parser:
            parsed_text = self._extract_and_process_images(parsed_text, info, **kwargs)

        # Extract info fields
        if not info:
            info = {}
        info_ = info.copy()
        user_id = info_.pop("user_id", "")
        session_id = info_.pop("session_id", "")
        if file_id:
            info_["file_id"] = file_id
        file_ids = [file_id] if file_id else []
        # For file content parts, default to LongTermMemory
        memory_type = "LongTermMemory"

        # Split parsed text into chunks
        content_chunks = self._split_text(parsed_text, is_markdown)

        # Filter out empty chunks and create indexed list
        valid_chunks = [
            (idx, chunk_text) for idx, chunk_text in enumerate(content_chunks) if chunk_text.strip()
        ]
        total_chunks = len(content_chunks)

        # Helper function to create memory item (similar to SimpleStructMemReader._make_memory_item)
        def _make_memory_item(
            value: str,
            mem_type: str = memory_type,
            tags: list[str] | None = None,
            key: str | None = None,
            chunk_idx: int | None = None,
            chunk_content: str | None = None,
        ) -> TextualMemoryItem:
            """Construct memory item with common fields.

            Args:
                value: Memory content (chunk text)
                mem_type: Memory type
                tags: Tags for the memory item
                key: Key for the memory item
                chunk_idx: Index of the chunk in the document (0-based)
                chunk_content: Raw chunk text stored on the source
            """
            # Create source for this specific chunk with its index and content
            chunk_source = self.create_source(
                message,
                info,
                chunk_index=chunk_idx,
                chunk_total=total_chunks,
                chunk_content=chunk_content,
            )
            return TextualMemoryItem(
                memory=value,
                metadata=TreeNodeTextualMemoryMetadata(
                    user_id=user_id,
                    session_id=session_id,
                    memory_type=mem_type,
                    status="activated",
                    tags=tags or [],
                    key=key if key is not None else _derive_key(value),
                    embedding=self.embedder.embed([value])[0],
                    usage=[],
                    sources=[chunk_source],
                    background="",
                    confidence=0.99,
                    type="fact",
                    info=info_,
                    file_ids=file_ids,
                ),
            )

        # Helper function to create fallback item for a chunk
        def _make_fallback(
            chunk_idx: int, chunk_text: str, reason: str = "raw"
        ) -> TextualMemoryItem:
            """Create fallback memory item with raw chunk text."""
            return _make_memory_item(
                value=chunk_text,
                tags=[
                    "mode:fine",
                    "multimodal:file",
                    f"fallback:{reason}",
                    f"chunk:{chunk_idx + 1}/{total_chunks}",
                ],
                chunk_idx=chunk_idx,
                chunk_content=chunk_text,
            )

        # Handle empty chunks case
        if not valid_chunks:
            return [
                _make_memory_item(
                    value=parsed_text or "[File: empty content]",
                    tags=["mode:fine", "multimodal:file"],
                    chunk_idx=None,
                )
            ]

        # If no LLM available, create memory items directly from chunks
        if not self.llm:
            return [_make_fallback(idx, text, "no_llm") for idx, text in valid_chunks]

        # Process single chunk with LLM extraction (worker function)
        def _process_chunk(chunk_idx: int, chunk_text: str) -> TextualMemoryItem:
            """Process chunk with LLM, fallback to raw on failure."""
            try:
                response_json = self._get_doc_llm_response(chunk_text, custom_tags)
                if response_json:
                    value = response_json.get("value", "").strip()
                    if value:
                        tags = response_json.get("tags", [])
                        tags = tags if isinstance(tags, list) else []
                        tags.extend(["mode:fine", "multimodal:file"])

                        llm_mem_type = response_json.get("memory_type", memory_type)
                        if llm_mem_type not in ["LongTermMemory", "UserMemory"]:
                            llm_mem_type = memory_type

                        return _make_memory_item(
                            value=value,
                            mem_type=llm_mem_type,
                            tags=tags,
                            key=response_json.get("key"),
                            chunk_idx=chunk_idx,
                            chunk_content=chunk_text,
                        )
            except Exception as e:
                logger.error(f"[FileContentParser] LLM error for chunk {chunk_idx}: {e}")

            # Fallback to raw chunk
            logger.warning(f"[FileContentParser] Fallback to raw for chunk {chunk_idx}")
            return _make_fallback(chunk_idx, chunk_text)

        # Process chunks concurrently with progress bar
        memory_items = []
        chunk_map = dict(valid_chunks)
        total_chunks = len(valid_chunks)

        logger.info(f"[FileContentParser] Processing {total_chunks} chunks with LLM...")

        with ContextThreadPoolExecutor(max_workers=20) as executor:
            futures = {
                executor.submit(_process_chunk, idx, text): idx for idx, text in valid_chunks
            }

            # Use tqdm for progress bar (similar to simple_struct.py _process_doc_data)
            for future in tqdm(
                concurrent.futures.as_completed(futures),
                total=total_chunks,
                desc="[FileContentParser] Processing chunks",
            ):
                chunk_idx = futures[future]
                try:
                    node = future.result()
                    if node:
                        memory_items.append(node)
                except Exception as e:
                    tqdm.write(f"[ERROR] Chunk {chunk_idx} failed: {e}")
                    logger.error(f"[FileContentParser] Future failed for chunk {chunk_idx}: {e}")
                    # Create fallback for failed future
                    if chunk_idx in chunk_map:
                        memory_items.append(
                            _make_fallback(chunk_idx, chunk_map[chunk_idx], "error")
                        )

        logger.info(
            f"[FileContentParser] Completed processing {len(memory_items)}/{total_chunks} chunks"
        )

        return memory_items or [
            _make_memory_item(
                value=parsed_text or "[File: empty content]",
                tags=["mode:fine", "multimodal:file"],
                chunk_idx=None,
            )
        ]
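
For orientation, here is a minimal sketch of driving the fast path above. It is illustrative only: `StubEmbedder` is a hypothetical stand-in (not part of the wheel) satisfying just the `embed()` calls `parse_fast` makes, and it assumes `BaseMessageParser` (not shown in this diff) accepts a duck-typed embedder and provides `_split_text`.

```python
# Minimal sketch, assuming memoryos 2.0.3 is installed.
from memos.mem_reader.read_multi_modal.file_content_parser import FileContentParser


class StubEmbedder:
    """Hypothetical embedder stand-in; parse_fast only needs .embed()."""

    def embed(self, texts: list[str]) -> list[list[float]]:
        # Fixed-size zero vectors keep the sketch self-contained.
        return [[0.0] * 8 for _ in texts]


parser = FileContentParser(embedder=StubEmbedder())  # llm=None: image parsing disabled

# A file content part in the shape parse_fast expects ("type": "file" + "file": {...}).
message = {
    "type": "file",
    "file": {"file_data": "https://example.com/notes.md", "filename": "notes.md"},
}
items = parser.parse_fast(message, info={"user_id": "u1", "session_id": "s1"})
for item in items:
    print(item.metadata.memory_type, item.memory)
```

Fine mode (`parse_fine`) takes the same message shape but downloads the URL, chunks the parsed text, and fans each chunk out to the LLM, so it additionally needs an `llm` at construction and optionally `custom_tags` in kwargs.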