memoryos-2.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memoryos-2.0.3.dist-info/METADATA +418 -0
- memoryos-2.0.3.dist-info/RECORD +315 -0
- memoryos-2.0.3.dist-info/WHEEL +4 -0
- memoryos-2.0.3.dist-info/entry_points.txt +3 -0
- memoryos-2.0.3.dist-info/licenses/LICENSE +201 -0
- memos/__init__.py +20 -0
- memos/api/client.py +571 -0
- memos/api/config.py +1018 -0
- memos/api/context/dependencies.py +50 -0
- memos/api/exceptions.py +53 -0
- memos/api/handlers/__init__.py +62 -0
- memos/api/handlers/add_handler.py +158 -0
- memos/api/handlers/base_handler.py +194 -0
- memos/api/handlers/chat_handler.py +1401 -0
- memos/api/handlers/component_init.py +388 -0
- memos/api/handlers/config_builders.py +190 -0
- memos/api/handlers/feedback_handler.py +93 -0
- memos/api/handlers/formatters_handler.py +237 -0
- memos/api/handlers/memory_handler.py +316 -0
- memos/api/handlers/scheduler_handler.py +497 -0
- memos/api/handlers/search_handler.py +222 -0
- memos/api/handlers/suggestion_handler.py +117 -0
- memos/api/mcp_serve.py +614 -0
- memos/api/middleware/request_context.py +101 -0
- memos/api/product_api.py +38 -0
- memos/api/product_models.py +1206 -0
- memos/api/routers/__init__.py +1 -0
- memos/api/routers/product_router.py +477 -0
- memos/api/routers/server_router.py +394 -0
- memos/api/server_api.py +44 -0
- memos/api/start_api.py +433 -0
- memos/chunkers/__init__.py +4 -0
- memos/chunkers/base.py +24 -0
- memos/chunkers/charactertext_chunker.py +41 -0
- memos/chunkers/factory.py +24 -0
- memos/chunkers/markdown_chunker.py +62 -0
- memos/chunkers/sentence_chunker.py +54 -0
- memos/chunkers/simple_chunker.py +50 -0
- memos/cli.py +113 -0
- memos/configs/__init__.py +0 -0
- memos/configs/base.py +82 -0
- memos/configs/chunker.py +59 -0
- memos/configs/embedder.py +88 -0
- memos/configs/graph_db.py +236 -0
- memos/configs/internet_retriever.py +100 -0
- memos/configs/llm.py +151 -0
- memos/configs/mem_agent.py +54 -0
- memos/configs/mem_chat.py +81 -0
- memos/configs/mem_cube.py +105 -0
- memos/configs/mem_os.py +83 -0
- memos/configs/mem_reader.py +91 -0
- memos/configs/mem_scheduler.py +385 -0
- memos/configs/mem_user.py +70 -0
- memos/configs/memory.py +324 -0
- memos/configs/parser.py +38 -0
- memos/configs/reranker.py +18 -0
- memos/configs/utils.py +8 -0
- memos/configs/vec_db.py +80 -0
- memos/context/context.py +355 -0
- memos/dependency.py +52 -0
- memos/deprecation.py +262 -0
- memos/embedders/__init__.py +0 -0
- memos/embedders/ark.py +95 -0
- memos/embedders/base.py +106 -0
- memos/embedders/factory.py +29 -0
- memos/embedders/ollama.py +77 -0
- memos/embedders/sentence_transformer.py +49 -0
- memos/embedders/universal_api.py +51 -0
- memos/exceptions.py +30 -0
- memos/graph_dbs/__init__.py +0 -0
- memos/graph_dbs/base.py +274 -0
- memos/graph_dbs/factory.py +27 -0
- memos/graph_dbs/item.py +46 -0
- memos/graph_dbs/nebular.py +1794 -0
- memos/graph_dbs/neo4j.py +1942 -0
- memos/graph_dbs/neo4j_community.py +1058 -0
- memos/graph_dbs/polardb.py +5446 -0
- memos/hello_world.py +97 -0
- memos/llms/__init__.py +0 -0
- memos/llms/base.py +25 -0
- memos/llms/deepseek.py +13 -0
- memos/llms/factory.py +38 -0
- memos/llms/hf.py +443 -0
- memos/llms/hf_singleton.py +114 -0
- memos/llms/ollama.py +135 -0
- memos/llms/openai.py +222 -0
- memos/llms/openai_new.py +198 -0
- memos/llms/qwen.py +13 -0
- memos/llms/utils.py +14 -0
- memos/llms/vllm.py +218 -0
- memos/log.py +237 -0
- memos/mem_agent/base.py +19 -0
- memos/mem_agent/deepsearch_agent.py +391 -0
- memos/mem_agent/factory.py +36 -0
- memos/mem_chat/__init__.py +0 -0
- memos/mem_chat/base.py +30 -0
- memos/mem_chat/factory.py +21 -0
- memos/mem_chat/simple.py +200 -0
- memos/mem_cube/__init__.py +0 -0
- memos/mem_cube/base.py +30 -0
- memos/mem_cube/general.py +240 -0
- memos/mem_cube/navie.py +172 -0
- memos/mem_cube/utils.py +169 -0
- memos/mem_feedback/base.py +15 -0
- memos/mem_feedback/feedback.py +1192 -0
- memos/mem_feedback/simple_feedback.py +40 -0
- memos/mem_feedback/utils.py +230 -0
- memos/mem_os/client.py +5 -0
- memos/mem_os/core.py +1203 -0
- memos/mem_os/main.py +582 -0
- memos/mem_os/product.py +1608 -0
- memos/mem_os/product_server.py +455 -0
- memos/mem_os/utils/default_config.py +359 -0
- memos/mem_os/utils/format_utils.py +1403 -0
- memos/mem_os/utils/reference_utils.py +162 -0
- memos/mem_reader/__init__.py +0 -0
- memos/mem_reader/base.py +47 -0
- memos/mem_reader/factory.py +53 -0
- memos/mem_reader/memory.py +298 -0
- memos/mem_reader/multi_modal_struct.py +965 -0
- memos/mem_reader/read_multi_modal/__init__.py +43 -0
- memos/mem_reader/read_multi_modal/assistant_parser.py +311 -0
- memos/mem_reader/read_multi_modal/base.py +273 -0
- memos/mem_reader/read_multi_modal/file_content_parser.py +826 -0
- memos/mem_reader/read_multi_modal/image_parser.py +359 -0
- memos/mem_reader/read_multi_modal/multi_modal_parser.py +252 -0
- memos/mem_reader/read_multi_modal/string_parser.py +139 -0
- memos/mem_reader/read_multi_modal/system_parser.py +327 -0
- memos/mem_reader/read_multi_modal/text_content_parser.py +131 -0
- memos/mem_reader/read_multi_modal/tool_parser.py +210 -0
- memos/mem_reader/read_multi_modal/user_parser.py +218 -0
- memos/mem_reader/read_multi_modal/utils.py +358 -0
- memos/mem_reader/simple_struct.py +912 -0
- memos/mem_reader/strategy_struct.py +163 -0
- memos/mem_reader/utils.py +157 -0
- memos/mem_scheduler/__init__.py +0 -0
- memos/mem_scheduler/analyzer/__init__.py +0 -0
- memos/mem_scheduler/analyzer/api_analyzer.py +714 -0
- memos/mem_scheduler/analyzer/eval_analyzer.py +219 -0
- memos/mem_scheduler/analyzer/mos_for_test_scheduler.py +571 -0
- memos/mem_scheduler/analyzer/scheduler_for_eval.py +280 -0
- memos/mem_scheduler/base_scheduler.py +1319 -0
- memos/mem_scheduler/general_modules/__init__.py +0 -0
- memos/mem_scheduler/general_modules/api_misc.py +137 -0
- memos/mem_scheduler/general_modules/base.py +80 -0
- memos/mem_scheduler/general_modules/init_components_for_scheduler.py +425 -0
- memos/mem_scheduler/general_modules/misc.py +313 -0
- memos/mem_scheduler/general_modules/scheduler_logger.py +389 -0
- memos/mem_scheduler/general_modules/task_threads.py +315 -0
- memos/mem_scheduler/general_scheduler.py +1495 -0
- memos/mem_scheduler/memory_manage_modules/__init__.py +5 -0
- memos/mem_scheduler/memory_manage_modules/memory_filter.py +306 -0
- memos/mem_scheduler/memory_manage_modules/retriever.py +547 -0
- memos/mem_scheduler/monitors/__init__.py +0 -0
- memos/mem_scheduler/monitors/dispatcher_monitor.py +366 -0
- memos/mem_scheduler/monitors/general_monitor.py +394 -0
- memos/mem_scheduler/monitors/task_schedule_monitor.py +254 -0
- memos/mem_scheduler/optimized_scheduler.py +410 -0
- memos/mem_scheduler/orm_modules/__init__.py +0 -0
- memos/mem_scheduler/orm_modules/api_redis_model.py +518 -0
- memos/mem_scheduler/orm_modules/base_model.py +729 -0
- memos/mem_scheduler/orm_modules/monitor_models.py +261 -0
- memos/mem_scheduler/orm_modules/redis_model.py +699 -0
- memos/mem_scheduler/scheduler_factory.py +23 -0
- memos/mem_scheduler/schemas/__init__.py +0 -0
- memos/mem_scheduler/schemas/analyzer_schemas.py +52 -0
- memos/mem_scheduler/schemas/api_schemas.py +233 -0
- memos/mem_scheduler/schemas/general_schemas.py +55 -0
- memos/mem_scheduler/schemas/message_schemas.py +173 -0
- memos/mem_scheduler/schemas/monitor_schemas.py +406 -0
- memos/mem_scheduler/schemas/task_schemas.py +132 -0
- memos/mem_scheduler/task_schedule_modules/__init__.py +0 -0
- memos/mem_scheduler/task_schedule_modules/dispatcher.py +740 -0
- memos/mem_scheduler/task_schedule_modules/local_queue.py +247 -0
- memos/mem_scheduler/task_schedule_modules/orchestrator.py +74 -0
- memos/mem_scheduler/task_schedule_modules/redis_queue.py +1385 -0
- memos/mem_scheduler/task_schedule_modules/task_queue.py +162 -0
- memos/mem_scheduler/utils/__init__.py +0 -0
- memos/mem_scheduler/utils/api_utils.py +77 -0
- memos/mem_scheduler/utils/config_utils.py +100 -0
- memos/mem_scheduler/utils/db_utils.py +50 -0
- memos/mem_scheduler/utils/filter_utils.py +176 -0
- memos/mem_scheduler/utils/metrics.py +125 -0
- memos/mem_scheduler/utils/misc_utils.py +290 -0
- memos/mem_scheduler/utils/monitor_event_utils.py +67 -0
- memos/mem_scheduler/utils/status_tracker.py +229 -0
- memos/mem_scheduler/webservice_modules/__init__.py +0 -0
- memos/mem_scheduler/webservice_modules/rabbitmq_service.py +485 -0
- memos/mem_scheduler/webservice_modules/redis_service.py +380 -0
- memos/mem_user/factory.py +94 -0
- memos/mem_user/mysql_persistent_user_manager.py +271 -0
- memos/mem_user/mysql_user_manager.py +502 -0
- memos/mem_user/persistent_factory.py +98 -0
- memos/mem_user/persistent_user_manager.py +260 -0
- memos/mem_user/redis_persistent_user_manager.py +225 -0
- memos/mem_user/user_manager.py +488 -0
- memos/memories/__init__.py +0 -0
- memos/memories/activation/__init__.py +0 -0
- memos/memories/activation/base.py +42 -0
- memos/memories/activation/item.py +56 -0
- memos/memories/activation/kv.py +292 -0
- memos/memories/activation/vllmkv.py +219 -0
- memos/memories/base.py +19 -0
- memos/memories/factory.py +42 -0
- memos/memories/parametric/__init__.py +0 -0
- memos/memories/parametric/base.py +19 -0
- memos/memories/parametric/item.py +11 -0
- memos/memories/parametric/lora.py +41 -0
- memos/memories/textual/__init__.py +0 -0
- memos/memories/textual/base.py +92 -0
- memos/memories/textual/general.py +236 -0
- memos/memories/textual/item.py +304 -0
- memos/memories/textual/naive.py +187 -0
- memos/memories/textual/prefer_text_memory/__init__.py +0 -0
- memos/memories/textual/prefer_text_memory/adder.py +504 -0
- memos/memories/textual/prefer_text_memory/config.py +106 -0
- memos/memories/textual/prefer_text_memory/extractor.py +221 -0
- memos/memories/textual/prefer_text_memory/factory.py +85 -0
- memos/memories/textual/prefer_text_memory/retrievers.py +177 -0
- memos/memories/textual/prefer_text_memory/spliter.py +132 -0
- memos/memories/textual/prefer_text_memory/utils.py +93 -0
- memos/memories/textual/preference.py +344 -0
- memos/memories/textual/simple_preference.py +161 -0
- memos/memories/textual/simple_tree.py +69 -0
- memos/memories/textual/tree.py +459 -0
- memos/memories/textual/tree_text_memory/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/organize/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/organize/handler.py +184 -0
- memos/memories/textual/tree_text_memory/organize/manager.py +518 -0
- memos/memories/textual/tree_text_memory/organize/relation_reason_detector.py +238 -0
- memos/memories/textual/tree_text_memory/organize/reorganizer.py +622 -0
- memos/memories/textual/tree_text_memory/retrieve/__init__.py +0 -0
- memos/memories/textual/tree_text_memory/retrieve/advanced_searcher.py +364 -0
- memos/memories/textual/tree_text_memory/retrieve/bm25_util.py +186 -0
- memos/memories/textual/tree_text_memory/retrieve/bochasearch.py +419 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py +270 -0
- memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py +102 -0
- memos/memories/textual/tree_text_memory/retrieve/reasoner.py +61 -0
- memos/memories/textual/tree_text_memory/retrieve/recall.py +497 -0
- memos/memories/textual/tree_text_memory/retrieve/reranker.py +111 -0
- memos/memories/textual/tree_text_memory/retrieve/retrieval_mid_structs.py +16 -0
- memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py +472 -0
- memos/memories/textual/tree_text_memory/retrieve/searcher.py +848 -0
- memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py +135 -0
- memos/memories/textual/tree_text_memory/retrieve/utils.py +54 -0
- memos/memories/textual/tree_text_memory/retrieve/xinyusearch.py +387 -0
- memos/memos_tools/dinding_report_bot.py +453 -0
- memos/memos_tools/lockfree_dict.py +120 -0
- memos/memos_tools/notification_service.py +44 -0
- memos/memos_tools/notification_utils.py +142 -0
- memos/memos_tools/singleton.py +174 -0
- memos/memos_tools/thread_safe_dict.py +310 -0
- memos/memos_tools/thread_safe_dict_segment.py +382 -0
- memos/multi_mem_cube/__init__.py +0 -0
- memos/multi_mem_cube/composite_cube.py +86 -0
- memos/multi_mem_cube/single_cube.py +874 -0
- memos/multi_mem_cube/views.py +54 -0
- memos/parsers/__init__.py +0 -0
- memos/parsers/base.py +15 -0
- memos/parsers/factory.py +21 -0
- memos/parsers/markitdown.py +28 -0
- memos/reranker/__init__.py +4 -0
- memos/reranker/base.py +25 -0
- memos/reranker/concat.py +103 -0
- memos/reranker/cosine_local.py +102 -0
- memos/reranker/factory.py +72 -0
- memos/reranker/http_bge.py +324 -0
- memos/reranker/http_bge_strategy.py +327 -0
- memos/reranker/noop.py +19 -0
- memos/reranker/strategies/__init__.py +4 -0
- memos/reranker/strategies/base.py +61 -0
- memos/reranker/strategies/concat_background.py +94 -0
- memos/reranker/strategies/concat_docsource.py +110 -0
- memos/reranker/strategies/dialogue_common.py +109 -0
- memos/reranker/strategies/factory.py +31 -0
- memos/reranker/strategies/single_turn.py +107 -0
- memos/reranker/strategies/singleturn_outmem.py +98 -0
- memos/settings.py +10 -0
- memos/templates/__init__.py +0 -0
- memos/templates/advanced_search_prompts.py +211 -0
- memos/templates/cloud_service_prompt.py +107 -0
- memos/templates/instruction_completion.py +66 -0
- memos/templates/mem_agent_prompts.py +85 -0
- memos/templates/mem_feedback_prompts.py +822 -0
- memos/templates/mem_reader_prompts.py +1096 -0
- memos/templates/mem_reader_strategy_prompts.py +238 -0
- memos/templates/mem_scheduler_prompts.py +626 -0
- memos/templates/mem_search_prompts.py +93 -0
- memos/templates/mos_prompts.py +403 -0
- memos/templates/prefer_complete_prompt.py +735 -0
- memos/templates/tool_mem_prompts.py +139 -0
- memos/templates/tree_reorganize_prompts.py +230 -0
- memos/types/__init__.py +34 -0
- memos/types/general_types.py +151 -0
- memos/types/openai_chat_completion_types/__init__.py +15 -0
- memos/types/openai_chat_completion_types/chat_completion_assistant_message_param.py +56 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_image_param.py +27 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_input_audio_param.py +23 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_param.py +43 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_refusal_param.py +16 -0
- memos/types/openai_chat_completion_types/chat_completion_content_part_text_param.py +16 -0
- memos/types/openai_chat_completion_types/chat_completion_message_custom_tool_call_param.py +27 -0
- memos/types/openai_chat_completion_types/chat_completion_message_function_tool_call_param.py +32 -0
- memos/types/openai_chat_completion_types/chat_completion_message_param.py +18 -0
- memos/types/openai_chat_completion_types/chat_completion_message_tool_call_union_param.py +15 -0
- memos/types/openai_chat_completion_types/chat_completion_system_message_param.py +36 -0
- memos/types/openai_chat_completion_types/chat_completion_tool_message_param.py +30 -0
- memos/types/openai_chat_completion_types/chat_completion_user_message_param.py +34 -0
- memos/utils.py +123 -0
- memos/vec_dbs/__init__.py +0 -0
- memos/vec_dbs/base.py +117 -0
- memos/vec_dbs/factory.py +23 -0
- memos/vec_dbs/item.py +50 -0
- memos/vec_dbs/milvus.py +654 -0
- memos/vec_dbs/qdrant.py +355 -0
memos/memories/textual/tree_text_memory/retrieve/bochasearch.py
@@ -0,0 +1,419 @@
"""BochaAI Search API retriever for tree text memory."""

import json

from concurrent.futures import as_completed
from datetime import datetime
from typing import Any

import requests

from memos.context.context import ContextThreadPoolExecutor
from memos.dependency import require_python_package
from memos.embedders.factory import OllamaEmbedder
from memos.log import get_logger
from memos.mem_reader.base import BaseMemReader
from memos.mem_reader.read_multi_modal import detect_lang
from memos.memories.textual.item import (
    SearchedTreeNodeTextualMemoryMetadata,
    SourceMessage,
    TextualMemoryItem,
)


logger = get_logger(__name__)


class BochaAISearchAPI:
    """BochaAI Search API Client"""

    def __init__(self, api_key: str, max_results: int = 20):
        """
        Initialize BochaAI Search API client.

        Args:
            api_key: BochaAI API key
            max_results: Maximum number of search results to retrieve
        """
        self.api_key = api_key
        self.max_results = max_results

        self.web_url = "https://api.bochaai.com/v1/web-search"
        self.ai_url = "https://api.bochaai.com/v1/ai-search"

        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    def search_web(
        self, query: str, summary: bool = True, freshness="noLimit", max_results=None
    ) -> list[dict]:
        """
        Perform a Web Search (equivalent to the first curl).

        Args:
            query: Search query string
            summary: Whether to include summary in the results
            freshness: Freshness filter (e.g. 'noLimit', 'day', 'week')
            max_results: Maximum number of results to retrieve (Bocha caps this at 50)

        Returns:
            A list of search result dicts
        """
        body = {
            "query": query,
            "summary": summary,
            "freshness": freshness,
            "count": max_results or self.max_results,
        }
        return self._post(self.web_url, body)

    def search_ai(
        self,
        query: str,
        answer: bool = False,
        stream: bool = False,
        freshness="noLimit",
        max_results=None,
    ) -> list[dict]:
        """
        Perform an AI Search (equivalent to the second curl).

        Args:
            query: Search query string
            answer: Whether BochaAI should generate an answer
            stream: Whether to use streaming response
            freshness: Freshness filter (e.g. 'noLimit', 'day', 'week')
            max_results: Maximum number of results to retrieve (Bocha caps this at 50)

        Returns:
            A list of search result dicts
        """
        body = {
            "query": query,
            "freshness": freshness,
            "count": max_results or self.max_results,
            "answer": answer,
            "stream": stream,
        }
        return self._post(self.ai_url, body)

    def _post(self, url: str, body: dict) -> list[dict]:
        """Send POST request and parse BochaAI search results."""
        try:
            resp = requests.post(url, headers=self.headers, json=body)
            resp.raise_for_status()
            raw_data = resp.json()

            # Parse the nested structure correctly.
            # ✅ AI Search: webpage results arrive as "source" messages with JSON content
            if "messages" in raw_data:
                results = []
                for msg in raw_data["messages"]:
                    if msg.get("type") == "source" and msg.get("content_type") == "webpage":
                        try:
                            content_json = json.loads(msg["content"])
                            results.extend(content_json.get("value", []))
                        except Exception as e:
                            logger.error(f"Failed to parse message content: {e}")
                return results

            # ✅ Web Search: results sit under data.webPages.value
            return raw_data.get("data", {}).get("webPages", {}).get("value", [])

        except Exception:
            import traceback

            logger.error(f"BochaAI search error: {traceback.format_exc()}")
            return []


class BochaAISearchRetriever:
    """BochaAI retriever that converts search results into TextualMemoryItem objects"""

    @require_python_package(
        import_name="jieba",
        install_command="pip install jieba",
        install_link="https://github.com/fxsjy/jieba",
    )
    def __init__(
        self,
        access_key: str,
        embedder: OllamaEmbedder,
        reader: BaseMemReader,
        max_results: int = 20,
    ):
        """
        Initialize BochaAI Search retriever.

        Args:
            access_key: BochaAI API key
            embedder: Embedder instance for generating embeddings
            reader: MemReader instance for processing internet content
            max_results: Maximum number of search results to retrieve
        """
        from jieba.analyse import TextRank

        self.bocha_api = BochaAISearchAPI(access_key, max_results=max_results)
        self.embedder = embedder
        self.reader = reader
        self.zh_fast_keywords_extractor = TextRank()

    def _extract_tags(self, title: str, content: str, summary: str, parsed_goal=None) -> list[str]:
        """
        Extract tags from title, content and summary.

        Args:
            title: Article title
            content: Article content
            summary: Article summary
            parsed_goal: Parsed task goal (optional)

        Returns:
            List of extracted tags
        """
        tags = []

        # Add source-based tags
        tags.append("bocha_search")
        tags.append("news")

        # Add content-based tags
        text = f"{title} {content} {summary}".lower()

        # Simple keyword-based tagging
        keywords = {
            "economy": [
                "economy",
                "GDP",
                "growth",
                "production",
                "industry",
                "investment",
                "consumption",
                "market",
                "trade",
                "finance",
            ],
            "politics": [
                "politics",
                "government",
                "policy",
                "meeting",
                "leader",
                "election",
                "parliament",
                "ministry",
            ],
            "technology": [
                "technology",
                "tech",
                "innovation",
                "digital",
                "internet",
                "AI",
                "artificial intelligence",
                "software",
                "hardware",
            ],
            "sports": [
                "sports",
                "game",
                "athlete",
                "olympic",
                "championship",
                "tournament",
                "team",
                "player",
            ],
            "culture": [
                "culture",
                "education",
                "art",
                "history",
                "literature",
                "music",
                "film",
                "museum",
            ],
            "health": [
                "health",
                "medical",
                "pandemic",
                "hospital",
                "doctor",
                "medicine",
                "disease",
                "treatment",
            ],
            "environment": [
                "environment",
                "ecology",
                "pollution",
                "green",
                "climate",
                "sustainability",
                "renewable",
            ],
        }

        for category, words in keywords.items():
            if any(word in text for word in words):
                tags.append(category)

        # Add goal-based tags if available
        if parsed_goal and hasattr(parsed_goal, "tags"):
            tags.extend(parsed_goal.tags)

        return list(set(tags))[:15]  # Limit to 15 tags

    def retrieve_from_internet(
        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
    ) -> list[TextualMemoryItem]:
        """
        Default internet retrieval (AI Search).
        This keeps the API consistent with the Xinyu and Google retrievers.

        Args:
            query: Search query
            top_k: Number of results to retrieve
            parsed_goal: Parsed task goal (optional)
            info (dict): Metadata for memory consumption tracking
            mode: "fast" builds one item per result; other modes chunk content via the reader

        Returns:
            List of TextualMemoryItem
        """
        search_results = self.bocha_api.search_ai(query, max_results=top_k)  # ✅ default path: AI Search
        return self._convert_to_mem_items(search_results, query, parsed_goal, info, mode=mode)

    def retrieve_from_web(
        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
    ) -> list[TextualMemoryItem]:
        """Explicitly retrieve using Bocha Web Search."""
        search_results = self.bocha_api.search_web(query)
        return self._convert_to_mem_items(search_results, query, parsed_goal, info, mode=mode)

    def retrieve_from_ai(
        self, query: str, top_k: int = 10, parsed_goal=None, info=None, mode="fast"
    ) -> list[TextualMemoryItem]:
        """Explicitly retrieve using Bocha AI Search."""
        search_results = self.bocha_api.search_ai(query)
        return self._convert_to_mem_items(search_results, query, parsed_goal, info, mode=mode)

    def _convert_to_mem_items(
        self, search_results: list[dict], query: str, parsed_goal=None, info=None, mode="fast"
    ):
        """Convert API search results into TextualMemoryItem objects."""
        memory_items = []
        if not info:
            info = {"user_id": "", "session_id": ""}

        with ContextThreadPoolExecutor(max_workers=8) as executor:
            futures = [
                executor.submit(self._process_result, r, query, parsed_goal, info, mode=mode)
                for r in search_results
            ]
            for future in as_completed(futures):
                try:
                    memory_items.extend(future.result())
                except Exception as e:
                    logger.error(f"Error processing BochaAI search result: {e}")

        # Deduplicate items by memory text
        unique_memory_items = {item.memory: item for item in memory_items}
        return list(unique_memory_items.values())

    def _process_result(
        self, result: dict, query: str, parsed_goal: str, info: dict[str, Any], mode="fast"
    ) -> list[TextualMemoryItem]:
        """Process one Bocha search result into TextualMemoryItem."""
        title = result.get("name", "")
        content = result.get("summary", "") or result.get("snippet", "")
        summary = result.get("summary", "") or result.get("snippet", "")
        url = result.get("url", "")
        publish_time = result.get("datePublished", "")
        site_name = result.get("siteName", "")
        site_icon = result.get("siteIcon")

        if publish_time:
            try:
                publish_time = datetime.fromisoformat(publish_time.replace("Z", "+00:00")).strftime(
                    "%Y-%m-%d"
                )
            except Exception:
                publish_time = datetime.now().strftime("%Y-%m-%d")
        else:
            publish_time = datetime.now().strftime("%Y-%m-%d")

        if mode == "fast":
            info_ = info.copy()
            user_id = info_.pop("user_id", "")
            session_id = info_.pop("session_id", "")
            lang = detect_lang(summary)
            tags = (
                self.zh_fast_keywords_extractor.textrank(summary, topK=3)[:3]
                if lang == "zh"
                else self._extract_tags(title, content, summary)[:3]
            )

            return [
                TextualMemoryItem(
                    memory=(
                        f"[Outer internet view] Title: {title}\nNewsTime:"
                        f" {publish_time}\nSummary:"
                        f" {summary}\n"
                    ),
                    metadata=SearchedTreeNodeTextualMemoryMetadata(
                        user_id=user_id,
                        session_id=session_id,
                        memory_type="OuterMemory",
                        status="activated",
                        type="fact",
                        source="web",
                        sources=[SourceMessage(type="web", url=url)] if url else [],
                        visibility="public",
                        info=info_,
                        background="",
                        confidence=0.99,
                        usage=[],
                        tags=tags,
                        key=title,
                        embedding=self.embedder.embed([content])[0],
                        internet_info={
                            "title": title,
                            "url": url,
                            "site_name": site_name,
                            "site_icon": site_icon,
                            "summary": summary,
                        },
                    ),
                )
            ]
        else:
            # Use reader to split and process the content into chunks
            read_items = self.reader.get_memory([content], type="doc", info=info)

            memory_items = []
            for read_item_i in read_items[0]:
                read_item_i.memory = (
                    f"[Outer internet view] Title: {title}\nNewsTime:"
                    f" {publish_time}\nSummary:"
                    f" {summary}\n"
                    f"Content: {read_item_i.memory}"
                )
                read_item_i.metadata.source = "web"
                read_item_i.metadata.memory_type = "OuterMemory"
                read_item_i.metadata.sources = [SourceMessage(type="web", url=url)] if url else []
                read_item_i.metadata.visibility = "public"
                read_item_i.metadata.internet_info = {
                    "title": title,
                    "url": url,
                    "site_name": site_name,
                    "site_icon": site_icon,
                    "summary": summary,
                }
                memory_items.append(read_item_i)
            return memory_items
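
For orientation, a minimal usage sketch of the retriever above. This is not part of the diff: the API key is a placeholder, and embedder and reader stand for instances assumed to be built elsewhere (e.g. via the package's embedder and mem-reader factories).

# Hypothetical sketch (not from the package): wiring up BochaAISearchRetriever.
from memos.memories.textual.tree_text_memory.retrieve.bochasearch import BochaAISearchRetriever

retriever = BochaAISearchRetriever(
    access_key="YOUR_BOCHA_API_KEY",  # placeholder credential
    embedder=embedder,  # an embedder instance, e.g. from memos.embedders.factory
    reader=reader,  # a BaseMemReader implementation
    max_results=20,
)

# "fast" mode emits one summary-level TextualMemoryItem per search result;
# any other mode routes page content through the reader for chunking.
items = retriever.retrieve_from_internet("renewable energy policy", top_k=5, mode="fast")
for item in items:
    print(item.metadata.key, item.metadata.tags)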
memos/memories/textual/tree_text_memory/retrieve/internet_retriever.py
@@ -0,0 +1,270 @@
"""Internet retrieval module for tree text memory."""

import uuid

from datetime import datetime

import requests

from memos.embedders.factory import OllamaEmbedder
from memos.memories.textual.item import (
    SourceMessage,
    TextualMemoryItem,
    TreeNodeTextualMemoryMetadata,
)


class GoogleCustomSearchAPI:
    """Google Custom Search API Client"""

    def __init__(
        self, api_key: str, search_engine_id: str, max_results: int = 20, num_per_request: int = 10
    ):
        """
        Initialize Google Custom Search API client.

        Args:
            api_key: Google API key
            search_engine_id: Search engine ID (cx parameter)
            max_results: Maximum number of results to retrieve
            num_per_request: Number of results per API request
        """
        self.api_key = api_key
        self.search_engine_id = search_engine_id
        self.max_results = max_results
        self.num_per_request = min(num_per_request, 10)  # Google API limits to 10
        self.base_url = "https://www.googleapis.com/customsearch/v1"

    def search(self, query: str, num_results: int | None = None, start_index: int = 1) -> dict:
        """
        Execute a search request.

        Args:
            query: Search query
            num_results: Number of results to return (uses config default if None)
            start_index: Starting index (default 1)

        Returns:
            Dictionary containing search results
        """
        if num_results is None:
            num_results = self.num_per_request

        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": min(num_results, self.num_per_request),
            "start": start_index,
        }

        try:
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Google search request failed: {e}")
            return {}

    def get_all_results(self, query: str, max_results: int | None = None) -> list[dict]:
        """
        Get all search results (with pagination).

        Args:
            query: Search query
            max_results: Maximum number of results (uses config default if None)

        Returns:
            List of all search results
        """
        if max_results is None:
            max_results = self.max_results

        all_results = []
        start_index = 1

        while len(all_results) < max_results:
            search_data = self.search(query, start_index=start_index)

            if not search_data or "items" not in search_data:
                break

            all_results.extend(search_data["items"])

            # Check if there are more results
            if len(search_data["items"]) < self.num_per_request:
                break

            start_index += self.num_per_request

            # Avoid infinite loop
            if start_index > 100:
                break

        return all_results[:max_results]


class InternetGoogleRetriever:
    """Internet retriever that converts search results to TextualMemoryItem format"""

    def __init__(
        self,
        api_key: str,
        search_engine_id: str,
        embedder: OllamaEmbedder,
        max_results: int = 20,
        num_per_request: int = 10,
    ):
        """
        Initialize internet retriever.

        Args:
            api_key: Google API key
            search_engine_id: Search engine ID
            embedder: Embedder instance for generating embeddings
            max_results: Maximum number of results to retrieve
            num_per_request: Number of results per API request
        """
        self.google_api = GoogleCustomSearchAPI(
            api_key, search_engine_id, max_results=max_results, num_per_request=num_per_request
        )
        self.embedder = embedder

    def retrieve_from_internet(
        self, query: str, top_k: int = 10, parsed_goal=None, info=None
    ) -> list[TextualMemoryItem]:
        """
        Retrieve information from the internet and convert it to TextualMemoryItem format.

        Args:
            query: Search query
            top_k: Number of results to return
            parsed_goal: Parsed task goal (optional)
            info (dict): Metadata for memory consumption tracking

        Returns:
            List of TextualMemoryItem
        """
        if not info:
            info = {"user_id": "", "session_id": ""}
        # Get search results
        search_results = self.google_api.get_all_results(query, max_results=top_k)

        # Convert to TextualMemoryItem format
        memory_items = []

        for _, result in enumerate(search_results):
            # Extract basic information
            title = result.get("title", "")
            snippet = result.get("snippet", "")
            link = result.get("link", "")
            display_link = result.get("displayLink", "")

            # Combine memory content
            memory_content = f"Title: {title}\nSummary: {snippet}\nSource: {link}"
            # Create metadata
            metadata = TreeNodeTextualMemoryMetadata(
                user_id=info.get("user_id", ""),
                session_id=info.get("session_id", ""),
                status="activated",
                type="fact",  # Internet search results are usually factual information
                memory_time=datetime.now().strftime("%Y-%m-%d"),
                source="web",
                confidence=85.0,  # Confidence level for internet information
                entities=self._extract_entities(title, snippet),
                tags=self._extract_tags(title, snippet, parsed_goal),
                visibility="public",
                memory_type="LongTermMemory",  # Store internet search results as long-term memory
                key=title,
                sources=[SourceMessage(type="web", url=link)] if link else [],
                embedding=self.embedder.embed([memory_content])[0],  # Embed the combined content
                created_at=datetime.now().isoformat(),
                usage=[],
                background=f"Internet search result from {display_link}",
            )

            # Create TextualMemoryItem
            memory_item = TextualMemoryItem(
                id=str(uuid.uuid4()), memory=memory_content, metadata=metadata
            )

            memory_items.append(memory_item)

        return memory_items

    def _extract_entities(self, title: str, snippet: str) -> list[str]:
        """
        Extract entities from title and snippet.

        Args:
            title: Title
            snippet: Snippet

        Returns:
            List of entities
        """
        # Simple entity extraction logic, can be improved as needed
        text = f"{title} {snippet}"
        entities = []

        # Extract possible organization names (with common suffixes)
        org_suffixes = ["Inc", "Corp", "LLC", "Ltd", "Company", "University", "Institute"]
        words = text.split()
        for i, word in enumerate(words):
            if word in org_suffixes and i > 0:
                entities.append(f"{words[i - 1]} {word}")

        # Extract possible dates
        import re

        date_pattern = r"\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}|\w+ \d{1,2}, \d{4}"
        dates = re.findall(date_pattern, text)
        entities.extend(dates)

        return entities[:5]  # Limit number of entities

    def _extract_tags(self, title: str, snippet: str, parsed_goal=None) -> list[str]:
        """
        Extract tags from title and snippet.

        Args:
            title: Title
            snippet: Snippet
            parsed_goal: Parsed task goal

        Returns:
            List of tags
        """
        tags = []

        # Extract tags from parsed goal
        if parsed_goal:
            if hasattr(parsed_goal, "topic") and parsed_goal.topic:
                tags.append(parsed_goal.topic)
            if hasattr(parsed_goal, "concept") and parsed_goal.concept:
                tags.append(parsed_goal.concept)

        # Extract keywords from text
        text = f"{title} {snippet}".lower()

        # Simple keyword extraction
        keywords = [
            "news",
            "report",
            "article",
            "study",
            "research",
            "analysis",
            "update",
            "announcement",
            "policy",
            "memo",
            "document",
        ]

        for keyword in keywords:
            if keyword in text:
                tags.append(keyword)

        # Remove duplicates and limit count
        return list(set(tags))[:10]
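
Likewise, a minimal sketch of how the Google-backed retriever above might be used. Again illustrative only, not part of the diff: the API key and engine ID are placeholders, and embedder is assumed to come from the package's embedder factory.

# Hypothetical sketch (not from the package): wiring up InternetGoogleRetriever.
from memos.memories.textual.tree_text_memory.retrieve.internet_retriever import (
    InternetGoogleRetriever,
)

retriever = InternetGoogleRetriever(
    api_key="YOUR_GOOGLE_API_KEY",  # placeholder credential
    search_engine_id="YOUR_CX_ID",  # placeholder Custom Search engine ID
    embedder=embedder,  # an embedder instance, e.g. from memos.embedders.factory
    max_results=20,
    num_per_request=10,  # the Custom Search API caps this at 10 per request
)

items = retriever.retrieve_from_internet("open source licensing", top_k=10)
for item in items:
    print(item.memory.splitlines()[0])  # the "Title: ..." line of each memory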