isage-middleware 0.2.4.3 (cp311-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
- isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
- isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
- isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
- sage/middleware/__init__.py +59 -0
- sage/middleware/_version.py +6 -0
- sage/middleware/components/__init__.py +30 -0
- sage/middleware/components/extensions_compat.py +141 -0
- sage/middleware/components/sage_db/__init__.py +116 -0
- sage/middleware/components/sage_db/backend.py +136 -0
- sage/middleware/components/sage_db/service.py +15 -0
- sage/middleware/components/sage_flow/__init__.py +76 -0
- sage/middleware/components/sage_flow/python/__init__.py +14 -0
- sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
- sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
- sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
- sage/middleware/components/sage_flow/service.py +14 -0
- sage/middleware/components/sage_mem/__init__.py +83 -0
- sage/middleware/components/sage_sias/__init__.py +59 -0
- sage/middleware/components/sage_sias/continual_learner.py +184 -0
- sage/middleware/components/sage_sias/coreset_selector.py +302 -0
- sage/middleware/components/sage_sias/types.py +94 -0
- sage/middleware/components/sage_tsdb/__init__.py +81 -0
- sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
- sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
- sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
- sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
- sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
- sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
- sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
- sage/middleware/components/sage_tsdb/service.py +17 -0
- sage/middleware/components/vector_stores/__init__.py +25 -0
- sage/middleware/components/vector_stores/chroma.py +483 -0
- sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
- sage/middleware/components/vector_stores/milvus.py +677 -0
- sage/middleware/operators/__init__.py +56 -0
- sage/middleware/operators/agent/__init__.py +24 -0
- sage/middleware/operators/agent/planning/__init__.py +5 -0
- sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
- sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
- sage/middleware/operators/agent/planning/router.py +107 -0
- sage/middleware/operators/agent/runtime.py +296 -0
- sage/middleware/operators/agentic/__init__.py +41 -0
- sage/middleware/operators/agentic/config.py +254 -0
- sage/middleware/operators/agentic/planning_operator.py +125 -0
- sage/middleware/operators/agentic/refined_searcher.py +132 -0
- sage/middleware/operators/agentic/runtime.py +241 -0
- sage/middleware/operators/agentic/timing_operator.py +125 -0
- sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
- sage/middleware/operators/context/__init__.py +17 -0
- sage/middleware/operators/context/critic_evaluation.py +16 -0
- sage/middleware/operators/context/model_context.py +565 -0
- sage/middleware/operators/context/quality_label.py +12 -0
- sage/middleware/operators/context/search_query_results.py +61 -0
- sage/middleware/operators/context/search_result.py +42 -0
- sage/middleware/operators/context/search_session.py +79 -0
- sage/middleware/operators/filters/__init__.py +26 -0
- sage/middleware/operators/filters/context_sink.py +387 -0
- sage/middleware/operators/filters/context_source.py +376 -0
- sage/middleware/operators/filters/evaluate_filter.py +83 -0
- sage/middleware/operators/filters/tool_filter.py +74 -0
- sage/middleware/operators/llm/__init__.py +18 -0
- sage/middleware/operators/llm/sagellm_generator.py +432 -0
- sage/middleware/operators/rag/__init__.py +147 -0
- sage/middleware/operators/rag/arxiv.py +331 -0
- sage/middleware/operators/rag/chunk.py +13 -0
- sage/middleware/operators/rag/document_loaders.py +23 -0
- sage/middleware/operators/rag/evaluate.py +658 -0
- sage/middleware/operators/rag/generator.py +340 -0
- sage/middleware/operators/rag/index_builder/__init__.py +48 -0
- sage/middleware/operators/rag/index_builder/builder.py +363 -0
- sage/middleware/operators/rag/index_builder/manifest.py +101 -0
- sage/middleware/operators/rag/index_builder/storage.py +131 -0
- sage/middleware/operators/rag/pipeline.py +46 -0
- sage/middleware/operators/rag/profiler.py +59 -0
- sage/middleware/operators/rag/promptor.py +400 -0
- sage/middleware/operators/rag/refiner.py +231 -0
- sage/middleware/operators/rag/reranker.py +364 -0
- sage/middleware/operators/rag/retriever.py +1308 -0
- sage/middleware/operators/rag/searcher.py +37 -0
- sage/middleware/operators/rag/types.py +28 -0
- sage/middleware/operators/rag/writer.py +80 -0
- sage/middleware/operators/tools/__init__.py +71 -0
- sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
- sage/middleware/operators/tools/arxiv_searcher.py +102 -0
- sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
- sage/middleware/operators/tools/image_captioner.py +104 -0
- sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
- sage/middleware/operators/tools/searcher_tool.py +514 -0
- sage/middleware/operators/tools/text_detector.py +185 -0
- sage/middleware/operators/tools/url_text_extractor.py +104 -0
- sage/middleware/py.typed +2 -0

sage/middleware/operators/rag/searcher.py
@@ -0,0 +1,37 @@
from typing import Any

import requests

from sage.common.core.functions import MapFunction as MapOperator


class BochaWebSearch(MapOperator):
    def __init__(self, config: dict[str, Any], **kwargs):
        super().__init__(**kwargs)
        self.api_key = config.get("api_key")
        self.count = config.get("count", 10)
        self.page = config.get("page", 1)
        self.summary = config.get("summary", True)
        self.url = "https://api.bochaai.com/v1/web-search"

        if not self.api_key:
            raise ValueError("BochaWebSearch requires an 'api_key' in config.")

    def execute(self, data: str) -> dict[str, Any]:
        query = data
        headers = {"Authorization": self.api_key, "Content-Type": "application/json"}
        payload = {
            "query": query,
            "summary": self.summary,
            "count": self.count,
            "page": self.page,
        }

        try:
            # A timeout keeps a stalled connection from blocking the pipeline.
            response = requests.post(self.url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            self.logger.error(f"BochaWebSearch error: {e}", exc_info=True)
            return {}  # Return empty dict on error
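
A minimal usage sketch for the operator above, outside a full SAGE pipeline; the api_key value is a placeholder and the pipeline wiring is omitted:

# Hedged usage sketch; "sk-..." is a placeholder, not a real key.
config = {"api_key": "sk-...", "count": 5, "summary": True}
searcher = BochaWebSearch(config)

# execute() takes the query string and returns the raw JSON response
# from the Bocha web-search API, or {} on any error (see above).
result = searcher.execute("retrieval-augmented generation")
print(result or "search failed")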

sage/middleware/operators/rag/types.py
@@ -0,0 +1,28 @@
"""Compatibility shim for RAG type definitions.

Import from ``sage.libs.rag.types`` instead of middleware.
"""

from sage.libs.rag.types import (  # noqa: F401
    RAGDocument,
    RAGInput,
    RAGOutput,
    RAGQuery,
    RAGResponse,
    create_rag_response,
    ensure_rag_response,
    extract_query,
    extract_results,
)

__all__ = [
    "RAGDocument",
    "RAGQuery",
    "RAGResponse",
    "RAGInput",
    "RAGOutput",
    "ensure_rag_response",
    "extract_query",
    "extract_results",
    "create_rag_response",
]
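
Since the shim above only re-exports names, both import paths resolve to the same objects; a quick check (a sketch, assuming both packages are importable):

from sage.libs.rag.types import RAGDocument as CanonicalDoc
from sage.middleware.operators.rag.types import RAGDocument as ShimDoc

# The shim adds no wrapper classes, so identity (not just equality) holds.
assert ShimDoc is CanonicalDoc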

sage/middleware/operators/rag/writer.py
@@ -0,0 +1,80 @@
from sage.common.core.functions import MapFunction as MapOperator


class MemoryWriter(MapOperator):
    def __init__(self, config: dict, **kwargs):
        super().__init__(config, **kwargs)
        self.state = None
        self.config = config
        # Initialize the registry of enabled memory collections
        self.collections = {}

        # Configure STM (short-term memory)
        if self.config.get("stm", False):
            stm_config = self.config.get("stm_config", {})
            self.collections["stm"] = {
                "collection": self.config.get("stm_collection"),
                "config": stm_config,
            }

        # Configure LTM (long-term memory)
        if self.config.get("ltm", False):
            ltm_config = self.config.get("ltm_config", {})
            self.collections["ltm"] = {
                "collection": self.config.get("ltm_collection"),
                "config": ltm_config,
            }

        # Configure DCM
        if self.config.get("dcm", False):
            dcm_config = self.config.get("dcm_config", {})
            self.collections["dcm"] = {
                "collection": self.config.get("dcm_collection"),
                "config": dcm_config,
            }
        # TODO: add state management to runtime_context
        # Issue URL: https://github.com/intellistream/SAGE/issues/235

    def execute(self, data: str | list[str] | tuple[str, str]):
        input_data = data

        # Normalize the input into a list of strings
        processed_data = []
        if isinstance(input_data, list):
            processed_data = input_data
        elif isinstance(input_data, tuple) and len(input_data) == 2:
            processed_data = [f"{input_data[0]}{input_data[1]}"]  # concatenate the pair
        elif isinstance(input_data, str):
            processed_data = [input_data]
        else:
            self.logger.error(f"Unsupported data type: {type(input_data)}")
            return data

        # Write to every enabled collection
        for mem_type, settings in self.collections.items():
            collection = settings["collection"]
            config = settings["config"]
            if not collection:
                self.logger.warning(f"{mem_type.upper()} collection not initialized")
                continue

            try:
                # TODO: this should really have the writer function push the data
                # to the memory manager function; once the memory manager receives
                # it, it runs its `execute` method to perform the memory read/write.
                # Scheduling may introduce a blocking point here -- it can be
                # optimized; see MorphStream!
                if self.state is not None:
                    self.state.store(
                        collection=collection,
                        documents=processed_data,
                        collection_config=config,
                    )
                    self.logger.debug(f"Stored {len(processed_data)} chunks to {mem_type.upper()}")
                else:
                    self.logger.warning(
                        f"State manager not initialized. Cannot store to {mem_type.upper()}. "
                        "See TODO: https://github.com/intellistream/SAGE/issues/235"
                    )
            except Exception as e:
                self.logger.error(f"Failed to store to {mem_type.upper()}: {e}")

        return data  # return the original data unchanged
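
MemoryWriter only assumes that self.state exposes a store(collection=..., documents=..., collection_config=...) method; the real state manager is still pending (issue #235). A hypothetical in-memory stand-in that satisfies that interface, e.g. for tests, assuming the MapFunction base accepts the config positionally as the constructor above does:

# Hypothetical stand-in for the not-yet-implemented state manager (issue #235);
# only the store() signature is taken from MemoryWriter above.
class InMemoryState:
    def __init__(self):
        self._store: dict[str, list[str]] = {}

    def store(self, collection: str, documents: list[str], collection_config: dict):
        # collection_config is accepted but ignored in this sketch.
        self._store.setdefault(collection, []).extend(documents)


writer = MemoryWriter({"stm": True, "stm_collection": "stm_default"})
writer.state = InMemoryState()
writer.execute(["chunk one", "chunk two"])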

sage/middleware/operators/tools/__init__.py
@@ -0,0 +1,71 @@
"""
Tool Operators

This module contains domain-specific tool operators:
- Search tools (web search, document search)
- Data extraction tools

These operators inherit from base operator classes in sage.kernel.operators
and implement tool-specific business logic.

Note: Some tools require heavy dependencies (torch, transformers).
They are loaded lazily; if the dependencies are missing, a warning is
emitted and the names resolve to None.
"""

import warnings
from typing import TYPE_CHECKING

# Core tools (minimal dependencies)
from sage.middleware.operators.tools.arxiv_paper_searcher import _Searcher_Tool
from sage.middleware.operators.tools.arxiv_searcher import ArxivSearcher
from sage.middleware.operators.tools.nature_news_fetcher import Nature_News_Fetcher_Tool
from sage.middleware.operators.tools.searcher_tool import BochaSearchTool
from sage.middleware.operators.tools.url_text_extractor import URL_Text_Extractor_Tool

# Heavy tools (require torch/transformers) - lazy load.
# NOTE: do not pre-bind ImageCaptioner/text_detector to None at module level:
# PEP 562's module __getattr__ only fires when normal attribute lookup fails,
# so a pre-assigned name would permanently shadow the lazy loader.
_HEAVY_TOOLS_LOADED = False

if TYPE_CHECKING:  # real names for static type checkers only
    from sage.middleware.operators.tools.image_captioner import ImageCaptioner
    from sage.middleware.operators.tools.text_detector import text_detector


def _load_heavy_tools():
    """Load tools that require torch/transformers."""
    global _HEAVY_TOOLS_LOADED, ImageCaptioner, text_detector
    if _HEAVY_TOOLS_LOADED:
        return
    try:
        from sage.middleware.operators.tools.image_captioner import ImageCaptioner as _IC
        from sage.middleware.operators.tools.text_detector import text_detector as _TD

        ImageCaptioner = _IC
        text_detector = _TD
        _HEAVY_TOOLS_LOADED = True
    except ImportError as e:
        ImageCaptioner = None  # type: ignore[assignment]
        text_detector = None  # type: ignore[assignment]
        warnings.warn(
            f"Heavy tool operators not available: {e}\n"
            "Install with: pip install torch transformers",
            UserWarning,
            stacklevel=2,
        )


def __getattr__(name: str):
    """Lazily load heavy tools on first access (PEP 562)."""
    if name in ("ImageCaptioner", "text_detector"):
        _load_heavy_tools()
        if name == "ImageCaptioner":
            return ImageCaptioner
        if name == "text_detector":
            return text_detector
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = [
    "BochaSearchTool",
    "_Searcher_Tool",
    "ArxivSearcher",
    "Nature_News_Fetcher_Tool",
    "ImageCaptioner",
    "text_detector",
    "URL_Text_Extractor_Tool",
]
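
The lazy-export trick above is PEP 562's module-level __getattr__; reduced to a self-contained sketch (heavy_dep is a hypothetical module name):

# lazy_demo.py -- minimal PEP 562 lazy export; ``heavy_dep`` is hypothetical.
def __getattr__(name: str):
    if name == "HeavyThing":
        import heavy_dep  # deferred until first attribute access

        return heavy_dep.HeavyThing
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

# Elsewhere: ``from lazy_demo import HeavyThing`` pays the import cost only
# when (and if) the name is actually used.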

sage/middleware/operators/tools/arxiv_paper_searcher.py
@@ -0,0 +1,175 @@
import logging
import re

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

from sage.libs.foundation.tools.tool import BaseTool


class _Searcher_Tool(BaseTool):
    def __init__(self):
        super().__init__(
            tool_name="_Searcher_Tool",
            tool_description="A tool that searches arXiv for papers based on a given query.",
            input_types={
                "query": "str - The search query for arXiv papers.",
                "size": "int - The number of results per page (25, 50, 100, or 200). If None, use 25.",
                "max_results": "int - The maximum number of papers to return (default: 25). Should be less than or equal to 100.",
            },
            output_type="list - A list of dictionaries containing paper information.",
            demo_commands=[
                {
                    "command": 'execution = tool.execute(query="tool agents with large language models")',
                    "description": "Search for papers about tool agents with large language models.",
                },
                {
                    "command": 'execution = tool.execute(query="quantum computing", size=100, max_results=50)',
                    "description": "Search for quantum computing papers, with 100 results per page, returning a maximum of 50 papers.",
                },
                {
                    "command": 'execution = tool.execute(query="machine learning", max_results=75)',
                    "description": "Search for machine learning papers, returning a maximum of 75 papers.",
                },
            ],
        )
        # Store additional metadata as instance variables
        self.tool_version = "1.0.0"
        self.valid_sizes = [25, 50, 100, 200]
        self.base_url = "https://arxiv.org/search/"

    def build_tool(self):
        """No specific build step is required for this tool."""
        pass

    def execute(self, query, size=None, max_results=25):
        """
        Executes the arXiv search tool to find papers based on the given query.

        Parameters:
            query (str): The search query for arXiv papers.
            size (int): The number of results per page.
            max_results (int): The maximum number of papers to return.

        Returns:
            list: A list of dictionaries containing paper information.
        """
        valid_sizes = self.valid_sizes
        base_url = self.base_url

        if size is None:
            size = 25
        elif size not in valid_sizes:
            # Snap an arbitrary size to the nearest value arXiv accepts
            size = min(valid_sizes, key=lambda x: abs(x - size))

        results = []
        start = 0

        max_results = min(max_results, 100)  # NOTE: For traffic reasons, limit to 100 results

        while len(results) < max_results:
            params = {
                "searchtype": "all",
                "query": query,
                "abstracts": "show",
                "order": "",
                "size": str(size),
                "start": str(start),
            }

            try:
                response = requests.get(base_url, params=params, timeout=30)
                soup = BeautifulSoup(response.content, "html.parser")

                papers = soup.find_all("li", class_="arxiv-result")  # type: ignore
                if not papers:
                    break

                for paper in papers:
                    if len(results) >= max_results:
                        break

                    title_elem = paper.find("p", class_="title")  # type: ignore
                    title = title_elem.text.strip() if title_elem else "No title found"

                    authors_elem = paper.find("p", class_="authors")  # type: ignore
                    authors = authors_elem.text.strip() if authors_elem else "No authors found"
                    authors = re.sub(r"^Authors:\s*", "", authors)
                    authors = re.sub(r"\s+", " ", authors).strip()

                    abstract_elem = paper.find("span", class_="abstract-full")  # type: ignore
                    abstract = (
                        abstract_elem.text.strip() if abstract_elem else "No abstract available"
                    )
                    abstract = abstract.replace("△ Less", "").strip()

                    link_elem = paper.find("p", class_="list-title")  # type: ignore
                    link_tag = link_elem.find("a") if isinstance(link_elem, Tag) else None  # type: ignore
                    link = (
                        link_tag["href"]
                        if isinstance(link_tag, Tag) and link_tag.has_attr("href")
                        else "No link found"
                    )

                    results.append(
                        {
                            "title": title,
                            "authors": authors,
                            "abstract": abstract,
                            "link": link,
                        }
                    )

                start += size

            except Exception as e:
                logging.error(f"Error searching arXiv: {e}")
                break

        return results[:max_results]

    def get_metadata(self):
        """
        Returns the metadata for the _Searcher_Tool.

        Returns:
            dict: A dictionary containing the tool's metadata.
        """
        return super().get_metadata()


if __name__ == "__main__":
    import json

    print("ArXiv Search Tool Test")

    # Example usage of the _Searcher_Tool
    tool = _Searcher_Tool()

    # Get tool metadata
    metadata = tool.get_metadata()
    print("Tool Metadata:")
    print(metadata)

    # Sample query for searching arXiv
    query = ""
    # Execute the tool
    try:
        execution = tool.execute(query=query, size=50, max_results=10)
        print("\n==>> Execution:")
        print(json.dumps(execution, indent=4))  # Pretty print JSON
        print("\n==>> Search Results:")
        for i, paper in enumerate(execution, 1):
            print(f"{i}. {paper['title']}")
            print(f"   Authors: {paper['authors']}")
            print(f"   Abstract: {paper['abstract'][:2000]}")
            print(f"   Link: {paper['link']}")
            print()
    except Exception as e:
        print(f"Execution failed: {e}")

    print("Done!")
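
The scraper above keys off arXiv's listing markup (li.arxiv-result, p.title, span.abstract-full); here is the same extraction against a hand-written snippet, to make the selectors concrete (the HTML is illustrative, not captured from arxiv.org):

from bs4 import BeautifulSoup

# Illustrative markup mimicking one arxiv-result entry; not real page output.
html = """
<li class="arxiv-result">
  <p class="list-title"><a href="https://arxiv.org/abs/0000.00000">arXiv:0000.00000</a></p>
  <p class="title">A Sample Paper</p>
  <p class="authors">Authors: A. Author, B. Writer</p>
  <span class="abstract-full">An abstract. △ Less</span>
</li>
"""

paper = BeautifulSoup(html, "html.parser").find("li", class_="arxiv-result")
print(paper.find("p", class_="title").text.strip())             # A Sample Paper
print(paper.find("span", class_="abstract-full").text.strip())  # An abstract. △ Less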

sage/middleware/operators/tools/arxiv_searcher.py
@@ -0,0 +1,102 @@
"""
Arxiv paper search tool (real implementation).
"""

import asyncio
import logging
import urllib.parse
from typing import Any

import aiohttp
import feedparser

from sage.libs.foundation.tools.tool import BaseTool

logger = logging.getLogger(__name__)


class ArxivSearcher(BaseTool):
    """Arxiv academic paper search tool."""

    def __init__(self):
        super().__init__(
            tool_name="arxiv_searcher",
            tool_description="Search Arxiv for academic papers. Returns title, authors, summary, and link.",
            input_types=["str"],
            output_type="list",
            demo_commands=["search for transformer papers", "find papers about LLM agents"],
            require_llm_engine=False,
        )
        self.base_url = "http://export.arxiv.org/api/query"

    async def execute(self, query: str, max_results: int = 5) -> list[dict[str, Any]]:
        """Execute an Arxiv search."""
        logger.info(f"Searching Arxiv for: {query}")

        # Construct the API query, e.g.
        # search_query=all:electron&start=0&max_results=10
        params = {
            "search_query": f"all:{query}",
            "start": 0,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending",
        }

        url = f"{self.base_url}?{urllib.parse.urlencode(params)}"

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        logger.error(f"Arxiv API failed with status {response.status}")
                        return []

                    content = await response.text()

            # Parse with feedparser
            feed = feedparser.parse(content)

            results = []
            for entry in feed.entries:
                paper = {
                    "title": entry.title.replace("\n", " ").strip(),
                    "authors": [author.name for author in entry.authors],
                    "summary": entry.summary.replace("\n", " ").strip(),
                    "published": entry.published,
                    "link": entry.link,
                    # Some <link> elements carry no title attribute, so use .get()
                    "pdf_link": next(
                        (link.href for link in entry.links if link.get("title") == "pdf"), None
                    ),
                }
                results.append(paper)

            logger.info(f"Found {len(results)} papers")
            return results

        except Exception as e:
            logger.error(f"Arxiv search failed: {e}")
            return []

    def call(self, arguments: dict) -> Any:
        """Sync wrapper for MCP."""
        query = arguments.get("query")
        if not query:
            return []

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop in this thread (e.g. AgentRuntime invoked via the
            # Gateway's run_in_executor): safe to start one.
            return asyncio.run(self.execute(query))

        # A loop is already running in this thread; asyncio.run() would raise,
        # so run the coroutine on a fresh loop in a worker thread instead.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, self.execute(query)).result()
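
The call wrapper above has to bridge synchronous callers to the async execute without tripping over an already-running event loop; the pattern, reduced to its core (a sketch with a stand-in coroutine):

import asyncio
from concurrent.futures import ThreadPoolExecutor


async def work() -> str:  # stand-in for ArxivSearcher.execute
    return "done"


def run_sync(coro):
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(coro)  # no loop here: own this thread's loop
    # Inside a running loop, asyncio.run() raises; delegate to a worker thread.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


print(run_sync(work()))  # prints "done" in either context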

sage/middleware/operators/tools/duckduckgo_searcher.py
@@ -0,0 +1,105 @@
"""
DuckDuckGo web search tool (no API key required).
"""

from __future__ import annotations

import asyncio
import logging
from typing import Any

import aiohttp
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field

from sage.libs.foundation.tools.tool import BaseTool

logger = logging.getLogger(__name__)


class DuckDuckGoSearchInput(BaseModel):
    query: str = Field(..., description="Search query text")
    max_results: int = Field(5, description="Number of results to return", ge=1, le=20)


class DuckDuckGoSearcher(BaseTool):
    """Simple HTML-based DuckDuckGo searcher.

    Uses the public HTML endpoint (no API key) and extracts title/link/snippet.
    Intended as a lightweight research fallback when no commercial search API is configured.
    """

    def __init__(self):
        super().__init__(
            tool_name="duckduckgo_search",
            tool_description="Search the web via DuckDuckGo (HTML endpoint). Returns title, link, and snippet.",
            input_types={"query": "str - search query", "max_results": "int - number of results"},
            output_type="list",
            demo_commands=[
                "search for latest vector database papers",
                "find recent ML system posts",
            ],
            require_llm_engine=False,
        )

    async def execute(self, query: str, max_results: int = 5) -> list[dict[str, Any]]:
        url = "https://duckduckgo.com/html"
        params = {"q": query, "kl": "us-en"}

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(url, data=params, timeout=15) as resp:
                    if resp.status != 200:
                        logger.warning("DuckDuckGo returned status %s", resp.status)
                        return []
                    html = await resp.text()
        except Exception as exc:  # noqa: BLE001
            logger.error("DuckDuckGo search failed: %s", exc)
            return []

        soup = BeautifulSoup(html, "html.parser")
        results: list[dict[str, Any]] = []

        for result in soup.select("div.result"):
            if len(results) >= max_results:
                break

            link_tag = result.select_one("a.result__a")
            snippet_tag = result.select_one("a.result__snippet") or result.select_one(
                "div.result__snippet"
            )

            title = link_tag.get_text(strip=True) if link_tag else ""
            href = link_tag.get("href") if link_tag else ""
            snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""

            if not href:
                continue

            results.append(
                {
                    "title": title,
                    "link": href,
                    "content": snippet,
                    "source": "duckduckgo",
                }
            )

        return results

    def call(self, arguments: dict) -> Any:
        """Sync wrapper used by MCP/AgentRuntime."""
        query = arguments.get("query")
        if not query:
            return []

        max_results = arguments.get("max_results", 5)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop in this thread: safe to create one.
            return asyncio.run(self.execute(query, max_results=max_results))

        # asyncio.run() would raise inside a running loop; use a worker thread.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(
                asyncio.run, self.execute(query, max_results=max_results)
            ).result()