isage_data-0.2.1.8-cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_data-0.2.1.8.dist-info/METADATA +135 -0
- isage_data-0.2.1.8.dist-info/RECORD +132 -0
- isage_data-0.2.1.8.dist-info/WHEEL +5 -0
- isage_data-0.2.1.8.dist-info/entry_points.txt +2 -0
- isage_data-0.2.1.8.dist-info/licenses/LICENSE +21 -0
- isage_data-0.2.1.8.dist-info/top_level.txt +1 -0
- sage/data/__init__.py +37 -0
- sage/data/__init__.pyc +0 -0
- sage/data/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/data/__pycache__/__init__.cpython-312.pyc +0 -0
- sage/data/__pycache__/cli.cpython-311.pyc +0 -0
- sage/data/__pycache__/cli.cpython-312.pyc +0 -0
- sage/data/__pycache__/manager.cpython-311.pyc +0 -0
- sage/data/__pycache__/manager.cpython-312.pyc +0 -0
- sage/data/cli.pyc +0 -0
- sage/data/manager.pyc +0 -0
- sage/data/sources/__init__.py +13 -0
- sage/data/sources/__init__.pyc +0 -0
- sage/data/sources/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/data/sources/__pycache__/__init__.cpython-312.pyc +0 -0
- sage/data/sources/agent_benchmark/__init__.py +35 -0
- sage/data/sources/agent_benchmark/__init__.pyc +0 -0
- sage/data/sources/agent_benchmark/dataloader.pyc +0 -0
- sage/data/sources/agent_benchmark/dataset.yaml +44 -0
- sage/data/sources/agent_benchmark/external_benchmarks/__init__.py +32 -0
- sage/data/sources/agent_benchmark/external_benchmarks/__init__.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/converters.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/download_all.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/download_apibank.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/download_bfcl.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/download_toolalpaca.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/download_toolbench.pyc +0 -0
- sage/data/sources/agent_benchmark/external_benchmarks/loader.pyc +0 -0
- sage/data/sources/agent_benchmark/fix_tool_references.pyc +0 -0
- sage/data/sources/agent_benchmark/generate_data.pyc +0 -0
- sage/data/sources/agent_benchmark/prepare_planning_data.pyc +0 -0
- sage/data/sources/agent_benchmark/prepare_runtime_data.pyc +0 -0
- sage/data/sources/agent_benchmark/prepare_timing_data.pyc +0 -0
- sage/data/sources/agent_benchmark/test_integration.py +94 -0
- sage/data/sources/agent_benchmark/tests/test_agent_benchmark_loader.py +353 -0
- sage/data/sources/agent_benchmark/validate_cross_task.pyc +0 -0
- sage/data/sources/agent_benchmark/validate_data.pyc +0 -0
- sage/data/sources/agent_sft/__init__.py +10 -0
- sage/data/sources/agent_sft/__init__.pyc +0 -0
- sage/data/sources/agent_sft/data/generate_data.pyc +0 -0
- sage/data/sources/agent_sft/data/prompts_template.yaml +75 -0
- sage/data/sources/agent_sft/dataloader.pyc +0 -0
- sage/data/sources/agent_sft/dataset.yaml +9 -0
- sage/data/sources/agent_sft/fix_tool_ids.pyc +0 -0
- sage/data/sources/agent_sft/schemas.pyc +0 -0
- sage/data/sources/agent_sft/tests/test_agent_sft_loader.py +316 -0
- sage/data/sources/agent_tools/__init__.py +6 -0
- sage/data/sources/agent_tools/__init__.pyc +0 -0
- sage/data/sources/agent_tools/dataloader.pyc +0 -0
- sage/data/sources/agent_tools/dataset.yaml +9 -0
- sage/data/sources/agent_tools/generate_tools.pyc +0 -0
- sage/data/sources/agent_tools/schemas.pyc +0 -0
- sage/data/sources/agent_tools/test_integration.py +108 -0
- sage/data/sources/agent_tools/tests/test_agent_tools_loader.py +306 -0
- sage/data/sources/agent_tools/validate_data.pyc +0 -0
- sage/data/sources/bbh/__init__.py +5 -0
- sage/data/sources/bbh/__init__.pyc +0 -0
- sage/data/sources/bbh/dataloader.pyc +0 -0
- sage/data/sources/bbh/dataset.yaml +9 -0
- sage/data/sources/control_plane_benchmark/__init__.py +41 -0
- sage/data/sources/control_plane_benchmark/__init__.pyc +0 -0
- sage/data/sources/control_plane_benchmark/dataloader.pyc +0 -0
- sage/data/sources/control_plane_benchmark/dataset.yaml +101 -0
- sage/data/sources/gpqa/__init__.py +5 -0
- sage/data/sources/gpqa/__init__.pyc +0 -0
- sage/data/sources/gpqa/dataloader.pyc +0 -0
- sage/data/sources/gpqa/dataset.yaml +10 -0
- sage/data/sources/libamm_benchmark/__init__.py +10 -0
- sage/data/sources/libamm_benchmark/__init__.pyc +0 -0
- sage/data/sources/libamm_benchmark/dataset.yaml +9 -0
- sage/data/sources/locomo/__init__.py +5 -0
- sage/data/sources/locomo/__init__.pyc +0 -0
- sage/data/sources/locomo/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/data/sources/locomo/__pycache__/__init__.cpython-312.pyc +0 -0
- sage/data/sources/locomo/__pycache__/dataloader.cpython-311.pyc +0 -0
- sage/data/sources/locomo/__pycache__/dataloader.cpython-312.pyc +0 -0
- sage/data/sources/locomo/__pycache__/download.cpython-311.pyc +0 -0
- sage/data/sources/locomo/dataloader.pyc +0 -0
- sage/data/sources/locomo/dataset.yaml +10 -0
- sage/data/sources/locomo/download.pyc +0 -0
- sage/data/sources/locomo/locomo10.json +66751 -0
- sage/data/sources/longmemeval/__init__.py +5 -0
- sage/data/sources/longmemeval/__init__.pyc +0 -0
- sage/data/sources/longmemeval/compose.pyc +0 -0
- sage/data/sources/longmemeval/config/longmemeval_groups.yaml +15 -0
- sage/data/sources/longmemeval/dataloader.pyc +0 -0
- sage/data/sources/longmemeval/dataset.yaml +9 -0
- sage/data/sources/longmemeval/download.pyc +0 -0
- sage/data/sources/memagentbench/Conflict_Resolution.parquet +0 -0
- sage/data/sources/memagentbench/__init__.py +16 -0
- sage/data/sources/memagentbench/__init__.pyc +0 -0
- sage/data/sources/memagentbench/__pycache__/__init__.cpython-312.pyc +0 -0
- sage/data/sources/memagentbench/__pycache__/conflict_resolution_loader.cpython-312.pyc +0 -0
- sage/data/sources/memagentbench/__pycache__/download.cpython-312.pyc +0 -0
- sage/data/sources/memagentbench/conflict_resolution_loader.pyc +0 -0
- sage/data/sources/memagentbench/conflict_resolution_loader_test.py +169 -0
- sage/data/sources/memagentbench/dataset.yaml +10 -0
- sage/data/sources/memagentbench/download.pyc +0 -0
- sage/data/sources/mmlu/__init__.py +5 -0
- sage/data/sources/mmlu/__init__.pyc +0 -0
- sage/data/sources/mmlu/dataloader.pyc +0 -0
- sage/data/sources/mmlu/dataset.yaml +10 -0
- sage/data/sources/mmlu/download.pyc +0 -0
- sage/data/sources/orca_dpo/__init__.py +5 -0
- sage/data/sources/orca_dpo/__init__.pyc +0 -0
- sage/data/sources/orca_dpo/dataloader.pyc +0 -0
- sage/data/sources/qa_base/__init__.py +5 -0
- sage/data/sources/qa_base/__init__.pyc +0 -0
- sage/data/sources/qa_base/dataloader.pyc +0 -0
- sage/data/sources/qa_base/dataset.yaml +9 -0
- sage/data/sources/qa_base/qa_knowledge_base.txt +35 -0
- sage/data/sources/qa_base/qa_knowledge_chromaDB.txt +13 -0
- sage/data/sources/qa_base/sample/one_question.txt +1 -0
- sage/data/sources/qa_base/sample/question.txt +352 -0
- sage/data/sources/qa_base/sample/question1.txt +1 -0
- sage/data/usages/__init__.py +3 -0
- sage/data/usages/__init__.pyc +0 -0
- sage/data/usages/agent_eval/__init__.py +191 -0
- sage/data/usages/agent_eval/__init__.pyc +0 -0
- sage/data/usages/agent_eval/config.yaml +15 -0
- sage/data/usages/agent_eval/profiles/full_eval.yaml +15 -0
- sage/data/usages/agent_eval/profiles/quick_eval.yaml +11 -0
- sage/data/usages/agent_eval/profiles/sft_training.yaml +12 -0
- sage/data/usages/agent_eval/usage.yaml +8 -0
- sage/data/usages/libamm/config.yaml +13 -0
- sage/data/usages/neuromem/config.yaml +5 -0
- sage/data/usages/rag/config.yaml +9 -0
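Each source directory above pairs a dataloader module with a dataset.yaml manifest, and the usages layer adds per-application configs. As a minimal consumer sketch — only AgentToolsDataLoader and the methods exercised in the test hunk below are confirmed by this diff; treat everything else as illustrative:

from sage.data.sources.agent_tools import AgentToolsDataLoader

# Load the bundled tool catalog and run a capability search
# (method names come from the tests shipped in this wheel).
loader = AgentToolsDataLoader()
print(f"{len(loader)} tools across {len(loader.get_categories())} categories")
for tool in loader.search_by_capability("forecast", top_k=3):
    print(tool.tool_id, tool.name, tool.category)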
sage/data/sources/agent_tools/tests/test_agent_tools_loader.py
@@ -0,0 +1,306 @@
+"""
+Unit tests for Agent Tools DataLoader
+
+Tests cover:
+- Data loading and validation
+- Tool ID format validation
+- Search and retrieval operations
+- Category indexing
+- Deduplication checks
+- Coverage metrics
+"""
+
+import re
+
+import pytest
+
+from sage.data.sources.agent_tools import AgentToolRecord, AgentToolsDataLoader
+
+
+class TestAgentToolsDataLoader:
+    """Test suite for AgentToolsDataLoader."""
+
+    @pytest.fixture
+    def loader(self):
+        """Create a loader instance for testing."""
+        return AgentToolsDataLoader()
+
+    def test_loader_initialization(self, loader):
+        """Test that loader initializes successfully."""
+        assert loader is not None
+        assert len(loader) > 0
+        assert loader.get_total_tools() > 0
+
+    def test_minimum_tool_count(self, loader):
+        """Test that we have at least 1000 tools."""
+        assert len(loader) >= 1000, f"Expected >= 1000 tools, got {len(loader)}"
+
+    def test_tool_id_format(self, loader):
+        """Test that all tool_ids match required regex pattern."""
+        pattern = re.compile(r"^[a-z]+(_[a-z]+)*_[0-9]{3}$")
+
+        invalid_ids = []
+        for tool_id in loader.list_tool_ids():
+            if not pattern.match(tool_id):
+                invalid_ids.append(tool_id)
+
+        assert len(invalid_ids) == 0, f"Invalid tool_ids found: {invalid_ids[:10]}"
+
+    def test_tool_id_uniqueness(self, loader):
+        """Test that all tool_ids are unique."""
+        tool_ids = loader.list_tool_ids()
+        assert len(tool_ids) == len(set(tool_ids)), "Duplicate tool_ids found"
+
+    def test_tool_name_uniqueness(self, loader):
+        """Test that all tool names are unique."""
+        names = [tool.name for tool in loader.tools.values()]
+        duplicates = [name for name in names if names.count(name) > 1]
+        assert len(duplicates) == 0, f"Duplicate names found: {set(duplicates)}"
+
+    def test_get_tool(self, loader):
+        """Test getting a tool by ID."""
+        # Get first tool_id
+        tool_ids = loader.list_tool_ids()
+        assert len(tool_ids) > 0
+
+        tool = loader.get_tool(tool_ids[0])
+        assert isinstance(tool, AgentToolRecord)
+        assert tool.tool_id == tool_ids[0]
+
+    def test_get_tool_invalid_id(self, loader):
+        """Test that getting invalid tool_id raises KeyError."""
+        with pytest.raises(KeyError):
+            loader.get_tool("nonexistent_tool_999")
+
+    def test_capabilities_non_empty(self, loader):
+        """Test that all tools have non-empty capabilities."""
+        for tool in loader.tools.values():
+            assert len(tool.capabilities) > 0, f"Tool {tool.tool_id} has empty capabilities"
+
+    def test_category_format(self, loader):
+        """Test that all categories follow path format."""
+        category_pattern = re.compile(r"^[a-z]+(/[a-z_]+)*$")
+
+        invalid_categories = []
+        for tool in loader.tools.values():
+            if not category_pattern.match(tool.category):
+                invalid_categories.append(tool.category)
+
+        assert len(invalid_categories) == 0, f"Invalid categories: {set(invalid_categories)}"
+
+    def test_category_index(self, loader):
+        """Test that category index is built correctly."""
+        categories = loader.get_categories()
+        assert len(categories) > 0
+
+        # Test that each category has tools
+        for category in categories:
+            tools_in_cat = list(loader.iter_category(category))
+            assert len(tools_in_cat) > 0, f"Category {category} has no tools"
+
+    def test_search_by_capability(self, loader):
+        """Test capability-based search."""
+        # Search for common capability
+        results = loader.search_by_capability("forecast", top_k=10)
+        assert len(results) > 0, "No tools found with 'forecast' capability"
+
+        # Verify results have matching capability
+        for tool in results:
+            assert any("forecast" in cap for cap in tool.capabilities), \
+                f"Tool {tool.tool_id} doesn't have 'forecast' in capabilities"
+
+    def test_search_top_k_limit(self, loader):
+        """Test that search respects top_k parameter."""
+        results = loader.search_by_capability("search", top_k=5)
+        assert len(results) <= 5, "Search returned more than top_k results"
+
+    def test_iter_category(self, loader):
+        """Test category iteration."""
+        categories = loader.get_categories()
+        assert len(categories) > 0
+
+        # Test first category
+        category = categories[0]
+        count = 0
+        for tool in loader.iter_category(category):
+            assert tool.category == category
+            count += 1
+
+        assert count > 0, f"No tools found in category {category}"
+
+    def test_iter_category_invalid(self, loader):
+        """Test that iterating invalid category raises ValueError."""
+        with pytest.raises(ValueError):
+            list(loader.iter_category("invalid/category"))
+
+    def test_load_taxonomy(self, loader):
+        """Test loading category taxonomy."""
+        taxonomy = loader.load_taxonomy()
+        assert taxonomy is not None
+        assert len(taxonomy.taxonomy) > 0
+        assert taxonomy.version is not None
+
+    def test_load_stats(self, loader):
+        """Test loading dataset statistics."""
+        stats = loader.load_stats()
+        assert stats is not None
+        assert stats.total_tools > 0
+        assert stats.total_categories > 0
+        assert len(stats.category_distribution) > 0
+
+    def test_stats_accuracy(self, loader):
+        """Test that stats match actual data."""
+        stats = loader.load_stats()
+        assert stats.total_tools == len(loader)
+
+        # Check category distribution
+        for category, count in stats.category_distribution.items():
+            actual_count = len(list(loader.iter_category(category)))
+            assert actual_count == count, \
+                f"Category {category}: stats={count}, actual={actual_count}"
+
+    def test_reliability_scores(self, loader):
+        """Test that reliability scores are valid."""
+        for tool in loader.tools.values():
+            if tool.reliability_score is not None:
+                assert 0.0 <= tool.reliability_score <= 1.0, \
+                    f"Tool {tool.tool_id} has invalid reliability: {tool.reliability_score}"
+
+    def test_latency_values(self, loader):
+        """Test that latency values are non-negative."""
+        for tool in loader.tools.values():
+            if tool.latency_ms_p50 is not None:
+                assert tool.latency_ms_p50 >= 0, \
+                    f"Tool {tool.tool_id} has negative latency: {tool.latency_ms_p50}"
+
+    def test_filter_tools(self, loader):
+        """Test multi-criteria filtering."""
+        # Filter by category
+        results = loader.filter_tools(category="environment/weather")
+        assert all(t.category == "environment/weather" for t in results)
+
+        # Filter by reliability
+        results = loader.filter_tools(min_reliability=0.95)
+        assert all(
+            t.reliability_score is None or t.reliability_score >= 0.95
+            for t in results
+        )
+
+        # Filter by latency
+        results = loader.filter_tools(max_latency=200)
+        assert all(
+            t.latency_ms_p50 is None or t.latency_ms_p50 <= 200
+            for t in results
+        )
+
+    def test_get_tool_by_name(self, loader):
+        """Test getting tool by name."""
+        # Get first tool
+        tool_ids = loader.list_tool_ids()
+        original_tool = loader.get_tool(tool_ids[0])
+
+        # Get by name
+        tool_by_name = loader.get_tool_by_name(original_tool.name)
+        assert tool_by_name.tool_id == original_tool.tool_id
+
+    def test_search_by_name(self, loader):
+        """Test name-based search."""
+        results = loader.search_by_name("weather", top_k=5)
+        assert len(results) > 0
+        assert all("weather" in t.name.lower() for t in results)
+
+    def test_capability_index(self, loader):
+        """Test that capability index is comprehensive."""
+        capabilities = loader.get_capabilities()
+        assert len(capabilities) > 0
+
+        # Each capability should have tools
+        for cap in capabilities[:10]:  # Test first 10
+            tools_with_cap = [
+                t for t in loader.tools.values()
+                if cap in t.capabilities
+            ]
+            assert len(tools_with_cap) > 0, f"No tools with capability: {cap}"
+
+    def test_category_stats(self, loader):
+        """Test category statistics calculation."""
+        categories = loader.get_categories()
+        category = categories[0]
+
+        stats = loader.get_category_stats(category)
+        assert "total_tools" in stats
+        assert "avg_reliability" in stats
+        assert "avg_latency_ms" in stats
+        assert stats["total_tools"] > 0
+
+
+class TestAgentToolRecord:
+    """Test suite for AgentToolRecord schema validation."""
+
+    def test_valid_tool_id(self):
+        """Test valid tool_id formats."""
+        valid_ids = [
+            "weather_query_001",
+            "calendar_event_create_042",
+            "environment_air_quality_015",
+            "a_001",
+            "test_tool_999"
+        ]
+
+        for tool_id in valid_ids:
+            tool = AgentToolRecord(
+                tool_id=tool_id,
+                name="Test Tool",
+                category="test/category",
+                capabilities=["test"]
+            )
+            assert tool.tool_id == tool_id
+
+    def test_invalid_tool_id(self):
+        """Test that invalid tool_ids raise validation error."""
+        invalid_ids = [
+            "WeatherQuery_001",  # uppercase
+            "weather-query_001",  # hyphen
+            "weather_query_1",  # not 3 digits
+            "weather_query_1234",  # too many digits
+            "001_weather_query",  # digits not at end
+        ]
+
+        from pydantic import ValidationError
+
+        for tool_id in invalid_ids:
+            with pytest.raises(ValidationError):
+                AgentToolRecord(
+                    tool_id=tool_id,
+                    name="Test Tool",
+                    category="test/category",
+                    capabilities=["test"]
+                )
+
+    def test_empty_capabilities(self):
+        """Test that empty capabilities raise error."""
+        from pydantic import ValidationError
+
+        with pytest.raises(ValidationError):
+            AgentToolRecord(
+                tool_id="test_tool_001",
+                name="Test Tool",
+                category="test/category",
+                capabilities=[]
+            )
+
+    def test_invalid_category(self):
+        """Test that invalid category format raises error."""
+        from pydantic import ValidationError
+
+        with pytest.raises(ValidationError):
+            AgentToolRecord(
+                tool_id="test_tool_001",
+                name="Test Tool",
+                category="Invalid-Category",  # hyphen not allowed
+                capabilities=["test"]
+            )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
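The suite above pins down the AgentToolRecord contract even though the shipped schemas module is bytecode-only (schemas.pyc). A minimal sketch of a model that would satisfy these tests, assuming pydantic v2; field names beyond those the tests touch are not confirmed by this diff:

import re
from typing import List, Optional
from pydantic import BaseModel, field_validator

TOOL_ID_RE = re.compile(r"^[a-z]+(_[a-z]+)*_[0-9]{3}$")
CATEGORY_RE = re.compile(r"^[a-z]+(/[a-z_]+)*$")

class AgentToolRecord(BaseModel):
    # Hypothetical reconstruction: only these six fields appear in the tests.
    tool_id: str
    name: str
    category: str
    capabilities: List[str]
    reliability_score: Optional[float] = None
    latency_ms_p50: Optional[float] = None

    @field_validator("tool_id")
    @classmethod
    def _check_tool_id(cls, v: str) -> str:
        # Rejecting a bad id here surfaces as pydantic.ValidationError,
        # matching test_invalid_tool_id.
        if not TOOL_ID_RE.match(v):
            raise ValueError(f"invalid tool_id: {v}")
        return v

    @field_validator("category")
    @classmethod
    def _check_category(cls, v: str) -> str:
        if not CATEGORY_RE.match(v):
            raise ValueError(f"invalid category: {v}")
        return v

    @field_validator("capabilities")
    @classmethod
    def _check_capabilities(cls, v: List[str]) -> List[str]:
        if not v:
            raise ValueError("capabilities must be non-empty")
        return v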
Binary file
Binary file
Binary file
sage/data/sources/control_plane_benchmark/__init__.py
@@ -0,0 +1,41 @@
+"""
+Control Plane Benchmark Dataset Module
+
+This module provides data loaders for the Control Plane scheduling benchmark,
+including LLM workloads, hybrid workloads (LLM + Embedding), and test prompts.
+
+Usage:
+    from sage.data.sources.control_plane_benchmark import ControlPlaneBenchmarkDataLoader
+
+    loader = ControlPlaneBenchmarkDataLoader()
+
+    # List available workloads
+    print(loader.list_workloads())
+    print(loader.list_workloads(category="hybrid"))
+
+    # Load a workload configuration
+    workload = loader.load_workload("llm_medium")
+    print(f"Requests: {workload.request_count}, Rate: {workload.rate_per_second}")
+
+    # Load test prompts
+    llm_prompts = loader.load_prompts("llm")
+    embed_texts = loader.load_prompts("embedding")
+"""
+
+from .dataloader import (
+    ControlPlaneBenchmarkDataLoader,
+    EmbeddingText,
+    HybridWorkloadConfig,
+    LLMPrompt,
+    LLMWorkloadConfig,
+    WorkloadConfig,
+)
+
+__all__ = [
+    "ControlPlaneBenchmarkDataLoader",
+    "WorkloadConfig",
+    "LLMWorkloadConfig",
+    "HybridWorkloadConfig",
+    "LLMPrompt",
+    "EmbeddingText",
+]
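The docstring above shows how configs are fetched; what a benchmark harness does with them is outside this diff. A hypothetical open-loop driver, assuming only the request_count and rate_per_second fields the docstring itself prints (the Poisson arrival model is an assumption, not something this package documents):

import random

def build_schedule(workload, seed=0):
    """Return (send_time_s, request_index) pairs for an open-loop run."""
    rng = random.Random(seed)
    t, schedule = 0.0, []
    for i in range(workload.request_count):
        # Exponential inter-arrival gaps approximate Poisson arrivals
        # at the configured average rate.
        t += rng.expovariate(workload.rate_per_second)
        schedule.append((t, i))
    return schedule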
Binary file
Binary file
sage/data/sources/control_plane_benchmark/dataset.yaml
@@ -0,0 +1,101 @@
+name: "control_plane_benchmark"
+description: "Benchmark data for Control Plane scheduling policy evaluation"
+type: "benchmark"
+format: "jsonl"
+version: "0.1.0"
+maintainer: "SAGE Control Plane Team"
+tags: ["control_plane", "scheduling", "benchmark", "llm", "embedding", "hybrid"]
+license: "Apache-2.0"
+size: "~5MB"
+
+categories:
+  - name: "llm_workloads"
+    description: "Pure LLM request workload configurations"
+    files:
+      - name: "light"
+        description: "Light workload: 100 requests, 10 req/s, single model"
+      - name: "medium"
+        description: "Medium workload: 1000 requests, 100 req/s, multi-model"
+      - name: "heavy"
+        description: "Heavy workload: 5000 requests, 500 req/s, multi-priority"
+
+  - name: "hybrid_workloads"
+    description: "Mixed LLM + Embedding workload configurations"
+    files:
+      - name: "balanced"
+        description: "Balanced mix: 50% LLM, 50% Embedding"
+      - name: "llm_heavy"
+        description: "LLM heavy: 80% LLM, 20% Embedding"
+      - name: "embed_heavy"
+        description: "Embedding heavy: 20% LLM, 80% Embedding"
+      - name: "burst"
+        description: "Burst pattern with variable mix ratios"
+
+  - name: "prompts"
+    description: "Test data for benchmark requests"
+    files:
+      - name: "llm_prompts"
+        description: "200+ LLM test prompts with expected token counts"
+      - name: "embed_texts"
+        description: "200+ Embedding test texts with batch configurations"
+
+statistics:
+  llm_workloads: 3
+  hybrid_workloads: 4
+  llm_prompts: 200
+  embed_texts: 200
+
+schema:
+  llm_workload:
+    required:
+      - workload_id
+      - request_count
+      - rate_per_second
+    optional:
+      - arrival_pattern
+      - model_distribution
+      - priority_distribution
+      - prompt_len_range
+      - output_len_range
+      - slo_deadlines
+
+  hybrid_workload:
+    required:
+      - workload_id
+      - request_count
+      - rate_per_second
+      - llm_ratio
+      - embedding_ratio
+    optional:
+      - arrival_pattern
+      - llm_model_distribution
+      - embedding_model
+      - embedding_batch_sizes
+      - priority_distribution
+
+  llm_prompt:
+    required:
+      - prompt_id
+      - text
+    optional:
+      - expected_tokens
+      - priority
+      - category
+      - difficulty
+
+  embed_text:
+    required:
+      - text_id
+      - texts
+    optional:
+      - model
+      - batch_size
+      - category
+
+citation: |
+  @dataset{control_plane_benchmark_2025,
+    title={Control Plane Benchmark: Evaluating LLM Scheduling Policies},
+    author={SAGE Team},
+    year={2025},
+    publisher={SAGE Framework}
+  }
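For orientation, a JSONL record meeting the llm_workload schema above could look like the following; the field names come from the manifest, the values are purely illustrative:

import json

sample = {
    "workload_id": "llm_light",     # required
    "request_count": 100,           # required
    "rate_per_second": 10,          # required
    "arrival_pattern": "poisson",   # optional
    "prompt_len_range": [32, 256],  # optional
}
# The three required keys from the schema must all be present.
assert {"workload_id", "request_count", "rate_per_second"} <= sample.keys()
print(json.dumps(sample))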
Binary file
Binary file
sage/data/sources/gpqa/dataset.yaml
@@ -0,0 +1,10 @@
+name: "gpqa"
+description: "GPQA (Graduate-Level Google-Proof Q&A) - Expert-level questions in Physics, Chemistry, Biology"
+type: "text"
+format: "huggingface-dataset"
+maintainer: "sage-team"
+tags: ["gpqa", "expert-level", "reasoning", "science"]
+size: "~5MB (cached)"
+license: "MIT"
+version: "1.0.0"
+source_url: "https://huggingface.co/datasets/Idavidrein/gpqa"
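Given format: "huggingface-dataset" and the source_url above, the loader presumably materializes GPQA through the datasets library. A sketch — the config name and split are assumptions, and the dataset is gated, so an authenticated Hugging Face token is needed:

from datasets import load_dataset

# "gpqa_main" is one of the published GPQA configs.
ds = load_dataset("Idavidrein/gpqa", "gpqa_main", split="train")
print(len(ds), ds.column_names)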
sage/data/sources/libamm_benchmark/__init__.py
@@ -0,0 +1,10 @@
+"""LibAMM benchmark dataset source.
+
+This provides access to the LibAMM benchmark datasets located in the
+libamm-benchmark directory.
+"""
+
+# Note: The actual data is in ../libamm-benchmark/
+# This wrapper provides a consistent interface through the sources layer
+
+__all__ = []
Binary file
sage/data/sources/libamm_benchmark/dataset.yaml
@@ -0,0 +1,9 @@
+name: "libamm_benchmark"
+description: "LibAMM benchmark datasets for approximate matrix multiplication evaluation"
+type: "matrix"
+format: "binary/text"
+maintainer: "libamm-team"
+tags: ["matrix", "benchmark", "approximate-multiplication", "ann"]
+size: "~325MB"
+license: "Research Use"
+version: "1.0.0"
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
sage/data/sources/locomo/dataset.yaml
@@ -0,0 +1,10 @@
+name: "locomo"
+description: "Long-Context Memory (LoCoMo) dataset for long-context conversation and QA evaluation"
+type: "text"
+format: "json"
+maintainer: "sage-team"
+tags: ["long-context", "memory", "conversation", "qa"]
+size: "~2.68MB"
+license: "MIT"
+version: "1.0.0"
+source_url: "https://github.com/your-repo/locomo"
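Since locomo10.json is bundled inside the wheel (the 66k+ line, ~2.68MB file added in the listing above), it can be read straight from package resources with no download step. A sketch; the top-level JSON structure is not confirmed by this diff:

import json
from importlib import resources

# Resolve the data file relative to the installed package.
data_file = resources.files("sage.data.sources.locomo") / "locomo10.json"
with data_file.open() as f:
    samples = json.load(f)
print(type(samples).__name__, len(samples))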
Binary file