local-deep-research 0.5.7__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the changes between publicly available package versions as published to a supported public registry, and is provided for informational purposes only.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +33 -8
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
- local_deep_research/api/__init__.py +2 -0
- local_deep_research/api/research_functions.py +177 -3
- local_deep_research/benchmarks/graders.py +150 -5
- local_deep_research/benchmarks/models/__init__.py +19 -0
- local_deep_research/benchmarks/models/benchmark_models.py +283 -0
- local_deep_research/benchmarks/ui/__init__.py +1 -0
- local_deep_research/benchmarks/web_api/__init__.py +6 -0
- local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
- local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
- local_deep_research/config/llm_config.py +106 -21
- local_deep_research/defaults/default_settings.json +448 -3
- local_deep_research/error_handling/report_generator.py +10 -0
- local_deep_research/llm/__init__.py +19 -0
- local_deep_research/llm/llm_registry.py +155 -0
- local_deep_research/metrics/db_models.py +3 -7
- local_deep_research/metrics/search_tracker.py +25 -11
- local_deep_research/report_generator.py +3 -2
- local_deep_research/search_system.py +12 -9
- local_deep_research/utilities/log_utils.py +23 -10
- local_deep_research/utilities/thread_context.py +99 -0
- local_deep_research/web/app_factory.py +32 -8
- local_deep_research/web/database/benchmark_schema.py +230 -0
- local_deep_research/web/database/convert_research_id_to_string.py +161 -0
- local_deep_research/web/database/models.py +55 -1
- local_deep_research/web/database/schema_upgrade.py +397 -2
- local_deep_research/web/database/uuid_migration.py +265 -0
- local_deep_research/web/routes/api_routes.py +62 -31
- local_deep_research/web/routes/history_routes.py +13 -6
- local_deep_research/web/routes/metrics_routes.py +264 -4
- local_deep_research/web/routes/research_routes.py +45 -18
- local_deep_research/web/routes/route_registry.py +352 -0
- local_deep_research/web/routes/settings_routes.py +382 -22
- local_deep_research/web/services/research_service.py +22 -29
- local_deep_research/web/services/settings_manager.py +53 -0
- local_deep_research/web/services/settings_service.py +2 -0
- local_deep_research/web/static/css/styles.css +8 -0
- local_deep_research/web/static/js/components/detail.js +7 -14
- local_deep_research/web/static/js/components/details.js +8 -10
- local_deep_research/web/static/js/components/fallback/ui.js +4 -4
- local_deep_research/web/static/js/components/history.js +6 -6
- local_deep_research/web/static/js/components/logpanel.js +14 -11
- local_deep_research/web/static/js/components/progress.js +51 -46
- local_deep_research/web/static/js/components/research.js +250 -89
- local_deep_research/web/static/js/components/results.js +5 -7
- local_deep_research/web/static/js/components/settings.js +32 -26
- local_deep_research/web/static/js/components/settings_sync.js +24 -23
- local_deep_research/web/static/js/config/urls.js +285 -0
- local_deep_research/web/static/js/main.js +8 -8
- local_deep_research/web/static/js/research_form.js +267 -12
- local_deep_research/web/static/js/services/api.js +18 -18
- local_deep_research/web/static/js/services/keyboard.js +8 -8
- local_deep_research/web/static/js/services/socket.js +53 -35
- local_deep_research/web/static/js/services/ui.js +1 -1
- local_deep_research/web/templates/base.html +4 -1
- local_deep_research/web/templates/components/custom_dropdown.html +5 -3
- local_deep_research/web/templates/components/mobile_nav.html +3 -3
- local_deep_research/web/templates/components/sidebar.html +9 -3
- local_deep_research/web/templates/pages/benchmark.html +2697 -0
- local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
- local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +212 -39
- local_deep_research/web/templates/pages/research.html +8 -6
- local_deep_research/web/templates/pages/star_reviews.html +1 -1
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
- local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
- local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
- local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
- local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
- local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
- local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
- local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
- local_deep_research/web_search_engines/retriever_registry.py +108 -0
- local_deep_research/web_search_engines/search_engine_base.py +161 -43
- local_deep_research/web_search_engines/search_engine_factory.py +14 -0
- local_deep_research/web_search_engines/search_engines_config.py +20 -0
- local_deep_research-0.6.0.dist-info/METADATA +374 -0
- {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +90 -65
- local_deep_research-0.5.7.dist-info/METADATA +0 -420
- {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/web_api/benchmark_routes.py (new file)
@@ -0,0 +1,862 @@

```python
"""Flask routes for benchmark web interface."""

import time
from flask import Blueprint, request, jsonify
from loguru import logger

from .benchmark_service import benchmark_service
from ...web.utils.templates import render_template_with_defaults
from ...web.services.settings_manager import SettingsManager
from ...utilities.db_utils import get_db_session, get_db_setting

# Create blueprint for benchmark routes
benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")


@benchmark_bp.route("/")
def index():
    """Benchmark dashboard page."""
    # Load evaluation settings from database
    eval_settings = {
        "evaluation_provider": get_db_setting(
            "benchmark.evaluation.provider", "openai_endpoint"
        ),
        "evaluation_model": get_db_setting("benchmark.evaluation.model", ""),
        "evaluation_endpoint_url": get_db_setting(
            "benchmark.evaluation.endpoint_url", ""
        ),
        "evaluation_temperature": get_db_setting(
            "benchmark.evaluation.temperature", 0
        ),
    }

    return render_template_with_defaults(
        "pages/benchmark.html", eval_settings=eval_settings
    )


@benchmark_bp.route("/results")
def results():
    """Benchmark results history page."""
    return render_template_with_defaults("pages/benchmark_results.html")


@benchmark_bp.route("/api/start", methods=["POST"])
def start_benchmark():
    """Start a new benchmark run."""
    try:
        data = request.get_json()

        if not data:
            return jsonify({"error": "No data provided"}), 400

        # Extract configuration
        run_name = data.get("run_name")

        # Get search config from database instead of request
        from ...web.services.settings_manager import SettingsManager
        from ...utilities.db_utils import get_db_session

        session = get_db_session()
        settings_manager = SettingsManager(db_session=session)

        # Build search config from database settings
        search_config = {
            "iterations": int(
                settings_manager.get_setting("search.iterations", 8)
            ),
            "questions_per_iteration": int(
                settings_manager.get_setting(
                    "search.questions_per_iteration", 5
                )
            ),
            "search_tool": settings_manager.get_setting(
                "search.tool", "searxng"
            ),
            "search_strategy": settings_manager.get_setting(
                "search.search_strategy", "focused_iteration"
            ),
            "model_name": settings_manager.get_setting("llm.model"),
            "provider": settings_manager.get_setting("llm.provider"),
            "temperature": float(
                settings_manager.get_setting("llm.temperature", 0.7)
            ),
        }

        # Add provider-specific settings
        provider = search_config.get("provider")
        if provider == "openai_endpoint":
            search_config["openai_endpoint_url"] = settings_manager.get_setting(
                "llm.openai_endpoint.url"
            )
            search_config["openai_endpoint_api_key"] = (
                settings_manager.get_setting("llm.openai_endpoint.api_key")
            )
        elif provider == "openai":
            search_config["openai_api_key"] = settings_manager.get_setting(
                "llm.openai.api_key"
            )
        elif provider == "anthropic":
            search_config["anthropic_api_key"] = settings_manager.get_setting(
                "llm.anthropic.api_key"
            )

        # Get evaluation config from database settings or request
        if "evaluation_config" in data:
            evaluation_config = data["evaluation_config"]
        else:
            # Read evaluation config from database settings
            evaluation_provider = settings_manager.get_setting(
                "benchmark.evaluation.provider", "openai_endpoint"
            )
            evaluation_model = settings_manager.get_setting(
                "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
            )
            evaluation_temperature = float(
                settings_manager.get_setting(
                    "benchmark.evaluation.temperature", 0
                )
            )

            evaluation_config = {
                "provider": evaluation_provider,
                "model_name": evaluation_model,
                "temperature": evaluation_temperature,
            }

            # Add provider-specific settings for evaluation
            if evaluation_provider == "openai_endpoint":
                evaluation_config["openai_endpoint_url"] = (
                    settings_manager.get_setting(
                        "benchmark.evaluation.endpoint_url",
                        "https://openrouter.ai/api/v1",
                    )
                )
                evaluation_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif evaluation_provider == "openai":
                evaluation_config["openai_api_key"] = (
                    settings_manager.get_setting("llm.openai.api_key")
                )
            elif evaluation_provider == "anthropic":
                evaluation_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )
        datasets_config = data.get("datasets_config", {})

        # Close database session
        session.close()

        # Validate datasets config
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Create benchmark run
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=run_name,
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
        )

        # Start benchmark
        success = benchmark_service.start_benchmark(benchmark_run_id)

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started successfully",
                }
            )
        else:
            return jsonify(
                {"success": False, "error": "Failed to start benchmark"}
            ), 500

    except Exception:
        logger.exception("Error starting benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/running", methods=["GET"])
def get_running_benchmark():
    """Check if there's a running benchmark and return its ID."""
    try:
        from ...utilities.db_utils import get_db_session
        from ..models.benchmark_models import BenchmarkRun, BenchmarkStatus

        session = get_db_session()

        # Find any benchmark that's currently running
        running_benchmark = (
            session.query(BenchmarkRun)
            .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS)
            .order_by(BenchmarkRun.created_at.desc())
            .first()
        )

        session.close()

        if running_benchmark:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": running_benchmark.id,
                    "run_name": running_benchmark.run_name,
                    "total_examples": running_benchmark.total_examples,
                    "completed_examples": running_benchmark.completed_examples,
                }
            )
        else:
            return jsonify(
                {"success": False, "message": "No running benchmark found"}
            )

    except Exception:
        logger.exception("Error checking for running benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
def get_benchmark_status(benchmark_run_id: int):
    """Get status of a benchmark run."""
    try:
        status = benchmark_service.get_benchmark_status(benchmark_run_id)

        if status:
            return jsonify({"success": True, "status": status})
        else:
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

    except Exception:
        logger.exception("Error getting benchmark status")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
def cancel_benchmark(benchmark_run_id: int):
    """Cancel a running benchmark."""
    try:
        success = benchmark_service.cancel_benchmark(benchmark_run_id)

        if success:
            return jsonify(
                {"success": True, "message": "Benchmark cancelled successfully"}
            )
        else:
            return jsonify(
                {"success": False, "error": "Failed to cancel benchmark"}
            ), 500

    except Exception:
        logger.exception("Error cancelling benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/history", methods=["GET"])
def get_benchmark_history():
    """Get list of recent benchmark runs."""
    try:
        from ...utilities.db_utils import get_db_session
        from ..models.benchmark_models import BenchmarkRun

        session = get_db_session()

        # Get all benchmark runs (completed, failed, cancelled, or in-progress)
        runs = (
            session.query(BenchmarkRun)
            .order_by(BenchmarkRun.created_at.desc())
            .limit(50)
            .all()
        )

        # Format runs for display
        formatted_runs = []
        for run in runs:
            # Calculate average processing time from results
            avg_processing_time = None
            avg_search_results = None
            try:
                from ..models.benchmark_models import BenchmarkResult
                from sqlalchemy import func

                avg_result = (
                    session.query(func.avg(BenchmarkResult.processing_time))
                    .filter(
                        BenchmarkResult.benchmark_run_id == run.id,
                        BenchmarkResult.processing_time.isnot(None),
                        BenchmarkResult.processing_time > 0,
                    )
                    .scalar()
                )

                if avg_result:
                    avg_processing_time = float(avg_result)
            except Exception as e:
                logger.warning(
                    f"Error calculating avg processing time for run {run.id}: {e}"
                )

            # Calculate average search results and total search requests from metrics
            total_search_requests = None
            try:
                from ...metrics.search_tracker import get_search_tracker
                from ...metrics.db_models import SearchCall

                # Get all results for this run to find research_ids
                results = (
                    session.query(BenchmarkResult)
                    .filter(BenchmarkResult.benchmark_run_id == run.id)
                    .all()
                )

                research_ids = [r.research_id for r in results if r.research_id]

                if research_ids:
                    tracker = get_search_tracker()
                    with tracker.db.get_session() as metric_session:
                        # Get all search calls for these research_ids
                        search_calls = (
                            metric_session.query(SearchCall)
                            .filter(SearchCall.research_id.in_(research_ids))
                            .all()
                        )

                        # Group by research_id and calculate metrics per research session
                        research_results = {}
                        research_requests = {}

                        for call in search_calls:
                            if call.research_id:
                                if call.research_id not in research_results:
                                    research_results[call.research_id] = 0
                                    research_requests[call.research_id] = 0
                                research_results[call.research_id] += (
                                    call.results_count or 0
                                )
                                research_requests[call.research_id] += 1

                        # Calculate averages across research sessions
                        if research_results:
                            total_results = sum(research_results.values())
                            avg_search_results = total_results / len(
                                research_results
                            )

                            total_requests = sum(research_requests.values())
                            total_search_requests = total_requests / len(
                                research_requests
                            )

            except Exception as e:
                logger.warning(
                    f"Error calculating search metrics for run {run.id}: {e}"
                )

            formatted_runs.append(
                {
                    "id": run.id,
                    "run_name": run.run_name or f"Benchmark #{run.id}",
                    "created_at": run.created_at.isoformat(),
                    "total_examples": run.total_examples,
                    "completed_examples": run.completed_examples,
                    "overall_accuracy": run.overall_accuracy,
                    "status": run.status.value,
                    "search_config": run.search_config,
                    "evaluation_config": run.evaluation_config,
                    "datasets_config": run.datasets_config,
                    "avg_processing_time": avg_processing_time,
                    "avg_search_results": avg_search_results,
                    "total_search_requests": total_search_requests,
                }
            )

        session.close()

        return jsonify({"success": True, "runs": formatted_runs})

    except Exception:
        logger.exception("Error getting benchmark history")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
def get_benchmark_results(benchmark_run_id: int):
    """Get detailed results for a benchmark run."""
    try:
        from ...utilities.db_utils import get_db_session
        from ..models.benchmark_models import BenchmarkResult

        logger.info(f"Getting results for benchmark {benchmark_run_id}")
        session = get_db_session()

        # Get recent results (limit to last 10)
        limit = int(request.args.get("limit", 10))

        results = (
            session.query(BenchmarkResult)
            .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
            # Temporarily show all results including pending evaluations
            # .filter(
            #     BenchmarkResult.is_correct.isnot(None)
            # )  # Only completed evaluations
            .order_by(BenchmarkResult.id.desc())  # Most recent first
            .limit(limit)
            .all()
        )

        logger.info(f"Found {len(results)} results")

        # Build a map of research_id to total search results
        search_results_by_research_id = {}
        try:
            from ...metrics.search_tracker import get_search_tracker
            from ...metrics.db_models import SearchCall

            tracker = get_search_tracker()

            # Get all unique research_ids from our results
            research_ids = [r.research_id for r in results if r.research_id]

            if research_ids:
                with tracker.db.get_session() as metric_session:
                    # Get all search calls for these research_ids
                    all_search_calls = (
                        metric_session.query(SearchCall)
                        .filter(SearchCall.research_id.in_(research_ids))
                        .all()
                    )

                    # Group search results by research_id
                    for call in all_search_calls:
                        if call.research_id:
                            if (
                                call.research_id
                                not in search_results_by_research_id
                            ):
                                search_results_by_research_id[
                                    call.research_id
                                ] = 0
                            search_results_by_research_id[call.research_id] += (
                                call.results_count or 0
                            )

            logger.info(
                f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
            )
            logger.debug(
                f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
            )
            logger.debug(
                f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
            )
        except Exception:
            logger.exception(
                f"Error getting search metrics for benchmark {benchmark_run_id}"
            )

        # Format results for UI display
        formatted_results = []
        for result in results:
            # Get search result count using research_id
            search_result_count = 0

            try:
                if (
                    result.research_id
                    and result.research_id in search_results_by_research_id
                ):
                    search_result_count = search_results_by_research_id[
                        result.research_id
                    ]
                    logger.debug(
                        f"Found {search_result_count} search results for research_id {result.research_id}"
                    )

            except Exception:
                logger.exception(
                    f"Error getting search results for result {result.example_id}"
                )

            # Fallback to sources if available and we didn't find metrics
            if search_result_count == 0 and result.sources:
                try:
                    if isinstance(result.sources, list):
                        search_result_count = len(result.sources)
                    elif (
                        isinstance(result.sources, dict)
                        and "all_links_of_system" in result.sources
                    ):
                        search_result_count = len(
                            result.sources["all_links_of_system"]
                        )
                except:
                    pass

            formatted_results.append(
                {
                    "example_id": result.example_id,
                    "dataset_type": result.dataset_type.value,
                    "question": result.question,
                    "correct_answer": result.correct_answer,
                    "model_answer": result.extracted_answer,
                    "full_response": result.response,
                    "is_correct": result.is_correct,
                    "confidence": result.confidence,
                    "grader_response": result.grader_response,
                    "processing_time": result.processing_time,
                    "search_result_count": search_result_count,
                    "sources": result.sources,
                    "completed_at": result.completed_at.isoformat()
                    if result.completed_at
                    else None,
                }
            )

        session.close()

        return jsonify({"success": True, "results": formatted_results})

    except Exception:
        logger.exception("Error getting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/configs", methods=["GET"])
def get_saved_configs():
    """Get list of saved benchmark configurations."""
    try:
        # TODO: Implement saved configs retrieval from database
        # For now return default configs
        default_configs = [
            {
                "id": 1,
                "name": "Quick Test",
                "description": "Fast benchmark with minimal examples",
                "search_config": {
                    "iterations": 3,
                    "questions_per_iteration": 3,
                    "search_tool": "searxng",
                    "search_strategy": "focused_iteration",
                },
                "datasets_config": {
                    "simpleqa": {"count": 10},
                    "browsecomp": {"count": 5},
                },
            },
            {
                "id": 2,
                "name": "Standard Evaluation",
                "description": "Comprehensive benchmark with standard settings",
                "search_config": {
                    "iterations": 8,
                    "questions_per_iteration": 5,
                    "search_tool": "searxng",
                    "search_strategy": "focused_iteration",
                },
                "datasets_config": {
                    "simpleqa": {"count": 50},
                    "browsecomp": {"count": 25},
                },
            },
        ]

        return jsonify({"success": True, "configs": default_configs})

    except Exception:
        logger.exception("Error getting saved configs")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/start-simple", methods=["POST"])
def start_benchmark_simple():
    """Start a benchmark using current database settings."""
    try:
        data = request.get_json()
        datasets_config = data.get("datasets_config", {})

        # Validate datasets
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Get current settings from database
        session = get_db_session()
        settings_manager = SettingsManager(db_session=session)

        # Build search config from database settings
        search_config = {
            "iterations": int(
                settings_manager.get_setting("search.iterations", 8)
            ),
            "questions_per_iteration": int(
                settings_manager.get_setting(
                    "search.questions_per_iteration", 5
                )
            ),
            "search_tool": settings_manager.get_setting(
                "search.tool", "searxng"
            ),
            "search_strategy": settings_manager.get_setting(
                "search.search_strategy", "focused_iteration"
            ),
            "model_name": settings_manager.get_setting("llm.model"),
            "provider": settings_manager.get_setting("llm.provider"),
            "temperature": float(
                settings_manager.get_setting("llm.temperature", 0.7)
            ),
        }

        # Add provider-specific settings
        provider = search_config.get("provider")
        if provider == "openai_endpoint":
            search_config["openai_endpoint_url"] = settings_manager.get_setting(
                "llm.openai_endpoint.url"
            )
            search_config["openai_endpoint_api_key"] = (
                settings_manager.get_setting("llm.openai_endpoint.api_key")
            )
        elif provider == "openai":
            search_config["openai_api_key"] = settings_manager.get_setting(
                "llm.openai.api_key"
            )
        elif provider == "anthropic":
            search_config["anthropic_api_key"] = settings_manager.get_setting(
                "llm.anthropic.api_key"
            )

        # Read evaluation config from database settings
        evaluation_provider = settings_manager.get_setting(
            "benchmark.evaluation.provider", "openai_endpoint"
        )
        evaluation_model = settings_manager.get_setting(
            "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
        )
        evaluation_temperature = float(
            settings_manager.get_setting("benchmark.evaluation.temperature", 0)
        )

        evaluation_config = {
            "provider": evaluation_provider,
            "model_name": evaluation_model,
            "temperature": evaluation_temperature,
        }

        # Add provider-specific settings for evaluation
        if evaluation_provider == "openai_endpoint":
            evaluation_config["openai_endpoint_url"] = (
                settings_manager.get_setting(
                    "benchmark.evaluation.endpoint_url",
                    "https://openrouter.ai/api/v1",
                )
            )
            evaluation_config["openai_endpoint_api_key"] = (
                settings_manager.get_setting("llm.openai_endpoint.api_key")
            )
        elif evaluation_provider == "openai":
            evaluation_config["openai_api_key"] = settings_manager.get_setting(
                "llm.openai.api_key"
            )
        elif evaluation_provider == "anthropic":
            evaluation_config["anthropic_api_key"] = (
                settings_manager.get_setting("llm.anthropic.api_key")
            )

        session.close()

        # Create and start benchmark
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=f"Quick Benchmark - {data.get('run_name', '')}",
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
        )

        success = benchmark_service.start_benchmark(benchmark_run_id)

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started with current settings",
                }
            )
        else:
            return jsonify(
                {"success": False, "error": "Failed to start benchmark"}
            ), 500

    except Exception:
        logger.exception("Error starting simple benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/validate-config", methods=["POST"])
def validate_config():
    """Validate a benchmark configuration."""
    try:
        data = request.get_json()

        if not data:
            return jsonify({"valid": False, "errors": ["No data provided"]})

        errors = []

        # Validate search config
        search_config = data.get("search_config", {})
        if not search_config.get("search_tool"):
            errors.append("Search tool is required")
        if not search_config.get("search_strategy"):
            errors.append("Search strategy is required")

        # Validate datasets config
        datasets_config = data.get("datasets_config", {})
        if not datasets_config:
            errors.append("At least one dataset must be configured")

        total_examples = sum(
            config.get("count", 0) for config in datasets_config.values()
        )
        if total_examples == 0:
            errors.append("Total examples must be greater than 0")

        if total_examples > 1000:
            errors.append(
                "Total examples should not exceed 1000 for web interface"
            )

        return jsonify(
            {
                "valid": len(errors) == 0,
                "errors": errors,
                "total_examples": total_examples,
            }
        )

    except Exception:
        logger.exception("Error validating config")
        return jsonify(
            {"valid": False, "errors": ["An internal error has occurred."]}
        ), 500


@benchmark_bp.route("/api/search-quality", methods=["GET"])
def get_search_quality():
    """Get current search quality metrics from rate limiting tracker."""
    try:
        from ...web_search_engines.rate_limiting import get_tracker

        tracker = get_tracker()
        quality_stats = tracker.get_search_quality_stats()

        return jsonify(
            {
                "success": True,
                "search_quality": quality_stats,
                "timestamp": time.time(),
            }
        )

    except Exception:
        logger.exception("Error getting search quality")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500


@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
def delete_benchmark_run(benchmark_run_id: int):
    """Delete a benchmark run and all its results."""
    try:
        from ...utilities.db_utils import get_db_session
        from ..models.benchmark_models import (
            BenchmarkRun,
            BenchmarkResult,
            BenchmarkProgress,
        )

        session = get_db_session()

        # Check if benchmark run exists
        benchmark_run = (
            session.query(BenchmarkRun)
            .filter(BenchmarkRun.id == benchmark_run_id)
            .first()
        )

        if not benchmark_run:
            session.close()
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

        # Prevent deletion of running benchmarks
        if benchmark_run.status.value == "in_progress":
            session.close()
            return jsonify(
                {
                    "success": False,
                    "error": "Cannot delete a running benchmark. Cancel it first.",
                }
            ), 400

        # Delete related records (cascade should handle this, but being explicit)
        session.query(BenchmarkResult).filter(
            BenchmarkResult.benchmark_run_id == benchmark_run_id
        ).delete()

        session.query(BenchmarkProgress).filter(
            BenchmarkProgress.benchmark_run_id == benchmark_run_id
        ).delete()

        # Delete the benchmark run
        session.delete(benchmark_run)
        session.commit()
        session.close()

        logger.info(f"Deleted benchmark run {benchmark_run_id}")
        return jsonify(
            {
                "success": True,
                "message": f"Benchmark run {benchmark_run_id} deleted successfully",
            }
        )

    except Exception:
        logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
```
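
Taken together, the routes above expose a small JSON API for driving benchmark runs (`/benchmark/api/start`, `/api/start-simple`, `/api/running`, `/api/status/<id>`, `/api/results/<id>`, `/api/cancel/<id>`, `/api/delete/<id>`). The sketch below is a minimal client for that API and is not part of the package: it assumes the web app is reachable at `http://localhost:5000` and that the `requests` library is installed; the run name, dataset counts, and polling interval are illustrative. Endpoint paths and response fields are taken from the routes shown above.

```python
"""Minimal client sketch for the 0.6.0 benchmark API (assumes a local server)."""

import time

import requests

BASE_URL = "http://localhost:5000"  # assumption: default local deployment

# Start a run that reuses the current database settings; only dataset sizes are supplied.
resp = requests.post(
    f"{BASE_URL}/benchmark/api/start-simple",
    json={
        "run_name": "smoke-test",  # illustrative name
        "datasets_config": {
            "simpleqa": {"count": 5},
            "browsecomp": {"count": 2},
        },
    },
)
resp.raise_for_status()
run_id = resp.json()["benchmark_run_id"]
print(f"Started benchmark run {run_id}")

# Poll /api/running until no benchmark is reported as in progress.
while True:
    running = requests.get(f"{BASE_URL}/benchmark/api/running").json()
    if not running.get("success"):
        break  # no running benchmark found -> finished, failed, or cancelled
    print(
        f"progress: {running['completed_examples']}/{running['total_examples']} examples"
    )
    time.sleep(10)

# Fetch the most recent per-example results (the route defaults to the last 10).
results = requests.get(
    f"{BASE_URL}/benchmark/api/results/{run_id}", params={"limit": 10}
).json()
for item in results.get("results", []):
    print(item["example_id"], item["is_correct"])
```

Polling `/api/running` rather than `/api/status/<id>` keeps the client to fields visible in this diff; the exact shape of the `status` payload comes from `benchmark_service.get_benchmark_status()` in `benchmark_service.py`, which is not shown here.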