local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
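The hunk below is the largest single addition in this release: the new configuration-comparison module `local_deep_research/benchmarks/comparison/evaluator.py` (768 added lines, matching the +768 entry in the file list above), reproduced in full.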
@@ -0,0 +1,768 @@ local_deep_research/benchmarks/comparison/evaluator.py

```python
"""
Configuration comparison for Local Deep Research.

This module provides functions for comparing different parameter configurations
and evaluating their performance across various metrics.
"""

import json
import logging
import os
import time
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple

import numpy as np
import matplotlib.pyplot as plt

from local_deep_research.config.llm_config import get_llm
from local_deep_research.config.search_config import get_search
from local_deep_research.search_system import AdvancedSearchSystem
from local_deep_research.benchmarks.efficiency.speed_profiler import SpeedProfiler
from local_deep_research.benchmarks.efficiency.resource_monitor import ResourceMonitor
from local_deep_research.benchmarks.optimization.metrics import (
    calculate_quality_metrics,
    calculate_speed_metrics,
    calculate_resource_metrics,
    calculate_combined_score
)

logger = logging.getLogger(__name__)


def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results
    """
    os.makedirs(output_dir, exist_ok=True)

    # Default metric weights if not provided
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0  # Disabled by default
        }

    # Verify valid configurations
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    # Results storage
    results = []

    # Process each configuration
    for i, config in enumerate(configurations):
        logger.info(f"Evaluating configuration {i+1}/{len(configurations)}: {config}")

        # Name for this configuration
        config_name = config.get("name", f"Configuration {i+1}")

        # Results for all repetitions of this configuration
        config_results = []

        # Run multiple repetitions
        for rep in range(repetitions):
            logger.info(f"Starting repetition {rep+1}/{repetitions} for {config_name}")

            try:
                # Run the configuration
                result = _evaluate_single_configuration(
                    query=query,
                    config=config,
                    model_name=model_name,
                    provider=provider,
                    search_tool=search_tool
                )

                config_results.append(result)
                logger.info(f"Completed repetition {rep+1} for {config_name}")

            except Exception as e:
                logger.error(f"Error in {config_name}, repetition {rep+1}: {str(e)}")
                # Add error info but continue with other configurations
                config_results.append({
                    "error": str(e),
                    "success": False
                })

        # Calculate aggregate metrics across repetitions
        if config_results:
            # Filter out failed runs
            successful_runs = [r for r in config_results if r.get("success", False)]

            if successful_runs:
                # Calculate average metrics
                avg_metrics = _calculate_average_metrics(successful_runs)

                # Calculate overall score
                overall_score = calculate_combined_score(
                    quality_metrics=avg_metrics.get("quality_metrics", {}),
                    speed_metrics=avg_metrics.get("speed_metrics", {}),
                    resource_metrics=avg_metrics.get("resource_metrics", {}),
                    weights=metric_weights
                )

                result_summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": True,
                    "runs_completed": len(successful_runs),
                    "runs_failed": len(config_results) - len(successful_runs),
                    "avg_metrics": avg_metrics,
                    "overall_score": overall_score,
                    "individual_results": config_results
                }
            else:
                # All runs failed
                result_summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": False,
                    "runs_completed": 0,
                    "runs_failed": len(config_results),
                    "error": "All runs failed",
                    "individual_results": config_results
                }

            results.append(result_summary)

    # Sort results by overall score (if available)
    sorted_results = sorted(
        [r for r in results if r.get("success", False)],
        key=lambda x: x.get("overall_score", 0),
        reverse=True
    )

    # Add failed configurations at the end
    sorted_results.extend([r for r in results if not r.get("success", False)])

    # Create comparison report
    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len([r for r in results if r.get("success", False)]),
        "failed_configurations": len([r for r in results if not r.get("success", False)]),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now().isoformat(),
        "results": sorted_results
    }

    # Save results to file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    result_file = os.path.join(output_dir, f"comparison_results_{timestamp}.json")

    with open(result_file, "w") as f:
        json.dump(comparison_report, f, indent=2)

    # Generate visualizations
    visualizations_dir = os.path.join(output_dir, "visualizations")
    os.makedirs(visualizations_dir, exist_ok=True)

    _create_comparison_visualizations(
        comparison_report,
        output_dir=visualizations_dir,
        timestamp=timestamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    # Add report path to the result
    comparison_report["report_path"] = result_file

    return comparison_report


def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Evaluate a single configuration.

    Args:
        query: Research query to evaluate
        config: Configuration parameters
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use

    Returns:
        Dictionary with evaluation results
    """
    # Extract configuration parameters
    config_model_name = config.get("model_name", model_name)
    config_provider = config.get("provider", provider)
    config_search_tool = config.get("search_tool", search_tool)
    config_iterations = config.get("iterations", 2)
    config_questions_per_iteration = config.get("questions_per_iteration", 2)
    config_search_strategy = config.get("search_strategy", "iterdrag")
    config_max_results = config.get("max_results", 50)
    config_max_filtered_results = config.get("max_filtered_results", 20)

    # Initialize profiling tools
    speed_profiler = SpeedProfiler()
    resource_monitor = ResourceMonitor(sampling_interval=0.5)

    # Start profiling
    speed_profiler.start()
    resource_monitor.start()

    try:
        # Get LLM
        with speed_profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=config_model_name,
                provider=config_provider
            )

        # Set up search engine if specified
        with speed_profiler.timer("search_initialization"):
            search = None
            if config_search_tool:
                search = get_search(
                    config_search_tool,
                    llm_instance=llm,
                    max_results=config_max_results,
                    max_filtered_results=config_max_filtered_results
                )

        # Create search system
        system = AdvancedSearchSystem(llm=llm, search=search)
        system.max_iterations = config_iterations
        system.questions_per_iteration = config_questions_per_iteration
        system.strategy_name = config_search_strategy

        # Run the analysis
        with speed_profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop profiling
        speed_profiler.stop()
        resource_monitor.stop()

        # Calculate metrics
        quality_metrics = calculate_quality_metrics(
            results=results,
            system_info={"all_links_of_system": getattr(system, "all_links_of_system", [])}
        )

        speed_metrics = calculate_speed_metrics(
            timing_info=speed_profiler.get_summary(),
            system_info={
                "iterations": config_iterations,
                "questions_per_iteration": config_questions_per_iteration,
                "results": results
            }
        )

        resource_metrics = calculate_resource_metrics(
            resource_info=resource_monitor.get_combined_stats(),
            system_info={
                "iterations": config_iterations,
                "questions_per_iteration": config_questions_per_iteration,
                "results": results
            }
        )

        # Return comprehensive results
        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats()
        }

    except Exception as e:
        # Stop profiling on error
        speed_profiler.stop()
        resource_monitor.stop()

        # Log the error
        logger.error(f"Error evaluating configuration: {str(e)}")

        # Return error information
        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats()
        }


def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Calculate average metrics across multiple runs.

    Args:
        results: List of individual run results

    Returns:
        Dictionary with averaged metrics
    """
    # Check if there are any successful results
    if not results:
        return {}

    # Initialize average metrics
    avg_metrics = {
        "quality_metrics": {},
        "speed_metrics": {},
        "resource_metrics": {}
    }

    # Quality metrics
    quality_keys = set()
    for result in results:
        quality_metrics = result.get("quality_metrics", {})
        quality_keys.update(quality_metrics.keys())

    for key in quality_keys:
        values = [r.get("quality_metrics", {}).get(key) for r in results]
        values = [v for v in values if v is not None]
        if values:
            avg_metrics["quality_metrics"][key] = sum(values) / len(values)

    # Speed metrics
    speed_keys = set()
    for result in results:
        speed_metrics = result.get("speed_metrics", {})
        speed_keys.update(speed_metrics.keys())

    for key in speed_keys:
        values = [r.get("speed_metrics", {}).get(key) for r in results]
        values = [v for v in values if v is not None]
        if values:
            avg_metrics["speed_metrics"][key] = sum(values) / len(values)

    # Resource metrics
    resource_keys = set()
    for result in results:
        resource_metrics = result.get("resource_metrics", {})
        resource_keys.update(resource_metrics.keys())

    for key in resource_keys:
        values = [r.get("resource_metrics", {}).get(key) for r in results]
        values = [v for v in values if v is not None]
        if values:
            avg_metrics["resource_metrics"][key] = sum(values) / len(values)

    return avg_metrics


def _create_comparison_visualizations(
    comparison_report: Dict[str, Any],
    output_dir: str,
    timestamp: str
):
    """
    Create visualizations for the comparison results.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    # Check if there are successful results
    successful_results = [
        r for r in comparison_report.get("results", [])
        if r.get("success", False)
    ]

    if not successful_results:
        logger.warning("No successful configurations to visualize")
        return

    # Extract configuration names
    config_names = [r.get("name", f"Config {i+1}") for i, r in enumerate(successful_results)]

    # 1. Overall score comparison
    plt.figure(figsize=(12, 6))
    scores = [r.get("overall_score", 0) for r in successful_results]

    # Create horizontal bar chart
    plt.barh(config_names, scores, color='skyblue')
    plt.xlabel('Overall Score')
    plt.ylabel('Configuration')
    plt.title('Configuration Performance Comparison')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"overall_score_comparison_{timestamp}.png"))
    plt.close()

    # 2. Quality metrics comparison
    quality_metrics = ["overall_quality", "source_count", "lexical_diversity"]
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        quality_metrics,
        "quality_metrics",
        "Quality Metrics Comparison",
        os.path.join(output_dir, f"quality_metrics_comparison_{timestamp}.png")
    )

    # 3. Speed metrics comparison
    speed_metrics = ["overall_speed", "total_duration", "duration_per_question"]
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        speed_metrics,
        "speed_metrics",
        "Speed Metrics Comparison",
        os.path.join(output_dir, f"speed_metrics_comparison_{timestamp}.png")
    )

    # 4. Resource metrics comparison
    resource_metrics = ["overall_resource", "process_memory_max_mb", "system_cpu_avg"]
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        resource_metrics,
        "resource_metrics",
        "Resource Usage Comparison",
        os.path.join(output_dir, f"resource_metrics_comparison_{timestamp}.png")
    )

    # 5. Spider chart for multi-dimensional comparison
    _create_spider_chart(
        successful_results,
        config_names,
        os.path.join(output_dir, f"spider_chart_comparison_{timestamp}.png")
    )

    # 6. Pareto frontier chart for quality vs. speed
    _create_pareto_chart(
        successful_results,
        os.path.join(output_dir, f"pareto_chart_comparison_{timestamp}.png")
    )


def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str
):
    """
    Create a chart comparing specific metrics across configurations.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # Create figure with multiple subplots (one per metric)
    fig, axes = plt.subplots(len(metric_keys), 1, figsize=(12, 5 * len(metric_keys)))

    # Handle case with only one metric
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]

        # Get metric values
        metric_values = []
        for result in results:
            metrics = result.get("avg_metrics", {}).get(metric_category, {})
            value = metrics.get(metric_key)
            metric_values.append(value if value is not None else 0)

        # For duration metrics, pick one time unit for the whole axis (based
        # on the largest value) so every bar is converted to the same scale.
        axis_label = metric_key
        if "duration" in metric_key and metric_values:
            max_value = max(metric_values)
            if max_value > 3600:
                metric_values = [v / 3600 for v in metric_values]  # Convert to hours
                axis_label += " (hours)"
            elif max_value > 60:
                metric_values = [v / 60 for v in metric_values]  # Convert to minutes
                axis_label += " (minutes)"
            else:
                axis_label += " (seconds)"

        # Create horizontal bar chart
        bars = ax.barh(config_names, metric_values, color='lightblue')
        ax.set_xlabel(axis_label.replace('_', ' ').title())
        ax.set_title(axis_label.replace('_', ' ').title())
        ax.grid(axis='x', linestyle='--', alpha=0.7)

        # Add value labels to bars
        for bar in bars:
            width = bar.get_width()
            label_x_pos = width * 1.01
            ax.text(label_x_pos, bar.get_y() + bar.get_height()/2, f'{width:.2f}',
                    va='center')

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()


def _create_spider_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    output_path: str
):
    """
    Create a spider chart comparing metrics across configurations.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart machinery
    try:
        # Circle and RegularPolygon are required by _gen_axes_patch below
        from matplotlib.patches import Circle, RegularPolygon
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame='circle'):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles
            theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                name = 'radar'

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    self.set_theta_zero_location('N')

                def fill(self, *args, closed=True, **kwargs):
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    if frame == 'circle':
                        return Circle((0.5, 0.5), 0.5)
                    elif frame == 'polygon':
                        return RegularPolygon((0.5, 0.5), num_vars, radius=0.5, edgecolor="k")
                    else:
                        raise ValueError("Unknown value for 'frame': %s" % frame)

                def _gen_axes_spines(self):
                    if frame == 'circle':
                        return super()._gen_axes_spines()
                    elif frame == 'polygon':
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        vertices = [(0.5, 0.5)] + verts
                        codes = [Path.MOVETO] + [Path.LINETO] * num_vars + [Path.CLOSEPOLY]
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)
                        spine.set_transform(self.transAxes)
                        return {'polar': spine}
                    else:
                        raise ValueError("Unknown value for 'frame': %s" % frame)

            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append((0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle))))
                return verts

            register_projection(RadarAxes)
            return theta

        # Select metrics for the spider chart
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {"name": "Sources", "key": "quality_metrics.normalized_source_count"},
            {"name": "Content", "key": "quality_metrics.normalized_knowledge_length"},
            {"name": "Memory", "key": "resource_metrics.normalized_memory_usage", "invert": True},
        ]

        # Extract metric values
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='radar'))

        # Color map for different configurations
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality")
                key_parts = metric["key"].split(".")
                value = result.get("avg_metrics", {})
                for part in key_parts:
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better)
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration
            ax.plot(theta, values, color=colors[i], linewidth=2, label=config_names[i])
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)
        plt.legend(loc='best', bbox_to_anchor=(0.5, 0.1))
        plt.title('Multi-Dimensional Configuration Comparison', size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.error(f"Error creating spider chart: {str(e)}")
        # Create a text-based chart as fallback
        plt.figure(figsize=(10, 6))
        plt.text(0.5, 0.5, f"Spider chart could not be created: {str(e)}",
                 horizontalalignment='center', verticalalignment='center')
        plt.axis('off')
        plt.savefig(output_path)
        plt.close()


def _create_pareto_chart(
    results: List[Dict[str, Any]],
    output_path: str
):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    # Extract quality and speed metrics
    quality_scores = []
    speed_scores = []
    names = []

    for result in results:
        metrics = result.get("avg_metrics", {})
        quality = metrics.get("quality_metrics", {}).get("overall_quality", 0)

        # For speed, we use inverse of duration (so higher is better)
        duration = metrics.get("speed_metrics", {}).get("total_duration", 1)
        speed = 1.0 / max(duration, 0.001)  # Avoid division by zero

        quality_scores.append(quality)
        speed_scores.append(speed)
        names.append(result.get("name", "Configuration"))

    # Create scatter plot
    plt.figure(figsize=(10, 8))
    plt.scatter(quality_scores, speed_scores, s=100, alpha=0.7)

    # Add labels for each point
    for i, name in enumerate(names):
        plt.annotate(name,
                     (quality_scores[i], speed_scores[i]),
                     xytext=(5, 5),
                     textcoords='offset points')

    # Identify Pareto frontier
    pareto_points = []
    for i, (q, s) in enumerate(zip(quality_scores, speed_scores)):
        is_pareto = True
        for q2, s2 in zip(quality_scores, speed_scores):
            if q2 > q and s2 > s:  # Dominated
                is_pareto = False
                break
        if is_pareto:
            pareto_points.append(i)

    # Highlight Pareto frontier
    pareto_quality = [quality_scores[i] for i in pareto_points]
    pareto_speed = [speed_scores[i] for i in pareto_points]

    # Sort pareto points for line drawing
    pareto_sorted = sorted(zip(pareto_quality, pareto_speed, pareto_points))
    pareto_quality = [p[0] for p in pareto_sorted]
    pareto_speed = [p[1] for p in pareto_sorted]
    pareto_indices = [p[2] for p in pareto_sorted]

    # Draw Pareto frontier line
    plt.plot(pareto_quality, pareto_speed, 'r--', linewidth=2)

    # Highlight Pareto optimal points
    plt.scatter([quality_scores[i] for i in pareto_indices],
                [speed_scores[i] for i in pareto_indices],
                s=150, facecolors='none', edgecolors='r', linewidth=2)

    # Add labels for Pareto optimal configurations
    for i in pareto_indices:
        plt.annotate(names[i],
                     (quality_scores[i], speed_scores[i]),
                     xytext=(8, 8),
                     textcoords='offset points',
                     bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.7))

    # Set chart properties
    plt.xlabel('Quality Score (higher is better)')
    plt.ylabel('Speed Score (higher is better)')
    plt.title('Quality vs. Speed Tradeoff (Pareto Frontier)', size=14)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Add explanation
    plt.figtext(0.5, 0.01,
                "Points on the red line are Pareto optimal configurations\n"
                "(no other configuration is better in both quality and speed)",
                ha='center', fontsize=10, bbox=dict(boxstyle='round', fc='white', alpha=0.7))

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
```