local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +20 -3
- local_deep_research/web/database/models.py +74 -25
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +63 -83
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +192 -54
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +412 -251
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
- local_deep_research-0.5.2.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -6,8 +6,7 @@ of benchmark and optimization results.
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
import logging
|
9
|
-
import
|
10
|
-
from typing import Dict, List, Optional, Tuple, Union
|
9
|
+
from typing import Dict, List, Optional
|
11
10
|
|
12
11
|
import numpy as np
|
13
12
|
|
@@ -21,7 +20,9 @@ try:
|
|
21
20
|
MATPLOTLIB_AVAILABLE = True
|
22
21
|
except ImportError:
|
23
22
|
MATPLOTLIB_AVAILABLE = False
|
24
|
-
logger.warning(
|
23
|
+
logger.warning(
|
24
|
+
"Matplotlib not available. Visualization functions will be limited."
|
25
|
+
)
|
25
26
|
|
26
27
|
|
27
28
|
def plot_optimization_history(
|
@@ -98,13 +99,13 @@ def plot_parameter_importance(
|
|
98
99
|
|
99
100
|
fig, ax = plt.subplots(figsize=(10, 6))
|
100
101
|
y_pos = range(len(sorted_names))
|
101
|
-
|
102
|
+
|
102
103
|
# Create horizontal bar chart
|
103
104
|
ax.barh(y_pos, sorted_values, align="center")
|
104
105
|
ax.set_yticks(y_pos)
|
105
106
|
ax.set_yticklabels(sorted_names)
|
106
107
|
ax.invert_yaxis() # Labels read top-to-bottom
|
107
|
-
|
108
|
+
|
108
109
|
# Add labels and title
|
109
110
|
ax.set_xlabel("Importance")
|
110
111
|
ax.set_title(title)
|
@@ -144,31 +145,43 @@ def plot_quality_vs_speed(
|
|
144
145
|
return None
|
145
146
|
|
146
147
|
fig, ax = plt.subplots(figsize=(10, 8))
|
147
|
-
|
148
|
+
|
148
149
|
# Create scatter plot
|
149
150
|
scatter = ax.scatter(
|
150
|
-
speed_scores,
|
151
|
-
quality_scores,
|
152
|
-
c=np.arange(len(quality_scores)),
|
153
|
-
cmap="viridis",
|
151
|
+
speed_scores,
|
152
|
+
quality_scores,
|
153
|
+
c=np.arange(len(quality_scores)),
|
154
|
+
cmap="viridis",
|
154
155
|
alpha=0.7,
|
155
|
-
s=100
|
156
|
+
s=100,
|
156
157
|
)
|
157
|
-
|
158
|
+
|
158
159
|
# Add colorbar to show trial number
|
159
160
|
cbar = plt.colorbar(scatter)
|
160
161
|
cbar.set_label("Trial Number")
|
161
|
-
|
162
|
+
|
162
163
|
# Add labels and title
|
163
164
|
ax.set_xlabel("Speed Score (higher = faster)")
|
164
165
|
ax.set_ylabel("Quality Score (higher = better)")
|
165
166
|
ax.set_title(title)
|
166
167
|
ax.grid(True, linestyle="--", alpha=0.5)
|
167
|
-
|
168
|
+
|
168
169
|
# Add reference lines
|
169
|
-
ax.axhline(
|
170
|
-
|
171
|
-
|
170
|
+
ax.axhline(
|
171
|
+
y=0.7,
|
172
|
+
color="r",
|
173
|
+
linestyle="--",
|
174
|
+
alpha=0.3,
|
175
|
+
label="Good Quality Threshold",
|
176
|
+
)
|
177
|
+
ax.axvline(
|
178
|
+
x=0.7,
|
179
|
+
color="g",
|
180
|
+
linestyle="--",
|
181
|
+
alpha=0.3,
|
182
|
+
label="Good Speed Threshold",
|
183
|
+
)
|
184
|
+
|
172
185
|
# Mark Pareto frontier
|
173
186
|
if len(quality_scores) > 2:
|
174
187
|
try:
|
@@ -178,13 +191,19 @@ def plot_quality_vs_speed(
|
|
178
191
|
is_pareto = True
|
179
192
|
for j in range(len(quality_scores)):
|
180
193
|
if i != j:
|
181
|
-
if
|
182
|
-
|
194
|
+
if (
|
195
|
+
quality_scores[j] >= quality_scores[i]
|
196
|
+
and speed_scores[j] >= speed_scores[i]
|
197
|
+
):
|
198
|
+
if (
|
199
|
+
quality_scores[j] > quality_scores[i]
|
200
|
+
or speed_scores[j] > speed_scores[i]
|
201
|
+
):
|
183
202
|
is_pareto = False
|
184
203
|
break
|
185
204
|
if is_pareto:
|
186
205
|
pareto_points.append((speed_scores[i], quality_scores[i]))
|
187
|
-
|
206
|
+
|
188
207
|
# Sort pareto points by speed score
|
189
208
|
pareto_points.sort()
|
190
209
|
if pareto_points:
|
@@ -193,13 +212,13 @@ def plot_quality_vs_speed(
|
|
193
212
|
ax.scatter(pareto_x, pareto_y, c="red", s=50, alpha=0.8)
|
194
213
|
except Exception as e:
|
195
214
|
logger.warning(f"Error calculating Pareto frontier: {e}")
|
196
|
-
|
215
|
+
|
197
216
|
ax.legend()
|
198
|
-
|
217
|
+
|
199
218
|
# Save or return
|
200
219
|
if output_file:
|
201
220
|
fig.tight_layout()
|
202
221
|
fig.savefig(output_file, dpi=300, bbox_inches="tight")
|
203
222
|
logger.info(f"Saved quality vs. speed plot to {output_file}")
|
204
223
|
|
205
|
-
return fig
|
224
|
+
return fig
|
@@ -17,7 +17,9 @@ from local_deep_research.benchmarks.optimization.metrics import (
|
|
17
17
|
calculate_resource_metrics,
|
18
18
|
calculate_speed_metrics,
|
19
19
|
)
|
20
|
-
from local_deep_research.benchmarks.optimization.optuna_optimizer import
|
20
|
+
from local_deep_research.benchmarks.optimization.optuna_optimizer import (
|
21
|
+
OptunaOptimizer,
|
22
|
+
)
|
21
23
|
|
22
24
|
__all__ = [
|
23
25
|
"OptunaOptimizer",
|
@@ -257,7 +257,13 @@ def get_default_param_space() -> Dict[str, Any]:
|
|
257
257
|
},
|
258
258
|
"search_strategy": {
|
259
259
|
"type": "categorical",
|
260
|
-
"choices": [
|
260
|
+
"choices": [
|
261
|
+
"iterdrag",
|
262
|
+
"standard",
|
263
|
+
"rapid",
|
264
|
+
"parallel",
|
265
|
+
"source_based",
|
266
|
+
],
|
261
267
|
},
|
262
268
|
"max_results": {
|
263
269
|
"type": "int",
|
@@ -24,8 +24,12 @@ from optuna.visualization import (
|
|
24
24
|
plot_slice,
|
25
25
|
)
|
26
26
|
|
27
|
-
from local_deep_research.benchmarks.efficiency.speed_profiler import
|
28
|
-
|
27
|
+
from local_deep_research.benchmarks.efficiency.speed_profiler import (
|
28
|
+
SpeedProfiler,
|
29
|
+
)
|
30
|
+
from local_deep_research.benchmarks.evaluators import (
|
31
|
+
CompositeBenchmarkEvaluator,
|
32
|
+
)
|
29
33
|
|
30
34
|
# Import benchmark evaluator components
|
31
35
|
|
@@ -108,7 +112,9 @@ class OptunaOptimizer:
|
|
108
112
|
|
109
113
|
# Initialize benchmark evaluator with weights
|
110
114
|
self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
|
111
|
-
self.benchmark_evaluator = CompositeBenchmarkEvaluator(
|
115
|
+
self.benchmark_evaluator = CompositeBenchmarkEvaluator(
|
116
|
+
self.benchmark_weights
|
117
|
+
)
|
112
118
|
|
113
119
|
# Normalize weights to sum to 1.0
|
114
120
|
total_weight = sum(self.metric_weights.values())
|
@@ -200,7 +206,9 @@ class OptunaOptimizer:
|
|
200
206
|
# Create visualizations
|
201
207
|
self._create_visualizations()
|
202
208
|
|
203
|
-
logger.info(
|
209
|
+
logger.info(
|
210
|
+
f"Optimization complete. Best parameters: {self.best_params}"
|
211
|
+
)
|
204
212
|
logger.info(f"Best value: {self.study.best_value}")
|
205
213
|
|
206
214
|
# Report completion
|
@@ -281,7 +289,9 @@ class OptunaOptimizer:
|
|
281
289
|
},
|
282
290
|
}
|
283
291
|
|
284
|
-
def _objective(
|
292
|
+
def _objective(
|
293
|
+
self, trial: optuna.Trial, param_space: Dict[str, Any]
|
294
|
+
) -> float:
|
285
295
|
"""
|
286
296
|
Objective function for Optuna optimization.
|
287
297
|
|
@@ -496,7 +506,9 @@ class OptunaOptimizer:
|
|
496
506
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
497
507
|
|
498
508
|
# Save trial history
|
499
|
-
history_file = os.path.join(
|
509
|
+
history_file = os.path.join(
|
510
|
+
self.output_dir, f"{self.study_name}_history.json"
|
511
|
+
)
|
500
512
|
with open(history_file, "w") as f:
|
501
513
|
# Convert numpy values to native Python types for JSON serialization
|
502
514
|
clean_history = []
|
@@ -517,7 +529,11 @@ class OptunaOptimizer:
|
|
517
529
|
json.dump(clean_history, f, indent=2)
|
518
530
|
|
519
531
|
# Save current best parameters
|
520
|
-
if
|
532
|
+
if (
|
533
|
+
self.study
|
534
|
+
and hasattr(self.study, "best_params")
|
535
|
+
and self.study.best_params
|
536
|
+
):
|
521
537
|
best_params_file = os.path.join(
|
522
538
|
self.output_dir, f"{self.study_name}_best_params.json"
|
523
539
|
)
|
@@ -541,7 +557,9 @@ class OptunaOptimizer:
|
|
541
557
|
|
542
558
|
# Save the Optuna study
|
543
559
|
if self.study:
|
544
|
-
study_file = os.path.join(
|
560
|
+
study_file = os.path.join(
|
561
|
+
self.output_dir, f"{self.study_name}_study.pkl"
|
562
|
+
)
|
545
563
|
joblib.dump(self.study, study_file)
|
546
564
|
|
547
565
|
logger.info(f"Results saved to {self.output_dir}")
|
@@ -549,7 +567,9 @@ class OptunaOptimizer:
|
|
549
567
|
def _create_visualizations(self):
|
550
568
|
"""Create and save comprehensive visualizations of the optimization results."""
|
551
569
|
if not PLOTTING_AVAILABLE:
|
552
|
-
logger.warning(
|
570
|
+
logger.warning(
|
571
|
+
"Matplotlib not available, skipping visualization creation"
|
572
|
+
)
|
553
573
|
return
|
554
574
|
|
555
575
|
if not self.study or len(self.study.trials) < 2:
|
@@ -570,7 +590,11 @@ class OptunaOptimizer:
|
|
570
590
|
|
571
591
|
def _create_quick_visualizations(self):
|
572
592
|
"""Create a smaller set of visualizations for intermediate progress."""
|
573
|
-
if
|
593
|
+
if (
|
594
|
+
not PLOTTING_AVAILABLE
|
595
|
+
or not self.study
|
596
|
+
or len(self.study.trials) < 2
|
597
|
+
):
|
574
598
|
return
|
575
599
|
|
576
600
|
# Create directory for visualizations
|
@@ -582,7 +606,8 @@ class OptunaOptimizer:
|
|
582
606
|
fig = plot_optimization_history(self.study)
|
583
607
|
fig.write_image(
|
584
608
|
os.path.join(
|
585
|
-
viz_dir,
|
609
|
+
viz_dir,
|
610
|
+
f"{self.study_name}_optimization_history_current.png",
|
586
611
|
)
|
587
612
|
)
|
588
613
|
except Exception as e:
|
@@ -602,7 +627,8 @@ class OptunaOptimizer:
|
|
602
627
|
fig = plot_optimization_history(self.study)
|
603
628
|
fig.write_image(
|
604
629
|
os.path.join(
|
605
|
-
viz_dir,
|
630
|
+
viz_dir,
|
631
|
+
f"{self.study_name}_optimization_history_{timestamp}.png",
|
606
632
|
)
|
607
633
|
)
|
608
634
|
except Exception as e:
|
@@ -613,7 +639,8 @@ class OptunaOptimizer:
|
|
613
639
|
fig = plot_param_importances(self.study)
|
614
640
|
fig.write_image(
|
615
641
|
os.path.join(
|
616
|
-
viz_dir,
|
642
|
+
viz_dir,
|
643
|
+
f"{self.study_name}_param_importances_{timestamp}.png",
|
617
644
|
)
|
618
645
|
)
|
619
646
|
except Exception as e:
|
@@ -625,7 +652,8 @@ class OptunaOptimizer:
|
|
625
652
|
fig = plot_slice(self.study, [param_name])
|
626
653
|
fig.write_image(
|
627
654
|
os.path.join(
|
628
|
-
viz_dir,
|
655
|
+
viz_dir,
|
656
|
+
f"{self.study_name}_slice_{param_name}_{timestamp}.png",
|
629
657
|
)
|
630
658
|
)
|
631
659
|
except Exception as e:
|
@@ -684,7 +712,9 @@ class OptunaOptimizer:
|
|
684
712
|
|
685
713
|
# Extract data from successful trials
|
686
714
|
successful_trials = [
|
687
|
-
t
|
715
|
+
t
|
716
|
+
for t in self.trials_history
|
717
|
+
if t.get("result", {}).get("success", False)
|
688
718
|
]
|
689
719
|
|
690
720
|
if not successful_trials:
|
@@ -715,7 +745,9 @@ class OptunaOptimizer:
|
|
715
745
|
questions_values.append(questions)
|
716
746
|
|
717
747
|
# Create scatter plot with size based on iterations*questions
|
718
|
-
sizes = [
|
748
|
+
sizes = [
|
749
|
+
i * q * 5 for i, q in zip(iterations_values, questions_values)
|
750
|
+
]
|
719
751
|
scatter = plt.scatter(
|
720
752
|
quality_scores,
|
721
753
|
speed_scores,
|
@@ -727,12 +759,15 @@ class OptunaOptimizer:
|
|
727
759
|
|
728
760
|
# Highlight best trial
|
729
761
|
best_trial = max(
|
730
|
-
successful_trials,
|
762
|
+
successful_trials,
|
763
|
+
key=lambda x: x.get("result", {}).get("score", 0),
|
731
764
|
)
|
732
765
|
best_quality = best_trial["result"].get("quality_score", 0)
|
733
766
|
best_speed = best_trial["result"].get("speed_score", 0)
|
734
767
|
best_iter = best_trial["params"].get("iterations", 0)
|
735
|
-
best_questions = best_trial["params"].get(
|
768
|
+
best_questions = best_trial["params"].get(
|
769
|
+
"questions_per_iteration", 0
|
770
|
+
)
|
736
771
|
|
737
772
|
plt.scatter(
|
738
773
|
[best_quality],
|
@@ -745,7 +780,9 @@ class OptunaOptimizer:
|
|
745
780
|
)
|
746
781
|
|
747
782
|
# Add annotations for key points
|
748
|
-
for i, (q, s,
|
783
|
+
for i, (q, s, label) in enumerate(
|
784
|
+
zip(quality_scores, speed_scores, labels)
|
785
|
+
):
|
749
786
|
if i % max(1, len(quality_scores) // 5) == 0: # Label ~5 points
|
750
787
|
plt.annotate(
|
751
788
|
f"{iterations_values[i]}×{questions_values[i]}",
|
@@ -762,7 +799,9 @@ class OptunaOptimizer:
|
|
762
799
|
weights_str = ", ".join(
|
763
800
|
[f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()]
|
764
801
|
)
|
765
|
-
plt.title(
|
802
|
+
plt.title(
|
803
|
+
f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}"
|
804
|
+
)
|
766
805
|
plt.xlabel("Quality Score (Benchmark Accuracy)")
|
767
806
|
plt.ylabel("Speed Score")
|
768
807
|
plt.grid(True, linestyle="--", alpha=0.7)
|
@@ -786,7 +825,8 @@ class OptunaOptimizer:
|
|
786
825
|
plt.tight_layout()
|
787
826
|
plt.savefig(
|
788
827
|
os.path.join(
|
789
|
-
viz_dir,
|
828
|
+
viz_dir,
|
829
|
+
f"{self.study_name}_quality_vs_speed_{timestamp}.png",
|
790
830
|
)
|
791
831
|
)
|
792
832
|
plt.close()
|
@@ -895,7 +935,9 @@ class OptunaOptimizer:
|
|
895
935
|
duration = trial.get("duration", 0)
|
896
936
|
score = trial.get("score", 0)
|
897
937
|
iterations = trial.get("params", {}).get("iterations", 1)
|
898
|
-
questions = trial.get("params", {}).get(
|
938
|
+
questions = trial.get("params", {}).get(
|
939
|
+
"questions_per_iteration", 1
|
940
|
+
)
|
899
941
|
|
900
942
|
trial_durations.append(duration)
|
901
943
|
trial_scores.append(score)
|
@@ -903,13 +945,17 @@ class OptunaOptimizer:
|
|
903
945
|
trial_questions.append(questions)
|
904
946
|
|
905
947
|
# Total questions per trial
|
906
|
-
total_questions = [
|
948
|
+
total_questions = [
|
949
|
+
i * q for i, q in zip(trial_iterations, trial_questions)
|
950
|
+
]
|
907
951
|
|
908
952
|
# Create scatter plot with size based on total questions
|
909
953
|
plt.scatter(
|
910
954
|
trial_durations,
|
911
955
|
trial_scores,
|
912
|
-
s=[
|
956
|
+
s=[
|
957
|
+
q * 5 for q in total_questions
|
958
|
+
], # Size based on total questions
|
913
959
|
alpha=0.7,
|
914
960
|
c=range(len(trial_durations)),
|
915
961
|
cmap="viridis",
|
@@ -923,7 +969,9 @@ class OptunaOptimizer:
|
|
923
969
|
|
924
970
|
# Add trial number annotations for selected points
|
925
971
|
for i, (d, s) in enumerate(zip(trial_durations, trial_scores)):
|
926
|
-
if
|
972
|
+
if (
|
973
|
+
i % max(1, len(trial_durations) // 5) == 0
|
974
|
+
): # Annotate ~5 points
|
927
975
|
plt.annotate(
|
928
976
|
f"{trial_iterations[i]}×{trial_questions[i]}",
|
929
977
|
(d, s),
|
@@ -935,7 +983,8 @@ class OptunaOptimizer:
|
|
935
983
|
plt.tight_layout()
|
936
984
|
plt.savefig(
|
937
985
|
os.path.join(
|
938
|
-
viz_dir,
|
986
|
+
viz_dir,
|
987
|
+
f"{self.study_name}_duration_vs_score_{timestamp}.png",
|
939
988
|
)
|
940
989
|
)
|
941
990
|
plt.close()
|
@@ -92,10 +92,14 @@ def run_benchmark(
|
|
92
92
|
# Load the examples
|
93
93
|
dataset = dataset_instance.load()
|
94
94
|
|
95
|
-
logger.info(
|
95
|
+
logger.info(
|
96
|
+
f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}"
|
97
|
+
)
|
96
98
|
except Exception as e:
|
97
99
|
# Fallback to legacy function if there's any issue
|
98
|
-
logger.warning(
|
100
|
+
logger.warning(
|
101
|
+
f"Error using dataset class: {e}. Falling back to legacy function."
|
102
|
+
)
|
99
103
|
dataset = load_dataset(
|
100
104
|
dataset_type=dataset_type,
|
101
105
|
dataset_path=dataset_path,
|
@@ -105,11 +109,15 @@ def run_benchmark(
|
|
105
109
|
|
106
110
|
# Set up output files
|
107
111
|
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
108
|
-
results_file = os.path.join(
|
112
|
+
results_file = os.path.join(
|
113
|
+
output_dir, f"{dataset_type}_{timestamp}_results.jsonl"
|
114
|
+
)
|
109
115
|
evaluation_file = os.path.join(
|
110
116
|
output_dir, f"{dataset_type}_{timestamp}_evaluation.jsonl"
|
111
117
|
)
|
112
|
-
report_file = os.path.join(
|
118
|
+
report_file = os.path.join(
|
119
|
+
output_dir, f"{dataset_type}_{timestamp}_report.md"
|
120
|
+
)
|
113
121
|
|
114
122
|
# Make sure output files don't exist
|
115
123
|
for file in [results_file, evaluation_file, report_file]:
|
@@ -135,11 +143,16 @@ def run_benchmark(
|
|
135
143
|
|
136
144
|
for i, example in enumerate(dataset):
|
137
145
|
# Extract question and answer in a way that uses the dataset class when available
|
138
|
-
if
|
146
|
+
if "dataset_instance" in locals() and isinstance(
|
147
|
+
dataset_instance,
|
148
|
+
DatasetRegistry.get_dataset_class(dataset_type.lower()),
|
149
|
+
):
|
139
150
|
# Use the dataset class methods to extract question and answer
|
140
151
|
question = dataset_instance.get_question(example)
|
141
152
|
correct_answer = dataset_instance.get_answer(example)
|
142
|
-
logger.debug(
|
153
|
+
logger.debug(
|
154
|
+
"Using dataset class methods to extract question and answer"
|
155
|
+
)
|
143
156
|
else:
|
144
157
|
# Fallback to the legacy approach
|
145
158
|
if dataset_type.lower() == "simpleqa":
|
@@ -163,7 +176,9 @@ def run_benchmark(
|
|
163
176
|
"current": i + 1,
|
164
177
|
"total": total_examples,
|
165
178
|
"question": (
|
166
|
-
question[:50] + "..."
|
179
|
+
question[:50] + "..."
|
180
|
+
if len(question) > 50
|
181
|
+
else question
|
167
182
|
),
|
168
183
|
},
|
169
184
|
)
|
@@ -181,7 +196,9 @@ def run_benchmark(
|
|
181
196
|
search_result = quick_summary(
|
182
197
|
query=formatted_query,
|
183
198
|
iterations=search_config.get("iterations", 3),
|
184
|
-
questions_per_iteration=search_config.get(
|
199
|
+
questions_per_iteration=search_config.get(
|
200
|
+
"questions_per_iteration", 3
|
201
|
+
),
|
185
202
|
search_tool=search_config.get("search_tool", "searxng"),
|
186
203
|
)
|
187
204
|
|
@@ -278,7 +295,9 @@ def run_benchmark(
|
|
278
295
|
|
279
296
|
logger.info("Running human evaluation...")
|
280
297
|
evaluation_results = evaluate(
|
281
|
-
results_file=results_file,
|
298
|
+
results_file=results_file,
|
299
|
+
output_file=evaluation_file,
|
300
|
+
interactive=True,
|
282
301
|
)
|
283
302
|
else:
|
284
303
|
logger.info("Running automated evaluation...")
|
@@ -349,7 +368,9 @@ def run_benchmark(
|
|
349
368
|
|
350
369
|
# Generate report
|
351
370
|
if progress_callback:
|
352
|
-
progress_callback(
|
371
|
+
progress_callback(
|
372
|
+
"Generating report", 95, {"status": "generating_report"}
|
373
|
+
)
|
353
374
|
|
354
375
|
dataset_name = dataset_type.capitalize()
|
355
376
|
report_path = generate_report(
|
@@ -366,7 +387,9 @@ def run_benchmark(
|
|
366
387
|
"questions_per_iteration", 3
|
367
388
|
),
|
368
389
|
"Search tool": search_config.get("search_tool", "searxng"),
|
369
|
-
"Evaluation method": "Human"
|
390
|
+
"Evaluation method": "Human"
|
391
|
+
if human_evaluation
|
392
|
+
else "Automated",
|
370
393
|
},
|
371
394
|
)
|
372
395
|
|
@@ -375,7 +398,11 @@ def run_benchmark(
|
|
375
398
|
progress_callback(
|
376
399
|
"Benchmark complete",
|
377
400
|
100,
|
378
|
-
{
|
401
|
+
{
|
402
|
+
"status": "complete",
|
403
|
+
"metrics": metrics,
|
404
|
+
"report_path": report_path,
|
405
|
+
},
|
379
406
|
)
|
380
407
|
|
381
408
|
return {
|
@@ -417,10 +444,14 @@ def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
|
|
417
444
|
Returns:
|
418
445
|
Dictionary with benchmark results
|
419
446
|
"""
|
420
|
-
return run_benchmark(
|
447
|
+
return run_benchmark(
|
448
|
+
dataset_type="simpleqa", num_examples=num_examples, **kwargs
|
449
|
+
)
|
421
450
|
|
422
451
|
|
423
|
-
def run_browsecomp_benchmark(
|
452
|
+
def run_browsecomp_benchmark(
|
453
|
+
num_examples: int = 100, **kwargs
|
454
|
+
) -> Dict[str, Any]:
|
424
455
|
"""
|
425
456
|
Run BrowseComp benchmark with default settings.
|
426
457
|
|
@@ -431,4 +462,6 @@ def run_browsecomp_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any
|
|
431
462
|
Returns:
|
432
463
|
Dictionary with benchmark results
|
433
464
|
"""
|
434
|
-
return run_benchmark(
|
465
|
+
return run_benchmark(
|
466
|
+
dataset_type="browsecomp", num_examples=num_examples, **kwargs
|
467
|
+
)
|