local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -6,8 +6,7 @@ of benchmark and optimization results.
6
6
  """
7
7
 
8
8
  import logging
9
- import os
10
- from typing import Dict, List, Optional, Tuple, Union
9
+ from typing import Dict, List, Optional
11
10
 
12
11
  import numpy as np
13
12
 
@@ -21,7 +20,9 @@ try:
21
20
  MATPLOTLIB_AVAILABLE = True
22
21
  except ImportError:
23
22
  MATPLOTLIB_AVAILABLE = False
24
- logger.warning("Matplotlib not available. Visualization functions will be limited.")
23
+ logger.warning(
24
+ "Matplotlib not available. Visualization functions will be limited."
25
+ )
25
26
 
26
27
 
27
28
  def plot_optimization_history(
@@ -98,13 +99,13 @@ def plot_parameter_importance(
98
99
 
99
100
  fig, ax = plt.subplots(figsize=(10, 6))
100
101
  y_pos = range(len(sorted_names))
101
-
102
+
102
103
  # Create horizontal bar chart
103
104
  ax.barh(y_pos, sorted_values, align="center")
104
105
  ax.set_yticks(y_pos)
105
106
  ax.set_yticklabels(sorted_names)
106
107
  ax.invert_yaxis() # Labels read top-to-bottom
107
-
108
+
108
109
  # Add labels and title
109
110
  ax.set_xlabel("Importance")
110
111
  ax.set_title(title)
@@ -144,31 +145,43 @@ def plot_quality_vs_speed(
144
145
  return None
145
146
 
146
147
  fig, ax = plt.subplots(figsize=(10, 8))
147
-
148
+
148
149
  # Create scatter plot
149
150
  scatter = ax.scatter(
150
- speed_scores,
151
- quality_scores,
152
- c=np.arange(len(quality_scores)),
153
- cmap="viridis",
151
+ speed_scores,
152
+ quality_scores,
153
+ c=np.arange(len(quality_scores)),
154
+ cmap="viridis",
154
155
  alpha=0.7,
155
- s=100
156
+ s=100,
156
157
  )
157
-
158
+
158
159
  # Add colorbar to show trial number
159
160
  cbar = plt.colorbar(scatter)
160
161
  cbar.set_label("Trial Number")
161
-
162
+
162
163
  # Add labels and title
163
164
  ax.set_xlabel("Speed Score (higher = faster)")
164
165
  ax.set_ylabel("Quality Score (higher = better)")
165
166
  ax.set_title(title)
166
167
  ax.grid(True, linestyle="--", alpha=0.5)
167
-
168
+
168
169
  # Add reference lines
169
- ax.axhline(y=0.7, color="r", linestyle="--", alpha=0.3, label="Good Quality Threshold")
170
- ax.axvline(x=0.7, color="g", linestyle="--", alpha=0.3, label="Good Speed Threshold")
171
-
170
+ ax.axhline(
171
+ y=0.7,
172
+ color="r",
173
+ linestyle="--",
174
+ alpha=0.3,
175
+ label="Good Quality Threshold",
176
+ )
177
+ ax.axvline(
178
+ x=0.7,
179
+ color="g",
180
+ linestyle="--",
181
+ alpha=0.3,
182
+ label="Good Speed Threshold",
183
+ )
184
+
172
185
  # Mark Pareto frontier
173
186
  if len(quality_scores) > 2:
174
187
  try:
@@ -178,13 +191,19 @@ def plot_quality_vs_speed(
178
191
  is_pareto = True
179
192
  for j in range(len(quality_scores)):
180
193
  if i != j:
181
- if quality_scores[j] >= quality_scores[i] and speed_scores[j] >= speed_scores[i]:
182
- if quality_scores[j] > quality_scores[i] or speed_scores[j] > speed_scores[i]:
194
+ if (
195
+ quality_scores[j] >= quality_scores[i]
196
+ and speed_scores[j] >= speed_scores[i]
197
+ ):
198
+ if (
199
+ quality_scores[j] > quality_scores[i]
200
+ or speed_scores[j] > speed_scores[i]
201
+ ):
183
202
  is_pareto = False
184
203
  break
185
204
  if is_pareto:
186
205
  pareto_points.append((speed_scores[i], quality_scores[i]))
187
-
206
+
188
207
  # Sort pareto points by speed score
189
208
  pareto_points.sort()
190
209
  if pareto_points:
@@ -193,13 +212,13 @@ def plot_quality_vs_speed(
193
212
  ax.scatter(pareto_x, pareto_y, c="red", s=50, alpha=0.8)
194
213
  except Exception as e:
195
214
  logger.warning(f"Error calculating Pareto frontier: {e}")
196
-
215
+
197
216
  ax.legend()
198
-
217
+
199
218
  # Save or return
200
219
  if output_file:
201
220
  fig.tight_layout()
202
221
  fig.savefig(output_file, dpi=300, bbox_inches="tight")
203
222
  logger.info(f"Saved quality vs. speed plot to {output_file}")
204
223
 
205
- return fig
224
+ return fig
@@ -8,4 +8,4 @@ New code should use the metrics package directly.
8
8
  from .metrics.calculation import calculate_metrics
9
9
  from .metrics.reporting import generate_report
10
10
 
11
- __all__ = ["calculate_metrics", "generate_report"]
11
+ __all__ = ["calculate_metrics", "generate_report"]
@@ -17,7 +17,9 @@ from local_deep_research.benchmarks.optimization.metrics import (
17
17
  calculate_resource_metrics,
18
18
  calculate_speed_metrics,
19
19
  )
20
- from local_deep_research.benchmarks.optimization.optuna_optimizer import OptunaOptimizer
20
+ from local_deep_research.benchmarks.optimization.optuna_optimizer import (
21
+ OptunaOptimizer,
22
+ )
21
23
 
22
24
  __all__ = [
23
25
  "OptunaOptimizer",
@@ -257,7 +257,13 @@ def get_default_param_space() -> Dict[str, Any]:
257
257
  },
258
258
  "search_strategy": {
259
259
  "type": "categorical",
260
- "choices": ["iterdrag", "standard", "rapid", "parallel", "source_based"],
260
+ "choices": [
261
+ "iterdrag",
262
+ "standard",
263
+ "rapid",
264
+ "parallel",
265
+ "source_based",
266
+ ],
261
267
  },
262
268
  "max_results": {
263
269
  "type": "int",
@@ -24,8 +24,12 @@ from optuna.visualization import (
24
24
  plot_slice,
25
25
  )
26
26
 
27
- from local_deep_research.benchmarks.efficiency.speed_profiler import SpeedProfiler
28
- from local_deep_research.benchmarks.evaluators import CompositeBenchmarkEvaluator
27
+ from local_deep_research.benchmarks.efficiency.speed_profiler import (
28
+ SpeedProfiler,
29
+ )
30
+ from local_deep_research.benchmarks.evaluators import (
31
+ CompositeBenchmarkEvaluator,
32
+ )
29
33
 
30
34
  # Import benchmark evaluator components
31
35
 
@@ -108,7 +112,9 @@ class OptunaOptimizer:
108
112
 
109
113
  # Initialize benchmark evaluator with weights
110
114
  self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
111
- self.benchmark_evaluator = CompositeBenchmarkEvaluator(self.benchmark_weights)
115
+ self.benchmark_evaluator = CompositeBenchmarkEvaluator(
116
+ self.benchmark_weights
117
+ )
112
118
 
113
119
  # Normalize weights to sum to 1.0
114
120
  total_weight = sum(self.metric_weights.values())
@@ -200,7 +206,9 @@ class OptunaOptimizer:
200
206
  # Create visualizations
201
207
  self._create_visualizations()
202
208
 
203
- logger.info(f"Optimization complete. Best parameters: {self.best_params}")
209
+ logger.info(
210
+ f"Optimization complete. Best parameters: {self.best_params}"
211
+ )
204
212
  logger.info(f"Best value: {self.study.best_value}")
205
213
 
206
214
  # Report completion
@@ -281,7 +289,9 @@ class OptunaOptimizer:
281
289
  },
282
290
  }
283
291
 
284
- def _objective(self, trial: optuna.Trial, param_space: Dict[str, Any]) -> float:
292
+ def _objective(
293
+ self, trial: optuna.Trial, param_space: Dict[str, Any]
294
+ ) -> float:
285
295
  """
286
296
  Objective function for Optuna optimization.
287
297
 
@@ -496,7 +506,9 @@ class OptunaOptimizer:
496
506
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
497
507
 
498
508
  # Save trial history
499
- history_file = os.path.join(self.output_dir, f"{self.study_name}_history.json")
509
+ history_file = os.path.join(
510
+ self.output_dir, f"{self.study_name}_history.json"
511
+ )
500
512
  with open(history_file, "w") as f:
501
513
  # Convert numpy values to native Python types for JSON serialization
502
514
  clean_history = []
@@ -517,7 +529,11 @@ class OptunaOptimizer:
517
529
  json.dump(clean_history, f, indent=2)
518
530
 
519
531
  # Save current best parameters
520
- if self.study and hasattr(self.study, "best_params") and self.study.best_params:
532
+ if (
533
+ self.study
534
+ and hasattr(self.study, "best_params")
535
+ and self.study.best_params
536
+ ):
521
537
  best_params_file = os.path.join(
522
538
  self.output_dir, f"{self.study_name}_best_params.json"
523
539
  )
@@ -541,7 +557,9 @@ class OptunaOptimizer:
541
557
 
542
558
  # Save the Optuna study
543
559
  if self.study:
544
- study_file = os.path.join(self.output_dir, f"{self.study_name}_study.pkl")
560
+ study_file = os.path.join(
561
+ self.output_dir, f"{self.study_name}_study.pkl"
562
+ )
545
563
  joblib.dump(self.study, study_file)
546
564
 
547
565
  logger.info(f"Results saved to {self.output_dir}")
@@ -549,7 +567,9 @@ class OptunaOptimizer:
549
567
  def _create_visualizations(self):
550
568
  """Create and save comprehensive visualizations of the optimization results."""
551
569
  if not PLOTTING_AVAILABLE:
552
- logger.warning("Matplotlib not available, skipping visualization creation")
570
+ logger.warning(
571
+ "Matplotlib not available, skipping visualization creation"
572
+ )
553
573
  return
554
574
 
555
575
  if not self.study or len(self.study.trials) < 2:
@@ -570,7 +590,11 @@ class OptunaOptimizer:
570
590
 
571
591
  def _create_quick_visualizations(self):
572
592
  """Create a smaller set of visualizations for intermediate progress."""
573
- if not PLOTTING_AVAILABLE or not self.study or len(self.study.trials) < 2:
593
+ if (
594
+ not PLOTTING_AVAILABLE
595
+ or not self.study
596
+ or len(self.study.trials) < 2
597
+ ):
574
598
  return
575
599
 
576
600
  # Create directory for visualizations
@@ -582,7 +606,8 @@ class OptunaOptimizer:
582
606
  fig = plot_optimization_history(self.study)
583
607
  fig.write_image(
584
608
  os.path.join(
585
- viz_dir, f"{self.study_name}_optimization_history_current.png"
609
+ viz_dir,
610
+ f"{self.study_name}_optimization_history_current.png",
586
611
  )
587
612
  )
588
613
  except Exception as e:
@@ -602,7 +627,8 @@ class OptunaOptimizer:
602
627
  fig = plot_optimization_history(self.study)
603
628
  fig.write_image(
604
629
  os.path.join(
605
- viz_dir, f"{self.study_name}_optimization_history_{timestamp}.png"
630
+ viz_dir,
631
+ f"{self.study_name}_optimization_history_{timestamp}.png",
606
632
  )
607
633
  )
608
634
  except Exception as e:
@@ -613,7 +639,8 @@ class OptunaOptimizer:
613
639
  fig = plot_param_importances(self.study)
614
640
  fig.write_image(
615
641
  os.path.join(
616
- viz_dir, f"{self.study_name}_param_importances_{timestamp}.png"
642
+ viz_dir,
643
+ f"{self.study_name}_param_importances_{timestamp}.png",
617
644
  )
618
645
  )
619
646
  except Exception as e:
@@ -625,7 +652,8 @@ class OptunaOptimizer:
625
652
  fig = plot_slice(self.study, [param_name])
626
653
  fig.write_image(
627
654
  os.path.join(
628
- viz_dir, f"{self.study_name}_slice_{param_name}_{timestamp}.png"
655
+ viz_dir,
656
+ f"{self.study_name}_slice_{param_name}_{timestamp}.png",
629
657
  )
630
658
  )
631
659
  except Exception as e:
@@ -684,7 +712,9 @@ class OptunaOptimizer:
684
712
 
685
713
  # Extract data from successful trials
686
714
  successful_trials = [
687
- t for t in self.trials_history if t.get("result", {}).get("success", False)
715
+ t
716
+ for t in self.trials_history
717
+ if t.get("result", {}).get("success", False)
688
718
  ]
689
719
 
690
720
  if not successful_trials:
@@ -715,7 +745,9 @@ class OptunaOptimizer:
715
745
  questions_values.append(questions)
716
746
 
717
747
  # Create scatter plot with size based on iterations*questions
718
- sizes = [i * q * 5 for i, q in zip(iterations_values, questions_values)]
748
+ sizes = [
749
+ i * q * 5 for i, q in zip(iterations_values, questions_values)
750
+ ]
719
751
  scatter = plt.scatter(
720
752
  quality_scores,
721
753
  speed_scores,
@@ -727,12 +759,15 @@ class OptunaOptimizer:
727
759
 
728
760
  # Highlight best trial
729
761
  best_trial = max(
730
- successful_trials, key=lambda x: x.get("result", {}).get("score", 0)
762
+ successful_trials,
763
+ key=lambda x: x.get("result", {}).get("score", 0),
731
764
  )
732
765
  best_quality = best_trial["result"].get("quality_score", 0)
733
766
  best_speed = best_trial["result"].get("speed_score", 0)
734
767
  best_iter = best_trial["params"].get("iterations", 0)
735
- best_questions = best_trial["params"].get("questions_per_iteration", 0)
768
+ best_questions = best_trial["params"].get(
769
+ "questions_per_iteration", 0
770
+ )
736
771
 
737
772
  plt.scatter(
738
773
  [best_quality],
@@ -745,7 +780,9 @@ class OptunaOptimizer:
745
780
  )
746
781
 
747
782
  # Add annotations for key points
748
- for i, (q, s, l) in enumerate(zip(quality_scores, speed_scores, labels)):
783
+ for i, (q, s, label) in enumerate(
784
+ zip(quality_scores, speed_scores, labels)
785
+ ):
749
786
  if i % max(1, len(quality_scores) // 5) == 0: # Label ~5 points
750
787
  plt.annotate(
751
788
  f"{iterations_values[i]}×{questions_values[i]}",
@@ -762,7 +799,9 @@ class OptunaOptimizer:
762
799
  weights_str = ", ".join(
763
800
  [f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()]
764
801
  )
765
- plt.title(f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}")
802
+ plt.title(
803
+ f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}"
804
+ )
766
805
  plt.xlabel("Quality Score (Benchmark Accuracy)")
767
806
  plt.ylabel("Speed Score")
768
807
  plt.grid(True, linestyle="--", alpha=0.7)
@@ -786,7 +825,8 @@ class OptunaOptimizer:
786
825
  plt.tight_layout()
787
826
  plt.savefig(
788
827
  os.path.join(
789
- viz_dir, f"{self.study_name}_quality_vs_speed_{timestamp}.png"
828
+ viz_dir,
829
+ f"{self.study_name}_quality_vs_speed_{timestamp}.png",
790
830
  )
791
831
  )
792
832
  plt.close()
@@ -895,7 +935,9 @@ class OptunaOptimizer:
895
935
  duration = trial.get("duration", 0)
896
936
  score = trial.get("score", 0)
897
937
  iterations = trial.get("params", {}).get("iterations", 1)
898
- questions = trial.get("params", {}).get("questions_per_iteration", 1)
938
+ questions = trial.get("params", {}).get(
939
+ "questions_per_iteration", 1
940
+ )
899
941
 
900
942
  trial_durations.append(duration)
901
943
  trial_scores.append(score)
@@ -903,13 +945,17 @@ class OptunaOptimizer:
903
945
  trial_questions.append(questions)
904
946
 
905
947
  # Total questions per trial
906
- total_questions = [i * q for i, q in zip(trial_iterations, trial_questions)]
948
+ total_questions = [
949
+ i * q for i, q in zip(trial_iterations, trial_questions)
950
+ ]
907
951
 
908
952
  # Create scatter plot with size based on total questions
909
953
  plt.scatter(
910
954
  trial_durations,
911
955
  trial_scores,
912
- s=[q * 5 for q in total_questions], # Size based on total questions
956
+ s=[
957
+ q * 5 for q in total_questions
958
+ ], # Size based on total questions
913
959
  alpha=0.7,
914
960
  c=range(len(trial_durations)),
915
961
  cmap="viridis",
@@ -923,7 +969,9 @@ class OptunaOptimizer:
923
969
 
924
970
  # Add trial number annotations for selected points
925
971
  for i, (d, s) in enumerate(zip(trial_durations, trial_scores)):
926
- if i % max(1, len(trial_durations) // 5) == 0: # Annotate ~5 points
972
+ if (
973
+ i % max(1, len(trial_durations) // 5) == 0
974
+ ): # Annotate ~5 points
927
975
  plt.annotate(
928
976
  f"{trial_iterations[i]}×{trial_questions[i]}",
929
977
  (d, s),
@@ -935,7 +983,8 @@ class OptunaOptimizer:
935
983
  plt.tight_layout()
936
984
  plt.savefig(
937
985
  os.path.join(
938
- viz_dir, f"{self.study_name}_duration_vs_score_{timestamp}.png"
986
+ viz_dir,
987
+ f"{self.study_name}_duration_vs_score_{timestamp}.png",
939
988
  )
940
989
  )
941
990
  plt.close()
@@ -92,10 +92,14 @@ def run_benchmark(
92
92
  # Load the examples
93
93
  dataset = dataset_instance.load()
94
94
 
95
- logger.info(f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}")
95
+ logger.info(
96
+ f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}"
97
+ )
96
98
  except Exception as e:
97
99
  # Fallback to legacy function if there's any issue
98
- logger.warning(f"Error using dataset class: {e}. Falling back to legacy function.")
100
+ logger.warning(
101
+ f"Error using dataset class: {e}. Falling back to legacy function."
102
+ )
99
103
  dataset = load_dataset(
100
104
  dataset_type=dataset_type,
101
105
  dataset_path=dataset_path,
@@ -105,11 +109,15 @@ def run_benchmark(
105
109
 
106
110
  # Set up output files
107
111
  timestamp = time.strftime("%Y%m%d_%H%M%S")
108
- results_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_results.jsonl")
112
+ results_file = os.path.join(
113
+ output_dir, f"{dataset_type}_{timestamp}_results.jsonl"
114
+ )
109
115
  evaluation_file = os.path.join(
110
116
  output_dir, f"{dataset_type}_{timestamp}_evaluation.jsonl"
111
117
  )
112
- report_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_report.md")
118
+ report_file = os.path.join(
119
+ output_dir, f"{dataset_type}_{timestamp}_report.md"
120
+ )
113
121
 
114
122
  # Make sure output files don't exist
115
123
  for file in [results_file, evaluation_file, report_file]:
@@ -135,11 +143,16 @@ def run_benchmark(
135
143
 
136
144
  for i, example in enumerate(dataset):
137
145
  # Extract question and answer in a way that uses the dataset class when available
138
- if 'dataset_instance' in locals() and isinstance(dataset_instance, DatasetRegistry.get_dataset_class(dataset_type.lower())):
146
+ if "dataset_instance" in locals() and isinstance(
147
+ dataset_instance,
148
+ DatasetRegistry.get_dataset_class(dataset_type.lower()),
149
+ ):
139
150
  # Use the dataset class methods to extract question and answer
140
151
  question = dataset_instance.get_question(example)
141
152
  correct_answer = dataset_instance.get_answer(example)
142
- logger.debug(f"Using dataset class methods to extract question and answer")
153
+ logger.debug(
154
+ "Using dataset class methods to extract question and answer"
155
+ )
143
156
  else:
144
157
  # Fallback to the legacy approach
145
158
  if dataset_type.lower() == "simpleqa":
@@ -163,7 +176,9 @@ def run_benchmark(
163
176
  "current": i + 1,
164
177
  "total": total_examples,
165
178
  "question": (
166
- question[:50] + "..." if len(question) > 50 else question
179
+ question[:50] + "..."
180
+ if len(question) > 50
181
+ else question
167
182
  ),
168
183
  },
169
184
  )
@@ -181,7 +196,9 @@ def run_benchmark(
181
196
  search_result = quick_summary(
182
197
  query=formatted_query,
183
198
  iterations=search_config.get("iterations", 3),
184
- questions_per_iteration=search_config.get("questions_per_iteration", 3),
199
+ questions_per_iteration=search_config.get(
200
+ "questions_per_iteration", 3
201
+ ),
185
202
  search_tool=search_config.get("search_tool", "searxng"),
186
203
  )
187
204
 
@@ -278,7 +295,9 @@ def run_benchmark(
278
295
 
279
296
  logger.info("Running human evaluation...")
280
297
  evaluation_results = evaluate(
281
- results_file=results_file, output_file=evaluation_file, interactive=True
298
+ results_file=results_file,
299
+ output_file=evaluation_file,
300
+ interactive=True,
282
301
  )
283
302
  else:
284
303
  logger.info("Running automated evaluation...")
@@ -349,7 +368,9 @@ def run_benchmark(
349
368
 
350
369
  # Generate report
351
370
  if progress_callback:
352
- progress_callback("Generating report", 95, {"status": "generating_report"})
371
+ progress_callback(
372
+ "Generating report", 95, {"status": "generating_report"}
373
+ )
353
374
 
354
375
  dataset_name = dataset_type.capitalize()
355
376
  report_path = generate_report(
@@ -366,7 +387,9 @@ def run_benchmark(
366
387
  "questions_per_iteration", 3
367
388
  ),
368
389
  "Search tool": search_config.get("search_tool", "searxng"),
369
- "Evaluation method": "Human" if human_evaluation else "Automated",
390
+ "Evaluation method": "Human"
391
+ if human_evaluation
392
+ else "Automated",
370
393
  },
371
394
  )
372
395
 
@@ -375,7 +398,11 @@ def run_benchmark(
375
398
  progress_callback(
376
399
  "Benchmark complete",
377
400
  100,
378
- {"status": "complete", "metrics": metrics, "report_path": report_path},
401
+ {
402
+ "status": "complete",
403
+ "metrics": metrics,
404
+ "report_path": report_path,
405
+ },
379
406
  )
380
407
 
381
408
  return {
@@ -417,10 +444,14 @@ def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
417
444
  Returns:
418
445
  Dictionary with benchmark results
419
446
  """
420
- return run_benchmark(dataset_type="simpleqa", num_examples=num_examples, **kwargs)
447
+ return run_benchmark(
448
+ dataset_type="simpleqa", num_examples=num_examples, **kwargs
449
+ )
421
450
 
422
451
 
423
- def run_browsecomp_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
452
+ def run_browsecomp_benchmark(
453
+ num_examples: int = 100, **kwargs
454
+ ) -> Dict[str, Any]:
424
455
  """
425
456
  Run BrowseComp benchmark with default settings.
426
457
 
@@ -431,4 +462,6 @@ def run_browsecomp_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any
431
462
  Returns:
432
463
  Dictionary with benchmark results
433
464
  """
434
- return run_benchmark(dataset_type="browsecomp", num_examples=num_examples, **kwargs)
465
+ return run_benchmark(
466
+ dataset_type="browsecomp", num_examples=num_examples, **kwargs
467
+ )