local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (92)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  3. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  4. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  5. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  6. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  7. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  8. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  9. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  10. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  11. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  12. local_deep_research/api/benchmark_functions.py +288 -0
  13. local_deep_research/api/research_functions.py +8 -4
  14. local_deep_research/benchmarks/README.md +162 -0
  15. local_deep_research/benchmarks/__init__.py +51 -0
  16. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  17. local_deep_research/benchmarks/cli/__init__.py +16 -0
  18. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  19. local_deep_research/benchmarks/cli.py +347 -0
  20. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  21. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  22. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  23. local_deep_research/benchmarks/datasets/base.py +295 -0
  24. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  25. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  26. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  27. local_deep_research/benchmarks/datasets/utils.py +116 -0
  28. local_deep_research/benchmarks/datasets.py +31 -0
  29. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  30. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  31. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  32. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  33. local_deep_research/benchmarks/evaluators/base.py +74 -0
  34. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  35. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  36. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  37. local_deep_research/benchmarks/graders.py +410 -0
  38. local_deep_research/benchmarks/metrics/README.md +80 -0
  39. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  40. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  41. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  42. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  43. local_deep_research/benchmarks/metrics.py +11 -0
  44. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  45. local_deep_research/benchmarks/optimization/api.py +274 -0
  46. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  47. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  48. local_deep_research/benchmarks/runners.py +434 -0
  49. local_deep_research/benchmarks/templates.py +65 -0
  50. local_deep_research/config/llm_config.py +26 -23
  51. local_deep_research/config/search_config.py +1 -5
  52. local_deep_research/defaults/default_settings.json +108 -7
  53. local_deep_research/search_system.py +16 -8
  54. local_deep_research/utilities/db_utils.py +3 -6
  55. local_deep_research/utilities/es_utils.py +441 -0
  56. local_deep_research/utilities/log_utils.py +36 -0
  57. local_deep_research/utilities/search_utilities.py +8 -9
  58. local_deep_research/web/app.py +7 -9
  59. local_deep_research/web/app_factory.py +9 -12
  60. local_deep_research/web/database/migrations.py +8 -5
  61. local_deep_research/web/database/models.py +20 -0
  62. local_deep_research/web/database/schema_upgrade.py +5 -8
  63. local_deep_research/web/models/database.py +15 -18
  64. local_deep_research/web/routes/benchmark_routes.py +427 -0
  65. local_deep_research/web/routes/research_routes.py +13 -17
  66. local_deep_research/web/routes/settings_routes.py +264 -67
  67. local_deep_research/web/services/research_service.py +47 -57
  68. local_deep_research/web/services/settings_manager.py +1 -4
  69. local_deep_research/web/services/settings_service.py +4 -6
  70. local_deep_research/web/static/css/styles.css +12 -0
  71. local_deep_research/web/static/js/components/logpanel.js +164 -155
  72. local_deep_research/web/static/js/components/research.js +44 -3
  73. local_deep_research/web/static/js/components/settings.js +27 -0
  74. local_deep_research/web/static/js/services/socket.js +47 -0
  75. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  76. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  77. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  78. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  79. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  80. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  81. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  82. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  83. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  84. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  85. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  86. local_deep_research/web_search_engines/search_engine_factory.py +32 -11
  87. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  88. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
  91. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/metrics/README.md
@@ -0,0 +1,80 @@
+ # Unified Metrics Module
+
+ This module provides a unified approach to metrics calculation, reporting, and visualization for both standard benchmarks and parameter optimization.
+
+ ## Overview
+
+ The metrics module consists of three primary components:
+
+ 1. **Calculation**: Core functions for calculating metrics from benchmark results and system configurations
+ 2. **Reporting**: Functions for generating detailed reports from benchmark results
+ 3. **Visualization**: Utilities for creating visualizations of optimization results
+
+ ## Usage
+
+ ### Basic Metrics Calculation
+
+ ```python
+ from local_deep_research.benchmarks.metrics import calculate_metrics
+
+ # Calculate metrics from a results file
+ metrics = calculate_metrics("path/to/results.jsonl")
+ ```
+
+ ### Generating Reports
+
+ ```python
+ from local_deep_research.benchmarks.metrics import generate_report
+
+ # Generate a detailed report
+ report_path = generate_report(
+     metrics=metrics,
+     results_file="path/to/results.jsonl",
+     output_file="report.md",
+     dataset_name="SimpleQA",
+     config_info={"Dataset": "SimpleQA", "Examples": 100}
+ )
+ ```
+
+ ### Optimization Metrics
+
+ ```python
+ from local_deep_research.benchmarks.metrics import (
+     calculate_quality_metrics,
+     calculate_speed_metrics,
+     calculate_resource_metrics,
+     calculate_combined_score
+ )
+
+ # Calculate quality metrics for a configuration
+ quality_metrics = calculate_quality_metrics(
+     system_config={"iterations": 3, "questions_per_iteration": 3}
+ )
+
+ # Calculate a combined score using multiple metrics
+ combined_score = calculate_combined_score(
+     metrics={
+         "quality": quality_metrics,
+         "speed": speed_metrics,
+         "resource": resource_metrics
+     },
+     weights={"quality": 0.6, "speed": 0.3, "resource": 0.1}
+ )
+ ```
+
+ ### Visualization
+
+ ```python
+ from local_deep_research.benchmarks.metrics.visualization import (
+     plot_optimization_history,
+     plot_parameter_importance,
+     plot_quality_vs_speed
+ )
+
+ # Plot optimization history
+ fig = plot_optimization_history(
+     trial_values=[0.5, 0.6, 0.7, 0.65, 0.8],
+     best_values=[0.5, 0.6, 0.7, 0.7, 0.8],
+     output_file="optimization_history.png"
+ )
+ ```
local_deep_research/benchmarks/metrics/__init__.py
@@ -0,0 +1,24 @@
+ """
+ Unified metrics module for benchmark evaluation and optimization.
+
+ This package provides metrics calculation, reporting, and visualization
+ functionality for both regular benchmarks and parameter optimization.
+ """
+
+ from .calculation import (
+     calculate_combined_score,
+     calculate_metrics,
+     calculate_quality_metrics,
+     calculate_resource_metrics,
+     calculate_speed_metrics,
+ )
+ from .reporting import generate_report
+
+ __all__ = [
+     "calculate_metrics",
+     "calculate_quality_metrics",
+     "calculate_speed_metrics",
+     "calculate_resource_metrics",
+     "calculate_combined_score",
+     "generate_report",
+ ]
local_deep_research/benchmarks/metrics/calculation.py
@@ -0,0 +1,385 @@
+ """
+ Unified metrics calculation module.
+
+ This module provides functions for calculating metrics for both
+ standard benchmarks and optimization tasks.
+ """
+
+ import json
+ import logging
+ import os
+ import tempfile
+ import time
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Union
+
+ logger = logging.getLogger(__name__)
+
+
+ def calculate_metrics(results_file: str) -> Dict[str, Any]:
+     """
+     Calculate evaluation metrics from results.
+
+     Args:
+         results_file: Path to results file
+
+     Returns:
+         Dictionary of metrics
+     """
+     # Load results
+     results = []
+     try:
+         with open(results_file, "r") as f:
+             for line in f:
+                 if line.strip():
+                     results.append(json.loads(line))
+     except Exception as e:
+         logger.error(f"Error loading results file: {e}")
+         return {"error": str(e)}
+
+     if not results:
+         return {"error": "No results found"}
+
+     # Calculate accuracy
+     graded_results = [r for r in results if "is_correct" in r]
+     correct_count = sum(1 for r in graded_results if r.get("is_correct", False))
+     total_graded = len(graded_results)
+     accuracy = correct_count / total_graded if total_graded else 0
+
+     # Calculate average processing time if available
+     processing_times = [
+         r.get("processing_time", 0) for r in results if "processing_time" in r
+     ]
+     avg_time = sum(processing_times) / len(processing_times) if processing_times else 0
+
+     # Average confidence if available
+     confidence_values = []
+     for r in results:
+         if "confidence" in r and r["confidence"]:
+             try:
+                 confidence_values.append(int(r["confidence"]))
+             except (ValueError, TypeError):
+                 pass
+
+     avg_confidence = (
+         sum(confidence_values) / len(confidence_values) if confidence_values else 0
+     )
+
+     # Calculate error rate
+     error_count = sum(1 for r in results if "error" in r)
+     error_rate = error_count / len(results) if results else 0
+
+     # Basic metrics
+     metrics = {
+         "total_examples": len(results),
+         "graded_examples": total_graded,
+         "correct": correct_count,
+         "accuracy": accuracy,
+         "average_processing_time": avg_time,
+         "average_confidence": avg_confidence,
+         "error_count": error_count,
+         "error_rate": error_rate,
+         "timestamp": datetime.now().isoformat(),
+     }
+
+     # If we have category information, calculate per-category metrics
+     categories = {}
+     for r in graded_results:
+         if "category" in r:
+             category = r["category"]
+             if category not in categories:
+                 categories[category] = {"total": 0, "correct": 0}
+             categories[category]["total"] += 1
+             if r.get("is_correct", False):
+                 categories[category]["correct"] += 1
+
+     if categories:
+         category_metrics = {}
+         for category, counts in categories.items():
+             category_metrics[category] = {
+                 "total": counts["total"],
+                 "correct": counts["correct"],
+                 "accuracy": (
+                     counts["correct"] / counts["total"] if counts["total"] else 0
+                 ),
+             }
+         metrics["categories"] = category_metrics
+
+     return metrics
+
+
+ def evaluate_benchmark_quality(
+     system_config: Dict[str, Any],
+     num_examples: int = 10,
+     output_dir: Optional[str] = None,
+ ) -> Dict[str, float]:
+     """
+     Evaluate quality using SimpleQA benchmark.
+
+     Args:
+         system_config: Configuration parameters to evaluate
+         num_examples: Number of benchmark examples to use
+         output_dir: Directory to save results (temporary if None)
+
+     Returns:
+         Dictionary with benchmark metrics
+     """
+     from ..runners import run_simpleqa_benchmark
+
+     # Create temporary directory if not provided
+     temp_dir = None
+     if output_dir is None:
+         temp_dir = tempfile.mkdtemp(prefix="ldr_benchmark_")
+         output_dir = temp_dir
+
+     try:
+         # Create search configuration from system config
+         search_config = {
+             "iterations": system_config.get("iterations", 2),
+             "questions_per_iteration": system_config.get("questions_per_iteration", 2),
+             "search_strategy": system_config.get("search_strategy", "iterdrag"),
+             "search_tool": system_config.get("search_tool", "searxng"),
+             "model_name": system_config.get("model_name"),
+             "provider": system_config.get("provider"),
+         }
+
+         # Run benchmark
+         logger.info(f"Running SimpleQA benchmark with {num_examples} examples")
+         benchmark_results = run_simpleqa_benchmark(
+             num_examples=num_examples,
+             output_dir=output_dir,
+             search_config=search_config,
+             run_evaluation=True,
+         )
+
+         # Extract key metrics
+         metrics = benchmark_results.get("metrics", {})
+         accuracy = metrics.get("accuracy", 0.0)
+
+         # Return only the most relevant metrics
+         return {
+             "accuracy": accuracy,
+             "quality_score": accuracy,  # Map accuracy directly to quality score
+         }
+
+     except Exception as e:
+         logger.error(f"Error in benchmark evaluation: {str(e)}")
+         return {"accuracy": 0.0, "quality_score": 0.0, "error": str(e)}
+
+     finally:
+         # Clean up temporary directory if we created it
+         if temp_dir and os.path.exists(temp_dir):
+             import shutil
+
+             try:
+                 shutil.rmtree(temp_dir)
+             except Exception as e:
+                 logger.warning(f"Failed to clean up temporary directory: {str(e)}")
+
+
+ def measure_execution_time(
+     system_config: Dict[str, Any],
+     query: str = "test query",
+     search_tool: Optional[str] = None,
+     num_runs: int = 1,
+ ) -> Dict[str, float]:
+     """
+     Measure execution time for a given configuration.
+
+     Args:
+         system_config: Configuration parameters to evaluate
+         query: Query to use for timing tests
+         search_tool: Override search tool
+         num_runs: Number of runs to average time over
+
+     Returns:
+         Dictionary with speed metrics
+     """
+     from local_deep_research.search_system import SearchSystem
+
+     if search_tool:
+         system_config["search_tool"] = search_tool
+
+     # Configure system
+     system = SearchSystem(
+         iterations=system_config.get("iterations", 2),
+         questions_per_iteration=system_config.get("questions_per_iteration", 2),
+         search_strategy=system_config.get("search_strategy", "iterdrag"),
+         search_tool=system_config.get("search_tool", "searxng"),
+         model_name=system_config.get("model_name"),
+         provider=system_config.get("provider"),
+     )
+
+     # Run multiple times and calculate average
+     total_time = 0
+     times = []
+
+     try:
+         for i in range(num_runs):
+             logger.info(f"Executing speed test run {i+1}/{num_runs}")
+             start_time = time.time()
+             system.search(query, full_response=False)
+             end_time = time.time()
+             run_time = end_time - start_time
+             times.append(run_time)
+             total_time += run_time
+
+         # Calculate metrics
+         average_time = total_time / num_runs
+
+         # Calculate speed score (0-1 scale, lower times are better)
+         # Using sigmoid-like normalization where:
+         # - Times around 30s get ~0.5 score
+         # - Times under 10s get >0.8 score
+         # - Times over 2min get <0.2 score
+         speed_score = 1.0 / (1.0 + (average_time / 30.0))
+
+         return {
+             "average_time": average_time,
+             "min_time": min(times),
+             "max_time": max(times),
+             "speed_score": speed_score,
+         }
+
+     except Exception as e:
+         logger.error(f"Error in speed measurement: {str(e)}")
+         return {"average_time": 0.0, "speed_score": 0.0, "error": str(e)}
+
+
+ def calculate_quality_metrics(
+     system_config: Dict[str, Any],
+     num_examples: int = 2,  # Reduced for quicker demo
+     output_dir: Optional[str] = None,
+ ) -> Dict[str, float]:
+     """
+     Calculate quality-related metrics for a configuration.
+
+     Args:
+         system_config: Configuration parameters to evaluate
+         num_examples: Number of benchmark examples to use
+         output_dir: Directory to save results (temporary if None)
+
+     Returns:
+         Dictionary with quality metrics
+     """
+     # Run quality evaluation
+     quality_results = evaluate_benchmark_quality(
+         system_config=system_config, num_examples=num_examples, output_dir=output_dir
+     )
+
+     # Return normalized quality score
+     return {
+         "quality_score": quality_results.get("quality_score", 0.0),
+         "accuracy": quality_results.get("accuracy", 0.0),
+     }
+
+
+ def calculate_speed_metrics(
+     system_config: Dict[str, Any],
+     query: str = "test query",
+     search_tool: Optional[str] = None,
+     num_runs: int = 1,
+ ) -> Dict[str, float]:
+     """
+     Calculate speed-related metrics for a configuration.
+
+     Args:
+         system_config: Configuration parameters to evaluate
+         query: Query to use for timing tests
+         search_tool: Override search tool
+         num_runs: Number of runs to average time over
+
+     Returns:
+         Dictionary with speed metrics
+     """
+     # Run speed measurement
+     speed_results = measure_execution_time(
+         system_config=system_config,
+         query=query,
+         search_tool=search_tool,
+         num_runs=num_runs,
+     )
+
+     # Return normalized speed score
+     return {
+         "speed_score": speed_results.get("speed_score", 0.0),
+         "average_time": speed_results.get("average_time", 0.0),
+     }
+
+
+ def calculate_resource_metrics(
+     system_config: Dict[str, Any],
+     query: str = "test query",
+     search_tool: Optional[str] = None,
+ ) -> Dict[str, float]:
+     """
+     Calculate resource usage metrics for a configuration.
+
+     Args:
+         system_config: Configuration parameters to evaluate
+         query: Query to use for resource tests
+         search_tool: Override search tool
+
+     Returns:
+         Dictionary with resource metrics
+     """
+     # This is a simplified version - in a real implementation,
+     # you would measure memory usage, API call counts, etc.
+
+     # For now, we'll use a heuristic based on configuration values
+     iterations = system_config.get("iterations", 2)
+     questions = system_config.get("questions_per_iteration", 2)
+     max_results = system_config.get("max_results", 50)
+
+     # Simple heuristic: more iterations, questions, and results = more resources
+     complexity = iterations * questions * (max_results / 50)
+
+     # Normalize to 0-1 scale (lower is better)
+     resource_score = 1.0 / (1.0 + (complexity / 4.0))
+
+     return {"resource_score": resource_score, "estimated_complexity": complexity}
+
+
+ def calculate_combined_score(
+     metrics: Dict[str, Dict[str, float]], weights: Dict[str, float] = None
+ ) -> float:
+     """
+     Calculate a combined optimization score from multiple metrics.
+
+     Args:
+         metrics: Dictionary of metric categories and their values
+         weights: Dictionary of weights for each metric category
+
+     Returns:
+         Combined score between 0 and 1
+     """
+     # Default weights if not provided
+     if weights is None:
+         weights = {"quality": 0.6, "speed": 0.3, "resource": 0.1}
+
+     # Normalize weights to sum to 1
+     total_weight = sum(weights.values())
+     if total_weight == 0:
+         return 0.0
+
+     norm_weights = {k: v / total_weight for k, v in weights.items()}
+
+     # Calculate weighted score
+     score = 0.0
+
+     # Quality component
+     if "quality" in metrics and "quality" in norm_weights:
+         quality_score = metrics["quality"].get("quality_score", 0.0)
+         score += quality_score * norm_weights["quality"]
+
+     # Speed component
+     if "speed" in metrics and "speed" in norm_weights:
+         speed_score = metrics["speed"].get("speed_score", 0.0)
+         score += speed_score * norm_weights["speed"]
+
+     # Resource component
+     if "resource" in metrics and "resource" in norm_weights:
+         resource_score = metrics["resource"].get("resource_score", 0.0)
+         score += resource_score * norm_weights["resource"]
+
+     return score
local_deep_research/benchmarks/metrics/reporting.py
@@ -0,0 +1,155 @@
+ """
+ Report generation for benchmark results.
+
+ This module provides functions for generating detailed reports from benchmark results.
+ """
+
+ import json
+ import logging
+ from datetime import datetime
+ from typing import Any, Dict, Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ def generate_report(
+     metrics: Dict[str, Any],
+     results_file: str,
+     output_file: str = "evaluation_report.md",
+     dataset_name: str = "Unknown",
+     config_info: Optional[Dict[str, Any]] = None,
+ ) -> str:
+     """
+     Generate a detailed report from evaluation results.
+
+     Args:
+         metrics: Dictionary of evaluation metrics
+         results_file: Path to results file
+         output_file: Path to save report
+         dataset_name: Name of dataset
+         config_info: Optional configuration information
+
+     Returns:
+         Path to the generated report file
+     """
+     # Load a sample of results for examples
+     results = []
+     try:
+         with open(results_file, "r") as f:
+             for line in f:
+                 if line.strip():
+                     results.append(json.loads(line))
+     except Exception as e:
+         logger.error(f"Error loading results for report: {e}")
+         results = []
+
+     # Sample up to 5 correct and 5 incorrect examples
+     correct_examples = [r for r in results if r.get("is_correct", False)][:5]
+     incorrect_examples = [
+         r for r in results if "is_correct" in r and not r.get("is_correct", False)
+     ][:5]
+
+     # Create report
+     report = [
+         f"# Evaluation Report: {dataset_name}",
+         "",
+         "## Summary",
+         "",
+         f"- **Total Examples**: {metrics.get('total_examples', 0)}",
+         f"- **Graded Examples**: {metrics.get('graded_examples', 0)}",
+         f"- **Correct Answers**: {metrics.get('correct', 0)}",
+         f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}",
+     ]
+
+     if "average_processing_time" in metrics:
+         report.append(
+             f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds"
+         )
+
+     if "average_confidence" in metrics:
+         report.append(f"- **Average Confidence**: {metrics['average_confidence']:.2f}%")
+
+     if "error_count" in metrics and metrics["error_count"] > 0:
+         report.append(f"- **Error Count**: {metrics['error_count']}")
+         report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}")
+
+     report.append("")
+
+     # Add per-category metrics if available
+     if "categories" in metrics:
+         report.extend(["## Category Performance", ""])
+
+         for category, category_metrics in metrics["categories"].items():
+             report.append(f"### {category}")
+             report.append("")
+             report.append(f"- **Total**: {category_metrics['total']}")
+             report.append(f"- **Correct**: {category_metrics['correct']}")
+             report.append(f"- **Accuracy**: {category_metrics['accuracy']:.3f}")
+             report.append("")
+
+     # Add configuration info if provided
+     if config_info:
+         report.extend(["## Configuration", ""])
+
+         for key, value in config_info.items():
+             report.append(f"- **{key}**: {value}")
+
+         report.append("")
+
+     # Add example sections
+     if correct_examples:
+         report.extend(["## Example Correct Answers", ""])
+
+         for idx, example in enumerate(correct_examples):
+             report.extend(
+                 [
+                     f"### Example {idx + 1}",
+                     "",
+                     f"**Question**: {example.get('problem', '')}",
+                     "",
+                     f"**Correct Answer**: {example.get('correct_answer', '')}",
+                     "",
+                     f"**Model Answer**: {example.get('extracted_answer', '')}",
+                     "",
+                     f"**Reasoning**: {example.get('reasoning', '')}",
+                     "",
+                 ]
+             )
+
+     if incorrect_examples:
+         report.extend(["## Example Incorrect Answers", ""])
+
+         for idx, example in enumerate(incorrect_examples):
+             report.extend(
+                 [
+                     f"### Example {idx + 1}",
+                     "",
+                     f"**Question**: {example.get('problem', '')}",
+                     "",
+                     f"**Correct Answer**: {example.get('correct_answer', '')}",
+                     "",
+                     f"**Model Answer**: {example.get('extracted_answer', '')}",
+                     "",
+                     f"**Reasoning**: {example.get('reasoning', '')}",
+                     "",
+                 ]
+             )
+
+     # Add timestamp
+     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     report.extend(
+         [
+             "## Metadata",
+             "",
+             f"- **Generated**: {timestamp}",
+             f"- **Dataset**: {dataset_name}",
+             "",
+         ]
+     )
+
+     # Write report to file
+     with open(output_file, "w") as f:
+         f.write("\n".join(report))
+
+     logger.info(f"Report saved to {output_file}")
+     return output_file
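
Taken together, the four new files form a small, composable API: `calculate_metrics` turns a JSONL results file into accuracy, timing, and error-rate figures, `generate_report` renders those figures as Markdown, and the quality/speed/resource helpers feed `calculate_combined_score` during parameter optimization. The sketch below shows one way the pieces might compose, using only signatures that appear in this diff; the results path, configuration values, and weights are illustrative, and the quality and speed helpers actually run benchmark searches, so they need a working LLM and search backend.

```python
# Sketch of composing the metrics API added in 0.4.0.
# Paths, config values, and weights below are illustrative only.
from local_deep_research.benchmarks.metrics import (
    calculate_combined_score,
    calculate_metrics,
    calculate_quality_metrics,
    calculate_resource_metrics,
    calculate_speed_metrics,
    generate_report,
)

# Accuracy, timing, and error-rate metrics from a JSONL results file.
metrics = calculate_metrics("results/simpleqa_results.jsonl")

# Markdown report summarizing the same results.
generate_report(
    metrics=metrics,
    results_file="results/simpleqa_results.jsonl",
    output_file="results/report.md",
    dataset_name="SimpleQA",
)

# Optimization-oriented scores for one candidate configuration.
# Note: these run a small SimpleQA benchmark and timed searches,
# so they require a configured LLM and search engine.
config = {"iterations": 2, "questions_per_iteration": 2}
quality = calculate_quality_metrics(system_config=config)
speed = calculate_speed_metrics(system_config=config)
resource = calculate_resource_metrics(system_config=config)

# Single weighted score in [0, 1] for an optimizer to maximize.
combined = calculate_combined_score(
    metrics={"quality": quality, "speed": speed, "resource": resource},
    weights={"quality": 0.6, "speed": 0.3, "resource": 0.1},
)
print(f"Combined score: {combined:.3f}")
```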