local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. local_deep_research/__init__.py +1 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  4. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  5. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  6. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  7. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  8. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  9. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  10. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  11. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  12. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  13. local_deep_research/api/benchmark_functions.py +288 -0
  14. local_deep_research/api/research_functions.py +8 -4
  15. local_deep_research/benchmarks/README.md +162 -0
  16. local_deep_research/benchmarks/__init__.py +51 -0
  17. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  18. local_deep_research/benchmarks/cli/__init__.py +16 -0
  19. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  20. local_deep_research/benchmarks/cli.py +347 -0
  21. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  22. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  23. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  24. local_deep_research/benchmarks/datasets/base.py +295 -0
  25. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  26. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  27. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  28. local_deep_research/benchmarks/datasets/utils.py +116 -0
  29. local_deep_research/benchmarks/datasets.py +31 -0
  30. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  31. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  32. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  33. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  34. local_deep_research/benchmarks/evaluators/base.py +74 -0
  35. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  36. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  37. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  38. local_deep_research/benchmarks/graders.py +410 -0
  39. local_deep_research/benchmarks/metrics/README.md +80 -0
  40. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  41. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  42. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  43. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  44. local_deep_research/benchmarks/metrics.py +11 -0
  45. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  46. local_deep_research/benchmarks/optimization/api.py +274 -0
  47. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  48. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  49. local_deep_research/benchmarks/runners.py +434 -0
  50. local_deep_research/benchmarks/templates.py +65 -0
  51. local_deep_research/config/llm_config.py +26 -23
  52. local_deep_research/config/search_config.py +1 -5
  53. local_deep_research/defaults/default_settings.json +108 -7
  54. local_deep_research/search_system.py +16 -8
  55. local_deep_research/utilities/db_utils.py +3 -6
  56. local_deep_research/utilities/es_utils.py +441 -0
  57. local_deep_research/utilities/log_utils.py +36 -0
  58. local_deep_research/utilities/search_utilities.py +8 -9
  59. local_deep_research/web/app.py +15 -10
  60. local_deep_research/web/app_factory.py +9 -12
  61. local_deep_research/web/database/migrations.py +8 -5
  62. local_deep_research/web/database/models.py +20 -0
  63. local_deep_research/web/database/schema_upgrade.py +5 -8
  64. local_deep_research/web/models/database.py +15 -18
  65. local_deep_research/web/routes/benchmark_routes.py +427 -0
  66. local_deep_research/web/routes/research_routes.py +13 -17
  67. local_deep_research/web/routes/settings_routes.py +264 -67
  68. local_deep_research/web/services/research_service.py +58 -73
  69. local_deep_research/web/services/settings_manager.py +1 -4
  70. local_deep_research/web/services/settings_service.py +4 -6
  71. local_deep_research/web/static/css/styles.css +12 -0
  72. local_deep_research/web/static/js/components/logpanel.js +164 -155
  73. local_deep_research/web/static/js/components/research.js +44 -3
  74. local_deep_research/web/static/js/components/settings.js +27 -0
  75. local_deep_research/web/static/js/services/socket.js +47 -0
  76. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  77. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  78. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  79. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  80. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  81. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  82. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  83. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  84. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  85. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  86. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  87. local_deep_research/web_search_engines/search_engine_factory.py +30 -11
  88. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
  91. local_deep_research/app.py +0 -8
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
  93. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
  94. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/runners.py
@@ -0,0 +1,434 @@
+ """
+ Benchmark runners for Local Deep Research.
+
+ This module provides the main functions for running benchmarks using LDR.
+ """
+
+ import json
+ import logging
+ import os
+ import time
+ from typing import Any, Callable, Dict, Optional
+
+ from ..api import quick_summary
+ from .datasets import DEFAULT_DATASET_URLS, load_dataset
+ from .datasets.base import DatasetRegistry
+ from .graders import extract_answer_from_response, grade_results
+ from .metrics import calculate_metrics, generate_report
+ from .templates import BROWSECOMP_QUERY_TEMPLATE
+
+ logger = logging.getLogger(__name__)
+
+
+ def format_query(question: str, dataset_type: str = "simpleqa") -> str:
+ """
+ Format query based on dataset type.
+
+ Args:
+ question: Original question
+ dataset_type: Type of dataset
+
+ Returns:
+ Formatted query for LDR
+ """
+ if dataset_type.lower() == "browsecomp":
+ # BrowseComp requires specific formatting
+ return BROWSECOMP_QUERY_TEMPLATE.format(question=question)
+
+ # Simple format for SimpleQA
+ return question
+
+
+ def run_benchmark(
+ dataset_type: str,
+ dataset_path: Optional[str] = None,
+ num_examples: Optional[int] = None,
+ output_dir: str = "benchmark_results",
+ run_evaluation: bool = True,
+ evaluation_config: Optional[Dict[str, Any]] = None,
+ search_config: Optional[Dict[str, Any]] = None,
+ human_evaluation: bool = False,
+ progress_callback: Optional[Callable[[str, int, Dict], None]] = None,
+ seed: int = 42,
+ ) -> Dict[str, Any]:
+ """
+ Run a benchmark on the specified dataset.
+
+ Args:
+ dataset_type: Type of dataset ("simpleqa" or "browsecomp")
+ dataset_path: Optional custom dataset path
+ num_examples: Number of examples to use
+ output_dir: Directory to save results
+ run_evaluation: Whether to evaluate results
+ evaluation_config: Custom LLM config for evaluation
+ search_config: Custom search parameters
+ human_evaluation: Whether to use human evaluation
+ progress_callback: Optional callback for progress updates
+ seed: Random seed for reproducibility
+
+ Returns:
+ Dictionary with benchmark results and metrics
+ """
+ # Ensure output directory exists
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Default search configuration
+ if not search_config:
+ search_config = {
+ "iterations": 3,
+ "questions_per_iteration": 3,
+ "search_tool": "searxng",
+ }
+
+ # Load dataset using the class-based approach
+ try:
+ # Create the dataset instance from registry
+ dataset_instance = DatasetRegistry.create_dataset(
+ dataset_id=dataset_type.lower(),
+ dataset_path=dataset_path,
+ num_examples=num_examples,
+ seed=seed,
+ )
+ # Load the examples
+ dataset = dataset_instance.load()
+
+ logger.info(f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}")
+ except Exception as e:
+ # Fallback to legacy function if there's any issue
+ logger.warning(f"Error using dataset class: {e}. Falling back to legacy function.")
+ dataset = load_dataset(
+ dataset_type=dataset_type,
+ dataset_path=dataset_path,
+ num_examples=num_examples,
+ seed=seed,
+ )
+
+ # Set up output files
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
+ results_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_results.jsonl")
+ evaluation_file = os.path.join(
+ output_dir, f"{dataset_type}_{timestamp}_evaluation.jsonl"
+ )
+ report_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_report.md")
+
+ # Make sure output files don't exist
+ for file in [results_file, evaluation_file, report_file]:
+ if os.path.exists(file):
+ os.remove(file)
+
+ # Progress tracking
+ total_examples = len(dataset)
+
+ if progress_callback:
+ progress_callback(
+ "Starting benchmark",
+ 0,
+ {
+ "status": "started",
+ "dataset_type": dataset_type,
+ "total_examples": total_examples,
+ },
+ )
+
+ # Process each example
+ results = []
+
+ for i, example in enumerate(dataset):
+ # Extract question and answer in a way that uses the dataset class when available
+ if 'dataset_instance' in locals() and isinstance(dataset_instance, DatasetRegistry.get_dataset_class(dataset_type.lower())):
+ # Use the dataset class methods to extract question and answer
+ question = dataset_instance.get_question(example)
+ correct_answer = dataset_instance.get_answer(example)
+ logger.debug(f"Using dataset class methods to extract question and answer")
+ else:
+ # Fallback to the legacy approach
+ if dataset_type.lower() == "simpleqa":
+ question = example.get("problem", "")
+ correct_answer = example.get("answer", "")
+ else: # browsecomp
+ question = example.get("problem", "")
+ # For BrowseComp, the answer should be in "correct_answer" after decryption
+ correct_answer = example.get("correct_answer", "")
+ if not correct_answer and "answer" in example:
+ # Fallback to "answer" field if "correct_answer" is not available
+ correct_answer = example.get("answer", "")
+
+ # Update progress
+ if progress_callback:
+ progress_callback(
+ f"Processing example {i + 1}/{total_examples}",
+ int(i / total_examples * 50),
+ {
+ "status": "processing",
+ "current": i + 1,
+ "total": total_examples,
+ "question": (
+ question[:50] + "..." if len(question) > 50 else question
+ ),
+ },
+ )
+
+ logger.info(f"Processing {i + 1}/{total_examples}: {question[:50]}...")
+
+ try:
+ # Format query based on dataset type
+ formatted_query = format_query(question, dataset_type)
+
+ # Time the search
+ start_time = time.time()
+
+ # Get response from LDR
+ search_result = quick_summary(
+ query=formatted_query,
+ iterations=search_config.get("iterations", 3),
+ questions_per_iteration=search_config.get("questions_per_iteration", 3),
+ search_tool=search_config.get("search_tool", "searxng"),
+ )
+
+ end_time = time.time()
+ processing_time = end_time - start_time
+
+ # Extract response and search info
+ response = search_result.get("summary", "")
+
+ # Extract structured information
+ extracted = extract_answer_from_response(response, dataset_type)
+
+ # Format result
+ result = {
+ "id": example.get("id", f"example_{i}"),
+ "problem": question,
+ "correct_answer": correct_answer,
+ "response": response,
+ "extracted_answer": extracted["extracted_answer"],
+ "confidence": extracted["confidence"],
+ "processing_time": processing_time,
+ "sources": search_result.get("sources", []),
+ "search_config": search_config,
+ }
+
+ # Add to results list
+ results.append(result)
+
+ # Write result to file
+ with open(results_file, "a") as f:
+ f.write(json.dumps(result) + "\n")
+
+ # Update progress
+ if progress_callback:
+ progress_callback(
+ f"Completed example {i + 1}/{total_examples}",
+ int((i + 0.5) / total_examples * 50),
+ {
+ "status": "completed_example",
+ "current": i + 1,
+ "total": total_examples,
+ "result": result,
+ },
+ )
+
+ except Exception as e:
+ logger.error(f"Error processing example {i + 1}: {str(e)}")
+
+ # Create error result
+ error_result = {
+ "id": example.get("id", f"example_{i}"),
+ "problem": question,
+ "correct_answer": correct_answer,
+ "error": str(e),
+ "processing_time": (
+ time.time() - start_time if "start_time" in locals() else 0
+ ),
+ }
+
+ # Add to results list
+ results.append(error_result)
+
+ # Write error result to file
+ with open(results_file, "a") as f:
+ f.write(json.dumps(error_result) + "\n")
+
+ # Update progress
+ if progress_callback:
+ progress_callback(
+ f"Error processing example {i + 1}/{total_examples}",
+ int((i + 0.5) / total_examples * 50),
+ {
+ "status": "error",
+ "current": i + 1,
+ "total": total_examples,
+ "error": str(e),
+ "result": error_result,
+ },
+ )
+
+ logger.info(f"Completed processing {total_examples} examples")
+
+ # Run evaluation if requested
+ if run_evaluation:
+ if progress_callback:
+ progress_callback(
+ "Starting evaluation",
+ 50,
+ {"status": "evaluating", "results_file": results_file},
+ )
+
+ if human_evaluation:
+ from .graders import human_evaluation as evaluate
+
+ logger.info("Running human evaluation...")
+ evaluation_results = evaluate(
+ results_file=results_file, output_file=evaluation_file, interactive=True
+ )
+ else:
+ logger.info("Running automated evaluation...")
+ try:
+ evaluation_results = grade_results(
+ results_file=results_file,
+ output_file=evaluation_file,
+ dataset_type=dataset_type,
+ evaluation_config=evaluation_config,
+ progress_callback=lambda current, total, meta: (
+ progress_callback(
+ f"Evaluating {current + 1}/{total}",
+ 50 + int((current + 0.5) / total * 40),
+ {**meta, "status": "evaluating"},
+ )
+ if progress_callback
+ else None
+ ),
+ )
+ except Exception as e:
+ logger.error(f"Automated evaluation failed: {str(e)}")
+
+ if progress_callback:
+ progress_callback(
+ "Automated evaluation failed. Falling back to human evaluation.",
+ 60,
+ {"status": "evaluation_fallback", "error": str(e)},
+ )
+
+ # Ask if user wants to fall back to human evaluation
+ fallback_to_human = False
+ print("\nAutomated evaluation failed with error:", str(e))
+ response = input(
+ "Do you want to fall back to human evaluation? (y/n): "
+ )
+ fallback_to_human = response.strip().lower() == "y"
+
+ if fallback_to_human:
+ logger.info("Falling back to human evaluation...")
+ from .graders import human_evaluation as evaluate
+
+ evaluation_results = evaluate(
+ results_file=results_file,
+ output_file=evaluation_file,
+ interactive=True,
+ )
+ else:
+ logger.info("Skipping evaluation due to error.")
+ # Create an empty evaluation file to prevent issues
+ with open(evaluation_file, "w") as f:
+ f.write("")
+
+ return {
+ "status": "evaluation_error",
+ "dataset_type": dataset_type,
+ "results_path": results_file,
+ "evaluation_error": str(e),
+ "total_examples": total_examples,
+ }
+
+ # Calculate metrics
+ if progress_callback:
+ progress_callback(
+ "Calculating metrics", 90, {"status": "calculating_metrics"}
+ )
+
+ metrics = calculate_metrics(evaluation_file)
+
+ # Generate report
+ if progress_callback:
+ progress_callback("Generating report", 95, {"status": "generating_report"})
+
+ dataset_name = dataset_type.capitalize()
+ report_path = generate_report(
+ metrics=metrics,
+ results_file=evaluation_file,
+ output_file=report_file,
+ dataset_name=dataset_name,
+ config_info={
+ "Dataset": dataset_path
+ or DEFAULT_DATASET_URLS.get(dataset_type, "Unknown"),
+ "Examples": total_examples,
+ "Iterations": search_config.get("iterations", 3),
+ "Questions per iteration": search_config.get(
+ "questions_per_iteration", 3
+ ),
+ "Search tool": search_config.get("search_tool", "searxng"),
+ "Evaluation method": "Human" if human_evaluation else "Automated",
+ },
+ )
+
+ # Mark as complete
+ if progress_callback:
+ progress_callback(
+ "Benchmark complete",
+ 100,
+ {"status": "complete", "metrics": metrics, "report_path": report_path},
+ )
+
+ return {
+ "status": "complete",
+ "dataset_type": dataset_type,
+ "results_path": results_file,
+ "evaluation_path": evaluation_file,
+ "report_path": report_path,
+ "metrics": metrics,
+ "total_examples": total_examples,
+ "accuracy": metrics.get("accuracy", 0),
+ }
+
+ else:
+ # No evaluation, just return results
+ if progress_callback:
+ progress_callback(
+ "Benchmark complete (no evaluation)",
+ 100,
+ {"status": "complete_no_eval", "results_path": results_file},
+ )
+
+ return {
+ "status": "complete_no_eval",
+ "dataset_type": dataset_type,
+ "results_path": results_file,
+ "total_examples": total_examples,
+ }
+
+
+ def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
+ """
+ Run SimpleQA benchmark with default settings.
+
+ Args:
+ num_examples: Number of examples to process
+ **kwargs: Additional arguments to pass to run_benchmark
+
+ Returns:
+ Dictionary with benchmark results
+ """
+ return run_benchmark(dataset_type="simpleqa", num_examples=num_examples, **kwargs)
+
+
+ def run_browsecomp_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
+ """
+ Run BrowseComp benchmark with default settings.
+
+ Args:
+ num_examples: Number of examples to process
+ **kwargs: Additional arguments to pass to run_benchmark
+
+ Returns:
+ Dictionary with benchmark results
+ """
+ return run_benchmark(dataset_type="browsecomp", num_examples=num_examples, **kwargs)
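Note: the wrappers above are the public entry points of the new benchmark module; a minimal sketch of driving run_simpleqa_benchmark with a progress callback follows (not part of the diff). The import path assumes the module layout shown in the file list; whether benchmarks/__init__.py re-exports these names is not verified here.

# Sketch only: exercises run_simpleqa_benchmark() as defined above, using only
# parameters visible in the diff. Import path is an assumption based on the
# file layout in "Files changed".
from local_deep_research.benchmarks.runners import run_simpleqa_benchmark


def print_progress(message: str, percent: int, details: dict) -> None:
    # Matches the Callable[[str, int, Dict], None] signature expected above.
    print(f"[{percent:3d}%] {message} ({details.get('status', 'n/a')})")


results = run_simpleqa_benchmark(
    num_examples=5,                   # small smoke test instead of the default 100
    output_dir="benchmark_results",
    search_config={
        "iterations": 1,
        "questions_per_iteration": 2,
        "search_tool": "searxng",
    },
    run_evaluation=False,             # skip LLM grading; status becomes "complete_no_eval"
    progress_callback=print_progress,
)
print(results["status"], results["results_path"])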
local_deep_research/benchmarks/templates.py
@@ -0,0 +1,65 @@
+ """
+ Templates for benchmark evaluation.
+
+ This module contains prompt templates used for evaluating model outputs
+ against reference answers.
+ """
+
+ # Template for SimpleQA grading
+ SIMPLEQA_GRADER_TEMPLATE = """
+ You are evaluating a question-answering system. Given a question, the correct answer,
+ and the model's response, determine if the model's answer is correct.
+
+ Question: {question}
+
+ Correct Answer: {correct_answer}
+
+ Model Response: {response}
+
+ First, extract the model's final answer, ignoring explanations or reasoning.
+ Then determine if the model's answer is correct by comparing it to the correct answer.
+ Allow for minor variations in wording but be strict about factual accuracy.
+
+ Provide your evaluation in this format:
+ Extracted Answer: [The model's final answer, extracted from the full response]
+ Reasoning: [Your explanation for why the answer is correct or incorrect]
+ Correct: [yes/no]
+ """
+
+ # Template for BrowseComp grading
+ BROWSECOMP_GRADER_TEMPLATE = """
+ Judge whether the following [response] to [question] is correct or not based on
+ the precise and unambiguous [correct_answer] below.
+
+ [question]: {question}
+
+ [response]: {response}
+
+ Your judgement must be in the format and criteria specified below:
+
+ extracted_final_answer: The final exact answer extracted from the [response].
+ Put the extracted answer as 'None' if there is no exact, final answer to extract
+ from the response.
+
+ [correct_answer]: {correct_answer}
+
+ reasoning: Explain why the extracted_final_answer is correct or incorrect based
+ on [correct_answer], focusing only on if there are meaningful differences between
+ [correct_answer] and the extracted_final_answer.
+
+ correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given
+ above, or is within a small margin of error for numerical problems. Answer 'no' otherwise.
+
+ confidence: The extracted confidence score between 0% and 100% from [response].
+ Put 100 if there is no confidence score available.
+ """
+
+ # Template for formatted BrowseComp queries
+ BROWSECOMP_QUERY_TEMPLATE = """
+ {question}
+
+ Your response should be in the following format:
+ Explanation: {{your explanation for your final answer}}
+ Exact Answer: {{your succinct, final answer}}
+ Confidence: {{your confidence score between 0% and 100% for your answer}}
+ """
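Note: these are plain str.format templates. The short illustration below (not part of the diff) shows how they would be filled; the actual grading call site presumably lives in benchmarks/graders.py, which is not shown in this hunk. The doubled braces in BROWSECOMP_QUERY_TEMPLATE survive formatting as literal braces, so only {question} is substituted.

# Illustration only: filling the templates added above. The grading prompt is
# what an evaluation LLM would receive; exact usage inside graders.py is assumed.
from local_deep_research.benchmarks.templates import (
    BROWSECOMP_QUERY_TEMPLATE,
    SIMPLEQA_GRADER_TEMPLATE,
)

grading_prompt = SIMPLEQA_GRADER_TEMPLATE.format(
    question="Who wrote Pride and Prejudice?",
    correct_answer="Jane Austen",
    response="The novel was written by Jane Austen.",
)

# Only {question} is replaced; the {{...}} placeholders remain literal braces.
query = BROWSECOMP_QUERY_TEMPLATE.format(question="Who wrote Pride and Prejudice?")
print(grading_prompt)
print(query)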
local_deep_research/config/llm_config.py
@@ -1,4 +1,3 @@
- import logging
  import os

  from langchain_anthropic import ChatAnthropic
@@ -6,14 +5,12 @@ from langchain_community.llms import VLLM
  from langchain_core.language_models import FakeListChatModel
  from langchain_ollama import ChatOllama
  from langchain_openai import ChatOpenAI
+ from loguru import logger

  from ..utilities.db_utils import get_db_setting
  from ..utilities.search_utilities import remove_think_tags
  from ..utilities.url_utils import normalize_url

- # Setup logging
- logger = logging.getLogger(__name__)
-
  # Valid provider options
  VALID_PROVIDERS = [
  "ollama",
@@ -67,7 +64,7 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  raise ValueError(
  f"Invalid provider: {provider}. Must be one of: {VALID_PROVIDERS}"
  )
- print(
+ logger.info(
  f"Getting LLM with model: {model_name}, temperature: {temperature}, provider: {provider}"
  )

@@ -75,8 +72,16 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  common_params = {
  "temperature": temperature,
  }
+
+ # Get context window size from settings
+ context_window_size = get_db_setting("llm.context_window_size", 32000)
+
  if get_db_setting("llm.supports_max_tokens", True):
- common_params["max_tokens"] = get_db_setting("llm.max_tokens", 30000)
+ # Use 80% of context window to leave room for prompts
+ max_tokens = min(
+ get_db_setting("llm.max_tokens", 30000), int(context_window_size * 0.8)
+ )
+ common_params["max_tokens"] = max_tokens

  # Handle different providers
  if provider == "anthropic":
@@ -134,9 +139,8 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  temperature=temperature,
  )
  return wrap_llm_without_think_tags(llm)
- except Exception as e:
- logger.error(f"Error loading VLLM model: {e}")
- logger.warning("Falling back.")
+ except Exception:
+ logger.exception("Error loading VLLM model")
  return get_fallback_model(temperature)

  elif provider == "ollama":
@@ -184,10 +188,8 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  f"Model '{model_name}' not found in Ollama. Available models: {', '.join(model_names[:5])}"
  )
  return get_fallback_model(temperature)
- except Exception as model_check_error:
- logger.error(
- f"Error checking for model '{model_name}' in Ollama: {str(model_check_error)}"
- )
+ except Exception:
+ logger.exception(f"Error checking for model '{model_name}' in Ollama")
  # Continue anyway, let ChatOllama handle potential errors

  logger.info(
@@ -202,11 +204,11 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  f"Ollama test successful. Response type: {type(test_result)}"
  )
  return wrap_llm_without_think_tags(llm)
- except Exception as chat_error:
- logger.error(f"Error creating or testing ChatOllama: {str(chat_error)}")
+ except Exception:
+ logger.exception("Error creating or testing ChatOllama")
  return get_fallback_model(temperature)
- except Exception as e:
- logger.error(f"Error in Ollama provider section: {str(e)}")
+ except Exception:
+ logger.exception("Error in Ollama provider section")
  return get_fallback_model(temperature)

  elif provider == "lmstudio":
@@ -218,7 +220,7 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  api_key="lm-studio", # LM Studio doesn't require a real API key
  base_url=f"{lmstudio_url}/v1", # Use the configured URL with /v1 endpoint
  temperature=temperature,
- max_tokens=get_db_setting("llm.max_tokens", 30000),
+ max_tokens=max_tokens, # Use calculated max_tokens based on context size
  )
  return wrap_llm_without_think_tags(llm)

@@ -260,10 +262,11 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
  llm = LlamaCpp(
  model_path=model_path,
  temperature=temperature,
- max_tokens=get_db_setting("llm.max_tokens", 30000),
+ max_tokens=max_tokens, # Use calculated max_tokens
  n_gpu_layers=n_gpu_layers,
  n_batch=n_batch,
  f16_kv=f16_kv,
+ n_ctx=context_window_size, # Set context window size directly
  verbose=True,
  )

@@ -398,11 +401,11 @@ def is_ollama_available():
  except requests.exceptions.RequestException as req_error:
  logger.error(f"Request error when checking Ollama: {str(req_error)}")
  return False
- except Exception as e:
- logger.error(f"Unexpected error when checking Ollama: {str(e)}")
+ except Exception:
+ logger.exception("Unexpected error when checking Ollama")
  return False
- except Exception as outer_e:
- logger.error(f"Error in is_ollama_available: {str(outer_e)}")
+ except Exception:
+ logger.exception("Error in is_ollama_available")
  return False

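Note: the recurring change in the llm_config.py hunks above is the new context-window handling: max_tokens is clamped to 80% of llm.context_window_size, and the same value is reused for the LM Studio and LlamaCpp providers (LlamaCpp additionally receives n_ctx). A standalone sketch of that arithmetic, using the defaults visible in the diff (not the library's own code):

# Standalone illustration of the clamping rule added above.
context_window_size = 32000    # default of get_db_setting("llm.context_window_size", 32000)
configured_max_tokens = 30000  # default of get_db_setting("llm.max_tokens", 30000)

# Use 80% of the context window to leave room for prompts.
max_tokens = min(configured_max_tokens, int(context_window_size * 0.8))
print(max_tokens)  # 25600, the value placed in common_params["max_tokens"]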
local_deep_research/config/search_config.py
@@ -1,14 +1,10 @@
  # local_deep_research/config.py
- import logging
+ from loguru import logger

  from ..utilities.db_utils import get_db_setting
  from ..web_search_engines.search_engine_factory import get_search as factory_get_search
  from .llm_config import get_llm

- # Setup logging
- logger = logging.getLogger(__name__)
-
-
  # Whether to check the quality search results using the LLM.
  QUALITY_CHECK_DDG_URLS = True
  # Whether to only retrieve snippets instead of full search results.
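Note: both config hunks above replace the stdlib logging.getLogger(__name__) logger with loguru, and the error handlers switch from logger.error(f"... {e}") to logger.exception(...), which records the active traceback automatically. A minimal sketch of that pattern (not LDR code):

# Sketch of the logging pattern adopted above: loguru's module-level logger plus
# logger.exception(), which attaches the current exception and traceback to the
# record without formatting the error into the message by hand.
from loguru import logger


def check_backend() -> bool:
    try:
        raise ConnectionError("backend unreachable")  # stand-in for a real probe
    except Exception:
        logger.exception("Unexpected error when checking backend")
        return False


check_backend()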