PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/benchmarks/efficiency/speed_profiler.py CHANGED Viewed

@@ -8,7 +8,7 @@ of different components and processes in the research system.
 import logging
 import time
 from contextlib import contextmanager
-from typing import Dict, List, Optional, Any, Callable
+from typing import Any, Callable, Dict
 logger = logging.getLogger(__name__)
@@ -16,57 +16,57 @@ logger = logging.getLogger(__name__)
 class SpeedProfiler:
     """
     Profiler for tracking execution speed of components.
     This class provides methods for timing operations and
     collecting performance statistics for later analysis.
     """
     def __init__(self):
         """Initialize the profiler with empty timing data."""
         self.timings = {}
         self.current_timers = {}
         self.total_start_time = None
         self.total_end_time = None
     def start(self):
         """Start the global profiling session."""
         self.timings = {}
         self.current_timers = {}
         self.total_start_time = time.time()
     def stop(self):
         """Stop the global profiling session."""
         self.total_end_time = time.time()
         # Stop any timers that are still running
         for name in list(self.current_timers.keys()):
             self.stop_timer(name)
     def start_timer(self, name: str):
         """
         Start a named timer.
         Args:
             name: Name of the timer to start
         """
         if name in self.current_timers:
             logger.warning(f"Timer '{name}' is already running. Restarting.")
         self.current_timers[name] = time.time()
     def stop_timer(self, name: str):
         """
         Stop a named timer and record the elapsed time.
         Args:
             name: Name of the timer to stop
         """
         if name not in self.current_timers:
             logger.warning(f"Timer '{name}' was not started.")
             return
         elapsed = time.time() - self.current_timers[name]
         if name not in self.timings:
             self.timings[name] = {
                 "total": elapsed,
@@ -74,7 +74,7 @@ class SpeedProfiler:
                 "min": elapsed,
                 "max": elapsed,
                 "starts": [self.current_timers[name]],
-                "durations": [elapsed]
+                "durations": [elapsed],
             }
         else:
             self.timings[name]["total"] += elapsed
@@ -83,17 +83,17 @@ class SpeedProfiler:
             self.timings[name]["max"] = max(self.timings[name]["max"], elapsed)
             self.timings[name]["starts"].append(self.current_timers[name])
             self.timings[name]["durations"].append(elapsed)
         del self.current_timers[name]
     @contextmanager
     def timer(self, name: str):
         """
         Context manager for timing a block of code.
         Args:
             name: Name of the timer
         Example:
             with profiler.timer("my_operation"):
                 # Code to time
@@ -104,23 +104,26 @@ class SpeedProfiler:
             yield
         finally:
             self.stop_timer(name)
     def get_timings(self) -> Dict[str, Any]:
         """
         Get all recorded timings.
         Returns:
             Dictionary of timing data for all measured operations
         """
         result = self.timings.copy()
         # Add averages
         for name, data in result.items():
             if data["count"] > 0:
                 data["avg"] = data["total"] / data["count"]
         # Add total duration
-        if self.total_start_time is not None and self.total_end_time is not None:
+        if (
+            self.total_start_time is not None
+            and self.total_end_time is not None
+        ):
             result["total"] = {
                 "total": self.total_end_time - self.total_start_time,
                 "count": 1,
@@ -128,87 +131,98 @@ class SpeedProfiler:
                 "max": self.total_end_time - self.total_start_time,
                 "avg": self.total_end_time - self.total_start_time,
                 "starts": [self.total_start_time],
-                "durations": [self.total_end_time - self.total_start_time]
+                "durations": [self.total_end_time - self.total_start_time],
             }
         return result
     def get_summary(self) -> Dict[str, float]:
         """
         Get a summary of timing information.
         Returns:
             Dictionary with summary statistics
         """
         timings = self.get_timings()
         summary = {}
         # Total duration
         if "total" in timings:
             summary["total_duration"] = timings["total"]["total"]
-        elif self.total_start_time is not None and self.total_end_time is not None:
-            summary["total_duration"] = self.total_end_time - self.total_start_time
+        elif (
+            self.total_start_time is not None
+            and self.total_end_time is not None
+        ):
+            summary["total_duration"] = (
+                self.total_end_time - self.total_start_time
+            )
         else:
-            summary["total_duration"] = sum(t["total"] for t in timings.values())
+            summary["total_duration"] = sum(
+                t["total"] for t in timings.values()
+            )
         # Component durations
         for name, data in timings.items():
             if name != "total":
                 summary[f"{name}_duration"] = data["total"]
                 summary[f"{name}_percent"] = (
-                    data["total"] / summary["total_duration"] * 100
-                    if summary["total_duration"] > 0 else 0
+                    data["total"] / summary["total_duration"] * 100
+                    if summary["total_duration"] > 0
+                    else 0
                 )
         # Per-operation breakdowns
         for name, data in timings.items():
             if data["count"] > 0:
                 summary[f"{name}_per_operation"] = data["total"] / data["count"]
         return summary
     def print_summary(self):
         """Print a formatted summary of timing information."""
         summary = self.get_summary()
         total = summary.get("total_duration", 0)
         print("\n===== SPEED PROFILE SUMMARY =====")
         print(f"Total execution time: {total:.2f} seconds")
         print("\n--- Component Breakdown ---")
         # Print each component's timing
         for name, data in self.timings.items():
             if name != "total":
                 percent = data["total"] / total * 100 if total > 0 else 0
-                print(f"{name}: {data['total']:.2f}s ({percent:.1f}%) - "
-                      f"{data['count']} calls, avg {data['total'] / data['count']:.3f}s per call")
+                print(
+                    f"{name}: {data['total']:.2f}s ({percent:.1f}%) - "
+                    f"{data['count']} calls, avg {data['total'] / data['count']:.3f}s per call"
+                )
         print("\n==============================")
 def time_function(func: Callable) -> Callable:
     """
     Decorator to time a function's execution.
     Args:
         func: Function to time
     Returns:
         Wrapped function that logs its execution time
     Example:
         @time_function
         def my_slow_function():
             # Some slow code
             pass
     """
     def wrapper(*args, **kwargs):
         start_time = time.time()
         result = func(*args, **kwargs)
         elapsed = time.time() - start_time
         logger.info(f"{func.__name__} took {elapsed:.3f} seconds")
         return result
     return wrapper

local_deep_research/benchmarks/evaluators/browsecomp.py CHANGED Viewed

@@ -47,7 +47,9 @@ class BrowseCompEvaluator(BaseBenchmarkEvaluator):
         benchmark_dir = self._create_subdirectory(output_dir)
         # Log benchmark execution
-        logger.info(f"Running BrowseComp benchmark with {num_examples} examples")
+        logger.info(
+            f"Running BrowseComp benchmark with {num_examples} examples"
+        )
         try:
             # Run BrowseComp benchmark

local_deep_research/benchmarks/evaluators/composite.py CHANGED Viewed

@@ -54,7 +54,9 @@ class CompositeBenchmarkEvaluator:
             }
         # Log the weights being used
-        logger.info(f"Using normalized benchmark weights: {self.normalized_weights}")
+        logger.info(
+            f"Using normalized benchmark weights: {self.normalized_weights}"
+        )
     def evaluate(
         self,
@@ -105,7 +107,9 @@ class CompositeBenchmarkEvaluator:
                     combined_score += weighted_contribution
                 except Exception as e:
-                    logger.error(f"Error running {benchmark_name} benchmark: {str(e)}")
+                    logger.error(
+                        f"Error running {benchmark_name} benchmark: {str(e)}"
+                    )
                     all_results[benchmark_name] = {
                         "benchmark_type": benchmark_name,
                         "error": str(e),

local_deep_research/benchmarks/evaluators/simpleqa.py CHANGED Viewed

@@ -9,9 +9,9 @@ import json
 import logging
 import os
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict
-from local_deep_research.api import quick_summary
 from ..datasets.base import DatasetRegistry
 from ..metrics import calculate_metrics, generate_report
 from ..runners import run_simpleqa_benchmark  # Keep for backward compatibility
@@ -134,9 +134,15 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
             # Set up output files
             timestamp = time.strftime("%Y%m%d_%H%M%S")
-            results_file = os.path.join(output_dir, f"simpleqa_{timestamp}_results.jsonl")
-            evaluation_file = os.path.join(output_dir, f"simpleqa_{timestamp}_evaluation.jsonl")
-            report_file = os.path.join(output_dir, f"simpleqa_{timestamp}_report.md")
+            results_file = os.path.join(
+                output_dir, f"simpleqa_{timestamp}_results.jsonl"
+            )
+            evaluation_file = os.path.join(
+                output_dir, f"simpleqa_{timestamp}_evaluation.jsonl"
+            )
+            report_file = os.path.join(
+                output_dir, f"simpleqa_{timestamp}_report.md"
+            )
             # Process each example
             results = []
@@ -146,7 +152,9 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
                 question = dataset_instance.get_question(example)
                 correct_answer = dataset_instance.get_answer(example)
-                logger.info(f"Processing {i + 1}/{len(examples)}: {question[:50]}...")
+                logger.info(
+                    f"Processing {i + 1}/{len(examples)}: {question[:50]}..."
+                )
                 try:
                     # Format query based on dataset type
@@ -158,18 +166,25 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
                     # Create search config from system_config
                     search_params = {
                         "iterations": system_config.get("iterations", 3),
-                        "questions_per_iteration": system_config.get("questions_per_iteration", 3),
-                        "search_tool": system_config.get("search_tool", "searxng"),
+                        "questions_per_iteration": system_config.get(
+                            "questions_per_iteration", 3
+                        ),
+                        "search_tool": system_config.get(
+                            "search_tool", "searxng"
+                        ),
                         # Note: search_strategy is stored in the config but not passed to quick_summary
                         # as it's not supported by the underlying API
                     }
                     # Get response from LDR
                     from local_deep_research.api import quick_summary
                     search_result = quick_summary(
                         query=formatted_query,
                         iterations=search_params.get("iterations"),
-                        questions_per_iteration=search_params.get("questions_per_iteration"),
+                        questions_per_iteration=search_params.get(
+                            "questions_per_iteration"
+                        ),
                         search_tool=search_params.get("search_tool"),
                     )
@@ -181,7 +196,10 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
                     # Extract structured answer
                     from ..graders import extract_answer_from_response
-                    extracted = extract_answer_from_response(response, "simpleqa")
+                    extracted = extract_answer_from_response(
+                        response, "simpleqa"
+                    )
                     # Format result
                     result = {
@@ -224,7 +242,8 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
             # Grade results
             from ..graders import grade_results
-            evaluation_results = grade_results(
+            grade_results(
                 results_file=results_file,
                 output_file=evaluation_file,
                 dataset_type="simpleqa",
@@ -244,9 +263,13 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
                     "Dataset": "SimpleQA",
                     "Examples": len(examples),
                     "Iterations": search_params.get("iterations", 3),
-                    "Questions per iteration": search_params.get("questions_per_iteration", 3),
+                    "Questions per iteration": search_params.get(
+                        "questions_per_iteration", 3
+                    ),
                     "Search tool": search_params.get("search_tool", "searxng"),
-                    "Search strategy": search_params.get("search_strategy", "source_based"),
+                    "Search strategy": search_params.get(
+                        "search_strategy", "source_based"
+                    ),
                 },
             )

local_deep_research/benchmarks/graders.py CHANGED Viewed

@@ -59,7 +59,9 @@ def get_evaluation_llm(custom_config: Optional[Dict[str, Any]] = None):
         "api_key",
     }
-    filtered_config = {k: v for k, v in config.items() if k in ldr_supported_params}
+    filtered_config = {
+        k: v for k, v in config.items() if k in ldr_supported_params
+    }
     # Check if we're using openai_endpoint but don't have an API key configured
     if filtered_config.get("provider") == "openai_endpoint":
@@ -182,7 +184,9 @@ def grade_results(
         try:
             # Grade using LLM
-            if hasattr(evaluation_llm, "invoke") and callable(evaluation_llm.invoke):
+            if hasattr(evaluation_llm, "invoke") and callable(
+                evaluation_llm.invoke
+            ):
                 if hasattr(evaluation_llm, "chat_messages"):
                     # Handle ChatOpenAI and similar models that use messages
                     grading_response = evaluation_llm.invoke(
@@ -214,7 +218,9 @@ def grade_results(
                     grading_response,
                     re.DOTALL,
                 )
-                reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
+                reasoning = (
+                    reasoning_match.group(1).strip() if reasoning_match else ""
+                )
                 correct_match = re.search(
                     r"correct:\s*(yes|no)", grading_response, re.IGNORECASE
@@ -225,8 +231,12 @@ def grade_results(
                     else False
                 )
-                confidence_match = re.search(r"confidence:\s*(\d+)", grading_response)
-                confidence = confidence_match.group(1) if confidence_match else "100"
+                confidence_match = re.search(
+                    r"confidence:\s*(\d+)", grading_response
+                )
+                confidence = (
+                    confidence_match.group(1) if confidence_match else "100"
+                )
             else:
                 # SimpleQA extraction
                 extracted_answer_match = re.search(
@@ -239,9 +249,13 @@ def grade_results(
                 )
                 reasoning_match = re.search(
-                    r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)", grading_response, re.DOTALL
+                    r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)",
+                    grading_response,
+                    re.DOTALL,
+                )
+                reasoning = (
+                    reasoning_match.group(1).strip() if reasoning_match else ""
                 )
-                reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
                 correct_match = re.search(
                     r"Correct:\s*(yes|no)", grading_response, re.IGNORECASE
@@ -304,7 +318,11 @@ def grade_results(
                 progress_callback(
                     idx,
                     len(results),
-                    {"status": "error", "error": str(e), "result": error_result},
+                    {
+                        "status": "error",
+                        "error": str(e),
+                        "result": error_result,
+                    },
                 )
     accuracy = correct_count / len(results) if results else 0
@@ -366,7 +384,9 @@ def human_evaluation(
             # Get human judgment
             while True:
                 judgment = (
-                    input("\nIs the model's answer correct? (y/n): ").strip().lower()
+                    input("\nIs the model's answer correct? (y/n): ")
+                    .strip()
+                    .lower()
                 )
                 if judgment in ["y", "n"]:
                     break
@@ -375,7 +395,9 @@ def human_evaluation(
             is_correct = judgment == "y"
             # Get reasoning
-            reasoning = input("Please provide reasoning for your judgment: ").strip()
+            reasoning = input(
+                "Please provide reasoning for your judgment: "
+            ).strip()
         else:
             # Non-interactive mode - placeholder for API/UI implementation
             # In a real implementation, this would be filled by UI actions

local_deep_research/benchmarks/metrics/README.md CHANGED Viewed

@@ -77,4 +77,4 @@ fig = plot_optimization_history(
     best_values=[0.5, 0.6, 0.7, 0.7, 0.8],
     output_file="optimization_history.png"
 )
-```
+```

local_deep_research/benchmarks/metrics/calculation.py CHANGED Viewed

@@ -11,7 +11,7 @@ import os
 import tempfile
 import time
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Optional
 logger = logging.getLogger(__name__)
@@ -50,7 +50,9 @@ def calculate_metrics(results_file: str) -> Dict[str, Any]:
     processing_times = [
         r.get("processing_time", 0) for r in results if "processing_time" in r
     ]
-    avg_time = sum(processing_times) / len(processing_times) if processing_times else 0
+    avg_time = (
+        sum(processing_times) / len(processing_times) if processing_times else 0
+    )
     # Average confidence if available
     confidence_values = []
@@ -62,7 +64,9 @@ def calculate_metrics(results_file: str) -> Dict[str, Any]:
                 pass
     avg_confidence = (
-        sum(confidence_values) / len(confidence_values) if confidence_values else 0
+        sum(confidence_values) / len(confidence_values)
+        if confidence_values
+        else 0
     )
     # Calculate error rate
@@ -100,7 +104,9 @@ def calculate_metrics(results_file: str) -> Dict[str, Any]:
                 "total": counts["total"],
                 "correct": counts["correct"],
                 "accuracy": (
-                    counts["correct"] / counts["total"] if counts["total"] else 0
+                    counts["correct"] / counts["total"]
+                    if counts["total"]
+                    else 0
                 ),
             }
         metrics["categories"] = category_metrics
@@ -136,7 +142,9 @@ def evaluate_benchmark_quality(
         # Create search configuration from system config
         search_config = {
             "iterations": system_config.get("iterations", 2),
-            "questions_per_iteration": system_config.get("questions_per_iteration", 2),
+            "questions_per_iteration": system_config.get(
+                "questions_per_iteration", 2
+            ),
             "search_strategy": system_config.get("search_strategy", "iterdrag"),
             "search_tool": system_config.get("search_tool", "searxng"),
             "model_name": system_config.get("model_name"),
@@ -174,7 +182,9 @@ def evaluate_benchmark_quality(
             try:
                 shutil.rmtree(temp_dir)
             except Exception as e:
-                logger.warning(f"Failed to clean up temporary directory: {str(e)}")
+                logger.warning(
+                    f"Failed to clean up temporary directory: {str(e)}"
+                )
 def measure_execution_time(
@@ -216,7 +226,7 @@ def measure_execution_time(
     try:
         for i in range(num_runs):
-            logger.info(f"Executing speed test run {i+1}/{num_runs}")
+            logger.info(f"Executing speed test run {i + 1}/{num_runs}")
             start_time = time.time()
             system.search(query, full_response=False)
             end_time = time.time()
@@ -264,7 +274,9 @@ def calculate_quality_metrics(
     """
     # Run quality evaluation
     quality_results = evaluate_benchmark_quality(
-        system_config=system_config, num_examples=num_examples, output_dir=output_dir
+        system_config=system_config,
+        num_examples=num_examples,
+        output_dir=output_dir,
     )
     # Return normalized quality score
@@ -337,7 +349,10 @@ def calculate_resource_metrics(
     # Normalize to 0-1 scale (lower is better)
     resource_score = 1.0 / (1.0 + (complexity / 4.0))
-    return {"resource_score": resource_score, "estimated_complexity": complexity}
+    return {
+        "resource_score": resource_score,
+        "estimated_complexity": complexity,
+    }
 def calculate_combined_score(
@@ -382,4 +397,4 @@ def calculate_combined_score(
         resource_score = metrics["resource"].get("resource_score", 0.0)
         score += resource_score * norm_weights["resource"]
-    return score
+    return score

local_deep_research/benchmarks/metrics/reporting.py CHANGED Viewed

@@ -46,7 +46,9 @@ def generate_report(
     # Sample up to 5 correct and 5 incorrect examples
     correct_examples = [r for r in results if r.get("is_correct", False)][:5]
     incorrect_examples = [
-        r for r in results if "is_correct" in r and not r.get("is_correct", False)
+        r
+        for r in results
+        if "is_correct" in r and not r.get("is_correct", False)
     ][:5]
     # Create report
@@ -67,7 +69,9 @@ def generate_report(
         )
     if "average_confidence" in metrics:
-        report.append(f"- **Average Confidence**: {metrics['average_confidence']:.2f}%")
+        report.append(
+            f"- **Average Confidence**: {metrics['average_confidence']:.2f}%"
+        )
     if "error_count" in metrics and metrics["error_count"] > 0:
         report.append(f"- **Error Count**: {metrics['error_count']}")
@@ -152,4 +156,4 @@ def generate_report(
         f.write("\n".join(report))
     logger.info(f"Report saved to {output_file}")
-    return output_file
+    return output_file

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl