local-deep-research 0.5.7__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +33 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +448 -3
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/report_generator.py +3 -2
  23. local_deep_research/search_system.py +12 -9
  24. local_deep_research/utilities/log_utils.py +23 -10
  25. local_deep_research/utilities/thread_context.py +99 -0
  26. local_deep_research/web/app_factory.py +32 -8
  27. local_deep_research/web/database/benchmark_schema.py +230 -0
  28. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  29. local_deep_research/web/database/models.py +55 -1
  30. local_deep_research/web/database/schema_upgrade.py +397 -2
  31. local_deep_research/web/database/uuid_migration.py +265 -0
  32. local_deep_research/web/routes/api_routes.py +62 -31
  33. local_deep_research/web/routes/history_routes.py +13 -6
  34. local_deep_research/web/routes/metrics_routes.py +264 -4
  35. local_deep_research/web/routes/research_routes.py +45 -18
  36. local_deep_research/web/routes/route_registry.py +352 -0
  37. local_deep_research/web/routes/settings_routes.py +382 -22
  38. local_deep_research/web/services/research_service.py +22 -29
  39. local_deep_research/web/services/settings_manager.py +53 -0
  40. local_deep_research/web/services/settings_service.py +2 -0
  41. local_deep_research/web/static/css/styles.css +8 -0
  42. local_deep_research/web/static/js/components/detail.js +7 -14
  43. local_deep_research/web/static/js/components/details.js +8 -10
  44. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  45. local_deep_research/web/static/js/components/history.js +6 -6
  46. local_deep_research/web/static/js/components/logpanel.js +14 -11
  47. local_deep_research/web/static/js/components/progress.js +51 -46
  48. local_deep_research/web/static/js/components/research.js +250 -89
  49. local_deep_research/web/static/js/components/results.js +5 -7
  50. local_deep_research/web/static/js/components/settings.js +32 -26
  51. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  52. local_deep_research/web/static/js/config/urls.js +285 -0
  53. local_deep_research/web/static/js/main.js +8 -8
  54. local_deep_research/web/static/js/research_form.js +267 -12
  55. local_deep_research/web/static/js/services/api.js +18 -18
  56. local_deep_research/web/static/js/services/keyboard.js +8 -8
  57. local_deep_research/web/static/js/services/socket.js +53 -35
  58. local_deep_research/web/static/js/services/ui.js +1 -1
  59. local_deep_research/web/templates/base.html +4 -1
  60. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  61. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  62. local_deep_research/web/templates/components/sidebar.html +9 -3
  63. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  64. local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
  65. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  66. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  67. local_deep_research/web/templates/pages/metrics.html +212 -39
  68. local_deep_research/web/templates/pages/research.html +8 -6
  69. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  70. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  71. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  72. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  73. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  74. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  75. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  76. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  77. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  78. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  79. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  80. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  81. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  82. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  83. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  84. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  85. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  86. local_deep_research-0.6.0.dist-info/METADATA +374 -0
  87. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +90 -65
  88. local_deep_research-0.5.7.dist-info/METADATA +0 -420
  89. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
  90. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
  91. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -65,15 +65,16 @@ def get_evaluation_llm(custom_config: Optional[Dict[str, Any]] = None):
65
65
 
66
66
  # Check if we're using openai_endpoint but don't have an API key configured
67
67
  if filtered_config.get("provider") == "openai_endpoint":
68
- # Try to get API key from environment or config
69
- import os
68
+ # Try to get API key from database settings first, then environment
69
+ from ..utilities.db_utils import get_db_setting
70
+
71
+ api_key = get_db_setting("llm.openai_endpoint.api_key")
70
72
 
71
- api_key = os.getenv("OPENAI_ENDPOINT_API_KEY")
72
73
  if not api_key:
73
74
  logger.warning(
74
75
  "Using openai_endpoint provider but no API key found. "
75
- "Set the OPENAI_ENDPOINT_API_KEY environment variable or "
76
- "specify api_key in the evaluation_config."
76
+ "Set the llm.openai_endpoint.api_key setting in the database or "
77
+ "LDR_LLM_OPENAI_ENDPOINT_API_KEY environment variable."
77
78
  )
78
79
  # Try to fall back to LDR's config if API key not explicitly provided
79
80
  # The get_llm function will handle this case
@@ -117,6 +118,150 @@ def extract_answer_from_response(
117
118
  }
118
119
 
119
120
 
121
def _extract_grading_fields(
    grading_response: str, dataset_type: str
) -> Dict[str, Any]:
    """Parse the grader LLM's free-form text response into structured fields.

    The BrowseComp and SimpleQA grader templates use different field labels
    (lowercase ``extracted_final_answer:``/``correct:`` vs. capitalized
    ``Extracted Answer:``/``Correct:``), but the extraction logic is
    otherwise identical, so the label-specific patterns are selected up
    front and the matching is shared.

    Args:
        grading_response: Raw text returned by the evaluation LLM.
        dataset_type: "browsecomp" selects the BrowseComp labels; anything
            else falls back to the SimpleQA labels.

    Returns:
        Dict with keys: extracted_by_grader, reasoning, is_correct,
        graded_confidence.
    """
    is_browsecomp = dataset_type.lower() == "browsecomp"
    if is_browsecomp:
        answer_pattern = r"extracted_final_answer:\s*(.*?)(?:\n|$)"
        reasoning_pattern = r"reasoning:\s*(.*?)(?:\n\n|\ncorrect:|\Z)"
        correct_pattern = r"correct:\s*(yes|no)"
    else:
        answer_pattern = r"Extracted Answer:\s*(.*?)(?:\n|$)"
        reasoning_pattern = r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)"
        correct_pattern = r"Correct:\s*(yes|no)"

    answer_match = re.search(answer_pattern, grading_response)
    extracted_answer = (
        answer_match.group(1).strip() if answer_match else "None"
    )

    reasoning_match = re.search(
        reasoning_pattern, grading_response, re.DOTALL
    )
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

    correct_match = re.search(
        correct_pattern, grading_response, re.IGNORECASE
    )
    is_correct = (
        correct_match.group(1).lower() == "yes" if correct_match else False
    )

    if is_browsecomp:
        confidence_match = re.search(r"confidence:\s*(\d+)", grading_response)
        confidence = confidence_match.group(1) if confidence_match else "100"
    else:
        confidence = "100"  # SimpleQA graders do not report confidence

    return {
        "extracted_by_grader": extracted_answer,
        "reasoning": reasoning,
        "is_correct": is_correct,
        "graded_confidence": confidence,
    }


def grade_single_result(
    result_data: Dict[str, Any],
    dataset_type: str = "simpleqa",
    evaluation_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Grade a single benchmark result using LLM.

    Args:
        result_data: Dictionary containing result data with keys: id, problem, correct_answer, response, extracted_answer
        dataset_type: Type of dataset
        evaluation_config: Optional custom config for evaluation LLM

    Returns:
        Dictionary with grading results: extracted_by_grader, reasoning,
        is_correct, graded_confidence, grader_response. On failure, a
        dictionary with a "grading_error" key and is_correct=False.
    """
    # Get evaluation LLM
    evaluation_llm = get_evaluation_llm(evaluation_config)

    # Select appropriate template
    template = (
        BROWSECOMP_GRADER_TEMPLATE
        if dataset_type.lower() == "browsecomp"
        else SIMPLEQA_GRADER_TEMPLATE
    )

    question = result_data.get("problem", "")
    correct_answer = result_data.get("correct_answer", "")
    response = result_data.get("response", "")

    logger.info(f"Grading single result: {question[:50]}...")

    # Format grading prompt
    grading_prompt = template.format(
        question=question, correct_answer=correct_answer, response=response
    )

    try:
        # Grade using LLM — try the LangChain-style .invoke() interface
        # first, then fall back to a plain callable.
        if hasattr(evaluation_llm, "invoke") and callable(
            evaluation_llm.invoke
        ):
            if hasattr(evaluation_llm, "chat_messages"):
                # Handle ChatOpenAI and similar models that use messages
                grading_response = evaluation_llm.invoke(
                    [HumanMessage(content=grading_prompt)]
                ).content
            else:
                # Handle other LLM types
                grading_response = evaluation_llm.invoke(grading_prompt)
                if hasattr(grading_response, "content"):
                    grading_response = grading_response.content
        else:
            # Fallback for other LLM interfaces
            grading_response = str(evaluation_llm(grading_prompt))

        # Extract grading information (shared regex logic, per-dataset labels)
        graded_result = _extract_grading_fields(grading_response, dataset_type)
        graded_result["grader_response"] = grading_response
        return graded_result

    except Exception as e:
        # logger.exception captures the traceback, unlike logger.error(str(e))
        logger.exception("Error grading single result")
        return {
            "grading_error": str(e),
            "is_correct": False,
            "graded_confidence": "0",
            "grader_response": f"Grading failed: {str(e)}",
        }
+
264
+
120
265
  def grade_results(
121
266
  results_file: str,
122
267
  output_file: str,
@@ -0,0 +1,19 @@
1
+ """Benchmark database models for ORM."""
2
+
3
+ from .benchmark_models import (
4
+ BenchmarkConfig,
5
+ BenchmarkProgress,
6
+ BenchmarkResult,
7
+ BenchmarkRun,
8
+ BenchmarkStatus,
9
+ DatasetType,
10
+ )
11
+
12
+ __all__ = [
13
+ "BenchmarkRun",
14
+ "BenchmarkResult",
15
+ "BenchmarkConfig",
16
+ "BenchmarkProgress",
17
+ "BenchmarkStatus",
18
+ "DatasetType",
19
+ ]
@@ -0,0 +1,283 @@
1
+ """Database models for benchmark system."""
2
+
3
+ import enum
4
+
5
+ from sqlalchemy import (
6
+ JSON,
7
+ Boolean,
8
+ Column,
9
+ DateTime,
10
+ Enum,
11
+ Float,
12
+ ForeignKey,
13
+ Integer,
14
+ String,
15
+ Text,
16
+ UniqueConstraint,
17
+ Index,
18
+ )
19
+ from sqlalchemy.ext.declarative import declarative_base
20
+ from sqlalchemy.orm import relationship
21
+ from sqlalchemy.sql import func
22
+
23
# Use the same base as the main app so these tables register in the web
# app's shared declarative metadata.
try:
    from ...web.database.models import Base
except ImportError:
    # Fallback for different import contexts (e.g. the benchmarks package
    # imported without the web app on the path).
    # NOTE(review): a standalone Base has its own MetaData, so tables
    # defined here would not be created alongside the app's tables in that
    # case — confirm this fallback is only hit in tests/tools.
    from sqlalchemy.ext.declarative import declarative_base

    Base = declarative_base()
31
+
32
+
33
class BenchmarkStatus(enum.Enum):
    """Status of a benchmark run.

    Persisted via SQLAlchemy's ``Enum`` type in ``benchmark_runs.status``
    and ``BenchmarkProgress``/queries that filter on run state.
    """

    PENDING = "pending"  # created, not yet started
    IN_PROGRESS = "in_progress"  # currently processing examples
    COMPLETED = "completed"  # run finished
    FAILED = "failed"  # run aborted with an error
    CANCELLED = "cancelled"  # run was cancelled
    PAUSED = "paused"  # run suspended; presumably resumable — confirm
+
43
+
44
class DatasetType(enum.Enum):
    """Supported dataset types.

    Matches the ``dataset_type`` strings used by the graders
    ("simpleqa", "browsecomp") plus a custom/user-supplied option.
    """

    SIMPLEQA = "simpleqa"  # SimpleQA-style Q/A benchmark
    BROWSECOMP = "browsecomp"  # BrowseComp-style benchmark
    CUSTOM = "custom"  # user-provided dataset
+
51
+
52
class BenchmarkRun(Base):
    """Main benchmark run metadata.

    One row per benchmark execution. Per-question outcomes hang off it via
    ``BenchmarkResult`` and live progress snapshots via
    ``BenchmarkProgress`` (both cascade-deleted with the run).
    """

    __tablename__ = "benchmark_runs"

    id = Column(Integer, primary_key=True, index=True)

    # Run identification
    run_name = Column(String(255), nullable=True)  # User-friendly name
    config_hash = Column(
        String(16), nullable=False, index=True
    )  # For compatibility matching
    query_hash_list = Column(
        JSON, nullable=False
    )  # List of query hashes to avoid duplication

    # Configuration (stored as JSON blobs, not normalized)
    search_config = Column(
        JSON, nullable=False
    )  # Complete search configuration
    evaluation_config = Column(JSON, nullable=False)  # Evaluation settings
    datasets_config = Column(
        JSON, nullable=False
    )  # Dataset selection and quantities

    # Status and timing (timestamps are set server-side via func.now())
    status = Column(
        Enum(BenchmarkStatus), default=BenchmarkStatus.PENDING, nullable=False
    )
    created_at = Column(DateTime, server_default=func.now(), nullable=False)
    updated_at = Column(
        DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )
    start_time = Column(DateTime, nullable=True)
    end_time = Column(DateTime, nullable=True)

    # Progress tracking
    total_examples = Column(Integer, default=0, nullable=False)
    completed_examples = Column(Integer, default=0, nullable=False)
    failed_examples = Column(Integer, default=0, nullable=False)

    # Results summary
    overall_accuracy = Column(Float, nullable=True)
    processing_rate = Column(Float, nullable=True)  # Examples per minute

    # Error handling
    error_message = Column(Text, nullable=True)

    # Relationships. lazy="dynamic" yields a query object instead of a
    # loaded list, so large result sets can be filtered/paginated.
    results = relationship(
        "BenchmarkResult",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )
    progress_updates = relationship(
        "BenchmarkProgress",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )

    # Indexes for performance; extend_existing tolerates the table being
    # registered more than once (e.g. repeated module import).
    __table_args__ = (
        Index("idx_benchmark_runs_config_hash", "config_hash"),
        Index("idx_benchmark_runs_status_created", "status", "created_at"),
        {"extend_existing": True},
    )
120
+
121
+
122
class BenchmarkResult(Base):
    """Individual benchmark result for a single question.

    Deduplicated per run via the (benchmark_run_id, query_hash) unique
    constraint; deleted with its parent run (ON DELETE CASCADE).
    """

    __tablename__ = "benchmark_results"

    id = Column(Integer, primary_key=True, index=True)

    # Foreign key to the owning run
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Question identification
    example_id = Column(String(255), nullable=False)  # Original dataset ID
    query_hash = Column(
        String(32), nullable=False, index=True
    )  # For deduplication
    dataset_type = Column(Enum(DatasetType), nullable=False)
    research_id = Column(
        String(36), nullable=True, index=True
    )  # UUID string or converted integer

    # Question and answer (ground truth)
    question = Column(Text, nullable=False)
    correct_answer = Column(Text, nullable=False)

    # Research results (nullable: filled in as the run progresses)
    response = Column(Text, nullable=True)
    extracted_answer = Column(Text, nullable=True)
    confidence = Column(String(10), nullable=True)
    processing_time = Column(Float, nullable=True)
    sources = Column(JSON, nullable=True)

    # Evaluation results (filled in by the grader)
    is_correct = Column(Boolean, nullable=True)
    graded_confidence = Column(String(10), nullable=True)
    grader_response = Column(Text, nullable=True)

    # Timestamps
    created_at = Column(DateTime, server_default=func.now(), nullable=False)
    completed_at = Column(DateTime, nullable=True)

    # Error handling (research vs. grading failures tracked separately)
    research_error = Column(Text, nullable=True)
    evaluation_error = Column(Text, nullable=True)

    # Additional metadata
    task_index = Column(Integer, nullable=True)  # Order in processing
    result_metadata = Column(JSON, nullable=True)  # Additional data

    # Relationships
    benchmark_run = relationship("BenchmarkRun", back_populates="results")

    # Indexes for performance plus the per-run dedup constraint
    __table_args__ = (
        Index(
            "idx_benchmark_results_run_dataset",
            "benchmark_run_id",
            "dataset_type",
        ),
        Index("idx_benchmark_results_query_hash", "query_hash"),
        Index("idx_benchmark_results_completed", "completed_at"),
        UniqueConstraint(
            "benchmark_run_id", "query_hash", name="uix_run_query"
        ),
        {"extend_existing": True},
    )
192
+
193
+
194
class BenchmarkConfig(Base):
    """Saved benchmark configurations for reuse.

    Standalone presets (no FK to runs); linked to runs only loosely via
    the shared ``config_hash``.
    """

    __tablename__ = "benchmark_configs"

    id = Column(Integer, primary_key=True, index=True)

    # Configuration details
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    config_hash = Column(String(16), nullable=False, index=True)

    # Configuration data (same JSON shapes as BenchmarkRun)
    search_config = Column(JSON, nullable=False)
    evaluation_config = Column(JSON, nullable=False)
    datasets_config = Column(JSON, nullable=False)

    # Metadata
    created_at = Column(DateTime, server_default=func.now(), nullable=False)
    updated_at = Column(
        DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )
    is_default = Column(Boolean, default=False, nullable=False)
    is_public = Column(Boolean, default=True, nullable=False)

    # Usage tracking
    usage_count = Column(Integer, default=0, nullable=False)
    last_used = Column(DateTime, nullable=True)

    # Performance data (if available)
    best_accuracy = Column(Float, nullable=True)
    avg_processing_rate = Column(Float, nullable=True)

    # Indexes
    __table_args__ = (
        Index("idx_benchmark_configs_name", "name"),
        Index("idx_benchmark_configs_hash", "config_hash"),
        Index("idx_benchmark_configs_default", "is_default"),
        {"extend_existing": True},
    )
234
+
235
+
236
class BenchmarkProgress(Base):
    """Real-time progress tracking for benchmark runs.

    Append-only snapshots; each row is one progress update for a run,
    deleted with its parent run (ON DELETE CASCADE).
    """

    __tablename__ = "benchmark_progress"

    id = Column(Integer, primary_key=True, index=True)

    # Foreign key to the owning run
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Progress data (timestamp set server-side)
    timestamp = Column(DateTime, server_default=func.now(), nullable=False)
    completed_examples = Column(Integer, nullable=False)
    total_examples = Column(Integer, nullable=False)

    # Accuracy tracking
    overall_accuracy = Column(Float, nullable=True)
    dataset_accuracies = Column(JSON, nullable=True)  # Per-dataset accuracy

    # Performance metrics
    processing_rate = Column(Float, nullable=True)  # Examples per minute
    estimated_completion = Column(DateTime, nullable=True)

    # Current status
    current_dataset = Column(Enum(DatasetType), nullable=True)
    current_example_id = Column(String(255), nullable=True)

    # Additional metrics
    memory_usage = Column(Float, nullable=True)  # MB
    cpu_usage = Column(Float, nullable=True)  # Percentage

    # Relationships
    benchmark_run = relationship(
        "BenchmarkRun", back_populates="progress_updates"
    )

    # Indexes for real-time queries (latest snapshot per run)
    __table_args__ = (
        Index(
            "idx_benchmark_progress_run_time", "benchmark_run_id", "timestamp"
        ),
        {"extend_existing": True},
    )
@@ -0,0 +1 @@
1
+ """Benchmark UI components package."""
@@ -0,0 +1,6 @@
1
+ """Benchmark web API package."""
2
+
3
+ from .benchmark_service import BenchmarkService
4
+ from .benchmark_routes import benchmark_bp
5
+
6
+ __all__ = ["BenchmarkService", "benchmark_bp"]