local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +20 -3
  149. local_deep_research/web/database/models.py +74 -25
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +63 -83
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +192 -54
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +412 -251
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.2.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,3 @@
1
- import json
2
1
  import logging
3
2
  from typing import Any, Dict, List, Optional
4
3
 
@@ -48,40 +47,48 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
48
47
  """
49
48
  # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
50
49
  super().__init__(
51
- llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
50
+ llm=llm,
51
+ max_filtered_results=max_filtered_results,
52
+ max_results=max_results,
52
53
  )
53
-
54
+
54
55
  self.index_name = index_name
55
56
  self.highlight_fields = highlight_fields
56
57
  self.search_fields = search_fields
57
58
  self.filter_query = filter_query or {}
58
-
59
+
59
60
  # Initialize the Elasticsearch client
60
61
  es_args = {}
61
-
62
+
62
63
  # Basic authentication
63
64
  if username and password:
64
65
  es_args["basic_auth"] = (username, password)
65
-
66
+
66
67
  # API key authentication
67
68
  if api_key:
68
69
  es_args["api_key"] = api_key
69
-
70
+
70
71
  # Cloud ID for Elastic Cloud
71
72
  if cloud_id:
72
73
  es_args["cloud_id"] = cloud_id
73
-
74
+
74
75
  # Connect to Elasticsearch
75
76
  self.client = Elasticsearch(hosts, **es_args)
76
-
77
+
77
78
  # Verify connection
78
79
  try:
79
80
  info = self.client.info()
80
- logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
81
- logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
81
+ logger.info(
82
+ f"Connected to Elasticsearch cluster: {info.get('cluster_name')}"
83
+ )
84
+ logger.info(
85
+ f"Elasticsearch version: {info.get('version', {}).get('number')}"
86
+ )
82
87
  except Exception as e:
83
88
  logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
84
- raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
89
+ raise ConnectionError(
90
+ f"Could not connect to Elasticsearch: {str(e)}"
91
+ )
85
92
 
86
93
  def _get_previews(self, query: str) -> List[Dict[str, Any]]:
87
94
  """
@@ -93,7 +100,9 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
93
100
  Returns:
94
101
  List of preview dictionaries
95
102
  """
96
- logger.info(f"Getting document previews from Elasticsearch with query: {query}")
103
+ logger.info(
104
+ f"Getting document previews from Elasticsearch with query: {query}"
105
+ )
97
106
 
98
107
  try:
99
108
  # Build the search query
@@ -113,31 +122,31 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
113
122
  },
114
123
  "size": self.max_results,
115
124
  }
116
-
125
+
117
126
  # Add filter if provided
118
127
  if self.filter_query:
119
128
  search_query["query"] = {
120
129
  "bool": {
121
130
  "must": search_query["query"],
122
- "filter": self.filter_query
131
+ "filter": self.filter_query,
123
132
  }
124
133
  }
125
-
134
+
126
135
  # Execute the search
127
136
  response = self.client.search(
128
137
  index=self.index_name,
129
138
  body=search_query,
130
139
  )
131
-
140
+
132
141
  # Process the search results
133
142
  hits = response.get("hits", {}).get("hits", [])
134
-
143
+
135
144
  # Format results as previews with basic information
136
145
  previews = []
137
146
  for hit in hits:
138
147
  source = hit.get("_source", {})
139
148
  highlight = hit.get("highlight", {})
140
-
149
+
141
150
  # Extract highlighted snippets or fall back to original content
142
151
  snippet = ""
143
152
  for field in self.highlight_fields:
@@ -145,25 +154,30 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
145
154
  # Join all highlights for this field
146
155
  field_snippets = " ... ".join(highlight[field])
147
156
  snippet += field_snippets + " "
148
-
157
+
149
158
  # If no highlights, use a portion of the content
150
159
  if not snippet and "content" in source:
151
160
  content = source.get("content", "")
152
- snippet = content[:250] + "..." if len(content) > 250 else content
153
-
161
+ snippet = (
162
+ content[:250] + "..." if len(content) > 250 else content
163
+ )
164
+
154
165
  # Create preview object
155
166
  preview = {
156
167
  "id": hit.get("_id", ""),
157
168
  "title": source.get("title", "Untitled Document"),
158
- "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
169
+ "link": source.get("url", "")
170
+ or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
159
171
  "snippet": snippet.strip(),
160
172
  "score": hit.get("_score", 0),
161
173
  "_index": hit.get("_index", self.index_name),
162
174
  }
163
-
175
+
164
176
  previews.append(preview)
165
-
166
- logger.info(f"Found {len(previews)} preview results from Elasticsearch")
177
+
178
+ logger.info(
179
+ f"Found {len(previews)} preview results from Elasticsearch"
180
+ )
167
181
  return previews
168
182
 
169
183
  except Exception as e:
@@ -196,7 +210,7 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
196
210
  for item in relevant_items:
197
211
  # Start with the preview data
198
212
  result = item.copy()
199
-
213
+
200
214
  # Get the document ID
201
215
  doc_id = item.get("id")
202
216
  if not doc_id:
@@ -204,30 +218,34 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
204
218
  logger.warning(f"Skipping item without ID: {item}")
205
219
  results.append(result)
206
220
  continue
207
-
221
+
208
222
  try:
209
223
  # Fetch the full document
210
224
  doc_response = self.client.get(
211
225
  index=self.index_name,
212
226
  id=doc_id,
213
227
  )
214
-
228
+
215
229
  # Get the source document
216
230
  source = doc_response.get("_source", {})
217
-
231
+
218
232
  # Add full content to the result
219
- result["content"] = source.get("content", result.get("snippet", ""))
233
+ result["content"] = source.get(
234
+ "content", result.get("snippet", "")
235
+ )
220
236
  result["full_content"] = source.get("content", "")
221
-
237
+
222
238
  # Add metadata from source
223
239
  for key, value in source.items():
224
240
  if key not in result and key not in ["content"]:
225
241
  result[key] = value
226
-
242
+
227
243
  except Exception as e:
228
- logger.error(f"Error fetching full content for document {doc_id}: {str(e)}")
244
+ logger.error(
245
+ f"Error fetching full content for document {doc_id}: {str(e)}"
246
+ )
229
247
  # Keep the preview data if we can't get the full content
230
-
248
+
231
249
  results.append(result)
232
250
 
233
251
  return results
@@ -235,10 +253,10 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
235
253
  def search_by_query_string(self, query_string: str) -> List[Dict[str, Any]]:
236
254
  """
237
255
  Perform a search using Elasticsearch Query String syntax.
238
-
256
+
239
257
  Args:
240
258
  query_string: The query in Elasticsearch Query String syntax
241
-
259
+
242
260
  Returns:
243
261
  List of search results
244
262
  """
@@ -258,28 +276,28 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
258
276
  },
259
277
  "size": self.max_results,
260
278
  }
261
-
279
+
262
280
  # Execute the search
263
281
  response = self.client.search(
264
282
  index=self.index_name,
265
283
  body=search_query,
266
284
  )
267
-
285
+
268
286
  # Process and return the results
269
287
  previews = self._process_es_response(response)
270
288
  return self._get_full_content(previews)
271
-
289
+
272
290
  except Exception as e:
273
291
  logger.error(f"Error in query_string search: {str(e)}")
274
292
  return []
275
-
293
+
276
294
  def search_by_dsl(self, query_dsl: Dict[str, Any]) -> List[Dict[str, Any]]:
277
295
  """
278
296
  Perform a search using Elasticsearch DSL (Query Domain Specific Language).
279
-
297
+
280
298
  Args:
281
299
  query_dsl: The query in Elasticsearch DSL format
282
-
300
+
283
301
  Returns:
284
302
  List of search results
285
303
  """
@@ -289,55 +307,60 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
289
307
  index=self.index_name,
290
308
  body=query_dsl,
291
309
  )
292
-
310
+
293
311
  # Process and return the results
294
312
  previews = self._process_es_response(response)
295
313
  return self._get_full_content(previews)
296
-
314
+
297
315
  except Exception as e:
298
316
  logger.error(f"Error in DSL search: {str(e)}")
299
317
  return []
300
-
301
- def _process_es_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
318
+
319
+ def _process_es_response(
320
+ self, response: Dict[str, Any]
321
+ ) -> List[Dict[str, Any]]:
302
322
  """
303
323
  Process Elasticsearch response into preview dictionaries.
304
-
324
+
305
325
  Args:
306
326
  response: Elasticsearch response dictionary
307
-
327
+
308
328
  Returns:
309
329
  List of preview dictionaries
310
330
  """
311
331
  hits = response.get("hits", {}).get("hits", [])
312
-
332
+
313
333
  # Format results as previews
314
334
  previews = []
315
335
  for hit in hits:
316
336
  source = hit.get("_source", {})
317
337
  highlight = hit.get("highlight", {})
318
-
338
+
319
339
  # Extract highlighted snippets or fall back to original content
320
340
  snippet = ""
321
341
  for field in self.highlight_fields:
322
342
  if field in highlight and highlight[field]:
323
343
  field_snippets = " ... ".join(highlight[field])
324
344
  snippet += field_snippets + " "
325
-
345
+
326
346
  # If no highlights, use a portion of the content
327
347
  if not snippet and "content" in source:
328
348
  content = source.get("content", "")
329
- snippet = content[:250] + "..." if len(content) > 250 else content
330
-
349
+ snippet = (
350
+ content[:250] + "..." if len(content) > 250 else content
351
+ )
352
+
331
353
  # Create preview object
332
354
  preview = {
333
355
  "id": hit.get("_id", ""),
334
356
  "title": source.get("title", "Untitled Document"),
335
- "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
357
+ "link": source.get("url", "")
358
+ or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
336
359
  "snippet": snippet.strip(),
337
360
  "score": hit.get("_score", 0),
338
361
  "_index": hit.get("_index", self.index_name),
339
362
  }
340
-
363
+
341
364
  previews.append(preview)
342
-
343
- return previews
365
+
366
+ return previews
@@ -46,7 +46,9 @@ class GitHubSearchEngine(BaseSearchEngine):
46
46
  """
47
47
  # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
48
48
  super().__init__(
49
- llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
49
+ llm=llm,
50
+ max_filtered_results=max_filtered_results,
51
+ max_results=max_results,
50
52
  )
51
53
  self.api_key = api_key or os.getenv("GITHUB_API_KEY")
52
54
  self.search_type = search_type
@@ -224,7 +226,9 @@ class GitHubSearchEngine(BaseSearchEngine):
224
226
 
225
227
  # If no results, try to provide more guidance
226
228
  if not results:
227
- logger.warning("No results found. Consider these search tips:")
229
+ logger.warning(
230
+ "No results found. Consider these search tips:"
231
+ )
228
232
  logger.warning("1. Use shorter, more specific queries")
229
233
  logger.warning(
230
234
  "2. For repositories, try adding 'stars:>100' or 'language:python'"
@@ -255,7 +259,8 @@ class GitHubSearchEngine(BaseSearchEngine):
255
259
  try:
256
260
  # Get README
257
261
  response = requests.get(
258
- f"{self.api_base}/repos/{repo_full_name}/readme", headers=self.headers
262
+ f"{self.api_base}/repos/{repo_full_name}/readme",
263
+ headers=self.headers,
259
264
  )
260
265
 
261
266
  # Check for rate limiting
@@ -267,7 +272,9 @@ class GitHubSearchEngine(BaseSearchEngine):
267
272
  encoding = data.get("encoding", "")
268
273
 
269
274
  if encoding == "base64" and content:
270
- return base64.b64decode(content).decode("utf-8", errors="replace")
275
+ return base64.b64decode(content).decode(
276
+ "utf-8", errors="replace"
277
+ )
271
278
  return content
272
279
  else:
273
280
  logger.warning(
@@ -312,7 +319,9 @@ class GitHubSearchEngine(BaseSearchEngine):
312
319
 
313
320
  if response.status_code == 200:
314
321
  issues = response.json()
315
- logger.info(f"Got {len(issues)} recent issues for {repo_full_name}")
322
+ logger.info(
323
+ f"Got {len(issues)} recent issues for {repo_full_name}"
324
+ )
316
325
  else:
317
326
  logger.warning(
318
327
  f"Could not get issues for {repo_full_name}: {response.status_code}"
@@ -346,17 +355,23 @@ class GitHubSearchEngine(BaseSearchEngine):
346
355
  encoding = data.get("encoding", "")
347
356
 
348
357
  if encoding == "base64" and content:
349
- return base64.b64decode(content).decode("utf-8", errors="replace")
358
+ return base64.b64decode(content).decode(
359
+ "utf-8", errors="replace"
360
+ )
350
361
  return content
351
362
  else:
352
- logger.warning(f"Could not get file content: {response.status_code}")
363
+ logger.warning(
364
+ f"Could not get file content: {response.status_code}"
365
+ )
353
366
  return ""
354
367
 
355
368
  except Exception as e:
356
369
  logger.error(f"Error getting file content: {e}")
357
370
  return ""
358
371
 
359
- def _format_repository_preview(self, repo: Dict[str, Any]) -> Dict[str, Any]:
372
+ def _format_repository_preview(
373
+ self, repo: Dict[str, Any]
374
+ ) -> Dict[str, Any]:
360
375
  """Format repository search result as preview"""
361
376
  return {
362
377
  "id": str(repo.get("id", "")),
@@ -393,7 +408,9 @@ class GitHubSearchEngine(BaseSearchEngine):
393
408
  def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]:
394
409
  """Format issue search result as preview"""
395
410
  repo = (
396
- issue.get("repository", {}) if "repository" in issue else {"full_name": ""}
411
+ issue.get("repository", {})
412
+ if "repository" in issue
413
+ else {"full_name": ""}
397
414
  )
398
415
  return {
399
416
  "id": f"issue_{issue.get('number', '')}",
@@ -503,7 +520,9 @@ class GitHubSearchEngine(BaseSearchEngine):
503
520
  keywords[:5]
504
521
  ) # Add up to 5 keywords
505
522
 
506
- logger.info(f"Using specialized contribution query: {specialized_query}")
523
+ logger.info(
524
+ f"Using specialized contribution query: {specialized_query}"
525
+ )
507
526
 
508
527
  # Perform GitHub search with specialized query
509
528
  results = self._search_github(specialized_query)
@@ -560,7 +579,9 @@ class GitHubSearchEngine(BaseSearchEngine):
560
579
  logger.info("Snippet-only mode, skipping full content retrieval")
561
580
  return relevant_items
562
581
 
563
- logger.info(f"Getting full content for {len(relevant_items)} GitHub results")
582
+ logger.info(
583
+ f"Getting full content for {len(relevant_items)} GitHub results"
584
+ )
564
585
 
565
586
  results = []
566
587
  for item in relevant_items:
@@ -610,7 +631,10 @@ class GitHubSearchEngine(BaseSearchEngine):
610
631
  f"Public repositories: {item.get('public_repos', 0)}\n"
611
632
  )
612
633
 
613
- if item.get("snippet") and item.get("snippet") != "No bio provided":
634
+ if (
635
+ item.get("snippet")
636
+ and item.get("snippet") != "No bio provided"
637
+ ):
614
638
  profile_summary += f"\nBio: {item.get('snippet')}\n"
615
639
 
616
640
  result["full_content"] = profile_summary
@@ -620,7 +644,9 @@ class GitHubSearchEngine(BaseSearchEngine):
620
644
 
621
645
  return results
622
646
 
623
- def search_repository(self, repo_owner: str, repo_name: str) -> Dict[str, Any]:
647
+ def search_repository(
648
+ self, repo_owner: str, repo_name: str
649
+ ) -> Dict[str, Any]:
624
650
  """
625
651
  Get detailed information about a specific repository.
626
652
 
@@ -672,7 +698,10 @@ class GitHubSearchEngine(BaseSearchEngine):
672
698
  return {}
673
699
 
674
700
  def search_code(
675
- self, query: str, language: Optional[str] = None, user: Optional[str] = None
701
+ self,
702
+ query: str,
703
+ language: Optional[str] = None,
704
+ user: Optional[str] = None,
676
705
  ) -> List[Dict[str, Any]]:
677
706
  """
678
707
  Search for code with more specific parameters.
@@ -769,7 +798,9 @@ class GitHubSearchEngine(BaseSearchEngine):
769
798
  results = data.get("items", [])
770
799
 
771
800
  # Format results
772
- previews = [self._format_issue_preview(result) for result in results]
801
+ previews = [
802
+ self._format_issue_preview(result) for result in results
803
+ ]
773
804
 
774
805
  # For issues, we don't need to get full content
775
806
  return previews
@@ -51,7 +51,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
51
51
  """
52
52
  # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
53
53
  super().__init__(
54
- llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
54
+ llm=llm,
55
+ max_filtered_results=max_filtered_results,
56
+ max_results=max_results,
55
57
  )
56
58
  self.include_full_content = include_full_content
57
59
 
@@ -61,7 +63,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
61
63
 
62
64
  # Rate limiting - keep track of last request time
63
65
  self.last_request_time = 0
64
- self.min_request_interval = 0.5 # Minimum time between requests in seconds
66
+ self.min_request_interval = (
67
+ 0.5 # Minimum time between requests in seconds
68
+ )
65
69
 
66
70
  # Language code mapping
67
71
  language_code_mapping = {
@@ -92,7 +96,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
92
96
 
93
97
  self.api_key = api_key
94
98
  if not self.api_key:
95
- self.api_key = get_db_setting("search.engine.web.google_pse.api_key")
99
+ self.api_key = get_db_setting(
100
+ "search.engine.web.google_pse.api_key"
101
+ )
96
102
 
97
103
  self.search_engine_id = search_engine_id
98
104
  if not self.search_engine_id:
@@ -187,7 +193,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
187
193
  # Add jitter to retries after the first attempt
188
194
  if attempt > 0:
189
195
  jitter = random.uniform(0.5, 1.5)
190
- sleep_time = self.retry_delay * (2 ** (attempt - 1)) * jitter
196
+ sleep_time = (
197
+ self.retry_delay * (2 ** (attempt - 1)) * jitter
198
+ )
191
199
  logger.info(
192
200
  "Retry attempt %s / %s for query '%s'. Waiting %s s",
193
201
  attempt + 1,
@@ -272,7 +280,7 @@ class GooglePSESearchEngine(BaseSearchEngine):
272
280
  {
273
281
  "title": title,
274
282
  "snippet": snippet,
275
- "url": url,
283
+ "link": url,
276
284
  "source": "Google Programmable Search",
277
285
  }
278
286
  )
@@ -296,7 +304,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
296
304
  logger.error("Error getting search results: %s", str(e))
297
305
  break
298
306
 
299
- logger.info("Retrieved %s search results for query: '%s'", len(results), query)
307
+ logger.info(
308
+ "Retrieved %s search results for query: '%s'", len(results), query
309
+ )
300
310
  return results
301
311
 
302
312
  def _get_full_content(
@@ -48,7 +48,9 @@ class GuardianSearchEngine(BaseSearchEngine):
48
48
  """
49
49
  # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
50
50
  super().__init__(
51
- llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
51
+ llm=llm,
52
+ max_filtered_results=max_filtered_results,
53
+ max_results=max_results,
52
54
  )
53
55
  self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
54
56
  self.optimize_queries = optimize_queries
@@ -204,15 +206,19 @@ ONE WORD ONLY:"""
204
206
  logger.info(
205
207
  "Query classified as HISTORICAL - extending search timeframe"
206
208
  )
207
- ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime(
208
- "%Y-%m-%d"
209
- )
209
+ ten_years_ago = (
210
+ datetime.now() - timedelta(days=3650)
211
+ ).strftime("%Y-%m-%d")
210
212
  self.from_date = ten_years_ago
211
213
 
212
214
  elif "CURRENT" in answer:
213
215
  # For current events, focus on recent content
214
- logger.info("Query classified as CURRENT - focusing on recent content")
215
- recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
216
+ logger.info(
217
+ "Query classified as CURRENT - focusing on recent content"
218
+ )
219
+ recent = (datetime.now() - timedelta(days=60)).strftime(
220
+ "%Y-%m-%d"
221
+ )
216
222
  self.from_date = recent
217
223
  self.order_by = "newest" # Prioritize newest for current events
218
224
 
@@ -246,7 +252,9 @@ ONE WORD ONLY:"""
246
252
 
247
253
  # Strategy 1: Expand to 6 months
248
254
  logger.info("Strategy 1: Expanding time range to 6 months")
249
- six_months_ago = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%d")
255
+ six_months_ago = (datetime.now() - timedelta(days=180)).strftime(
256
+ "%Y-%m-%d"
257
+ )
250
258
  self.from_date = six_months_ago
251
259
 
252
260
  articles1 = self._get_all_data(query)
@@ -256,7 +264,9 @@ ONE WORD ONLY:"""
256
264
 
257
265
  # Strategy 2: Expand to all time and try relevance order
258
266
  if len(articles) < 3:
259
- logger.info("Strategy 2: Expanding to all time with relevance ordering")
267
+ logger.info(
268
+ "Strategy 2: Expanding to all time with relevance ordering"
269
+ )
260
270
  self.from_date = "2000-01-01" # Effectively "all time"
261
271
  self.order_by = "relevance"
262
272
 
@@ -315,12 +325,15 @@ ONE WORD ONLY:"""
315
325
  # Always request all fields for simplicity
316
326
  # Ensure max_results is an integer to avoid comparison errors
317
327
  page_size = min(
318
- int(self.max_results) if self.max_results is not None else 10, 50
328
+ int(self.max_results) if self.max_results is not None else 10,
329
+ 50,
319
330
  )
320
331
 
321
332
  # Log full parameters for debugging
322
333
  logger.info(f"Guardian API search query: '{query}'")
323
- logger.info(f"Guardian API date range: {self.from_date} to {self.to_date}")
334
+ logger.info(
335
+ f"Guardian API date range: {self.from_date} to {self.to_date}"
336
+ )
324
337
 
325
338
  params = {
326
339
  "q": query,
@@ -363,7 +376,9 @@ ONE WORD ONLY:"""
363
376
  # Format the article with all fields
364
377
  result = {
365
378
  "id": article.get("id", ""),
366
- "title": fields.get("headline", article.get("webTitle", "")),
379
+ "title": fields.get(
380
+ "headline", article.get("webTitle", "")
381
+ ),
367
382
  "link": article.get("webUrl", ""),
368
383
  "snippet": fields.get("trailText", ""),
369
384
  "publication_date": article.get("webPublicationDate", ""),
@@ -399,7 +414,9 @@ ONE WORD ONLY:"""
399
414
  Returns:
400
415
  List of preview dictionaries
401
416
  """
402
- logger.info(f"Getting articles from The Guardian API for query: {query}")
417
+ logger.info(
418
+ f"Getting articles from The Guardian API for query: {query}"
419
+ )
403
420
 
404
421
  # Step 1: Optimize the query using LLM
405
422
  optimized_query = self._optimize_query_for_guardian(query)
@@ -471,7 +488,10 @@ ONE WORD ONLY:"""
471
488
  article_id = item.get("id", "")
472
489
 
473
490
  # Get the full article from our cache
474
- if hasattr(self, "_full_articles") and article_id in self._full_articles:
491
+ if (
492
+ hasattr(self, "_full_articles")
493
+ and article_id in self._full_articles
494
+ ):
475
495
  results.append(self._full_articles[article_id])
476
496
  else:
477
497
  # If not found (shouldn't happen), just use the preview
@@ -502,7 +522,9 @@ ONE WORD ONLY:"""
502
522
 
503
523
  # If no results, try one more time with a simplified query
504
524
  if not previews:
505
- simple_query = " ".join([w for w in query.split() if len(w) > 3][:3])
525
+ simple_query = " ".join(
526
+ [w for w in query.split() if len(w) > 3][:3]
527
+ )
506
528
  logger.warning(
507
529
  f"No Guardian articles found, trying simplified query: {simple_query}"
508
530
  )
@@ -518,7 +540,9 @@ ONE WORD ONLY:"""
518
540
 
519
541
  # If still no results after all attempts, return empty list
520
542
  if not previews:
521
- logger.warning("No Guardian articles found after multiple attempts")
543
+ logger.warning(
544
+ "No Guardian articles found after multiple attempts"
545
+ )
522
546
  return []
523
547
 
524
548
  # Filter for relevance if we have an LLM