local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,457 @@
1
+ """
2
+ Diversity-focused candidate explorer implementation.
3
+
4
+ This explorer prioritizes finding diverse candidates across different
5
+ categories, types, and characteristics.
6
+ """
7
+
8
+ import time
9
+ from collections import defaultdict
10
+ from typing import List, Optional
11
+
12
+ from loguru import logger
13
+
14
+ from ..candidates.base_candidate import Candidate
15
+ from ..constraints.base_constraint import Constraint
16
+ from .base_explorer import (
17
+ BaseCandidateExplorer,
18
+ ExplorationResult,
19
+ ExplorationStrategy,
20
+ )
21
+
22
+
23
class DiversityExplorer(BaseCandidateExplorer):
    """
    Diversity-focused candidate explorer.

    This explorer:
    1. Seeks candidates from different categories/types
    2. Avoids clustering around similar candidates
    3. Uses diversity metrics to guide exploration
    4. Balances breadth over depth
    """

    # Ordered (category, keywords) table used by ``_determine_category``.
    # Matching is by substring on the lower-cased candidate name and the
    # first category with a matching keyword wins, so the order matters.
    _CATEGORY_KEYWORDS = (
        ("mountain", ("mountain", "peak", "summit", "hill")),
        ("water", ("lake", "river", "creek", "stream", "pond")),
        ("park", ("park", "forest", "reserve", "wilderness")),
        ("trail", ("trail", "path", "route", "way")),
        ("canyon", ("canyon", "gorge", "valley", "gap")),
        ("viewpoint", ("cliff", "bluff", "overlook", "viewpoint")),
        ("coastal", ("island", "beach", "coast", "shore")),
        ("place", ("city", "town", "county", "state")),
    )

    # Search terms appended to the base query when a category is missing.
    # Insertion order is the order in which missing categories are tried.
    _CATEGORY_QUERY_TERMS = {
        "mountain": "mountain peak summit",
        "water": "lake river creek",
        "park": "park forest reserve",
        "trail": "trail path route",
        "canyon": "canyon gorge valley",
        "viewpoint": "overlook viewpoint cliff",
        "coastal": "beach coast island",
        "place": "location place area",
    }

    def __init__(
        self,
        *args,
        diversity_threshold: float = 0.7,  # Minimum diversity score
        category_limit: int = 10,  # Max candidates per category
        similarity_threshold: float = 0.8,  # Similarity threshold for deduplication
        **kwargs,
    ):
        """
        Initialize diversity explorer.

        Args:
            diversity_threshold: Minimum diversity score to maintain
            category_limit: Maximum candidates per category
            similarity_threshold: Threshold for considering candidates similar
            *args, **kwargs: Forwarded unchanged to the base explorer.
        """
        super().__init__(*args, **kwargs)
        self.diversity_threshold = diversity_threshold
        self.category_limit = category_limit
        self.similarity_threshold = similarity_threshold

        # Track diversity
        self.category_counts = defaultdict(int)
        self.diversity_categories = set()

    def explore(
        self,
        initial_query: str,
        constraints: Optional[List[Constraint]] = None,
        entity_type: Optional[str] = None,
    ) -> ExplorationResult:
        """
        Explore candidates using diversity-focused strategy.

        Runs one broad search for ``initial_query`` and then keeps issuing
        category-oriented follow-up searches until the diversity threshold
        is met, no new queries can be produced, no query in a round is new,
        or the base class signals that exploration should stop.

        Args:
            initial_query: The query used for the initial broad search.
            constraints: Accepted for interface compatibility; not used by
                this strategy.
            entity_type: Optional entity type passed to candidate
                extraction and used as the base of follow-up queries.

        Returns:
            ExplorationResult with the ranked, diversity-balanced
            candidates and exploration metadata.
        """
        start_time = time.time()
        logger.info(
            f"Starting diversity-focused exploration for: {initial_query}"
        )

        all_candidates = []
        exploration_paths = []
        total_searched = 0

        # Initial broad search
        initial_results = self._execute_search(initial_query)
        initial_candidates = self._extract_candidates_from_results(
            initial_results, entity_type
        )
        all_candidates.extend(initial_candidates)
        total_searched += 1
        exploration_paths.append(
            f"Initial search: {initial_query} -> {len(initial_candidates)} candidates"
        )

        # Categorize initial candidates
        self._categorize_candidates(initial_candidates)

        # Generate diverse exploration paths
        while self._should_continue_exploration(
            start_time, len(all_candidates)
        ):
            # Calculate current diversity
            diversity_score = self._calculate_diversity_score(all_candidates)

            if (
                diversity_score >= self.diversity_threshold
                and len(all_candidates) >= 10
            ):
                logger.info(f"Diversity threshold met ({diversity_score:.2f})")
                break

            # Focus on underrepresented categories when there are any,
            # otherwise try to open up categories not seen so far.
            underrepresented_categories = (
                self._find_underrepresented_categories()
            )
            if underrepresented_categories:
                new_queries = self._generate_category_queries(
                    underrepresented_categories, initial_query, entity_type
                )
            else:
                new_queries = self._generate_diversity_queries(
                    initial_query, all_candidates, entity_type
                )

            if not new_queries:
                break

            searched_this_round = False

            # Execute diverse searches
            for query in new_queries[:3]:  # Limit concurrent searches
                # NOTE(review): explored_queries is presumably maintained by
                # the base class when a search is executed - confirm.
                if query.lower() in self.explored_queries:
                    continue

                results = self._execute_search(query)
                candidates = self._extract_candidates_from_results(
                    results, entity_type
                )
                searched_this_round = True

                # Filter for diversity
                diverse_candidates = self._filter_for_diversity(
                    candidates, all_candidates
                )

                all_candidates.extend(diverse_candidates)
                total_searched += 1

                # Update categories
                self._categorize_candidates(diverse_candidates)

                exploration_paths.append(
                    f"Diversity search: {query} -> {len(diverse_candidates)} diverse candidates"
                )

                if not self._should_continue_exploration(
                    start_time, len(all_candidates)
                ):
                    break

            if not searched_this_round:
                # Every generated query was already explored. Nothing
                # changed, so the next round would generate the very same
                # queries again; stop instead of spinning.
                logger.info("No unexplored diversity queries left, stopping")
                break

        # Final diversity filtering and ranking
        diverse_candidates = self._final_diversity_selection(all_candidates)
        ranked_candidates = self._rank_by_diversity(
            diverse_candidates, initial_query
        )
        final_candidates = ranked_candidates[: self.max_candidates]

        elapsed_time = time.time() - start_time
        final_diversity = self._calculate_diversity_score(final_candidates)

        logger.info(
            f"Diversity exploration completed: {len(final_candidates)} candidates, diversity: {final_diversity:.2f}"
        )

        return ExplorationResult(
            candidates=final_candidates,
            total_searched=total_searched,
            unique_candidates=len(diverse_candidates),
            exploration_paths=exploration_paths,
            metadata={
                "strategy": "diversity_focused",
                "final_diversity_score": final_diversity,
                "categories_found": len(self.diversity_categories),
                "category_distribution": dict(self.category_counts),
                "entity_type": entity_type,
            },
            elapsed_time=elapsed_time,
            strategy_used=ExplorationStrategy.DIVERSITY_FOCUSED,
        )

    def generate_exploration_queries(
        self,
        base_query: str,
        found_candidates: List[Candidate],
        constraints: Optional[List[Constraint]] = None,
    ) -> List[str]:
        """
        Generate diversity-focused exploration queries.

        Args:
            base_query: Query that the generated queries extend.
            found_candidates: Candidates found so far; their recorded
                categories decide which categories are still missing.
            constraints: Accepted for interface compatibility; not used.

        Returns:
            Queries targeting categories not yet present.
        """
        return self._generate_diversity_queries(base_query, found_candidates)

    @staticmethod
    def _category_of(candidate: Candidate) -> str:
        """Return the recorded diversity category, or "other" if none is set."""
        # metadata may be None/empty before the candidate was categorized.
        metadata = candidate.metadata or {}
        return metadata.get("diversity_category", "other")

    def _categorize_candidates(self, candidates: List[Candidate]):
        """
        Categorize candidates for diversity tracking.

        Updates ``category_counts`` and ``diversity_categories`` and stores
        the category under ``metadata["diversity_category"]`` of each
        candidate (creating the metadata dict when it is missing/empty).
        """
        for candidate in candidates:
            category = self._determine_category(candidate)
            self.category_counts[category] += 1
            self.diversity_categories.add(category)

            # Store category in candidate metadata
            if not candidate.metadata:
                candidate.metadata = {}
            candidate.metadata["diversity_category"] = category

    def _determine_category(self, candidate: Candidate) -> str:
        """
        Determine the category of a candidate.

        Simple categorization based on common patterns: the first category
        in ``_CATEGORY_KEYWORDS`` with a keyword contained in the
        lower-cased name wins; "other" is returned when nothing matches.
        """
        name = candidate.name.lower()
        for category, keywords in self._CATEGORY_KEYWORDS:
            if any(word in name for word in keywords):
                return category
        return "other"

    def _calculate_diversity_score(self, candidates: List[Candidate]) -> float:
        """
        Calculate diversity score for a set of candidates.

        The score is the Shannon entropy (base 2) of the category
        distribution, normalized by the maximum entropy for the number of
        categories present.

        Returns:
            A value in [0, 1]; 0.0 for an empty list or a single category.
        """
        # Local import keeps this block self-contained.
        import math

        if not candidates:
            return 0.0

        # Count categories
        category_counts = defaultdict(int)
        for candidate in candidates:
            category_counts[self._category_of(candidate)] += 1

        if len(category_counts) < 2:
            # A single category carries no diversity (entropy is zero).
            return 0.0

        total = len(candidates)
        entropy = 0.0
        for count in category_counts.values():
            p = count / total
            entropy -= p * math.log2(p)

        # Normalize to 0-1 scale; log2(k) is the entropy of k equally
        # likely categories and is > 0 because k >= 2 here.
        return entropy / math.log2(len(category_counts))

    def _find_underrepresented_categories(self) -> List[str]:
        """
        Find categories that are underrepresented.

        A category is underrepresented when its count is below 50% of the
        average count over all tracked categories and below
        ``category_limit``.
        """
        if not self.category_counts:
            return []

        avg_count = sum(self.category_counts.values()) / len(
            self.category_counts
        )
        threshold = avg_count * 0.5  # Categories with less than 50% of average

        return [
            category
            for category, count in self.category_counts.items()
            if count < threshold and count < self.category_limit
        ]

    def _generate_diversity_queries(
        self,
        base_query: str,
        found_candidates: List[Candidate],
        entity_type: Optional[str] = None,
    ) -> List[str]:
        """
        Generate queries to increase diversity.

        Builds one query for each of the first three known categories that
        no candidate in ``found_candidates`` has been assigned to.

        Args:
            base_query: Fallback base of the queries.
            found_candidates: Candidates whose recorded categories count as
                already covered.
            entity_type: Used as the query base instead of ``base_query``
                when given.
        """
        # Analyze existing categories
        existing_categories = {
            candidate.metadata["diversity_category"]
            for candidate in found_candidates
            if candidate.metadata
            and "diversity_category" in candidate.metadata
        }

        # Generate queries for missing categories
        missing_categories = [
            category
            for category in self._CATEGORY_QUERY_TERMS
            if category not in existing_categories
        ]

        base = entity_type or base_query

        # Limit to 3 new categories
        return [
            f"{base} {self._CATEGORY_QUERY_TERMS[category]}"
            for category in missing_categories[:3]
        ]

    def _generate_category_queries(
        self, categories: List[str], base_query: str, entity_type: Optional[str]
    ) -> List[str]:
        """
        Generate queries for specific underrepresented categories.

        Two query variants are produced for each of the first three
        categories.
        """
        queries = []
        base = entity_type or base_query

        for category in categories[:3]:
            queries.append(f"{base} {category}")
            queries.append(f"{category} examples {base}")

        return queries

    def _filter_for_diversity(
        self,
        new_candidates: List[Candidate],
        existing_candidates: List[Candidate],
    ) -> List[Candidate]:
        """
        Filter new candidates to maintain diversity.

        A candidate is dropped when its category already holds
        ``category_limit`` candidates (counting those accepted earlier in
        this same batch) or when it is too similar to a recent existing
        candidate.

        Args:
            new_candidates: Candidates from the latest search.
            existing_candidates: Candidates collected so far.

        Returns:
            The accepted candidates, in their original order.
        """
        filtered = []
        # Per-category acceptances within this batch; category_counts is
        # only updated later by _categorize_candidates.
        accepted_in_batch = defaultdict(int)

        for candidate in new_candidates:
            category = self._determine_category(candidate)

            # Check if this category is already well-represented.
            # .get() avoids inserting a zero-count key into the defaultdict.
            current = (
                self.category_counts.get(category, 0)
                + accepted_in_batch[category]
            )
            if current >= self.category_limit:
                continue

            # Check for similarity with existing candidates
            if not self._is_sufficiently_different(
                candidate, existing_candidates
            ):
                continue

            filtered.append(candidate)
            accepted_in_batch[category] += 1

        return filtered

    def _is_sufficiently_different(
        self, candidate: Candidate, existing_candidates: List[Candidate]
    ) -> bool:
        """
        Check if candidate is sufficiently different from existing ones.

        Compares the word set of the candidate name against the ten most
        recent existing candidates using Jaccard similarity; returns False
        as soon as one similarity exceeds ``similarity_threshold``.
        """
        candidate_words = set(candidate.name.lower().split())

        # Check against recent candidates only
        for existing in existing_candidates[-10:]:
            existing_words = set(existing.name.lower().split())

            # Calculate Jaccard similarity
            union = len(candidate_words | existing_words)
            if union == 0:
                # Both names are empty; nothing to compare.
                continue

            intersection = len(candidate_words & existing_words)
            if intersection / union > self.similarity_threshold:
                return False

        return True

    def _final_diversity_selection(
        self, candidates: List[Candidate]
    ) -> List[Candidate]:
        """
        Final selection to maximize diversity.

        Groups candidates by category and keeps, per category, at most
        ``max_candidates // number_of_categories`` (but at least one) of
        the candidates with the highest ``relevance_score``.
        """
        if not candidates:
            return candidates

        # Group by category
        category_groups = defaultdict(list)
        for candidate in candidates:
            category_groups[self._category_of(candidate)].append(candidate)

        # Select balanced representation from each category
        selected = []
        max_per_category = max(1, self.max_candidates // len(category_groups))

        for group in category_groups.values():
            # Sort by relevance score if available
            sorted_group = sorted(
                group,
                key=lambda c: getattr(c, "relevance_score", 0.0),
                reverse=True,
            )
            selected.extend(sorted_group[:max_per_category])

        return selected

    def _rank_by_diversity(
        self, candidates: List[Candidate], base_query: str
    ) -> List[Candidate]:
        """
        Rank candidates considering both relevance and diversity contribution.

        Each candidate gets ``final_score = relevance_score + 0.2 * boost``
        where ``boost`` grows the further the candidate's category count
        lies below the average category count.

        Returns:
            The candidates sorted by ``final_score``, highest first.
        """
        # First rank by relevance
        relevance_ranked = self._rank_candidates_by_relevance(
            candidates, base_query
        )

        # The average is loop-invariant, so compute it once.
        counts = self.category_counts
        avg_count = sum(counts.values()) / len(counts) if counts else 1

        # Then adjust based on diversity contribution
        for candidate in relevance_ranked:
            # .get() avoids inserting a zero-count key into the defaultdict.
            category_count = counts.get(self._category_of(candidate), 0)

            # Boost score for underrepresented categories; guard against a
            # zero average (all tracked counts are zero).
            if avg_count > 0:
                diversity_boost = max(
                    0, (avg_count - category_count) / avg_count
                )
            else:
                diversity_boost = 0

            relevance_score = getattr(candidate, "relevance_score", 0.0)
            candidate.final_score = relevance_score + (diversity_boost * 0.2)

        return sorted(
            relevance_ranked,
            key=lambda c: getattr(c, "final_score", 0.0),
            reverse=True,
        )
@@ -0,0 +1,250 @@
1
+ """
2
+ Parallel candidate explorer implementation.
3
+
4
+ This explorer runs multiple search queries in parallel to quickly discover
5
+ a wide range of candidates.
6
+ """
7
+
8
+ import concurrent.futures
9
+ import time
10
+ from typing import List, Optional
11
+
12
+ from loguru import logger
13
+
14
+ from ..candidates.base_candidate import Candidate
15
+ from ..constraints.base_constraint import Constraint
16
+ from .base_explorer import (
17
+ BaseCandidateExplorer,
18
+ ExplorationResult,
19
+ ExplorationStrategy,
20
+ )
21
+
22
+
23
class ParallelExplorer(BaseCandidateExplorer):
    """
    Parallel candidate explorer that runs multiple searches concurrently.

    This explorer:
    1. Generates multiple search queries from the initial query
    2. Runs searches in parallel for speed
    3. Collects and deduplicates candidates
    4. Focuses on breadth-first exploration
    """

    def __init__(
        self,
        *args,
        max_workers: int = 5,
        queries_per_round: int = 8,
        max_rounds: int = 3,
        **kwargs,
    ):
        """
        Initialize parallel explorer.

        Positional and remaining keyword arguments are forwarded unchanged to
        the base explorer.

        Args:
            max_workers: Maximum number of parallel search threads
            queries_per_round: Number of queries to generate per round
            max_rounds: Maximum exploration rounds
        """
        super().__init__(*args, **kwargs)
        self.max_workers = max_workers
        self.queries_per_round = queries_per_round
        self.max_rounds = max_rounds

    def explore(
        self,
        initial_query: str,
        constraints: Optional[List[Constraint]] = None,
        entity_type: Optional[str] = None,
    ) -> ExplorationResult:
        """Explore candidates using parallel search strategy.

        Round one searches only ``initial_query``; each later round searches
        the queries produced by ``generate_exploration_queries``. All queries
        of a round are submitted to a thread pool and their results collected
        as they finish. A query whose search or extraction fails is logged
        and skipped so one failure does not abort the round.

        Args:
            initial_query: The query that seeds the exploration.
            constraints: Optional constraints used when generating follow-up
                queries.
            entity_type: Optional entity type passed to candidate extraction.

        Returns:
            An ``ExplorationResult`` with the deduplicated, relevance-ranked
            candidates (at most ``self.max_candidates``). Its
            ``metadata["rounds"]`` is the number of rounds actually executed.
        """
        start_time = time.time()
        logger.info(f"Starting parallel exploration for: {initial_query}")

        all_candidates = []
        exploration_paths = []
        total_searched = 0
        # Tracked explicitly: the loop variable is unbound when max_rounds
        # is 0 and overstates the count when we stop before running a round.
        rounds_completed = 0

        # Initial search
        current_queries = [initial_query]

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.max_workers
        ) as executor:
            for round_num in range(self.max_rounds):
                if not self._should_continue_exploration(
                    start_time, len(all_candidates)
                ):
                    break

                logger.info(
                    f"Exploration round {round_num + 1}: {len(current_queries)} queries"
                )

                # Submit all queries for parallel execution
                future_to_query = {
                    executor.submit(self._execute_search, query): query
                    for query in current_queries
                }
                rounds_completed = round_num + 1

                round_candidates = []

                # Collect results as they complete
                for future in concurrent.futures.as_completed(future_to_query):
                    query = future_to_query[future]
                    total_searched += 1

                    try:
                        results = future.result()
                        candidates = self._extract_candidates_from_results(
                            results, entity_type
                        )
                        round_candidates.extend(candidates)
                        exploration_paths.append(
                            f"Round {round_num + 1}: {query} -> {len(candidates)} candidates"
                        )

                    except Exception as e:
                        # Best effort: keep the other queries' results.
                        logger.error(f"Error processing query '{query}': {e}")

                # Add new candidates
                all_candidates.extend(round_candidates)

                # Generate queries for next round
                if round_num < self.max_rounds - 1:
                    current_queries = self.generate_exploration_queries(
                        initial_query, all_candidates, constraints
                    )[: self.queries_per_round]

                    if not current_queries:
                        logger.info("No more queries to explore")
                        break

        # Deduplicate and rank
        unique_candidates = self._deduplicate_candidates(all_candidates)
        ranked_candidates = self._rank_candidates_by_relevance(
            unique_candidates, initial_query
        )

        # Limit to max candidates
        final_candidates = ranked_candidates[: self.max_candidates]

        elapsed_time = time.time() - start_time
        logger.info(
            f"Parallel exploration completed: {len(final_candidates)} unique candidates in {elapsed_time:.1f}s"
        )

        return ExplorationResult(
            candidates=final_candidates,
            total_searched=total_searched,
            unique_candidates=len(unique_candidates),
            exploration_paths=exploration_paths,
            metadata={
                "strategy": "parallel",
                "rounds": rounds_completed,
                "max_workers": self.max_workers,
                "entity_type": entity_type,
            },
            elapsed_time=elapsed_time,
            strategy_used=ExplorationStrategy.BREADTH_FIRST,
        )

    def generate_exploration_queries(
        self,
        base_query: str,
        found_candidates: List[Candidate],
        constraints: Optional[List[Constraint]] = None,
    ) -> List[str]:
        """Generate queries for parallel exploration.

        Combines model-generated variations of ``base_query``, queries built
        from the candidates found so far, and queries built from the
        constraints, in that order.

        Args:
            base_query: The query to derive variations from.
            found_candidates: Candidates discovered so far; may be empty.
            constraints: Optional constraints to derive queries from.

        Returns:
            At most ``self.queries_per_round`` queries. Queries whose
            lowercase form is in ``self.explored_queries`` are dropped, as
            are case-insensitive repeats within this batch.
        """
        queries = []

        # Query variations based on base query
        queries.extend(self._generate_query_variations(base_query))

        # Queries based on found candidates
        if found_candidates:
            queries.extend(
                self._generate_candidate_based_queries(
                    found_candidates, base_query
                )
            )

        # Constraint-based queries
        if constraints:
            queries.extend(
                self._generate_constraint_queries(constraints, base_query)
            )

        # Remove already explored queries, and repeats within this batch so
        # the same search is not submitted twice in one round.
        new_queries = []
        seen = set()
        for query in queries:
            key = query.lower()
            if key in seen or key in self.explored_queries:
                continue
            seen.add(key)
            new_queries.append(query)

        return new_queries[: self.queries_per_round]

    def _generate_query_variations(self, base_query: str) -> List[str]:
        """Generate variations of the base query.

        Asks ``self.model`` for a numbered list and parses lines that start
        with ``1.`` through ``9.``.

        Args:
            base_query: The query to rephrase.

        Returns:
            Up to four variations, or an empty list if the model call or
            parsing raises.
        """
        try:
            prompt = f"""
Generate 4 search query variations for: "{base_query}"

Each variation should:
1. Use different keywords but same intent
2. Be specific and searchable
3. Focus on finding concrete examples or instances

Format as numbered list:
1. [query]
2. [query]
3. [query]
4. [query]
"""

            response = self.model.invoke(prompt).content.strip()

            # Parse numbered list
            number_prefixes = tuple(f"{i}." for i in range(1, 10))
            queries = []
            for line in response.split("\n"):
                line = line.strip()
                if line.startswith(number_prefixes):
                    # Remove number prefix
                    query = line.split(".", 1)[1].strip()
                    if query:
                        queries.append(query)

            return queries[:4]

        except Exception as e:
            # Variations are optional; exploration continues without them.
            logger.error(f"Error generating query variations: {e}")
            return []

    def _generate_candidate_based_queries(
        self, candidates: List[Candidate], base_query: str
    ) -> List[str]:
        """Generate queries based on found candidates.

        Args:
            candidates: Candidates found so far; only the first three are
                used.
            base_query: Unused; kept for a uniform generator signature.

        Returns:
            Two "similar entity" queries per sampled candidate name.
        """
        queries = []

        # Sample a few candidates to avoid too many queries
        for candidate in candidates[:3]:
            # Query for similar entities
            queries.append(f'similar to "{candidate.name}"')
            queries.append(f'like "{candidate.name}" examples')

        return queries

    def _generate_constraint_queries(
        self, constraints: List[Constraint], base_query: str
    ) -> List[str]:
        """Generate queries focusing on specific constraints.

        Args:
            constraints: Constraints to query for; only the first two are
                used.
            base_query: Unused; kept for a uniform generator signature.

        Returns:
            Two queries per sampled constraint value.
        """
        queries = []

        # Sample constraints to avoid too many queries
        for constraint in constraints[:2]:
            queries.append(f"{constraint.value} examples")
            queries.append(f'"{constraint.value}" instances')

        return queries