local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,515 @@
1
+ """
2
+ Smart query generation strategy that works for any type of search target.
3
+ """
4
+
5
+ import concurrent.futures
6
+ from typing import Dict, List
7
+
8
+ from loguru import logger
9
+
10
+ from ..constraints.base_constraint import Constraint
11
+ from ..constraints.constraint_analyzer import ConstraintType
12
+ from .early_stop_constrained_strategy import EarlyStopConstrainedStrategy
13
+
14
+
15
+ class SmartQueryStrategy(EarlyStopConstrainedStrategy):
16
+ """
17
+ Enhanced strategy with intelligent query generation that:
18
+ 1. Analyzes constraints to identify key search terms
19
+ 2. Uses LLM to suggest search queries based on constraint meaning
20
+ 3. Generates multiple query variations for better coverage
21
+ """
22
+
23
def __init__(
    self,
    *args,
    use_llm_query_generation: bool = True,
    queries_per_combination: int = 3,
    use_entity_seeding: bool = True,
    use_direct_property_search: bool = True,
    **kwargs,
):
    """Configure the smart-query strategy on top of the parent strategy.

    Args:
        *args: Positional arguments forwarded unchanged to the parent class.
        use_llm_query_generation: When true, queries are produced by the
            LLM; otherwise the rule-based builder is used.
        queries_per_combination: Number of query variations requested per
            constraint combination (also used to size the search thread
            pool).
        use_entity_seeding: Enables the entity-seeding step of the
            progressive search.
        use_direct_property_search: Enables the direct property-search step
            of the progressive search.
        **kwargs: Keyword arguments forwarded unchanged to the parent class.
    """
    super().__init__(*args, **kwargs)

    # Feature switches and tuning knobs.
    self.use_direct_property_search = use_direct_property_search
    self.use_entity_seeding = use_entity_seeding
    self.queries_per_combination = queries_per_combination
    self.use_llm_query_generation = use_llm_query_generation

    # Normalised (stripped, lower-cased) queries that were already issued,
    # plus variations that were already proposed, for de-duplication.
    self.searched_queries = set()
    self.query_variations = set()

    # Entity names suggested by the LLM for targeted searches.
    self.entity_seeds = []
42
+
43
def _build_query(self, constraints: List[Constraint]) -> str:
    """Return a search query for ``constraints``.

    Dispatches to the LLM-backed generator when ``use_llm_query_generation``
    is set, and to the rule-based builder otherwise.
    """
    if not self.use_llm_query_generation:
        # Rule-based path: no LLM call involved.
        return self._build_standard_query(constraints)

    return self._generate_smart_query(constraints)
51
+
52
def _generate_smart_query(self, constraints: List[Constraint]) -> str:
    """Ask the LLM for a single search query covering ``constraints``.

    Falls back to ``_build_standard_query`` when the LLM call fails, when it
    returns an empty string, or when the returned query (case-insensitively)
    has already been searched.

    Args:
        constraints: Constraints to describe in the prompt; each must expose
            ``type.value``, ``value`` and ``weight``.

    Returns:
        The query text to search for.
    """
    constraint_text = "\n".join(
        f"- {c.type.value}: {c.value} (weight: {c.weight})"
        for c in constraints
    )

    # ``searched_queries`` is a set, so this is an arbitrary sample of up to
    # ten earlier queries rather than the ten most recent ones.
    searched_list = list(self.searched_queries)[:10]
    already_searched = (
        "\n".join(f"- {q}" for q in searched_list)
        if searched_list
        else "None"
    )

    prompt = f"""
Analyze these search constraints and generate an optimal web search query:

Constraints:
{constraint_text}

Target type: {getattr(self, "entity_type", "unknown")}

Already searched queries (avoid these):
{already_searched}

Generate a single search query that would most effectively find results matching these constraints.
The query should:
1. Include the most identifying/unique terms
2. Use appropriate search operators (quotes, AND, OR)
3. Be specific enough to find relevant results but not too narrow
4. Focus on the highest weighted constraints
5. Be different from already searched queries

Return only the search query, nothing else.
"""

    # Only the LLM round-trip is guarded; the fallback builder runs outside
    # the ``try`` so it is never invoked twice for one failure.
    try:
        query = self.model.invoke(prompt).content.strip()
    except Exception as e:
        logger.error(f"Failed to generate smart query: {e}")
        return self._build_standard_query(constraints)

    if not query:
        # An empty reply would otherwise be sent to the search engine as-is.
        logger.info("LLM returned an empty query, using fallback")
        return self._build_standard_query(constraints)

    if query.lower() in self.searched_queries:
        logger.info(f"LLM generated duplicate query, using fallback: {query}")
        return self._build_standard_query(constraints)

    logger.info(f"LLM generated query: {query}")
    return query
107
+
108
+ def _build_standard_query(self, constraints: List[Constraint]) -> str:
109
+ """Improved standard query building."""
110
+ # Group constraints by importance
111
+ critical_terms = []
112
+ supplementary_terms = []
113
+
114
+ for c in constraints:
115
+ term = c.value
116
+
117
+ # Quote multi-word terms
118
+ if " " in term and not term.startswith('"'):
119
+ term = f'"{term}"'
120
+
121
+ if c.weight > 0.7:
122
+ critical_terms.append(term)
123
+ else:
124
+ supplementary_terms.append(term)
125
+
126
+ # Build query with critical terms required, supplementary optional
127
+ query_parts = []
128
+
129
+ # Add entity type if known
130
+ entity_type = getattr(self, "entity_type", None)
131
+ if entity_type and entity_type != "unknown entity":
132
+ query_parts.append(entity_type)
133
+
134
+ # Add critical terms
135
+ if critical_terms:
136
+ query_parts.extend(critical_terms)
137
+
138
+ # Add some supplementary terms
139
+ if supplementary_terms:
140
+ query_parts.extend(
141
+ supplementary_terms[:2]
142
+ ) # Limit to avoid overly specific queries
143
+
144
+ return " ".join(query_parts)
145
+
146
def _execute_combination_search(self, combo) -> List:
    """Search one constraint combination, optionally with several queries.

    With LLM query generation enabled, several query variations are
    generated for ``combo.constraints``, the ones not searched before are
    run concurrently, and candidates are extracted from every result.
    Otherwise the parent implementation is used.

    Args:
        combo: Object exposing the combination's ``constraints``.

    Returns:
        All candidates collected for this combination (possibly empty).
    """
    all_candidates = []

    if not self.use_llm_query_generation:
        # Single query from the parent implementation.
        all_candidates.extend(super()._execute_combination_search(combo))
        return all_candidates

    # Decide what to run before creating any threads.
    pending = []
    for query in self._generate_query_variations(combo.constraints):
        normalized_query = query.strip().lower()
        if normalized_query in self.searched_queries:
            logger.info(f"Skipping duplicate query: '{query}'")
            continue
        self.searched_queries.add(normalized_query)
        pending.append(query)

    if not pending:
        return all_candidates

    # ThreadPoolExecutor rejects max_workers <= 0, which a zero or negative
    # ``queries_per_combination`` would otherwise trigger.
    max_workers = max(1, min(len(pending), self.queries_per_combination))

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max_workers
    ) as executor:
        futures = [
            (query, executor.submit(self._execute_search, query))
            for query in pending
        ]

        for query, future in futures:
            try:
                results = future.result()
                candidates = self._extract_candidates_from_results(results)
                all_candidates.extend(candidates)

                logger.info(
                    f"Query '{query}' found {len(candidates)} candidates"
                )
            except Exception as e:
                # One failed search must not discard the others.
                logger.error(f"Search failed for query '{query}': {e}")

    return all_candidates
189
+
190
def _generate_query_variations(
    self, constraints: List[Constraint]
) -> List[str]:
    """Ask the LLM for several distinct queries covering ``constraints``.

    Each non-empty response line is treated as one query. Leading list
    markers (``- ``, ``* ``, ``1. ``, ``2) `` ...) are removed, and queries
    already searched or already proposed are dropped. Only queries that are
    actually returned are recorded in ``query_variations``.

    Args:
        constraints: A list of constraints, or a single ``Constraint``.

    Returns:
        Up to ``queries_per_combination`` queries. If every LLM suggestion
        was a duplicate, the rule-based query is returned instead (unless it
        was searched before). If the LLM call fails, the rule-based query is
        returned unconditionally.
    """
    import re  # function-scope: only needed to strip list markers

    # Handle single constraint case
    if isinstance(constraints, Constraint):
        constraints = [constraints]

    constraint_text = "\n".join(
        f"- {c.type.value}: {c.value}" for c in constraints
    )

    # ``searched_queries`` is a set, so this is an arbitrary sample of up to
    # twenty earlier queries rather than the most recent ones.
    searched_list = list(self.searched_queries)[:20]
    already_searched = (
        "\n".join(f"- {q}" for q in searched_list)
        if searched_list
        else "None"
    )

    prompt = f"""
Generate {self.queries_per_combination} different search queries for these constraints:

{constraint_text}

Already searched queries (avoid these):
{already_searched}

Each query should:
- Approach the search from a different angle
- Use different search terms or operators
- Target different aspects of the constraints
- Be distinctly different from already searched queries

Provide each query on a separate line.
"""

    # A marker needs trailing whitespace, so "-term" exclusions survive.
    list_marker = re.compile(r"^\s*(?:[-*•]\s+|\d{1,2}[.)]\s+)")

    try:
        response = self.model.invoke(prompt).content

        unique_queries = []
        for line in response.split("\n"):
            query = list_marker.sub("", line).strip()
            if not query:
                continue

            # Stop before recording surplus suggestions: marking them as
            # seen without searching them would block them for good.
            if len(unique_queries) >= self.queries_per_combination:
                break

            normalized = query.lower()
            if (
                normalized in self.searched_queries
                or normalized in self.query_variations
            ):
                logger.info(
                    f"Filtering out duplicate query variation: {query}"
                )
                continue

            unique_queries.append(query)
            self.query_variations.add(normalized)

        # If all queries were duplicates, generate a fallback
        if not unique_queries:
            fallback = self._build_standard_query(constraints)
            if fallback.strip().lower() not in self.searched_queries:
                unique_queries = [fallback]

        return unique_queries[: self.queries_per_combination]
    except Exception as e:
        logger.error(f"Failed to generate query variations: {e}")
        # Fallback to single query
        return [self._build_standard_query(constraints)]
257
+
258
def _extract_candidates_from_results(self, results: Dict) -> List:
    """Turn the text of a search result into ``Candidate`` objects.

    The ``"current_knowledge"`` entry of ``results`` is handed to the LLM,
    which is asked to list matching names one per line. Every non-empty line
    shorter than 100 characters becomes a candidate.

    Args:
        results: Search result mapping; only ``"current_knowledge"`` is read.

    Returns:
        The extracted candidates. Empty when there is no content or when
        extraction fails (the failure is logged, not raised).
    """
    found = []

    text = results.get("current_knowledge", "")
    if not text:
        return found

    # Let the LLM pick out entities/topics relevant to the target type.
    prompt = f"""
From the following search results, extract all relevant entities, topics, or answers that match our search target type: {getattr(self, "entity_type", "unknown")}

Content:
{text}

List each potential match on a separate line.
Include only names/titles/identifiers, not descriptions.
"""

    try:
        reply = self.model.invoke(prompt).content
        stripped_lines = [line.strip() for line in reply.split("\n")]
        entity_names = [name for name in stripped_lines if name]

        from ..candidates.base_candidate import Candidate

        # Basic validation: skip anything too long to be a name.
        found.extend(
            Candidate(name=name) for name in entity_names if len(name) < 100
        )

        logger.info(f"Extracted {len(found)} candidates from results")
    except Exception as e:
        logger.error(f"Error extracting candidates: {e}")

    return found
297
+
298
+ def _should_use_entity_seeding(self) -> bool:
299
+ """Determine if entity seeding would be beneficial."""
300
+ entity_type = getattr(self, "entity_type", "").lower()
301
+ return (
302
+ "character" in entity_type
303
+ or "person" in entity_type
304
+ or "hero" in entity_type
305
+ )
306
+
307
def _perform_entity_seeding(self):
    """Ask the LLM for concrete entity names and search for them.

    Values of constraints weighted above 0.7 are listed in the prompt. Each
    non-empty response line becomes a seed after leading list markers
    (``- ``, ``* ``, ``1. `` ...) are removed; the prompt's own example is
    bulleted, so replies typically are too. Duplicate seeds are dropped while
    keeping order. The seeds are stored in ``entity_seeds`` and searched
    immediately. Failures are logged, not raised.
    """
    import re  # function-scope: only needed to strip list markers

    logger.info("Performing entity seeding based on constraints")

    # Extract key properties from high-weight constraints.
    key_properties = [
        constraint.value
        for constraint in self.constraint_ranking
        if constraint.weight > 0.7
    ]
    if not key_properties:
        return

    properties_text = "\n".join(f"- {prop}" for prop in key_properties)

    prompt = f"""
Based on these properties, suggest 5-10 specific {self.entity_type} names that might match:

Properties:
{properties_text}

For example, if looking for a scientist from the 19th century, you might suggest:
- Charles Darwin
- Marie Curie
- Louis Pasteur
- Thomas Edison

Provide one name per line. Be specific with actual character/entity names.
"""

    list_marker = re.compile(r"^\s*(?:[-*•]\s*|\d{1,2}[.)]\s+)")

    try:
        response = self.model.invoke(prompt).content

        seeds = []
        for line in response.split("\n"):
            # A marker left in the seed would end up inside the quoted
            # search phrase and defeat the later name matching.
            name = list_marker.sub("", line).strip()
            if name and name not in seeds:
                seeds.append(name)

        self.entity_seeds = seeds
        logger.info(f"Generated entity seeds: {self.entity_seeds}")

        # Immediately search for these seeds
        self._search_entity_seeds()

    except Exception as e:
        logger.error(f"Error generating entity seeds: {e}")
349
+
350
def _search_entity_seeds(self):
    """Run an exact-phrase search for each of the first five entity seeds.

    Seeds whose quoted query was searched before are skipped. Candidates
    whose name contains the seed (case-insensitively) are evaluated right
    away when ``_evaluate_candidate_immediately`` exists on the instance,
    and appended to ``self.candidates`` otherwise. Per-seed failures are
    logged, not raised.
    """
    if not self.entity_seeds:
        return

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        submitted = []
        for seed in self.entity_seeds[:5]:  # Limit to top 5
            query = f'"{seed}"'
            key = query.lower()
            if key in self.searched_queries:
                continue
            self.searched_queries.add(key)
            submitted.append(
                (seed, executor.submit(self._execute_search, query))
            )

        for seed, future in submitted:
            try:
                results = future.result()
                candidates = self._extract_candidates_from_results(results)

                for candidate in candidates:
                    # Only candidates that mention the seed are of interest.
                    if seed.lower() not in candidate.name.lower():
                        continue

                    logger.info(f"Found seeded entity: {candidate.name}")

                    if hasattr(self, "_evaluate_candidate_immediately"):
                        self._evaluate_candidate_immediately(candidate)
                        continue

                    # No immediate evaluation available: keep it for later.
                    if not hasattr(self, "candidates"):
                        self.candidates = []
                    self.candidates.append(candidate)

            except Exception as e:
                logger.error(f"Error searching for seed {seed}: {e}")
386
+
387
def _try_direct_property_search(self):
    """Search directly for high-weight PROPERTY constraints.

    Queries are built only for PROPERTY constraints weighted above 0.7 whose
    value mentions "elastic"/"stretch" (three queries each) or
    "voice"/"actor" (one query each). At most the first three queries are
    considered, and those already searched are skipped. Extracted candidates
    are passed to ``_evaluate_candidate_immediately`` when that method
    exists on the instance. Per-search failures are logged, not raised.
    """
    property_queries = []

    for constraint in self.constraint_ranking:
        is_key_property = (
            constraint.weight > 0.7
            and constraint.type == ConstraintType.PROPERTY
        )
        if not is_key_property:
            continue

        lowered = constraint.value.lower()
        if "elastic" in lowered or "stretch" in lowered:
            property_queries += [
                f'"{constraint.value}" superhero character',
                f'characters with "{constraint.value}"',
                f"list of {self.entity_type} {constraint.value}",
            ]
        elif "voice" in lowered or "actor" in lowered:
            property_queries.append(f"{constraint.value} {self.entity_type}")

    if not property_queries:
        return

    logger.info(f"Executing direct property searches: {property_queries}")

    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        futures = []
        # Limit to avoid too many searches.
        for query in property_queries[:3]:
            key = query.lower()
            if key in self.searched_queries:
                continue
            self.searched_queries.add(key)
            futures.append(executor.submit(self._execute_search, query))

        for future in futures:
            try:
                results = future.result()
                candidates = self._extract_candidates_from_results(results)

                for candidate in candidates:
                    if hasattr(self, "_evaluate_candidate_immediately"):
                        self._evaluate_candidate_immediately(candidate)

            except Exception as e:
                logger.error(f"Property search error: {e}")
446
+
447
def _perform_entity_name_search(self):
    """Last resort: search each top seed together with key constraints.

    For each of the first three entity seeds, the values of the first two
    ranked constraints weighted above 0.5 are appended to the quoted seed
    name. Queries already searched are skipped. Candidates whose name
    contains the seed are evaluated via ``_evaluate_candidate_immediately``
    when that method exists on the instance, and the method returns as soon
    as ``best_score`` reaches 0.99. Per-search failures are logged, not
    raised.
    """
    logger.info("Performing entity name search fallback")

    for entity_name in self.entity_seeds[:3]:  # Top 3 seeds
        # Combine the entity name with the top 2 constraints.
        constraint_terms = [
            constraint.value
            for constraint in self.constraint_ranking[:2]
            if constraint.weight > 0.5
        ]
        if not constraint_terms:
            continue

        joined_terms = " ".join(constraint_terms)
        query = f'"{entity_name}" {joined_terms}'
        key = query.lower()
        if key in self.searched_queries:
            continue

        logger.info(f"Trying targeted entity search: {query}")
        self.searched_queries.add(key)

        try:
            results = self._execute_search(query)
            candidates = self._extract_candidates_from_results(results)

            for candidate in candidates:
                if entity_name.lower() not in candidate.name.lower():
                    continue

                logger.info(
                    f"Found target entity in fallback: {candidate.name}"
                )
                if not hasattr(self, "_evaluate_candidate_immediately"):
                    continue

                self._evaluate_candidate_immediately(candidate)

                # NOTE(review): the early-stop check is assumed to sit
                # directly after the evaluation call - confirm nesting
                # against the upstream file.
                if hasattr(self, "best_score") and self.best_score >= 0.99:
                    return

        except Exception as e:
            logger.error(f"Entity name search error: {e}")
491
+
492
def _progressive_constraint_search(self):
    """Run the parent's progressive search with extra steps around it.

    Order of operations: detect the entity type, optionally seed entities,
    optionally run direct property searches, run the parent's progressive
    search, and finally fall back to a name-based search when ``best_score``
    exists, is below 0.9, and entity seeds are available.
    """
    # The entity type is needed by the steps below, so detect it first.
    self.entity_type = self._detect_entity_type()
    logger.info(f"Detected entity type: {self.entity_type}")

    # Seeding only when enabled and the entity type suggests named entities.
    if self.use_entity_seeding and self._should_use_entity_seeding():
        self._perform_entity_seeding()

    if self.use_direct_property_search:
        self._try_direct_property_search()

    # Normal progressive search from the parent class.
    super()._progressive_constraint_search()

    # Still no good result: try the name-based fallback.
    needs_fallback = (
        hasattr(self, "best_score")
        and self.best_score < 0.9
        and self.entity_seeds
    )
    if needs_fallback:
        self._perform_entity_name_search()
@@ -1,18 +1,19 @@
1
1
  import concurrent.futures
2
- import logging
3
2
  from typing import Dict
4
3
 
4
+ from loguru import logger
5
+
5
6
  from ...citation_handler import CitationHandler
6
7
  from ...config.llm_config import get_llm
7
8
  from ...config.search_config import get_search
8
9
  from ...utilities.db_utils import get_db_setting
10
+ from ...utilities.threading_utils import thread_context, thread_with_app_context
9
11
  from ..filters.cross_engine_filter import CrossEngineFilter
10
12
  from ..findings.repository import FindingsRepository
13
+ from ..questions.atomic_fact_question import AtomicFactQuestionGenerator
11
14
  from ..questions.standard_question import StandardQuestionGenerator
12
15
  from .base_strategy import BaseSearchStrategy
13
16
 
14
- logger = logging.getLogger(__name__)
15
-
16
17
 
17
18
  class SourceBasedSearchStrategy(BaseSearchStrategy):
18
19
  """
@@ -31,6 +32,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
31
32
  filter_reindex: bool = True,
32
33
  cross_engine_max_results: int = None,
33
34
  all_links_of_system=None,
35
+ use_atomic_facts: bool = False,
34
36
  ):
35
37
  """Initialize with optional dependency injection for testing."""
36
38
  # Pass the links list to the parent class
@@ -61,7 +63,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
61
63
  self.citation_handler = citation_handler or CitationHandler(self.model)
62
64
 
63
65
  # Initialize components
64
- self.question_generator = StandardQuestionGenerator(self.model)
66
+ if use_atomic_facts:
67
+ self.question_generator = AtomicFactQuestionGenerator(self.model)
68
+ else:
69
+ self.question_generator = StandardQuestionGenerator(self.model)
65
70
  self.findings_repository = FindingsRepository(self.model)
66
71
 
67
72
  def _format_search_results_as_context(self, search_results):
@@ -87,9 +92,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
87
92
  Analyze a topic using source-based search strategy.
88
93
  """
89
94
  logger.info(f"Starting source-based research on topic: {query}")
90
- accumulated_search_results_across_all_iterations = (
91
- []
92
- ) # tracking links across iterations but not global
95
+ accumulated_search_results_across_all_iterations = [] # tracking links across iterations but not global
93
96
  findings = []
94
97
  total_citation_count_before_this_search = len(self.all_links_of_system)
95
98
 
@@ -120,10 +123,14 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
120
123
  iterations_to_run = int(iterations_to_run)
121
124
  try:
122
125
  filtered_search_results = []
123
- total_citation_count_before_this_search = len(self.all_links_of_system)
126
+ total_citation_count_before_this_search = len(
127
+ self.all_links_of_system
128
+ )
124
129
  # Run each iteration
125
130
  for iteration in range(1, iterations_to_run + 1):
126
- iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
131
+ iteration_progress_base = 5 + (iteration - 1) * (
132
+ 70 / iterations_to_run
133
+ )
127
134
 
128
135
  self._update_progress(
129
136
  f"Starting iteration {iteration}/{iterations_to_run}",
@@ -141,7 +148,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
141
148
  # For first iteration, use initial query
142
149
  if iteration == 1:
143
150
  # Generate questions for first iteration
144
- context = f"""Iteration: {iteration} of {iterations_to_run}"""
151
+ context = (
152
+ f"""Iteration: {iteration} of {iterations_to_run}"""
153
+ )
145
154
  questions = self.question_generator.generate_questions(
146
155
  current_knowledge=context,
147
156
  query=query,
@@ -171,7 +180,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
171
180
  elif iterations_to_run == 1:
172
181
  context = ""
173
182
  else:
174
- context = f"""Iteration: {iteration} of {iterations_to_run}"""
183
+ context = (
184
+ f"""Iteration: {iteration} of {iterations_to_run}"""
185
+ )
175
186
  # Use standard question generator with search results as context
176
187
  questions = self.question_generator.generate_questions(
177
188
  current_knowledge=context,
@@ -199,6 +210,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
199
210
  )
200
211
 
201
212
  # Function for thread pool
213
+ @thread_with_app_context
202
214
  def search_question(q):
203
215
  try:
204
216
  result = self.search.run(q)
@@ -212,7 +224,8 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
212
224
  max_workers=len(all_questions)
213
225
  ) as executor:
214
226
  futures = [
215
- executor.submit(search_question, q) for q in all_questions
227
+ executor.submit(search_question, thread_context(), q)
228
+ for q in all_questions
216
229
  ]
217
230
  iteration_search_dict = {}
218
231
  iteration_search_results = []
@@ -227,7 +240,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
227
240
  iteration_search_dict[question] = search_results
228
241
 
229
242
  self._update_progress(
230
- f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
243
+ f"Completed search {i + 1} of {len(all_questions)}: {question[:3000]}",
231
244
  iteration_progress_base
232
245
  + 10
233
246
  + ((i + 1) / len(all_questions) * 30),
@@ -245,7 +258,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
245
258
  self._update_progress(
246
259
  f"Filtering search results for iteration {iteration}",
247
260
  iteration_progress_base + 45,
248
- {"phase": "cross_engine_filtering", "iteration": iteration},
261
+ {
262
+ "phase": "cross_engine_filtering",
263
+ "iteration": iteration,
264
+ },
249
265
  )
250
266
 
251
267
  existing_link_count = len(self.all_links_of_system)
@@ -301,13 +317,17 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
301
317
  80,
302
318
  {"phase": "final_filtering"},
303
319
  )
304
- final_filtered_results = self.cross_engine_filter.filter_results(
305
- accumulated_search_results_across_all_iterations,
306
- query,
307
- reorder=True, # Always reorder in final filtering
308
- reindex=True, # Always reindex in final filtering
309
- max_results=int(get_db_setting("search.final_max_results") or 100),
310
- start_index=len(self.all_links_of_system),
320
+ final_filtered_results = (
321
+ self.cross_engine_filter.filter_results(
322
+ accumulated_search_results_across_all_iterations,
323
+ query,
324
+ reorder=True, # Always reorder in final filtering
325
+ reindex=True, # Always reindex in final filtering
326
+ max_results=int(
327
+ get_db_setting("search.final_max_results") or 100
328
+ ),
329
+ start_index=len(self.all_links_of_system),
330
+ )
311
331
  )
312
332
  self._update_progress(
313
333
  f"Filtered from {len(accumulated_search_results_across_all_iterations)} to {len(final_filtered_results)} results",
@@ -341,7 +361,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
341
361
  synthesized_content = final_citation_result["content"]
342
362
  documents = final_citation_result.get("documents", [])
343
363
  else:
344
- synthesized_content = "No relevant results found in final synthesis."
364
+ synthesized_content = (
365
+ "No relevant results found in final synthesis."
366
+ )
345
367
  documents = []
346
368
 
347
369
  # Add a final synthesis finding
@@ -363,8 +385,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
363
385
  )
364
386
 
365
387
  # Format findings
366
- formatted_findings = self.findings_repository.format_findings_to_text(
367
- findings, synthesized_content
388
+ formatted_findings = (
389
+ self.findings_repository.format_findings_to_text(
390
+ findings, synthesized_content
391
+ )
368
392
  )
369
393
 
370
394
  except Exception as e: