local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1348 @@
1
+ """
2
+ Constrained search strategy that progressively narrows down candidates.
3
+
4
+ This strategy mimics human problem-solving by:
5
+ 1. Starting with the most restrictive constraints
6
+ 2. Finding candidates that match those constraints
7
+ 3. Progressively checking additional constraints
8
+ 4. Narrowing down the candidate pool step by step
9
+ """
10
+
11
+ from datetime import datetime
12
+ from typing import Any, Dict, List
13
+
14
+ from langchain_core.language_models import BaseChatModel
15
+ from loguru import logger
16
+
17
+ from ...utilities.search_utilities import remove_think_tags
18
+ from ..candidates.base_candidate import Candidate
19
+ from ..constraints.base_constraint import Constraint, ConstraintType
20
+ from ..evidence.base_evidence import Evidence, EvidenceType
21
+ from .evidence_based_strategy import EvidenceBasedStrategy
22
+
23
+
24
+ class ConstrainedSearchStrategy(EvidenceBasedStrategy):
25
+ """
26
+ Strategy that progressively narrows down candidates using constraints.
27
+
28
+ Key approach:
29
+ 1. Rank constraints by restrictiveness
30
+ 2. Start with most restrictive constraint
31
+ 3. Find candidates matching that constraint
32
+ 4. Progressively filter using additional constraints
33
+ 5. Gather evidence only for promising candidates
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ model: BaseChatModel,
39
+ search: Any,
40
+ all_links_of_system: List[str],
41
+ max_iterations: int = 20,
42
+ confidence_threshold: float = 0.85,
43
+ candidate_limit: int = 100, # Increased to get more candidates
44
+ evidence_threshold: float = 0.6,
45
+ max_search_iterations: int = 2,
46
+ questions_per_iteration: int = 3,
47
+ min_candidates_per_stage: int = 20, # Need more candidates before filtering
48
+ ):
49
+ """Initialize the constrained search strategy."""
50
+ super().__init__(
51
+ model=model,
52
+ search=search,
53
+ all_links_of_system=all_links_of_system,
54
+ max_iterations=max_iterations,
55
+ confidence_threshold=confidence_threshold,
56
+ candidate_limit=candidate_limit,
57
+ evidence_threshold=evidence_threshold,
58
+ max_search_iterations=max_search_iterations,
59
+ questions_per_iteration=questions_per_iteration,
60
+ )
61
+
62
+ self.min_candidates_per_stage = min_candidates_per_stage
63
+ self.constraint_ranking: List[Constraint] = []
64
+ self.stage_candidates: Dict[int, List[Candidate]] = {}
65
+
66
+ # Enable direct search optimization for entity identification
67
+ # Note: parent class already sets this, but we ensure it's True
68
+ self.use_direct_search = True
69
+ logger.info(
70
+ f"ConstrainedSearchStrategy init: use_direct_search={self.use_direct_search}"
71
+ )
72
+
73
+ def analyze_topic(self, query: str) -> Dict:
74
+ """Analyze topic using progressive constraint narrowing."""
75
+ # Initialize
76
+ self.all_links_of_system.clear()
77
+ self.questions_by_iteration = []
78
+ self.findings = []
79
+ self.iteration = 0
80
+
81
+ if self.progress_callback:
82
+ self.progress_callback(
83
+ "Analyzing query to identify constraints and rank by restrictiveness",
84
+ 2,
85
+ {"phase": "initialization", "strategy": "constrained_search"},
86
+ )
87
+
88
+ # Extract and rank constraints
89
+ self.constraints = self.constraint_analyzer.extract_constraints(query)
90
+ self.constraint_ranking = self._rank_constraints_by_restrictiveness()
91
+
92
+ if self.progress_callback:
93
+ ranking_summary = ", ".join(
94
+ [
95
+ f"{i + 1}. {c.description[:30]}..."
96
+ for i, c in enumerate(self.constraint_ranking[:3])
97
+ ]
98
+ )
99
+ self.progress_callback(
100
+ f"Found {len(self.constraints)} constraints. Order: {ranking_summary}",
101
+ 5,
102
+ {
103
+ "phase": "constraint_ranking",
104
+ "constraint_count": len(self.constraints),
105
+ "ranking": [
106
+ (c.description, c.type.value)
107
+ for c in self.constraint_ranking
108
+ ],
109
+ },
110
+ )
111
+
112
+ # Add initial analysis finding
113
+ initial_finding = {
114
+ "phase": "Constraint Analysis",
115
+ "content": self._format_constraint_analysis(),
116
+ "timestamp": self._get_timestamp(),
117
+ }
118
+ self.findings.append(initial_finding)
119
+
120
+ # Progressive constraint search
121
+ self._progressive_constraint_search()
122
+
123
+ # Add search summary finding
124
+ search_finding = {
125
+ "phase": "Progressive Search Summary",
126
+ "content": self._format_search_summary(),
127
+ "timestamp": self._get_timestamp(),
128
+ }
129
+ self.findings.append(search_finding)
130
+
131
+ # Evidence gathering for final candidates
132
+ self._focused_evidence_gathering()
133
+
134
+ # Add evidence summary finding
135
+ evidence_finding = {
136
+ "phase": "Evidence Summary",
137
+ "content": self._format_evidence_summary(),
138
+ "timestamp": self._get_timestamp(),
139
+ }
140
+ self.findings.append(evidence_finding)
141
+
142
+ # Add comprehensive debug summary
143
+ debug_finding = {
144
+ "phase": "Debug Summary",
145
+ "content": self._format_debug_summary(),
146
+ "timestamp": self._get_timestamp(),
147
+ "metadata": {
148
+ "total_searches": (
149
+ len(self.search_history)
150
+ if hasattr(self, "search_history")
151
+ else 0
152
+ ),
153
+ "final_candidates": len(self.candidates),
154
+ "constraints_used": len(self.constraints),
155
+ "stages_completed": len(self.stage_candidates),
156
+ },
157
+ }
158
+ self.findings.append(debug_finding)
159
+
160
+ # Final synthesis
161
+ return self._synthesize_final_answer(query)
162
+
163
+ def _rank_constraints_by_restrictiveness(self) -> List[Constraint]:
164
+ """Rank constraints from most to least restrictive."""
165
+ # Scoring system for restrictiveness
166
+ restrictiveness_scores = []
167
+
168
+ for constraint in self.constraints:
169
+ score = 0
170
+
171
+ # Type-based scoring
172
+ if constraint.type == ConstraintType.STATISTIC:
173
+ score += 10 # Numbers are usually very restrictive
174
+ elif constraint.type == ConstraintType.EVENT:
175
+ score += 8 # Events/time periods are restrictive
176
+ elif constraint.type == ConstraintType.LOCATION:
177
+ score += 6 # Locations are moderately restrictive
178
+ elif constraint.type == ConstraintType.PROPERTY:
179
+ score += 4 # Properties are less restrictive
180
+
181
+ # Specificity scoring
182
+ if constraint.value:
183
+ # Check for specific markers
184
+ if any(char.isdigit() for char in constraint.value):
185
+ score += 5 # Contains numbers
186
+ if len(constraint.value.split()) > 3:
187
+ score += 3 # Longer, more specific
188
+ if any(
189
+ term in constraint.value.lower()
190
+ for term in ["specific", "exact", "only", "must"]
191
+ ):
192
+ score += 2 # Explicit specificity
193
+
194
+ restrictiveness_scores.append((constraint, score))
195
+
196
+ # Sort by score (highest first)
197
+ ranked = sorted(
198
+ restrictiveness_scores, key=lambda x: x[1], reverse=True
199
+ )
200
+ return [constraint for constraint, _ in ranked]
201
+
202
+ def _progressive_constraint_search(self):
203
+ """Progressively search using constraints from most to least restrictive."""
204
+ current_candidates = []
205
+
206
+ for stage, constraint in enumerate(self.constraint_ranking):
207
+ self.current_stage = stage # Track current stage for logging
208
+ if self.progress_callback:
209
+ stage_desc = f"[{constraint.type.value}] {constraint.value} ({len(current_candidates)} candidates)"
210
+ self.progress_callback(
211
+ f"Stage {stage + 1}/{len(self.constraint_ranking)}: {stage_desc}",
212
+ 10 + (stage * 15),
213
+ {
214
+ "phase": "progressive_search",
215
+ "stage": stage + 1,
216
+ "total_stages": len(self.constraint_ranking),
217
+ "constraint": constraint.description,
218
+ "constraint_type": constraint.type.value,
219
+ "constraint_value": constraint.value,
220
+ "current_candidates": len(current_candidates),
221
+ "search_intent": f"Finding entities matching: {constraint.value}",
222
+ },
223
+ )
224
+
225
+ if stage == 0:
226
+ # First stage - find initial candidates
227
+ current_candidates = self._search_with_single_constraint(
228
+ constraint
229
+ )
230
+ else:
231
+ # Subsequent stages - filter existing candidates
232
+ current_candidates = self._filter_candidates_with_constraint(
233
+ current_candidates, constraint
234
+ )
235
+
236
+ # Store stage results
237
+ self.stage_candidates[stage] = current_candidates.copy()
238
+
239
+ if self.progress_callback:
240
+ candidate_names = ", ".join(
241
+ [c.name for c in current_candidates[:3]]
242
+ )
243
+ more = (
244
+ f" (+{len(current_candidates) - 3})"
245
+ if len(current_candidates) > 3
246
+ else ""
247
+ )
248
+ change = len(current_candidates) - len(
249
+ self.stage_candidates.get(stage - 1, [])
250
+ )
251
+ change_str = f" (Δ{change:+d})" if stage > 0 else ""
252
+
253
+ self.progress_callback(
254
+ f"Stage {stage + 1} complete: {len(current_candidates)} candidates{change_str}. {candidate_names}{more}",
255
+ None,
256
+ {
257
+ "phase": "stage_complete",
258
+ "stage": stage + 1,
259
+ "candidates_found": len(current_candidates),
260
+ "candidates_delta": change if stage > 0 else 0,
261
+ "sample": [c.name for c in current_candidates[:10]],
262
+ },
263
+ )
264
+
265
+ # Add stage finding
266
+ stage_finding = {
267
+ "phase": f"Stage {stage + 1} - {constraint.type.value}",
268
+ "content": self._format_stage_results(
269
+ stage, constraint, current_candidates
270
+ ),
271
+ "timestamp": self._get_timestamp(),
272
+ }
273
+ self.findings.append(stage_finding)
274
+
275
+ # Continue applying constraints unless we have very few candidates
276
+ if len(current_candidates) <= 3:
277
+ if self.progress_callback:
278
+ self.progress_callback(
279
+ f"Too few candidates ({len(current_candidates)}) - stopping constraint application",
280
+ None,
281
+ {
282
+ "phase": "early_stop",
283
+ "candidates_remaining": len(current_candidates),
284
+ },
285
+ )
286
+ break
287
+
288
+ # Stop if no candidates remain
289
+ if not current_candidates:
290
+ # Backtrack to previous stage if possible
291
+ if stage > 0:
292
+ current_candidates = self.stage_candidates[stage - 1]
293
+ break
294
+
295
+ self.candidates = current_candidates[: self.candidate_limit]
296
+
297
+ def _search_with_single_constraint(
298
+ self, constraint: Constraint
299
+ ) -> List[Candidate]:
300
+ """Search for candidates using a single constraint."""
301
+ candidates = []
302
+
303
+ # Generate targeted queries for this constraint
304
+ queries = self._generate_constraint_specific_queries(constraint)
305
+
306
+ # Add more diverse query patterns
307
+ additional_queries = self._generate_additional_queries(constraint)
308
+ queries.extend(additional_queries)
309
+
310
+ # Diversify query execution order
311
+ import random
312
+
313
+ random.shuffle(queries)
314
+
315
+ for i, query in enumerate(queries[:20]): # Increased query limit
316
+ if self.progress_callback:
317
+ # Show query and what we're looking for
318
+ self.progress_callback(
319
+ f"Q{i + 1}/{min(20, len(queries))}: '{query}' | Found: {len(candidates)} candidates",
320
+ None,
321
+ {
322
+ "phase": "constraint_search",
323
+ "query": query,
324
+ "query_index": i + 1,
325
+ "total_queries": min(20, len(queries)),
326
+ "constraint_type": constraint.type.value,
327
+ "constraint_value": constraint.value,
328
+ "candidates_so_far": len(candidates),
329
+ "search_context": f"Stage {getattr(self, 'current_stage', 0) + 1}: {constraint.value}",
330
+ },
331
+ )
332
+
333
+ results = self._execute_search(query)
334
+
335
+ # Validate search results before extraction
336
+ if self._validate_search_results(results, constraint):
337
+ extracted = self._extract_relevant_candidates(
338
+ results, constraint
339
+ )
340
+ candidates.extend(extracted)
341
+
342
+ # Track stage information in search history
343
+ if self.search_history:
344
+ self.search_history[-1]["stage"] = getattr(
345
+ self, "current_stage", 0
346
+ )
347
+ self.search_history[-1]["results_count"] = len(extracted)
348
+ self.search_history[-1]["results_preview"] = results.get(
349
+ "current_knowledge", ""
350
+ )[:200]
351
+ else:
352
+ logger.info(f"Skipping invalid results for query: {query}")
353
+
354
+ # Continue searching to build a comprehensive list
355
+ # Don't stop too early - we want diversity
356
+ if len(candidates) >= self.candidate_limit * 2:
357
+ break
358
+
359
+ return self._deduplicate_candidates(candidates)
360
+
361
+ def _generate_additional_queries(self, constraint: Constraint) -> List[str]:
362
+ """Generate additional diverse queries for better coverage."""
363
+ queries = []
364
+ base_value = constraint.value
365
+
366
+ # Add reference source queries
367
+ queries.extend(
368
+ [
369
+ f"reference {base_value}",
370
+ f"authoritative {base_value}",
371
+ f"official {base_value}",
372
+ ]
373
+ )
374
+
375
+ # Add structured data queries
376
+ if constraint.type == ConstraintType.STATISTIC:
377
+ queries.extend(
378
+ [
379
+ f"spreadsheet {base_value}",
380
+ f"dataset {base_value}",
381
+ f"statistical analysis {base_value}",
382
+ f"quantitative {base_value}",
383
+ ]
384
+ )
385
+ elif constraint.type == ConstraintType.PROPERTY:
386
+ queries.extend(
387
+ [
388
+ f"characterized by {base_value}",
389
+ f"known for {base_value}",
390
+ f"featuring {base_value}",
391
+ ]
392
+ )
393
+ else:
394
+ # Generic comprehensive queries
395
+ queries.extend(
396
+ [
397
+ f"exhaustive {base_value}",
398
+ f"thorough {base_value}",
399
+ f"detailed {base_value}",
400
+ ]
401
+ )
402
+
403
+ return queries
404
+
405
+ def _generate_constraint_specific_queries(
406
+ self, constraint: Constraint
407
+ ) -> List[str]:
408
+ """Generate queries specific to a constraint type."""
409
+ queries = []
410
+ base_value = constraint.value
411
+
412
+ # Add context from other constraints for more targeted searches
413
+ context_parts = []
414
+ if hasattr(self, "constraints") and self.constraints:
415
+ for other_constraint in self.constraints[
416
+ :2
417
+ ]: # Use top 2 constraints for context
418
+ if other_constraint.id != constraint.id:
419
+ context_parts.append(other_constraint.value)
420
+
421
+ # Base queries using the constraint description
422
+ if hasattr(constraint, "description") and constraint.description:
423
+ queries.append(constraint.description)
424
+ if context_parts:
425
+ queries.append(f"{constraint.description} {context_parts[0]}")
426
+
427
+ # Type-specific patterns
428
+ if constraint.type == ConstraintType.STATISTIC:
429
+ # Numeric constraints - look for quantitative information
430
+ queries.extend(
431
+ [
432
+ f"list {base_value}",
433
+ f"complete {base_value}",
434
+ f"all {base_value}",
435
+ f"comprehensive {base_value}",
436
+ f"database {base_value}",
437
+ f"statistics {base_value}",
438
+ f"data {base_value}",
439
+ f"comparison {base_value}",
440
+ ]
441
+ )
442
+
443
+ elif (
444
+ constraint.type == ConstraintType.EVENT
445
+ or hasattr(constraint.type, "value")
446
+ and constraint.type.value == "temporal"
447
+ ):
448
+ # Time-based constraints
449
+ queries.extend(
450
+ [
451
+ f"during {base_value}",
452
+ f"in {base_value}",
453
+ f"list {base_value}",
454
+ f"comprehensive {base_value}",
455
+ f"all from {base_value}",
456
+ f"complete list {base_value}",
457
+ f"history {base_value}",
458
+ f"timeline {base_value}",
459
+ ]
460
+ )
461
+
462
+ elif constraint.type == ConstraintType.PROPERTY:
463
+ # Property constraints - characteristics and attributes
464
+ queries.extend(
465
+ [
466
+ f"with {base_value}",
467
+ f"having {base_value}",
468
+ f"characterized by {base_value}",
469
+ f"examples {base_value}",
470
+ f"instances {base_value}",
471
+ f"who {base_value}",
472
+ f"which {base_value}",
473
+ f"known for {base_value}",
474
+ ]
475
+ )
476
+ else:
477
+ # Generic queries
478
+ queries.extend(
479
+ [
480
+ f"{base_value}",
481
+ f"list {base_value}",
482
+ f"examples {base_value}",
483
+ f"all {base_value}",
484
+ f"complete {base_value}",
485
+ ]
486
+ )
487
+
488
+ # Add combined queries with other constraints
489
+ if context_parts:
490
+ queries.extend(
491
+ [
492
+ f"{base_value} {context_parts[0]}",
493
+ f"list {base_value} with {context_parts[0]}",
494
+ f"{base_value} and {context_parts[0]}",
495
+ ]
496
+ )
497
+
498
+ return queries
499
+
500
+ def _filter_candidates_with_constraint(
501
+ self, candidates: List[Candidate], constraint: Constraint
502
+ ) -> List[Candidate]:
503
+ """Filter existing candidates using an additional constraint."""
504
+ filtered = []
505
+
506
+ for candidate in candidates:
507
+ # Check if candidate matches the constraint
508
+ query = f"{candidate.name} {constraint.value}"
509
+
510
+ results = self._execute_search(query)
511
+
512
+ # Quick evidence check
513
+ evidence = self._quick_evidence_check(
514
+ results, candidate, constraint
515
+ )
516
+
517
+ if evidence.confidence >= 0.5: # Lower threshold for filtering
518
+ candidate.add_evidence(constraint.id, evidence)
519
+ filtered.append(candidate)
520
+
521
+ return filtered
522
+
523
    def _extract_relevant_candidates(
        self, results: Dict, constraint: Constraint
    ) -> List[Candidate]:
        """Extract candidate entities relevant to a specific constraint.

        Feeds the search-result text to the LLM with an extraction prompt,
        then filters the LLM's line-per-name output through several lexical
        heuristics (dedup, meta-commentary rejection, length/format checks)
        before wrapping survivors in ``Candidate`` objects.

        Args:
            results: Search output dict; only ``current_knowledge`` is read.
            constraint: Constraint whose description/type/value shape the prompt.

        Returns:
            Up to 50 unique ``Candidate`` objects; empty list on missing
            content or any extraction error (errors are logged, not raised).
        """
        content = results.get("current_knowledge", "")

        # Bail out early on empty or error-bearing content; the markers
        # checked here match the strings produced by the search helpers.
        if not content or "Error" in content or "No results found" in content:
            logger.warning(
                f"No valid content to extract candidates from for constraint: {constraint.description}"
            )
            return []

        # Entity type (e.g. "person", "show") set elsewhere on the strategy;
        # falls back to the generic word "entity" if never configured.
        entity_type = getattr(self, "entity_type", "entity")

        # Use LLM to extract entities matching the constraint
        prompt = f"""Analyze these search results and extract {entity_type} names that could satisfy this constraint:

Constraint: {constraint.description}
Type: {constraint.type.value}
Value: {constraint.value}

Search Results:
{content}

Your task:
1. Understand what the constraint is asking for
2. Identify mentions of specific {entity_type} names in the search results
3. Extract only those names that could potentially satisfy the constraint
4. Focus on proper nouns and specific names

Important:
- Extract actual {entity_type} names, not descriptions or categories
- If the search results mention a specific {entity_type} that matches the constraint criteria, extract it
- Be thorough - don't miss names that are mentioned in passing
- Consider the context to determine if a name is relevant to the constraint

Return one {entity_type} name per line. Only include names that could satisfy the constraint."""

        try:
            response = self.model.invoke(prompt)
            # remove_think_tags strips model "thinking" markup before parsing.
            extracted_text = remove_think_tags(response.content)

            candidates = []
            seen_names = set()  # Track unique names (lowercased)

            # The prompt asks for one name per line; parse line by line.
            for line in extracted_text.strip().split("\n"):
                name = line.strip()
                # Remove common list markers (bullets, numbering) and clean up
                name = name.lstrip("- •·*0123456789.").strip()

                # Skip empty lines or very short names
                if not name or len(name) <= 2:
                    continue

                # Normalize for case-insensitive deduplication
                normalized_name = name.lower()
                if normalized_name in seen_names:
                    continue

                # Phrases that indicate the LLM is talking ABOUT the results
                # rather than naming an entity.
                exclude_patterns = [
                    "search result",
                    "based on",
                    "provided",
                    "found",
                    "does not",
                    "doesn't",
                    "cannot",
                    "there are no",
                    "according to",
                    "mentions",
                    "discusses",
                    "shows that",
                    "indicates",
                    "suggests",
                    "appears",
                    "seems",
                    "search",
                    "constraint",
                    "extract",
                    "entity",
                ]

                # Check if it's meta-commentary
                is_meta = any(
                    pattern in name.lower() for pattern in exclude_patterns
                )
                is_too_long = (
                    len(name.split()) > 10
                )  # Very long strings are usually explanations
                is_sentence = name.endswith(".") and len(name.split()) > 5

                if not is_meta and not is_too_long and not is_sentence:
                    # Accept various name formats: anything that looks like a
                    # proper noun, code/number-bearing name, punctuated title,
                    # or a short phrase.
                    if (
                        name[0].isupper()  # Capitalized
                        or any(c.isupper() for c in name)  # Has capitals
                        or any(c.isdigit() for c in name)  # Contains numbers
                        or any(
                            c in name
                            for c in ["-", "&", "/", ":", "(", ")", '"', "'"]
                        )  # Special chars
                        or len(name.split()) <= 6
                    ):  # Reasonable length phrases
                        candidate = Candidate(name=name)
                        candidates.append(candidate)
                        seen_names.add(normalized_name)

            # Log extraction results for debugging
            logger.info(
                f"Extracted {len(candidates)} candidates for constraint: {constraint.description}"
            )
            if candidates:
                logger.debug(
                    f"Sample candidates: {[c.name for c in candidates[:5]]}"
                )

            return candidates[:50]  # Limit per search

        except Exception as e:
            # Extraction is best-effort: log the failure and return no
            # candidates rather than aborting the whole search stage.
            logger.error(f"Error extracting candidates: {e}")
            import traceback

            logger.error(traceback.format_exc())
            return []
650
+
651
+ def _quick_evidence_check(
652
+ self, results: Dict, candidate: Candidate, constraint: Constraint
653
+ ) -> Evidence:
654
+ """Quick evidence check for filtering with enhanced scoring."""
655
+ content = results.get("current_knowledge", "")
656
+ search_results = results.get("search_results", [])
657
+
658
+ # Initialize confidence components
659
+ name_presence = 0.0
660
+ constraint_presence = 0.0
661
+ co_occurrence = 0.0
662
+ context_quality = 0.0
663
+
664
+ candidate_lower = candidate.name.lower()
665
+ value_lower = constraint.value.lower()
666
+ content_lower = content.lower()
667
+
668
+ # Check candidate name presence
669
+ if candidate_lower in content_lower:
670
+ name_count = content_lower.count(candidate_lower)
671
+ name_presence = min(
672
+ 0.3 + (name_count * 0.05), 0.4
673
+ ) # More occurrences = higher confidence
674
+
675
+ # Check constraint value presence
676
+ if value_lower in content_lower:
677
+ value_count = content_lower.count(value_lower)
678
+ constraint_presence = min(0.3 + (value_count * 0.05), 0.4)
679
+
680
+ # Check co-occurrence and proximity
681
+ if name_presence > 0 and constraint_presence > 0:
682
+ # Find all positions
683
+ name_positions = []
684
+ start = 0
685
+ while start < len(content_lower):
686
+ pos = content_lower.find(candidate_lower, start)
687
+ if pos == -1:
688
+ break
689
+ name_positions.append(pos)
690
+ start = pos + 1
691
+
692
+ value_positions = []
693
+ start = 0
694
+ while start < len(content_lower):
695
+ pos = content_lower.find(value_lower, start)
696
+ if pos == -1:
697
+ break
698
+ value_positions.append(pos)
699
+ start = pos + 1
700
+
701
+ # Calculate minimum distance
702
+ if name_positions and value_positions:
703
+ min_distance = min(
704
+ abs(n - v) for n in name_positions for v in value_positions
705
+ )
706
+
707
+ if min_distance < 100: # Very close proximity
708
+ co_occurrence = 0.2
709
+ elif min_distance < 200: # Close proximity
710
+ co_occurrence = 0.15
711
+ elif min_distance < 500: # Moderate proximity
712
+ co_occurrence = 0.1
713
+ else: # Same document
714
+ co_occurrence = 0.05
715
+
716
+ # Check result quality
717
+ if search_results:
718
+ # Count how many results mention both candidate and constraint
719
+ relevant_results = 0
720
+ for result in search_results[:10]:
721
+ title = result.get("title", "").lower()
722
+ snippet = result.get("snippet", "").lower()
723
+
724
+ if (
725
+ candidate_lower in title or candidate_lower in snippet
726
+ ) and (value_lower in title or value_lower in snippet):
727
+ relevant_results += 1
728
+
729
+ context_quality = min(relevant_results * 0.05, 0.2)
730
+
731
+ # Calculate final confidence
732
+ confidence = (
733
+ name_presence
734
+ + constraint_presence
735
+ + co_occurrence
736
+ + context_quality
737
+ )
738
+
739
+ # Apply constraint type weight
740
+ if constraint.type == ConstraintType.STATISTIC:
741
+ confidence *= 1.1 # Numeric constraints need precise matching
742
+ elif constraint.type == ConstraintType.PROPERTY:
743
+ confidence *= 0.95 # Properties can be more flexible
744
+
745
+ return Evidence(
746
+ claim=f"Evidence for {candidate.name} matching {constraint.description}",
747
+ confidence=min(confidence, 1.0),
748
+ type=EvidenceType.INFERENCE,
749
+ source="quick_evidence_check",
750
+ metadata={
751
+ "name_presence": name_presence,
752
+ "constraint_presence": constraint_presence,
753
+ "co_occurrence": co_occurrence,
754
+ "context_quality": context_quality,
755
+ },
756
+ )
757
+
758
+ def _focused_evidence_gathering(self):
759
+ """Gather detailed evidence for the narrowed candidates."""
760
+ if self.progress_callback:
761
+ constraint_count = len(self.constraints)
762
+ evidence_needed = len(self.candidates) * constraint_count
763
+ self.progress_callback(
764
+ f"Verifying {len(self.candidates)} candidates against {constraint_count} constraints ({evidence_needed} checks)",
765
+ 80,
766
+ {
767
+ "phase": "evidence_gathering",
768
+ "candidate_count": len(self.candidates),
769
+ "constraint_count": constraint_count,
770
+ "total_evidence_needed": evidence_needed,
771
+ },
772
+ )
773
+
774
+ for i, candidate in enumerate(self.candidates):
775
+ for j, constraint in enumerate(self.constraints):
776
+ # Skip if we already have evidence from filtering
777
+ if constraint.id in candidate.evidence:
778
+ continue
779
+
780
+ # Detailed evidence search
781
+ query = f'"{candidate.name}" {constraint.value} verification'
782
+ results = self._execute_search(query)
783
+
784
+ evidence = self.evidence_evaluator.extract_evidence(
785
+ results.get("current_knowledge", ""),
786
+ candidate.name,
787
+ constraint,
788
+ )
789
+
790
+ candidate.add_evidence(constraint.id, evidence)
791
+
792
+ if (
793
+ self.progress_callback and i < 5
794
+ ): # Report progress for top candidates
795
+ conf_emoji = (
796
+ "✓"
797
+ if evidence.confidence >= self.evidence_threshold
798
+ else "○"
799
+ )
800
+ self.progress_callback(
801
+ f"{conf_emoji} {candidate.name} | {constraint.type.value}: {evidence.confidence:.0%}",
802
+ None,
803
+ {
804
+ "phase": "evidence_detail",
805
+ "candidate": candidate.name,
806
+ "constraint": constraint.description,
807
+ "constraint_type": constraint.type.value,
808
+ "confidence": evidence.confidence,
809
+ "evidence_type": evidence.type.value,
810
+ "meets_threshold": evidence.confidence
811
+ >= self.evidence_threshold,
812
+ },
813
+ )
814
+
815
+ # Final scoring
816
+ for candidate in self.candidates:
817
+ candidate.calculate_score(self.constraints)
818
+
819
+ # Sort by score
820
+ self.candidates.sort(key=lambda c: c.score, reverse=True)
821
+
822
+ def _deduplicate_candidates(
823
+ self, candidates: List[Candidate]
824
+ ) -> List[Candidate]:
825
+ """Remove duplicate candidates."""
826
+ seen = {}
827
+ unique = []
828
+
829
+ for candidate in candidates:
830
+ key = candidate.name.lower().strip()
831
+ if key not in seen:
832
+ seen[key] = candidate
833
+ unique.append(candidate)
834
+
835
+ return unique
836
+
837
+ def _format_constraint_analysis(self) -> str:
838
+ """Format initial constraint analysis."""
839
+ analysis = "**Query Constraint Analysis**\n\n"
840
+ analysis += f"Total constraints identified: {len(self.constraints)}\n\n"
841
+ analysis += "**Constraint Ranking (by restrictiveness):**\n"
842
+
843
+ for i, constraint in enumerate(self.constraint_ranking):
844
+ score = self._calculate_restrictiveness_score(constraint)
845
+ analysis += (
846
+ f"{i + 1}. [{constraint.type.value}] {constraint.description}\n"
847
+ )
848
+ analysis += f" Restrictiveness score: {score}\n"
849
+ analysis += f" Value: {constraint.value}\n\n"
850
+
851
+ return analysis
852
+
853
+ def _format_debug_summary(self) -> str:
854
+ """Format comprehensive debug summary."""
855
+ summary = "**Debug Summary**\n\n"
856
+
857
+ # Constraint analysis
858
+ summary += "**Constraint Processing:**\n"
859
+ for i, constraint in enumerate(self.constraint_ranking):
860
+ score = self._calculate_restrictiveness_score(constraint)
861
+ summary += f"{i + 1}. [{constraint.type.value}] {constraint.value} (score: {score})\n"
862
+
863
+ # Search progression
864
+ summary += "\n**Search Progression:**\n"
865
+ if hasattr(self, "stage_candidates"):
866
+ for stage, candidates in self.stage_candidates.items():
867
+ summary += f"Stage {stage + 1}: {len(candidates)} candidates\n"
868
+
869
+ # Evidence coverage
870
+ summary += "\n**Evidence Coverage:**\n"
871
+
872
+ for i, candidate in enumerate(self.candidates[:5]):
873
+ evidence_count = len(candidate.evidence)
874
+ satisfied = sum(
875
+ 1
876
+ for c in self.constraints
877
+ if c.id in candidate.evidence
878
+ and candidate.evidence[c.id].confidence
879
+ >= self.evidence_threshold
880
+ )
881
+
882
+ summary += f"{i + 1}. {candidate.name}: {evidence_count} evidence, "
883
+ summary += f"{satisfied}/{len(self.constraints)} constraints\n"
884
+
885
+ # Search statistics
886
+ summary += "\n**Search Statistics:**\n"
887
+ total_discovered = (
888
+ sum(len(c) for c in self.stage_candidates.values())
889
+ if hasattr(self, "stage_candidates")
890
+ else 0
891
+ )
892
+ summary += f"Total candidates discovered: {total_discovered}\n"
893
+ summary += f"Final candidates: {len(self.candidates)}\n"
894
+ summary += f"Constraints: {len(self.constraints)}\n"
895
+
896
+ return summary
897
+
898
+ def _calculate_restrictiveness_score(self, constraint: Constraint) -> int:
899
+ """Calculate restrictiveness score for a constraint."""
900
+ score = 0
901
+
902
+ # Type-based scoring
903
+ if constraint.type == ConstraintType.STATISTIC:
904
+ score += 10
905
+ elif constraint.type == ConstraintType.EVENT:
906
+ score += 8
907
+ elif constraint.type == ConstraintType.LOCATION:
908
+ score += 6
909
+ elif constraint.type == ConstraintType.PROPERTY:
910
+ score += 4
911
+
912
+ # Specificity scoring
913
+ if constraint.value:
914
+ if any(char.isdigit() for char in constraint.value):
915
+ score += 5
916
+ if len(constraint.value.split()) > 3:
917
+ score += 3
918
+ if any(
919
+ term in constraint.value.lower()
920
+ for term in ["specific", "exact", "only", "must"]
921
+ ):
922
+ score += 2
923
+
924
+ return score
925
+
926
+ def _format_stage_results(
927
+ self, stage: int, constraint: Constraint, candidates: List[Candidate]
928
+ ) -> str:
929
+ """Format results for a search stage with detailed information."""
930
+ result = f"**Search Stage {stage + 1}**\n\n"
931
+ result += f"Constraint: {constraint.description}\n"
932
+ result += f"Type: {constraint.type.value}\n"
933
+ result += f"Search Value: {constraint.value}\n"
934
+ result += f"Candidates found: {len(candidates)}\n\n"
935
+
936
+ # Add search statistics
937
+ result += "**Search Statistics:**\n"
938
+ if hasattr(self, "search_history"):
939
+ stage_searches = [
940
+ s for s in self.search_history if s.get("stage", -1) == stage
941
+ ]
942
+ result += f"- Queries executed: {len(stage_searches)}\n"
943
+ result += f"- Total results analyzed: {getattr(self, f'stage_{stage}_results_count', 0)}\n"
944
+
945
+ result += f"- Candidates before filtering: {getattr(self, f'stage_{stage}_raw_candidates', len(candidates))}\n"
946
+ result += f"- Candidates after deduplication: {len(candidates)}\n\n"
947
+
948
+ if candidates:
949
+ result += "**Top Candidates:**\n"
950
+ # Group candidates to show variety
951
+ grouped = self._group_similar_candidates(candidates[:20])
952
+ for group_name, group_items in grouped.items():
953
+ result += f"\n{group_name} ({len(group_items)} items):\n"
954
+ for i, candidate in enumerate(group_items[:5]):
955
+ result += f" {i + 1}. {candidate.name}\n"
956
+ if len(group_items) > 5:
957
+ result += f" ... and {len(group_items) - 5} more\n"
958
+ else:
959
+ result += "No candidates found for this constraint.\n"
960
+
961
+ # Add sample search results for debugging
962
+ if hasattr(self, "search_history") and candidates:
963
+ result += "\n**Sample Search Results:**\n"
964
+ recent_searches = [
965
+ s
966
+ for s in self.search_history[-3:]
967
+ if s.get("stage", -1) == stage
968
+ ]
969
+ for search in recent_searches[:2]:
970
+ result += f"- Query: '{search.get('query', '')}'\n"
971
+ if "results_preview" in search:
972
+ result += (
973
+ f" Preview: {search['results_preview'][:100]}...\n"
974
+ )
975
+
976
+ return result
977
+
978
+ def _format_search_summary(self) -> str:
979
+ """Format progressive search summary."""
980
+ summary = "**Progressive Search Summary**\n\n"
981
+
982
+ # Show search progression
983
+ summary += "**Stage-by-Stage Filtering:**\n"
984
+ prev_count = 0
985
+
986
+ for stage, candidates in self.stage_candidates.items():
987
+ constraint = (
988
+ self.constraint_ranking[stage]
989
+ if stage < len(self.constraint_ranking)
990
+ else None
991
+ )
992
+ if constraint:
993
+ count = len(candidates)
994
+ change = count - prev_count if stage > 0 else count
995
+ change_str = f" ({change:+d})" if stage > 0 else ""
996
+
997
+ summary += f"\nStage {stage + 1} [{constraint.type.value}]: {constraint.value[:40]}\n"
998
+ summary += f" Results: {count} candidates{change_str}\n"
999
+
1000
+ if candidates:
1001
+ # Group candidates by type
1002
+ grouped = self._group_similar_candidates(candidates[:20])
1003
+ for group_name, group_items in grouped.items():
1004
+ summary += f" {group_name}: {len(group_items)} items\n"
1005
+ for item in group_items[:3]:
1006
+ summary += f" • {item.name}\n"
1007
+ if len(group_items) > 3:
1008
+ summary += (
1009
+ f" ... and {len(group_items) - 3} more\n"
1010
+ )
1011
+
1012
+ prev_count = count
1013
+
1014
+ summary += (
1015
+ f"\n**Final Result: {len(self.candidates)} candidates selected**\n"
1016
+ )
1017
+
1018
+ return summary
1019
+
1020
+ def _format_evidence_summary(self) -> str:
1021
+ """Format evidence gathering summary."""
1022
+ summary = "**Evidence Gathering Summary**\n\n"
1023
+
1024
+ for i, candidate in enumerate(self.candidates[:5]):
1025
+ summary += f"**{i + 1}. {candidate.name}**\n"
1026
+
1027
+ for constraint in self.constraints:
1028
+ evidence = candidate.evidence.get(constraint.id)
1029
+ if evidence:
1030
+ conf_str = f"{evidence.confidence:.0%}"
1031
+ summary += (
1032
+ f" • {constraint.description[:40]}...: {conf_str}\n"
1033
+ )
1034
+ else:
1035
+ summary += (
1036
+ f" • {constraint.description[:40]}...: No evidence\n"
1037
+ )
1038
+
1039
+ summary += f" Overall Score: {candidate.score:.2f}\n\n"
1040
+
1041
+ return summary
1042
+
1043
    # NOTE: The _execute_search override below is intentionally disabled
    # (kept as an inert string literal) so the parent's optimized
    # _execute_search is used instead. TODO: delete this dead block once the
    # parent implementation is confirmed stable.
1044
+ '''def _execute_search(self, search_query: str) -> Dict:
1045
+ """Execute a comprehensive search using source-based strategy for complex queries."""
1046
+ if not hasattr(self, "search_history"):
1047
+ self.search_history = []
1048
+
1049
+ self.search_history.append(
1050
+ {
1051
+ "query": search_query,
1052
+ "timestamp": self._get_timestamp(),
1053
+ "iteration": getattr(self, "iteration", 0),
1054
+ }
1055
+ )
1056
+
1057
+ # Debug: Check if search engine is available
1058
+ if not hasattr(self, "search") or self.search is None:
1059
+ logger.error(f"No search engine configured for query: {search_query}")
1060
+ logger.error(f"Strategy attributes: {list(self.__dict__.keys())}")
1061
+ return {"current_knowledge": "", "search_results": []}
1062
+
1063
+ try:
1064
+ # Log that we're attempting to use source-based strategy
1065
+ logger.info(f"Attempting source-based search for: {search_query}")
1066
+
1067
+ # For complex queries, use source-based strategy with multiple iterations
1068
+ if hasattr(self, "source_strategy"):
1069
+ source_strategy = self.source_strategy
1070
+ else:
1071
+ logger.info("Creating new SourceBasedSearchStrategy instance")
1072
+ source_strategy = SourceBasedSearchStrategy(
1073
+ model=self.model,
1074
+ search=self.search,
1075
+ all_links_of_system=self.all_links_of_system,
1076
+ include_text_content=True,
1077
+ use_cross_engine_filter=False, # We'll handle filtering ourselves
1078
+ use_atomic_facts=False,
1079
+ )
1080
+ source_strategy.max_iterations = (
1081
+ 1 # More efficient with single iteration
1082
+ )
1083
+ source_strategy.questions_per_iteration = (
1084
+ 9 # More questions for broader coverage
1085
+ )
1086
+
1087
+ # Use source-based strategy for complex search
1088
+ try:
1089
+ # Set a simple progress callback if we have one
1090
+ if self.progress_callback:
1091
+
1092
+ def sub_callback(msg, prog, data):
1093
+ # Don't propagate all sub-progress updates
1094
+ if "phase" in data and data["phase"] in [
1095
+ "search_complete",
1096
+ "final_results",
1097
+ ]:
1098
+ self.progress_callback(f"Sub-search: {msg}", None, data)
1099
+
1100
+ source_strategy.set_progress_callback(sub_callback)
1101
+
1102
+ logger.info("Executing source-based search...")
1103
+ # Run the search
1104
+ result = source_strategy.analyze_topic(search_query)
1105
+
1106
+ if (
1107
+ result
1108
+ and "current_knowledge" in result
1109
+ and "all_links_of_system" in result
1110
+ ):
1111
+ search_results = result.get("all_links_of_system", [])
1112
+
1113
+ # Extract the most relevant information from the findings
1114
+ knowledge_parts = []
1115
+ if "findings" in result:
1116
+ for finding in result["findings"]:
1117
+ if "content" in finding and finding["content"]:
1118
+ knowledge_parts.append(finding["content"])
1119
+
1120
+ # Also include search results summaries
1121
+ for i, link in enumerate(search_results[:15]): # More results
1122
+ if isinstance(link, dict):
1123
+ title = link.get("title", "")
1124
+ snippet = link.get("snippet", "")
1125
+ content = link.get("content", "")
1126
+ url = link.get("link", link.get("url", ""))
1127
+
1128
+ if title or snippet:
1129
+ result_text = f"\nResult {i+1}: {title}"
1130
+ if url:
1131
+ result_text += f"\nURL: {url}"
1132
+ if snippet:
1133
+ result_text += f"\nSnippet: {snippet}"
1134
+ if content and content != snippet:
1135
+ result_text += f"\nContent: {content[:500]}..."
1136
+ knowledge_parts.append(result_text)
1137
+
1138
+ current_knowledge = "\n\n".join(knowledge_parts)
1139
+
1140
+ return {
1141
+ "current_knowledge": current_knowledge,
1142
+ "search_results": search_results,
1143
+ "detailed_findings": result.get("findings", []),
1144
+ }
1145
+ else:
1146
+ # Fallback to simple search
1147
+ logger.warning(
1148
+ "Source-based search returned empty results, falling back to simple search"
1149
+ )
1150
+ return self._simple_search(search_query)
1151
+
1152
+ except Exception as e:
1153
+ logger.error(f"Source-based search failed with error: {e}")
1154
+ logger.error(f"Error type: {type(e).__name__}")
1155
+ import traceback
1156
+
1157
+ logger.error(f"Traceback: {traceback.format_exc()}")
1158
+ logger.warning("Falling back to simple search")
1159
+ return self._simple_search(search_query)
1160
+
1161
+ except Exception as e:
1162
+ logger.error(f"Error during search for '{search_query}': {str(e)}")
1163
+ return {
1164
+ "current_knowledge": f"Error during search: {str(e)}",
1165
+ "search_results": [],
1166
+ }'''
1167
+
1168
+ def _simple_search(self, search_query: str) -> Dict:
1169
+ """Fallback simple search using search engine directly."""
1170
+ try:
1171
+ # Use the search engine directly for simple queries
1172
+ search_results = self.search.run(search_query)
1173
+
1174
+ if search_results and isinstance(search_results, list):
1175
+ # Format search results into a knowledge string
1176
+ content_parts = []
1177
+
1178
+ for i, result in enumerate(search_results[:15]): # More results
1179
+ title = result.get("title", "Untitled")
1180
+ snippet = result.get("snippet", "")
1181
+ content = result.get("content", "")
1182
+ url = result.get("link", result.get("url", ""))
1183
+
1184
+ content_parts.append(f"Result {i + 1}: {title}")
1185
+ if url:
1186
+ content_parts.append(f"URL: {url}")
1187
+ if snippet:
1188
+ content_parts.append(f"Snippet: {snippet}")
1189
+ if content and content != snippet:
1190
+ content_parts.append(
1191
+ f"Content preview: {content[:300]}..."
1192
+ )
1193
+ content_parts.append("") # Empty line between results
1194
+
1195
+ current_knowledge = "\n".join(content_parts)
1196
+
1197
+ return {
1198
+ "current_knowledge": current_knowledge,
1199
+ "search_results": search_results,
1200
+ }
1201
+ else:
1202
+ # Return empty knowledge if no results
1203
+ return {
1204
+ "current_knowledge": f"No results found for query: {search_query}",
1205
+ "search_results": [],
1206
+ }
1207
+ except Exception as e:
1208
+ logger.error(f"Simple search error: {e}")
1209
+ return {
1210
+ "current_knowledge": f"Search error: {str(e)}",
1211
+ "search_results": [],
1212
+ }
1213
+
1214
    def _validate_search_results(
        self, results: Dict, constraint: Constraint
    ) -> bool:
        """Validate that search results contain information relevant to *constraint*.

        Applies a chain of cheap checks in order: non-empty/non-error content,
        constraint-term relevance, and minimal search-result quality. Returns
        False (with a debug log explaining why) on the first failed check.
        """
        if not results:
            return False

        content = results.get("current_knowledge", "")
        search_results = results.get("search_results", [])

        # Basic validation checks on the aggregated text
        if not content or len(content) < 50:  # Too short to be meaningful
            logger.debug(f"Content too short: {len(content)} characters")
            return False

        # A short body containing "Error" is assumed to be an error message,
        # not real content (longer bodies may merely mention the word).
        if "Error" in content and len(content) < 100:
            logger.debug(f"Error in results: {content[:100]}")
            return False

        if "No results found" in content:
            logger.debug("No results found")
            return False

        # For stats/numeric constraints, check for related terms
        if constraint.type == ConstraintType.STATISTIC:
            # NOTE(review): these hard-coded TV/show terms look like a
            # leftover from a specific benchmark query — presumably statistic
            # constraints were only exercised against TV-show questions.
            # Confirm whether this list should be generalized.
            relevant_terms = [
                "tv",
                "show",
                "series",
                "episode",
                "season",
                "program",
                "character",
                "fiction",
            ]
            content_lower = content.lower()

            term_found = any(term in content_lower for term in relevant_terms)
            if not term_found:
                logger.debug(
                    "No relevant TV/show terms found for statistic constraint"
                )
                return False
        else:
            # Check for relevance using the constraint's own meaningful terms
            # (short words and common stopwords are excluded).
            constraint_terms = [
                term
                for term in constraint.value.lower().split()
                if len(term) > 2
                and term
                not in ["the", "and", "with", "for", "had", "his", "her"]
            ]
            content_lower = content.lower()

            # Count how many meaningful terms appear
            if constraint_terms:
                term_matches = sum(
                    1 for term in constraint_terms if term in content_lower
                )
                relevance_ratio = term_matches / len(constraint_terms)

                # Require at least ~one in five terms to match
                if relevance_ratio < 0.2:
                    logger.debug(
                        f"Low relevance: {relevance_ratio:.0%} term matches"
                    )
                    return False

        # Check search results quality: at least one dict result must carry
        # a title or snippet.
        if search_results and isinstance(search_results, list):
            valid_results = sum(
                1
                for r in search_results
                if isinstance(r, dict) and (r.get("title") or r.get("snippet"))
            )
            if valid_results < 1:
                logger.debug("No valid search results with title/snippet")
                return False

        return True
1295
+
1296
+ def _get_timestamp(self) -> str:
1297
+ """Get current timestamp."""
1298
+ return datetime.utcnow().isoformat()
1299
+
1300
+ def _group_similar_candidates(
1301
+ self, candidates: List[Candidate]
1302
+ ) -> Dict[str, List[Candidate]]:
1303
+ """Group candidates by similar characteristics."""
1304
+ grouped = {}
1305
+
1306
+ for candidate in candidates:
1307
+ # Try to determine group type based on name patterns
1308
+ name = candidate.name.lower()
1309
+
1310
+ if any(
1311
+ keyword in name
1312
+ for keyword in ["model", "llm", "gpt", "claude", "gemini"]
1313
+ ):
1314
+ group = "AI Models"
1315
+ elif any(
1316
+ keyword in name
1317
+ for keyword in ["country", "nation", "republic", "kingdom"]
1318
+ ):
1319
+ group = "Countries"
1320
+ elif any(
1321
+ keyword in name for keyword in ["city", "town", "village"]
1322
+ ):
1323
+ group = "Cities"
1324
+ elif any(
1325
+ keyword in name for keyword in ["year", "century", "decade"]
1326
+ ):
1327
+ group = "Time Periods"
1328
+ elif any(
1329
+ keyword in name
1330
+ for keyword in ["person", "mr", "ms", "dr", "prof"]
1331
+ ):
1332
+ group = "People"
1333
+ elif any(c.isdigit() for c in name):
1334
+ group = "Numeric Items"
1335
+ else:
1336
+ # Default grouping based on first word
1337
+ first_word = (
1338
+ candidate.name.split()[0]
1339
+ if candidate.name.split()
1340
+ else "Other"
1341
+ )
1342
+ group = f"{first_word} Items"
1343
+
1344
+ if group not in grouped:
1345
+ grouped[group] = []
1346
+ grouped[group].append(candidate)
1347
+
1348
+ return grouped