local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,287 @@
1
+ """
2
+ BrowseComp-specific question generation that creates progressive, entity-focused searches.
3
+ """
4
+
5
+ import logging
6
+ import re
7
+ from typing import Dict, List
8
+
9
+ from .base_question import BaseQuestionGenerator
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class BrowseCompQuestionGenerator(BaseQuestionGenerator):
    """
    Question generator optimized for BrowseComp-style queries.

    Key features:
    1. Extract concrete entities (dates, numbers, names, places)
    2. Generate progressive search combinations
    3. Start broad, then narrow systematically
    4. Focus on verifiable facts

    The entities extracted from the query are cached on the instance in
    ``extracted_entities`` and reused by later iterations.
    """

    # Values that mean "nothing here": an explicit "None"/"N/A" answer, or the
    # trailing "..." echoed back from the response-format template.
    _PLACEHOLDER_VALUES = frozenset({"none", "n/a", "..."})

    def __init__(self, model):
        """
        Args:
            model: LLM handle forwarded to the base class; this class calls
                ``self.model.invoke(prompt)`` on it.
        """
        super().__init__(model)
        # Category name -> list of entity strings; filled on first use.
        self.extracted_entities = {}
        # NOTE(review): not referenced by any method of this class.
        self.search_progression = []

    def generate_questions(
        self,
        current_knowledge: str,
        query: str,
        questions_per_iteration: int = 5,
        questions_by_iteration: dict = None,
        iteration: int = 1,
    ) -> List[str]:
        """Generate progressive search queries for BrowseComp problems.

        Args:
            current_knowledge: Findings so far; only used from the second
                iteration on (first 1500 characters go into the prompt).
            query: The original user query.
            questions_per_iteration: Maximum number of searches to return.
            questions_by_iteration: Mapping of iteration -> list of earlier
                searches. ``None`` is treated as empty.
            iteration: 1-based iteration counter.

        Returns:
            At most ``questions_per_iteration`` search strings. Fewer may be
            returned when not enough distinct searches can be produced.
        """
        questions_by_iteration = questions_by_iteration or {}

        # First iteration (or nothing cached yet): extract entities and
        # start with broad searches.
        if iteration == 1 or not self.extracted_entities:
            self.extracted_entities = self._extract_entities(query)
            return self._generate_initial_searches(
                query, self.extracted_entities, questions_per_iteration
            )

        # Subsequent iterations: progressive refinement
        return self._generate_progressive_searches(
            query,
            current_knowledge,
            self.extracted_entities,
            questions_by_iteration,
            questions_per_iteration,
            iteration,
        )

    def _extract_entities(self, query: str) -> Dict[str, List[str]]:
        """Extract concrete entities from the query.

        Asks the model for five labelled, comma-separated lists and parses
        them into a dict with the keys ``temporal``, ``numerical``, ``names``,
        ``locations`` and ``descriptors``. Lines with an unknown label are
        ignored. Temporal values are normalised to individual years.
        """
        prompt = f"""Extract ALL concrete, searchable entities from this query:

Query: {query}

Extract:
1. TEMPORAL: All years, dates, time periods (e.g., "2018", "between 1995 and 2006", "2023")
2. NUMERICAL: All numbers, statistics, counts (e.g., "300", "more than 3", "4-3", "84.5%")
3. NAMES: Partial names, name hints, proper nouns (e.g., "Dartmouth", "EMNLP", "Plastic Man")
4. LOCATIONS: Places, institutions, geographic features (e.g., "Pennsylvania", "Grand Canyon")
5. DESCRIPTORS: Key descriptive terms (e.g., "fourth wall", "ascetics", "decider game")

For TEMPORAL entities, if there's a range (e.g., "between 2018-2023"), list EACH individual year.

Format your response as:
TEMPORAL: [entity1], [entity2], ...
NUMERICAL: [entity1], [entity2], ...
NAMES: [entity1], [entity2], ...
LOCATIONS: [entity1], [entity2], ...
DESCRIPTORS: [entity1], [entity2], ...
"""

        response = self.model.invoke(prompt)
        content = (
            response.content if hasattr(response, "content") else str(response)
        )

        entities = {
            "temporal": [],
            "numerical": [],
            "names": [],
            "locations": [],
            "descriptors": [],
        }

        for line in content.strip().split("\n"):
            if ":" not in line:
                continue
            label, raw_values = line.split(":", 1)
            # Tolerate markdown emphasis/bullets around the label
            # ("**NAMES**:", "- NAMES:"). Numbered labels are deliberately
            # NOT accepted so an echoed instruction line such as
            # "1. TEMPORAL: All years, ..." is still ignored.
            category = label.strip().strip("*#-\u2022 ").lower()
            if category not in entities:
                continue
            for raw_value in raw_values.split(","):
                # The format template shows "[entity1]", so models often
                # return the brackets; they are not part of the entity.
                value = raw_value.strip().strip("[]*").strip()
                if not value:
                    continue
                if value.strip("\"'").lower() in self._PLACEHOLDER_VALUES:
                    continue
                entities[category].append(value)

        # Expand temporal ranges
        entities["temporal"] = self._expand_temporal_ranges(
            entities["temporal"]
        )

        logger.info(f"Extracted entities: {entities}")
        return entities

    def _expand_temporal_ranges(
        self, temporal_entities: List[str]
    ) -> List[str]:
        """Expand year ranges into individual years.

        "2018-2023" or "between 1995 and 2006" become one entry per year.
        A value containing a single four-digit year is reduced to that year,
        and anything else is kept verbatim. Duplicates are removed while
        keeping first-seen order, so the slices taken by callers are
        deterministic. Ranges written backwards are expanded in ascending
        order instead of being dropped.
        """
        expanded = []
        for entity in temporal_entities:
            # Hyphen, en dash or em dash, optionally with "to"/"and".
            range_match = re.search(
                r"(\d{4})[-\u2013\u2014\s]+(?:to|and)?\s*(\d{4})", entity
            )
            if range_match:
                start_year, end_year = sorted(
                    (int(range_match.group(1)), int(range_match.group(2)))
                )
                expanded.extend(
                    str(year) for year in range(start_year, end_year + 1)
                )
                continue

            # Single year or other temporal entity
            year_match = re.search(r"\d{4}", entity)
            expanded.append(year_match.group() if year_match else entity)

        # dict preserves insertion order, unlike set
        return list(dict.fromkeys(expanded))

    def _generate_initial_searches(
        self, query: str, entities: Dict[str, List[str]], num_questions: int
    ) -> List[str]:
        """Generate initial broad searches.

        Order of candidates: the original query, the first two names (alone
        and with the first descriptor), up to five years paired with a key
        term, then the first two locations (alone and with the first
        descriptor). Case-insensitive duplicates are dropped and the result
        is cut to ``num_questions``.
        """
        names = entities["names"]
        descriptors = entities["descriptors"]
        temporal = entities["temporal"]

        # 1. Original query (always include)
        searches = [query]

        # 2. Domain exploration searches (combine key entities)
        for name in names[:2]:
            searches.append(name)
            if descriptors:
                searches.append(f"{name} {descriptors[0]}")

        # 3. Temporal searches, only for small sets of years
        if temporal and len(temporal) <= 10:
            if names:
                key_term = names[0]
            elif descriptors:
                key_term = descriptors[0]
            else:
                key_term = ""
            if key_term:
                for year in temporal[:5]:  # Limit to 5 years initially
                    searches.append(f"{key_term} {year}")

        # 4. Location-based searches
        for location in entities["locations"][:2]:
            searches.append(location)
            if descriptors:
                searches.append(f"{location} {descriptors[0]}")

        # Remove duplicates and limit to requested number
        seen = set()
        unique_searches = []
        for search in searches:
            key = search.lower()
            if key not in seen:
                seen.add(key)
                unique_searches.append(search)

        return unique_searches[:num_questions]

    def _generate_progressive_searches(
        self,
        query: str,
        current_knowledge: str,
        entities: Dict[str, List[str]],
        questions_by_iteration: dict,
        num_questions: int,
        iteration: int,
    ) -> List[str]:
        """Generate progressively more specific searches based on findings.

        The model is asked for ``num_questions`` follow-up searches. If it
        returns fewer, the list is topped up once with programmatic
        combinations that were not searched before. The top-up is a single
        bounded pass: when no new combination exists the method returns
        fewer than ``num_questions`` searches rather than looping.
        """
        # None-safe: a missing summary is treated as empty text.
        knowledge_excerpt = (current_knowledge or "")[:1500]

        prompt = f"""Based on our search progress, generate targeted follow-up searches.

Original Query: {query}

Entities Found:
- Names/Terms: {", ".join(entities["names"][:5])}
- Years: {", ".join(entities["temporal"][:5])}
- Locations: {", ".join(entities["locations"][:3])}
- Key Features: {", ".join(entities["descriptors"][:3])}

Current Knowledge Summary:
{knowledge_excerpt}

Previous Searches:
{self._format_previous_searches(questions_by_iteration)}

Generate {num_questions} NEW search queries that:
1. Combine 2-3 entities we haven't tried together
2. If we found candidate names, search for them with other constraints
3. For year ranges, systematically cover years we haven't searched
4. Use quotes for exact phrases when beneficial

Focus on finding the specific answer, not general information.

Format: One search per line
"""

        response = self.model.invoke(prompt)
        content = (
            response.content if hasattr(response, "content") else str(response)
        )

        # Extract searches from response
        searches = []
        for line in content.strip().split("\n"):
            line = line.strip()
            # Skip blanks, headings ("Searches:") and very short fragments.
            if not line or line.endswith(":") or len(line) <= 5:
                continue
            # Drop list numbering such as "1. " or "2) ". A bare leading
            # year ("2018 Dartmouth") has no marker and is left untouched.
            line = re.sub(r"^\d+[.)]\s+", "", line)
            # Clean up common prefixes
            for prefix in ["Q:", "Search:", "-", "*", "\u2022"]:
                if line.startswith(prefix):
                    line = line[len(prefix) :].strip()
            if line:
                searches.append(line)

        # Top up with programmatic combinations, skipping anything already
        # in this batch.
        if len(searches) < num_questions:
            seen = {search.lower() for search in searches}
            for candidate in self._fallback_searches(
                entities, questions_by_iteration, iteration
            ):
                if len(searches) >= num_questions:
                    break
                key = candidate.lower()
                if key in seen:
                    continue
                seen.add(key)
                searches.append(candidate)

        return searches[:num_questions]

    def _fallback_searches(
        self,
        entities: Dict[str, List[str]],
        questions_by_iteration: dict,
        iteration: int,
    ) -> List[str]:
        """Build programmatic searches that were not issued in earlier iterations.

        Year-based searches come first (only during the first five
        iterations), followed by name + descriptor combinations.
        """
        candidates = []

        if iteration <= 5 and entities["temporal"]:
            base_term = entities["names"][0] if entities["names"] else ""
            for year in entities["temporal"]:
                if not self._was_searched(year, questions_by_iteration):
                    candidates.append(f"{base_term} {year}".strip())

        for name in entities["names"]:
            for desc in entities["descriptors"]:
                combo = f"{name} {desc}"
                if not self._was_searched(combo, questions_by_iteration):
                    candidates.append(combo)

        return candidates

    def _format_previous_searches(self, questions_by_iteration: dict) -> str:
        """Format previous searches for context.

        Takes up to three searches per iteration and returns the last ten
        resulting lines, newline-joined. Non-list values are skipped.
        """
        formatted = []
        for iteration, questions in questions_by_iteration.items():
            if isinstance(questions, list):
                formatted.extend(
                    f"Iteration {iteration}: {q}" for q in questions[:3]
                )
        return "\n".join(formatted[-10:])  # Last 10 searches

    def _was_searched(self, term: str, questions_by_iteration: dict) -> bool:
        """Check if a term was already searched.

        Case-insensitive substring match against every earlier search.
        Non-list values in ``questions_by_iteration`` are skipped.
        """
        term_lower = term.lower()
        return any(
            term_lower in q.lower()
            for questions in questions_by_iteration.values()
            if isinstance(questions, list)
            for q in questions
        )
@@ -101,7 +101,9 @@ class DecompositionQuestionGenerator(BaseQuestionGenerator):
101
101
  if subject.lower().startswith(article):
102
102
  subject = subject[len(article) :].strip()
103
103
 
104
- logger.info(f"Original query: '{query}', Extracted subject: '{subject}'")
104
+ logger.info(
105
+ f"Original query: '{query}', Extracted subject: '{subject}'"
106
+ )
105
107
 
106
108
  # Create a prompt to decompose the query into sub-questions
107
109
  prompt = f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
@@ -223,7 +225,9 @@ What are the security implications of X?
223
225
  for conjunction in conjunctions:
224
226
  if conjunction in topic_text.lower():
225
227
  # Take only the part before the conjunction
226
- topic_text = topic_text.split(conjunction)[0].strip()
228
+ topic_text = topic_text.split(conjunction)[
229
+ 0
230
+ ].strip()
227
231
  logger.info(
228
232
  f"Simplified prompt: Split compound query at '{conjunction}', extracted: '{topic_text}'"
229
233
  )
@@ -288,7 +292,9 @@ Sub-questions:
288
292
  )
289
293
  return self._generate_default_questions(query)
290
294
 
291
- logger.info(f"Generated {len(sub_queries)} sub-questions: {sub_queries}")
295
+ logger.info(
296
+ f"Generated {len(sub_queries)} sub-questions: {sub_queries}"
297
+ )
292
298
  return sub_queries[: self.max_subqueries] # Limit to max_subqueries
293
299
 
294
300
  except Exception as e:
@@ -380,7 +386,10 @@ Sub-questions:
380
386
  )
381
387
 
382
388
  # Special case for CSRF - if we've extracted just "csrf" from a longer query
383
- if subject.lower() == "csrf" or subject.lower() == "cross-site request forgery":
389
+ if (
390
+ subject.lower() == "csrf"
391
+ or subject.lower() == "cross-site request forgery"
392
+ ):
384
393
  # CSRF-specific questions
385
394
  default_questions = [
386
395
  "What is Cross-Site Request Forgery (CSRF)?",
@@ -0,0 +1,184 @@
1
+ """
2
+ Entity-aware question generation for improved entity identification.
3
+ """
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import List
8
+
9
+ from .base_question import BaseQuestionGenerator
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class EntityAwareQuestionGenerator(BaseQuestionGenerator):
    """Question generator that creates more targeted searches for entity identification.

    Queries that contain one of the entity keywords get an entity-focused
    prompt; all other queries are delegated to the base class.
    """

    # Keywords that mark a query as "entity identification" for
    # generate_questions().
    _QUESTION_ENTITY_KEYWORDS = (
        "who",
        "what",
        "which",
        "identify",
        "name",
        "character",
        "person",
        "place",
        "organization",
        "company",
        "author",
        "scientist",
        "inventor",
        "city",
        "country",
        "book",
        "movie",
    )

    # generate_sub_questions() uses a shorter list.
    _SUB_QUESTION_ENTITY_KEYWORDS = (
        "who",
        "what",
        "which",
        "identify",
        "name",
        "character",
        "person",
        "place",
        "organization",
        "company",
    )

    @staticmethod
    def _is_entity_query(query: str, keywords) -> bool:
        """Return True if any keyword occurs in the lower-cased query.

        NOTE(review): this is a plain substring test, so "city" also matches
        "electricity". Kept as-is because plural forms ("scientists",
        "movies") rely on it.
        """
        lowered = query.lower()
        return any(keyword in lowered for keyword in keywords)

    @staticmethod
    def _strip_list_marker(line: str) -> str:
        """Remove a leading "-" bullet or "1." / "1)" number from a line.

        Only the marker at the start is removed; periods later in the
        question ("U.S.", "v2.0") are left alone. A leading number that is
        not a list marker ("2018 Olympics ...", "3.5 inch ...") is kept.
        """
        if line.startswith("-"):
            return line.lstrip("- ").strip()
        remainder = line.lstrip("0123456789")
        if remainder[:1] in (".", ")") and not remainder[1:2].isdigit():
            return remainder[1:].strip()
        return line

    def generate_questions(
        self,
        current_knowledge: str,
        query: str,
        questions_per_iteration: int = 2,
        questions_by_iteration: dict = None,
    ) -> List[str]:
        """Generate questions with entity-aware search patterns.

        Args:
            current_knowledge: Findings so far; included in the prompt when
                earlier questions exist.
            query: The original user query.
            questions_per_iteration: Maximum number of questions to return.
            questions_by_iteration: Earlier questions keyed by iteration.
                ``None`` is treated as empty.

        Returns:
            Up to ``questions_per_iteration`` non-empty questions taken from
            the response lines that start with "Q:". For non-entity queries,
            whatever the base class implementation returns.
        """
        current_time = datetime.now().strftime("%Y-%m-%d")
        questions_by_iteration = questions_by_iteration or {}

        logger.info("Generating entity-aware follow-up questions...")

        if not self._is_entity_query(query, self._QUESTION_ENTITY_KEYWORDS):
            # Fall back to standard question generation for non-entity queries
            return super().generate_questions(
                current_knowledge,
                query,
                questions_per_iteration,
                questions_by_iteration,
            )

        # Use more direct entity-focused prompt
        if questions_by_iteration:
            prompt = f"""Generate {questions_per_iteration} targeted search queries to identify the specific entity in the query.

Query: {query}
Today: {current_time}
Past questions: {str(questions_by_iteration)}
Current knowledge: {current_knowledge}

Create direct search queries that combine the key identifying features to find the specific name/entity.
Focus on:
1. Combining multiple constraints in a single search
2. Using quotation marks for exact phrases
3. Including specific details that narrow down results

Format: One question per line, e.g.
Q: "fictional character" "breaks fourth wall" "TV show" 1960s 1980s
Q: character name ascetics humor television fewer than 50 episodes
"""
        else:
            prompt = f"""Generate {questions_per_iteration} direct search queries to identify the specific entity in: {query}

Today: {current_time}

Create search queries that:
1. Combine multiple identifying features
2. Target the specific entity name/identification
3. Use variations of key terms

Format: One question per line, e.g.
Q: question1
Q: question2
"""

        response = self.model.invoke(prompt)

        # Handle both string responses and responses with .content attribute
        if hasattr(response, "content"):
            response_text = response.content
        else:
            response_text = str(response)

        questions = []
        for raw_line in response_text.split("\n"):
            line = raw_line.strip()
            if not line.startswith("Q:"):
                continue
            # Remove only the leading marker; a later "Q:" inside the
            # question text must survive. Quotation marks are kept on
            # purpose because they request exact-phrase search.
            question = line[len("Q:") :].strip()
            if question:
                questions.append(question)
        questions = questions[:questions_per_iteration]

        logger.info(f"Generated {len(questions)} entity-aware questions")

        return questions

    def generate_sub_questions(
        self, query: str, context: str = ""
    ) -> List[str]:
        """Generate sub-questions with entity focus when appropriate.

        Args:
            query: The question to break down.
            context: Optional extra text inserted verbatim into the prompt.

        Returns:
            The sub-questions parsed from numbered or "-" bulleted response
            lines. An empty list if the model call or parsing raises. For
            non-entity queries, whatever the base class returns.
        """
        if not self._is_entity_query(
            query, self._SUB_QUESTION_ENTITY_KEYWORDS
        ):
            return super().generate_sub_questions(query, context)

        prompt = f"""Break down this entity identification query into targeted sub-questions.

Original Question: {query}
{context}

Generate 2-5 sub-questions that will help identify the specific entity.
Focus on:
1. Combining constraints to narrow down results
2. Finding the actual name/identity
3. Verifying the entity matches all criteria

Format your response as:
1. First sub-question
2. Second sub-question
...

Only provide the numbered sub-questions."""

        try:
            response = self.model.invoke(prompt)
            if hasattr(response, "content"):
                content = response.content
            else:
                content = str(response)

            # Extract numbered or bulleted questions
            questions = []
            for line in content.strip().split("\n"):
                line = line.strip()
                if line and (line[0].isdigit() or line.startswith("-")):
                    question = self._strip_list_marker(line)
                    if question:
                        questions.append(question)

            return questions

        except Exception as e:
            # Best effort: a failed model call yields no sub-questions
            # instead of aborting the caller.
            logger.error(f"Error generating sub-questions: {str(e)}")
            return []
@@ -50,7 +50,7 @@ class StandardQuestionGenerator(BaseQuestionGenerator):
50
50
  response_text = str(response)
51
51
 
52
52
  questions = [
53
- q.replace("Q:", "").strip()
53
+ q.replace("Q:", "").strip().strip("\"'")
54
54
  for q in response_text.split("\n")
55
55
  if q.strip().startswith("Q:")
56
56
  ][:questions_per_iteration]
@@ -59,7 +59,9 @@ class StandardQuestionGenerator(BaseQuestionGenerator):
59
59
 
60
60
  return questions
61
61
 
62
- def generate_sub_questions(self, query: str, context: str = "") -> List[str]:
62
+ def generate_sub_questions(
63
+ self, query: str, context: str = ""
64
+ ) -> List[str]:
63
65
  """
64
66
  Generate sub-questions from a main query.
65
67
 
@@ -107,7 +109,11 @@ Only provide the numbered sub-questions, nothing else."""
107
109
  line = line.strip()
108
110
  if line and (line[0].isdigit() or line.startswith("-")):
109
111
  # Extract sub-question from numbered or bulleted list
110
- parts = line.split(".", 1) if "." in line else line.split(" ", 1)
112
+ parts = (
113
+ line.split(".", 1)
114
+ if "." in line
115
+ else line.split(" ", 1)
116
+ )
111
117
  if len(parts) > 1:
112
118
  sub_question = parts[1].strip()
113
119
  sub_questions.append(sub_question)