local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +20 -3
  149. local_deep_research/web/database/models.py +74 -25
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +63 -83
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +192 -54
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +412 -251
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.2.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,613 @@
1
+ """
2
+ Source diversity management for improved evidence quality.
3
+ """
4
+
5
+ import re
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+ from typing import Dict, List, Optional, Set, Tuple, Any
10
+
11
+ from langchain_core.language_models import BaseChatModel
12
+
13
+ from ..constraints.base_constraint import Constraint, ConstraintType
14
+
15
+
16
@dataclass
class SourceProfile:
    """Profile of a single source URL used for diversity tracking.

    Collects the attributes the diversity manager scores and filters on
    when assembling a diverse evidence pool.
    """

    url: str
    domain: str
    # Category label: 'academic', 'news', 'government', 'wiki', 'blog', etc.
    source_type: str
    # Estimated trustworthiness in [0.0, 1.0].
    credibility_score: float
    # Topic areas this source covers (a handful at most).
    specialties: List[str] = field(default_factory=list)
    # (earliest, latest) datetimes the source's content spans, when known.
    temporal_coverage: Optional[Tuple[datetime, datetime]] = None
    # Region the source focuses on (e.g. "United States"), when known.
    geographic_focus: Optional[str] = None
    # Number of evidence items drawn from this source so far.
    evidence_count: int = 0
    # Timestamp of the most recent access, if any.
    last_accessed: Optional[datetime] = None
29
+
30
+
31
@dataclass
class DiversityMetrics:
    """Aggregate metrics describing how diverse a set of sources is.

    All scalar scores are normalized to the range 0.0–1.0.
    """

    # Fraction of desired distinct source types represented.
    type_diversity: float
    # Spread of publication years across the sources.
    temporal_diversity: float
    # Spread of geographic focus across the sources.
    geographic_diversity: float
    # Counts of sources per credibility bucket ("high"/"medium"/"low").
    credibility_distribution: Dict[str, float]
    # Topic area -> number of sources covering it.
    specialty_coverage: Dict[str, int]
    # Weighted combination of the individual diversity components.
    overall_score: float
41
+
42
+
43
class SourceDiversityManager:
    """
    Manages source diversity to ensure comprehensive evidence collection.

    Key features:
    1. Tracks source types and characteristics
    2. Ensures diverse source selection
    3. Prioritizes high-credibility sources
    4. Manages geographic and temporal diversity
    """

    def __init__(self, model: BaseChatModel):
        """Initialize the source diversity manager.

        Args:
            model: Chat model used for content-based classification when
                URL heuristics are inconclusive.
        """
        self.model = model
        # url -> SourceProfile cache of every source analyzed so far.
        self.source_profiles: Dict[str, SourceProfile] = {}
        # source_type -> set of URLs known to be of that type.
        self.source_types: Dict[str, Set[str]] = defaultdict(set)
        # Relative weight of each source category when scoring.
        self.type_priorities: Dict[str, float] = {
            "academic": 0.9,
            "government": 0.85,
            "news": 0.7,
            "wiki": 0.75,
            "blog": 0.5,
            "forum": 0.4,
            "social": 0.3,
        }
        # Distinct source types required for full type diversity.
        self.minimum_source_types: int = 3
        self.credibility_threshold: float = 0.6

    def analyze_source(
        self, url: str, content: Optional[str] = None
    ) -> SourceProfile:
        """Analyze a source URL and create (or refresh) its profile.

        Repeated calls for the same URL reuse the cached profile, bumping
        its evidence count and last-accessed timestamp.

        Args:
            url: The source URL.
            content: Optional page text used for deeper classification.

        Returns:
            The (cached or newly built) SourceProfile for this URL.
        """
        if url in self.source_profiles:
            profile = self.source_profiles[url]
            profile.evidence_count += 1
            # NOTE: utcnow() is naive and deprecated in Python 3.12+;
            # kept for consistency with existing stored timestamps.
            profile.last_accessed = datetime.utcnow()
            return profile

        # Extract domain
        domain = self._extract_domain(url)

        # Determine source type
        source_type = self._determine_source_type(url, domain, content)

        # Calculate credibility
        credibility = self._calculate_credibility(
            url, domain, source_type, content
        )

        # Extract specialties
        specialties = self._extract_specialties(url, content)

        # Determine temporal and geographic coverage
        temporal_coverage = self._extract_temporal_coverage(content)
        geographic_focus = self._extract_geographic_focus(url, content)

        profile = SourceProfile(
            url=url,
            domain=domain,
            source_type=source_type,
            credibility_score=credibility,
            specialties=specialties,
            temporal_coverage=temporal_coverage,
            geographic_focus=geographic_focus,
            evidence_count=1,
            last_accessed=datetime.utcnow(),
        )

        self.source_profiles[url] = profile
        self.source_types[source_type].add(url)

        return profile

    def _extract_domain(self, url: str) -> str:
        """Extract the host portion of a URL (dropping any "www." prefix).

        Falls back to returning the input unchanged when it does not look
        like an http(s) URL.
        """
        # Fix: the redundant function-local `import re` was removed; the
        # module already imports `re` at the top level.
        pattern = r"https?://(?:www\.)?([^/]+)"
        match = re.match(pattern, url)
        if match:
            return match.group(1)
        return url

    def _determine_source_type(
        self, url: str, domain: str, content: Optional[str]
    ) -> str:
        """Classify a source as academic/government/wiki/news/etc.

        Tries cheap URL/domain pattern matching first; only falls back to
        an LLM call on the content when no pattern matches.
        """
        # Check known patterns
        academic_domains = [
            ".edu",
            ".ac.",
            "scholar",
            "pubmed",
            "arxiv",
            "jstor",
        ]
        government_domains = [".gov", ".mil"]
        news_domains = [
            "news",
            "times",
            "post",
            "guardian",
            "bbc",
            "cnn",
            "reuters",
        ]
        wiki_domains = ["wikipedia", "wiki"]

        lower_domain = domain.lower()
        lower_url = url.lower()

        # Academic markers may appear anywhere in the URL (e.g. a path
        # on a hosting site); the other categories only match the domain.
        for pattern in academic_domains:
            if pattern in lower_domain or pattern in lower_url:
                return "academic"

        for pattern in government_domains:
            if pattern in lower_domain:
                return "government"

        for pattern in wiki_domains:
            if pattern in lower_domain:
                return "wiki"

        for pattern in news_domains:
            if pattern in lower_domain:
                return "news"

        # Use content analysis as fallback
        if content:
            return self._analyze_content_type(content)

        return "general"

    def _analyze_content_type(self, content: str) -> str:
        """Ask the LLM to classify an excerpt of content.

        Returns one of the known type-priority categories, or "general"
        when the model's answer is not a recognized category.
        """
        prompt = f"""
Analyze this content excerpt and determine the source type:

{content[:500]}

Choose from: academic, government, news, wiki, blog, forum, social, general

Return only the source type.
"""

        response = self.model.invoke(prompt)
        source_type = response.content.strip().lower()

        if source_type in self.type_priorities:
            return source_type
        return "general"

    def _calculate_credibility(
        self, url: str, domain: str, source_type: str, content: Optional[str]
    ) -> float:
        """Score a source's credibility in [0, 1].

        Starts from the type-priority base score and adds small bonuses
        for institutional domains, HTTPS, visible citations, and a named
        author; each addition is capped at 1.0.
        """
        # Base score from source type
        base_score = self.type_priorities.get(source_type, 0.5)

        # Adjust based on domain characteristics
        if ".edu" in domain or ".gov" in domain:
            base_score = min(base_score + 0.1, 1.0)

        # Check for HTTPS
        if url.startswith("https://"):
            base_score = min(base_score + 0.05, 1.0)

        # Additional analysis if content provided
        if content:
            # Check for citations/references ("[12]" or "(2020)" style)
            if re.search(r"\[\d+\]|\(\d{4}\)", content):
                base_score = min(base_score + 0.1, 1.0)

            # Check for author information ("Author:" or "By First Last")
            if re.search(
                r"[Aa]uthor:|[Bb]y\s+[A-Z][a-z]+\s+[A-Z][a-z]+", content
            ):
                base_score = min(base_score + 0.05, 1.0)

        return base_score

    def _extract_specialties(
        self, url: str, content: Optional[str]
    ) -> List[str]:
        """Extract up to five topic specialties from the URL path and content.

        URL path segments longer than three characters are treated as topic
        hints; content (when present) is summarized by the LLM. Duplicates
        are removed (set-based, so ordering is not stable).
        """
        specialties = []

        # URL-based extraction: lowercase path segments like "/science/"
        url_keywords = re.findall(r"/([a-z]+)/", url.lower())
        specialties.extend([kw for kw in url_keywords if len(kw) > 3][:3])

        # Content-based extraction if available
        if content:
            prompt = f"""
Identify the main topic areas or specialties covered in this content:

{content[:500]}

Return up to 3 topic areas, one per line.
"""

            response = self.model.invoke(prompt)
            topics = [
                line.strip()
                for line in response.content.strip().split("\n")
                if line.strip()
            ]
            specialties.extend(topics[:3])

        return list(set(specialties))[:5]

    def _extract_temporal_coverage(
        self, content: Optional[str]
    ) -> Optional[Tuple[datetime, datetime]]:
        """Derive a (start, end) datetime range from years mentioned in content.

        Scans for 4-digit years in 1900–2099 and spans from Jan 1 of the
        earliest to Dec 31 of the latest. Returns None when no years are
        found or the range cannot be constructed.
        """
        if not content:
            return None

        # Look for year patterns
        years = re.findall(r"\b(19\d{2}|20\d{2})\b", content)

        if years:
            years = [int(year) for year in years]
            min_year = min(years)
            max_year = max(years)

            try:
                return (datetime(min_year, 1, 1), datetime(max_year, 12, 31))
            except ValueError:
                return None

        return None

    def _extract_geographic_focus(
        self, url: str, content: Optional[str]
    ) -> Optional[str]:
        """Infer the geographic focus of a source, if any.

        Checks the URL for country-code TLDs/path segments first, then
        falls back to counting region names in the first 1000 characters
        of content and returning the most frequent one.
        """
        # Check URL for geographic indicators
        geo_patterns = {
            "us": "United States",
            "uk": "United Kingdom",
            "ca": "Canada",
            "au": "Australia",
            "eu": "Europe",
        }

        for pattern, location in geo_patterns.items():
            if f".{pattern}" in url or f"/{pattern}/" in url:
                return location

        # Content-based extraction
        if content:
            # Look for country/region mentions
            locations = re.findall(
                r"\b(?:United States|UK|Canada|Australia|Europe|Asia|Africa|Americas)\b",
                content[:1000],
                re.IGNORECASE,
            )

            if locations:
                # Return most frequent
                from collections import Counter

                location_counts = Counter(locations)
                return location_counts.most_common(1)[0][0]

        return None

    def calculate_diversity_metrics(
        self, sources: List[str]
    ) -> DiversityMetrics:
        """Compute diversity metrics for a set of source URLs.

        Missing profiles are analyzed on the fly. An empty source list
        yields all-zero metrics.
        """
        if not sources:
            return DiversityMetrics(
                type_diversity=0.0,
                temporal_diversity=0.0,
                geographic_diversity=0.0,
                credibility_distribution={},
                specialty_coverage={},
                overall_score=0.0,
            )

        # Get profiles (analyze any URL not seen before)
        profiles = [
            self.source_profiles.get(url) or self.analyze_source(url)
            for url in sources
        ]

        # Type diversity: unique types relative to the configured minimum
        source_types = [p.source_type for p in profiles]
        unique_types = len(set(source_types))
        type_diversity = min(unique_types / self.minimum_source_types, 1.0)

        # Temporal diversity
        temporal_ranges = [
            p.temporal_coverage for p in profiles if p.temporal_coverage
        ]
        temporal_diversity = self._calculate_temporal_diversity(temporal_ranges)

        # Geographic diversity: three distinct regions counts as maximal
        geo_focuses = [
            p.geographic_focus for p in profiles if p.geographic_focus
        ]
        unique_geos = len(set(geo_focuses))
        geographic_diversity = min(unique_geos / 3, 1.0) if geo_focuses else 0.0

        # Credibility distribution (bucketed high >= 0.8, medium >= 0.6)
        credibility_distribution = {}
        for p in profiles:
            level = (
                "high"
                if p.credibility_score >= 0.8
                else "medium"
                if p.credibility_score >= 0.6
                else "low"
            )
            credibility_distribution[level] = (
                credibility_distribution.get(level, 0) + 1
            )

        # Specialty coverage
        specialty_coverage = {}
        for p in profiles:
            for specialty in p.specialties:
                specialty_coverage[specialty] = (
                    specialty_coverage.get(specialty, 0) + 1
                )

        # Overall score: weighted blend of the components
        overall_score = (
            type_diversity * 0.3
            + temporal_diversity * 0.2
            + geographic_diversity * 0.2
            + (credibility_distribution.get("high", 0) / len(profiles)) * 0.3
        )

        return DiversityMetrics(
            type_diversity=type_diversity,
            temporal_diversity=temporal_diversity,
            geographic_diversity=geographic_diversity,
            credibility_distribution=credibility_distribution,
            specialty_coverage=specialty_coverage,
            overall_score=overall_score,
        )

    def _calculate_temporal_diversity(
        self, ranges: List[Tuple[datetime, datetime]]
    ) -> float:
        """Score temporal diversity of a list of (start, end) date ranges.

        A span of 20+ distinct years maps to 1.0; a single year (or no
        ranges) maps to 0.0.
        """
        if not ranges:
            return 0.0

        # Calculate span coverage
        all_years = set()
        for start, end in ranges:
            for year in range(start.year, end.year + 1):
                all_years.add(year)

        # Diversity based on year span
        if len(all_years) > 1:
            year_span = max(all_years) - min(all_years)
            # Normalize to 0-1 (20 years = max diversity)
            return min(year_span / 20, 1.0)

        return 0.0

    def recommend_additional_sources(
        self, current_sources: List[str], constraints: List[Constraint]
    ) -> List[Dict[str, Any]]:
        """Recommend up to five search adjustments to improve diversity.

        Each recommendation is a dict with "type", "target",
        "query_modifier" (a string to append to search queries), and a
        human-readable "reason".
        """
        current_metrics = self.calculate_diversity_metrics(current_sources)
        recommendations = []

        # Identify gaps
        gaps = self._identify_diversity_gaps(current_metrics, constraints)

        for gap_type, gap_details in gaps.items():
            if gap_type == "source_type":
                # Recommend sources of missing types
                for missing_type in gap_details:
                    rec = {
                        "type": "source_type",
                        "target": missing_type,
                        "query_modifier": self._get_source_type_modifier(
                            missing_type
                        ),
                        "reason": f"Add {missing_type} sources for better perspective",
                    }
                    recommendations.append(rec)

            elif gap_type == "temporal":
                # Recommend sources for missing time periods
                rec = {
                    "type": "temporal",
                    "target": gap_details,
                    "query_modifier": f'"{gap_details}" historical archive',
                    "reason": f"Add sources covering {gap_details}",
                }
                recommendations.append(rec)

            elif gap_type == "geographic":
                # Recommend sources from missing regions
                for region in gap_details:
                    rec = {
                        "type": "geographic",
                        "target": region,
                        "query_modifier": f"site:{self._get_region_domain(region)}",
                        "reason": f"Add sources from {region}",
                    }
                    recommendations.append(rec)

            elif gap_type == "credibility":
                # Recommend higher credibility sources
                rec = {
                    "type": "credibility",
                    "target": "high_credibility",
                    "query_modifier": "site:.edu OR site:.gov OR peer-reviewed",
                    "reason": "Add more authoritative sources",
                }
                recommendations.append(rec)

        return recommendations[:5]  # Limit recommendations

    def _identify_diversity_gaps(
        self, metrics: DiversityMetrics, constraints: List[Constraint]
    ) -> Dict[str, Any]:
        """Identify gaps in source diversity.

        Returns a dict keyed by gap category ("source_type", "temporal",
        "geographic", "credibility") whose values describe what is missing.
        """
        gaps = {}

        # Source type gaps
        # NOTE(review): current types are taken from *all* cached profiles,
        # not just the sources that produced `metrics` — confirm whether
        # this manager-wide view is intended.
        if metrics.type_diversity < 0.7:
            current_types = set(
                p.source_type for p in self.source_profiles.values()
            )
            desired_types = {"academic", "government", "news", "wiki"}
            missing_types = desired_types - current_types
            if missing_types:
                gaps["source_type"] = list(missing_types)

        # Temporal gaps (based on constraints)
        temporal_constraints = [
            c for c in constraints if c.type == ConstraintType.TEMPORAL
        ]
        if temporal_constraints and metrics.temporal_diversity < 0.5:
            # Extract years from constraints
            years_needed = []
            for c in temporal_constraints:
                year_match = re.search(r"\b(19\d{2}|20\d{2})\b", c.value)
                if year_match:
                    years_needed.append(year_match.group(1))

            if years_needed:
                # min/max on the year strings is safe: all matches are
                # 4-digit years, so lexicographic order equals numeric.
                gaps["temporal"] = f"{min(years_needed)}-{max(years_needed)}"

        # Geographic gaps
        location_constraints = [
            c for c in constraints if c.type == ConstraintType.LOCATION
        ]
        if location_constraints and metrics.geographic_diversity < 0.5:
            locations_needed = [c.value for c in location_constraints]
            gaps["geographic"] = locations_needed

        # Credibility gaps: flag when fewer than 30% of sources are "high"
        high_cred_ratio = metrics.credibility_distribution.get("high", 0) / max(
            sum(metrics.credibility_distribution.values()), 1
        )
        if high_cred_ratio < 0.3:
            gaps["credibility"] = True

        return gaps

    def _get_source_type_modifier(self, source_type: str) -> str:
        """Get a search-query modifier targeting a specific source type."""
        modifiers = {
            "academic": "site:.edu OR site:scholar.google.com OR site:pubmed.gov",
            "government": "site:.gov OR site:.mil",
            "news": 'news OR "press release" OR journalism',
            "wiki": "site:wikipedia.org OR wiki",
            "blog": 'blog OR "posted by" OR comments',
        }
        return modifiers.get(source_type, "")

    def _get_region_domain(self, region: str) -> str:
        """Get the domain suffix (or suffixes) associated with a region."""
        region_domains = {
            "United States": ".us OR .com",
            "United Kingdom": ".uk",
            "Canada": ".ca",
            "Australia": ".au",
            "Europe": ".eu OR .de OR .fr",
        }
        return region_domains.get(region, ".com")

    def select_diverse_sources(
        self, available_sources: List[str], target_count: int
    ) -> List[str]:
        """Select a diverse subset of up to ``target_count`` sources.

        Sources are scored by type priority, credibility, and coverage
        information, then greedily selected with a preference for
        previously unseen types/regions. The first half of the quota is
        filled by score regardless of diversity.
        """
        if len(available_sources) <= target_count:
            return available_sources

        # Score each source based on diversity contribution
        source_scores = []

        for source in available_sources:
            profile = self.source_profiles.get(source) or self.analyze_source(
                source
            )

            # Calculate diversity score (weights sum to 1.0)
            score = (
                self.type_priorities.get(profile.source_type, 0.5) * 0.4
                + profile.credibility_score * 0.3
                + (1.0 if profile.specialties else 0.5) * 0.15
                + (1.0 if profile.temporal_coverage else 0.5) * 0.15
            )

            source_scores.append((source, score, profile))

        # Sort by score
        source_scores.sort(key=lambda x: x[1], reverse=True)

        # Select diverse sources
        selected = []
        selected_types = set()
        selected_geos = set()

        for source, score, profile in source_scores:
            # Prioritize diversity: new type, or new geographic focus
            is_diverse = profile.source_type not in selected_types or (
                profile.geographic_focus
                and profile.geographic_focus not in selected_geos
            )

            if is_diverse or len(selected) < target_count // 2:
                selected.append(source)
                selected_types.add(profile.source_type)
                if profile.geographic_focus:
                    selected_geos.add(profile.geographic_focus)

            if len(selected) >= target_count:
                break

        return selected

    def track_source_effectiveness(
        self, source: str, evidence_quality: float, constraint_satisfied: bool
    ):
        """Record how effective a source was for evidence gathering.

        Silently ignores URLs that have never been analyzed. A satisfied
        constraint gives the source a small credibility boost (capped at
        1.0), and every call appends to the profile's effectiveness log.
        """
        profile = self.source_profiles.get(source)
        if not profile:
            return

        # Update profile based on effectiveness
        if constraint_satisfied:
            # Boost credibility slightly
            profile.credibility_score = min(
                profile.credibility_score * 1.05, 1.0
            )

        # Track history as a dynamic attribute on the profile.
        # Fix: use hasattr instead of probing profile.__dict__ directly —
        # equivalent today, but robust if SourceProfile ever gains the
        # field (or slots) properly.
        if not hasattr(profile, "effectiveness"):
            profile.effectiveness = []

        profile.effectiveness.append(
            {
                "timestamp": datetime.utcnow(),
                "evidence_quality": evidence_quality,
                "constraint_satisfied": constraint_satisfied,
            }
        )
@@ -1 +1,43 @@
1
1
# Search System Strategies Package
"""Public API of the search-system strategies package.

Imports every strategy implementation from its submodule and re-exports it
here, so callers can write ``from ...strategies import StandardSearchStrategy``
instead of reaching into individual modules. ``__all__`` declares the
exported names explicitly.
"""

from .adaptive_decomposition_strategy import AdaptiveDecompositionStrategy
from .base_strategy import BaseSearchStrategy
from .browsecomp_entity_strategy import BrowseCompEntityStrategy
from .browsecomp_optimized_strategy import BrowseCompOptimizedStrategy
from .constraint_parallel_strategy import ConstraintParallelStrategy
from .dual_confidence_strategy import DualConfidenceStrategy
from .dual_confidence_with_rejection import DualConfidenceWithRejectionStrategy
from .evidence_based_strategy import EvidenceBasedStrategy
from .focused_iteration_strategy import FocusedIterationStrategy
from .iterative_reasoning_strategy import IterativeReasoningStrategy
from .iterdrag_strategy import IterDRAGStrategy
from .modular_strategy import ModularStrategy
from .parallel_constrained_strategy import ParallelConstrainedStrategy
from .parallel_search_strategy import ParallelSearchStrategy
from .rapid_search_strategy import RapidSearchStrategy
from .recursive_decomposition_strategy import RecursiveDecompositionStrategy
from .smart_decomposition_strategy import SmartDecompositionStrategy
from .source_based_strategy import SourceBasedSearchStrategy
from .standard_strategy import StandardSearchStrategy

# NOTE: kept in rough base-then-implementations order rather than
# alphabetical; order also defines star-import order.
__all__ = [
    "BaseSearchStrategy",
    "StandardSearchStrategy",
    "ParallelSearchStrategy",
    "ParallelConstrainedStrategy",
    "SourceBasedSearchStrategy",
    "RapidSearchStrategy",
    "IterDRAGStrategy",
    "RecursiveDecompositionStrategy",
    "AdaptiveDecompositionStrategy",
    "SmartDecompositionStrategy",
    "IterativeReasoningStrategy",
    "BrowseCompOptimizedStrategy",
    "BrowseCompEntityStrategy",
    "EvidenceBasedStrategy",
    "DualConfidenceStrategy",
    "DualConfidenceWithRejectionStrategy",
    "ConstraintParallelStrategy",
    "ModularStrategy",
    "FocusedIterationStrategy",
]