local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,550 @@
1
+ """
2
+ Precision Extraction Citation Handler
3
+
4
+ This handler focuses on extracting precise, complete answers for SimpleQA-style questions.
5
+ It includes specialized extractors for:
6
+ - Full names (including middle names)
7
+ - Single answers when only one is requested
8
+ - Dimension-aware measurements
9
+ - Specific entities without extra information
10
+ """
11
+
12
+ import re
13
+ from typing import Any, Dict, List, Union
14
+
15
+ from loguru import logger
16
+
17
+ from .base_citation_handler import BaseCitationHandler
18
+
19
+
20
+ class PrecisionExtractionHandler(BaseCitationHandler):
21
+ """Citation handler optimized for precise answer extraction."""
22
+
23
def __init__(self, *args, **kwargs):
    """Initialise the handler and compile the answer-type regexes.

    All positional and keyword arguments are forwarded unchanged to the
    base class initialiser.
    """
    super().__init__(*args, **kwargs)

    # Compiled once here and reused by the ``_extract_*`` helpers.
    self.answer_patterns = {
        # Two to five consecutive capitalised words.
        "full_name": re.compile(
            r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b"
        ),
        # Four-digit years in the range 1900-2099.
        "year": re.compile(r"\b(19\d{2}|20\d{2})\b"),
        # Integers or decimals.
        "number": re.compile(r"\b(\d+(?:\.\d+)?)\b"),
        # A number followed by a unit. The trailing ``\b`` stops short
        # units from matching the start of a longer word (for example
        # "3 months" must not be read as "3 m").
        "dimension": re.compile(
            r"(\d+(?:\.\d+)?)\s*"
            r"(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)\b",
            re.I,
        ),
        # Two integers separated by a hyphen or en dash, e.g. "3-1".
        "score": re.compile(r"(\d+)\s*[-–]\s*(\d+)"),
        "percentage": re.compile(r"(\d+(?:\.\d+)?)\s*%"),
        # One or more consecutive capitalised words.
        "location": re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"),
    }
41
+
42
def analyze_initial(
    self, query: str, search_results: Union[str, List[Dict]]
) -> Dict[str, Any]:
    """Run the first analysis pass and post-process it for precision.

    Args:
        query: The question being researched.
        search_results: Raw search results handed to ``_create_documents``.

    Returns:
        A dict with ``"content"`` (the LLM answer after precision
        extraction) and ``"documents"`` (the created documents).
    """
    documents = self._create_documents(search_results)
    formatted_sources = self._format_sources(documents)

    # The question type drives both the prompt and the post-processing.
    question_type = self._identify_question_type(query)

    prompt = f"""Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc.

Question: {query}
Question Type: {question_type}

Sources:
{formatted_sources}

PRECISION INSTRUCTIONS:
1. Extract the EXACT answer as it appears in the sources
2. For names: Include FULL names with all parts (first, middle, last)
3. For numbers: Include exact values with units if present
4. For single-answer questions: Provide ONLY ONE answer, not multiple options
5. For dimensions: Specify the exact measurement type (height, length, width)
6. Citations should support the specific answer given

Format: Start with the direct, precise answer, then explain with citations."""

    llm_output = self.llm.invoke(prompt)
    # The LLM may hand back a plain string or an object exposing ``.content``.
    answer_text = (
        llm_output if isinstance(llm_output, str) else llm_output.content
    )

    refined = self._apply_precision_extraction(
        answer_text, query, question_type, formatted_sources
    )
    return {"content": refined, "documents": documents}
80
+
81
def analyze_followup(
    self,
    question: str,
    search_results: Union[str, List[Dict]],
    previous_knowledge: str,
    nr_of_links: int,
) -> Dict[str, Any]:
    """Run a follow-up analysis pass that builds on earlier knowledge.

    Args:
        question: The follow-up question.
        search_results: Raw search results handed to ``_create_documents``.
        previous_knowledge: Text accumulated so far; condensed into key
            facts before being placed in the prompt.
        nr_of_links: Forwarded to ``_create_documents`` as ``nr_of_links``.

    Returns:
        A dict with ``"content"`` (the LLM answer after precision
        extraction) and ``"documents"`` (the created documents).
    """
    documents = self._create_documents(
        search_results, nr_of_links=nr_of_links
    )
    formatted_sources = self._format_sources(documents)

    question_type = self._identify_question_type(question)

    # Extract key facts from previous knowledge
    key_facts = self._extract_key_facts(previous_knowledge, question_type)

    prompt = f"""Using the previous knowledge and new sources, provide a PRECISE answer to the question.

Previous Key Facts:
{key_facts}

Question: {question}
Question Type: {question_type}

New Sources:
{formatted_sources}

PRECISION REQUIREMENTS:
1. Build on previous knowledge to provide the MOST COMPLETE answer
2. If a full name was partially found before, complete it now
3. If multiple candidates exist, select the one with the MOST evidence
4. For measurements, ensure units and dimension types match the question
5. Reconcile any conflicts by choosing the most frequently cited answer

Provide the precise answer with citations."""

    response = self.llm.invoke(prompt)
    # Mirror analyze_initial: the LLM may return a plain string rather
    # than an object exposing ``.content``.
    content = response if isinstance(response, str) else response.content

    # Apply precision extraction
    content = self._apply_precision_extraction(
        content, question, question_type, formatted_sources
    )

    return {"content": content, "documents": documents}
128
+
129
+ def _identify_question_type(self, query: str) -> str:
130
+ """Identify the type of question for targeted extraction."""
131
+ query_lower = query.lower()
132
+
133
+ # Name questions
134
+ if any(
135
+ phrase in query_lower
136
+ for phrase in ["full name", "name of", "who was", "who is"]
137
+ ):
138
+ if "full name" in query_lower:
139
+ return "full_name"
140
+ return "name"
141
+
142
+ # Location questions
143
+ if any(
144
+ phrase in query_lower
145
+ for phrase in ["where", "location", "city", "country", "place"]
146
+ ):
147
+ return "location"
148
+
149
+ # Temporal questions
150
+ if any(phrase in query_lower for phrase in ["when", "year", "date"]):
151
+ return "temporal"
152
+
153
+ # Numerical questions
154
+ if any(
155
+ phrase in query_lower
156
+ for phrase in ["how many", "how much", "number", "count"]
157
+ ):
158
+ return "number"
159
+
160
+ # Score/result questions
161
+ if any(
162
+ phrase in query_lower
163
+ for phrase in ["score", "result", "final", "outcome"]
164
+ ):
165
+ return "score"
166
+
167
+ # Dimension questions
168
+ if any(
169
+ phrase in query_lower
170
+ for phrase in [
171
+ "height",
172
+ "length",
173
+ "width",
174
+ "size",
175
+ "tall",
176
+ "long",
177
+ "wide",
178
+ ]
179
+ ):
180
+ return "dimension"
181
+
182
+ # Single answer questions
183
+ if query_lower.startswith("which") and "one" in query_lower:
184
+ return "single_choice"
185
+
186
+ return "general"
187
+
188
+ def _apply_precision_extraction(
189
+ self, content: str, query: str, question_type: str, sources: str
190
+ ) -> str:
191
+ """Apply precision extraction based on question type."""
192
+
193
+ # Check if content already has a good answer in the first line
194
+ # first_line = content.split(".")[0].strip() # Not currently used
195
+
196
+ if question_type == "full_name":
197
+ return self._extract_full_name(content, query, sources)
198
+ elif question_type == "name":
199
+ return self._extract_best_name(content, query, sources)
200
+ elif question_type == "single_choice":
201
+ return self._extract_single_answer(content, query, sources)
202
+ elif question_type == "dimension":
203
+ return self._extract_dimension(content, query, sources)
204
+ elif question_type == "score":
205
+ return self._extract_score(content, query, sources)
206
+ elif question_type == "temporal":
207
+ return self._extract_temporal(content, query, sources)
208
+ elif question_type == "number":
209
+ return self._extract_number(content, query, sources)
210
+
211
+ return content
212
+
213
+ def _extract_full_name(self, content: str, query: str, sources: str) -> str:
214
+ """Extract complete full names."""
215
+ # First, use LLM to identify all name variations
216
+ extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources.
217
+
218
+ Question: {query}
219
+
220
+ Content: {content[:2000]}
221
+ Sources: {sources[:2000]}
222
+
223
+ List all name variations found:
224
+ 1. Shortest version:
225
+ 2. Longest/most complete version:
226
+ 3. Most frequently mentioned version:
227
+
228
+ Which is the FULL name (including middle name if present)?"""
229
+
230
+ try:
231
+ extraction = self.llm.invoke(extraction_prompt).content
232
+
233
+ # Extract the identified full name
234
+ if "full name" in extraction.lower():
235
+ lines = extraction.split("\n")
236
+ for line in lines:
237
+ if "full name" in line.lower() or "longest" in line.lower():
238
+ # Extract name from this line
239
+ matches = self.answer_patterns["full_name"].findall(
240
+ line
241
+ )
242
+ if matches:
243
+ # Choose the longest match
244
+ full_name = max(
245
+ matches, key=lambda x: len(x.split())
246
+ )
247
+ return f"{full_name}. {content}"
248
+
249
+ # Fallback: find all names and pick the longest
250
+ all_names = self.answer_patterns["full_name"].findall(
251
+ content + " " + sources
252
+ )
253
+ if all_names:
254
+ # Group similar names and pick the longest variant
255
+ name_groups = {}
256
+ for name in all_names:
257
+ last_word = name.split()[-1]
258
+ if last_word not in name_groups:
259
+ name_groups[last_word] = []
260
+ name_groups[last_word].append(name)
261
+
262
+ # Find the group with the most complete name
263
+ best_name = ""
264
+ for group in name_groups.values():
265
+ longest_in_group = max(group, key=lambda x: len(x.split()))
266
+ if len(longest_in_group.split()) > len(best_name.split()):
267
+ best_name = longest_in_group
268
+
269
+ if best_name:
270
+ return f"{best_name}. {content}"
271
+
272
+ except Exception as e:
273
+ logger.error(f"Error in full name extraction: {e}")
274
+
275
+ return content
276
+
277
+ def _extract_single_answer(
278
+ self, content: str, query: str, sources: str
279
+ ) -> str:
280
+ """Extract a single answer when multiple options might be present."""
281
+ extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer.
282
+
283
+ Question: {query}
284
+ Content: {content[:1500]}
285
+
286
+ Rules:
287
+ 1. If multiple items are listed, identify which ONE actually answers the question
288
+ 2. Look for the PRIMARY or FIRST mentioned item
289
+ 3. Do not include alternatives or additional options
290
+
291
+ The single answer is:"""
292
+
293
+ try:
294
+ answer = self.llm.invoke(extraction_prompt).content.strip()
295
+
296
+ # Clean up the answer
297
+ answer = answer.split(",")[
298
+ 0
299
+ ].strip() # Take only first if comma-separated
300
+ answer = answer.split(" and ")[
301
+ 0
302
+ ].strip() # Take only first if "and"-separated
303
+ answer = answer.split(" or ")[
304
+ 0
305
+ ].strip() # Take only first if "or"-separated
306
+
307
+ return f"{answer}. {content}"
308
+
309
+ except Exception as e:
310
+ logger.error(f"Error in single answer extraction: {e}")
311
+
312
+ return content
313
+
314
+ def _extract_dimension(self, content: str, query: str, sources: str) -> str:
315
+ """Extract specific dimensions with correct units and context awareness."""
316
+ # Enhanced dimension type detection
317
+ dimension_types = {
318
+ "height": ["height", "tall", "high", "elevation", "altitude"],
319
+ "length": ["length", "long", "distance", "reach", "span"],
320
+ "width": ["width", "wide", "breadth", "diameter"],
321
+ "depth": ["depth", "deep", "thickness"],
322
+ "weight": ["weight", "weigh", "heavy", "mass"],
323
+ "speed": ["speed", "fast", "velocity", "mph", "kmh"],
324
+ "area": ["area", "square"],
325
+ "volume": ["volume", "cubic"],
326
+ }
327
+
328
+ query_lower = query.lower()
329
+ dimension_type = None
330
+ dimension_keywords = []
331
+
332
+ # Find the most specific dimension type
333
+ for dim_type, keywords in dimension_types.items():
334
+ matching_keywords = [kw for kw in keywords if kw in query_lower]
335
+ if matching_keywords:
336
+ dimension_type = dim_type
337
+ dimension_keywords = matching_keywords
338
+ break
339
+
340
+ extraction_prompt = f"""Extract the EXACT measurement that answers this question.
341
+
342
+ Question: {query}
343
+ Content: {content[:1500]}
344
+
345
+ Rules:
346
+ 1. Find the specific {dimension_type or "dimension"} measurement
347
+ 2. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet")
348
+ 3. Distinguish between different types of measurements:
349
+ - Height/tall: vertical measurements
350
+ - Length/long: horizontal distance
351
+ - Width/wide: horizontal breadth
352
+ 4. Look for context clues near the measurement
353
+ 5. If multiple measurements, choose the one that matches the question type
354
+
355
+ The exact {dimension_type or "dimension"} is:"""
356
+
357
+ try:
358
+ answer = self.llm.invoke(extraction_prompt).content.strip()
359
+
360
+ # Clean and validate the answer
361
+ import re
362
+
363
+ measurement_match = re.search(
364
+ r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer
365
+ )
366
+ if measurement_match:
367
+ number, unit = measurement_match.groups()
368
+ clean_answer = f"{number} {unit}"
369
+ return f"{clean_answer}. {content}"
370
+
371
+ # Fallback: intelligent pattern matching
372
+ all_dimensions = self.answer_patterns["dimension"].findall(
373
+ content + " " + sources
374
+ )
375
+ if all_dimensions:
376
+ # Score dimensions based on context and dimension type
377
+ scored_dimensions = []
378
+
379
+ for dim in all_dimensions:
380
+ number, unit = dim
381
+ dim_str = f"{number} {unit}"
382
+ score = 0
383
+
384
+ # Find the dimension in content
385
+ pos = content.find(dim_str)
386
+ if pos >= 0:
387
+ # Get context around this measurement
388
+ context = content[max(0, pos - 100) : pos + 100].lower()
389
+
390
+ # Score based on dimension keywords in context
391
+ for keyword in dimension_keywords:
392
+ if keyword in context:
393
+ score += 10
394
+
395
+ # Score based on unit appropriateness
396
+ unit_lower = unit.lower()
397
+ if dimension_type == "height" and any(
398
+ u in unit_lower
399
+ for u in ["m", "meter", "ft", "feet", "cm"]
400
+ ):
401
+ score += 5
402
+ elif dimension_type == "length" and any(
403
+ u in unit_lower
404
+ for u in ["m", "meter", "km", "mile", "ft"]
405
+ ):
406
+ score += 5
407
+ elif dimension_type == "weight" and any(
408
+ u in unit_lower
409
+ for u in ["kg", "lb", "pound", "gram", "ton"]
410
+ ):
411
+ score += 5
412
+ elif dimension_type == "speed" and any(
413
+ u in unit_lower
414
+ for u in ["mph", "kmh", "km/h", "m/s"]
415
+ ):
416
+ score += 5
417
+
418
+ # Prefer measurements closer to the beginning (more likely to be primary)
419
+ score += max(0, 5 - (pos / 100))
420
+
421
+ scored_dimensions.append((score, dim_str))
422
+
423
+ # Return the highest scoring dimension
424
+ if scored_dimensions:
425
+ scored_dimensions.sort(key=lambda x: x[0], reverse=True)
426
+ best_dimension = scored_dimensions[0][1]
427
+ return f"{best_dimension}. {content}"
428
+
429
+ # Final fallback: first dimension
430
+ return (
431
+ f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}"
432
+ )
433
+
434
+ except Exception as e:
435
+ logger.error(f"Error in dimension extraction: {e}")
436
+
437
+ return content
438
+
439
+ def _extract_score(self, content: str, query: str, sources: str) -> str:
440
+ """Extract game scores or results."""
441
+ # Find all score patterns
442
+ scores = self.answer_patterns["score"].findall(content + " " + sources)
443
+
444
+ if scores:
445
+ # Use LLM to identify the correct score
446
+ extraction_prompt = f"""Which score/result answers this question?
447
+
448
+ Question: {query}
449
+ Found scores: {scores}
450
+ Context: {content[:1000]}
451
+
452
+ The answer is:"""
453
+
454
+ try:
455
+ answer = self.llm.invoke(extraction_prompt).content.strip()
456
+ return f"{answer}. {content}"
457
+ except Exception:
458
+ # Return first score found if LLM extraction fails
459
+ return f"{scores[0][0]}-{scores[0][1]}. {content}"
460
+
461
+ return content
462
+
463
+ def _extract_temporal(self, content: str, query: str, sources: str) -> str:
464
+ """Extract dates or years."""
465
+ # Find all year patterns
466
+ years = self.answer_patterns["year"].findall(content + " " + sources)
467
+
468
+ if years:
469
+ # Use LLM to pick the right one
470
+ extraction_prompt = f"""Which date/year specifically answers this question?
471
+
472
+ Question: {query}
473
+ Found years: {set(years)}
474
+ Context: {content[:1000]}
475
+
476
+ The answer is:"""
477
+
478
+ try:
479
+ answer = self.llm.invoke(extraction_prompt).content.strip()
480
+ # Clean to just the year/date
481
+ year_match = self.answer_patterns["year"].search(answer)
482
+ if year_match:
483
+ return f"{year_match.group()}. {content}"
484
+ return f"{answer}. {content}"
485
+ except Exception:
486
+ # Fallback to first found year if LLM extraction fails
487
+ return f"{years[0]}. {content}"
488
+
489
+ return content
490
+
491
+ def _extract_number(self, content: str, query: str, sources: str) -> str:
492
+ """Extract specific numbers."""
493
+ # Find all numbers
494
+ numbers = self.answer_patterns["number"].findall(
495
+ content + " " + sources
496
+ )
497
+
498
+ if numbers:
499
+ extraction_prompt = f"""Which number specifically answers this question?
500
+
501
+ Question: {query}
502
+ Found numbers: {numbers[:10]}
503
+ Context: {content[:1000]}
504
+
505
+ The answer is:"""
506
+
507
+ try:
508
+ answer = self.llm.invoke(extraction_prompt).content.strip()
509
+ return f"{answer}. {content}"
510
+ except Exception:
511
+ # Fallback to first found number if LLM extraction fails
512
+ return f"{numbers[0]}. {content}"
513
+
514
+ return content
515
+
516
+ def _extract_best_name(self, content: str, query: str, sources: str) -> str:
517
+ """Extract the best matching name (not necessarily full)."""
518
+ # Find all potential names
519
+ names = self.answer_patterns["full_name"].findall(
520
+ content + " " + sources
521
+ )
522
+
523
+ if names:
524
+ # Count frequency
525
+ name_counts = {}
526
+ for name in names:
527
+ name_counts[name] = name_counts.get(name, 0) + 1
528
+
529
+ # Get most frequent
530
+ best_name = max(name_counts.items(), key=lambda x: x[1])[0]
531
+ return f"{best_name}. {content}"
532
+
533
+ return content
534
+
535
+ def _extract_key_facts(
536
+ self, previous_knowledge: str, question_type: str
537
+ ) -> str:
538
+ """Extract key facts from previous knowledge."""
539
+ extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge:
540
+
541
+ {previous_knowledge[:1500]}
542
+
543
+ List the most important facts (names, numbers, dates) found:"""
544
+
545
+ try:
546
+ facts = self.llm.invoke(extraction_prompt).content
547
+ return facts[:500]
548
+ except Exception:
549
+ # Fallback to truncated previous knowledge if LLM extraction fails
550
+ return previous_knowledge[:500]
@@ -0,0 +1,80 @@
1
+ """
2
+ Standard citation handler - the original implementation.
3
+ """
4
+
5
+ from typing import Any, Dict, List, Union
6
+
7
+ from ..utilities.db_utils import get_db_setting
8
+ from .base_citation_handler import BaseCitationHandler
9
+
10
+
11
class StandardCitationHandler(BaseCitationHandler):
    """Standard citation handler with detailed analysis."""

    def analyze_initial(
        self, query: str, search_results: Union[str, List[Dict]]
    ) -> Dict[str, Any]:
        """Produce a first cited analysis of ``search_results`` for ``query``.

        Args:
            query: The question being researched.
            search_results: Raw search results, passed to
                ``self._create_documents``.

        Returns:
            A dict with ``"content"`` (the LLM's analysis text) and
            ``"documents"`` (the documents built from ``search_results``).
        """
        documents = self._create_documents(search_results)
        formatted_sources = self._format_sources(documents)
        prompt = f"""Analyze the following information concerning the question and include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source.

Question: {query}

Sources:
{formatted_sources}

Provide a detailed analysis with citations. Do not create the bibliography, it will be provided automatically. Never make up sources. Never write or create urls. Only write text relevant to the question. Example format: "According to the research [1], ..."
"""

        response = self.llm.invoke(prompt)
        # The LLM may return either a plain string or a message object.
        if not isinstance(response, str):
            response = response.content
        return {"content": response, "documents": documents}

    def analyze_followup(
        self,
        question: str,
        search_results: Union[str, List[Dict]],
        previous_knowledge: str,
        nr_of_links: int,
    ) -> Dict[str, Any]:
        """Process follow-up analysis with citations.

        When the ``general.enable_fact_checking`` setting is on (the default
        used here), the LLM is first asked to list inconsistencies between
        ``previous_knowledge`` and the new sources; that report is then
        embedded in the final answering prompt.

        Args:
            question: The follow-up question to answer.
            search_results: Raw search results, passed to
                ``self._create_documents``.
            previous_knowledge: Text accumulated from earlier iterations.
            nr_of_links: Forwarded to ``self._create_documents``
                (presumably the citation-number offset - confirm in the
                base class).

        Returns:
            A dict with ``"content"`` (the LLM's answer text) and
            ``"documents"`` (the documents built from ``search_results``).
        """
        documents = self._create_documents(
            search_results, nr_of_links=nr_of_links
        )
        formatted_sources = self._format_sources(documents)

        # Add fact-checking step; the prompt is only built when it is used.
        if get_db_setting("general.enable_fact_checking", True):
            fact_check_prompt = f"""Analyze these sources for factual consistency:
1. Cross-reference major claims between sources
2. Identify and flag any contradictions
3. Verify basic facts (dates, company names, ownership)
4. Note when sources disagree

Previous Knowledge:
{previous_knowledge}

New Sources:
{formatted_sources}

Return any inconsistencies or conflicts found."""
            fact_check_response = self.llm.invoke(fact_check_prompt)
            # Accept plain-string replies, matching analyze_initial.
            if not isinstance(fact_check_response, str):
                fact_check_response = fact_check_response.content
        else:
            fact_check_response = ""

        prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.

Previous Knowledge:
{previous_knowledge}

Question: {question}

New Sources:
{formatted_sources}
Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
Provide a detailed answer with citations. Example format: "According to [1], ..." """

        response = self.llm.invoke(prompt)
        # Accept plain-string replies, matching analyze_initial.
        if not isinstance(response, str):
            response = response.content

        return {"content": response, "documents": documents}