abstractcore 2.5.0__py3-none-any.whl → 2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/apps/__main__.py +8 -1
- abstractcore/apps/deepsearch.py +644 -0
- abstractcore/apps/intent.py +614 -0
- abstractcore/assets/model_capabilities.json +50 -34
- abstractcore/config/main.py +2 -1
- abstractcore/config/manager.py +11 -0
- abstractcore/core/session.py +46 -1
- abstractcore/processing/__init__.py +5 -1
- abstractcore/processing/basic_deepsearch.py +2173 -0
- abstractcore/processing/basic_intent.py +690 -0
- abstractcore/providers/anthropic_provider.py +1 -0
- abstractcore/providers/base.py +1 -0
- abstractcore/providers/huggingface_provider.py +95 -4
- abstractcore/providers/lmstudio_provider.py +14 -0
- abstractcore/providers/mlx_provider.py +76 -2
- abstractcore/providers/ollama_provider.py +6 -2
- abstractcore/providers/openai_provider.py +1 -0
- abstractcore/providers/registry.py +6 -6
- abstractcore/structured/handler.py +161 -1
- abstractcore/tools/common_tools.py +98 -3
- abstractcore/utils/cli.py +114 -1
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.2.dist-info}/METADATA +34 -18
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.2.dist-info}/RECORD +28 -24
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.2.dist-info}/entry_points.txt +4 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.2.dist-info}/WHEEL +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.2.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.5.0.dist-info → abstractcore-2.5.2.dist-info}/top_level.txt +0 -0
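The largest addition in this release is the new `abstractcore/processing/basic_deepsearch.py` module, shown in the hunk below. Based on its docstring and constructor, a minimal usage sketch might look like this; the provider, model name, and token limits are illustrative assumptions (the imports mirror the example embedded in the module's own error message):

    from abstractcore import create_llm
    from abstractcore.processing import BasicDeepSearch

    # Any AbstractCore provider should work; gpt-4o-mini is just the example the module itself suggests.
    llm = create_llm('openai', model='gpt-4o-mini', max_tokens=32000, max_output_tokens=8000)
    searcher = BasicDeepSearch(llm)

    # search_depth may be "brief", "standard", or "comprehensive" per the research() signature below.
    report = searcher.research(
        "What are the latest developments in quantum computing?",
        max_sources=10,
        search_depth="standard",
    )
    print(report.executive_summary)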
|
@@ -0,0 +1,2173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Basic Deep Search - Autonomous research agent with multi-stage pipeline
|
|
3
|
+
|
|
4
|
+
Features:
|
|
5
|
+
- Four-stage pipeline: Planning, Question Development, Web Exploration, Report Generation
|
|
6
|
+
- Parallel web exploration for speed and breadth
|
|
7
|
+
- Structured report generation with citations
|
|
8
|
+
- Verification and fact-checking capabilities
|
|
9
|
+
- Configurable search depth and focus areas
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import asyncio
|
|
15
|
+
import time
|
|
16
|
+
import re
|
|
17
|
+
import hashlib
|
|
18
|
+
from typing import Optional, List, Dict, Any, Union
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
21
|
+
from pydantic import BaseModel, Field
|
|
22
|
+
|
|
23
|
+
from ..core.interface import AbstractCoreInterface
|
|
24
|
+
from ..core.factory import create_llm
|
|
25
|
+
from ..structured.retry import FeedbackRetry
|
|
26
|
+
from ..utils.structured_logging import get_logger
|
|
27
|
+
from ..tools.common_tools import web_search, fetch_url
|
|
28
|
+
|
|
29
|
+
logger = get_logger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SourceManager:
|
|
33
|
+
"""Manages source collection with strict limits and deduplication"""
|
|
34
|
+
|
|
35
|
+
def __init__(self, max_sources: int):
|
|
36
|
+
self.max_sources = max_sources
|
|
37
|
+
self.collected_sources = []
|
|
38
|
+
self.source_urls = set() # For deduplication
|
|
39
|
+
self.source_titles = set() # Additional deduplication by title
|
|
40
|
+
|
|
41
|
+
def add_source(self, source: Dict[str, Any]) -> bool:
|
|
42
|
+
"""Add source if under limit and not duplicate"""
|
|
43
|
+
if len(self.collected_sources) >= self.max_sources:
|
|
44
|
+
logger.debug(f"Source limit reached ({self.max_sources}), skipping: {source.get('url', 'unknown')}")
|
|
45
|
+
return False
|
|
46
|
+
|
|
47
|
+
url = source.get('url', '')
|
|
48
|
+
title = source.get('title', '').lower().strip()
|
|
49
|
+
|
|
50
|
+
# Check for URL duplication
|
|
51
|
+
if url and url in self.source_urls:
|
|
52
|
+
logger.debug(f"Duplicate URL skipped: {url}")
|
|
53
|
+
return False
|
|
54
|
+
|
|
55
|
+
# Check for title duplication (similar content from different URLs)
|
|
56
|
+
if title and title in self.source_titles:
|
|
57
|
+
logger.debug(f"Duplicate title skipped: {title}")
|
|
58
|
+
return False
|
|
59
|
+
|
|
60
|
+
self.collected_sources.append(source)
|
|
61
|
+
if url:
|
|
62
|
+
self.source_urls.add(url)
|
|
63
|
+
if title:
|
|
64
|
+
self.source_titles.add(title)
|
|
65
|
+
|
|
66
|
+
logger.debug(f"Source added ({len(self.collected_sources)}/{self.max_sources}): {title or url}")
|
|
67
|
+
return True
|
|
68
|
+
|
|
69
|
+
def get_remaining_capacity(self) -> int:
|
|
70
|
+
return max(0, self.max_sources - len(self.collected_sources))
|
|
71
|
+
|
|
72
|
+
def get_sources(self) -> List[Dict[str, Any]]:
|
|
73
|
+
return self.collected_sources.copy()
|
|
74
|
+
|
|
75
|
+
def is_full(self) -> bool:
|
|
76
|
+
return len(self.collected_sources) >= self.max_sources
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class CitationValidator:
|
|
80
|
+
"""Validates and enforces citations in generated content"""
|
|
81
|
+
|
|
82
|
+
@staticmethod
|
|
83
|
+
def validate_citations(text: str, sources: List[Dict]) -> Dict[str, Any]:
|
|
84
|
+
"""Check if text contains proper citations for claims"""
|
|
85
|
+
if not text or not sources:
|
|
86
|
+
return {
|
|
87
|
+
'citations_found': 0,
|
|
88
|
+
'factual_sentences': 0,
|
|
89
|
+
'citation_ratio': 0.0,
|
|
90
|
+
'is_adequately_cited': False,
|
|
91
|
+
'missing_citations': []
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
source_names = [s.get('title', '').strip() for s in sources if s.get('title')]
|
|
95
|
+
|
|
96
|
+
# Count citation patterns (case-insensitive)
|
|
97
|
+
citation_patterns = [
|
|
98
|
+
r'according to \[([^\]]+)\]',
|
|
99
|
+
r'as reported by \[([^\]]+)\]',
|
|
100
|
+
r'according to ([^,.]+)',
|
|
101
|
+
r'as reported by ([^,.]+)',
|
|
102
|
+
r'\(([^)]+)\)', # Parenthetical citations
|
|
103
|
+
]
|
|
104
|
+
|
|
105
|
+
citations_found = 0
|
|
106
|
+
cited_sources = set()
|
|
107
|
+
|
|
108
|
+
for pattern in citation_patterns:
|
|
109
|
+
matches = re.findall(pattern, text, re.IGNORECASE)
|
|
110
|
+
citations_found += len(matches)
|
|
111
|
+
for match in matches:
|
|
112
|
+
cited_sources.add(match.strip().lower())
|
|
113
|
+
|
|
114
|
+
# Estimate factual claims (sentences with factual indicators)
|
|
115
|
+
sentences = [s.strip() for s in text.split('.') if s.strip()]
|
|
116
|
+
factual_indicators = [
|
|
117
|
+
'show', 'found', 'research', 'study', 'data', 'report', 'analysis',
|
|
118
|
+
'indicates', 'reveals', 'demonstrates', 'confirms', 'suggests',
|
|
119
|
+
'according', 'published', 'announced', 'released', 'stated'
|
|
120
|
+
]
|
|
121
|
+
|
|
122
|
+
factual_sentences = []
|
|
123
|
+
for sentence in sentences:
|
|
124
|
+
if any(indicator in sentence.lower() for indicator in factual_indicators):
|
|
125
|
+
factual_sentences.append(sentence)
|
|
126
|
+
|
|
127
|
+
citation_ratio = citations_found / max(len(factual_sentences), 1)
|
|
128
|
+
|
|
129
|
+
# Check which sources are not cited
|
|
130
|
+
uncited_sources = []
|
|
131
|
+
for source in source_names:
|
|
132
|
+
source_lower = source.lower()
|
|
133
|
+
if not any(source_lower in cited.lower() for cited in cited_sources):
|
|
134
|
+
uncited_sources.append(source)
|
|
135
|
+
|
|
136
|
+
return {
|
|
137
|
+
'citations_found': citations_found,
|
|
138
|
+
'factual_sentences': len(factual_sentences),
|
|
139
|
+
'citation_ratio': citation_ratio,
|
|
140
|
+
'is_adequately_cited': citation_ratio >= 0.5, # 50% threshold
|
|
141
|
+
'uncited_sources': uncited_sources,
|
|
142
|
+
'cited_sources': list(cited_sources)
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
@staticmethod
|
|
146
|
+
def enhance_text_with_citations(text: str, sources: List[Dict]) -> str:
|
|
147
|
+
"""Enhance text by adding missing citations where appropriate"""
|
|
148
|
+
if not sources:
|
|
149
|
+
return text
|
|
150
|
+
|
|
151
|
+
# Simple enhancement: add source list at the end if no citations found
|
|
152
|
+
validation = CitationValidator.validate_citations(text, sources)
|
|
153
|
+
|
|
154
|
+
if validation['citations_found'] == 0 and sources:
|
|
155
|
+
source_list = "\n\nSources:\n" + "\n".join([
|
|
156
|
+
f"- {s.get('title', 'Unknown')}: {s.get('url', 'No URL')}"
|
|
157
|
+
for s in sources[:5] # Limit to top 5 sources
|
|
158
|
+
])
|
|
159
|
+
return text + source_list
|
|
160
|
+
|
|
161
|
+
return text
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# Pydantic models for structured output
|
|
165
|
+
class ResearchSubTaskModel(BaseModel):
|
|
166
|
+
"""Pydantic model for research sub-task (for structured output)"""
|
|
167
|
+
id: str = Field(description="Unique task identifier")
|
|
168
|
+
question: str = Field(description="Specific research question")
|
|
169
|
+
theme: str = Field(description="Research theme this task addresses")
|
|
170
|
+
priority: int = Field(description="Priority level (1=essential, 2=important, 3=supplementary)", ge=1, le=3)
|
|
171
|
+
|
|
172
|
+
class ResearchThemeModel(BaseModel):
|
|
173
|
+
"""Pydantic model for research theme (for structured output)"""
|
|
174
|
+
name: str = Field(description="Theme name")
|
|
175
|
+
questions: List[str] = Field(description="3 specific research questions for this theme", min_items=3, max_items=3)
|
|
176
|
+
priority: int = Field(description="Theme priority (1=essential, 2=important, 3=supplementary)", ge=1, le=3)
|
|
177
|
+
|
|
178
|
+
class ResearchPlanModel(BaseModel):
|
|
179
|
+
"""Pydantic model for research plan (for structured output)"""
|
|
180
|
+
research_objective: str = Field(description="Clear research objective")
|
|
181
|
+
themes: List[ResearchThemeModel] = Field(description="Research themes with questions")
|
|
182
|
+
search_strategy: str = Field(description="Search strategy", default="parallel")
|
|
183
|
+
estimated_time_minutes: int = Field(description="Estimated time in minutes", gt=0)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class SearchQueriesModel(BaseModel):
|
|
187
|
+
"""Pydantic model for search queries (for structured output)"""
|
|
188
|
+
queries: List[str] = Field(description="List of specific search queries", min_items=1, max_items=5)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@dataclass
|
|
192
|
+
class ResearchSubTask:
|
|
193
|
+
"""Represents a single research sub-task"""
|
|
194
|
+
id: str
|
|
195
|
+
question: str
|
|
196
|
+
focus_area: str
|
|
197
|
+
priority: int = 1 # 1=high, 2=medium, 3=low
|
|
198
|
+
search_queries: List[str] = None
|
|
199
|
+
findings: List[Dict[str, Any]] = None
|
|
200
|
+
status: str = "pending" # pending, in_progress, completed, failed
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@dataclass
|
|
204
|
+
class ResearchPlan:
|
|
205
|
+
"""Represents the overall research plan"""
|
|
206
|
+
original_query: str
|
|
207
|
+
research_objective: str
|
|
208
|
+
sub_tasks: List[ResearchSubTask]
|
|
209
|
+
estimated_time_minutes: int
|
|
210
|
+
focus_areas: List[str]
|
|
211
|
+
search_strategy: str = "breadth_first" # breadth_first, depth_first, parallel
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class ResearchFinding(BaseModel):
|
|
215
|
+
"""Structured representation of a research finding"""
|
|
216
|
+
source_url: str = Field(description="URL of the source")
|
|
217
|
+
title: str = Field(description="Title of the source")
|
|
218
|
+
content: str = Field(description="Relevant content excerpt")
|
|
219
|
+
relevance_score: float = Field(description="Relevance score 0-1")
|
|
220
|
+
timestamp: str = Field(description="When this was found")
|
|
221
|
+
sub_task_id: str = Field(description="Which sub-task this relates to")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class ResearchReport(BaseModel):
|
|
225
|
+
"""Structured research report"""
|
|
226
|
+
title: str = Field(description="Report title")
|
|
227
|
+
executive_summary: str = Field(description="Brief executive summary")
|
|
228
|
+
key_findings: List[str] = Field(description="List of key findings")
|
|
229
|
+
detailed_analysis: str = Field(description="Detailed analysis section")
|
|
230
|
+
conclusions: str = Field(description="Conclusions and implications")
|
|
231
|
+
sources: List[Dict[str, Any]] = Field(description="List of sources with URLs, titles, and relevance scores")
|
|
232
|
+
methodology: str = Field(description="Research methodology used")
|
|
233
|
+
limitations: str = Field(description="Research limitations and caveats")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class BasicDeepSearch:
|
|
237
|
+
"""
|
|
238
|
+
Basic Deep Search implementation following the four-stage pipeline:
|
|
239
|
+
1. Planning: Decompose query into structured research plan
|
|
240
|
+
2. Question Development: Generate specific search queries
|
|
241
|
+
3. Web Exploration: Execute searches and gather evidence
|
|
242
|
+
4. Report Generation: Synthesize findings into structured report
|
|
243
|
+
|
|
244
|
+
Key features:
|
|
245
|
+
- Autonomous multi-step research workflow
|
|
246
|
+
- Parallel web exploration for speed
|
|
247
|
+
- Structured output with citations
|
|
248
|
+
- Configurable search depth and focus
|
|
249
|
+
- Verification and fact-checking
|
|
250
|
+
|
|
251
|
+
Examples:
|
|
252
|
+
>>> searcher = BasicDeepSearch()
|
|
253
|
+
|
|
254
|
+
# Basic research query
|
|
255
|
+
>>> report = searcher.research("What are the latest developments in quantum computing?")
|
|
256
|
+
|
|
257
|
+
# Research with specific focus
|
|
258
|
+
>>> report = searcher.research(
|
|
259
|
+
... "Impact of AI on healthcare",
|
|
260
|
+
... focus_areas=["medical diagnosis", "drug discovery", "patient care"]
|
|
261
|
+
... )
|
|
262
|
+
|
|
263
|
+
# Deep research with custom parameters
|
|
264
|
+
>>> report = searcher.research(
|
|
265
|
+
... "Sustainable energy solutions 2025",
|
|
266
|
+
... max_sources=20,
|
|
267
|
+
... search_depth="comprehensive",
|
|
268
|
+
... include_verification=True
|
|
269
|
+
... )
|
|
270
|
+
"""
|
|
271
|
+
|
|
272
|
+
def __init__(
|
|
273
|
+
self,
|
|
274
|
+
llm: Optional[AbstractCoreInterface] = None,
|
|
275
|
+
max_tokens: int = 32000,
|
|
276
|
+
max_output_tokens: int = 8000,
|
|
277
|
+
timeout: Optional[float] = None,
|
|
278
|
+
max_parallel_searches: int = 5,
|
|
279
|
+
full_text_extraction: bool = False,
|
|
280
|
+
reflexive_mode: bool = False,
|
|
281
|
+
max_reflexive_iterations: int = 2,
|
|
282
|
+
temperature: float = 0.1, # Low temperature for consistency
|
|
283
|
+
debug_mode: bool = False
|
|
284
|
+
):
|
|
285
|
+
"""Initialize the deep search system
|
|
286
|
+
|
|
287
|
+
Args:
|
|
288
|
+
llm: AbstractCore instance (any provider). If None, uses default Ollama model
|
|
289
|
+
max_tokens: Maximum total tokens for LLM context (default 32000)
|
|
290
|
+
max_output_tokens: Maximum tokens for LLM output generation (default 8000)
|
|
291
|
+
timeout: HTTP request timeout in seconds. None for unlimited timeout (default None)
|
|
292
|
+
max_parallel_searches: Maximum number of parallel web searches (default 5)
|
|
293
|
+
full_text_extraction: Whether to extract full text content from pages (default False)
|
|
294
|
+
reflexive_mode: Whether to enable reflexive research that analyzes gaps and refines (default False)
|
|
295
|
+
max_reflexive_iterations: Maximum number of reflexive refinement cycles (default 2)
|
|
296
|
+
temperature: LLM temperature for consistency (default 0.1 for deterministic outputs)
|
|
297
|
+
debug_mode: Enable comprehensive debug logging (default False)
|
|
298
|
+
"""
|
|
299
|
+
if llm is None:
|
|
300
|
+
try:
|
|
301
|
+
self.llm = create_llm(
|
|
302
|
+
"ollama",
|
|
303
|
+
model="qwen3:4b-instruct-2507-q4_K_M",
|
|
304
|
+
max_tokens=max_tokens,
|
|
305
|
+
max_output_tokens=max_output_tokens,
|
|
306
|
+
temperature=temperature, # Use consistent low temperature
|
|
307
|
+
timeout=timeout
|
|
308
|
+
)
|
|
309
|
+
|
|
310
|
+
except Exception as e:
|
|
311
|
+
error_msg = (
|
|
312
|
+
f"❌ Failed to initialize default Ollama model 'qwen3:4b-instruct-2507-q4_K_M': {e}\n\n"
|
|
313
|
+
"💡 To use the default model, please:\n"
|
|
314
|
+
" 1. Install Ollama from: https://ollama.com/\n"
|
|
315
|
+
" 2. Download the model: ollama pull qwen3:4b-instruct-2507-q4_K_M\n"
|
|
316
|
+
" 3. Start Ollama service\n\n"
|
|
317
|
+
"⚡ For best deep search performance, consider these models:\n"
|
|
318
|
+
" - qwen3-coder:30b (excellent for research and analysis, requires 32GB RAM)\n"
|
|
319
|
+
" - gpt-4o-mini (cloud-based, fast and reliable)\n"
|
|
320
|
+
" - claude-3-5-haiku (cloud-based, excellent reasoning)\n\n"
|
|
321
|
+
"🔧 Alternatively, provide a custom LLM instance:\n"
|
|
322
|
+
" from abstractcore import create_llm\n"
|
|
323
|
+
" from abstractcore.processing import BasicDeepSearch\n"
|
|
324
|
+
" \n"
|
|
325
|
+
" llm = create_llm('openai', model='gpt-4o-mini', max_tokens=32000, max_output_tokens=8000)\n"
|
|
326
|
+
" searcher = BasicDeepSearch(llm)"
|
|
327
|
+
)
|
|
328
|
+
raise RuntimeError(error_msg) from e
|
|
329
|
+
else:
|
|
330
|
+
self.llm = llm
|
|
331
|
+
|
|
332
|
+
self.max_parallel_searches = max_parallel_searches
|
|
333
|
+
self.full_text_extraction = full_text_extraction
|
|
334
|
+
self.reflexive_mode = reflexive_mode
|
|
335
|
+
self.max_reflexive_iterations = max_reflexive_iterations
|
|
336
|
+
self.temperature = temperature
|
|
337
|
+
self.debug_mode = debug_mode
|
|
338
|
+
self.retry_strategy = FeedbackRetry(max_attempts=3)
|
|
339
|
+
print(f"🤖 Initialized LLM: {self.llm.provider} {self.llm.model}")
|
|
340
|
+
|
|
341
|
+
# Debug tracking
|
|
342
|
+
if self.debug_mode:
|
|
343
|
+
self.debug_info = {
|
|
344
|
+
'all_queries': [],
|
|
345
|
+
'all_urls_found': [],
|
|
346
|
+
'relevance_assessments': [],
|
|
347
|
+
'accepted_sources': [],
|
|
348
|
+
'rejected_sources': []
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
def research(
|
|
352
|
+
self,
|
|
353
|
+
query: str,
|
|
354
|
+
focus_areas: Optional[List[str]] = None,
|
|
355
|
+
max_sources: int = 15,
|
|
356
|
+
search_depth: str = "standard", # brief, standard, comprehensive
|
|
357
|
+
include_verification: bool = True,
|
|
358
|
+
output_format: str = "structured" # structured, narrative, executive
|
|
359
|
+
) -> Union[ResearchReport, Dict[str, Any]]:
|
|
360
|
+
"""
|
|
361
|
+
Conduct autonomous deep research on a given query
|
|
362
|
+
|
|
363
|
+
Args:
|
|
364
|
+
query: The research question or topic
|
|
365
|
+
focus_areas: Specific areas to focus on (optional)
|
|
366
|
+
max_sources: Maximum number of sources to gather (default 15)
|
|
367
|
+
search_depth: Research depth - brief, standard, comprehensive (default standard)
|
|
368
|
+
include_verification: Whether to include fact-checking (default True)
|
|
369
|
+
output_format: Output format - structured, narrative, executive (default structured)
|
|
370
|
+
|
|
371
|
+
Returns:
|
|
372
|
+
ResearchReport object or dictionary with research findings
|
|
373
|
+
"""
|
|
374
|
+
logger.info(f"🔍 Starting deep search research: {query}")
|
|
375
|
+
start_time = time.time()
|
|
376
|
+
|
|
377
|
+
try:
|
|
378
|
+
# Initialize source manager with strict limits
|
|
379
|
+
source_manager = SourceManager(max_sources)
|
|
380
|
+
logger.info(f"🎯 Initialized source manager with limit: {max_sources}")
|
|
381
|
+
|
|
382
|
+
# Stage 1: Planning
|
|
383
|
+
logger.info("📋 Stage 1: Planning research approach...")
|
|
384
|
+
research_plan = self._create_research_plan(query, focus_areas, search_depth)
|
|
385
|
+
|
|
386
|
+
# Stage 2: Question Development
|
|
387
|
+
logger.info("❓ Stage 2: Developing search questions...")
|
|
388
|
+
self._develop_search_questions(research_plan, max_sources)
|
|
389
|
+
|
|
390
|
+
# Debug: Show all generated queries
|
|
391
|
+
if self.debug_mode:
|
|
392
|
+
print("\n" + "="*80)
|
|
393
|
+
print("🔍 DEBUG: ALL GENERATED SEARCH QUERIES")
|
|
394
|
+
print("="*80)
|
|
395
|
+
for i, sub_task in enumerate(research_plan.sub_tasks, 1):
|
|
396
|
+
print(f"\n📋 Sub-task {i}: {sub_task.question}")
|
|
397
|
+
print(f"🎯 Focus: {sub_task.focus_area}")
|
|
398
|
+
print(f"🔍 Queries ({len(sub_task.search_queries)}):")
|
|
399
|
+
for j, query in enumerate(sub_task.search_queries, 1):
|
|
400
|
+
print(f" {j}. \"{query}\"")
|
|
401
|
+
self.debug_info['all_queries'].append({
|
|
402
|
+
'sub_task_id': sub_task.id,
|
|
403
|
+
'sub_task_question': sub_task.question,
|
|
404
|
+
'query': query
|
|
405
|
+
})
|
|
406
|
+
print("="*80)
|
|
407
|
+
|
|
408
|
+
# Stage 3: Web Exploration
|
|
409
|
+
logger.info("🌐 Stage 3: Exploring web sources...")
|
|
410
|
+
findings = self._explore_web_sources(research_plan, source_manager)
|
|
411
|
+
|
|
412
|
+
# Stage 4: Report Generation
|
|
413
|
+
logger.info("📝 Stage 4: Generating research report...")
|
|
414
|
+
report = self._generate_report(research_plan, findings, output_format)
|
|
415
|
+
|
|
416
|
+
# Optional: Verification
|
|
417
|
+
if include_verification:
|
|
418
|
+
logger.info("✅ Stage 5: Verifying findings...")
|
|
419
|
+
report = self._verify_report(report, findings)
|
|
420
|
+
|
|
421
|
+
# Stage 6: Reflexive improvement (if enabled)
|
|
422
|
+
if self.reflexive_mode:
|
|
423
|
+
logger.info("🔄 Stage 6: Reflexive analysis and refinement...")
|
|
424
|
+
report = self._reflexive_refinement(report, research_plan, findings)
|
|
425
|
+
|
|
426
|
+
elapsed_time = time.time() - start_time
|
|
427
|
+
logger.info(f"✨ Deep search completed in {elapsed_time:.1f} seconds")
|
|
428
|
+
|
|
429
|
+
# Debug: Show comprehensive summary
|
|
430
|
+
if self.debug_mode:
|
|
431
|
+
self._print_debug_summary()
|
|
432
|
+
|
|
433
|
+
return report
|
|
434
|
+
|
|
435
|
+
except Exception as e:
|
|
436
|
+
logger.error(f"❌ Deep search failed: {e}")
|
|
437
|
+
raise
|
|
438
|
+
|
|
439
|
+
def _create_research_plan(
|
|
440
|
+
self,
|
|
441
|
+
query: str,
|
|
442
|
+
focus_areas: Optional[List[str]],
|
|
443
|
+
search_depth: str
|
|
444
|
+
) -> ResearchPlan:
|
|
445
|
+
"""Stage 1: Create a structured research plan"""
|
|
446
|
+
|
|
447
|
+
# Detect query type and get appropriate focus areas
|
|
448
|
+
query_type = self._detect_query_type(query)
|
|
449
|
+
if not focus_areas:
|
|
450
|
+
focus_areas = self._get_focus_areas_by_type(query_type)
|
|
451
|
+
|
|
452
|
+
# Determine number of sub-tasks based on search depth
|
|
453
|
+
depth_config = {
|
|
454
|
+
"brief": {"sub_tasks": 3, "time_estimate": 5},
|
|
455
|
+
"standard": {"sub_tasks": 5, "time_estimate": 10},
|
|
456
|
+
"comprehensive": {"sub_tasks": 8, "time_estimate": 20}
|
|
457
|
+
}
|
|
458
|
+
config = depth_config.get(search_depth, depth_config["standard"])
|
|
459
|
+
|
|
460
|
+
planning_prompt = f"""
|
|
461
|
+
You are an expert research strategist. Analyze the following research query and create a comprehensive research plan.
|
|
462
|
+
|
|
463
|
+
RESEARCH QUERY: "{query}"
|
|
464
|
+
SEARCH DEPTH: {search_depth}
|
|
465
|
+
TARGET SUB-TASKS: {config['sub_tasks']}
|
|
466
|
+
|
|
467
|
+
Your task is to intelligently decompose this query into focused research themes and specific sub-questions.
|
|
468
|
+
|
|
469
|
+
INSTRUCTIONS:
|
|
470
|
+
1. First, identify the key themes/dimensions that need to be researched to fully answer this query
|
|
471
|
+
2. For each theme, generate 3 specific, actionable research questions
|
|
472
|
+
3. Prioritize the themes based on their importance to answering the original query
|
|
473
|
+
4. Ensure the questions are diverse and cover different aspects of the topic
|
|
474
|
+
|
|
475
|
+
RESEARCH OBJECTIVE: Write a clear 1-2 sentence objective that captures what we're trying to learn.
|
|
476
|
+
|
|
477
|
+
THEMES & SUB-QUESTIONS:
|
|
478
|
+
For each theme you identify, provide:
|
|
479
|
+
- Theme name (e.g., "Technical Overview", "Historical Context", "Current Applications")
|
|
480
|
+
- 3 specific research questions for that theme
|
|
481
|
+
- Priority level (1=essential, 2=important, 3=supplementary)
|
|
482
|
+
|
|
483
|
+
REQUIREMENTS:
|
|
484
|
+
- Generate exactly {config['sub_tasks']} total sub-questions across all themes
|
|
485
|
+
- Each question should be specific enough to guide targeted web searches
|
|
486
|
+
- Questions should be complementary, not overlapping
|
|
487
|
+
- Adapt the themes naturally to the query - don't force artificial categories
|
|
488
|
+
- For person queries: focus on biography, work, impact, affiliations, recent activities
|
|
489
|
+
- For concept queries: focus on definition, applications, development, current state, implications
|
|
490
|
+
- For technology queries: focus on how it works, use cases, advantages/limitations, market adoption
|
|
491
|
+
- For location queries: focus on geography, culture, economy, politics, current events
|
|
492
|
+
- For organization queries: focus on mission, leadership, products/services, market position, recent news
|
|
493
|
+
|
|
494
|
+
The themes should emerge naturally from understanding what someone would want to know about this specific topic.
|
|
495
|
+
"""
|
|
496
|
+
|
|
497
|
+
try:
|
|
498
|
+
# Use structured output for reliable JSON parsing
|
|
499
|
+
plan_model = self.llm.generate(
|
|
500
|
+
planning_prompt,
|
|
501
|
+
temperature=0.3,
|
|
502
|
+
response_model=ResearchPlanModel
|
|
503
|
+
)
|
|
504
|
+
|
|
505
|
+
# Convert Pydantic model to dataclass objects
|
|
506
|
+
sub_tasks = []
|
|
507
|
+
focus_areas = []
|
|
508
|
+
task_counter = 1
|
|
509
|
+
|
|
510
|
+
for theme_model in plan_model.themes:
|
|
511
|
+
focus_areas.append(theme_model.name)
|
|
512
|
+
|
|
513
|
+
# Create sub-tasks from theme questions
|
|
514
|
+
for question in theme_model.questions:
|
|
515
|
+
sub_task = ResearchSubTask(
|
|
516
|
+
id=f"task_{task_counter}",
|
|
517
|
+
question=question,
|
|
518
|
+
focus_area=theme_model.name,
|
|
519
|
+
priority=theme_model.priority,
|
|
520
|
+
search_queries=[],
|
|
521
|
+
findings=[],
|
|
522
|
+
status="pending"
|
|
523
|
+
)
|
|
524
|
+
sub_tasks.append(sub_task)
|
|
525
|
+
task_counter += 1
|
|
526
|
+
|
|
527
|
+
research_plan = ResearchPlan(
|
|
528
|
+
original_query=query,
|
|
529
|
+
research_objective=plan_model.research_objective,
|
|
530
|
+
sub_tasks=sub_tasks,
|
|
531
|
+
estimated_time_minutes=plan_model.estimated_time_minutes,
|
|
532
|
+
focus_areas=focus_areas,
|
|
533
|
+
search_strategy=plan_model.search_strategy
|
|
534
|
+
)
|
|
535
|
+
|
|
536
|
+
logger.info(f"📋 Created research plan with {len(sub_tasks)} sub-tasks")
|
|
537
|
+
return research_plan
|
|
538
|
+
|
|
539
|
+
except Exception as e:
|
|
540
|
+
logger.error(f"Failed to parse research plan: {e}")
|
|
541
|
+
# Fallback to simple plan
|
|
542
|
+
return self._create_fallback_plan(query, focus_areas, search_depth)
|
|
543
|
+
|
|
544
|
+
def _create_fallback_plan(self, query: str, focus_areas: Optional[List[str]], search_depth: str) -> ResearchPlan:
|
|
545
|
+
"""Create a simple fallback research plan if JSON parsing fails"""
|
|
546
|
+
|
|
547
|
+
# Simple sub-tasks based on common research patterns
|
|
548
|
+
sub_tasks = [
|
|
549
|
+
ResearchSubTask("task_1", f"What is {query}? Provide definitions and overview", "overview", 1),
|
|
550
|
+
ResearchSubTask("task_2", f"What are the current developments in {query}?", "current_state", 1),
|
|
551
|
+
ResearchSubTask("task_3", f"What are the key challenges or issues with {query}?", "challenges", 2),
|
|
552
|
+
ResearchSubTask("task_4", f"What are future trends and predictions for {query}?", "future", 2),
|
|
553
|
+
]
|
|
554
|
+
|
|
555
|
+
if search_depth == "comprehensive":
|
|
556
|
+
sub_tasks.extend([
|
|
557
|
+
ResearchSubTask("task_5", f"Who are the key players or experts in {query}?", "stakeholders", 2),
|
|
558
|
+
ResearchSubTask("task_6", f"What are the economic or business implications of {query}?", "economics", 3),
|
|
559
|
+
ResearchSubTask("task_7", f"What are the technical or scientific aspects of {query}?", "technical", 3),
|
|
560
|
+
])
|
|
561
|
+
|
|
562
|
+
return ResearchPlan(
|
|
563
|
+
original_query=query,
|
|
564
|
+
research_objective=f"Comprehensive research on {query}",
|
|
565
|
+
sub_tasks=sub_tasks,
|
|
566
|
+
estimated_time_minutes=10,
|
|
567
|
+
focus_areas=focus_areas or ["overview", "current_state", "challenges", "future"],
|
|
568
|
+
search_strategy="parallel"
|
|
569
|
+
)
|
|
570
|
+
|
|
571
|
+
def _develop_search_questions(self, research_plan: ResearchPlan, max_sources: int) -> None:
|
|
572
|
+
"""Stage 2: Develop specific search queries for each sub-task"""
|
|
573
|
+
|
|
574
|
+
queries_per_task = max(2, max_sources // len(research_plan.sub_tasks))
|
|
575
|
+
|
|
576
|
+
for sub_task in research_plan.sub_tasks:
|
|
577
|
+
query_prompt = f"""
|
|
578
|
+
Generate {queries_per_task} specific, diverse search queries for this research question:
|
|
579
|
+
|
|
580
|
+
RESEARCH QUESTION: {sub_task.question}
|
|
581
|
+
FOCUS AREA: {sub_task.focus_area}
|
|
582
|
+
ORIGINAL QUERY: {research_plan.original_query}
|
|
583
|
+
|
|
584
|
+
Create search queries that:
|
|
585
|
+
1. Use different keywords and phrasings
|
|
586
|
+
2. Target different types of sources (news, academic, industry, etc.)
|
|
587
|
+
3. Include recent time-sensitive queries where relevant (2024, 2025)
|
|
588
|
+
4. Are specific enough to find relevant information
|
|
589
|
+
5. Avoid generic terms that might return irrelevant results
|
|
590
|
+
|
|
591
|
+
Examples of good search queries for quantum computing:
|
|
592
|
+
- "quantum computing hardware advances 2024"
|
|
593
|
+
- "quantum computer error correction breakthrough 2024"
|
|
594
|
+
- "IBM Google quantum computing progress 2024"
|
|
595
|
+
- "quantum computing industry applications 2024"
|
|
596
|
+
- "quantum supremacy achievements 2024"
|
|
597
|
+
|
|
598
|
+
Avoid generic terms like "qubit" alone (which returns lab instruments) - be specific about quantum computing context.
|
|
599
|
+
"""
|
|
600
|
+
|
|
601
|
+
try:
|
|
602
|
+
# Use structured output for reliable parsing
|
|
603
|
+
queries_model = self.llm.generate(
|
|
604
|
+
query_prompt,
|
|
605
|
+
temperature=0.5,
|
|
606
|
+
response_model=SearchQueriesModel
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
sub_task.search_queries = queries_model.queries[:queries_per_task]
|
|
610
|
+
logger.info(f"📝 Generated queries for {sub_task.id}: {sub_task.search_queries}")
|
|
611
|
+
|
|
612
|
+
except Exception as e:
|
|
613
|
+
logger.warning(f"Failed to parse queries for {sub_task.id}, using fallback")
|
|
614
|
+
# Improved fallback queries with better specificity
|
|
615
|
+
base_topic = research_plan.original_query.replace("What are the latest developments in ", "").replace("?", "")
|
|
616
|
+
sub_task.search_queries = [
|
|
617
|
+
f"{base_topic} {sub_task.focus_area} 2024",
|
|
618
|
+
f"{base_topic} advances {sub_task.focus_area} 2024",
|
|
619
|
+
f"{base_topic} research {sub_task.focus_area} latest"
|
|
620
|
+
][:queries_per_task]
|
|
621
|
+
logger.info(f"📝 Using fallback queries for {sub_task.id}: {sub_task.search_queries}")
|
|
622
|
+
|
|
623
|
+
def _explore_web_sources(self, research_plan: ResearchPlan, source_manager: SourceManager) -> List[ResearchFinding]:
|
|
624
|
+
"""Stage 3: Execute web searches and gather evidence"""
|
|
625
|
+
|
|
626
|
+
all_findings = []
|
|
627
|
+
|
|
628
|
+
# Collect all search queries with their sub-task context
|
|
629
|
+
search_tasks = []
|
|
630
|
+
for sub_task in research_plan.sub_tasks:
|
|
631
|
+
for query in sub_task.search_queries:
|
|
632
|
+
search_tasks.append((sub_task.id, query, sub_task.priority))
|
|
633
|
+
|
|
634
|
+
# Sort by priority (1=high priority first)
|
|
635
|
+
search_tasks.sort(key=lambda x: x[2])
|
|
636
|
+
|
|
637
|
+
# Global URL deduplication across all sub-tasks
|
|
638
|
+
processed_urls = set()
|
|
639
|
+
|
|
640
|
+
if self.debug_mode:
|
|
641
|
+
print(f"\n🔍 DEBUG: Starting web exploration with {len(search_tasks)} search tasks")
|
|
642
|
+
print(f"🎯 Source limit: {source_manager.max_sources}")
|
|
643
|
+
for i, (sub_task_id, query, priority) in enumerate(search_tasks, 1):
|
|
644
|
+
print(f" {i}. [{sub_task_id}] \"{query}\" (Priority: {priority})")
|
|
645
|
+
|
|
646
|
+
# Execute searches in parallel with source limit management
|
|
647
|
+
with ThreadPoolExecutor(max_workers=self.max_parallel_searches) as executor:
|
|
648
|
+
# Submit search tasks
|
|
649
|
+
future_to_task = {}
|
|
650
|
+
for sub_task_id, query, priority in search_tasks:
|
|
651
|
+
# Check if we still have capacity
|
|
652
|
+
if source_manager.is_full():
|
|
653
|
+
logger.info(f"🎯 Source limit reached ({source_manager.max_sources}), stopping search submission")
|
|
654
|
+
break
|
|
655
|
+
|
|
656
|
+
future = executor.submit(self._execute_search, sub_task_id, query, source_manager, processed_urls)
|
|
657
|
+
future_to_task[future] = (sub_task_id, query)
|
|
658
|
+
|
|
659
|
+
# Collect results as they complete
|
|
660
|
+
for future in as_completed(future_to_task):
|
|
661
|
+
sub_task_id, query = future_to_task[future]
|
|
662
|
+
try:
|
|
663
|
+
findings = future.result()
|
|
664
|
+
all_findings.extend(findings)
|
|
665
|
+
logger.debug(f"Completed search for {sub_task_id}: {query} - {len(findings)} findings")
|
|
666
|
+
except Exception as e:
|
|
667
|
+
logger.warning(f"Search failed for {sub_task_id} '{query}': {e}")
|
|
668
|
+
|
|
669
|
+
# Early termination if source limit reached
|
|
670
|
+
if source_manager.is_full():
|
|
671
|
+
logger.info(f"🎯 Source limit reached ({source_manager.max_sources}), stopping early")
|
|
672
|
+
break
|
|
673
|
+
|
|
674
|
+
# Update sub-tasks with their findings
|
|
675
|
+
findings_by_task = {}
|
|
676
|
+
for finding in all_findings:
|
|
677
|
+
task_id = finding.sub_task_id
|
|
678
|
+
if task_id not in findings_by_task:
|
|
679
|
+
findings_by_task[task_id] = []
|
|
680
|
+
findings_by_task[task_id].append(finding)
|
|
681
|
+
|
|
682
|
+
for sub_task in research_plan.sub_tasks:
|
|
683
|
+
sub_task.findings = findings_by_task.get(sub_task.id, [])
|
|
684
|
+
sub_task.status = "completed" if sub_task.findings else "failed"
|
|
685
|
+
|
|
686
|
+
logger.info(f"🌐 Gathered {len(all_findings)} findings from web exploration")
|
|
687
|
+
return all_findings
|
|
688
|
+
|
|
689
|
+
def _execute_search(self, sub_task_id: str, query: str, source_manager: SourceManager, processed_urls: set) -> List[ResearchFinding]:
|
|
690
|
+
"""Execute a single web search and extract findings"""
|
|
691
|
+
|
|
692
|
+
findings = []
|
|
693
|
+
timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
|
|
694
|
+
|
|
695
|
+
try:
|
|
696
|
+
# Perform web search
|
|
697
|
+
logger.info(f"🔍 Executing search for: {query}")
|
|
698
|
+
search_results = web_search(query, num_results=5)
|
|
699
|
+
logger.debug(f"📄 Search results length: {len(search_results)}")
|
|
700
|
+
logger.debug(f"📄 Search results preview: {search_results[:500]}")
|
|
701
|
+
|
|
702
|
+
# Parse search results to extract URLs and content
|
|
703
|
+
urls = self._extract_urls_from_search(search_results)
|
|
704
|
+
logger.info(f"🔗 Extracted {len(urls)} URLs from search results")
|
|
705
|
+
|
|
706
|
+
# Deduplicate URLs globally across all sub-tasks
|
|
707
|
+
original_count = len(urls)
|
|
708
|
+
urls = [(url, title) for url, title in urls if url not in processed_urls]
|
|
709
|
+
deduplicated_count = len(urls)
|
|
710
|
+
|
|
711
|
+
# Add new URLs to processed set
|
|
712
|
+
for url, title in urls:
|
|
713
|
+
processed_urls.add(url)
|
|
714
|
+
|
|
715
|
+
if self.debug_mode and original_count > deduplicated_count:
|
|
716
|
+
print(f"\n🔄 DEBUG: URL Deduplication for query \"{query}\":")
|
|
717
|
+
print(f" 📊 Original URLs: {original_count}")
|
|
718
|
+
print(f" 📊 After deduplication: {deduplicated_count}")
|
|
719
|
+
print(f" 📊 Duplicates removed: {original_count - deduplicated_count}")
|
|
720
|
+
|
|
721
|
+
# Debug: Show all URLs found for this query
|
|
722
|
+
if self.debug_mode:
|
|
723
|
+
print(f"\n🔍 DEBUG: URLs found for query \"{query}\":")
|
|
724
|
+
for i, (url, title) in enumerate(urls, 1):
|
|
725
|
+
print(f" {i}. {title}")
|
|
726
|
+
print(f" 🔗 {url}")
|
|
727
|
+
self.debug_info['all_urls_found'].append({
|
|
728
|
+
'query': query,
|
|
729
|
+
'sub_task_id': sub_task_id,
|
|
730
|
+
'url': url,
|
|
731
|
+
'title': title
|
|
732
|
+
})
|
|
733
|
+
|
|
734
|
+
if not urls:
|
|
735
|
+
logger.warning(f"⚠️ No URLs found in search results for query: {query}")
|
|
736
|
+
logger.debug(f"Full search results: {search_results}")
|
|
737
|
+
# Try to create a synthetic finding from the search results if they contain useful information
|
|
738
|
+
if len(search_results) > 100 and "Error searching internet" not in search_results:
|
|
739
|
+
synthetic_finding = ResearchFinding(
|
|
740
|
+
source_url="https://duckduckgo.com/?q=" + query.replace(" ", "+"),
|
|
741
|
+
title=f"Search results for: {query}",
|
|
742
|
+
content=search_results[:500] + "...",
|
|
743
|
+
relevance_score=0.3,
|
|
744
|
+
timestamp=timestamp,
|
|
745
|
+
sub_task_id=sub_task_id
|
|
746
|
+
)
|
|
747
|
+
findings.append(synthetic_finding)
|
|
748
|
+
logger.info(f"✅ Created synthetic finding from search results")
|
|
749
|
+
return findings
|
|
750
|
+
|
|
751
|
+
# Fetch content from promising URLs with source manager control
|
|
752
|
+
for i, (url, title) in enumerate(urls):
|
|
753
|
+
# Check source manager capacity before processing
|
|
754
|
+
if source_manager.is_full():
|
|
755
|
+
logger.info(f"🎯 Source limit reached, stopping URL processing for query: {query}")
|
|
756
|
+
break
|
|
757
|
+
|
|
758
|
+
try:
|
|
759
|
+
logger.debug(f"🌐 Fetching content from URL {i+1}: {url}")
|
|
760
|
+
content = fetch_url(url, timeout=15)
|
|
761
|
+
|
|
762
|
+
if "Error" in content or len(content) < 100:
|
|
763
|
+
logger.debug(f"⚠️ Skipping URL due to fetch error or short content: {url}")
|
|
764
|
+
continue
|
|
765
|
+
|
|
766
|
+
# Extract relevant content using structured parsing or LLM
|
|
767
|
+
if self.full_text_extraction:
|
|
768
|
+
# For full text mode, use custom fetch with more content
|
|
769
|
+
relevant_content = self._extract_relevant_content_full_text(content, query, url)
|
|
770
|
+
else:
|
|
771
|
+
# Standard mode with structured parsing
|
|
772
|
+
relevant_content = self._extract_relevant_content(content, query)
|
|
773
|
+
|
|
774
|
+
if relevant_content:
|
|
775
|
+
# Use LLM to assess content relevance and quality
|
|
776
|
+
quality_assessment = self._assess_content_relevance(relevant_content, query, title)
|
|
777
|
+
|
|
778
|
+
# Debug: Show relevance assessment details
|
|
779
|
+
if self.debug_mode:
|
|
780
|
+
print(f"\n🧠 DEBUG: Relevance Assessment for {title}")
|
|
781
|
+
print(f" 🔗 URL: {url}")
|
|
782
|
+
print(f" 📊 Relevant: {quality_assessment['is_relevant']}")
|
|
783
|
+
print(f" 📈 Score: {quality_assessment['relevance_score']:.2f}")
|
|
784
|
+
print(f" 💭 Reason: {quality_assessment['reason']}")
|
|
785
|
+
print(f" 📝 Content preview: {relevant_content[:200]}...")
|
|
786
|
+
|
|
787
|
+
self.debug_info['relevance_assessments'].append({
|
|
788
|
+
'url': url,
|
|
789
|
+
'title': title,
|
|
790
|
+
'query': query,
|
|
791
|
+
'is_relevant': quality_assessment['is_relevant'],
|
|
792
|
+
'relevance_score': quality_assessment['relevance_score'],
|
|
793
|
+
'reason': quality_assessment['reason'],
|
|
794
|
+
'content_preview': relevant_content[:200]
|
|
795
|
+
})
|
|
796
|
+
|
|
797
|
+
if quality_assessment['is_relevant']:
|
|
798
|
+
# Create source for manager validation
|
|
799
|
+
source_data = {
|
|
800
|
+
'url': url,
|
|
801
|
+
'title': title,
|
|
802
|
+
'content': relevant_content,
|
|
803
|
+
'relevance_score': quality_assessment['relevance_score'],
|
|
804
|
+
'timestamp': timestamp,
|
|
805
|
+
'sub_task_id': sub_task_id
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
# Try to add to source manager (handles deduplication and limits)
|
|
809
|
+
if source_manager.add_source(source_data):
|
|
810
|
+
finding = ResearchFinding(
|
|
811
|
+
source_url=url,
|
|
812
|
+
title=title,
|
|
813
|
+
content=relevant_content,
|
|
814
|
+
relevance_score=quality_assessment['relevance_score'],
|
|
815
|
+
timestamp=timestamp,
|
|
816
|
+
sub_task_id=sub_task_id
|
|
817
|
+
)
|
|
818
|
+
findings.append(finding)
|
|
819
|
+
logger.info(f"✅ Added relevant finding from {url} (score: {quality_assessment['relevance_score']:.2f}) ({len(source_manager.get_sources())}/{source_manager.max_sources})")
|
|
820
|
+
|
|
821
|
+
if self.debug_mode:
|
|
822
|
+
self.debug_info['accepted_sources'].append({
|
|
823
|
+
'url': url,
|
|
824
|
+
'title': title,
|
|
825
|
+
'relevance_score': quality_assessment['relevance_score'],
|
|
826
|
+
'reason': 'Accepted by source manager'
|
|
827
|
+
})
|
|
828
|
+
else:
|
|
829
|
+
logger.debug(f"🎯 Source not added (duplicate or limit reached): {url}")
|
|
830
|
+
if self.debug_mode:
|
|
831
|
+
self.debug_info['rejected_sources'].append({
|
|
832
|
+
'url': url,
|
|
833
|
+
'title': title,
|
|
834
|
+
'relevance_score': quality_assessment['relevance_score'],
|
|
835
|
+
'reason': 'Duplicate or source limit reached'
|
|
836
|
+
})
|
|
837
|
+
else:
|
|
838
|
+
logger.info(f"🚫 Content filtered out from {url}: {quality_assessment['reason']}")
|
|
839
|
+
if self.debug_mode:
|
|
840
|
+
self.debug_info['rejected_sources'].append({
|
|
841
|
+
'url': url,
|
|
842
|
+
'title': title,
|
|
843
|
+
'relevance_score': quality_assessment['relevance_score'],
|
|
844
|
+
'reason': f"Not relevant: {quality_assessment['reason']}"
|
|
845
|
+
})
|
|
846
|
+
else:
|
|
847
|
+
logger.debug(f"⚠️ No relevant content extracted from {url}")
|
|
848
|
+
if self.debug_mode:
|
|
849
|
+
self.debug_info['rejected_sources'].append({
|
|
850
|
+
'url': url,
|
|
851
|
+
'title': title,
|
|
852
|
+
'relevance_score': 0.0,
|
|
853
|
+
'reason': 'No relevant content could be extracted'
|
|
854
|
+
})
|
|
855
|
+
|
|
856
|
+
except Exception as e:
|
|
857
|
+
logger.warning(f"Failed to fetch {url}: {e}")
|
|
858
|
+
continue
|
|
859
|
+
|
|
860
|
+
except Exception as e:
|
|
861
|
+
logger.error(f"Search execution failed for '{query}': {e}")
|
|
862
|
+
|
|
863
|
+
logger.info(f"📊 Search completed for '{query}': {len(findings)} findings")
|
|
864
|
+
return findings
|
|
865
|
+
|
|
866
|
+
def _check_authority_indicators(self, title: str, content: str, query: str) -> Dict[str, Any]:
|
|
867
|
+
"""Check for high-authority source indicators that should be prioritized"""
|
|
868
|
+
|
|
869
|
+
title_lower = title.lower()
|
|
870
|
+
content_lower = content.lower()
|
|
871
|
+
query_lower = query.lower()
|
|
872
|
+
|
|
873
|
+
# Extract potential person name from query
|
|
874
|
+
query_words = query_lower.split()
|
|
875
|
+
potential_names = [word for word in query_words if word.istitle() or len(word) > 3]
|
|
876
|
+
|
|
877
|
+
# High-authority indicators
|
|
878
|
+
authority_indicators = [
|
|
879
|
+
# Academic/Professional profiles
|
|
880
|
+
('google scholar', 0.95, 'Official Google Scholar profile'),
|
|
881
|
+
('orcid', 0.95, 'Official ORCID researcher profile'),
|
|
882
|
+
('researchgate', 0.90, 'ResearchGate academic profile'),
|
|
883
|
+
('linkedin', 0.85, 'Professional LinkedIn profile'),
|
|
884
|
+
('academia.edu', 0.85, 'Academia.edu academic profile'),
|
|
885
|
+
|
|
886
|
+
# Institutional websites
|
|
887
|
+
('university', 0.90, 'University/academic institution'),
|
|
888
|
+
('institute', 0.90, 'Research institute'),
|
|
889
|
+
('laboratory', 0.85, 'Research laboratory'),
|
|
890
|
+
('.edu', 0.90, 'Educational institution domain'),
|
|
891
|
+
('.ac.', 0.90, 'Academic institution domain'),
|
|
892
|
+
|
|
893
|
+
# Personal/official websites
|
|
894
|
+
('personal website', 0.95, 'Personal/official website'),
|
|
895
|
+
('official site', 0.95, 'Official website'),
|
|
896
|
+
]
|
|
897
|
+
|
|
898
|
+
# Check for personal name match in title/content
|
|
899
|
+
name_match_score = 0.0
|
|
900
|
+
if potential_names:
|
|
901
|
+
for name in potential_names:
|
|
902
|
+
if name in title_lower or name in content_lower:
|
|
903
|
+
name_match_score = 0.8
|
|
904
|
+
break
|
|
905
|
+
|
|
906
|
+
# Check authority indicators
|
|
907
|
+
for indicator, base_score, reason in authority_indicators:
|
|
908
|
+
if indicator in title_lower or indicator in content_lower:
|
|
909
|
+
final_score = min(1.0, base_score + name_match_score * 0.2)
|
|
910
|
+
return {
|
|
911
|
+
'is_high_authority': True,
|
|
912
|
+
'authority_score': final_score,
|
|
913
|
+
'reason': reason + (f' with name match' if name_match_score > 0 else '')
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
# Check for personal domain (e.g., lpalbou.info)
|
|
917
|
+
if any(name in title_lower for name in potential_names if len(name) > 3):
|
|
918
|
+
return {
|
|
919
|
+
'is_high_authority': True,
|
|
920
|
+
'authority_score': 0.95,
|
|
921
|
+
'reason': 'Personal domain/website matching query subject'
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
return {
|
|
925
|
+
'is_high_authority': False,
|
|
926
|
+
'authority_score': 0.0,
|
|
927
|
+
'reason': 'No high-authority indicators found'
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
def _assess_content_relevance(self, content: str, query: str, title: str) -> Dict[str, Any]:
|
|
931
|
+
"""Use LLM to quickly assess if content is relevant to the research query"""
|
|
932
|
+
|
|
933
|
+
# First check for high-authority sources that should be prioritized
|
|
934
|
+
authority_indicators = self._check_authority_indicators(title, content, query)
|
|
935
|
+
if authority_indicators['is_high_authority']:
|
|
936
|
+
return {
|
|
937
|
+
'is_relevant': True,
|
|
938
|
+
'relevance_score': authority_indicators['authority_score'],
|
|
939
|
+
'reason': f"High-authority source: {authority_indicators['reason']}"
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
# Limit content for efficient assessment
|
|
943
|
+
assessment_content = content[:1500] + "..." if len(content) > 1500 else content
|
|
944
|
+
|
|
945
|
+
assessment_prompt = f"""
|
|
946
|
+
Assess if this content contains meaningful information related to the research query.
|
|
947
|
+
|
|
948
|
+
RESEARCH QUERY: {query}
|
|
949
|
+
SOURCE TITLE: {title}
|
|
950
|
+
|
|
951
|
+
CONTENT:
|
|
952
|
+
{assessment_content}
|
|
953
|
+
|
|
954
|
+
Respond with ONLY a JSON object in this exact format:
|
|
955
|
+
{{
|
|
956
|
+
"is_relevant": true/false,
|
|
957
|
+
"relevance_score": 0.0-1.0,
|
|
958
|
+
"reason": "brief explanation"
|
|
959
|
+
}}
|
|
960
|
+
|
|
961
|
+
CRITICAL: Mark as RELEVANT (true) if the content:
|
|
962
|
+
- Contains ANY substantive information that could help answer or relate to the query
|
|
963
|
+
- Provides facts, data, explanations, or details about the query topic
|
|
964
|
+
- Is from authoritative sources (official websites, academic profiles, institutional pages)
|
|
965
|
+
- Has meaningful textual content beyond navigation elements
|
|
966
|
+
- Shows biographical, professional, or academic information when querying about a person
|
|
967
|
+
|
|
968
|
+
Mark as NOT RELEVANT (false) ONLY if the content:
|
|
969
|
+
- Is completely unrelated to the query topic (different person, concept, etc.)
|
|
970
|
+
- Contains ONLY navigation menus, headers, footers, or structural elements
|
|
971
|
+
- Shows clear error messages, access restrictions, or "page not found"
|
|
972
|
+
- Is purely promotional/advertising without ANY informational value
|
|
973
|
+
- Discusses entirely different subjects with no connection to the query
|
|
974
|
+
|
|
975
|
+
BE GENEROUS with relevance assessment - when in doubt, mark as relevant.
|
|
976
|
+
"""
|
|
977
|
+
|
|
978
|
+
try:
|
|
979
|
+
response = self.llm.generate(assessment_prompt, temperature=0.1)
|
|
980
|
+
|
|
981
|
+
# Extract text from response (handle different response types)
|
|
982
|
+
if hasattr(response, 'text'):
|
|
983
|
+
response_text = response.text
|
|
984
|
+
elif hasattr(response, 'content'):
|
|
985
|
+
response_text = response.content
|
|
986
|
+
else:
|
|
987
|
+
response_text = str(response)
|
|
988
|
+
|
|
989
|
+
# Parse JSON from response
|
|
990
|
+
import json
|
|
991
|
+
json_start = response_text.find('{')
|
|
992
|
+
json_end = response_text.rfind('}') + 1
|
|
993
|
+
|
|
994
|
+
if json_start != -1 and json_end > json_start:
|
|
995
|
+
json_text = response_text[json_start:json_end]
|
|
996
|
+
assessment = json.loads(json_text)
|
|
997
|
+
|
|
998
|
+
# Validate and normalize
|
|
999
|
+
if 'is_relevant' in assessment and 'relevance_score' in assessment:
|
|
1000
|
+
assessment['relevance_score'] = max(0.0, min(1.0, float(assessment['relevance_score'])))
|
|
1001
|
+
assessment['reason'] = assessment.get('reason', 'No reason provided')
|
|
1002
|
+
return assessment
|
|
1003
|
+
|
|
1004
|
+
# Fallback if JSON parsing fails
|
|
1005
|
+
logger.debug(f"Content relevance assessment JSON parsing failed, using fallback")
|
|
1006
|
+
return self._fallback_relevance_assessment(content, query)
|
|
1007
|
+
|
|
1008
|
+
except Exception as e:
|
|
1009
|
+
logger.debug(f"Content relevance assessment failed: {e}")
|
|
1010
|
+
return self._fallback_relevance_assessment(content, query)
|
|
1011
|
+
|
|
1012
|
+
def _fallback_relevance_assessment(self, content: str, query: str) -> Dict[str, Any]:
|
|
1013
|
+
"""Fallback relevance assessment using general content quality heuristics"""
|
|
1014
|
+
|
|
1015
|
+
content_lower = content.lower()
|
|
1016
|
+
words = content.split()
|
|
1017
|
+
word_count = len(words)
|
|
1018
|
+
|
|
1019
|
+
# Check for obvious error/empty content indicators
|
|
1020
|
+
error_indicators = [
|
|
1021
|
+
'page not found', '404 error', '403 error', '500 error',
|
|
1022
|
+
'access denied', 'login required', 'sign in required',
|
|
1023
|
+
'javascript required', 'cookies required', 'enable javascript',
|
|
1024
|
+
'subscribe to continue', 'sign up to read', 'premium content',
|
|
1025
|
+
'page does not exist', 'content not available'
|
|
1026
|
+
]
|
|
1027
|
+
|
|
1028
|
+
has_errors = any(indicator in content_lower for indicator in error_indicators)
|
|
1029
|
+
|
|
1030
|
+
# Check for navigation-heavy content (low information density)
|
|
1031
|
+
navigation_indicators = ['home', 'about', 'contact', 'menu', 'navigation', 'footer', 'header']
|
|
1032
|
+
nav_count = sum(1 for indicator in navigation_indicators if indicator in content_lower)
|
|
1033
|
+
nav_ratio = nav_count / max(word_count, 1)
|
|
1034
|
+
|
|
1035
|
+
# Basic content quality assessment
|
|
1036
|
+
if has_errors:
|
|
1037
|
+
return {
|
|
1038
|
+
'is_relevant': False,
|
|
1039
|
+
'relevance_score': 0.0,
|
|
1040
|
+
'reason': 'Contains error messages or access restrictions'
|
|
1041
|
+
}
|
|
1042
|
+
|
|
1043
|
+
if word_count < 10:
|
|
1044
|
+
return {
|
|
1045
|
+
'is_relevant': False,
|
|
1046
|
+
'relevance_score': 0.0,
|
|
1047
|
+
'reason': f'Too little content ({word_count} words)'
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
if nav_ratio > 0.3: # More than 30% navigation terms
|
|
1051
|
+
return {
|
|
1052
|
+
'is_relevant': False,
|
|
1053
|
+
'relevance_score': 0.2,
|
|
1054
|
+
'reason': 'Content appears to be mostly navigation elements'
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
# If content passes basic quality checks, calculate relevance
|
|
1058
|
+
query_words = [word.lower().strip('.,!?;:"()[]{}') for word in query.split() if len(word) > 2]
|
|
1059
|
+
|
|
1060
|
+
if not query_words:
|
|
1061
|
+
# If query has no meaningful words, accept content based on quality
|
|
1062
|
+
relevance_score = 0.7 if word_count >= 50 else 0.5
|
|
1063
|
+
return {
|
|
1064
|
+
'is_relevant': True,
|
|
1065
|
+
'relevance_score': relevance_score,
|
|
1066
|
+
'reason': f'Query has no key terms, accepting based on content quality ({word_count} words)'
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
# Calculate keyword overlap
|
|
1070
|
+
matches = sum(1 for word in query_words if word in content_lower)
|
|
1071
|
+
keyword_relevance = matches / len(query_words)
|
|
1072
|
+
|
|
1073
|
+
# Content length bonus (longer content more likely to be informative)
|
|
1074
|
+
length_bonus = min(0.3, word_count / 200) # Up to 0.3 bonus for 200+ words
|
|
1075
|
+
|
|
1076
|
+
final_relevance = keyword_relevance + length_bonus
|
|
1077
|
+
is_relevant = final_relevance >= 0.4 # Require meaningful keyword overlap, don't accept long irrelevant content
|
|
1078
|
+
|
|
1079
|
+
return {
|
|
1080
|
+
'is_relevant': is_relevant,
|
|
1081
|
+
'relevance_score': min(1.0, final_relevance),
|
|
1082
|
+
'reason': f'{matches}/{len(query_words)} keywords, {word_count} words, score: {final_relevance:.2f}'
|
|
1083
|
+
}
|
|
1084
|
+
|
|
1085
|
+
def _print_debug_summary(self):
|
|
1086
|
+
"""Print comprehensive debug summary"""
|
|
1087
|
+
print("\n" + "="*80)
|
|
1088
|
+
print("🔍 DEBUG SUMMARY: COMPLETE RESEARCH PROCESS")
|
|
1089
|
+
print("="*80)
|
|
1090
|
+
|
|
1091
|
+
# Query summary
|
|
1092
|
+
print(f"\n📋 TOTAL QUERIES GENERATED: {len(self.debug_info['all_queries'])}")
|
|
1093
|
+
query_by_subtask = {}
|
|
1094
|
+
for q in self.debug_info['all_queries']:
|
|
1095
|
+
subtask = q['sub_task_question']
|
|
1096
|
+
if subtask not in query_by_subtask:
|
|
1097
|
+
query_by_subtask[subtask] = []
|
|
1098
|
+
query_by_subtask[subtask].append(q['query'])
|
|
1099
|
+
|
|
1100
|
+
for subtask, queries in query_by_subtask.items():
|
|
1101
|
+
print(f"\n🎯 {subtask}")
|
|
1102
|
+
for i, query in enumerate(queries, 1):
|
|
1103
|
+
print(f" {i}. \"{query}\"")
|
|
1104
|
+
|
|
1105
|
+
# URL summary
|
|
1106
|
+
print(f"\n🔗 TOTAL URLS DISCOVERED: {len(self.debug_info['all_urls_found'])}")
|
|
1107
|
+
urls_by_query = {}
|
|
1108
|
+
for url_info in self.debug_info['all_urls_found']:
|
|
1109
|
+
query = url_info['query']
|
|
1110
|
+
if query not in urls_by_query:
|
|
1111
|
+
urls_by_query[query] = []
|
|
1112
|
+
urls_by_query[query].append((url_info['title'], url_info['url']))
|
|
1113
|
+
|
|
1114
|
+
for query, urls in urls_by_query.items():
|
|
1115
|
+
print(f"\n🔍 Query: \"{query}\" → {len(urls)} URLs")
|
|
1116
|
+
for i, (title, url) in enumerate(urls, 1):
|
|
1117
|
+
print(f" {i}. {title}")
|
|
1118
|
+
print(f" 🔗 {url}")
|
|
1119
|
+
|
|
1120
|
+
# Relevance assessment summary
|
|
1121
|
+
print(f"\n🧠 RELEVANCE ASSESSMENTS: {len(self.debug_info['relevance_assessments'])}")
|
|
1122
|
+
relevant_count = sum(1 for a in self.debug_info['relevance_assessments'] if a['is_relevant'])
|
|
1123
|
+
irrelevant_count = len(self.debug_info['relevance_assessments']) - relevant_count
|
|
1124
|
+
|
|
1125
|
+
print(f" ✅ Relevant: {relevant_count}")
|
|
1126
|
+
print(f" ❌ Not Relevant: {irrelevant_count}")
|
|
1127
|
+
|
|
1128
|
+
if self.debug_info['relevance_assessments']:
|
|
1129
|
+
print(f"\n📊 DETAILED ASSESSMENTS:")
|
|
1130
|
+
for i, assessment in enumerate(self.debug_info['relevance_assessments'], 1):
|
|
1131
|
+
status = "✅" if assessment['is_relevant'] else "❌"
|
|
1132
|
+
print(f"\n {i}. {status} {assessment['title']}")
|
|
1133
|
+
print(f" 🔗 {assessment['url']}")
|
|
1134
|
+
print(f" 📈 Score: {assessment['relevance_score']:.2f}")
|
|
1135
|
+
print(f" 💭 Reason: {assessment['reason']}")
|
|
1136
|
+
print(f" 📝 Preview: {assessment['content_preview']}...")
|
|
1137
|
+
|
|
1138
|
+
# Final source summary
|
|
1139
|
+
print(f"\n📚 FINAL SOURCES:")
|
|
1140
|
+
print(f" ✅ Accepted: {len(self.debug_info['accepted_sources'])}")
|
|
1141
|
+
print(f" ❌ Rejected: {len(self.debug_info['rejected_sources'])}")
|
|
1142
|
+
|
|
1143
|
+
if self.debug_info['accepted_sources']:
|
|
1144
|
+
print(f"\n✅ ACCEPTED SOURCES:")
|
|
1145
|
+
for i, source in enumerate(self.debug_info['accepted_sources'], 1):
|
|
1146
|
+
print(f" {i}. {source['title']} (Score: {source['relevance_score']:.2f})")
|
|
1147
|
+
print(f" 🔗 {source['url']}")
|
|
1148
|
+
print(f" ✅ {source['reason']}")
|
|
1149
|
+
|
|
1150
|
+
if self.debug_info['rejected_sources']:
|
|
1151
|
+
print(f"\n❌ REJECTED SOURCES:")
|
|
1152
|
+
for i, source in enumerate(self.debug_info['rejected_sources'], 1):
|
|
1153
|
+
print(f" {i}. {source['title']} (Score: {source['relevance_score']:.2f})")
|
|
1154
|
+
print(f" 🔗 {source['url']}")
|
|
1155
|
+
print(f" ❌ {source['reason']}")
|
|
1156
|
+
|
|
1157
|
+
print("\n" + "="*80)
|
|
1158
|
+
print("🔍 END DEBUG SUMMARY")
|
|
1159
|
+
print("="*80)
|
|
1160
|
+
|
|
1161
|
+
    def _detect_query_type(self, query: str) -> str:
        """Detect the type of query to generate appropriate research plan"""
        query_lower = query.lower()

        # Person indicators
        person_indicators = [
            'who is', 'biography of', 'background of', 'profile of',
            'researcher', 'scientist', 'professor', 'dr.', 'phd'
        ]

        # Concept/idea indicators
        concept_indicators = [
            'what is', 'explain', 'definition of', 'concept of', 'theory of',
            'how does', 'why does', 'principle of', 'mechanism of'
        ]

        # Location/country indicators
        location_indicators = [
            'country', 'city', 'region', 'geography of', 'history of',
            'economy of', 'politics of', 'culture of'
        ]

        # Technology/product indicators
        technology_indicators = [
            'technology', 'software', 'algorithm', 'method', 'technique',
            'system', 'platform', 'tool', 'framework'
        ]

        # Company/organization indicators
        organization_indicators = [
            'company', 'organization', 'institution', 'startup', 'business',
            'corporation', 'agency', 'foundation'
        ]

        # Check for patterns
        if any(indicator in query_lower for indicator in person_indicators):
            return "person"
        elif any(indicator in query_lower for indicator in concept_indicators):
            return "concept"
        elif any(indicator in query_lower for indicator in location_indicators):
            return "location"
        elif any(indicator in query_lower for indicator in technology_indicators):
            return "technology"
        elif any(indicator in query_lower for indicator in organization_indicators):
            return "organization"
        else:
            # Default based on query structure
            words = query_lower.split()
            if len(words) <= 3 and any(word.istitle() for word in query.split()):
                return "person"  # Likely a name
            else:
                return "concept"  # General topic

    def _get_focus_areas_by_type(self, query_type: str) -> List[str]:
        """Get appropriate focus areas based on query type"""
        focus_areas_map = {
            "person": [
                "Professional Biography", "Academic Output", "Industry Impact",
                "Public Presence", "Professional Affiliations"
            ],
            "concept": [
                "Definition & Overview", "Historical Development", "Key Applications",
                "Current Research", "Future Implications"
            ],
            "location": [
                "Geography & Demographics", "History & Culture", "Economy & Politics",
                "Current Events", "International Relations"
            ],
            "technology": [
                "Technical Overview", "Development History", "Current Applications",
                "Market Analysis", "Future Trends"
            ],
            "organization": [
                "Company Overview", "Business Model", "Leadership & History",
                "Market Position", "Recent Developments"
            ]
        }

        return focus_areas_map.get(query_type, focus_areas_map["concept"])

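The two helpers above classify a query and pick focus areas with plain keyword matching, no LLM call. A minimal standalone sketch of the same heuristic (trimmed indicator lists; illustrative only, not part of the released file):

def detect_query_type(query: str) -> str:
    # Trimmed rendition of the keyword heuristic in _detect_query_type.
    q = query.lower()
    if any(k in q for k in ('who is', 'biography of', 'researcher')):
        return "person"
    if any(k in q for k in ('what is', 'explain', 'how does')):
        return "concept"
    # Short, title-cased queries default to "person", everything else to "concept".
    return "person" if len(q.split()) <= 3 and any(w.istitle() for w in query.split()) else "concept"

print(detect_query_type("who is Marie Curie"))           # person
print(detect_query_type("what is federated learning"))   # concept
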
    def _extract_urls_from_search(self, search_results: str) -> List[tuple]:
        """Extract URLs and titles from search results"""
        urls = []
        lines = search_results.split('\n')

        current_title = ""
        for line in lines:
            line = line.strip()

            # Look for numbered results (1., 2., etc.)
            if line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')):
                current_title = line[2:].strip()

            # Look for URLs with link emoji
            elif line.startswith('🔗'):
                url = line.replace('🔗', '').strip()
                if url.startswith('http'):
                    urls.append((url, current_title or "Web Result"))

            # Also look for direct URLs in the text (fallback)
            elif 'http' in line and ('://' in line):
                import re
                url_matches = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', line)
                for url in url_matches:
                    # Clean up URL (remove trailing punctuation)
                    url = url.rstrip('.,;:!?)')
                    if url not in [u[0] for u in urls]:  # Avoid duplicates
                        title = current_title or f"Web Result from {url.split('/')[2]}"
                        urls.append((url, title))

        # If no URLs found, try a more aggressive search
        if not urls:
            import re
            all_urls = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', search_results)
            for url in all_urls:
                url = url.rstrip('.,;:!?)')
                title = f"Web Result from {url.split('/')[2] if '/' in url else 'Unknown'}"
                urls.append((url, title))

        logger.debug(f"🔗 URL extraction found {len(urls)} URLs: {[u[0] for u in urls[:3]]}")
        return urls

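The fallback path above leans on one regex to recover bare URLs from unstructured text, then strips trailing punctuation. The same two steps in isolation (the sample string is hypothetical):

import re

sample = 'See https://example.org/a, and (https://example.org/b).'
urls = [u.rstrip('.,;:!?)') for u in re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', sample)]
print(urls)  # ['https://example.org/a', 'https://example.org/b']
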
    def _extract_relevant_content(self, content: str, query: str) -> Optional[str]:
        """Extract relevant content from fetched web page using structured parsing"""

        # First, try to parse the structured output from fetch_url
        structured_content = self._parse_fetch_url_output(content)

        if structured_content:
            # Use structured data for more efficient extraction
            return self._extract_from_structured_content(structured_content, query)
        else:
            # Fallback to LLM-based extraction for unstructured content
            return self._extract_with_llm(content, query)

    def _parse_fetch_url_output(self, content: str) -> Optional[Dict[str, Any]]:
        """Parse structured output from fetch_url tool"""
        try:
            # Look for the structured sections in fetch_url output
            if "📄 Content Analysis:" not in content:
                return None

            structured = {}
            lines = content.split('\n')

            for i, line in enumerate(lines):
                line = line.strip()

                # Extract title
                if line.startswith('📰 Title:'):
                    structured['title'] = line.replace('📰 Title:', '').strip()

                # Extract description
                elif line.startswith('📝 Description:'):
                    structured['description'] = line.replace('📝 Description:', '').strip()

                # Extract headings
                elif line.startswith('📋 Headings'):
                    headings = []
                    j = i + 1
                    while j < len(lines) and lines[j].strip().startswith('•'):
                        heading = lines[j].strip().replace('• ', '')
                        headings.append(heading)
                        j += 1
                    structured['headings'] = headings

                # Extract text content preview
                elif line.startswith('📄 Text Content Preview:'):
                    # Collect multiple lines of text content
                    text_lines = []
                    j = i + 1
                    while j < len(lines) and not lines[j].strip().startswith(('📊', '📄', '🔗', '📋', '📰', '📝')):
                        if lines[j].strip():  # Skip empty lines
                            text_lines.append(lines[j].strip())
                        j += 1
                    if text_lines:
                        structured['text_preview'] = ' '.join(text_lines)

            # Store raw content for full text extraction if needed
            if self.full_text_extraction:
                structured['_raw_content'] = content

            return structured if structured else None

        except Exception as e:
            logger.debug(f"Failed to parse fetch_url output: {e}")
            return None

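The parser above keys entirely off the emoji section markers that fetch_url emits, so a payload without them simply yields None. A toy payload showing one field being recovered (the payload text is fabricated; the marker strings are the ones checked above):

sample_fetch = (
    "📄 Content Analysis:\n"
    "📰 Title: Example Domain\n"
    "📝 Description: Placeholder page used in documentation.\n"
)
title_line = next(l for l in sample_fetch.splitlines() if l.startswith('📰 Title:'))
print(title_line.replace('📰 Title:', '').strip())  # Example Domain
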
    def _extract_from_structured_content(self, structured: Dict[str, Any], query: str) -> Optional[str]:
        """Extract relevant information from structured content"""

        # Build content summary from structured data
        content_parts = []

        # Add title if relevant
        title = structured.get('title', '')
        if title and any(word.lower() in title.lower() for word in query.split()):
            content_parts.append(f"**Title:** {title}")

        # Add description if available
        description = structured.get('description', '')
        if description:
            content_parts.append(f"**Summary:** {description}")

        # Add relevant headings
        headings = structured.get('headings', [])
        relevant_headings = []
        query_words = [word.lower() for word in query.split()]

        for heading in headings[:10]:  # Limit to first 10 headings
            if any(word in heading.lower() for word in query_words):
                relevant_headings.append(heading)

        if relevant_headings:
            content_parts.append(f"**Key Sections:** {'; '.join(relevant_headings[:5])}")

        # Add text preview (longer or full text based on mode)
        text_preview = structured.get('text_preview', '')
        if text_preview:
            if self.full_text_extraction:
                # In full text mode, try to get more content from fetch_url
                full_text = self._extract_full_text_from_fetch_output(structured.get('_raw_content', ''))
                if full_text and len(full_text) > len(text_preview):
                    content_parts.append(f"**Full Content:** {full_text}")
                else:
                    content_parts.append(f"**Content:** {text_preview}")
            else:
                # Standard mode: use longer preview (up to 1000 chars)
                preview = text_preview[:1000] + "..." if len(text_preview) > 1000 else text_preview
                content_parts.append(f"**Content:** {preview}")

        if not content_parts:
            return None

        # Combine and validate relevance
        combined_content = '\n'.join(content_parts)

        # Quick relevance check - if query words appear in the content
        query_words_lower = [word.lower() for word in query.split() if len(word) > 2]
        content_lower = combined_content.lower()

        if not query_words_lower:
            # Nothing left to score against (all query words are very short); keep the summary.
            return combined_content

        relevance_score = sum(1 for word in query_words_lower if word in content_lower) / len(query_words_lower)

        if relevance_score < 0.2:  # Less than 20% of query words found
            return None

        return combined_content

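The final gate above is a coverage ratio: the fraction of query words longer than two characters that appear anywhere in the assembled summary, with anything under 0.2 discarded. The check in isolation (strings are hypothetical):

query = "quantum error correction"
content = "Recent work on quantum codes improves error rates."
words = [w.lower() for w in query.split() if len(w) > 2]
ratio = sum(1 for w in words if w in content.lower()) / len(words)
print(round(ratio, 2))  # 0.67 -- 'quantum' and 'error' match, 'correction' does not
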
    def _extract_full_text_from_fetch_output(self, raw_content: str) -> str:
        """Extract full clean text content from fetch_url output"""
        if not raw_content or "📄 Text Content Preview:" not in raw_content:
            return ""

        try:
            # Find the text content section
            lines = raw_content.split('\n')
            text_lines = []
            in_text_section = False

            for line in lines:
                line_stripped = line.strip()

                # Start collecting after "Text Content Preview:"
                if line_stripped.startswith('📄 Text Content Preview:'):
                    in_text_section = True
                    continue

                # Stop at next section or metadata
                elif in_text_section and line_stripped.startswith(('📊', '📄', '🔗', '📋', '📰', '📝', '⏰', '✅')):
                    break

                # Collect text lines
                elif in_text_section and line_stripped:
                    # Skip obvious metadata or navigation
                    if not any(skip in line_stripped.lower() for skip in [
                        'total text length:', 'characters', 'download image',
                        'press inquiries', 'contact:', 'email:', 'phone:',
                        'breadcrumb', 'navigation', 'menu', 'footer'
                    ]):
                        text_lines.append(line_stripped)

            if text_lines:
                full_text = ' '.join(text_lines)
                # Clean up excessive whitespace
                full_text = ' '.join(full_text.split())
                return full_text

            return ""

        except Exception as e:
            logger.debug(f"Failed to extract full text: {e}")
            return ""

    def _extract_relevant_content_full_text(self, content: str, query: str, url: str) -> Optional[str]:
        """Extract relevant content using full text mode with custom processing"""

        # First try structured parsing
        structured_content = self._parse_fetch_url_output(content)

        if structured_content:
            # Get the full text if available
            full_text = self._extract_full_text_from_fetch_output(content)

            if full_text and len(full_text) > 200:
                # Use LLM to extract relevant parts from the full text
                llm_result = self._extract_with_llm_full_text(full_text, query)
                if llm_result:
                    return llm_result

            # Always try structured extraction as fallback
            structured_result = self._extract_from_structured_content(structured_content, query)
            if structured_result:
                return structured_result

        # Final fallback to standard LLM extraction
        return self._extract_with_llm(content, query)

    def _extract_with_llm_full_text(self, full_text: str, query: str) -> Optional[str]:
        """Extract relevant content from full text using LLM"""

        # Limit content length for LLM processing (but allow more than standard mode)
        max_length = 15000  # Nearly double the 8,000-character cap used in standard mode
        if len(full_text) > max_length:
            # Try to truncate at sentence boundary
            truncated = full_text[:max_length]
            last_period = truncated.rfind('.')
            if last_period > max_length - 500:  # If period is reasonably close to end
                full_text = truncated[:last_period + 1]
            else:
                full_text = truncated + "..."

        extraction_prompt = f"""
Extract the most relevant and comprehensive information from this full text content for the research query.

RESEARCH QUERY: {query}

FULL TEXT CONTENT:
{full_text}

Extract 3-5 key points that directly answer or relate to the research query.
Focus on:
- Specific facts, data, statistics, and recent developments
- Technical details and performance metrics
- Key findings and authoritative statements
- Recent breakthroughs or announcements
- Comparative information and benchmarks

Format as a comprehensive summary (max 800 words) with the most important information.
Include specific details like numbers, dates, company names, and technical specifications.
If the content is not relevant to the query, respond with "NOT_RELEVANT".
"""

        try:
            response = self.llm.generate(extraction_prompt, temperature=0.2)

            # Extract text from response (handle different response object types)
            if hasattr(response, 'text'):
                response_text = response.text
            elif hasattr(response, 'content'):
                response_text = response.content
            else:
                response_text = str(response)

            extracted = response_text.strip()

            if extracted == "NOT_RELEVANT" or len(extracted) < 100:
                return None

            return extracted

        except Exception as e:
            logger.debug(f"Full text extraction failed: {e}")
            return None

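The truncation above prefers to cut at the last period when one falls within 500 characters of the cap, so the model never receives a half sentence. The same logic at toy scale (with a 40-character cap the threshold is trivially met; at the real 15,000-character cap the period must sit near the end):

text = "First sentence. Second sentence runs long here."
cap = 40
cut = text[:cap]
last = cut.rfind('.')
text = cut[:last + 1] if last > cap - 500 else cut + "..."
print(text)  # First sentence.
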
    def _extract_with_llm(self, content: str, query: str) -> Optional[str]:
        """Fallback LLM-based extraction for unstructured content"""

        # Limit content length for processing
        if len(content) > 8000:
            content = content[:8000] + "..."

        extraction_prompt = f"""
Extract the most relevant information from this content for the research query.

RESEARCH QUERY: {query}

CONTENT:
{content}

Extract 2-3 key points that directly answer or relate to the research query.
Focus on facts, data, recent developments, and authoritative statements.
Ignore navigation, ads, and irrelevant content.

Format as a concise summary (max 300 words) with the most important information.
If the content is not relevant to the query, respond with "NOT_RELEVANT".
"""

        try:
            response = self.llm.generate(extraction_prompt, temperature=0.2)

            # Extract text from response (handle different response object types)
            if hasattr(response, 'text'):
                response_text = response.text
            elif hasattr(response, 'content'):
                response_text = response.content
            else:
                response_text = str(response)

            extracted = response_text.strip()

            if extracted == "NOT_RELEVANT" or len(extracted) < 50:
                return None

            return extracted

        except Exception as e:
            logger.debug(f"Content extraction failed: {e}")
            return None

    def _generate_report(
        self,
        research_plan: ResearchPlan,
        findings: List[ResearchFinding],
        output_format: str
    ) -> ResearchReport:
        """Stage 4: Generate structured research report"""

        # Check if we have any findings
        if not findings:
            logger.warning("⚠️ No findings available for report generation")
            return self._create_no_findings_report(research_plan)

        # Organize findings by sub-task
        findings_by_task = {}
        for finding in findings:
            task_id = finding.sub_task_id
            if task_id not in findings_by_task:
                findings_by_task[task_id] = []
            findings_by_task[task_id].append(finding)

        # Collect research context for specific methodology/limitations
        search_queries_used = []
        for sub_task in research_plan.sub_tasks:
            if sub_task.search_queries:
                search_queries_used.extend(sub_task.search_queries)

        successful_extractions = len([f for f in findings if f.content and len(f.content.strip()) > 50])
        total_sources_attempted = len(findings)
        failed_extractions = total_sources_attempted - successful_extractions

        research_context = {
            'total_sources_found': total_sources_attempted,
            'successful_extractions': successful_extractions,
            'failed_extractions': failed_extractions,
            'search_queries_used': search_queries_used,
            'extraction_method': 'full_text' if self.full_text_extraction else 'preview',
            'focus_areas': research_plan.focus_areas,
            'sub_tasks_count': len(research_plan.sub_tasks)
        }

        # Prepare findings summary for the LLM
        findings_summary = []
        total_findings_count = 0

        for sub_task in research_plan.sub_tasks:
            task_findings = findings_by_task.get(sub_task.id, [])
            if task_findings:
                findings_summary.append(f"\n## {sub_task.question}")
                for finding in task_findings:
                    findings_summary.append(f"- {finding.content}")
                    findings_summary.append(f"  Source: {finding.title} ({finding.source_url})")
                    total_findings_count += 1

        findings_text = "\n".join(findings_summary)

        if not findings_text.strip():
            logger.warning("⚠️ No usable findings content for report generation")
            return self._create_no_findings_report(research_plan)

        logger.info(f"📝 Generating report from {total_findings_count} findings across {len(findings_by_task)} sub-tasks")

        # Generate report based on format
        if output_format == "executive":
            report_prompt = self._get_executive_report_prompt(research_plan, findings_text, research_context)
        elif output_format == "narrative":
            report_prompt = self._get_narrative_report_prompt(research_plan, findings_text, research_context)
        else:  # structured
            report_prompt = self._get_structured_report_prompt(research_plan, findings_text, research_context)

        try:
            response = self.llm.generate(report_prompt, temperature=0.3)

            # Extract text from response (handle different response object types)
            if hasattr(response, 'text'):
                response_text = response.text.strip()
            elif hasattr(response, 'content'):
                response_text = response.content.strip()
            else:
                response_text = str(response).strip()

            # Try to find JSON in the response (handle cases where the LLM adds extra text)
            json_start = response_text.find('{')
            json_end = response_text.rfind('}') + 1

            if json_start != -1 and json_end > json_start:
                json_text = response_text[json_start:json_end]
                logger.debug(f"📄 Extracted JSON: {json_text[:200]}...")
                report_data = json.loads(json_text)
            else:
                logger.warning("⚠️ No JSON found in LLM response, using fallback")
                raise json.JSONDecodeError("No JSON found", response_text, 0)

            # Create sources list
            sources = []
            for finding in findings:
                source_entry = {
                    "title": finding.title,
                    "url": finding.source_url,
                    "relevance": finding.relevance_score
                }
                if source_entry not in sources:
                    sources.append(source_entry)

            # Validate and enhance citations in the generated content
            detailed_analysis = report_data.get("detailed_analysis", "")
            key_findings = report_data.get("key_findings", [])

            # Validate citations in detailed analysis
            citation_validation = CitationValidator.validate_citations(detailed_analysis, sources)
            logger.info(f"📊 Citation validation: {citation_validation['citations_found']} citations found, "
                        f"{citation_validation['citation_ratio']:.2f} ratio, "
                        f"adequately cited: {citation_validation['is_adequately_cited']}")

            # Enhance content if citations are insufficient
            if not citation_validation['is_adequately_cited']:
                logger.warning("⚠️ Insufficient citations detected, enhancing content")
                detailed_analysis = CitationValidator.enhance_text_with_citations(detailed_analysis, sources)

                # Also enhance key findings if they lack citations
                enhanced_findings = []
                for finding in key_findings:
                    if isinstance(finding, str):
                        finding_validation = CitationValidator.validate_citations(finding, sources)
                        if finding_validation['citations_found'] == 0:
                            enhanced_finding = CitationValidator.enhance_text_with_citations(finding, sources[:2])  # Limit to top 2 sources
                            enhanced_findings.append(enhanced_finding)
                        else:
                            enhanced_findings.append(finding)
                    else:
                        enhanced_findings.append(finding)
                key_findings = enhanced_findings

            # Ensure all fields are properly formatted for Pydantic validation
            def ensure_string(value, default=""):
                """Convert list or other types to string"""
                if isinstance(value, list):
                    return " ".join(str(item) for item in value)
                elif value is None:
                    return default
                else:
                    return str(value)

            def ensure_list(value, default=None):
                """Ensure value is a list"""
                if default is None:
                    default = []
                if isinstance(value, list):
                    return value
                elif isinstance(value, str):
                    return [value] if value else default
                else:
                    return default

            report = ResearchReport(
                title=ensure_string(report_data.get("title"), f"Research Report: {research_plan.original_query}"),
                executive_summary=ensure_string(report_data.get("executive_summary"), ""),
                key_findings=ensure_list(key_findings, []),
                detailed_analysis=ensure_string(detailed_analysis, ""),
                conclusions=ensure_string(report_data.get("conclusions"), ""),
                sources=sources,
                methodology=ensure_string(report_data.get("methodology"), "Web-based research using multi-stage pipeline"),
                limitations=ensure_string(report_data.get("limitations"), "Limited to publicly available web sources")
            )

            return report

        except (json.JSONDecodeError, KeyError) as e:
            logger.error(f"Failed to parse report: {e}")
            # Return fallback report
            return self._create_fallback_report(research_plan, findings)

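Grabbing the outermost brace pair is a cheap way to salvage the JSON object from a chatty completion before json.loads sees it. The recovery step in isolation (the response text is fabricated):

import json

response_text = 'Sure! Here is the report:\n{"title": "Demo", "key_findings": []}\nHope that helps.'
start, end = response_text.find('{'), response_text.rfind('}') + 1
data = json.loads(response_text[start:end]) if start != -1 and end > start else {}
print(data["title"])  # Demo
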
    def _create_no_findings_report(self, research_plan: ResearchPlan) -> ResearchReport:
        """Create a report when no findings are available"""

        return ResearchReport(
            title=f"Research Report: {research_plan.original_query} (No Sources Found)",
            executive_summary="Research could not be completed due to inability to access web sources. This may be due to network connectivity issues, search service limitations, or content access restrictions.",
            key_findings=[
                "No web sources could be accessed for this research query",
                "Search functionality may be limited due to network or service issues",
                "Manual research using alternative sources is recommended"
            ],
            detailed_analysis="The automated research process was unable to gather information from web sources for this query. This could be due to several factors: network connectivity issues preventing access to search services, search API limitations or rate limiting, content access restrictions, or the specific query terms not yielding accessible results. To complete this research, manual investigation using academic databases, library resources, or direct access to authoritative sources would be recommended.",
            conclusions="Automated web-based research could not be completed for this query. Alternative research methods should be employed to gather the required information.",
            sources=[],
            methodology="Attempted web-based research using multi-stage pipeline with parallel search execution. Search services were inaccessible or returned no usable results.",
            limitations="Complete inability to access web sources severely limits the scope and reliability of this research. No factual claims can be made without source verification."
        )

    def _get_structured_report_prompt(self, research_plan: ResearchPlan, findings_text: str, research_context: Dict[str, Any] = None) -> str:
        """Get prompt for structured report generation"""

        # Build research context information
        context_info = ""
        if research_context:
            total_sources = research_context.get('total_sources_found', 0)
            successful_extractions = research_context.get('successful_extractions', 0)
            failed_extractions = research_context.get('failed_extractions', 0)
            search_queries_used = research_context.get('search_queries_used', [])
            extraction_method = research_context.get('extraction_method', 'standard')

            context_info = f"""
RESEARCH PROCESS CONTEXT:
- Total sources discovered: {total_sources}
- Successfully analyzed: {successful_extractions}
- Failed to access: {failed_extractions}
- Extraction method: {extraction_method}
- Search queries executed: {len(search_queries_used)}
- Key search terms: {', '.join(search_queries_used[:5]) if search_queries_used else 'None'}
"""

        return f"""
Generate a comprehensive research report based on the findings below.

IMPORTANT: Respond with ONLY valid JSON, no additional text before or after.

RESEARCH OBJECTIVE: {research_plan.research_objective}
ORIGINAL QUERY: {research_plan.original_query}
{context_info}
RESEARCH FINDINGS:
{findings_text}

Create a structured research report with this EXACT JSON format:
{{
    "title": "Descriptive report title about {research_plan.original_query}",
    "executive_summary": "2-3 sentence summary of key insights from the research findings",
    "key_findings": [
        "Key finding 1 with specific details and citation (according to Source Name)",
        "Key finding 2 with specific details and citation (according to Source Name)",
        "Key finding 3 with specific details and citation (according to Source Name)"
    ],
    "detailed_analysis": "Comprehensive analysis section (3-4 paragraphs) that synthesizes the findings, identifies patterns, and provides context. MUST include citations like 'according to [Source Name]' or 'as reported by [Source Name]' for every claim and fact",
    "conclusions": "Clear conclusions and implications (2-3 paragraphs) based on the evidence gathered",
    "methodology": "Detailed description of the specific research approach used, including: search strategies employed, number of sources analyzed, types of sources accessed, any challenges encountered, and verification methods applied",
    "limitations": "Specific limitations encountered during THIS research, including: sources that were inaccessible, information gaps identified, potential biases in available sources, temporal constraints, and areas requiring further investigation"
}}

CRITICAL REQUIREMENTS:
- Respond with ONLY the JSON object, no other text
- Base all content strictly on the provided findings
- ALWAYS include proper citations for every claim using source titles or "according to [Source]"
- Include specific facts, data, and examples from the sources WITH citations
- Use proper JSON formatting with escaped quotes if needed
- Do not include markdown formatting or code blocks
- Every key finding and analysis point MUST reference its source
"""

    def _get_executive_report_prompt(self, research_plan: ResearchPlan, findings_text: str, research_context: Dict[str, Any] = None) -> str:
        """Get prompt for executive report generation"""

        # Build research context information
        context_info = ""
        if research_context:
            total_sources = research_context.get('total_sources_found', 0)
            successful_extractions = research_context.get('successful_extractions', 0)
            failed_extractions = research_context.get('failed_extractions', 0)
            search_queries_used = research_context.get('search_queries_used', [])
            extraction_method = research_context.get('extraction_method', 'standard')

            context_info = f"""
RESEARCH PROCESS CONTEXT:
- Total sources discovered: {total_sources}
- Successfully analyzed: {successful_extractions}
- Failed to access: {failed_extractions}
- Extraction method: {extraction_method}
- Search queries executed: {len(search_queries_used)}
- Key search terms: {', '.join(search_queries_used[:5]) if search_queries_used else 'None'}
"""

        return f"""
Generate a concise executive research report based on the findings below.

RESEARCH OBJECTIVE: {research_plan.research_objective}
ORIGINAL QUERY: {research_plan.original_query}
{context_info}
RESEARCH FINDINGS:
{findings_text}

Create an executive-style report with the following JSON format:
{{
    "title": "Executive Brief: [Topic]",
    "executive_summary": "3-4 sentence executive summary highlighting the most critical insights and implications",
    "key_findings": [
        "3-5 bullet points with the most important findings",
        "Focus on actionable insights and strategic implications"
    ],
    "detailed_analysis": "2-3 paragraphs of focused analysis on the most critical aspects",
    "conclusions": "Clear, actionable conclusions and recommendations",
    "methodology": "Concise description of research approach: sources analyzed, search methods used, and verification applied",
    "limitations": "Specific limitations from this research: inaccessible sources, information gaps, or areas needing further study"
}}

Guidelines:
- Focus on strategic insights and business implications
- Prioritize actionable information
- Keep language clear and executive-friendly
- Highlight trends, opportunities, and risks
- Be concise but comprehensive
- ALWAYS cite sources for every claim using "according to [Source]" or similar
- Include proper attribution for all facts and data points
"""

    def _get_narrative_report_prompt(self, research_plan: ResearchPlan, findings_text: str, research_context: Dict[str, Any] = None) -> str:
        """Get prompt for narrative report generation"""

        # Build research context information
        context_info = ""
        if research_context:
            total_sources = research_context.get('total_sources_found', 0)
            successful_extractions = research_context.get('successful_extractions', 0)
            failed_extractions = research_context.get('failed_extractions', 0)
            search_queries_used = research_context.get('search_queries_used', [])
            extraction_method = research_context.get('extraction_method', 'standard')

            context_info = f"""
RESEARCH PROCESS CONTEXT:
- Total sources discovered: {total_sources}
- Successfully analyzed: {successful_extractions}
- Failed to access: {failed_extractions}
- Extraction method: {extraction_method}
- Search queries executed: {len(search_queries_used)}
- Key search terms: {', '.join(search_queries_used[:5]) if search_queries_used else 'None'}
"""

        return f"""
Generate a narrative research report based on the findings below.

RESEARCH OBJECTIVE: {research_plan.research_objective}
ORIGINAL QUERY: {research_plan.original_query}
{context_info}
RESEARCH FINDINGS:
{findings_text}

Create a narrative-style report with the following JSON format:
{{
    "title": "Research Report: [Topic]",
    "executive_summary": "Engaging summary that tells the story of what was discovered",
    "key_findings": [
        "Key discoveries presented as narrative points",
        "Each finding should tell part of the overall story"
    ],
    "detailed_analysis": "Comprehensive narrative analysis (4-5 paragraphs) that weaves together the findings into a coherent story, showing how different aspects connect and build upon each other",
    "conclusions": "Narrative conclusions that bring the story together and point toward future implications",
    "methodology": "Narrative account of the research journey: what sources were explored, how information was gathered, challenges faced, and methods used to verify findings",
    "limitations": "Honest reflection on what this specific research couldn't uncover: missing perspectives, inaccessible information, temporal constraints, and areas requiring deeper investigation"
}}

Guidelines:
- Write in an engaging, narrative style
- Show connections and relationships between findings
- Use storytelling techniques to make the report compelling
- Maintain objectivity while being engaging
- Create a logical flow from introduction to conclusion
- ALWAYS include proper citations throughout the narrative using source titles
- Attribute all facts, quotes, and data to their specific sources
"""

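All three prompt builders open with an identical RESEARCH PROCESS CONTEXT block. If the three copies ever drift apart, a shared helper would keep them in sync; a possible refactor sketch (the function below is hypothetical, not part of this release):

from typing import Any, Dict

def build_context_info(research_context: Dict[str, Any]) -> str:
    # Render the context block shared by the structured, executive, and narrative prompts.
    if not research_context:
        return ""
    queries = research_context.get('search_queries_used', [])
    return (
        "\nRESEARCH PROCESS CONTEXT:\n"
        f"- Total sources discovered: {research_context.get('total_sources_found', 0)}\n"
        f"- Successfully analyzed: {research_context.get('successful_extractions', 0)}\n"
        f"- Failed to access: {research_context.get('failed_extractions', 0)}\n"
        f"- Extraction method: {research_context.get('extraction_method', 'standard')}\n"
        f"- Search queries executed: {len(queries)}\n"
        f"- Key search terms: {', '.join(queries[:5]) if queries else 'None'}\n"
    )
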
    def _create_fallback_report(self, research_plan: ResearchPlan, findings: List[ResearchFinding]) -> ResearchReport:
        """Create a simple fallback report if JSON parsing fails"""

        # Extract key information from findings
        key_findings = []
        sources = []

        for finding in findings[:10]:  # Limit to top 10 findings
            key_findings.append(finding.content[:200] + "..." if len(finding.content) > 200 else finding.content)
            sources.append({
                "title": finding.title,
                "url": finding.source_url,
                "relevance": finding.relevance_score
            })

        return ResearchReport(
            title=f"Research Report: {research_plan.original_query}",
            executive_summary=f"Research conducted on {research_plan.original_query} with {len(findings)} sources analyzed.",
            key_findings=key_findings,
            detailed_analysis="Detailed analysis could not be generated due to processing error. Please refer to key findings and sources.",
            conclusions="Further analysis recommended based on the gathered sources.",
            sources=sources,
            methodology="Web-based research using multi-stage pipeline",
            limitations="Limited to publicly available web sources. Report generation encountered technical issues."
        )

    def _verify_report(self, report: ResearchReport, findings: List[ResearchFinding]) -> ResearchReport:
        """Stage 5: Verify report accuracy and add fact-checking"""

        verification_prompt = f"""
Review this research report for accuracy and consistency with the source findings.

REPORT TITLE: {report.title}
EXECUTIVE SUMMARY: {report.executive_summary}
KEY FINDINGS: {report.key_findings}

SOURCE FINDINGS:
{[f"- {f.content[:200]}..." for f in findings[:10]]}

Identify any potential issues:
1. Claims not supported by the source findings
2. Overgeneralizations or unsupported conclusions
3. Missing important caveats or limitations
4. Factual inconsistencies

Provide verification results as JSON:
{{
    "verification_status": "verified|needs_review|issues_found",
    "issues_identified": ["list of specific issues if any"],
    "confidence_score": 0.85,
    "recommendations": ["suggestions for improvement"]
}}
"""

        try:
            response = self.llm.generate(verification_prompt, temperature=0.2)

            # Extract text from response (handle different response object types)
            if hasattr(response, 'text'):
                response_text = response.text
            elif hasattr(response, 'content'):
                response_text = response.content
            else:
                response_text = str(response)

            verification = json.loads(response_text)

            # Initialize verification metadata on the report if it is missing
            if not hasattr(report, 'metadata'):
                report.metadata = {}

            # Update limitations if issues were found
            if verification.get("verification_status") == "issues_found":
                issues = verification.get("issues_identified", [])
                additional_limitations = f" Verification identified potential issues: {'; '.join(issues)}"
                report.limitations += additional_limitations

            logger.info(f"✅ Report verification completed: {verification.get('verification_status', 'unknown')}")

        except Exception as e:
            logger.warning(f"Report verification failed: {e}")
            report.limitations += " Report verification could not be completed."

        return report

    def _reflexive_refinement(
        self,
        initial_report: ResearchReport,
        research_plan: ResearchPlan,
        existing_findings: List[ResearchFinding]
    ) -> ResearchReport:
        """Stage 6: Reflexive analysis and iterative improvement"""

        current_report = initial_report
        current_findings = existing_findings.copy()

        for iteration in range(self.max_reflexive_iterations):
            logger.info(f"🔄 Reflexive iteration {iteration + 1}/{self.max_reflexive_iterations}")

            # Analyze gaps and limitations
            gaps = self._analyze_research_gaps(current_report, research_plan)

            if not gaps:
                logger.info("✅ No significant gaps identified - reflexive analysis complete")
                break

            logger.info(f"🎯 Identified {len(gaps)} research gaps to address")

            # Execute targeted searches for gaps
            new_findings = self._execute_gap_searches(gaps, research_plan.original_query)

            if new_findings:
                logger.info(f"📚 Found {len(new_findings)} additional sources")
                current_findings.extend(new_findings)

                # Regenerate report with enhanced findings
                current_report = self._generate_report(research_plan, current_findings, "structured")

                # Update methodology to reflect reflexive process
                current_report.methodology += f" Enhanced through {iteration + 1} reflexive analysis cycle(s) addressing identified gaps."
            else:
                logger.info("⚠️ No additional sources found for identified gaps")
                break

        return current_report

    def _analyze_research_gaps(self, report: ResearchReport, research_plan: ResearchPlan) -> List[Dict[str, Any]]:
        """Analyze methodology and limitations to identify actionable research gaps"""

        gap_analysis_prompt = f"""
Analyze this research report to identify SPECIFIC, ACTIONABLE information gaps that could be addressed with targeted web searches.

ORIGINAL QUERY: {research_plan.original_query}

CURRENT METHODOLOGY: {report.methodology}

CURRENT LIMITATIONS: {report.limitations}

CURRENT KEY FINDINGS: {report.key_findings}

FOCUS AREAS EXPLORED: {research_plan.focus_areas}

Identify gaps that are:
1. SPECIFIC enough to generate targeted search queries
2. IMPORTANT for answering the original query
3. FEASIBLE to find through web search
4. NOT already covered in current findings

For each gap, provide:
- gap_type: "missing_perspective", "insufficient_data", "outdated_info", "alternative_viewpoint", "technical_detail", "recent_development"
- description: What specific information is missing
- importance: 1-10 scale (10 = critical for answering original query)
- search_strategy: Specific approach to find this information
- target_queries: 2-3 specific search queries to address this gap

Return ONLY a JSON array of gaps (max 5 most important):
[
    {{
        "gap_type": "missing_perspective",
        "description": "Lack of industry expert opinions on quantum computing timeline",
        "importance": 8,
        "search_strategy": "Search for expert interviews, industry reports, and analyst predictions",
        "target_queries": ["quantum computing expert predictions 2024", "industry analysis quantum timeline", "quantum computing roadmap enterprise"]
    }}
]

CRITICAL: Return ONLY the JSON array, no other text.
"""

        try:
            response = self.llm.generate(gap_analysis_prompt)

            # Extract text from response (handle GenerateResponse objects)
            if hasattr(response, 'text'):
                response_text = response.text
            elif hasattr(response, 'content'):
                response_text = response.content
            else:
                response_text = str(response)

            # Extract JSON from response
            import json
            import re

            # Find JSON array in response
            json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
            if json_match:
                gaps_data = json.loads(json_match.group())

                # Filter gaps by importance (only keep high-importance ones)
                important_gaps = [gap for gap in gaps_data if gap.get('importance', 0) >= 6]

                logger.info(f"🔍 Gap analysis identified {len(important_gaps)} high-priority gaps")
                return important_gaps
            else:
                logger.warning("No valid JSON found in gap analysis response")
                return []

        except Exception as e:
            logger.warning(f"Gap analysis failed: {e}")
            return []

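The greedy r'\[.*\]' with re.DOTALL spans from the first '[' to the last ']', so nested arrays inside individual gaps survive intact, and the importance cutoff of 6 then keeps only gaps the model itself rates highly. Both steps in isolation (the model output is fabricated):

import json
import re

raw = ('Gaps: [{"gap_type": "recent_development", "importance": 7, "target_queries": ["q1"]},'
       ' {"gap_type": "technical_detail", "importance": 3, "target_queries": ["q2"]}]')
match = re.search(r'\[.*\]', raw, re.DOTALL)
gaps = [g for g in json.loads(match.group()) if g.get('importance', 0) >= 6] if match else []
print(len(gaps))  # 1 -- only the importance-7 gap survives
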
    def _execute_gap_searches(self, gaps: List[Dict[str, Any]], original_query: str) -> List[ResearchFinding]:
        """Execute targeted searches to address identified gaps"""

        new_findings = []

        for gap in gaps:
            gap_type = gap.get('gap_type', 'unknown')
            description = gap.get('description', '')
            target_queries = gap.get('target_queries', [])

            logger.info(f"🎯 Addressing gap: {gap_type} - {description}")

            # Execute searches for this gap
            for query in target_queries[:2]:  # Limit to 2 queries per gap
                try:
                    logger.info(f"🔍 Gap search: {query}")

                    # Use existing search infrastructure
                    gap_findings = self._execute_search(query, f"gap_{gap_type}")

                    if gap_findings:
                        # Mark findings as gap-addressing
                        for finding in gap_findings:
                            finding.sub_task_id = f"reflexive_gap_{gap_type}"

                        new_findings.extend(gap_findings)
                        logger.info(f"✅ Found {len(gap_findings)} sources for gap: {description}")

                except Exception as e:
                    logger.warning(f"Gap search failed for '{query}': {e}")
                    continue

        return new_findings