abstractcore 2.5.0__py3-none-any.whl → 2.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2173 @@
1
+ """
2
+ Basic Deep Search - Autonomous research agent with multi-stage pipeline
3
+
4
+ Features:
5
+ - Four-stage pipeline: Planning, Question Development, Web Exploration, Report Generation
6
+ - Parallel web exploration for speed and breadth
7
+ - Structured report generation with citations
8
+ - Verification and fact-checking capabilities
9
+ - Configurable search depth and focus areas
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import asyncio
15
+ import time
16
+ import re
17
+ import hashlib
18
+ from typing import Optional, List, Dict, Any, Union
19
+ from dataclasses import dataclass
20
+ from concurrent.futures import ThreadPoolExecutor, as_completed
21
+ from pydantic import BaseModel, Field
22
+
23
+ from ..core.interface import AbstractCoreInterface
24
+ from ..core.factory import create_llm
25
+ from ..structured.retry import FeedbackRetry
26
+ from ..utils.structured_logging import get_logger
27
+ from ..tools.common_tools import web_search, fetch_url
28
+
29
+ logger = get_logger(__name__)
30
+
31
+
32
+ class SourceManager:
33
+ """Manages source collection with strict limits and deduplication"""
34
+
35
+ def __init__(self, max_sources: int):
36
+ self.max_sources = max_sources
37
+ self.collected_sources = []
38
+ self.source_urls = set() # For deduplication
39
+ self.source_titles = set() # Additional deduplication by title
40
+
41
+ def add_source(self, source: Dict[str, Any]) -> bool:
42
+ """Add source if under limit and not duplicate"""
43
+ if len(self.collected_sources) >= self.max_sources:
44
+ logger.debug(f"Source limit reached ({self.max_sources}), skipping: {source.get('url', 'unknown')}")
45
+ return False
46
+
47
+ url = source.get('url', '')
48
+ title = source.get('title', '').lower().strip()
49
+
50
+ # Check for URL duplication
51
+ if url and url in self.source_urls:
52
+ logger.debug(f"Duplicate URL skipped: {url}")
53
+ return False
54
+
55
+ # Check for title duplication (similar content from different URLs)
56
+ if title and title in self.source_titles:
57
+ logger.debug(f"Duplicate title skipped: {title}")
58
+ return False
59
+
60
+ self.collected_sources.append(source)
61
+ if url:
62
+ self.source_urls.add(url)
63
+ if title:
64
+ self.source_titles.add(title)
65
+
66
+ logger.debug(f"Source added ({len(self.collected_sources)}/{self.max_sources}): {title or url}")
67
+ return True
68
+
69
+ def get_remaining_capacity(self) -> int:
70
+ return max(0, self.max_sources - len(self.collected_sources))
71
+
72
+ def get_sources(self) -> List[Dict[str, Any]]:
73
+ return self.collected_sources.copy()
74
+
75
+ def is_full(self) -> bool:
76
+ return len(self.collected_sources) >= self.max_sources
77
+
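+ # Illustrative usage sketch (not part of the shipped file): how SourceManager enforces
+ # the cap and deduplicates by URL and title. URLs and titles below are made up.
+ #
+ #     manager = SourceManager(max_sources=2)
+ #     manager.add_source({'url': 'https://example.org/a', 'title': 'Report A'})  # True  (1/2)
+ #     manager.add_source({'url': 'https://example.org/a', 'title': 'Other'})     # False (duplicate URL)
+ #     manager.add_source({'url': 'https://example.org/b', 'title': 'Report A'})  # False (duplicate title)
+ #     manager.add_source({'url': 'https://example.org/c', 'title': 'Report C'})  # True  (2/2, now full)
+ #     manager.add_source({'url': 'https://example.org/d', 'title': 'Report D'})  # False (limit reached)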
78
+
79
+ class CitationValidator:
80
+ """Validates and enforces citations in generated content"""
81
+
82
+ @staticmethod
83
+ def validate_citations(text: str, sources: List[Dict]) -> Dict[str, Any]:
84
+ """Check if text contains proper citations for claims"""
85
+ if not text or not sources:
86
+ return {
87
+ 'citations_found': 0,
88
+ 'factual_sentences': 0,
89
+ 'citation_ratio': 0.0,
90
+ 'is_adequately_cited': False,
91
+                 'uncited_sources': [],
+                 'cited_sources': []
92
+ }
93
+
94
+ source_names = [s.get('title', '').strip() for s in sources if s.get('title')]
95
+
96
+ # Count citation patterns (case-insensitive)
97
+ citation_patterns = [
98
+ r'according to \[([^\]]+)\]',
99
+ r'as reported by \[([^\]]+)\]',
100
+ r'according to ([^,.]+)',
101
+ r'as reported by ([^,.]+)',
102
+ r'\(([^)]+)\)', # Parenthetical citations
103
+ ]
104
+
105
+ citations_found = 0
106
+ cited_sources = set()
107
+
108
+ for pattern in citation_patterns:
109
+ matches = re.findall(pattern, text, re.IGNORECASE)
110
+ citations_found += len(matches)
111
+ for match in matches:
112
+ cited_sources.add(match.strip().lower())
113
+
114
+ # Estimate factual claims (sentences with factual indicators)
115
+ sentences = [s.strip() for s in text.split('.') if s.strip()]
116
+ factual_indicators = [
117
+ 'show', 'found', 'research', 'study', 'data', 'report', 'analysis',
118
+ 'indicates', 'reveals', 'demonstrates', 'confirms', 'suggests',
119
+ 'according', 'published', 'announced', 'released', 'stated'
120
+ ]
121
+
122
+ factual_sentences = []
123
+ for sentence in sentences:
124
+ if any(indicator in sentence.lower() for indicator in factual_indicators):
125
+ factual_sentences.append(sentence)
126
+
127
+ citation_ratio = citations_found / max(len(factual_sentences), 1)
128
+
129
+ # Check which sources are not cited
130
+ uncited_sources = []
131
+ for source in source_names:
132
+ source_lower = source.lower()
133
+ if not any(source_lower in cited.lower() for cited in cited_sources):
134
+ uncited_sources.append(source)
135
+
136
+ return {
137
+ 'citations_found': citations_found,
138
+ 'factual_sentences': len(factual_sentences),
139
+ 'citation_ratio': citation_ratio,
140
+ 'is_adequately_cited': citation_ratio >= 0.5, # 50% threshold
141
+ 'uncited_sources': uncited_sources,
142
+ 'cited_sources': list(cited_sources)
143
+ }
144
+
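+     # Illustrative sketch (not part of the original file): validate_citations() returns a
+     # plain dict; `report_text` and `sources` below are hypothetical placeholders.
+     #
+     #     result = CitationValidator.validate_citations(report_text, sources)
+     #     # result has keys: citations_found, factual_sentences, citation_ratio,
+     #     #                  is_adequately_cited, uncited_sources, cited_sources
+     #     if not result['is_adequately_cited']:
+     #         report_text = CitationValidator.enhance_text_with_citations(report_text, sources)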
145
+ @staticmethod
146
+ def enhance_text_with_citations(text: str, sources: List[Dict]) -> str:
147
+ """Enhance text by adding missing citations where appropriate"""
148
+ if not sources:
149
+ return text
150
+
151
+ # Simple enhancement: add source list at the end if no citations found
152
+ validation = CitationValidator.validate_citations(text, sources)
153
+
154
+ if validation['citations_found'] == 0 and sources:
155
+ source_list = "\n\nSources:\n" + "\n".join([
156
+ f"- {s.get('title', 'Unknown')}: {s.get('url', 'No URL')}"
157
+ for s in sources[:5] # Limit to top 5 sources
158
+ ])
159
+ return text + source_list
160
+
161
+ return text
162
+
163
+
164
+ # Pydantic models for structured output
165
+ class ResearchSubTaskModel(BaseModel):
166
+ """Pydantic model for research sub-task (for structured output)"""
167
+ id: str = Field(description="Unique task identifier")
168
+ question: str = Field(description="Specific research question")
169
+ theme: str = Field(description="Research theme this task addresses")
170
+ priority: int = Field(description="Priority level (1=essential, 2=important, 3=supplementary)", ge=1, le=3)
171
+
172
+ class ResearchThemeModel(BaseModel):
173
+ """Pydantic model for research theme (for structured output)"""
174
+ name: str = Field(description="Theme name")
175
+ questions: List[str] = Field(description="3 specific research questions for this theme", min_items=3, max_items=3)
176
+ priority: int = Field(description="Theme priority (1=essential, 2=important, 3=supplementary)", ge=1, le=3)
177
+
178
+ class ResearchPlanModel(BaseModel):
179
+ """Pydantic model for research plan (for structured output)"""
180
+ research_objective: str = Field(description="Clear research objective")
181
+ themes: List[ResearchThemeModel] = Field(description="Research themes with questions")
182
+ search_strategy: str = Field(description="Search strategy", default="parallel")
183
+ estimated_time_minutes: int = Field(description="Estimated time in minutes", gt=0)
184
+
185
+
186
+ class SearchQueriesModel(BaseModel):
187
+ """Pydantic model for search queries (for structured output)"""
188
+ queries: List[str] = Field(description="List of specific search queries", min_items=1, max_items=5)
189
+
190
+
191
+ @dataclass
192
+ class ResearchSubTask:
193
+ """Represents a single research sub-task"""
194
+ id: str
195
+ question: str
196
+ focus_area: str
197
+ priority: int = 1 # 1=high, 2=medium, 3=low
198
+ search_queries: List[str] = None
199
+ findings: List[Dict[str, Any]] = None
200
+ status: str = "pending" # pending, in_progress, completed, failed
201
+
202
+
203
+ @dataclass
204
+ class ResearchPlan:
205
+ """Represents the overall research plan"""
206
+ original_query: str
207
+ research_objective: str
208
+ sub_tasks: List[ResearchSubTask]
209
+ estimated_time_minutes: int
210
+ focus_areas: List[str]
211
+ search_strategy: str = "breadth_first" # breadth_first, depth_first, parallel
212
+
213
+
214
+ class ResearchFinding(BaseModel):
215
+ """Structured representation of a research finding"""
216
+ source_url: str = Field(description="URL of the source")
217
+ title: str = Field(description="Title of the source")
218
+ content: str = Field(description="Relevant content excerpt")
219
+ relevance_score: float = Field(description="Relevance score 0-1")
220
+ timestamp: str = Field(description="When this was found")
221
+ sub_task_id: str = Field(description="Which sub-task this relates to")
222
+
223
+
224
+ class ResearchReport(BaseModel):
225
+ """Structured research report"""
226
+ title: str = Field(description="Report title")
227
+ executive_summary: str = Field(description="Brief executive summary")
228
+ key_findings: List[str] = Field(description="List of key findings")
229
+ detailed_analysis: str = Field(description="Detailed analysis section")
230
+ conclusions: str = Field(description="Conclusions and implications")
231
+ sources: List[Dict[str, Any]] = Field(description="List of sources with URLs, titles, and relevance scores")
232
+ methodology: str = Field(description="Research methodology used")
233
+ limitations: str = Field(description="Research limitations and caveats")
234
+
235
+
236
+ class BasicDeepSearch:
237
+ """
238
+ Basic Deep Search implementation following the four-stage pipeline:
239
+ 1. Planning: Decompose query into structured research plan
240
+ 2. Question Development: Generate specific search queries
241
+ 3. Web Exploration: Execute searches and gather evidence
242
+ 4. Report Generation: Synthesize findings into structured report
243
+
244
+ Key features:
245
+ - Autonomous multi-step research workflow
246
+ - Parallel web exploration for speed
247
+ - Structured output with citations
248
+ - Configurable search depth and focus
249
+ - Verification and fact-checking
250
+
251
+ Examples:
252
+ >>> searcher = BasicDeepSearch()
253
+
254
+ # Basic research query
255
+ >>> report = searcher.research("What are the latest developments in quantum computing?")
256
+
257
+ # Research with specific focus
258
+ >>> report = searcher.research(
259
+ ... "Impact of AI on healthcare",
260
+ ... focus_areas=["medical diagnosis", "drug discovery", "patient care"]
261
+ ... )
262
+
263
+ # Deep research with custom parameters
264
+ >>> report = searcher.research(
265
+ ... "Sustainable energy solutions 2025",
266
+ ... max_sources=20,
267
+ ... search_depth="comprehensive",
268
+ ... include_verification=True
269
+ ... )
270
+ """
271
+
272
+ def __init__(
273
+ self,
274
+ llm: Optional[AbstractCoreInterface] = None,
275
+ max_tokens: int = 32000,
276
+ max_output_tokens: int = 8000,
277
+ timeout: Optional[float] = None,
278
+ max_parallel_searches: int = 5,
279
+ full_text_extraction: bool = False,
280
+ reflexive_mode: bool = False,
281
+ max_reflexive_iterations: int = 2,
282
+ temperature: float = 0.1, # Low temperature for consistency
283
+ debug_mode: bool = False
284
+ ):
285
+ """Initialize the deep search system
286
+
287
+ Args:
288
+ llm: AbstractCore instance (any provider). If None, uses default Ollama model
289
+ max_tokens: Maximum total tokens for LLM context (default 32000)
290
+ max_output_tokens: Maximum tokens for LLM output generation (default 8000)
291
+ timeout: HTTP request timeout in seconds. None for unlimited timeout (default None)
292
+ max_parallel_searches: Maximum number of parallel web searches (default 5)
293
+ full_text_extraction: Whether to extract full text content from pages (default False)
294
+ reflexive_mode: Whether to enable reflexive research that analyzes gaps and refines (default False)
295
+ max_reflexive_iterations: Maximum number of reflexive refinement cycles (default 2)
296
+ temperature: LLM temperature for consistency (default 0.1 for deterministic outputs)
297
+ debug_mode: Enable comprehensive debug logging (default False)
298
+ """
299
+ if llm is None:
300
+ try:
301
+ self.llm = create_llm(
302
+ "ollama",
303
+ model="qwen3:4b-instruct-2507-q4_K_M",
304
+ max_tokens=max_tokens,
305
+ max_output_tokens=max_output_tokens,
306
+ temperature=temperature, # Use consistent low temperature
307
+ timeout=timeout
308
+ )
309
+
310
+ except Exception as e:
311
+ error_msg = (
312
+ f"❌ Failed to initialize default Ollama model 'qwen3:4b-instruct-2507-q4_K_M': {e}\n\n"
313
+ "💡 To use the default model, please:\n"
314
+ " 1. Install Ollama from: https://ollama.com/\n"
315
+ " 2. Download the model: ollama pull qwen3:4b-instruct-2507-q4_K_M\n"
316
+ " 3. Start Ollama service\n\n"
317
+ "⚡ For best deep search performance, consider these models:\n"
318
+ " - qwen3-coder:30b (excellent for research and analysis, requires 32GB RAM)\n"
319
+ " - gpt-4o-mini (cloud-based, fast and reliable)\n"
320
+ " - claude-3-5-haiku (cloud-based, excellent reasoning)\n\n"
321
+ "🔧 Alternatively, provide a custom LLM instance:\n"
322
+ " from abstractcore import create_llm\n"
323
+ " from abstractcore.processing import BasicDeepSearch\n"
324
+ " \n"
325
+ " llm = create_llm('openai', model='gpt-4o-mini', max_tokens=32000, max_output_tokens=8000)\n"
326
+ " searcher = BasicDeepSearch(llm)"
327
+ )
328
+ raise RuntimeError(error_msg) from e
329
+ else:
330
+ self.llm = llm
331
+
332
+ self.max_parallel_searches = max_parallel_searches
333
+ self.full_text_extraction = full_text_extraction
334
+ self.reflexive_mode = reflexive_mode
335
+ self.max_reflexive_iterations = max_reflexive_iterations
336
+ self.temperature = temperature
337
+ self.debug_mode = debug_mode
338
+ self.retry_strategy = FeedbackRetry(max_attempts=3)
339
+ print(f"🤖 Initialized LLM: {self.llm.provider} {self.llm.model}")
340
+
341
+ # Debug tracking
342
+ if self.debug_mode:
343
+ self.debug_info = {
344
+ 'all_queries': [],
345
+ 'all_urls_found': [],
346
+ 'relevance_assessments': [],
347
+ 'accepted_sources': [],
348
+ 'rejected_sources': []
349
+ }
350
+
351
+ def research(
352
+ self,
353
+ query: str,
354
+ focus_areas: Optional[List[str]] = None,
355
+ max_sources: int = 15,
356
+ search_depth: str = "standard", # brief, standard, comprehensive
357
+ include_verification: bool = True,
358
+ output_format: str = "structured" # structured, narrative, executive
359
+ ) -> Union[ResearchReport, Dict[str, Any]]:
360
+ """
361
+ Conduct autonomous deep research on a given query
362
+
363
+ Args:
364
+ query: The research question or topic
365
+ focus_areas: Specific areas to focus on (optional)
366
+ max_sources: Maximum number of sources to gather (default 15)
367
+ search_depth: Research depth - brief, standard, comprehensive (default standard)
368
+ include_verification: Whether to include fact-checking (default True)
369
+ output_format: Output format - structured, narrative, executive (default structured)
370
+
371
+ Returns:
372
+ ResearchReport object or dictionary with research findings
373
+ """
374
+ logger.info(f"🔍 Starting deep search research: {query}")
375
+ start_time = time.time()
376
+
377
+ try:
378
+ # Initialize source manager with strict limits
379
+ source_manager = SourceManager(max_sources)
380
+ logger.info(f"🎯 Initialized source manager with limit: {max_sources}")
381
+
382
+ # Stage 1: Planning
383
+ logger.info("📋 Stage 1: Planning research approach...")
384
+ research_plan = self._create_research_plan(query, focus_areas, search_depth)
385
+
386
+ # Stage 2: Question Development
387
+ logger.info("❓ Stage 2: Developing search questions...")
388
+ self._develop_search_questions(research_plan, max_sources)
389
+
390
+ # Debug: Show all generated queries
391
+ if self.debug_mode:
392
+ print("\n" + "="*80)
393
+ print("🔍 DEBUG: ALL GENERATED SEARCH QUERIES")
394
+ print("="*80)
395
+ for i, sub_task in enumerate(research_plan.sub_tasks, 1):
396
+ print(f"\n📋 Sub-task {i}: {sub_task.question}")
397
+ print(f"🎯 Focus: {sub_task.focus_area}")
398
+ print(f"🔍 Queries ({len(sub_task.search_queries)}):")
399
+                     for j, search_query in enumerate(sub_task.search_queries, 1):
400
+                         print(f"    {j}. \"{search_query}\"")
401
+ self.debug_info['all_queries'].append({
402
+ 'sub_task_id': sub_task.id,
403
+ 'sub_task_question': sub_task.question,
404
+                             'query': search_query
405
+ })
406
+                 print("="*80)
407
+
408
+ # Stage 3: Web Exploration
409
+ logger.info("🌐 Stage 3: Exploring web sources...")
410
+ findings = self._explore_web_sources(research_plan, source_manager)
411
+
412
+ # Stage 4: Report Generation
413
+ logger.info("📝 Stage 4: Generating research report...")
414
+ report = self._generate_report(research_plan, findings, output_format)
415
+
416
+ # Optional: Verification
417
+ if include_verification:
418
+ logger.info("✅ Stage 5: Verifying findings...")
419
+ report = self._verify_report(report, findings)
420
+
421
+ # Stage 6: Reflexive improvement (if enabled)
422
+ if self.reflexive_mode:
423
+ logger.info("🔄 Stage 6: Reflexive analysis and refinement...")
424
+ report = self._reflexive_refinement(report, research_plan, findings)
425
+
426
+ elapsed_time = time.time() - start_time
427
+ logger.info(f"✨ Deep search completed in {elapsed_time:.1f} seconds")
428
+
429
+ # Debug: Show comprehensive summary
430
+ if self.debug_mode:
431
+ self._print_debug_summary()
432
+
433
+ return report
434
+
435
+ except Exception as e:
436
+ logger.error(f"❌ Deep search failed: {e}")
437
+ raise
438
+
439
+ def _create_research_plan(
440
+ self,
441
+ query: str,
442
+ focus_areas: Optional[List[str]],
443
+ search_depth: str
444
+ ) -> ResearchPlan:
445
+ """Stage 1: Create a structured research plan"""
446
+
447
+ # Detect query type and get appropriate focus areas
448
+ query_type = self._detect_query_type(query)
449
+ if not focus_areas:
450
+ focus_areas = self._get_focus_areas_by_type(query_type)
451
+
452
+ # Determine number of sub-tasks based on search depth
453
+ depth_config = {
454
+ "brief": {"sub_tasks": 3, "time_estimate": 5},
455
+ "standard": {"sub_tasks": 5, "time_estimate": 10},
456
+ "comprehensive": {"sub_tasks": 8, "time_estimate": 20}
457
+ }
458
+ config = depth_config.get(search_depth, depth_config["standard"])
459
+
460
+ planning_prompt = f"""
461
+ You are an expert research strategist. Analyze the following research query and create a comprehensive research plan.
462
+
463
+ RESEARCH QUERY: "{query}"
464
+ SEARCH DEPTH: {search_depth}
465
+ TARGET SUB-TASKS: {config['sub_tasks']}
466
+
467
+ Your task is to intelligently decompose this query into focused research themes and specific sub-questions.
468
+
469
+ INSTRUCTIONS:
470
+ 1. First, identify the key themes/dimensions that need to be researched to fully answer this query
471
+ 2. For each theme, generate 3 specific, actionable research questions
472
+ 3. Prioritize the themes based on their importance to answering the original query
473
+ 4. Ensure the questions are diverse and cover different aspects of the topic
474
+
475
+ RESEARCH OBJECTIVE: Write a clear 1-2 sentence objective that captures what we're trying to learn.
476
+
477
+ THEMES & SUB-QUESTIONS:
478
+ For each theme you identify, provide:
479
+ - Theme name (e.g., "Technical Overview", "Historical Context", "Current Applications")
480
+ - 3 specific research questions for that theme
481
+ - Priority level (1=essential, 2=important, 3=supplementary)
482
+
483
+ REQUIREMENTS:
484
+ - Generate exactly {config['sub_tasks']} total sub-questions across all themes
485
+ - Each question should be specific enough to guide targeted web searches
486
+ - Questions should be complementary, not overlapping
487
+ - Adapt the themes naturally to the query - don't force artificial categories
488
+ - For person queries: focus on biography, work, impact, affiliations, recent activities
489
+ - For concept queries: focus on definition, applications, development, current state, implications
490
+ - For technology queries: focus on how it works, use cases, advantages/limitations, market adoption
491
+ - For location queries: focus on geography, culture, economy, politics, current events
492
+ - For organization queries: focus on mission, leadership, products/services, market position, recent news
493
+
494
+ The themes should emerge naturally from understanding what someone would want to know about this specific topic.
495
+ """
496
+
497
+ try:
498
+ # Use structured output for reliable JSON parsing
499
+ plan_model = self.llm.generate(
500
+ planning_prompt,
501
+ temperature=0.3,
502
+ response_model=ResearchPlanModel
503
+ )
504
+
505
+ # Convert Pydantic model to dataclass objects
506
+ sub_tasks = []
507
+ focus_areas = []
508
+ task_counter = 1
509
+
510
+ for theme_model in plan_model.themes:
511
+ focus_areas.append(theme_model.name)
512
+
513
+ # Create sub-tasks from theme questions
514
+ for question in theme_model.questions:
515
+ sub_task = ResearchSubTask(
516
+ id=f"task_{task_counter}",
517
+ question=question,
518
+ focus_area=theme_model.name,
519
+ priority=theme_model.priority,
520
+ search_queries=[],
521
+ findings=[],
522
+ status="pending"
523
+ )
524
+ sub_tasks.append(sub_task)
525
+ task_counter += 1
526
+
527
+ research_plan = ResearchPlan(
528
+ original_query=query,
529
+ research_objective=plan_model.research_objective,
530
+ sub_tasks=sub_tasks,
531
+ estimated_time_minutes=plan_model.estimated_time_minutes,
532
+ focus_areas=focus_areas,
533
+ search_strategy=plan_model.search_strategy
534
+ )
535
+
536
+ logger.info(f"📋 Created research plan with {len(sub_tasks)} sub-tasks")
537
+ return research_plan
538
+
539
+ except Exception as e:
540
+ logger.error(f"Failed to parse research plan: {e}")
541
+ # Fallback to simple plan
542
+ return self._create_fallback_plan(query, focus_areas, search_depth)
543
+
544
+ def _create_fallback_plan(self, query: str, focus_areas: Optional[List[str]], search_depth: str) -> ResearchPlan:
545
+ """Create a simple fallback research plan if JSON parsing fails"""
546
+
547
+ # Simple sub-tasks based on common research patterns
548
+ sub_tasks = [
549
+ ResearchSubTask("task_1", f"What is {query}? Provide definitions and overview", "overview", 1),
550
+ ResearchSubTask("task_2", f"What are the current developments in {query}?", "current_state", 1),
551
+ ResearchSubTask("task_3", f"What are the key challenges or issues with {query}?", "challenges", 2),
552
+ ResearchSubTask("task_4", f"What are future trends and predictions for {query}?", "future", 2),
553
+ ]
554
+
555
+ if search_depth == "comprehensive":
556
+ sub_tasks.extend([
557
+ ResearchSubTask("task_5", f"Who are the key players or experts in {query}?", "stakeholders", 2),
558
+ ResearchSubTask("task_6", f"What are the economic or business implications of {query}?", "economics", 3),
559
+ ResearchSubTask("task_7", f"What are the technical or scientific aspects of {query}?", "technical", 3),
560
+ ])
561
+
562
+ return ResearchPlan(
563
+ original_query=query,
564
+ research_objective=f"Comprehensive research on {query}",
565
+ sub_tasks=sub_tasks,
566
+ estimated_time_minutes=10,
567
+ focus_areas=focus_areas or ["overview", "current_state", "challenges", "future"],
568
+ search_strategy="parallel"
569
+ )
570
+
571
+ def _develop_search_questions(self, research_plan: ResearchPlan, max_sources: int) -> None:
572
+ """Stage 2: Develop specific search queries for each sub-task"""
573
+
574
+ queries_per_task = max(2, max_sources // len(research_plan.sub_tasks))
575
+
576
+ for sub_task in research_plan.sub_tasks:
577
+ query_prompt = f"""
578
+ Generate {queries_per_task} specific, diverse search queries for this research question:
579
+
580
+ RESEARCH QUESTION: {sub_task.question}
581
+ FOCUS AREA: {sub_task.focus_area}
582
+ ORIGINAL QUERY: {research_plan.original_query}
583
+
584
+ Create search queries that:
585
+ 1. Use different keywords and phrasings
586
+ 2. Target different types of sources (news, academic, industry, etc.)
587
+ 3. Include recent time-sensitive queries where relevant (2024, 2025)
588
+ 4. Are specific enough to find relevant information
589
+ 5. Avoid generic terms that might return irrelevant results
590
+
591
+ Examples of good search queries for quantum computing:
592
+ - "quantum computing hardware advances 2024"
593
+ - "quantum computer error correction breakthrough 2024"
594
+ - "IBM Google quantum computing progress 2024"
595
+ - "quantum computing industry applications 2024"
596
+ - "quantum supremacy achievements 2024"
597
+
598
+ Avoid generic terms like "qubit" alone (which returns lab instruments) - be specific about quantum computing context.
599
+ """
600
+
601
+ try:
602
+ # Use structured output for reliable parsing
603
+ queries_model = self.llm.generate(
604
+ query_prompt,
605
+ temperature=0.5,
606
+ response_model=SearchQueriesModel
607
+ )
608
+
609
+ sub_task.search_queries = queries_model.queries[:queries_per_task]
610
+ logger.info(f"📝 Generated queries for {sub_task.id}: {sub_task.search_queries}")
611
+
612
+ except Exception as e:
613
+                 logger.warning(f"Failed to parse queries for {sub_task.id}, using fallback: {e}")
614
+ # Improved fallback queries with better specificity
615
+ base_topic = research_plan.original_query.replace("What are the latest developments in ", "").replace("?", "")
616
+ sub_task.search_queries = [
617
+ f"{base_topic} {sub_task.focus_area} 2024",
618
+ f"{base_topic} advances {sub_task.focus_area} 2024",
619
+ f"{base_topic} research {sub_task.focus_area} latest"
620
+ ][:queries_per_task]
621
+ logger.info(f"📝 Using fallback queries for {sub_task.id}: {sub_task.search_queries}")
622
+
623
+ def _explore_web_sources(self, research_plan: ResearchPlan, source_manager: SourceManager) -> List[ResearchFinding]:
624
+ """Stage 3: Execute web searches and gather evidence"""
625
+
626
+ all_findings = []
627
+
628
+ # Collect all search queries with their sub-task context
629
+ search_tasks = []
630
+ for sub_task in research_plan.sub_tasks:
631
+ for query in sub_task.search_queries:
632
+ search_tasks.append((sub_task.id, query, sub_task.priority))
633
+
634
+ # Sort by priority (1=high priority first)
635
+ search_tasks.sort(key=lambda x: x[2])
636
+
637
+ # Global URL deduplication across all sub-tasks
638
+ processed_urls = set()
639
+
640
+ if self.debug_mode:
641
+ print(f"\n🔍 DEBUG: Starting web exploration with {len(search_tasks)} search tasks")
642
+ print(f"🎯 Source limit: {source_manager.max_sources}")
643
+ for i, (sub_task_id, query, priority) in enumerate(search_tasks, 1):
644
+ print(f" {i}. [{sub_task_id}] \"{query}\" (Priority: {priority})")
645
+
646
+ # Execute searches in parallel with source limit management
647
+ with ThreadPoolExecutor(max_workers=self.max_parallel_searches) as executor:
648
+ # Submit search tasks
649
+ future_to_task = {}
650
+ for sub_task_id, query, priority in search_tasks:
651
+ # Check if we still have capacity
652
+ if source_manager.is_full():
653
+ logger.info(f"🎯 Source limit reached ({source_manager.max_sources}), stopping search submission")
654
+ break
655
+
656
+ future = executor.submit(self._execute_search, sub_task_id, query, source_manager, processed_urls)
657
+ future_to_task[future] = (sub_task_id, query)
658
+
659
+ # Collect results as they complete
660
+ for future in as_completed(future_to_task):
661
+ sub_task_id, query = future_to_task[future]
662
+ try:
663
+ findings = future.result()
664
+ all_findings.extend(findings)
665
+ logger.debug(f"Completed search for {sub_task_id}: {query} - {len(findings)} findings")
666
+ except Exception as e:
667
+ logger.warning(f"Search failed for {sub_task_id} '{query}': {e}")
668
+
669
+ # Early termination if source limit reached
670
+ if source_manager.is_full():
671
+ logger.info(f"🎯 Source limit reached ({source_manager.max_sources}), stopping early")
672
+ break
673
+
674
+ # Update sub-tasks with their findings
675
+ findings_by_task = {}
676
+ for finding in all_findings:
677
+ task_id = finding.sub_task_id
678
+ if task_id not in findings_by_task:
679
+ findings_by_task[task_id] = []
680
+ findings_by_task[task_id].append(finding)
681
+
682
+ for sub_task in research_plan.sub_tasks:
683
+ sub_task.findings = findings_by_task.get(sub_task.id, [])
684
+ sub_task.status = "completed" if sub_task.findings else "failed"
685
+
686
+ logger.info(f"🌐 Gathered {len(all_findings)} findings from web exploration")
687
+ return all_findings
688
+
689
+ def _execute_search(self, sub_task_id: str, query: str, source_manager: SourceManager, processed_urls: set) -> List[ResearchFinding]:
690
+ """Execute a single web search and extract findings"""
691
+
692
+ findings = []
693
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
694
+
695
+ try:
696
+ # Perform web search
697
+ logger.info(f"🔍 Executing search for: {query}")
698
+     search_queries: Optional[List[str]] = None
699
+     findings: Optional[List[Dict[str, Any]]] = None
700
+ logger.debug(f"📄 Search results preview: {search_results[:500]}")
701
+
702
+ # Parse search results to extract URLs and content
703
+ urls = self._extract_urls_from_search(search_results)
704
+ logger.info(f"🔗 Extracted {len(urls)} URLs from search results")
705
+
706
+ # Deduplicate URLs globally across all sub-tasks
707
+ original_count = len(urls)
708
+ urls = [(url, title) for url, title in urls if url not in processed_urls]
709
+ deduplicated_count = len(urls)
710
+
711
+ # Add new URLs to processed set
712
+ for url, title in urls:
713
+ processed_urls.add(url)
714
+
715
+ if self.debug_mode and original_count > deduplicated_count:
716
+ print(f"\n🔄 DEBUG: URL Deduplication for query \"{query}\":")
717
+ print(f" 📊 Original URLs: {original_count}")
718
+ print(f" 📊 After deduplication: {deduplicated_count}")
719
+ print(f" 📊 Duplicates removed: {original_count - deduplicated_count}")
720
+
721
+ # Debug: Show all URLs found for this query
722
+ if self.debug_mode:
723
+ print(f"\n🔍 DEBUG: URLs found for query \"{query}\":")
724
+ for i, (url, title) in enumerate(urls, 1):
725
+ print(f" {i}. {title}")
726
+ print(f" 🔗 {url}")
727
+ self.debug_info['all_urls_found'].append({
728
+ 'query': query,
729
+ 'sub_task_id': sub_task_id,
730
+ 'url': url,
731
+ 'title': title
732
+ })
733
+
734
+ if not urls:
735
+ logger.warning(f"⚠️ No URLs found in search results for query: {query}")
736
+ logger.debug(f"Full search results: {search_results}")
737
+ # Try to create a synthetic finding from the search results if they contain useful information
738
+ if len(search_results) > 100 and "Error searching internet" not in search_results:
739
+ synthetic_finding = ResearchFinding(
740
+ source_url="https://duckduckgo.com/?q=" + query.replace(" ", "+"),
741
+ title=f"Search results for: {query}",
742
+ content=search_results[:500] + "...",
743
+ relevance_score=0.3,
744
+ timestamp=timestamp,
745
+ sub_task_id=sub_task_id
746
+ )
747
+ findings.append(synthetic_finding)
748
+ logger.info(f"✅ Created synthetic finding from search results")
749
+ return findings
750
+
751
+ # Fetch content from promising URLs with source manager control
752
+ for i, (url, title) in enumerate(urls):
753
+ # Check source manager capacity before processing
754
+ if source_manager.is_full():
755
+ logger.info(f"🎯 Source limit reached, stopping URL processing for query: {query}")
756
+ break
757
+
758
+ try:
759
+ logger.debug(f"🌐 Fetching content from URL {i+1}: {url}")
760
+ content = fetch_url(url, timeout=15)
761
+
762
+ if "Error" in content or len(content) < 100:
763
+ logger.debug(f"⚠️ Skipping URL due to fetch error or short content: {url}")
764
+ continue
765
+
766
+ # Extract relevant content using structured parsing or LLM
767
+ if self.full_text_extraction:
768
+ # For full text mode, use custom fetch with more content
769
+ relevant_content = self._extract_relevant_content_full_text(content, query, url)
770
+ else:
771
+ # Standard mode with structured parsing
772
+ relevant_content = self._extract_relevant_content(content, query)
773
+
774
+ if relevant_content:
775
+ # Use LLM to assess content relevance and quality
776
+ quality_assessment = self._assess_content_relevance(relevant_content, query, title)
777
+
778
+ # Debug: Show relevance assessment details
779
+ if self.debug_mode:
780
+ print(f"\n🧠 DEBUG: Relevance Assessment for {title}")
781
+ print(f" 🔗 URL: {url}")
782
+ print(f" 📊 Relevant: {quality_assessment['is_relevant']}")
783
+ print(f" 📈 Score: {quality_assessment['relevance_score']:.2f}")
784
+ print(f" 💭 Reason: {quality_assessment['reason']}")
785
+ print(f" 📝 Content preview: {relevant_content[:200]}...")
786
+
787
+ self.debug_info['relevance_assessments'].append({
788
+ 'url': url,
789
+ 'title': title,
790
+ 'query': query,
791
+ 'is_relevant': quality_assessment['is_relevant'],
792
+ 'relevance_score': quality_assessment['relevance_score'],
793
+ 'reason': quality_assessment['reason'],
794
+ 'content_preview': relevant_content[:200]
795
+ })
796
+
797
+ if quality_assessment['is_relevant']:
798
+ # Create source for manager validation
799
+ source_data = {
800
+ 'url': url,
801
+ 'title': title,
802
+ 'content': relevant_content,
803
+ 'relevance_score': quality_assessment['relevance_score'],
804
+ 'timestamp': timestamp,
805
+ 'sub_task_id': sub_task_id
806
+ }
807
+
808
+ # Try to add to source manager (handles deduplication and limits)
809
+ if source_manager.add_source(source_data):
810
+ finding = ResearchFinding(
811
+ source_url=url,
812
+ title=title,
813
+ content=relevant_content,
814
+ relevance_score=quality_assessment['relevance_score'],
815
+ timestamp=timestamp,
816
+ sub_task_id=sub_task_id
817
+ )
818
+ findings.append(finding)
819
+ logger.info(f"✅ Added relevant finding from {url} (score: {quality_assessment['relevance_score']:.2f}) ({len(source_manager.get_sources())}/{source_manager.max_sources})")
820
+
821
+ if self.debug_mode:
822
+ self.debug_info['accepted_sources'].append({
823
+ 'url': url,
824
+ 'title': title,
825
+ 'relevance_score': quality_assessment['relevance_score'],
826
+ 'reason': 'Accepted by source manager'
827
+ })
828
+ else:
829
+ logger.debug(f"🎯 Source not added (duplicate or limit reached): {url}")
830
+ if self.debug_mode:
831
+ self.debug_info['rejected_sources'].append({
832
+ 'url': url,
833
+ 'title': title,
834
+ 'relevance_score': quality_assessment['relevance_score'],
835
+ 'reason': 'Duplicate or source limit reached'
836
+ })
837
+ else:
838
+ logger.info(f"🚫 Content filtered out from {url}: {quality_assessment['reason']}")
839
+ if self.debug_mode:
840
+ self.debug_info['rejected_sources'].append({
841
+ 'url': url,
842
+ 'title': title,
843
+ 'relevance_score': quality_assessment['relevance_score'],
844
+ 'reason': f"Not relevant: {quality_assessment['reason']}"
845
+ })
846
+ else:
847
+ logger.debug(f"⚠️ No relevant content extracted from {url}")
848
+ if self.debug_mode:
849
+ self.debug_info['rejected_sources'].append({
850
+ 'url': url,
851
+ 'title': title,
852
+ 'relevance_score': 0.0,
853
+ 'reason': 'No relevant content could be extracted'
854
+ })
855
+
856
+ except Exception as e:
857
+ logger.warning(f"Failed to fetch {url}: {e}")
858
+ continue
859
+
860
+ except Exception as e:
861
+ logger.error(f"Search execution failed for '{query}': {e}")
862
+
863
+ logger.info(f"📊 Search completed for '{query}': {len(findings)} findings")
864
+ return findings
865
+
866
+ def _check_authority_indicators(self, title: str, content: str, query: str) -> Dict[str, Any]:
867
+ """Check for high-authority source indicators that should be prioritized"""
868
+
869
+ title_lower = title.lower()
870
+ content_lower = content.lower()
871
+ query_lower = query.lower()
872
+
873
+ # Extract potential person name from query
874
+         query_words = query.split()
875
+         potential_names = [word.lower() for word in query_words if word.istitle() or len(word) > 3]
876
+
877
+ # High-authority indicators
878
+ authority_indicators = [
879
+ # Academic/Professional profiles
880
+ ('google scholar', 0.95, 'Official Google Scholar profile'),
881
+ ('orcid', 0.95, 'Official ORCID researcher profile'),
882
+ ('researchgate', 0.90, 'ResearchGate academic profile'),
883
+ ('linkedin', 0.85, 'Professional LinkedIn profile'),
884
+ ('academia.edu', 0.85, 'Academia.edu academic profile'),
885
+
886
+ # Institutional websites
887
+ ('university', 0.90, 'University/academic institution'),
888
+ ('institute', 0.90, 'Research institute'),
889
+ ('laboratory', 0.85, 'Research laboratory'),
890
+ ('.edu', 0.90, 'Educational institution domain'),
891
+ ('.ac.', 0.90, 'Academic institution domain'),
892
+
893
+ # Personal/official websites
894
+ ('personal website', 0.95, 'Personal/official website'),
895
+ ('official site', 0.95, 'Official website'),
896
+ ]
897
+
898
+ # Check for personal name match in title/content
899
+ name_match_score = 0.0
900
+ if potential_names:
901
+ for name in potential_names:
902
+ if name in title_lower or name in content_lower:
903
+ name_match_score = 0.8
904
+ break
905
+
906
+ # Check authority indicators
907
+ for indicator, base_score, reason in authority_indicators:
908
+ if indicator in title_lower or indicator in content_lower:
909
+ final_score = min(1.0, base_score + name_match_score * 0.2)
910
+ return {
911
+ 'is_high_authority': True,
912
+ 'authority_score': final_score,
913
+ 'reason': reason + (f' with name match' if name_match_score > 0 else '')
914
+ }
915
+
916
+         # Check whether the title contains the query subject's name (e.g., a personal domain like lpalbou.info)
917
+ if any(name in title_lower for name in potential_names if len(name) > 3):
918
+ return {
919
+ 'is_high_authority': True,
920
+ 'authority_score': 0.95,
921
+ 'reason': 'Personal domain/website matching query subject'
922
+ }
923
+
924
+ return {
925
+ 'is_high_authority': False,
926
+ 'authority_score': 0.0,
927
+ 'reason': 'No high-authority indicators found'
928
+ }
929
+
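+     # Worked example (illustrative, not part of the original file) of the scoring above:
+     # a "google scholar" match has base_score 0.95; if the page also contains a name from
+     # the query, name_match_score is 0.8, giving
+     #     final_score = min(1.0, 0.95 + 0.8 * 0.2) = min(1.0, 1.11) = 1.0
+     # the same match without a name match scores 0.95.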
930
+ def _assess_content_relevance(self, content: str, query: str, title: str) -> Dict[str, Any]:
931
+ """Use LLM to quickly assess if content is relevant to the research query"""
932
+
933
+ # First check for high-authority sources that should be prioritized
934
+ authority_indicators = self._check_authority_indicators(title, content, query)
935
+ if authority_indicators['is_high_authority']:
936
+ return {
937
+ 'is_relevant': True,
938
+ 'relevance_score': authority_indicators['authority_score'],
939
+ 'reason': f"High-authority source: {authority_indicators['reason']}"
940
+ }
941
+
942
+ # Limit content for efficient assessment
943
+ assessment_content = content[:1500] + "..." if len(content) > 1500 else content
944
+
945
+ assessment_prompt = f"""
946
+ Assess if this content contains meaningful information related to the research query.
947
+
948
+ RESEARCH QUERY: {query}
949
+ SOURCE TITLE: {title}
950
+
951
+ CONTENT:
952
+ {assessment_content}
953
+
954
+ Respond with ONLY a JSON object in this exact format:
955
+ {{
956
+ "is_relevant": true/false,
957
+ "relevance_score": 0.0-1.0,
958
+ "reason": "brief explanation"
959
+ }}
960
+
961
+ CRITICAL: Mark as RELEVANT (true) if the content:
962
+ - Contains ANY substantive information that could help answer or relate to the query
963
+ - Provides facts, data, explanations, or details about the query topic
964
+ - Is from authoritative sources (official websites, academic profiles, institutional pages)
965
+ - Has meaningful textual content beyond navigation elements
966
+ - Shows biographical, professional, or academic information when querying about a person
967
+
968
+ Mark as NOT RELEVANT (false) ONLY if the content:
969
+ - Is completely unrelated to the query topic (different person, concept, etc.)
970
+ - Contains ONLY navigation menus, headers, footers, or structural elements
971
+ - Shows clear error messages, access restrictions, or "page not found"
972
+ - Is purely promotional/advertising without ANY informational value
973
+ - Discusses entirely different subjects with no connection to the query
974
+
975
+ BE GENEROUS with relevance assessment - when in doubt, mark as relevant.
976
+ """
977
+
978
+ try:
979
+ response = self.llm.generate(assessment_prompt, temperature=0.1)
980
+
981
+ # Extract text from response (handle different response types)
982
+ if hasattr(response, 'text'):
983
+ response_text = response.text
984
+ elif hasattr(response, 'content'):
985
+ response_text = response.content
986
+ else:
987
+ response_text = str(response)
988
+
989
+ # Parse JSON from response
990
+ import json
991
+ json_start = response_text.find('{')
992
+ json_end = response_text.rfind('}') + 1
993
+
994
+ if json_start != -1 and json_end > json_start:
995
+ json_text = response_text[json_start:json_end]
996
+ assessment = json.loads(json_text)
997
+
998
+ # Validate and normalize
999
+ if 'is_relevant' in assessment and 'relevance_score' in assessment:
1000
+ assessment['relevance_score'] = max(0.0, min(1.0, float(assessment['relevance_score'])))
1001
+ assessment['reason'] = assessment.get('reason', 'No reason provided')
1002
+ return assessment
1003
+
1004
+ # Fallback if JSON parsing fails
1005
+ logger.debug(f"Content relevance assessment JSON parsing failed, using fallback")
1006
+ return self._fallback_relevance_assessment(content, query)
1007
+
1008
+ except Exception as e:
1009
+ logger.debug(f"Content relevance assessment failed: {e}")
1010
+ return self._fallback_relevance_assessment(content, query)
1011
+
1012
+ def _fallback_relevance_assessment(self, content: str, query: str) -> Dict[str, Any]:
1013
+ """Fallback relevance assessment using general content quality heuristics"""
1014
+
1015
+ content_lower = content.lower()
1016
+ words = content.split()
1017
+ word_count = len(words)
1018
+
1019
+ # Check for obvious error/empty content indicators
1020
+ error_indicators = [
1021
+ 'page not found', '404 error', '403 error', '500 error',
1022
+ 'access denied', 'login required', 'sign in required',
1023
+ 'javascript required', 'cookies required', 'enable javascript',
1024
+ 'subscribe to continue', 'sign up to read', 'premium content',
1025
+ 'page does not exist', 'content not available'
1026
+ ]
1027
+
1028
+ has_errors = any(indicator in content_lower for indicator in error_indicators)
1029
+
1030
+ # Check for navigation-heavy content (low information density)
1031
+ navigation_indicators = ['home', 'about', 'contact', 'menu', 'navigation', 'footer', 'header']
1032
+ nav_count = sum(1 for indicator in navigation_indicators if indicator in content_lower)
1033
+ nav_ratio = nav_count / max(word_count, 1)
1034
+
1035
+ # Basic content quality assessment
1036
+ if has_errors:
1037
+ return {
1038
+ 'is_relevant': False,
1039
+ 'relevance_score': 0.0,
1040
+ 'reason': 'Contains error messages or access restrictions'
1041
+ }
1042
+
1043
+ if word_count < 10:
1044
+ return {
1045
+ 'is_relevant': False,
1046
+ 'relevance_score': 0.0,
1047
+ 'reason': f'Too little content ({word_count} words)'
1048
+ }
1049
+
1050
+         if nav_ratio > 0.3:  # navigation indicator terms dominate a very short page
1051
+ return {
1052
+ 'is_relevant': False,
1053
+ 'relevance_score': 0.2,
1054
+ 'reason': 'Content appears to be mostly navigation elements'
1055
+ }
1056
+
1057
+ # If content passes basic quality checks, calculate relevance
1058
+ query_words = [word.lower().strip('.,!?;:"()[]{}') for word in query.split() if len(word) > 2]
1059
+
1060
+ if not query_words:
1061
+ # If query has no meaningful words, accept content based on quality
1062
+ relevance_score = 0.7 if word_count >= 50 else 0.5
1063
+ return {
1064
+ 'is_relevant': True,
1065
+ 'relevance_score': relevance_score,
1066
+ 'reason': f'Query has no key terms, accepting based on content quality ({word_count} words)'
1067
+ }
1068
+
1069
+ # Calculate keyword overlap
1070
+ matches = sum(1 for word in query_words if word in content_lower)
1071
+ keyword_relevance = matches / len(query_words)
1072
+
1073
+ # Content length bonus (longer content more likely to be informative)
1074
+         length_bonus = min(0.3, word_count / 200)  # length bonus capped at 0.3 (cap reached at 60+ words)
1075
+
1076
+ final_relevance = keyword_relevance + length_bonus
1077
+ is_relevant = final_relevance >= 0.4 # Require meaningful keyword overlap, don't accept long irrelevant content
1078
+
1079
+ return {
1080
+ 'is_relevant': is_relevant,
1081
+ 'relevance_score': min(1.0, final_relevance),
1082
+ 'reason': f'{matches}/{len(query_words)} keywords, {word_count} words, score: {final_relevance:.2f}'
1083
+ }
1084
+
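+     # Worked example (illustrative, not part of the original file) of the fallback scoring:
+     # query "quantum computing hardware 2024" has 4 key terms; a 120-word page mentioning
+     # only "quantum" and "computing" scores
+     #     keyword_relevance = 2 / 4 = 0.5
+     #     length_bonus      = min(0.3, 120 / 200) = 0.3
+     #     final_relevance   = 0.8  -> is_relevant (threshold 0.4)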
1085
+ def _print_debug_summary(self):
1086
+ """Print comprehensive debug summary"""
1087
+ print("\n" + "="*80)
1088
+ print("🔍 DEBUG SUMMARY: COMPLETE RESEARCH PROCESS")
1089
+ print("="*80)
1090
+
1091
+ # Query summary
1092
+ print(f"\n📋 TOTAL QUERIES GENERATED: {len(self.debug_info['all_queries'])}")
1093
+ query_by_subtask = {}
1094
+ for q in self.debug_info['all_queries']:
1095
+ subtask = q['sub_task_question']
1096
+ if subtask not in query_by_subtask:
1097
+ query_by_subtask[subtask] = []
1098
+ query_by_subtask[subtask].append(q['query'])
1099
+
1100
+ for subtask, queries in query_by_subtask.items():
1101
+ print(f"\n🎯 {subtask}")
1102
+ for i, query in enumerate(queries, 1):
1103
+ print(f" {i}. \"{query}\"")
1104
+
1105
+ # URL summary
1106
+ print(f"\n🔗 TOTAL URLS DISCOVERED: {len(self.debug_info['all_urls_found'])}")
1107
+ urls_by_query = {}
1108
+ for url_info in self.debug_info['all_urls_found']:
1109
+ query = url_info['query']
1110
+ if query not in urls_by_query:
1111
+ urls_by_query[query] = []
1112
+ urls_by_query[query].append((url_info['title'], url_info['url']))
1113
+
1114
+ for query, urls in urls_by_query.items():
1115
+ print(f"\n🔍 Query: \"{query}\" → {len(urls)} URLs")
1116
+ for i, (title, url) in enumerate(urls, 1):
1117
+ print(f" {i}. {title}")
1118
+ print(f" 🔗 {url}")
1119
+
1120
+ # Relevance assessment summary
1121
+ print(f"\n🧠 RELEVANCE ASSESSMENTS: {len(self.debug_info['relevance_assessments'])}")
1122
+ relevant_count = sum(1 for a in self.debug_info['relevance_assessments'] if a['is_relevant'])
1123
+ irrelevant_count = len(self.debug_info['relevance_assessments']) - relevant_count
1124
+
1125
+ print(f" ✅ Relevant: {relevant_count}")
1126
+ print(f" ❌ Not Relevant: {irrelevant_count}")
1127
+
1128
+ if self.debug_info['relevance_assessments']:
1129
+ print(f"\n📊 DETAILED ASSESSMENTS:")
1130
+ for i, assessment in enumerate(self.debug_info['relevance_assessments'], 1):
1131
+ status = "✅" if assessment['is_relevant'] else "❌"
1132
+ print(f"\n {i}. {status} {assessment['title']}")
1133
+ print(f" 🔗 {assessment['url']}")
1134
+ print(f" 📈 Score: {assessment['relevance_score']:.2f}")
1135
+ print(f" 💭 Reason: {assessment['reason']}")
1136
+ print(f" 📝 Preview: {assessment['content_preview']}...")
1137
+
1138
+ # Final source summary
1139
+ print(f"\n📚 FINAL SOURCES:")
1140
+ print(f" ✅ Accepted: {len(self.debug_info['accepted_sources'])}")
1141
+ print(f" ❌ Rejected: {len(self.debug_info['rejected_sources'])}")
1142
+
1143
+ if self.debug_info['accepted_sources']:
1144
+ print(f"\n✅ ACCEPTED SOURCES:")
1145
+ for i, source in enumerate(self.debug_info['accepted_sources'], 1):
1146
+ print(f" {i}. {source['title']} (Score: {source['relevance_score']:.2f})")
1147
+ print(f" 🔗 {source['url']}")
1148
+ print(f" ✅ {source['reason']}")
1149
+
1150
+ if self.debug_info['rejected_sources']:
1151
+ print(f"\n❌ REJECTED SOURCES:")
1152
+ for i, source in enumerate(self.debug_info['rejected_sources'], 1):
1153
+ print(f" {i}. {source['title']} (Score: {source['relevance_score']:.2f})")
1154
+ print(f" 🔗 {source['url']}")
1155
+ print(f" ❌ {source['reason']}")
1156
+
1157
+ print("\n" + "="*80)
1158
+ print("🔍 END DEBUG SUMMARY")
1159
+ print("="*80)
1160
+
1161
+ def _detect_query_type(self, query: str) -> str:
1162
+ """Detect the type of query to generate appropriate research plan"""
1163
+ query_lower = query.lower()
1164
+
1165
+ # Person indicators
1166
+ person_indicators = [
1167
+ 'who is', 'biography of', 'background of', 'profile of',
1168
+ 'researcher', 'scientist', 'professor', 'dr.', 'phd'
1169
+ ]
1170
+
1171
+ # Concept/idea indicators
1172
+ concept_indicators = [
1173
+ 'what is', 'explain', 'definition of', 'concept of', 'theory of',
1174
+ 'how does', 'why does', 'principle of', 'mechanism of'
1175
+ ]
1176
+
1177
+ # Location/country indicators
1178
+ location_indicators = [
1179
+ 'country', 'city', 'region', 'geography of', 'history of',
1180
+ 'economy of', 'politics of', 'culture of'
1181
+ ]
1182
+
1183
+ # Technology/product indicators
1184
+ technology_indicators = [
1185
+ 'technology', 'software', 'algorithm', 'method', 'technique',
1186
+ 'system', 'platform', 'tool', 'framework'
1187
+ ]
1188
+
1189
+ # Company/organization indicators
1190
+ organization_indicators = [
1191
+ 'company', 'organization', 'institution', 'startup', 'business',
1192
+ 'corporation', 'agency', 'foundation'
1193
+ ]
1194
+
1195
+ # Check for patterns
1196
+ if any(indicator in query_lower for indicator in person_indicators):
1197
+ return "person"
1198
+ elif any(indicator in query_lower for indicator in concept_indicators):
1199
+ return "concept"
1200
+ elif any(indicator in query_lower for indicator in location_indicators):
1201
+ return "location"
1202
+ elif any(indicator in query_lower for indicator in technology_indicators):
1203
+ return "technology"
1204
+ elif any(indicator in query_lower for indicator in organization_indicators):
1205
+ return "organization"
1206
+ else:
1207
+ # Default based on query structure
1208
+ words = query_lower.split()
1209
+ if len(words) <= 3 and any(word.istitle() for word in query.split()):
1210
+ return "person" # Likely a name
1211
+ else:
1212
+ return "concept" # General topic
1213
+
1214
+ def _get_focus_areas_by_type(self, query_type: str) -> List[str]:
1215
+ """Get appropriate focus areas based on query type"""
1216
+ focus_areas_map = {
1217
+ "person": [
1218
+ "Professional Biography", "Academic Output", "Industry Impact",
1219
+ "Public Presence", "Professional Affiliations"
1220
+ ],
1221
+ "concept": [
1222
+ "Definition & Overview", "Historical Development", "Key Applications",
1223
+ "Current Research", "Future Implications"
1224
+ ],
1225
+ "location": [
1226
+ "Geography & Demographics", "History & Culture", "Economy & Politics",
1227
+ "Current Events", "International Relations"
1228
+ ],
1229
+ "technology": [
1230
+ "Technical Overview", "Development History", "Current Applications",
1231
+ "Market Analysis", "Future Trends"
1232
+ ],
1233
+ "organization": [
1234
+ "Company Overview", "Business Model", "Leadership & History",
1235
+ "Market Position", "Recent Developments"
1236
+ ]
1237
+ }
1238
+
1239
+ return focus_areas_map.get(query_type, focus_areas_map["concept"])
1240
+
1241
+ def _extract_urls_from_search(self, search_results: str) -> List[tuple]:
1242
+ """Extract URLs and titles from search results"""
1243
+ urls = []
1244
+ lines = search_results.split('\n')
1245
+
1246
+ current_title = ""
1247
+ for line in lines:
1248
+ line = line.strip()
1249
+
1250
+ # Look for numbered results (1., 2., etc.)
1251
+ if line.startswith(('1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.', '9.')):
1252
+ current_title = line[2:].strip()
1253
+
1254
+ # Look for URLs with link emoji
1255
+ elif line.startswith('🔗'):
1256
+ url = line.replace('🔗', '').strip()
1257
+ if url.startswith('http'):
1258
+ urls.append((url, current_title or "Web Result"))
1259
+
1260
+ # Also look for direct URLs in the text (fallback)
1261
+ elif 'http' in line and ('://' in line):
1262
+ import re
1263
+ url_matches = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', line)
1264
+ for url in url_matches:
1265
+ # Clean up URL (remove trailing punctuation)
1266
+ url = url.rstrip('.,;:!?)')
1267
+ if url not in [u[0] for u in urls]: # Avoid duplicates
1268
+ title = current_title or f"Web Result from {url.split('/')[2]}"
1269
+ urls.append((url, title))
1270
+
1271
+ # If no URLs found, try a more aggressive search
1272
+ if not urls:
1273
+ import re
1274
+ all_urls = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', search_results)
1275
+ for url in all_urls:
1276
+ url = url.rstrip('.,;:!?)')
1277
+ title = f"Web Result from {url.split('/')[2] if '/' in url else 'Unknown'}"
1278
+ urls.append((url, title))
1279
+
1280
+ logger.debug(f"🔗 URL extraction found {len(urls)} URLs: {[u[0] for u in urls[:3]]}")
1281
+ return urls
1282
+
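+     # The parser above assumes web_search() output roughly of the following shape
+     # (inferred from the parsing logic; URLs below are illustrative only):
+     #
+     #     1. Quantum hardware advances in 2024
+     #     🔗 https://example.org/quantum-2024
+     #     2. Error correction breakthrough
+     #     🔗 https://example.org/error-correction
+     #
+     # Any bare http(s):// URLs elsewhere in the text are caught by the regex fallback.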
1283
+     def _extract_relevant_content(self, content: str, query: str) -> Optional[str]:
1284
+ """Extract relevant content from fetched web page using structured parsing"""
1285
+
1286
+ # First, try to parse the structured output from fetch_url
1287
+ structured_content = self._parse_fetch_url_output(content)
1288
+
1289
+ if structured_content:
1290
+ # Use structured data for more efficient extraction
1291
+ return self._extract_from_structured_content(structured_content, query)
1292
+ else:
1293
+ # Fallback to LLM-based extraction for unstructured content
1294
+ return self._extract_with_llm(content, query)
1295
+
1296
+ def _parse_fetch_url_output(self, content: str) -> Optional[Dict[str, Any]]:
1297
+ """Parse structured output from fetch_url tool"""
1298
+ try:
1299
+ # Look for the structured sections in fetch_url output
1300
+ if "📄 Content Analysis:" not in content:
1301
+ return None
1302
+
1303
+ structured = {}
1304
+ lines = content.split('\n')
1305
+
1306
+ for i, line in enumerate(lines):
1307
+ line = line.strip()
1308
+
1309
+ # Extract title
1310
+ if line.startswith('📰 Title:'):
1311
+ structured['title'] = line.replace('📰 Title:', '').strip()
1312
+
1313
+ # Extract description
1314
+ elif line.startswith('📝 Description:'):
1315
+ structured['description'] = line.replace('📝 Description:', '').strip()
1316
+
1317
+ # Extract headings
1318
+ elif line.startswith('📋 Headings'):
1319
+ headings = []
1320
+ j = i + 1
1321
+ while j < len(lines) and lines[j].strip().startswith('•'):
1322
+ heading = lines[j].strip().replace('• ', '')
1323
+ headings.append(heading)
1324
+ j += 1
1325
+ structured['headings'] = headings
1326
+
1327
+ # Extract text content preview
1328
+ elif line.startswith('📄 Text Content Preview:'):
1329
+ # Collect multiple lines of text content
1330
+ text_lines = []
1331
+ j = i + 1
1332
+ while j < len(lines) and not lines[j].strip().startswith(('📊', '📄', '🔗', '📋', '📰', '📝')):
1333
+ if lines[j].strip(): # Skip empty lines
1334
+ text_lines.append(lines[j].strip())
1335
+ j += 1
1336
+ if text_lines:
1337
+ structured['text_preview'] = ' '.join(text_lines)
1338
+
1339
+ # Store raw content for full text extraction if needed
1340
+ if self.full_text_extraction:
1341
+ structured['_raw_content'] = content
1342
+
1343
+ return structured if structured else None
1344
+
1345
+ except Exception as e:
1346
+ logger.debug(f"Failed to parse fetch_url output: {e}")
1347
+ return None
1348
+
1349
+     def _extract_from_structured_content(self, structured: Dict[str, Any], query: str) -> Optional[str]:
1350
+ """Extract relevant information from structured content"""
1351
+
1352
+ # Build content summary from structured data
1353
+ content_parts = []
1354
+
1355
+ # Add title if relevant
1356
+ title = structured.get('title', '')
1357
+ if title and any(word.lower() in title.lower() for word in query.split()):
1358
+ content_parts.append(f"**Title:** {title}")
1359
+
1360
+ # Add description if available
1361
+ description = structured.get('description', '')
1362
+ if description:
1363
+ content_parts.append(f"**Summary:** {description}")
1364
+
1365
+ # Add relevant headings
1366
+ headings = structured.get('headings', [])
1367
+ relevant_headings = []
1368
+ query_words = [word.lower() for word in query.split()]
1369
+
1370
+ for heading in headings[:10]: # Limit to first 10 headings
1371
+ if any(word in heading.lower() for word in query_words):
1372
+ relevant_headings.append(heading)
1373
+
1374
+ if relevant_headings:
1375
+ content_parts.append(f"**Key Sections:** {'; '.join(relevant_headings[:5])}")
1376
+
1377
+ # Add text preview (longer or full text based on mode)
1378
+ text_preview = structured.get('text_preview', '')
1379
+ if text_preview:
1380
+ if self.full_text_extraction:
1381
+ # In full text mode, try to get more content from fetch_url
1382
+ full_text = self._extract_full_text_from_fetch_output(structured.get('_raw_content', ''))
1383
+ if full_text and len(full_text) > len(text_preview):
1384
+ content_parts.append(f"**Full Content:** {full_text}")
1385
+ else:
1386
+ content_parts.append(f"**Content:** {text_preview}")
1387
+ else:
1388
+ # Standard mode: use longer preview (up to 1000 chars)
1389
+ preview = text_preview[:1000] + "..." if len(text_preview) > 1000 else text_preview
1390
+ content_parts.append(f"**Content:** {preview}")
1391
+
1392
+ if not content_parts:
1393
+ return None
1394
+
1395
+ # Combine and validate relevance
1396
+ combined_content = '\n'.join(content_parts)
1397
+
1398
+ # Quick relevance check - if query words appear in the content
1399
+ query_words_lower = [word.lower() for word in query.split() if len(word) > 2]
1400
+ content_lower = combined_content.lower()
1401
+
1402
+ relevance_score = (sum(1 for word in query_words_lower if word in content_lower) / len(query_words_lower)) if query_words_lower else 1.0  # guard against an empty query word list
1403
+
1404
+ if relevance_score < 0.2: # Less than 20% of query words found
1405
+ return None
1406
+
1407
+ return combined_content
1408
+
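As a quick worked illustration of the 20% word-overlap gate above (the query and content strings are made up):

# Made-up query and extracted content; same overlap ratio as the gate above.
query = "quantum error correction breakthrough"
content = "**Summary:** New error correction codes cut qubit overhead."

query_words = [w.lower() for w in query.split() if len(w) > 2]   # 4 words kept
hits = sum(1 for w in query_words if w in content.lower())       # 'error', 'correction'
relevance = hits / len(query_words) if query_words else 1.0
print(relevance)  # 0.5, above the 0.2 threshold, so this content would be kept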
1409
+ def _extract_full_text_from_fetch_output(self, raw_content: str) -> str:
1410
+ """Extract full clean text content from fetch_url output"""
1411
+ if not raw_content or "📄 Text Content Preview:" not in raw_content:
1412
+ return ""
1413
+
1414
+ try:
1415
+ # Find the text content section
1416
+ lines = raw_content.split('\n')
1417
+ text_lines = []
1418
+ in_text_section = False
1419
+
1420
+ for line in lines:
1421
+ line_stripped = line.strip()
1422
+
1423
+ # Start collecting after "Text Content Preview:"
1424
+ if line_stripped.startswith('📄 Text Content Preview:'):
1425
+ in_text_section = True
1426
+ continue
1427
+
1428
+ # Stop at next section or metadata
1429
+ elif in_text_section and line_stripped.startswith(('📊', '📄', '🔗', '📋', '📰', '📝', '⏰', '✅')):
1430
+ break
1431
+
1432
+ # Collect text lines
1433
+ elif in_text_section and line_stripped:
1434
+ # Skip obvious metadata or navigation
1435
+ if not any(skip in line_stripped.lower() for skip in [
1436
+ 'total text length:', 'characters', 'download image',
1437
+ 'press inquiries', 'contact:', 'email:', 'phone:',
1438
+ 'breadcrumb', 'navigation', 'menu', 'footer'
1439
+ ]):
1440
+ text_lines.append(line_stripped)
1441
+
1442
+ if text_lines:
1443
+ full_text = ' '.join(text_lines)
1444
+ # Clean up excessive whitespace
1445
+ full_text = ' '.join(full_text.split())
1446
+ return full_text
1447
+
1448
+ return ""
1449
+
1450
+ except Exception as e:
1451
+ logger.debug(f"Failed to extract full text: {e}")
1452
+ return ""
1453
+
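A small sketch of the keyword skip-list used above, applied to invented page lines:

# Invented lines from a fetched page; the same kind of keyword skip-list drops boilerplate.
lines = [
    "Researchers demonstrated a 1,000-qubit processor in 2024.",
    "Total text length: 48,213 characters",
    "Press inquiries contact: media@example.com",
]
skip_terms = ["total text length:", "characters", "press inquiries", "contact:"]
kept = [l for l in lines if not any(term in l.lower() for term in skip_terms)]
print(" ".join(kept))  # only the substantive first line survives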
1454
+ def _extract_relevant_content_full_text(self, content: str, query: str, url: str) -> Optional[str]:
1455
+ """Extract relevant content using full text mode with custom processing"""
1456
+
1457
+ # First try structured parsing
1458
+ structured_content = self._parse_fetch_url_output(content)
1459
+
1460
+ if structured_content:
1461
+ # Get the full text if available
1462
+ full_text = self._extract_full_text_from_fetch_output(content)
1463
+
1464
+ if full_text and len(full_text) > 200:
1465
+ # Use LLM to extract relevant parts from the full text
1466
+ llm_result = self._extract_with_llm_full_text(full_text, query)
1467
+ if llm_result:
1468
+ return llm_result
1469
+
1470
+ # Always try structured extraction as fallback
1471
+ structured_result = self._extract_from_structured_content(structured_content, query)
1472
+ if structured_result:
1473
+ return structured_result
1474
+
1475
+ # Final fallback to standard LLM extraction
1476
+ return self._extract_with_llm(content, query)
1477
+
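The method above effectively tries three strategies in order; a self-contained sketch of that cascade follows (the three helpers are simplified stand-ins, not the package's implementations):

# Stand-ins for the LLM full-text path, the structured summary, and the plain LLM fallback.
def try_full_text(content: str):
    return None  # pretend the page exposed no usable full text

def try_structured(content: str):
    return "**Summary:** structured fallback summary"

def try_plain_llm(content: str):
    return "plain LLM summary"

def extract(content: str):
    for strategy in (try_full_text, try_structured, try_plain_llm):
        result = strategy(content)
        if result:
            return result
    return None

print(extract("raw page text"))  # "**Summary:** structured fallback summary"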
1478
+ def _extract_with_llm_full_text(self, full_text: str, query: str) -> Optional[str]:
1479
+ """Extract relevant content from full text using LLM"""
1480
+
1481
+ # Limit content length for LLM processing (but allow more than standard mode)
1482
+ max_length = 15000  # roughly double the 8000-character limit used in standard mode
1483
+ if len(full_text) > max_length:
1484
+ # Try to truncate at sentence boundary
1485
+ truncated = full_text[:max_length]
1486
+ last_period = truncated.rfind('.')
1487
+ if last_period > max_length - 500: # If period is reasonably close to end
1488
+ full_text = truncated[:last_period + 1]
1489
+ else:
1490
+ full_text = truncated + "..."
1491
+
1492
+ extraction_prompt = f"""
1493
+ Extract the most relevant and comprehensive information from this full text content for the research query.
1494
+
1495
+ RESEARCH QUERY: {query}
1496
+
1497
+ FULL TEXT CONTENT:
1498
+ {full_text}
1499
+
1500
+ Extract 3-5 key points that directly answer or relate to the research query.
1501
+ Focus on:
1502
+ - Specific facts, data, statistics, and recent developments
1503
+ - Technical details and performance metrics
1504
+ - Key findings and authoritative statements
1505
+ - Recent breakthroughs or announcements
1506
+ - Comparative information and benchmarks
1507
+
1508
+ Format as a comprehensive summary (max 800 words) with the most important information.
1509
+ Include specific details like numbers, dates, company names, and technical specifications.
1510
+ If the content is not relevant to the query, respond with "NOT_RELEVANT".
1511
+ """
1512
+
1513
+ try:
1514
+ response = self.llm.generate(extraction_prompt, temperature=0.2)
1515
+
1516
+ # Extract text from response (handle different response object types)
1517
+ if hasattr(response, 'text'):
1518
+ response_text = response.text
1519
+ elif hasattr(response, 'content'):
1520
+ response_text = response.content
1521
+ else:
1522
+ response_text = str(response)
1523
+
1524
+ extracted = response_text.strip()
1525
+
1526
+ if extracted == "NOT_RELEVANT" or len(extracted) < 100:
1527
+ return None
1528
+
1529
+ return extracted
1530
+
1531
+ except Exception as e:
1532
+ logger.debug(f"Full text extraction failed: {e}")
1533
+ return None
1534
+
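To make the truncation rule above concrete, a tiny worked example with a deliberately small max_length (values are illustrative):

# Same cut-at-the-last-period rule as above, shrunk so the effect is visible.
full_text = "First sentence. Second sentence. Third sentence that runs long."
max_length = 35
truncated = full_text[:max_length]        # "First sentence. Second sentence. Th"
last_period = truncated.rfind(".")        # index 31
if last_period > max_length - 20:         # period reasonably close to the cut
    full_text = truncated[:last_period + 1]
else:
    full_text = truncated + "..."
print(full_text)  # "First sentence. Second sentence."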
1535
+ def _extract_with_llm(self, content: str, query: str) -> Optional[str]:
1536
+ """Fallback LLM-based extraction for unstructured content"""
1537
+
1538
+ # Limit content length for processing
1539
+ if len(content) > 8000:
1540
+ content = content[:8000] + "..."
1541
+
1542
+ extraction_prompt = f"""
1543
+ Extract the most relevant information from this content for the research query.
1544
+
1545
+ RESEARCH QUERY: {query}
1546
+
1547
+ CONTENT:
1548
+ {content}
1549
+
1550
+ Extract 2-3 key points that directly answer or relate to the research query.
1551
+ Focus on facts, data, recent developments, and authoritative statements.
1552
+ Ignore navigation, ads, and irrelevant content.
1553
+
1554
+ Format as a concise summary (max 300 words) with the most important information.
1555
+ If the content is not relevant to the query, respond with "NOT_RELEVANT".
1556
+ """
1557
+
1558
+ try:
1559
+ response = self.llm.generate(extraction_prompt, temperature=0.2)
1560
+
1561
+ # Extract text from response (handle different response object types)
1562
+ if hasattr(response, 'text'):
1563
+ response_text = response.text
1564
+ elif hasattr(response, 'content'):
1565
+ response_text = response.content
1566
+ else:
1567
+ response_text = str(response)
1568
+
1569
+ extracted = response_text.strip()
1570
+
1571
+ if extracted == "NOT_RELEVANT" or len(extracted) < 50:
1572
+ return None
1573
+
1574
+ return extracted
1575
+
1576
+ except Exception as e:
1577
+ logger.debug(f"Content extraction failed: {e}")
1578
+ return None
1579
+
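The hasattr probing of response objects recurs in almost every method of this module; a hypothetical helper showing the shared pattern (response_to_text and FakeResponse are illustrative names, not package APIs):

def response_to_text(response) -> str:
    """Prefer .text, then .content, then fall back to str() - same order as above."""
    if hasattr(response, "text"):
        return response.text
    if hasattr(response, "content"):
        return response.content
    return str(response)

class FakeResponse:
    content = "extracted summary ..."  # stand-in for a provider response object

print(response_to_text(FakeResponse()))  # "extracted summary ..."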
1580
+ def _generate_report(
1581
+ self,
1582
+ research_plan: ResearchPlan,
1583
+ findings: List[ResearchFinding],
1584
+ output_format: str
1585
+ ) -> ResearchReport:
1586
+ """Stage 4: Generate structured research report"""
1587
+
1588
+ # Check if we have any findings
1589
+ if not findings:
1590
+ logger.warning("⚠️ No findings available for report generation")
1591
+ return self._create_no_findings_report(research_plan)
1592
+
1593
+ # Organize findings by sub-task
1594
+ findings_by_task = {}
1595
+ for finding in findings:
1596
+ task_id = finding.sub_task_id
1597
+ if task_id not in findings_by_task:
1598
+ findings_by_task[task_id] = []
1599
+ findings_by_task[task_id].append(finding)
1600
+
1601
+ # Collect research context for specific methodology/limitations
1602
+ search_queries_used = []
1603
+ for sub_task in research_plan.sub_tasks:
1604
+ if sub_task.search_queries:
1605
+ search_queries_used.extend(sub_task.search_queries)
1606
+
1607
+ successful_extractions = len([f for f in findings if f.content and len(f.content.strip()) > 50])
1608
+ total_sources_attempted = len(findings)
1609
+ failed_extractions = total_sources_attempted - successful_extractions
1610
+
1611
+ research_context = {
1612
+ 'total_sources_found': total_sources_attempted,
1613
+ 'successful_extractions': successful_extractions,
1614
+ 'failed_extractions': failed_extractions,
1615
+ 'search_queries_used': search_queries_used,
1616
+ 'extraction_method': 'full_text' if self.full_text_extraction else 'preview',
1617
+ 'focus_areas': research_plan.focus_areas,
1618
+ 'sub_tasks_count': len(research_plan.sub_tasks)
1619
+ }
1620
+
1621
+ # Prepare findings summary for the LLM
1622
+ findings_summary = []
1623
+ total_findings_count = 0
1624
+
1625
+ for sub_task in research_plan.sub_tasks:
1626
+ task_findings = findings_by_task.get(sub_task.id, [])
1627
+ if task_findings:
1628
+ findings_summary.append(f"\n## {sub_task.question}")
1629
+ for finding in task_findings:
1630
+ findings_summary.append(f"- {finding.content}")
1631
+ findings_summary.append(f" Source: {finding.title} ({finding.source_url})")
1632
+ total_findings_count += 1
1633
+
1634
+ findings_text = "\n".join(findings_summary)
1635
+
1636
+ if not findings_text.strip():
1637
+ logger.warning("⚠️ No usable findings content for report generation")
1638
+ return self._create_no_findings_report(research_plan)
1639
+
1640
+ logger.info(f"📝 Generating report from {total_findings_count} findings across {len(findings_by_task)} sub-tasks")
1641
+
1642
+ # Generate report based on format
1643
+ if output_format == "executive":
1644
+ report_prompt = self._get_executive_report_prompt(research_plan, findings_text, research_context)
1645
+ elif output_format == "narrative":
1646
+ report_prompt = self._get_narrative_report_prompt(research_plan, findings_text, research_context)
1647
+ else: # structured
1648
+ report_prompt = self._get_structured_report_prompt(research_plan, findings_text, research_context)
1649
+
1650
+ try:
1651
+ response = self.llm.generate(report_prompt, temperature=0.3)
1652
+
1653
+ # Extract JSON from response (handle cases where LLM adds extra text)
1654
+ # Extract text from response (handle different response object types)
1655
+ if hasattr(response, 'text'):
1656
+ response_text = response.text.strip()
1657
+ elif hasattr(response, 'content'):
1658
+ response_text = response.content.strip()
1659
+ else:
1660
+ response_text = str(response).strip()
1661
+
1662
+ # Try to find JSON in the response
1663
+ json_start = response_text.find('{')
1664
+ json_end = response_text.rfind('}') + 1
1665
+
1666
+ if json_start != -1 and json_end > json_start:
1667
+ json_text = response_text[json_start:json_end]
1668
+ logger.debug(f"📄 Extracted JSON: {json_text[:200]}...")
1669
+ report_data = json.loads(json_text)
1670
+ else:
1671
+ logger.warning("⚠️ No JSON found in LLM response, using fallback")
1672
+ raise json.JSONDecodeError("No JSON found", response_text, 0)
1673
+
1674
+ # Create sources list
1675
+ sources = []
1676
+ for finding in findings:
1677
+ source_entry = {
1678
+ "title": finding.title,
1679
+ "url": finding.source_url,
1680
+ "relevance": finding.relevance_score
1681
+ }
1682
+ if source_entry not in sources:
1683
+ sources.append(source_entry)
1684
+
1685
+ # Validate and enhance citations in the generated content
1686
+ detailed_analysis = report_data.get("detailed_analysis", "")
1687
+ key_findings = report_data.get("key_findings", [])
1688
+
1689
+ # Validate citations in detailed analysis
1690
+ citation_validation = CitationValidator.validate_citations(detailed_analysis, sources)
1691
+ logger.info(f"📊 Citation validation: {citation_validation['citations_found']} citations found, "
1692
+ f"{citation_validation['citation_ratio']:.2f} ratio, "
1693
+ f"adequately cited: {citation_validation['is_adequately_cited']}")
1694
+
1695
+ # Enhance content if citations are insufficient
1696
+ if not citation_validation['is_adequately_cited']:
1697
+ logger.warning("⚠️ Insufficient citations detected, enhancing content")
1698
+ detailed_analysis = CitationValidator.enhance_text_with_citations(detailed_analysis, sources)
1699
+
1700
+ # Also enhance key findings if they lack citations
1701
+ enhanced_findings = []
1702
+ for finding in key_findings:
1703
+ if isinstance(finding, str):
1704
+ finding_validation = CitationValidator.validate_citations(finding, sources)
1705
+ if finding_validation['citations_found'] == 0:
1706
+ enhanced_finding = CitationValidator.enhance_text_with_citations(finding, sources[:2]) # Limit to top 2 sources
1707
+ enhanced_findings.append(enhanced_finding)
1708
+ else:
1709
+ enhanced_findings.append(finding)
1710
+ else:
1711
+ enhanced_findings.append(finding)
1712
+ key_findings = enhanced_findings
1713
+
1714
+ # Ensure all fields are properly formatted for Pydantic validation
1715
+ def ensure_string(value, default=""):
1716
+ """Convert list or other types to string"""
1717
+ if isinstance(value, list):
1718
+ return " ".join(str(item) for item in value)
1719
+ elif value is None:
1720
+ return default
1721
+ else:
1722
+ return str(value)
1723
+
1724
+ def ensure_list(value, default=None):
1725
+ """Ensure value is a list"""
1726
+ if default is None:
1727
+ default = []
1728
+ if isinstance(value, list):
1729
+ return value
1730
+ elif isinstance(value, str):
1731
+ return [value] if value else default
1732
+ else:
1733
+ return default
1734
+
1735
+ report = ResearchReport(
1736
+ title=ensure_string(report_data.get("title"), f"Research Report: {research_plan.original_query}"),
1737
+ executive_summary=ensure_string(report_data.get("executive_summary"), ""),
1738
+ key_findings=ensure_list(key_findings, []),
1739
+ detailed_analysis=ensure_string(detailed_analysis, ""),
1740
+ conclusions=ensure_string(report_data.get("conclusions"), ""),
1741
+ sources=sources,
1742
+ methodology=ensure_string(report_data.get("methodology"), "Web-based research using multi-stage pipeline"),
1743
+ limitations=ensure_string(report_data.get("limitations"), "Limited to publicly available web sources")
1744
+ )
1745
+
1746
+ return report
1747
+
1748
+ except (json.JSONDecodeError, KeyError) as e:
1749
+ logger.error(f"Failed to parse report: {e}")
1750
+ # Return fallback report
1751
+ return self._create_fallback_report(research_plan, findings)
1752
+
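A minimal sketch of the brace-slicing JSON recovery used in the report generation above, applied to an invented LLM reply:

import json

# Invented reply that wraps the JSON object in extra prose, as LLMs often do.
reply = 'Sure, here is the report:\n{"title": "Example", "key_findings": ["A", "B"]}\nHope this helps!'
start, end = reply.find("{"), reply.rfind("}") + 1
if start != -1 and end > start:
    data = json.loads(reply[start:end])
    print(data["title"])  # "Example"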
1753
+ def _create_no_findings_report(self, research_plan: ResearchPlan) -> ResearchReport:
1754
+ """Create a report when no findings are available"""
1755
+
1756
+ return ResearchReport(
1757
+ title=f"Research Report: {research_plan.original_query} (No Sources Found)",
1758
+ executive_summary="Research could not be completed due to inability to access web sources. This may be due to network connectivity issues, search service limitations, or content access restrictions.",
1759
+ key_findings=[
1760
+ "No web sources could be accessed for this research query",
1761
+ "Search functionality may be limited due to network or service issues",
1762
+ "Manual research using alternative sources is recommended"
1763
+ ],
1764
+ detailed_analysis="The automated research process was unable to gather information from web sources for this query. This could be due to several factors: network connectivity issues preventing access to search services, search API limitations or rate limiting, content access restrictions, or the specific query terms not yielding accessible results. To complete this research, manual investigation using academic databases, library resources, or direct access to authoritative sources would be recommended.",
1765
+ conclusions="Automated web-based research could not be completed for this query. Alternative research methods should be employed to gather the required information.",
1766
+ sources=[],
1767
+ methodology="Attempted web-based research using multi-stage pipeline with parallel search execution. Search services were inaccessible or returned no usable results.",
1768
+ limitations="Complete inability to access web sources severely limits the scope and reliability of this research. No factual claims can be made without source verification."
1769
+ )
1770
+
1771
+ def _get_structured_report_prompt(self, research_plan: ResearchPlan, findings_text: str, research_context: Dict[str, Any] = None) -> str:
1772
+ """Get prompt for structured report generation"""
1773
+
1774
+ # Build research context information
1775
+ context_info = ""
1776
+ if research_context:
1777
+ total_sources = research_context.get('total_sources_found', 0)
1778
+ successful_extractions = research_context.get('successful_extractions', 0)
1779
+ failed_extractions = research_context.get('failed_extractions', 0)
1780
+ search_queries_used = research_context.get('search_queries_used', [])
1781
+ extraction_method = research_context.get('extraction_method', 'standard')
1782
+
1783
+ context_info = f"""
1784
+ RESEARCH PROCESS CONTEXT:
1785
+ - Total sources discovered: {total_sources}
1786
+ - Successfully analyzed: {successful_extractions}
1787
+ - Failed to access: {failed_extractions}
1788
+ - Extraction method: {extraction_method}
1789
+ - Search queries executed: {len(search_queries_used)}
1790
+ - Key search terms: {', '.join(search_queries_used[:5]) if search_queries_used else 'None'}
1791
+ """
1792
+
1793
+ return f"""
1794
+ Generate a comprehensive research report based on the findings below.
1795
+
1796
+ IMPORTANT: Respond with ONLY valid JSON, no additional text before or after.
1797
+
1798
+ RESEARCH OBJECTIVE: {research_plan.research_objective}
1799
+ ORIGINAL QUERY: {research_plan.original_query}
1800
+ {context_info}
1801
+ RESEARCH FINDINGS:
1802
+ {findings_text}
1803
+
1804
+ Create a structured research report with this EXACT JSON format:
1805
+ {{
1806
+ "title": "Descriptive report title about {research_plan.original_query}",
1807
+ "executive_summary": "2-3 sentence summary of key insights from the research findings",
1808
+ "key_findings": [
1809
+ "Key finding 1 with specific details and citation (according to Source Name)",
1810
+ "Key finding 2 with specific details and citation (according to Source Name)",
1811
+ "Key finding 3 with specific details and citation (according to Source Name)"
1812
+ ],
1813
+ "detailed_analysis": "Comprehensive analysis section (3-4 paragraphs) that synthesizes the findings, identifies patterns, and provides context. MUST include citations like 'according to [Source Name]' or 'as reported by [Source Name]' for every claim and fact",
1814
+ "conclusions": "Clear conclusions and implications (2-3 paragraphs) based on the evidence gathered",
1815
+ "methodology": "Detailed description of the specific research approach used, including: search strategies employed, number of sources analyzed, types of sources accessed, any challenges encountered, and verification methods applied",
1816
+ "limitations": "Specific limitations encountered during THIS research, including: sources that were inaccessible, information gaps identified, potential biases in available sources, temporal constraints, and areas requiring further investigation"
1817
+ }}
1818
+
1819
+ CRITICAL REQUIREMENTS:
1820
+ - Respond with ONLY the JSON object, no other text
1821
+ - Base all content strictly on the provided findings
1822
+ - ALWAYS include proper citations for every claim using source titles or "according to [Source]"
1823
+ - Include specific facts, data, and examples from the sources WITH citations
1824
+ - Use proper JSON formatting with escaped quotes if needed
1825
+ - Do not include markdown formatting or code blocks
1826
+ - Every key finding and analysis point MUST reference its source
1827
+ """
1828
+
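The prompt above pins down an exact JSON shape; for illustration, a Pydantic model mirroring that schema and validating a toy payload (ReportPayload is a hypothetical name, not the package's ResearchReport, and the example assumes Pydantic v2's model_validate_json):

from typing import List
from pydantic import BaseModel

class ReportPayload(BaseModel):
    title: str
    executive_summary: str
    key_findings: List[str]
    detailed_analysis: str
    conclusions: str
    methodology: str
    limitations: str

payload = ReportPayload.model_validate_json(
    '{"title": "t", "executive_summary": "s", "key_findings": ["f1"], '
    '"detailed_analysis": "a", "conclusions": "c", "methodology": "m", "limitations": "l"}'
)
print(payload.title)  # "t"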
1829
+ def _get_executive_report_prompt(self, research_plan: ResearchPlan, findings_text: str, research_context: Dict[str, Any] = None) -> str:
1830
+ """Get prompt for executive report generation"""
1831
+
1832
+ # Build research context information
1833
+ context_info = ""
1834
+ if research_context:
1835
+ total_sources = research_context.get('total_sources_found', 0)
1836
+ successful_extractions = research_context.get('successful_extractions', 0)
1837
+ failed_extractions = research_context.get('failed_extractions', 0)
1838
+ search_queries_used = research_context.get('search_queries_used', [])
1839
+ extraction_method = research_context.get('extraction_method', 'standard')
1840
+
1841
+ context_info = f"""
1842
+ RESEARCH PROCESS CONTEXT:
1843
+ - Total sources discovered: {total_sources}
1844
+ - Successfully analyzed: {successful_extractions}
1845
+ - Failed to access: {failed_extractions}
1846
+ - Extraction method: {extraction_method}
1847
+ - Search queries executed: {len(search_queries_used)}
1848
+ - Key search terms: {', '.join(search_queries_used[:5]) if search_queries_used else 'None'}
1849
+ """
1850
+
1851
+ return f"""
1852
+ Generate a concise executive research report based on the findings below.
1853
+
1854
+ RESEARCH OBJECTIVE: {research_plan.research_objective}
1855
+ ORIGINAL QUERY: {research_plan.original_query}
1856
+ {context_info}
1857
+ RESEARCH FINDINGS:
1858
+ {findings_text}
1859
+
1860
+ Create an executive-style report with the following JSON format:
1861
+ {{
1862
+ "title": "Executive Brief: [Topic]",
1863
+ "executive_summary": "3-4 sentence executive summary highlighting the most critical insights and implications",
1864
+ "key_findings": [
1865
+ "3-5 bullet points with the most important findings",
1866
+ "Focus on actionable insights and strategic implications"
1867
+ ],
1868
+ "detailed_analysis": "2-3 paragraphs of focused analysis on the most critical aspects",
1869
+ "conclusions": "Clear, actionable conclusions and recommendations",
1870
+ "methodology": "Concise description of research approach: sources analyzed, search methods used, and verification applied",
1871
+ "limitations": "Specific limitations from this research: inaccessible sources, information gaps, or areas needing further study"
1872
+ }}
1873
+
1874
+ Guidelines:
1875
+ - Focus on strategic insights and business implications
1876
+ - Prioritize actionable information
1877
+ - Keep language clear and executive-friendly
1878
+ - Highlight trends, opportunities, and risks
1879
+ - Be concise but comprehensive
1880
+ - ALWAYS cite sources for every claim using "according to [Source]" or similar
1881
+ - Include proper attribution for all facts and data points
1882
+ """
1883
+
1884
+ def _get_narrative_report_prompt(self, research_plan: ResearchPlan, findings_text: str, research_context: Dict[str, Any] = None) -> str:
1885
+ """Get prompt for narrative report generation"""
1886
+
1887
+ # Build research context information
1888
+ context_info = ""
1889
+ if research_context:
1890
+ total_sources = research_context.get('total_sources_found', 0)
1891
+ successful_extractions = research_context.get('successful_extractions', 0)
1892
+ failed_extractions = research_context.get('failed_extractions', 0)
1893
+ search_queries_used = research_context.get('search_queries_used', [])
1894
+ extraction_method = research_context.get('extraction_method', 'standard')
1895
+
1896
+ context_info = f"""
1897
+ RESEARCH PROCESS CONTEXT:
1898
+ - Total sources discovered: {total_sources}
1899
+ - Successfully analyzed: {successful_extractions}
1900
+ - Failed to access: {failed_extractions}
1901
+ - Extraction method: {extraction_method}
1902
+ - Search queries executed: {len(search_queries_used)}
1903
+ - Key search terms: {', '.join(search_queries_used[:5]) if search_queries_used else 'None'}
1904
+ """
1905
+
1906
+ return f"""
1907
+ Generate a narrative research report based on the findings below.
1908
+
1909
+ RESEARCH OBJECTIVE: {research_plan.research_objective}
1910
+ ORIGINAL QUERY: {research_plan.original_query}
1911
+ {context_info}
1912
+ RESEARCH FINDINGS:
1913
+ {findings_text}
1914
+
1915
+ Create a narrative-style report with the following JSON format:
1916
+ {{
1917
+ "title": "Research Report: [Topic]",
1918
+ "executive_summary": "Engaging summary that tells the story of what was discovered",
1919
+ "key_findings": [
1920
+ "Key discoveries presented as narrative points",
1921
+ "Each finding should tell part of the overall story"
1922
+ ],
1923
+ "detailed_analysis": "Comprehensive narrative analysis (4-5 paragraphs) that weaves together the findings into a coherent story, showing how different aspects connect and build upon each other",
1924
+ "conclusions": "Narrative conclusions that bring the story together and point toward future implications",
1925
+ "methodology": "Narrative account of the research journey: what sources were explored, how information was gathered, challenges faced, and methods used to verify findings",
1926
+ "limitations": "Honest reflection on what this specific research couldn't uncover: missing perspectives, inaccessible information, temporal constraints, and areas requiring deeper investigation"
1927
+ }}
1928
+
1929
+ Guidelines:
1930
+ - Write in an engaging, narrative style
1931
+ - Show connections and relationships between findings
1932
+ - Use storytelling techniques to make the report compelling
1933
+ - Maintain objectivity while being engaging
1934
+ - Create a logical flow from introduction to conclusion
1935
+ - ALWAYS include proper citations throughout the narrative using source titles
1936
+ - Attribute all facts, quotes, and data to their specific sources
1937
+ """
1938
+
1939
+ def _create_fallback_report(self, research_plan: ResearchPlan, findings: List[ResearchFinding]) -> ResearchReport:
1940
+ """Create a simple fallback report if JSON parsing fails"""
1941
+
1942
+ # Extract key information from findings
1943
+ key_findings = []
1944
+ sources = []
1945
+
1946
+ for finding in findings[:10]: # Limit to top 10 findings
1947
+ key_findings.append(finding.content[:200] + "..." if len(finding.content) > 200 else finding.content)
1948
+ sources.append({
1949
+ "title": finding.title,
1950
+ "url": finding.source_url,
1951
+ "relevance": finding.relevance_score
1952
+ })
1953
+
1954
+ return ResearchReport(
1955
+ title=f"Research Report: {research_plan.original_query}",
1956
+ executive_summary=f"Research conducted on {research_plan.original_query} with {len(findings)} sources analyzed.",
1957
+ key_findings=key_findings,
1958
+ detailed_analysis="Detailed analysis could not be generated due to processing error. Please refer to key findings and sources.",
1959
+ conclusions="Further analysis recommended based on the gathered sources.",
1960
+ sources=sources,
1961
+ methodology="Web-based research using multi-stage pipeline",
1962
+ limitations="Limited to publicly available web sources. Report generation encountered technical issues."
1963
+ )
1964
+
1965
+ def _verify_report(self, report: ResearchReport, findings: List[ResearchFinding]) -> ResearchReport:
1966
+ """Stage 5: Verify report accuracy and add fact-checking"""
1967
+
1968
+ verification_prompt = f"""
1969
+ Review this research report for accuracy and consistency with the source findings.
1970
+
1971
+ REPORT TITLE: {report.title}
1972
+ EXECUTIVE SUMMARY: {report.executive_summary}
1973
+ KEY FINDINGS: {report.key_findings}
1974
+
1975
+ SOURCE FINDINGS:
1976
+ {[f"- {f.content[:200]}..." for f in findings[:10]]}
1977
+
1978
+ Identify any potential issues:
1979
+ 1. Claims not supported by the source findings
1980
+ 2. Overgeneralizations or unsupported conclusions
1981
+ 3. Missing important caveats or limitations
1982
+ 4. Factual inconsistencies
1983
+
1984
+ Provide verification results as JSON:
1985
+ {{
1986
+ "verification_status": "verified|needs_review|issues_found",
1987
+ "issues_identified": ["list of specific issues if any"],
1988
+ "confidence_score": 0.85,
1989
+ "recommendations": ["suggestions for improvement"]
1990
+ }}
1991
+ """
1992
+
1993
+ try:
1994
+ response = self.llm.generate(verification_prompt, temperature=0.2)
1995
+
1996
+ # Extract text from response (handle different response object types)
1997
+ if hasattr(response, 'text'):
1998
+ response_text = response.text
1999
+ elif hasattr(response, 'content'):
2000
+ response_text = response.content
2001
+ else:
2002
+ response_text = str(response)
2003
+
2004
+ verification = json.loads(response_text[response_text.find('{'):response_text.rfind('}') + 1])  # tolerate extra text around the JSON, as elsewhere in this module
2005
+
2006
+ # Add verification metadata to report
2007
+ if hasattr(report, 'metadata'):
2008
+ report.metadata = {}
2009
+
2010
+ # Update limitations if issues were found
2011
+ if verification.get("verification_status") == "issues_found":
2012
+ issues = verification.get("issues_identified", [])
2013
+ additional_limitations = f" Verification identified potential issues: {'; '.join(issues)}"
2014
+ report.limitations += additional_limitations
2015
+
2016
+ logger.info(f"✅ Report verification completed: {verification.get('verification_status', 'unknown')}")
2017
+
2018
+ except Exception as e:
2019
+ logger.warning(f"Report verification failed: {e}")
2020
+ report.limitations += " Report verification could not be completed."
2021
+
2022
+ return report
2023
+
2024
+ def _reflexive_refinement(
2025
+ self,
2026
+ initial_report: ResearchReport,
2027
+ research_plan: ResearchPlan,
2028
+ existing_findings: List[ResearchFinding]
2029
+ ) -> ResearchReport:
2030
+ """Stage 6: Reflexive analysis and iterative improvement"""
2031
+
2032
+ current_report = initial_report
2033
+ current_findings = existing_findings.copy()
2034
+
2035
+ for iteration in range(self.max_reflexive_iterations):
2036
+ logger.info(f"🔄 Reflexive iteration {iteration + 1}/{self.max_reflexive_iterations}")
2037
+
2038
+ # Analyze gaps and limitations
2039
+ gaps = self._analyze_research_gaps(current_report, research_plan)
2040
+
2041
+ if not gaps:
2042
+ logger.info("✅ No significant gaps identified - reflexive analysis complete")
2043
+ break
2044
+
2045
+ logger.info(f"🎯 Identified {len(gaps)} research gaps to address")
2046
+
2047
+ # Execute targeted searches for gaps
2048
+ new_findings = self._execute_gap_searches(gaps, research_plan.original_query)
2049
+
2050
+ if new_findings:
2051
+ logger.info(f"📚 Found {len(new_findings)} additional sources")
2052
+ current_findings.extend(new_findings)
2053
+
2054
+ # Regenerate report with enhanced findings
2055
+ current_report = self._generate_report(research_plan, current_findings, "structured")
2056
+
2057
+ # Update methodology to reflect reflexive process
2058
+ current_report.methodology += f" Enhanced through {iteration + 1} reflexive analysis cycle(s) addressing identified gaps."
2059
+ else:
2060
+ logger.info("⚠️ No additional sources found for identified gaps")
2061
+ break
2062
+
2063
+ return current_report
2064
+
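A runnable toy version of the loop shape above: analyse gaps, search, regenerate, and stop once nothing new turns up (the three helpers are stand-ins for _analyze_research_gaps, _execute_gap_searches and _generate_report, not the package's implementations):

def analyze_gaps(report):
    return ["missing 2024 benchmark data"] if "2024" not in report else []

def search_for(gaps):
    return [f"finding for: {g}" for g in gaps]

def build_report(findings):
    return "report covering " + "; ".join(findings) + " (2024)"

def refine(report, findings, max_iterations=2):
    for _ in range(max_iterations):
        gaps = analyze_gaps(report)
        if not gaps:      # no gaps left -> stop
            break
        new = search_for(gaps)
        if not new:       # nothing new found -> stop
            break
        findings += new
        report = build_report(findings)
    return report

print(refine("initial report", []))
# "report covering finding for: missing 2024 benchmark data (2024)"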
2065
+ def _analyze_research_gaps(self, report: ResearchReport, research_plan: ResearchPlan) -> List[Dict[str, Any]]:
2066
+ """Analyze methodology and limitations to identify actionable research gaps"""
2067
+
2068
+ gap_analysis_prompt = f"""
2069
+ Analyze this research report to identify SPECIFIC, ACTIONABLE information gaps that could be addressed with targeted web searches.
2070
+
2071
+ ORIGINAL QUERY: {research_plan.original_query}
2072
+
2073
+ CURRENT METHODOLOGY: {report.methodology}
2074
+
2075
+ CURRENT LIMITATIONS: {report.limitations}
2076
+
2077
+ CURRENT KEY FINDINGS: {report.key_findings}
2078
+
2079
+ FOCUS AREAS EXPLORED: {research_plan.focus_areas}
2080
+
2081
+ Identify gaps that are:
2082
+ 1. SPECIFIC enough to generate targeted search queries
2083
+ 2. IMPORTANT for answering the original query
2084
+ 3. FEASIBLE to find through web search
2085
+ 4. NOT already covered in current findings
2086
+
2087
+ For each gap, provide:
2088
+ - gap_type: "missing_perspective", "insufficient_data", "outdated_info", "alternative_viewpoint", "technical_detail", "recent_development"
2089
+ - description: What specific information is missing
2090
+ - importance: 1-10 scale (10 = critical for answering original query)
2091
+ - search_strategy: Specific approach to find this information
2092
+ - target_queries: 2-3 specific search queries to address this gap
2093
+
2094
+ Return ONLY a JSON array of gaps (max 5 most important):
2095
+ [
2096
+ {{
2097
+ "gap_type": "missing_perspective",
2098
+ "description": "Lack of industry expert opinions on quantum computing timeline",
2099
+ "importance": 8,
2100
+ "search_strategy": "Search for expert interviews, industry reports, and analyst predictions",
2101
+ "target_queries": ["quantum computing expert predictions 2024", "industry analysis quantum timeline", "quantum computing roadmap enterprise"]
2102
+ }}
2103
+ ]
2104
+
2105
+ CRITICAL: Return ONLY the JSON array, no other text.
2106
+ """
2107
+
2108
+ try:
2109
+ response = self.llm.generate(gap_analysis_prompt)
2110
+
2111
+ # Extract text from response (handle GenerateResponse objects)
2112
+ if hasattr(response, 'text'):
2113
+ response_text = response.text
2114
+ elif hasattr(response, 'content'):
2115
+ response_text = response.content
2116
+ else:
2117
+ response_text = str(response)
2118
+
2119
+ # Extract JSON from response
2120
+ import json
2121
+ import re
2122
+
2123
+ # Find JSON array in response
2124
+ json_match = re.search(r'\[.*\]', response_text, re.DOTALL)
2125
+ if json_match:
2126
+ gaps_data = json.loads(json_match.group())
2127
+
2128
+ # Filter gaps by importance (only keep high-importance ones)
2129
+ important_gaps = [gap for gap in gaps_data if gap.get('importance', 0) >= 6]
2130
+
2131
+ logger.info(f"🔍 Gap analysis identified {len(important_gaps)} high-priority gaps")
2132
+ return important_gaps
2133
+ else:
2134
+ logger.warning("No valid JSON found in gap analysis response")
2135
+ return []
2136
+
2137
+ except Exception as e:
2138
+ logger.warning(f"Gap analysis failed: {e}")
2139
+ return []
2140
+
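The array extraction and importance filter above, shown on an invented gap-analysis reply:

import json
import re

reply = ('Here are the gaps:\n'
         '[{"gap_type": "recent_development", "importance": 8},'
         ' {"gap_type": "technical_detail", "importance": 4}]')
match = re.search(r"\[.*\]", reply, re.DOTALL)
gaps = json.loads(match.group()) if match else []
important = [g for g in gaps if g.get("importance", 0) >= 6]
print(important)  # only the importance-8 gap survives the >= 6 filter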
2141
+ def _execute_gap_searches(self, gaps: List[Dict[str, Any]], original_query: str) -> List[ResearchFinding]:
2142
+ """Execute targeted searches to address identified gaps"""
2143
+
2144
+ new_findings = []
2145
+
2146
+ for gap in gaps:
2147
+ gap_type = gap.get('gap_type', 'unknown')
2148
+ description = gap.get('description', '')
2149
+ target_queries = gap.get('target_queries', [])
2150
+
2151
+ logger.info(f"🎯 Addressing gap: {gap_type} - {description}")
2152
+
2153
+ # Execute searches for this gap
2154
+ for query in target_queries[:2]: # Limit to 2 queries per gap
2155
+ try:
2156
+ logger.info(f"🔍 Gap search: {query}")
2157
+
2158
+ # Use existing search infrastructure
2159
+ gap_findings = self._execute_search(query, f"gap_{gap_type}")
2160
+
2161
+ if gap_findings:
2162
+ # Mark findings as gap-addressing
2163
+ for finding in gap_findings:
2164
+ finding.sub_task_id = f"reflexive_gap_{gap_type}"
2165
+
2166
+ new_findings.extend(gap_findings)
2167
+ logger.info(f"✅ Found {len(gap_findings)} sources for gap: {description}")
2168
+
2169
+ except Exception as e:
2170
+ logger.warning(f"Gap search failed for '{query}': {e}")
2171
+ continue
2172
+
2173
+ return new_findings