cite-agent 1.0.4-py3-none-any.whl → 1.0.5-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cite-agent might be problematic.
- cite_agent/__init__.py +1 -1
- cite_agent/account_client.py +19 -46
- cite_agent/agent_backend_only.py +30 -4
- cite_agent/cli.py +24 -26
- cite_agent/cli_conversational.py +294 -0
- cite_agent/enhanced_ai_agent.py +2776 -118
- cite_agent/setup_config.py +5 -21
- cite_agent/streaming_ui.py +252 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/METADATA +4 -3
- cite_agent-1.0.5.dist-info/RECORD +50 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/top_level.txt +1 -0
- src/__init__.py +1 -0
- src/services/__init__.py +132 -0
- src/services/auth_service/__init__.py +3 -0
- src/services/auth_service/auth_manager.py +33 -0
- src/services/graph/__init__.py +1 -0
- src/services/graph/knowledge_graph.py +194 -0
- src/services/llm_service/__init__.py +5 -0
- src/services/llm_service/llm_manager.py +495 -0
- src/services/paper_service/__init__.py +5 -0
- src/services/paper_service/openalex.py +231 -0
- src/services/performance_service/__init__.py +1 -0
- src/services/performance_service/rust_performance.py +395 -0
- src/services/research_service/__init__.py +23 -0
- src/services/research_service/chatbot.py +2056 -0
- src/services/research_service/citation_manager.py +436 -0
- src/services/research_service/context_manager.py +1441 -0
- src/services/research_service/conversation_manager.py +597 -0
- src/services/research_service/critical_paper_detector.py +577 -0
- src/services/research_service/enhanced_research.py +121 -0
- src/services/research_service/enhanced_synthesizer.py +375 -0
- src/services/research_service/query_generator.py +777 -0
- src/services/research_service/synthesizer.py +1273 -0
- src/services/search_service/__init__.py +5 -0
- src/services/search_service/indexer.py +186 -0
- src/services/search_service/search_engine.py +342 -0
- src/services/simple_enhanced_main.py +287 -0
- cite_agent/__distribution__.py +0 -7
- cite_agent-1.0.4.dist-info/RECORD +0 -23
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/WHEEL +0 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/licenses/LICENSE +0 -0
src/services/research_service/query_generator.py
@@ -0,0 +1,777 @@
# src/services/research_service/query_generator.py

import logging
import re
import asyncio
import json
from typing import List, Dict, Any, Optional
from datetime import datetime, timezone

from src.services.llm_service.llm_manager import LLMManager

# Configure structured logging
logger = logging.getLogger(__name__)


def _utc_timestamp() -> str:
    return datetime.now(timezone.utc).isoformat()


class EnhancedQueryGenerator:
    """
    Enhanced query generator with comprehensive error handling, security, and observability.

    Features:
    - Secure query generation and optimization
    - Input validation and sanitization
    - Comprehensive error handling and retry logic
    - Structured logging and monitoring
    - Protection against injection attacks
    - Research plan generation
    """

    def __init__(self, llm_manager: LLMManager):
        """
        Initialize query generator with enhanced security and error handling.

        Args:
            llm_manager: LLM manager instance

        Raises:
            ValueError: If LLM manager is invalid
        """
        try:
            if not llm_manager:
                raise ValueError("LLM manager instance is required")

            logger.info("Initializing EnhancedQueryGenerator with enhanced security")
            self.llm_manager = llm_manager
            logger.info("EnhancedQueryGenerator initialized successfully")

        except Exception as e:
            logger.error(f"Failed to initialize EnhancedQueryGenerator: {str(e)}")
            raise

    def _validate_topic(self, topic: str) -> None:
        """
        Validate research topic for security and safety.

        Args:
            topic: Research topic to validate

        Raises:
            ValueError: If topic is invalid
        """
        if not isinstance(topic, str):
            raise ValueError("Topic must be a string")

        if not topic.strip():
            raise ValueError("Topic cannot be empty")

        if len(topic) > 500:  # Reasonable limit
            raise ValueError("Topic too long (max 500 characters)")

        # Check for potentially dangerous content
        dangerous_patterns = [
            r'<script.*?>.*?</script>',  # Script tags
            r'javascript:',  # JavaScript protocol
            r'data:text/html',  # Data URLs
            r'vbscript:',  # VBScript
        ]

        for pattern in dangerous_patterns:
            if re.search(pattern, topic, re.IGNORECASE):
                raise ValueError(f"Topic contains potentially dangerous patterns: {pattern}")

    def _validate_research_intent(self, research_intent: str) -> None:
        """
        Validate research intent for security and safety.

        Args:
            research_intent: Research intent to validate

        Raises:
            ValueError: If research intent is invalid
        """
        if not isinstance(research_intent, str):
            raise ValueError("Research intent must be a string")

        if not research_intent.strip():
            raise ValueError("Research intent cannot be empty")

        if len(research_intent) > 2000:  # Reasonable limit
            raise ValueError("Research intent too long (max 2000 characters)")

        # Check for potentially dangerous content
        dangerous_patterns = [
            r'<script.*?>.*?</script>',  # Script tags
            r'javascript:',  # JavaScript protocol
            r'data:text/html',  # Data URLs
            r'vbscript:',  # VBScript
        ]

        for pattern in dangerous_patterns:
            if re.search(pattern, research_intent, re.IGNORECASE):
                raise ValueError(f"Research intent contains potentially dangerous patterns: {pattern}")

    def _sanitize_text(self, text: str, max_length: int = 2000) -> str:
        """
        Sanitize text to prevent injection attacks.

        Args:
            text: Text to sanitize
            max_length: Maximum allowed length

        Returns:
            Sanitized text
        """
        if not isinstance(text, str):
            raise ValueError("Text must be a string")

        if len(text) > max_length:
            text = text[:max_length]

        # Basic XSS protection
        sanitized = text.replace('<', '&lt;').replace('>', '&gt;')

        # Remove null bytes and other control characters
        sanitized = ''.join(char for char in sanitized if ord(char) >= 32 or char in '\n\r\t')

        return sanitized.strip()

    async def generate_research_queries(self,
                                        topic: str,
                                        research_intent: str,
                                        context: Optional[Dict] = None) -> List[str]:
        """
        Generate optimized search queries with enhanced error handling and security.

        Args:
            topic: Main research topic
            research_intent: Detailed description of research goals and focus
            context: Optional additional context (background, field, etc.)

        Returns:
            List of optimized search queries

        Raises:
            ValueError: If inputs are invalid
            ConnectionError: If query generation fails
        """
        try:
            # Input validation and sanitization
            self._validate_topic(topic)
            self._validate_research_intent(research_intent)

            sanitized_topic = self._sanitize_text(topic, max_length=500)
            sanitized_intent = self._sanitize_text(research_intent, max_length=2000)

            logger.info(f"Generating research queries for topic: {sanitized_topic[:50]}...")

            prompt = f"""You are an expert academic researcher helping to formulate optimal search queries.

RESEARCH TOPIC: {sanitized_topic}

RESEARCH INTENT:
{sanitized_intent}

{self._format_context(context) if context else ""}

Generate 5-7 search queries that would find the most relevant academic papers for this research.
For each query:
1. Focus on different aspects/angles of the research topic
2. Use terminology and phrasing typically found in academic papers
3. Include relevant field-specific keywords
4. Consider both broader conceptual searches and more specific technical searches
5. Optimize for finding high-quality, relevant papers rather than general information

Format your response as a list of queries only, one per line, with no numbering or other text.
"""

            # Generate queries with retry logic
            queries = await self._generate_queries_with_retry(prompt, sanitized_topic)

            logger.info(f"Successfully generated {len(queries)} research queries")
            return queries

        except ValueError as e:
            logger.error(f"Invalid input for query generation: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error generating research queries: {str(e)}")
            raise

    async def _generate_queries_with_retry(self, prompt: str, fallback_topic: str, max_retries: int = 3) -> List[str]:
        """
        Generate queries with retry logic.

        Args:
            prompt: Generation prompt
            fallback_topic: Fallback topic if generation fails
            max_retries: Maximum retry attempts

        Returns:
            List of generated queries
        """
        last_error = None

        for attempt in range(max_retries):
            try:
                response = await self.llm_manager.generate_synthesis(
                    [{"content": prompt}],
                    prompt
                )

                if isinstance(response, dict) and "summary" in response:
                    response_text = response["summary"]
                else:
                    response_text = str(response)

                # Parse queries from response
                queries = [q.strip() for q in response_text.split('\n') if q.strip()]

                # Validate queries
                valid_queries = []
                for query in queries:
                    if len(query) > 10 and len(query) < 200:  # Reasonable length
                        sanitized_query = self._sanitize_text(query, max_length=200)
                        valid_queries.append(sanitized_query)

                # Ensure we got at least one query
                if valid_queries:
                    return valid_queries[:7]  # Limit to 7 queries
                else:
                    raise ValueError("No valid queries generated")

            except Exception as e:
                last_error = e
                logger.warning(f"Query generation attempt {attempt + 1} failed: {str(e)}")

                if attempt < max_retries - 1:
                    await asyncio.sleep(1)  # Short delay between retries

        # All retries failed, return fallback
        logger.warning(f"All query generation attempts failed, using fallback")
        return [fallback_topic]

    async def generate_research_plan(self, topic: str, research_intent: str, context: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Generate a comprehensive research plan with enhanced error handling and security.

        Args:
            topic: Main research topic
            research_intent: Detailed description of research goals and focus
            context: Optional additional context

        Returns:
            Comprehensive research plan

        Raises:
            ValueError: If inputs are invalid
            ConnectionError: If plan generation fails
        """
        try:
            # Input validation and sanitization
            self._validate_topic(topic)
            self._validate_research_intent(research_intent)

            sanitized_topic = self._sanitize_text(topic, max_length=500)
            sanitized_intent = self._sanitize_text(research_intent, max_length=2000)

            logger.info(f"Generating research plan for topic: {sanitized_topic[:50]}...")

            # Create a prompt specifically focused on detailed keywords and queries
            prompt = f"""You are a research expert creating a structured research plan for "{sanitized_topic}".

RESEARCH TOPIC: {sanitized_topic}

RESEARCH INTENT:
{sanitized_intent}

{self._format_context(context) if context else ""}

I need a DETAILED research plan with particular focus on search terms and queries.

Your plan MUST include:

1. Primary Research Question: A precise, focused question that guides the investigation

2. Sub-Questions (5-7): Specific questions that break down the main research question into manageable parts
   - Include technical questions about methods and implementations
   - Include questions about current limitations and challenges
   - Include questions about practical applications

3. Relevant Academic Disciplines: List 3-5 specific academic fields relevant to this research

4. Search Strategy:
   - Keywords (10-15): Technical terms, scientific concepts, and domain-specific vocabulary
   - Search Queries (5-7): Carefully crafted search strings that would yield relevant academic papers

5. Methodological Considerations: Brief notes on research approaches

CRITICAL: For each section, particularly keywords and search queries, BE EXTREMELY SPECIFIC TO THE TOPIC.
For example, for quantum computing in drug discovery, include terms like "quantum chemistry algorithms",
"molecular docking", "NISQ devices in pharmaceutical research", etc.

Format your response EXACTLY as a JSON object with these sections.
"""

            # Generate plan with retry logic
            plan = await self._generate_plan_with_retry(prompt, sanitized_topic)

            logger.info("Successfully generated research plan")
            return plan

        except ValueError as e:
            logger.error(f"Invalid input for plan generation: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error generating research plan: {str(e)}")
            raise

    async def _generate_plan_with_retry(self, prompt: str, topic: str, max_retries: int = 3) -> Dict[str, Any]:
        """
        Generate research plan with retry logic.

        Args:
            prompt: Generation prompt
            topic: Research topic
            max_retries: Maximum retry attempts

        Returns:
            Generated research plan
        """
        last_error = None

        for attempt in range(max_retries):
            try:
                # Generate plan
                response = await self.llm_manager.generate_synthesis(
                    [{"content": prompt}],
                    prompt
                )

                if isinstance(response, dict) and "summary" in response:
                    response_text = response["summary"]
                else:
                    response_text = str(response)

                logger.debug(f"Research plan raw response: {response_text[:100]}...")

                # Parse JSON response
                plan = self._parse_json_plan(response_text, topic)
                return plan

            except Exception as e:
                last_error = e
                logger.warning(f"Plan generation attempt {attempt + 1} failed: {str(e)}")

                if attempt < max_retries - 1:
                    await asyncio.sleep(2)  # Longer delay for plan generation

        # All retries failed, return default plan
        logger.warning(f"All plan generation attempts failed, using default plan")
        return self._generate_default_plan(topic)

    def _parse_json_plan(self, response_text: str, topic: str) -> Dict[str, Any]:
        """
        Parse JSON research plan with enhanced error handling.

        Args:
            response_text: LLM response text
            topic: Research topic

        Returns:
            Parsed research plan
        """
        try:
            # Find JSON in response by looking for opening/closing braces
            json_start = response_text.find('{')
            json_end = response_text.rfind('}') + 1

            if json_start >= 0 and json_end > json_start:
                json_str = response_text[json_start:json_end]
                # Try to fix common JSON issues before parsing
                json_str = json_str.replace('\n', ' ').replace('\\', '\\\\')
                plan = json.loads(json_str)

                # Ensure we have the required fields with defaults if missing
                plan = self._validate_and_fix_plan(plan, topic)
                return plan
            else:
                # Extract structured data if JSON not found
                return self._extract_structured_plan(response_text, topic)

        except json.JSONDecodeError as e:
            logger.warning(f"JSON parsing failed: {str(e)}, extracting structured data")
            return self._extract_structured_plan(response_text, topic)
        except Exception as e:
            logger.error(f"Error parsing JSON plan: {str(e)}")
            return self._generate_default_plan(topic)

    def _validate_and_fix_plan(self, plan: Dict[str, Any], topic: str) -> Dict[str, Any]:
        """
        Validate and fix research plan with defaults.

        Args:
            plan: Research plan to validate
            topic: Research topic

        Returns:
            Validated and fixed plan
        """
        try:
            if not isinstance(plan, dict):
                plan = {}

            # Ensure required fields with defaults
            if "primary_research_question" not in plan:
                plan["primary_research_question"] = f"How can {topic} be effectively developed and applied?"

            if "sub_questions" not in plan or not plan["sub_questions"]:
                plan["sub_questions"] = [
                    f"What are the current applications of {topic}?",
                    f"What are the technical challenges in implementing {topic}?",
                    f"How does {topic} compare to traditional approaches?",
                    f"What are the performance metrics for evaluating {topic}?",
                    f"What future developments are expected in {topic}?"
                ]

            if "disciplines" not in plan or not plan["disciplines"]:
                plan["disciplines"] = ["Computer Science", "Physics", "Chemistry", "Bioinformatics"]

            if "search_strategy" not in plan:
                plan["search_strategy"] = {}

            if "keywords" not in plan["search_strategy"] or not plan["search_strategy"]["keywords"]:
                # Generate topic-specific keywords
                words = topic.split()
                plan["search_strategy"]["keywords"] = [
                    topic,
                    f"{topic} applications",
                    f"{topic} algorithms",
                    f"{topic} implementations",
                    f"{topic} challenges",
                    " ".join(words[:1] + ["quantum"]),
                    " ".join(words[:1] + ["simulation"]),
                    " ".join(words[:1] + ["optimization"]),
                ]

            if "queries" not in plan["search_strategy"] or not plan["search_strategy"]["queries"]:
                # Generate topic-specific queries
                plan["search_strategy"]["queries"] = [
                    f'"{topic}" recent advances',
                    f'"{topic}" review',
                    f'"{topic}" applications',
                    f'"{topic}" implementation challenges',
                    f'"{topic}" performance comparison',
                ]

            if "methodological_considerations" not in plan:
                plan["methodological_considerations"] = f"Research on {topic} requires interdisciplinary approaches combining theoretical analysis and practical implementation."

            return plan

        except Exception as e:
            logger.error(f"Error validating and fixing plan: {str(e)}")
            return self._generate_default_plan(topic)

    def _extract_structured_plan(self, text: str, topic: str = "this topic") -> Dict[str, Any]:
        """
        Extract structured plan from text with enhanced error handling.

        Args:
            text: Text to extract from
            topic: Research topic

        Returns:
            Extracted plan
        """
        try:
            # Sanitize text
            sanitized_text = self._sanitize_text(text, max_length=5000)

            # Extract questions
            questions = []
            lines = sanitized_text.split('\n')
            for line in lines:
                line = line.strip()
                if line and ('?' in line or line.startswith('Q')):
                    question = self._sanitize_text(line, max_length=300)
                    questions.append(question)

            # Extract keywords
            keywords = []
            for line in lines:
                line = line.strip()
                if line and len(line) < 50 and not line.startswith(('Q', 'A', '-', '*')):
                    keyword = self._sanitize_text(line, max_length=100)
                    keywords.append(keyword)

            return {
                "primary_research_question": f"How can {topic} be effectively developed and applied?",
                "sub_questions": questions[:7] if questions else [
                    f"What are the current applications of {topic}?",
                    f"What are the technical challenges in implementing {topic}?",
                    f"How does {topic} compare to traditional approaches?"
                ],
                "disciplines": ["Computer Science", "Physics", "Chemistry"],
                "search_strategy": {
                    "keywords": keywords[:15] if keywords else [topic, f"{topic} applications"],
                    "queries": [
                        f'"{topic}" recent advances',
                        f'"{topic}" review',
                        f'"{topic}" applications'
                    ]
                },
                "methodological_considerations": f"Research on {topic} requires systematic analysis and experimental validation."
            }

        except Exception as e:
            logger.error(f"Error extracting structured plan: {str(e)}")
            return self._generate_default_plan(topic)

    def _generate_default_plan(self, topic: str) -> Dict[str, Any]:
        """
        Generate default research plan.

        Args:
            topic: Research topic

        Returns:
            Default research plan
        """
        return {
            "primary_research_question": f"How can {topic} be effectively developed and applied?",
            "sub_questions": [
                f"What are the current applications of {topic}?",
                f"What are the technical challenges in implementing {topic}?",
                f"How does {topic} compare to traditional approaches?",
                f"What are the performance metrics for evaluating {topic}?",
                f"What future developments are expected in {topic}?"
            ],
            "disciplines": ["Computer Science", "Physics", "Chemistry", "Bioinformatics"],
            "search_strategy": {
                "keywords": [
                    topic,
                    f"{topic} algorithms",
                    f"{topic} implementations",
                    f"{topic} challenges",
                    "quantum computing",
                    "quantum chemistry",
                    "molecular simulation",
                    "quantum algorithms"
                ],
                "queries": [
                    f'"{topic}" recent advances',
                    f'"{topic}" review',
                    f'"{topic}" applications',
                    f'"{topic}" implementation challenges',
                    f'"{topic}" performance comparison'
                ]
            },
            "methodological_considerations": f"Research on {topic} requires interdisciplinary approaches combining theoretical analysis and practical implementation."
        }

    def _format_context(self, context: Optional[Dict]) -> str:
        """
        Format context for prompts with enhanced error handling.

        Args:
            context: Context dictionary

        Returns:
            Formatted context string
        """
        try:
            if not context:
                return ""

            if not isinstance(context, dict):
                logger.warning("Context is not a dictionary, ignoring")
                return ""

            context_parts = []

            if "background" in context:
                background = self._sanitize_text(str(context["background"]), max_length=500)
                context_parts.append(f"BACKGROUND: {background}")

            if "field" in context:
                field = self._sanitize_text(str(context["field"]), max_length=200)
                context_parts.append(f"FIELD: {field}")

            if "constraints" in context:
                constraints = self._sanitize_text(str(context["constraints"]), max_length=300)
                context_parts.append(f"CONSTRAINTS: {constraints}")

            if "goals" in context:
                goals = self._sanitize_text(str(context["goals"]), max_length=300)
                context_parts.append(f"GOALS: {goals}")

            return "\n\n".join(context_parts) if context_parts else ""

        except Exception as e:
            logger.error(f"Error formatting context: {str(e)}")
            return ""

    async def generate_concept_queries(self, concept: str, context: Optional[Dict] = None) -> List[str]:
        """
        Generate concept-specific queries with enhanced error handling and security.

        Args:
            concept: Concept to generate queries for
            context: Optional context

        Returns:
            List of concept queries

        Raises:
            ValueError: If concept is invalid
        """
        try:
            # Input validation
            self._validate_topic(concept)

            sanitized_concept = self._sanitize_text(concept, max_length=500)

            logger.info(f"Generating concept queries for: {sanitized_concept[:50]}...")

            prompt = f"""
Generate 5-7 search queries specifically for the concept: "{sanitized_concept}"

{self._format_context(context) if context else ""}

Focus on:
1. Core concept definition and theory
2. Practical applications and implementations
3. Related technologies and methods
4. Current research and developments
5. Challenges and limitations

Return only the queries, one per line.
"""

            # Generate queries with retry logic
            queries = await self._generate_queries_with_retry(prompt, sanitized_concept)

            logger.info(f"Successfully generated {len(queries)} concept queries")
            return queries

        except ValueError as e:
            logger.error(f"Invalid input for concept query generation: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error generating concept queries: {str(e)}")
            return [concept]  # Fallback

    async def identify_related_concepts(self,
                                        concept: str,
                                        web_sources: List[Dict],
                                        academic_sources: List[Dict]) -> List[str]:
        """
        Identify related concepts with enhanced error handling and security.

        Args:
            concept: Main concept
            web_sources: List of web sources
            academic_sources: List of academic sources

        Returns:
            List of related concepts

        Raises:
            ValueError: If concept is invalid
        """
        try:
            # Input validation
            self._validate_topic(concept)

            if not isinstance(web_sources, list):
                web_sources = []
            if not isinstance(academic_sources, list):
                academic_sources = []

            sanitized_concept = self._sanitize_text(concept, max_length=500)

            logger.info(f"Identifying related concepts for: {sanitized_concept[:50]}...")

            # Prepare source summaries
            source_texts = []

            for source in web_sources[:5]:  # Limit to 5 sources
                if isinstance(source, dict) and source.get('content'):
                    content = self._sanitize_text(str(source['content']), max_length=500)
                    source_texts.append(content)

            for source in academic_sources[:5]:  # Limit to 5 sources
                if isinstance(source, dict) and source.get('summary'):
                    summary = self._sanitize_text(str(source['summary']), max_length=500)
                    source_texts.append(summary)

            if not source_texts:
                logger.warning("No source content available for concept identification")
                return []

            prompt = f"""
Based on the following sources, identify 5-10 concepts related to "{sanitized_concept}":

Sources:
{' '.join(source_texts)}

Focus on:
1. Directly related concepts
2. Supporting technologies
3. Complementary approaches
4. Related methodologies
5. Associated applications

Return only the concept names, one per line.
"""

            # Generate related concepts with retry logic
            concepts = await self._generate_queries_with_retry(prompt, sanitized_concept)

            logger.info(f"Successfully identified {len(concepts)} related concepts")
            return concepts

        except ValueError as e:
            logger.error(f"Invalid input for concept identification: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error identifying related concepts: {str(e)}")
            return []

    async def health_check(self) -> Dict[str, Any]:
        """
        Perform health check of the query generator.

        Returns:
            Health status
        """
        try:
            health_status = {
                "status": "healthy",
                "timestamp": _utc_timestamp(),
                "components": {}
            }

            # Check LLM manager
            try:
                llm_health = await self.llm_manager.health_check()
                health_status["components"]["llm_manager"] = llm_health
                if llm_health.get("status") != "healthy":
                    health_status["status"] = "degraded"
            except Exception as e:
                health_status["components"]["llm_manager"] = {"status": "error", "error": str(e)}
                health_status["status"] = "degraded"

            logger.info(f"Health check completed: {health_status['status']}")
            return health_status

        except Exception as e:
            logger.error(f"Health check failed: {str(e)}")
            return {
                "status": "error",
                "error": str(e),
                "timestamp": _utc_timestamp()
            }
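
For orientation, a minimal usage sketch of the new EnhancedQueryGenerator, based only on the signatures shown in the hunk above. The bare LLMManager() construction is an assumption (its constructor lives in src/services/llm_service/llm_manager.py, also added in this release, and may require arguments); the topic, intent, and context values are placeholders.

# Minimal usage sketch for the new EnhancedQueryGenerator (not part of the package).
# LLMManager() is assumed to be constructible without arguments; adjust to the real constructor.
import asyncio

from src.services.llm_service.llm_manager import LLMManager
from src.services.research_service.query_generator import EnhancedQueryGenerator


async def main() -> None:
    llm = LLMManager()  # assumption: see llm_manager.py in this release for the actual arguments
    generator = EnhancedQueryGenerator(llm)

    # 5-7 academic search queries for the topic
    queries = await generator.generate_research_queries(
        topic="quantum computing in drug discovery",
        research_intent="Survey current algorithms and their practical limitations",
        context={"field": "quantum chemistry"},
    )

    # Structured plan; falls back to a default plan if the LLM output cannot be parsed
    plan = await generator.generate_research_plan(
        "quantum computing in drug discovery",
        "Survey current algorithms and their practical limitations",
    )

    print(queries)
    print(plan["search_strategy"]["queries"])


if __name__ == "__main__":
    asyncio.run(main())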