cite-agent 1.3.6-py3-none-any.whl → 1.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cite-agent might be problematic.

Files changed (36)
  1. cite_agent/__version__.py +1 -1
  2. cite_agent/cli.py +9 -2
  3. cite_agent/enhanced_ai_agent.py +1100 -77
  4. {cite_agent-1.3.6.dist-info → cite_agent-1.3.8.dist-info}/METADATA +1 -1
  5. cite_agent-1.3.8.dist-info/RECORD +31 -0
  6. {cite_agent-1.3.6.dist-info → cite_agent-1.3.8.dist-info}/top_level.txt +0 -1
  7. cite_agent-1.3.6.dist-info/RECORD +0 -57
  8. src/__init__.py +0 -1
  9. src/services/__init__.py +0 -132
  10. src/services/auth_service/__init__.py +0 -3
  11. src/services/auth_service/auth_manager.py +0 -33
  12. src/services/graph/__init__.py +0 -1
  13. src/services/graph/knowledge_graph.py +0 -194
  14. src/services/llm_service/__init__.py +0 -5
  15. src/services/llm_service/llm_manager.py +0 -495
  16. src/services/paper_service/__init__.py +0 -5
  17. src/services/paper_service/openalex.py +0 -231
  18. src/services/performance_service/__init__.py +0 -1
  19. src/services/performance_service/rust_performance.py +0 -395
  20. src/services/research_service/__init__.py +0 -23
  21. src/services/research_service/chatbot.py +0 -2056
  22. src/services/research_service/citation_manager.py +0 -436
  23. src/services/research_service/context_manager.py +0 -1441
  24. src/services/research_service/conversation_manager.py +0 -597
  25. src/services/research_service/critical_paper_detector.py +0 -577
  26. src/services/research_service/enhanced_research.py +0 -121
  27. src/services/research_service/enhanced_synthesizer.py +0 -375
  28. src/services/research_service/query_generator.py +0 -777
  29. src/services/research_service/synthesizer.py +0 -1273
  30. src/services/search_service/__init__.py +0 -5
  31. src/services/search_service/indexer.py +0 -186
  32. src/services/search_service/search_engine.py +0 -342
  33. src/services/simple_enhanced_main.py +0 -287
  34. {cite_agent-1.3.6.dist-info → cite_agent-1.3.8.dist-info}/WHEEL +0 -0
  35. {cite_agent-1.3.6.dist-info → cite_agent-1.3.8.dist-info}/entry_points.txt +0 -0
  36. {cite_agent-1.3.6.dist-info → cite_agent-1.3.8.dist-info}/licenses/LICENSE +0 -0
@@ -1,777 +0,0 @@
- # src/services/research_service/query_generator.py
-
- import logging
- import re
- import asyncio
- import json
- from typing import List, Dict, Any, Optional
- from datetime import datetime, timezone
-
- from src.services.llm_service.llm_manager import LLMManager
-
- # Configure structured logging
- logger = logging.getLogger(__name__)
-
-
- def _utc_timestamp() -> str:
-     return datetime.now(timezone.utc).isoformat()
-
- class EnhancedQueryGenerator:
-     """
-     Enhanced query generator with comprehensive error handling, security, and observability.
-
-     Features:
-     - Secure query generation and optimization
-     - Input validation and sanitization
-     - Comprehensive error handling and retry logic
-     - Structured logging and monitoring
-     - Protection against injection attacks
-     - Research plan generation
-     """
-
-     def __init__(self, llm_manager: LLMManager):
-         """
-         Initialize query generator with enhanced security and error handling.
-
-         Args:
-             llm_manager: LLM manager instance
-
-         Raises:
-             ValueError: If LLM manager is invalid
-         """
-         try:
-             if not llm_manager:
-                 raise ValueError("LLM manager instance is required")
-
-             logger.info("Initializing EnhancedQueryGenerator with enhanced security")
-             self.llm_manager = llm_manager
-             logger.info("EnhancedQueryGenerator initialized successfully")
-
-         except Exception as e:
-             logger.error(f"Failed to initialize EnhancedQueryGenerator: {str(e)}")
-             raise
-
-     def _validate_topic(self, topic: str) -> None:
-         """
-         Validate research topic for security and safety.
-
-         Args:
-             topic: Research topic to validate
-
-         Raises:
-             ValueError: If topic is invalid
-         """
-         if not isinstance(topic, str):
-             raise ValueError("Topic must be a string")
-
-         if not topic.strip():
-             raise ValueError("Topic cannot be empty")
-
-         if len(topic) > 500: # Reasonable limit
-             raise ValueError("Topic too long (max 500 characters)")
-
-         # Check for potentially dangerous content
-         dangerous_patterns = [
-             r'<script.*?>.*?</script>', # Script tags
-             r'javascript:', # JavaScript protocol
-             r'data:text/html', # Data URLs
-             r'vbscript:', # VBScript
-         ]
-
-         for pattern in dangerous_patterns:
-             if re.search(pattern, topic, re.IGNORECASE):
-                 raise ValueError(f"Topic contains potentially dangerous patterns: {pattern}")
-
-     def _validate_research_intent(self, research_intent: str) -> None:
-         """
-         Validate research intent for security and safety.
-
-         Args:
-             research_intent: Research intent to validate
-
-         Raises:
-             ValueError: If research intent is invalid
-         """
-         if not isinstance(research_intent, str):
-             raise ValueError("Research intent must be a string")
-
-         if not research_intent.strip():
-             raise ValueError("Research intent cannot be empty")
-
-         if len(research_intent) > 2000: # Reasonable limit
-             raise ValueError("Research intent too long (max 2000 characters)")
-
-         # Check for potentially dangerous content
-         dangerous_patterns = [
-             r'<script.*?>.*?</script>', # Script tags
-             r'javascript:', # JavaScript protocol
-             r'data:text/html', # Data URLs
-             r'vbscript:', # VBScript
-         ]
-
-         for pattern in dangerous_patterns:
-             if re.search(pattern, research_intent, re.IGNORECASE):
-                 raise ValueError(f"Research intent contains potentially dangerous patterns: {pattern}")
-
-     def _sanitize_text(self, text: str, max_length: int = 2000) -> str:
-         """
-         Sanitize text to prevent injection attacks.
-
-         Args:
-             text: Text to sanitize
-             max_length: Maximum allowed length
-
-         Returns:
-             Sanitized text
-         """
-         if not isinstance(text, str):
-             raise ValueError("Text must be a string")
-
-         if len(text) > max_length:
-             text = text[:max_length]
-
-         # Basic XSS protection
-         sanitized = text.replace('<', '&lt;').replace('>', '&gt;')
-
-         # Remove null bytes and other control characters
-         sanitized = ''.join(char for char in sanitized if ord(char) >= 32 or char in '\n\r\t')
-
-         return sanitized.strip()
-
-     async def generate_research_queries(self,
-                                         topic: str,
-                                         research_intent: str,
-                                         context: Optional[Dict] = None) -> List[str]:
-         """
-         Generate optimized search queries with enhanced error handling and security.
-
-         Args:
-             topic: Main research topic
-             research_intent: Detailed description of research goals and focus
-             context: Optional additional context (background, field, etc.)
-
-         Returns:
-             List of optimized search queries
-
-         Raises:
-             ValueError: If inputs are invalid
-             ConnectionError: If query generation fails
-         """
-         try:
-             # Input validation and sanitization
-             self._validate_topic(topic)
-             self._validate_research_intent(research_intent)
-
-             sanitized_topic = self._sanitize_text(topic, max_length=500)
-             sanitized_intent = self._sanitize_text(research_intent, max_length=2000)
-
-             logger.info(f"Generating research queries for topic: {sanitized_topic[:50]}...")
-
-             prompt = f"""You are an expert academic researcher helping to formulate optimal search queries.
-
- RESEARCH TOPIC: {sanitized_topic}
-
- RESEARCH INTENT:
- {sanitized_intent}
-
- {self._format_context(context) if context else ""}
-
- Generate 5-7 search queries that would find the most relevant academic papers for this research.
- For each query:
- 1. Focus on different aspects/angles of the research topic
- 2. Use terminology and phrasing typically found in academic papers
- 3. Include relevant field-specific keywords
- 4. Consider both broader conceptual searches and more specific technical searches
- 5. Optimize for finding high-quality, relevant papers rather than general information
-
- Format your response as a list of queries only, one per line, with no numbering or other text.
- """
-
-             # Generate queries with retry logic
-             queries = await self._generate_queries_with_retry(prompt, sanitized_topic)
-
-             logger.info(f"Successfully generated {len(queries)} research queries")
-             return queries
-
-         except ValueError as e:
-             logger.error(f"Invalid input for query generation: {str(e)}")
-             raise
-         except Exception as e:
-             logger.error(f"Error generating research queries: {str(e)}")
-             raise
-
-     async def _generate_queries_with_retry(self, prompt: str, fallback_topic: str, max_retries: int = 3) -> List[str]:
-         """
-         Generate queries with retry logic.
-
-         Args:
-             prompt: Generation prompt
-             fallback_topic: Fallback topic if generation fails
-             max_retries: Maximum retry attempts
-
-         Returns:
-             List of generated queries
-         """
-         last_error = None
-
-         for attempt in range(max_retries):
-             try:
-                 response = await self.llm_manager.generate_synthesis(
-                     [{"content": prompt}],
-                     prompt
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     response_text = response["summary"]
-                 else:
-                     response_text = str(response)
-
-                 # Parse queries from response
-                 queries = [q.strip() for q in response_text.split('\n') if q.strip()]
-
-                 # Validate queries
-                 valid_queries = []
-                 for query in queries:
-                     if len(query) > 10 and len(query) < 200: # Reasonable length
-                         sanitized_query = self._sanitize_text(query, max_length=200)
-                         valid_queries.append(sanitized_query)
-
-                 # Ensure we got at least one query
-                 if valid_queries:
-                     return valid_queries[:7] # Limit to 7 queries
-                 else:
-                     raise ValueError("No valid queries generated")
-
-             except Exception as e:
-                 last_error = e
-                 logger.warning(f"Query generation attempt {attempt + 1} failed: {str(e)}")
-
-                 if attempt < max_retries - 1:
-                     await asyncio.sleep(1) # Short delay between retries
-
-         # All retries failed, return fallback
-         logger.warning(f"All query generation attempts failed, using fallback")
-         return [fallback_topic]
-
-     async def generate_research_plan(self, topic: str, research_intent: str, context: Optional[Dict] = None) -> Dict[str, Any]:
-         """
-         Generate a comprehensive research plan with enhanced error handling and security.
-
-         Args:
-             topic: Main research topic
-             research_intent: Detailed description of research goals and focus
-             context: Optional additional context
-
-         Returns:
-             Comprehensive research plan
-
-         Raises:
-             ValueError: If inputs are invalid
-             ConnectionError: If plan generation fails
-         """
-         try:
-             # Input validation and sanitization
-             self._validate_topic(topic)
-             self._validate_research_intent(research_intent)
-
-             sanitized_topic = self._sanitize_text(topic, max_length=500)
-             sanitized_intent = self._sanitize_text(research_intent, max_length=2000)
-
-             logger.info(f"Generating research plan for topic: {sanitized_topic[:50]}...")
-
-             # Create a prompt specifically focused on detailed keywords and queries
-             prompt = f"""You are a research expert creating a structured research plan for "{sanitized_topic}".
-
- RESEARCH TOPIC: {sanitized_topic}
-
- RESEARCH INTENT:
- {sanitized_intent}
-
- {self._format_context(context) if context else ""}
-
- I need a DETAILED research plan with particular focus on search terms and queries.
-
- Your plan MUST include:
-
- 1. Primary Research Question: A precise, focused question that guides the investigation
-
- 2. Sub-Questions (5-7): Specific questions that break down the main research question into manageable parts
- - Include technical questions about methods and implementations
- - Include questions about current limitations and challenges
- - Include questions about practical applications
-
- 3. Relevant Academic Disciplines: List 3-5 specific academic fields relevant to this research
-
- 4. Search Strategy:
- - Keywords (10-15): Technical terms, scientific concepts, and domain-specific vocabulary
- - Search Queries (5-7): Carefully crafted search strings that would yield relevant academic papers
-
- 5. Methodological Considerations: Brief notes on research approaches
-
- CRITICAL: For each section, particularly keywords and search queries, BE EXTREMELY SPECIFIC TO THE TOPIC.
- For example, for quantum computing in drug discovery, include terms like "quantum chemistry algorithms",
- "molecular docking", "NISQ devices in pharmaceutical research", etc.
-
- Format your response EXACTLY as a JSON object with these sections.
- """
-
-             # Generate plan with retry logic
-             plan = await self._generate_plan_with_retry(prompt, sanitized_topic)
-
-             logger.info("Successfully generated research plan")
-             return plan
-
-         except ValueError as e:
-             logger.error(f"Invalid input for plan generation: {str(e)}")
-             raise
-         except Exception as e:
-             logger.error(f"Error generating research plan: {str(e)}")
-             raise
-
-     async def _generate_plan_with_retry(self, prompt: str, topic: str, max_retries: int = 3) -> Dict[str, Any]:
-         """
-         Generate research plan with retry logic.
-
-         Args:
-             prompt: Generation prompt
-             topic: Research topic
-             max_retries: Maximum retry attempts
-
-         Returns:
-             Generated research plan
-         """
-         last_error = None
-
-         for attempt in range(max_retries):
-             try:
-                 # Generate plan
-                 response = await self.llm_manager.generate_synthesis(
-                     [{"content": prompt}],
-                     prompt
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     response_text = response["summary"]
-                 else:
-                     response_text = str(response)
-
-                 logger.debug(f"Research plan raw response: {response_text[:100]}...")
-
-                 # Parse JSON response
-                 plan = self._parse_json_plan(response_text, topic)
-                 return plan
-
-             except Exception as e:
-                 last_error = e
-                 logger.warning(f"Plan generation attempt {attempt + 1} failed: {str(e)}")
-
-                 if attempt < max_retries - 1:
-                     await asyncio.sleep(2) # Longer delay for plan generation
-
-         # All retries failed, return default plan
-         logger.warning(f"All plan generation attempts failed, using default plan")
-         return self._generate_default_plan(topic)
-
-     def _parse_json_plan(self, response_text: str, topic: str) -> Dict[str, Any]:
-         """
-         Parse JSON research plan with enhanced error handling.
-
-         Args:
-             response_text: LLM response text
-             topic: Research topic
-
-         Returns:
-             Parsed research plan
-         """
-         try:
-             # Find JSON in response by looking for opening/closing braces
-             json_start = response_text.find('{')
-             json_end = response_text.rfind('}') + 1
-
-             if json_start >= 0 and json_end > json_start:
-                 json_str = response_text[json_start:json_end]
-                 # Try to fix common JSON issues before parsing
-                 json_str = json_str.replace('\n', ' ').replace('\\', '\\\\')
-                 plan = json.loads(json_str)
-
-                 # Ensure we have the required fields with defaults if missing
-                 plan = self._validate_and_fix_plan(plan, topic)
-                 return plan
-             else:
-                 # Extract structured data if JSON not found
-                 return self._extract_structured_plan(response_text, topic)
-
-         except json.JSONDecodeError as e:
-             logger.warning(f"JSON parsing failed: {str(e)}, extracting structured data")
-             return self._extract_structured_plan(response_text, topic)
-         except Exception as e:
-             logger.error(f"Error parsing JSON plan: {str(e)}")
-             return self._generate_default_plan(topic)
-
-     def _validate_and_fix_plan(self, plan: Dict[str, Any], topic: str) -> Dict[str, Any]:
-         """
-         Validate and fix research plan with defaults.
-
-         Args:
-             plan: Research plan to validate
-             topic: Research topic
-
-         Returns:
-             Validated and fixed plan
-         """
-         try:
-             if not isinstance(plan, dict):
-                 plan = {}
-
-             # Ensure required fields with defaults
-             if "primary_research_question" not in plan:
-                 plan["primary_research_question"] = f"How can {topic} be effectively developed and applied?"
-
-             if "sub_questions" not in plan or not plan["sub_questions"]:
-                 plan["sub_questions"] = [
-                     f"What are the current applications of {topic}?",
-                     f"What are the technical challenges in implementing {topic}?",
-                     f"How does {topic} compare to traditional approaches?",
-                     f"What are the performance metrics for evaluating {topic}?",
-                     f"What future developments are expected in {topic}?"
-                 ]
-
-             if "disciplines" not in plan or not plan["disciplines"]:
-                 plan["disciplines"] = ["Computer Science", "Physics", "Chemistry", "Bioinformatics"]
-
-             if "search_strategy" not in plan:
-                 plan["search_strategy"] = {}
-
-             if "keywords" not in plan["search_strategy"] or not plan["search_strategy"]["keywords"]:
-                 # Generate topic-specific keywords
-                 words = topic.split()
-                 plan["search_strategy"]["keywords"] = [
-                     topic,
-                     f"{topic} applications",
-                     f"{topic} algorithms",
-                     f"{topic} implementations",
-                     f"{topic} challenges",
-                     " ".join(words[:1] + ["quantum"]),
-                     " ".join(words[:1] + ["simulation"]),
-                     " ".join(words[:1] + ["optimization"]),
-                 ]
-
-             if "queries" not in plan["search_strategy"] or not plan["search_strategy"]["queries"]:
-                 # Generate topic-specific queries
-                 plan["search_strategy"]["queries"] = [
-                     f'"{topic}" recent advances',
-                     f'"{topic}" review',
-                     f'"{topic}" applications',
-                     f'"{topic}" implementation challenges',
-                     f'"{topic}" performance comparison',
-                 ]
-
-             if "methodological_considerations" not in plan:
-                 plan["methodological_considerations"] = f"Research on {topic} requires interdisciplinary approaches combining theoretical analysis and practical implementation."
-
-             return plan
-
-         except Exception as e:
-             logger.error(f"Error validating and fixing plan: {str(e)}")
-             return self._generate_default_plan(topic)
-
-     def _extract_structured_plan(self, text: str, topic: str = "this topic") -> Dict[str, Any]:
-         """
-         Extract structured plan from text with enhanced error handling.
-
-         Args:
-             text: Text to extract from
-             topic: Research topic
-
-         Returns:
-             Extracted plan
-         """
-         try:
-             # Sanitize text
-             sanitized_text = self._sanitize_text(text, max_length=5000)
-
-             # Extract questions
-             questions = []
-             lines = sanitized_text.split('\n')
-             for line in lines:
-                 line = line.strip()
-                 if line and ('?' in line or line.startswith('Q')):
-                     question = self._sanitize_text(line, max_length=300)
-                     questions.append(question)
-
-             # Extract keywords
-             keywords = []
-             for line in lines:
-                 line = line.strip()
-                 if line and len(line) < 50 and not line.startswith(('Q', 'A', '-', '*')):
-                     keyword = self._sanitize_text(line, max_length=100)
-                     keywords.append(keyword)
-
-             return {
-                 "primary_research_question": f"How can {topic} be effectively developed and applied?",
-                 "sub_questions": questions[:7] if questions else [
-                     f"What are the current applications of {topic}?",
-                     f"What are the technical challenges in implementing {topic}?",
-                     f"How does {topic} compare to traditional approaches?"
-                 ],
-                 "disciplines": ["Computer Science", "Physics", "Chemistry"],
-                 "search_strategy": {
-                     "keywords": keywords[:15] if keywords else [topic, f"{topic} applications"],
-                     "queries": [
-                         f'"{topic}" recent advances',
-                         f'"{topic}" review',
-                         f'"{topic}" applications'
-                     ]
-                 },
-                 "methodological_considerations": f"Research on {topic} requires systematic analysis and experimental validation."
-             }
-
-         except Exception as e:
-             logger.error(f"Error extracting structured plan: {str(e)}")
-             return self._generate_default_plan(topic)
-
-     def _generate_default_plan(self, topic: str) -> Dict[str, Any]:
-         """
-         Generate default research plan.
-
-         Args:
-             topic: Research topic
-
-         Returns:
-             Default research plan
-         """
-         return {
-             "primary_research_question": f"How can {topic} be effectively developed and applied?",
-             "sub_questions": [
-                 f"What are the current applications of {topic}?",
-                 f"What are the technical challenges in implementing {topic}?",
-                 f"How does {topic} compare to traditional approaches?",
-                 f"What are the performance metrics for evaluating {topic}?",
-                 f"What future developments are expected in {topic}?"
-             ],
-             "disciplines": ["Computer Science", "Physics", "Chemistry", "Bioinformatics"],
-             "search_strategy": {
-                 "keywords": [
-                     topic,
-                     f"{topic} algorithms",
-                     f"{topic} implementations",
-                     f"{topic} challenges",
-                     "quantum computing",
-                     "quantum chemistry",
-                     "molecular simulation",
-                     "quantum algorithms"
-                 ],
-                 "queries": [
-                     f'"{topic}" recent advances',
-                     f'"{topic}" review',
-                     f'"{topic}" applications',
-                     f'"{topic}" implementation challenges',
-                     f'"{topic}" performance comparison'
-                 ]
-             },
-             "methodological_considerations": f"Research on {topic} requires interdisciplinary approaches combining theoretical analysis and practical implementation."
-         }
-
-     def _format_context(self, context: Optional[Dict]) -> str:
-         """
-         Format context for prompts with enhanced error handling.
-
-         Args:
-             context: Context dictionary
-
-         Returns:
-             Formatted context string
-         """
-         try:
-             if not context:
-                 return ""
-
-             if not isinstance(context, dict):
-                 logger.warning("Context is not a dictionary, ignoring")
-                 return ""
-
-             context_parts = []
-
-             if "background" in context:
-                 background = self._sanitize_text(str(context["background"]), max_length=500)
-                 context_parts.append(f"BACKGROUND: {background}")
-
-             if "field" in context:
-                 field = self._sanitize_text(str(context["field"]), max_length=200)
-                 context_parts.append(f"FIELD: {field}")
-
-             if "constraints" in context:
-                 constraints = self._sanitize_text(str(context["constraints"]), max_length=300)
-                 context_parts.append(f"CONSTRAINTS: {constraints}")
-
-             if "goals" in context:
-                 goals = self._sanitize_text(str(context["goals"]), max_length=300)
-                 context_parts.append(f"GOALS: {goals}")
-
-             return "\n\n".join(context_parts) if context_parts else ""
-
-         except Exception as e:
-             logger.error(f"Error formatting context: {str(e)}")
-             return ""
-
-     async def generate_concept_queries(self, concept: str, context: Optional[Dict] = None) -> List[str]:
-         """
-         Generate concept-specific queries with enhanced error handling and security.
-
-         Args:
-             concept: Concept to generate queries for
-             context: Optional context
-
-         Returns:
-             List of concept queries
-
-         Raises:
-             ValueError: If concept is invalid
-         """
-         try:
-             # Input validation
-             self._validate_topic(concept)
-
-             sanitized_concept = self._sanitize_text(concept, max_length=500)
-
-             logger.info(f"Generating concept queries for: {sanitized_concept[:50]}...")
-
-             prompt = f"""
- Generate 5-7 search queries specifically for the concept: "{sanitized_concept}"
-
- {self._format_context(context) if context else ""}
-
- Focus on:
- 1. Core concept definition and theory
- 2. Practical applications and implementations
- 3. Related technologies and methods
- 4. Current research and developments
- 5. Challenges and limitations
-
- Return only the queries, one per line.
- """
-
-             # Generate queries with retry logic
-             queries = await self._generate_queries_with_retry(prompt, sanitized_concept)
-
-             logger.info(f"Successfully generated {len(queries)} concept queries")
-             return queries
-
-         except ValueError as e:
-             logger.error(f"Invalid input for concept query generation: {str(e)}")
-             raise
-         except Exception as e:
-             logger.error(f"Error generating concept queries: {str(e)}")
-             return [concept] # Fallback
-
-     async def identify_related_concepts(self,
-                                         concept: str,
-                                         web_sources: List[Dict],
-                                         academic_sources: List[Dict]) -> List[str]:
-         """
-         Identify related concepts with enhanced error handling and security.
-
-         Args:
-             concept: Main concept
-             web_sources: List of web sources
-             academic_sources: List of academic sources
-
-         Returns:
-             List of related concepts
-
-         Raises:
-             ValueError: If concept is invalid
-         """
-         try:
-             # Input validation
-             self._validate_topic(concept)
-
-             if not isinstance(web_sources, list):
-                 web_sources = []
-             if not isinstance(academic_sources, list):
-                 academic_sources = []
-
-             sanitized_concept = self._sanitize_text(concept, max_length=500)
-
-             logger.info(f"Identifying related concepts for: {sanitized_concept[:50]}...")
-
-             # Prepare source summaries
-             source_texts = []
-
-             for source in web_sources[:5]: # Limit to 5 sources
-                 if isinstance(source, dict) and source.get('content'):
-                     content = self._sanitize_text(str(source['content']), max_length=500)
-                     source_texts.append(content)
-
-             for source in academic_sources[:5]: # Limit to 5 sources
-                 if isinstance(source, dict) and source.get('summary'):
-                     summary = self._sanitize_text(str(source['summary']), max_length=500)
-                     source_texts.append(summary)
-
-             if not source_texts:
-                 logger.warning("No source content available for concept identification")
-                 return []
-
-             prompt = f"""
- Based on the following sources, identify 5-10 concepts related to "{sanitized_concept}":
-
- Sources:
- {' '.join(source_texts)}
-
- Focus on:
- 1. Directly related concepts
- 2. Supporting technologies
- 3. Complementary approaches
- 4. Related methodologies
- 5. Associated applications
-
- Return only the concept names, one per line.
- """
-
-             # Generate related concepts with retry logic
-             concepts = await self._generate_queries_with_retry(prompt, sanitized_concept)
-
-             logger.info(f"Successfully identified {len(concepts)} related concepts")
-             return concepts
-
-         except ValueError as e:
-             logger.error(f"Invalid input for concept identification: {str(e)}")
-             raise
-         except Exception as e:
-             logger.error(f"Error identifying related concepts: {str(e)}")
-             return []
-
-     async def health_check(self) -> Dict[str, Any]:
-         """
-         Perform health check of the query generator.
-
-         Returns:
-             Health status
-         """
-         try:
-             health_status = {
-                 "status": "healthy",
-                 "timestamp": _utc_timestamp(),
-                 "components": {}
-             }
-
-             # Check LLM manager
-             try:
-                 llm_health = await self.llm_manager.health_check()
-                 health_status["components"]["llm_manager"] = llm_health
-                 if llm_health.get("status") != "healthy":
-                     health_status["status"] = "degraded"
-             except Exception as e:
-                 health_status["components"]["llm_manager"] = {"status": "error", "error": str(e)}
-                 health_status["status"] = "degraded"
-
-             logger.info(f"Health check completed: {health_status['status']}")
-             return health_status
-
-         except Exception as e:
-             logger.error(f"Health check failed: {str(e)}")
-             return {
-                 "status": "error",
-                 "error": str(e),
-                 "timestamp": _utc_timestamp()
-             }