local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140):
  1. local_deep_research/__init__.py +23 -22
  2. local_deep_research/__main__.py +16 -0
  3. local_deep_research/advanced_search_system/__init__.py +7 -0
  4. local_deep_research/advanced_search_system/filters/__init__.py +8 -0
  5. local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
  6. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
  7. local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
  8. local_deep_research/advanced_search_system/findings/repository.py +452 -0
  9. local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
  10. local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
  11. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
  12. local_deep_research/advanced_search_system/questions/__init__.py +1 -0
  13. local_deep_research/advanced_search_system/questions/base_question.py +64 -0
  14. local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
  15. local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
  16. local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
  17. local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
  18. local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
  19. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
  20. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
  21. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
  22. local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
  23. local_deep_research/advanced_search_system/tools/__init__.py +1 -0
  24. local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
  25. local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
  26. local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
  27. local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
  28. local_deep_research/api/__init__.py +5 -5
  29. local_deep_research/api/research_functions.py +96 -84
  30. local_deep_research/app.py +8 -0
  31. local_deep_research/citation_handler.py +25 -16
  32. local_deep_research/{config.py → config/config_files.py} +102 -110
  33. local_deep_research/config/llm_config.py +472 -0
  34. local_deep_research/config/search_config.py +77 -0
  35. local_deep_research/defaults/__init__.py +10 -5
  36. local_deep_research/defaults/main.toml +2 -2
  37. local_deep_research/defaults/search_engines.toml +60 -34
  38. local_deep_research/main.py +121 -19
  39. local_deep_research/migrate_db.py +147 -0
  40. local_deep_research/report_generator.py +72 -44
  41. local_deep_research/search_system.py +147 -283
  42. local_deep_research/setup_data_dir.py +35 -0
  43. local_deep_research/test_migration.py +178 -0
  44. local_deep_research/utilities/__init__.py +0 -0
  45. local_deep_research/utilities/db_utils.py +49 -0
  46. local_deep_research/{utilties → utilities}/enums.py +2 -2
  47. local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
  48. local_deep_research/utilities/search_utilities.py +242 -0
  49. local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
  50. local_deep_research/web/__init__.py +0 -1
  51. local_deep_research/web/app.py +86 -1709
  52. local_deep_research/web/app_factory.py +289 -0
  53. local_deep_research/web/database/README.md +70 -0
  54. local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
  55. local_deep_research/web/database/migrations.py +447 -0
  56. local_deep_research/web/database/models.py +117 -0
  57. local_deep_research/web/database/schema_upgrade.py +107 -0
  58. local_deep_research/web/models/database.py +294 -0
  59. local_deep_research/web/models/settings.py +94 -0
  60. local_deep_research/web/routes/api_routes.py +559 -0
  61. local_deep_research/web/routes/history_routes.py +354 -0
  62. local_deep_research/web/routes/research_routes.py +715 -0
  63. local_deep_research/web/routes/settings_routes.py +1592 -0
  64. local_deep_research/web/services/research_service.py +947 -0
  65. local_deep_research/web/services/resource_service.py +149 -0
  66. local_deep_research/web/services/settings_manager.py +669 -0
  67. local_deep_research/web/services/settings_service.py +187 -0
  68. local_deep_research/web/services/socket_service.py +210 -0
  69. local_deep_research/web/static/css/custom_dropdown.css +277 -0
  70. local_deep_research/web/static/css/settings.css +1223 -0
  71. local_deep_research/web/static/css/styles.css +525 -48
  72. local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
  73. local_deep_research/web/static/js/components/detail.js +348 -0
  74. local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
  75. local_deep_research/web/static/js/components/fallback/ui.js +215 -0
  76. local_deep_research/web/static/js/components/history.js +487 -0
  77. local_deep_research/web/static/js/components/logpanel.js +949 -0
  78. local_deep_research/web/static/js/components/progress.js +1107 -0
  79. local_deep_research/web/static/js/components/research.js +1865 -0
  80. local_deep_research/web/static/js/components/results.js +766 -0
  81. local_deep_research/web/static/js/components/settings.js +3981 -0
  82. local_deep_research/web/static/js/components/settings_sync.js +106 -0
  83. local_deep_research/web/static/js/main.js +226 -0
  84. local_deep_research/web/static/js/services/api.js +253 -0
  85. local_deep_research/web/static/js/services/audio.js +31 -0
  86. local_deep_research/web/static/js/services/formatting.js +119 -0
  87. local_deep_research/web/static/js/services/pdf.js +622 -0
  88. local_deep_research/web/static/js/services/socket.js +882 -0
  89. local_deep_research/web/static/js/services/ui.js +546 -0
  90. local_deep_research/web/templates/base.html +72 -0
  91. local_deep_research/web/templates/components/custom_dropdown.html +47 -0
  92. local_deep_research/web/templates/components/log_panel.html +32 -0
  93. local_deep_research/web/templates/components/mobile_nav.html +22 -0
  94. local_deep_research/web/templates/components/settings_form.html +299 -0
  95. local_deep_research/web/templates/components/sidebar.html +21 -0
  96. local_deep_research/web/templates/pages/details.html +73 -0
  97. local_deep_research/web/templates/pages/history.html +51 -0
  98. local_deep_research/web/templates/pages/progress.html +57 -0
  99. local_deep_research/web/templates/pages/research.html +139 -0
  100. local_deep_research/web/templates/pages/results.html +59 -0
  101. local_deep_research/web/templates/settings_dashboard.html +78 -192
  102. local_deep_research/web/utils/__init__.py +0 -0
  103. local_deep_research/web/utils/formatters.py +76 -0
  104. local_deep_research/web_search_engines/engines/full_search.py +18 -16
  105. local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
  106. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
  107. local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
  108. local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
  109. local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
  110. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
  111. local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
  112. local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
  113. local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
  114. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
  115. local_deep_research/web_search_engines/engines/search_engine_searxng.py +211 -159
  116. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
  117. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
  118. local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
  119. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
  120. local_deep_research/web_search_engines/search_engine_base.py +174 -99
  121. local_deep_research/web_search_engines/search_engine_factory.py +192 -102
  122. local_deep_research/web_search_engines/search_engines_config.py +22 -15
  123. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/METADATA +177 -97
  124. local_deep_research-0.2.0.dist-info/RECORD +135 -0
  125. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/WHEEL +1 -2
  126. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/entry_points.txt +3 -0
  127. local_deep_research/defaults/llm_config.py +0 -338
  128. local_deep_research/utilties/search_utilities.py +0 -114
  129. local_deep_research/web/static/js/app.js +0 -3763
  130. local_deep_research/web/templates/api_keys_config.html +0 -82
  131. local_deep_research/web/templates/collections_config.html +0 -90
  132. local_deep_research/web/templates/index.html +0 -348
  133. local_deep_research/web/templates/llm_config.html +0 -120
  134. local_deep_research/web/templates/main_config.html +0 -89
  135. local_deep_research/web/templates/search_engines_config.html +0 -154
  136. local_deep_research/web/templates/settings.html +0 -519
  137. local_deep_research-0.1.26.dist-info/RECORD +0 -61
  138. local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
  139. /local_deep_research/{utilties → config}/__init__.py +0 -0
  140. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,159 @@
1
+ """
2
+ Standard knowledge generator implementation.
3
+ """
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import List
8
+
9
+ from .base_knowledge import BaseKnowledgeGenerator
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class StandardKnowledge(BaseKnowledgeGenerator):
    """Knowledge generator that sends plain-text prompts to ``self.model``.

    Every method that talks to the model calls ``self.model.invoke(prompt)``
    and reads the ``content`` attribute of the result.
    NOTE(review): ``self.model`` is presumably set by the base class
    initializer, which is not visible here -- confirm.
    """

    def generate_knowledge(
        self,
        query: str,
        context: str = "",
        current_knowledge: str = "",
        questions: List[str] = None,
    ) -> str:
        """
        Ask the model for a knowledge paragraph answering ``query``.

        Args:
            query: The research query to answer
            context: Extra context interpolated into the prompt
            current_knowledge: Previously gathered knowledge, interpolated into the prompt
            questions: Optional questions; when non-empty the prompt also asks
                the model to address each of them

        Returns:
            str: The ``content`` of the model response

        Errors raised by the model call are not caught here.
        """
        # Today's date is given to the model so it can reason about recency.
        current_time = datetime.now().strftime("%Y-%m-%d")

        logger.info("Generating knowledge...")

        if not questions:
            prompt = f"""Based on the following query, generate comprehensive knowledge:

Query: {query}
Current Time: {current_time}
Context: {context}
Current Knowledge: {current_knowledge}

Generate detailed knowledge that:
1. Directly answers the query
2. Includes relevant facts and details
3. Is up-to-date with current information
4. Synthesizes information from multiple sources

Format your response as a well-structured paragraph."""
        else:
            prompt = f"""Based on the following query and questions, generate comprehensive knowledge:

Query: {query}
Current Time: {current_time}
Context: {context}
Current Knowledge: {current_knowledge}
Questions: {questions}

Generate detailed knowledge that:
1. Directly answers the query
2. Addresses each question
3. Includes relevant facts and details
4. Is up-to-date with current information
5. Synthesizes information from multiple sources

Format your response as a well-structured paragraph."""

        knowledge = self.model.invoke(prompt).content

        logger.info("Generated knowledge successfully")
        return knowledge

    def generate_sub_knowledge(self, sub_query: str, context: str = "") -> str:
        """
        Ask the model for a knowledge paragraph answering one sub-question.

        Args:
            sub_query: The sub-question to answer
            context: Text inserted verbatim into the prompt below the sub-question

        Returns:
            str: The ``content`` of the model response, or an empty string if
            the model call (or reading its content) raised
        """
        prompt = f"""Generate comprehensive knowledge to answer this sub-question:

Sub-question: {sub_query}

{context}

Generate detailed knowledge that:
1. Directly answers the sub-question
2. Includes relevant facts and details
3. Is up-to-date with current information
4. Synthesizes information from multiple sources

Format your response as a well-structured paragraph."""

        try:
            return self.model.invoke(prompt).content
        except Exception as e:
            # Best effort: a failed sub-question yields no knowledge instead of aborting.
            logger.error(f"Error generating sub-knowledge: {str(e)}")
            return ""

    def generate(self, query: str, context: str) -> str:
        """Shorthand for ``generate_knowledge(query, context)`` with the other arguments defaulted."""
        return self.generate_knowledge(query, context)

    def compress_knowledge(
        self, current_knowledge: str, query: str, section_links: list, **kwargs
    ) -> str:
        """
        Ask the model to condense the accumulated knowledge.

        Args:
            current_knowledge: The accumulated knowledge to condense
            query: The original research query, quoted in the prompt
            section_links: Accepted but not used by this implementation
            **kwargs: Accepted but not used by this implementation

        Returns:
            str: The condensed text, or ``current_knowledge`` unchanged if
            anything in the model round-trip raised
        """
        logger.info(
            f"Compressing knowledge for query: {query}. Original length: {len(current_knowledge)}"
        )

        prompt = f"""Compress the following accumulated knowledge relevant to the query '{query}'.
Retain the key facts, findings, and citations. Remove redundancy.

Accumulated Knowledge:
{current_knowledge}

Compressed Knowledge:"""

        try:
            compressed_knowledge = self.model.invoke(prompt).content
            logger.info(f"Compressed knowledge length: {len(compressed_knowledge)}")
        except Exception as e:
            logger.error(f"Error compressing knowledge: {str(e)}")
            # Keep the uncompressed text rather than losing it.
            return current_knowledge
        return compressed_knowledge

    def format_citations(self, links: List[str]) -> str:
        """
        Number the given links as ``[n] link``, one per line, starting at 1.

        Args:
            links: Source links to number

        Returns:
            str: The numbered lines joined by newlines, or an empty string
            when ``links`` is empty or ``None``
        """
        if not links:
            return ""

        return "\n".join(
            f"[{position}] {link}" for position, link in enumerate(links, 1)
        )
@@ -0,0 +1 @@
1
+ # Search System Questions Package
@@ -0,0 +1,64 @@
1
+ """
2
+ Base class for all question generators.
3
+ Defines the common interface and shared functionality for different question generation approaches.
4
+ """
5
+
6
+ import logging
7
+ from abc import ABC, abstractmethod
8
+ from typing import Dict, List
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class BaseQuestionGenerator(ABC):
    """Common interface for question generators; subclasses implement ``generate_questions``."""

    def __init__(self, model):
        """
        Store the language model on the instance.

        Args:
            model: The language model used for question generation
        """
        self.model = model

    @abstractmethod
    def generate_questions(
        self,
        current_knowledge: str,
        query: str,
        questions_per_iteration: int,
        questions_by_iteration: Dict[int, List[str]],
    ) -> List[str]:
        """
        Produce the next batch of research questions.

        Args:
            current_knowledge: The accumulated knowledge so far
            query: The original research query
            questions_per_iteration: Number of questions to generate per iteration
            questions_by_iteration: Questions generated in previous iterations

        Returns:
            List[str]: Generated questions
        """

    def _format_previous_questions(
        self, questions_by_iteration: Dict[int, List[str]]
    ) -> str:
        """
        Render earlier questions as text, grouped under their iteration.

        Each iteration contributes an ``Iteration <key>:`` header line followed
        by one ``- <question>`` line per question, in the mapping's own order.

        Args:
            questions_by_iteration: Questions generated in previous iterations

        Returns:
            str: The rendered lines joined by newlines (empty for an empty mapping)
        """
        rendered: List[str] = []
        for iteration_number, asked in questions_by_iteration.items():
            rendered.append(f"Iteration {iteration_number}:")
            rendered.extend(f"- {question}" for question in asked)
        return "\n".join(rendered)
@@ -0,0 +1,445 @@
1
+ import logging
2
+ from typing import List
3
+
4
+ from langchain_core.language_models import BaseLLM
5
+
6
+ from .base_question import BaseQuestionGenerator
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class DecompositionQuestionGenerator(BaseQuestionGenerator):
    """Question generator for decomposing complex queries into sub-queries.

    The generator first reduces the query to a short subject (dropping a
    leading question phrase, anything after the first conjunction, and leading
    articles), asks the model for sub-queries, and falls back to templated
    default questions when the model is unavailable or returns nothing usable.
    """

    # Leading phrases that mark a query as a question; matched case-insensitively
    # and only on a word boundary so "can" does not match "Canada".
    _QUESTION_PREFIXES = (
        "what is",
        "what are",
        "how does",
        "how do",
        "how can",
        "why is",
        "why are",
        "when did",
        "where is",
        "which",
        "who is",
        "can",
        "will",
    )
    # Words that usually separate the primary subject from the rest of the query.
    # Checked in this order; the first one present in the subject wins.
    _CONJUNCTIONS = (
        " and ",
        " or ",
        " but ",
        " as ",
        " that ",
        " which ",
        " when ",
        " where ",
        " how ",
    )
    _ARTICLES = ("a ", "an ", "the ")
    # Characters that, on their own, form a bullet / heading token.
    _BULLET_CHARS = frozenset("-*•#")
    # Texts that, when found in the model output, mean no real model answered.
    _LLM_UNAVAILABLE_MARKERS = (
        "No language models are available",
        "Please install Ollama",
    )
    # Context is truncated to this many characters to limit prompt size.
    _MAX_CONTEXT_CHARS = 2000
    # A parsed line must be strictly longer than this to count as a question.
    _MIN_QUESTION_CHARS = 10

    def __init__(self, model: BaseLLM, max_subqueries: int = 5):
        """
        Initialize the question generator.

        Args:
            model: The language model to use for question generation
            max_subqueries: Maximum number of sub-queries to return
        """
        super().__init__(model)
        self.max_subqueries = max_subqueries

    def generate_questions(
        self,
        query: str,
        context: str,
        **kwargs,
    ) -> List[str]:
        """
        Generate sub-queries by decomposing the original query.

        The model is prompted once; if no usable line can be parsed from its
        answer, a second, simpler prompt is tried. Default questions are
        returned when the model reports that no language model is available,
        when both attempts yield nothing, or when any exception is raised
        during the model round-trip.

        Args:
            query: The main research query
            context: Additional context for question generation; only the
                first 2000 characters are sent, and ``None`` is treated as empty
            **kwargs: Accepted for call compatibility; not used

        Returns:
            List of generated sub-queries, at most ``max_subqueries`` long
        """
        subject = self._extract_subject(query)
        logger.info(f"Original query: '{query}', Extracted subject: '{subject}'")

        # Limit context length to prevent token limit issues. The limit is
        # applied here so that no code comment leaks into the prompt text.
        context_excerpt = (context or "")[: self._MAX_CONTEXT_CHARS]

        prompt = f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
Focus on breaking down complex concepts and identifying key aspects requiring separate investigation.
Ensure sub-queries are clear, targeted, and help build a comprehensive understanding.

Main Research Topic: {subject}
Original Query: {query}

Context Information:
{context_excerpt}

Your task is to create 3-5 specific questions that will help thoroughly research this topic.
If the original query is already a question, extract the core subject and formulate questions around that subject.

Return ONLY the sub-queries, one per line, without numbering or bullet points.
Example format:
What is X technology?
How does X compare to Y?
What are the security implications of X?
"""

        logger.info(
            f"Generating sub-questions for query: '{query}', subject: '{subject}'"
        )

        try:
            sub_queries_text = self._response_text(self.model.invoke(prompt))

            if self._is_llm_unavailable(sub_queries_text):
                logger.warning(
                    "LLM returned error about language models not being available, using default questions"
                )
                return self._generate_default_questions(query)

            sub_queries = self._parse_sub_queries(sub_queries_text)

            # If no sub-queries were extracted, try again with a simpler prompt
            if not sub_queries:
                logger.warning(
                    "No sub-queries extracted from first attempt, trying simplified approach"
                )

                simple_prompt = f"""Break down this research topic into 3 simpler sub-questions:

Research Topic: {subject}
Original Query: {query}

Your task is to create 3 specific questions that will help thoroughly research this topic.
If the original query is already a question, use the core subject of that question.

Sub-questions:
1.
2.
3. """

                simple_text = self._response_text(self.model.invoke(simple_prompt))

                if self._is_llm_unavailable(simple_text):
                    logger.warning(
                        "LLM returned error in simplified prompt, using default questions"
                    )
                    return self._generate_default_questions(query)

                sub_queries = self._parse_sub_queries(simple_text)

            # If still no sub-queries, create default ones based on the original query
            if not sub_queries:
                logger.warning(
                    "Failed to generate meaningful sub-queries, using default decomposition"
                )
                return self._generate_default_questions(query)

            logger.info(f"Generated {len(sub_queries)} sub-questions: {sub_queries}")
            return sub_queries[: self.max_subqueries]  # Limit to max_subqueries

        except Exception as e:
            # Deliberate best effort: any failure degrades to templated questions.
            logger.error(f"Error generating sub-questions: {str(e)}")
            return self._generate_default_questions(query)

    def _extract_subject(self, query: str) -> str:
        """
        Reduce a query to its primary subject.

        Steps: strip whitespace; for queries ending in "?", drop the question
        mark and a leading question phrase; cut at the first listed conjunction
        that occurs; drop leading articles.

        Args:
            query: The main research query

        Returns:
            The extracted subject (may be empty)
        """
        subject = query.strip()

        if subject.endswith("?"):
            subject = subject[:-1].strip()
            lowered = subject.lower()
            for prefix in self._QUESTION_PREFIXES:
                # Require a word boundary after the prefix.
                if lowered == prefix or lowered.startswith(prefix + " "):
                    subject = subject[len(prefix) :].strip()
                    break

        # For compound questions, keep only the part before the conjunction.
        # The position is located in the lower-cased text and applied to the
        # original text so that upper-case conjunctions are split as well.
        lowered = subject.lower()
        for conjunction in self._CONJUNCTIONS:
            position = lowered.find(conjunction)
            if position != -1:
                subject = subject[:position].strip()
                logger.info(
                    f"Split compound question at '{conjunction}', extracted: '{subject}'"
                )
                break

        # Clean up the subject if it starts with articles
        for article in self._ARTICLES:
            if subject.lower().startswith(article):
                subject = subject[len(article) :].strip()

        return subject

    @staticmethod
    def _response_text(response) -> str:
        """Return the stripped text of a model response (object with ``content`` or plain value)."""
        if hasattr(response, "content"):
            return response.content.strip()
        return str(response).strip()

    def _is_llm_unavailable(self, text: str) -> bool:
        """Return True when the text contains one of the "no model available" messages."""
        return any(marker in text for marker in self._LLM_UNAVAILABLE_MARKERS)

    def _strip_list_marker(self, line: str) -> str:
        """Remove leading bullet, heading and ``N.`` / ``N)`` numbering tokens from a line."""
        text = line.strip()
        while text:
            head, _, rest = text.partition(" ")
            is_bullet = set(head) <= self._BULLET_CHARS
            is_number = head[:-1].isdigit() and head[-1] in ".)"
            if is_bullet or is_number:
                text = rest.strip()
            elif text[0] == "#":
                # Heading marker written without a following space.
                text = text.lstrip("#").strip()
            else:
                break
        return text

    def _parse_sub_queries(self, text: str) -> List[str]:
        """
        Extract one sub-query per line from model output.

        List markers are stripped from a line rather than causing the line to
        be dropped. Lines that are empty after stripping, that echo the
        ``Sub-questions:`` prompt header, or that are 10 characters or shorter
        are skipped.
        """
        sub_queries = []
        for raw_line in text.split("\n"):
            candidate = self._strip_list_marker(raw_line)
            if not candidate or candidate.startswith("Sub-questions:"):
                continue
            if len(candidate) > self._MIN_QUESTION_CHARS:
                sub_queries.append(candidate)
        return sub_queries

    def _generate_default_questions(self, query: str) -> List[str]:
        """
        Generate default questions for a given query when LLM fails.

        The question templates are chosen from the extracted subject: a CSRF
        special case, an empty subject, security-related terms,
        programming-related terms, short (one or two word) subjects, and a
        generic set otherwise.

        Args:
            query: The main research query

        Returns:
            List of default questions, at most ``max_subqueries`` long
        """
        query = query.strip()
        subject = self._extract_subject(query)
        lowered_subject = subject.lower()

        # For single word or very short subjects, adapt the question format
        is_short_subject = len(subject.split()) <= 2

        logger.info(
            f"Query: '{query}', Identified subject: '{subject}', Short subject: {is_short_subject}"
        )

        # Special case for CSRF - if we've extracted just "csrf" from a longer query
        if lowered_subject in ("csrf", "cross-site request forgery"):
            default_questions = [
                "What is Cross-Site Request Forgery (CSRF)?",
                "How do CSRF attacks work and what are common attack vectors?",
                "What are effective CSRF prevention methods and best practices?",
                "How do CSRF tokens work to prevent attacks?",
                "What are real-world examples of CSRF vulnerabilities and their impact?",
            ]
        elif not subject:
            # Empty query case
            default_questions = [
                "What is the definition of this topic?",
                "What are the key aspects of this topic?",
                "What are practical applications of this concept?",
            ]
        elif any(
            term in lowered_subject
            for term in ("secure", "security", "vulnerability", "attack")
        ):
            # Security-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are common {subject} vulnerabilities or attack vectors?",
                f"What are best practices for preventing {subject} issues?",
                f"How can {subject} be detected and mitigated?",
                f"What are real-world examples of {subject} incidents?",
            ]
        elif any(
            term in lowered_subject
            for term in ("programming", "language", "code", "software")
        ):
            # Programming-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are the main features and advantages of {subject}?",
                f"What are common use cases and applications for {subject}?",
                f"How does {subject} compare to similar technologies?",
                f"What are best practices when working with {subject}?",
            ]
        elif is_short_subject:
            # For short subjects (1-2 words), use a dedicated format
            default_questions = [
                f"What is {subject}?",
                f"What are the main characteristics of {subject}?",
                f"How is {subject} used in practice?",
                f"What are the advantages and disadvantages of {subject}?",
                f"How has {subject} evolved over time?",
            ]
        else:
            # Generic questions for any topic
            default_questions = [
                f"What is the definition of {subject}?",
                f"What are the key components or features of {subject}?",
                f"What are common applications or use cases for {subject}?",
                f"What are the advantages and limitations of {subject}?",
                f"How does {subject} compare to alternatives?",
            ]

        logger.info(
            f"Using {len(default_questions)} default questions: {default_questions}"
        )
        return default_questions[: self.max_subqueries]
+ return default_questions[: self.max_subqueries]