local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +96 -84
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +72 -44
- local_deep_research/search_system.py +147 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1592 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +211 -159
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/METADATA +177 -97
- local_deep_research-0.2.0.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,159 @@
|
|
1
|
+
"""
|
2
|
+
Standard knowledge generator implementation.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from datetime import datetime
|
7
|
+
from typing import List
|
8
|
+
|
9
|
+
from .base_knowledge import BaseKnowledgeGenerator
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class StandardKnowledge(BaseKnowledgeGenerator):
    """Standard knowledge generator implementation.

    Uses the underlying language model (``self.model``) to synthesize,
    compress, and cite knowledge for a research query.
    """

    def generate_knowledge(
        self,
        query: str,
        context: str = "",
        current_knowledge: str = "",
        questions: "List[str] | None" = None,
    ) -> str:
        """Generate knowledge based on query and context.

        Args:
            query: The research query to answer.
            context: Optional supporting context.
            current_knowledge: Knowledge accumulated so far.
            questions: Optional sub-questions the generated knowledge
                should also address (selects a different prompt).

        Returns:
            str: The generated knowledge text.
        """
        # Timestamp tells the model what "current" means at generation time.
        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d")

        logger.info("Generating knowledge...")

        if questions:
            prompt = f"""Based on the following query and questions, generate comprehensive knowledge:

Query: {query}
Current Time: {current_time}
Context: {context}
Current Knowledge: {current_knowledge}
Questions: {questions}

Generate detailed knowledge that:
1. Directly answers the query
2. Addresses each question
3. Includes relevant facts and details
4. Is up-to-date with current information
5. Synthesizes information from multiple sources

Format your response as a well-structured paragraph."""
        else:
            prompt = f"""Based on the following query, generate comprehensive knowledge:

Query: {query}
Current Time: {current_time}
Context: {context}
Current Knowledge: {current_knowledge}

Generate detailed knowledge that:
1. Directly answers the query
2. Includes relevant facts and details
3. Is up-to-date with current information
4. Synthesizes information from multiple sources

Format your response as a well-structured paragraph."""

        response = self.model.invoke(prompt)
        knowledge = response.content

        logger.info("Generated knowledge successfully")
        return knowledge

    def generate_sub_knowledge(self, sub_query: str, context: str = "") -> str:
        """
        Generate knowledge for a sub-question.

        Args:
            sub_query: The sub-question to generate knowledge for
            context: Additional context for knowledge generation

        Returns:
            str: Generated knowledge for the sub-question, or an empty
                string if the model invocation fails.
        """
        prompt = f"""Generate comprehensive knowledge to answer this sub-question:

Sub-question: {sub_query}

{context}

Generate detailed knowledge that:
1. Directly answers the sub-question
2. Includes relevant facts and details
3. Is up-to-date with current information
4. Synthesizes information from multiple sources

Format your response as a well-structured paragraph."""

        try:
            response = self.model.invoke(prompt)
            return response.content
        except Exception:
            # Best-effort: record the full traceback, never propagate
            # model failures to the caller.
            logger.exception("Error generating sub-knowledge")
            return ""

    def generate(self, query: str, context: str) -> str:
        """Generate knowledge from the given query and context.

        Thin alias for :meth:`generate_knowledge` (no sub-questions).
        """
        return self.generate_knowledge(query, context)

    def compress_knowledge(
        self, current_knowledge: str, query: str, section_links: list, **kwargs
    ) -> str:
        """
        Compress and summarize accumulated knowledge.

        Args:
            current_knowledge: The accumulated knowledge to compress
            query: The original research query
            section_links: List of source links (currently unused here,
                kept for interface compatibility)
            **kwargs: Additional arguments

        Returns:
            str: Compressed knowledge, or the original text unchanged if
                compression fails.
        """
        logger.info(
            f"Compressing knowledge for query: {query}. Original length: {len(current_knowledge)}"
        )

        prompt = f"""Compress the following accumulated knowledge relevant to the query '{query}'.
Retain the key facts, findings, and citations. Remove redundancy.

Accumulated Knowledge:
{current_knowledge}

Compressed Knowledge:"""

        try:
            response = self.model.invoke(prompt)
            compressed_knowledge = response.content
            logger.info(f"Compressed knowledge length: {len(compressed_knowledge)}")
            return compressed_knowledge
        except Exception:
            logger.exception("Error compressing knowledge")
            return current_knowledge  # Return original if compression fails

    def format_citations(self, links: List[str]) -> str:
        """
        Format source links into citations using IEEE style.

        Args:
            links: List of source links

        Returns:
            str: One "[n] <link>" citation per line, or "" for no links.
        """
        if not links:
            return ""

        # IEEE style: numbered bracket references, numbering starts at 1.
        return "\n".join(f"[{i}] {link}" for i, link in enumerate(links, 1))
@@ -0,0 +1 @@
|
|
1
|
+
# Search System Questions Package
|
@@ -0,0 +1,64 @@
|
|
1
|
+
"""
|
2
|
+
Base class for all question generators.
|
3
|
+
Defines the common interface and shared functionality for different question generation approaches.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import logging
|
7
|
+
from abc import ABC, abstractmethod
|
8
|
+
from typing import Dict, List
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class BaseQuestionGenerator(ABC):
    """Abstract base class for all question generators.

    Concrete subclasses implement :meth:`generate_questions`; the shared
    helper :meth:`_format_previous_questions` renders earlier iterations'
    questions into a plain-text summary suitable for prompt context.
    """

    def __init__(self, model):
        """
        Initialize the question generator.

        Args:
            model: The language model to use for question generation
        """
        self.model = model

    @abstractmethod
    def generate_questions(
        self,
        current_knowledge: str,
        query: str,
        questions_per_iteration: int,
        questions_by_iteration: Dict[int, List[str]],
    ) -> List[str]:
        """
        Generate questions based on the current state of research.

        Args:
            current_knowledge: The accumulated knowledge so far
            query: The original research query
            questions_per_iteration: Number of questions to generate per iteration
            questions_by_iteration: Questions generated in previous iterations

        Returns:
            List[str]: Generated questions
        """

    def _format_previous_questions(
        self, questions_by_iteration: Dict[int, List[str]]
    ) -> str:
        """
        Format previous questions for context.

        Args:
            questions_by_iteration: Questions generated in previous iterations

        Returns:
            str: An "Iteration N:" header per iteration followed by its
            questions as "- <question>" lines, all joined by newlines.
        """
        rendered: List[str] = []
        for iteration_number, iteration_questions in questions_by_iteration.items():
            rendered.append(f"Iteration {iteration_number}:")
            rendered.extend(f"- {question}" for question in iteration_questions)
        return "\n".join(rendered)
@@ -0,0 +1,445 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
from langchain_core.language_models import BaseLLM
|
5
|
+
|
6
|
+
from .base_question import BaseQuestionGenerator
|
7
|
+
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
class DecompositionQuestionGenerator(BaseQuestionGenerator):
    """Question generator for decomposing complex queries into sub-queries."""

    # Common question openings; used to strip the interrogative part of a
    # query so that only its core subject remains.
    _QUESTION_PREFIXES = [
        "what is",
        "what are",
        "how does",
        "how do",
        "how can",
        "why is",
        "why are",
        "when did",
        "where is",
        "which",
        "who is",
        "can",
        "will",
    ]

    # Conjunctions/prepositions that typically separate the primary subject
    # of a compound question from the rest of it.
    _CONJUNCTIONS = [
        " and ",
        " or ",
        " but ",
        " as ",
        " that ",
        " which ",
        " when ",
        " where ",
        " how ",
    ]

    def __init__(self, model: BaseLLM, max_subqueries: int = 5):
        """
        Initialize the question generator.

        Args:
            model: The language model to use for question generation
            max_subqueries: Maximum number of sub-queries to generate
        """
        super().__init__(model)
        self.max_subqueries = max_subqueries

    @staticmethod
    def _response_text(response) -> str:
        """Normalize an LLM response (message object or plain string) to stripped text."""
        if hasattr(response, "content"):
            return response.content.strip()
        return str(response).strip()

    @staticmethod
    def _is_model_error(text: str) -> bool:
        """Detect the common 'no language models available' error responses."""
        return (
            "No language models are available" in text
            or "Please install Ollama" in text
        )

    def _extract_subject(self, query: str) -> str:
        """
        Extract the core subject from a (possibly question-format) query.

        Strips a trailing '?', a leading question prefix ("what is", ...),
        anything after the first conjunction of a compound question, and a
        leading article. Previously this logic was duplicated three times
        with slight inconsistencies (e.g. the trailing '?' was kept when no
        prefix matched); it is now centralized.
        """
        subject = query
        lower_query = query.lower()

        if lower_query.endswith("?"):
            # Remove the question mark, then strip a known question prefix.
            subject = query[:-1].strip()
            for prefix in self._QUESTION_PREFIXES:
                if lower_query.startswith(prefix):
                    subject = query[len(prefix):].strip()
                    if subject.endswith("?"):
                        subject = subject[:-1].strip()
                    break

        # For compound questions, keep only the part before the first conjunction.
        for conjunction in self._CONJUNCTIONS:
            if conjunction in subject.lower():
                subject = subject.split(conjunction)[0].strip()
                logger.info(
                    f"Split compound question at '{conjunction}', extracted: '{subject}'"
                )
                break

        # Drop a leading article.
        for article in ["a ", "an ", "the "]:
            if subject.lower().startswith(article):
                subject = subject[len(article):].strip()

        return subject

    @staticmethod
    def _parse_sub_queries(text: str) -> List[str]:
        """
        Extract one sub-query per line from LLM output.

        List markers (bullets or numbering) are stripped uniformly and the
        remaining text is kept; previously lines starting with '- ', '* ',
        '• ' or '1. '-'3. ' were discarded outright while '4. '/'5. '/'#'
        were cleaned and kept.
        """
        sub_queries: List[str] = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            # Skip empty lines and bare bullet characters.
            if not line or line in ("*", "-", "•"):
                continue
            # Strip one leading list marker if present.
            for prefix in ("- ", "* ", "• ", "1. ", "2. ", "3. ", "4. ", "5. ", "#"):
                if line.startswith(prefix):
                    line = line[len(prefix):].strip()
                    break
            if len(line) > 10:  # Ensure it's a meaningful question
                sub_queries.append(line)
        return sub_queries

    def generate_questions(
        self,
        query: str,
        context: str,
        **kwargs,
    ) -> List[str]:
        """
        Generate sub-queries by decomposing the original query.

        Args:
            query: The main research query
            context: Additional context for question generation
            **kwargs: Additional keyword arguments (ignored)

        Returns:
            List of generated sub-queries, capped at ``max_subqueries``.
            Falls back to topic-specific default questions when the model
            is unavailable, errors out, or produces no usable output.
        """
        subject = self._extract_subject(query)
        logger.info(f"Original query: '{query}', Extracted subject: '{subject}'")

        # Truncate the context to bound prompt size (token limits). Note:
        # this explanation used to sit *inside* the f-string and was sent
        # verbatim to the LLM as prompt text.
        truncated_context = context[:2000]

        prompt = f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
Focus on breaking down complex concepts and identifying key aspects requiring separate investigation.
Ensure sub-queries are clear, targeted, and help build a comprehensive understanding.

Main Research Topic: {subject}
Original Query: {query}

Context Information:
{truncated_context}

Your task is to create 3-5 specific questions that will help thoroughly research this topic.
If the original query is already a question, extract the core subject and formulate questions around that subject.

Return ONLY the sub-queries, one per line, without numbering or bullet points.
Example format:
What is X technology?
How does X compare to Y?
What are the security implications of X?
"""

        logger.info(
            f"Generating sub-questions for query: '{query}', subject: '{subject}'"
        )

        try:
            response = self.model.invoke(prompt)
            sub_queries_text = self._response_text(response)

            # Check for the common "No language models available" error.
            if self._is_model_error(sub_queries_text):
                logger.warning(
                    "LLM returned error about language models not being available, using default questions"
                )
                return self._generate_default_questions(query)

            sub_queries = self._parse_sub_queries(sub_queries_text)

            # If nothing usable came back, retry once with a simpler prompt.
            if not sub_queries:
                logger.warning(
                    "No sub-queries extracted from first attempt, trying simplified approach"
                )

                topic_text = self._extract_subject(query)

                simple_prompt = f"""Break down this research topic into 3 simpler sub-questions:

Research Topic: {topic_text}
Original Query: {query}

Your task is to create 3 specific questions that will help thoroughly research this topic.
If the original query is already a question, use the core subject of that question.

Sub-questions:
1.
2.
3. """

                simple_response = self.model.invoke(simple_prompt)
                simple_text = self._response_text(simple_response)

                # Check again for language model errors.
                if self._is_model_error(simple_text):
                    logger.warning(
                        "LLM returned error in simplified prompt, using default questions"
                    )
                    return self._generate_default_questions(query)

                # Extract sub-queries from the simpler, numbered response.
                for line in simple_text.split("\n"):
                    line = line.strip()
                    if (
                        line
                        and not line.startswith("Sub-questions:")
                        and len(line) > 10
                    ):
                        # Clean up numbering / bullets.
                        for prefix in ["1. ", "2. ", "3. ", "- ", "* "]:
                            if line.startswith(prefix):
                                line = line[len(prefix):]
                        sub_queries.append(line.strip())

            # If still no sub-queries, fall back to default decomposition.
            if not sub_queries:
                logger.warning(
                    "Failed to generate meaningful sub-queries, using default decomposition"
                )
                return self._generate_default_questions(query)

            logger.info(f"Generated {len(sub_queries)} sub-questions: {sub_queries}")
            return sub_queries[: self.max_subqueries]  # Limit to max_subqueries

        except Exception as e:
            logger.error(f"Error generating sub-questions: {str(e)}")
            # Fallback to basic questions in case of error
            return self._generate_default_questions(query)

    def _generate_default_questions(self, query: str) -> List[str]:
        """
        Generate default questions for a given query when the LLM fails.

        Args:
            query: The main research query

        Returns:
            List of default questions tailored to the query's subject,
            capped at ``max_subqueries``.
        """
        query = query.strip()
        subject = self._extract_subject(query)

        # For single-word or very short subjects, adapt the question format.
        is_short_subject = len(subject.split()) <= 2

        logger.info(
            f"Query: '{query}', Identified subject: '{subject}', Short subject: {is_short_subject}"
        )

        # Special case for CSRF - if we've extracted just "csrf" from a longer query
        if subject.lower() in ("csrf", "cross-site request forgery"):
            # CSRF-specific questions
            default_questions = [
                "What is Cross-Site Request Forgery (CSRF)?",
                "How do CSRF attacks work and what are common attack vectors?",
                "What are effective CSRF prevention methods and best practices?",
                "How do CSRF tokens work to prevent attacks?",
                "What are real-world examples of CSRF vulnerabilities and their impact?",
            ]
        elif not subject:
            # Empty query case
            default_questions = [
                "What is the definition of this topic?",
                "What are the key aspects of this topic?",
                "What are practical applications of this concept?",
            ]
        elif any(
            term in subject.lower()
            for term in ["secure", "security", "vulnerability", "attack"]
        ):
            # Security-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are common {subject} vulnerabilities or attack vectors?",
                f"What are best practices for preventing {subject} issues?",
                f"How can {subject} be detected and mitigated?",
                f"What are real-world examples of {subject} incidents?",
            ]
        elif any(
            term in subject.lower()
            for term in ["programming", "language", "code", "software"]
        ):
            # Programming-related questions
            default_questions = [
                f"What is {subject} and how does it work?",
                f"What are the main features and advantages of {subject}?",
                f"What are common use cases and applications for {subject}?",
                f"How does {subject} compare to similar technologies?",
                f"What are best practices when working with {subject}?",
            ]
        elif is_short_subject:
            # For short subjects (1-2 words), use a dedicated format
            default_questions = [
                f"What is {subject}?",
                f"What are the main characteristics of {subject}?",
                f"How is {subject} used in practice?",
                f"What are the advantages and disadvantages of {subject}?",
                f"How has {subject} evolved over time?",
            ]
        else:
            # Generic questions for any topic
            default_questions = [
                f"What is the definition of {subject}?",
                f"What are the key components or features of {subject}?",
                f"What are common applications or use cases for {subject}?",
                f"What are the advantages and limitations of {subject}?",
                f"How does {subject} compare to alternatives?",
            ]

        logger.info(
            f"Using {len(default_questions)} default questions: {default_questions}"
        )
        return default_questions[: self.max_subqueries]