local-deep-research 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py
+++ b/local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py
@@ -80,7 +80,7 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
     def analyze_topic(self, query: str) -> Dict:
         """
-        Parallel implementation that generates questions and searches all at once.
+        Analyze a topic using parallel search, supporting multiple iterations.
 
         Args:
             query: The research query to analyze
@@ -89,6 +89,11 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
         findings = []
         all_search_results = []
+        current_knowledge = ""
+
+        # Track all search results across iterations
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
 
         self._update_progress(
             "Initializing parallel research",
@@ -105,184 +110,301 @@ class ParallelSearchStrategy(BaseSearchStrategy):
             return {
                 "findings": [],
                 "iterations": 0,
-                "questions": {},
+                "questions_by_iteration": {},
                 "formatted_findings": "Error: Unable to conduct research without a search engine.",
                 "current_knowledge": "",
                 "error": "No search engine available",
             }
 
+        # Determine number of iterations to run
+        iterations_to_run = get_db_setting("search.iterations")
+        logger.debug("Selected amount of iterations: " + iterations_to_run)
+        iterations_to_run = int(iterations_to_run)
         try:
-            # Step 1: Generate questions first
-            self._update_progress(
-                "Generating search questions", 10, {"phase": "question_generation"}
-            )
+            # Run each iteration
+            for iteration in range(1, iterations_to_run + 1):
+                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
 
-            # Generate 3 additional questions (plus the main query = 4 total)
-            questions = self.question_generator.generate_questions(
-                current_knowledge="",  # No knowledge accumulation
-                query=query,
-                questions_per_iteration=int(
-                    get_db_setting("search.questions_per_iteration")
-                ),  # 3 additional questions
-                questions_by_iteration={},
-            )
+                self._update_progress(
+                    f"Starting iteration {iteration}/{iterations_to_run}",
+                    iteration_progress_base,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
 
-            # Add the original query as the first question
-            all_questions = [query] + questions
+                # Step 1: Generate questions
+                self._update_progress(
+                    f"Generating search questions for iteration {iteration}",
+                    iteration_progress_base + 5,
+                    {"phase": "question_generation", "iteration": iteration},
+                )
 
-            # Store in questions_by_iteration
-            self.questions_by_iteration[0] = questions
-            logger.info(f"Generated questions: {questions}")
+                # For first iteration, generate initial questions
+                # For subsequent iterations, generate follow-up questions
+                logger.info("Starting to generate questions")
+                if iteration == 1:
+                    # Generate additional questions (plus the main query)
+                    if iterations_to_run > 1:
+                        context = f"""Iteration: {1} of {iterations_to_run}"""
+                    else:
+                        context = ""
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-            # Step 2: Run all searches in parallel
-            self._update_progress(
-                "Running parallel searches for all questions",
-                20,
-                {"phase": "parallel_search"},
-            )
+                    # Add the original query as the first question
+                    all_questions = [query] + questions
 
-            # Function for thread pool
-            def search_question(q):
-                try:
-                    result = self.search.run(q)
-                    return {"question": q, "results": result or []}
-                except Exception as e:
-                    logger.error(f"Error searching for '{q}': {str(e)}")
-                    return {"question": q, "results": [], "error": str(e)}
-
-            # Run searches in parallel
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=len(all_questions)
-            ) as executor:
-                futures = [executor.submit(search_question, q) for q in all_questions]
-                all_search_dict = {}
-
-                # Process results as they complete
-                for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                    result_dict = future.result()
-                    question = result_dict["question"]
-                    search_results = result_dict["results"]
-                    all_search_dict[question] = search_results
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
+                    )
+                else:
+                    # Get past questions from all previous iterations
+                    past_questions = []
+                    for prev_iter in range(1, iteration):
+                        if prev_iter in self.questions_by_iteration:
+                            past_questions.extend(
+                                self.questions_by_iteration[prev_iter]
+                            )
+
+                    # Generate follow-up questions based on accumulated knowledge if iterations > 2
+                    use_knowledge = iterations_to_run > 2
+                    knowledge_for_questions = current_knowledge if use_knowledge else ""
+                    context = f"""Current Knowledge: {knowledge_for_questions}
+                    Iteration: {iteration} of {iterations_to_run}"""
+
+                    # Generate questions
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-                    self._update_progress(
-                        f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
-                        20 + ((i + 1) / len(all_questions) * 40),
-                        {
-                            "phase": "search_complete",
-                            "result_count": len(search_results),
-                            "question": question,
-                        },
+                    # Use only the new questions for this iteration's searches
+                    all_questions = questions
+
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
                     )
 
-                    # Extract and save links
-                    if not self.use_cross_engine_filter:
-                        links = extract_links_from_search_results(search_results)
-                        self.all_links_of_system.extend(links)
-                    all_search_results.extend(search_results)
+                # Step 2: Run all searches in parallel for this iteration
+                self._update_progress(
+                    f"Running parallel searches for iteration {iteration}",
+                    iteration_progress_base + 10,
+                    {"phase": "parallel_search", "iteration": iteration},
+                )
 
-            # Step 3: Analysis of collected search results
-            self._update_progress(
-                "Analyzing all collected search results",
-                70,
-                {"phase": "final_analysis"},
-            )
-            if self.use_cross_engine_filter:
+                # Function for thread pool
+                def search_question(q):
+                    try:
+                        result = self.search.run(q)
+                        return {"question": q, "results": result or []}
+                    except Exception as e:
+                        logger.error(f"Error searching for '{q}': {str(e)}")
+                        return {"question": q, "results": [], "error": str(e)}
+
+                # Run searches in parallel
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=len(all_questions)
+                ) as executor:
+                    futures = [
+                        executor.submit(search_question, q) for q in all_questions
+                    ]
+                    iteration_search_dict = {}
+                    iteration_search_results = []
+
+                    # Process results as they complete
+                    for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)
+                    ):
+                        result_dict = future.result()
+                        question = result_dict["question"]
+                        search_results = result_dict["results"]
+                        iteration_search_dict[question] = search_results
+
+                        self._update_progress(
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            iteration_progress_base
+                            + 10
+                            + ((i + 1) / len(all_questions) * 30),
+                            {
+                                "phase": "search_complete",
+                                "iteration": iteration,
+                                "result_count": len(search_results),
+                                "question": question,
+                            },
+                        )
+
+                        # Collect all search results for this iteration
+                        iteration_search_results.extend(search_results)
+
+                # Step 3: Filter and analyze results for this iteration
                 self._update_progress(
-                    "Filtering search results across engines",
-                    65,
-                    {"phase": "cross_engine_filtering"},
+                    f"Analyzing results for iteration {iteration}",
+                    iteration_progress_base + 45,
+                    {"phase": "iteration_analysis", "iteration": iteration},
                 )
 
-                # Get the current link count (for indexing)
-                existing_link_count = len(self.all_links_of_system)
+                # Apply cross-engine filtering if enabled
+                if self.use_cross_engine_filter:
+                    self._update_progress(
+                        f"Filtering search results for iteration {iteration}",
+                        iteration_progress_base + 45,
+                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                    )
 
-                # Filter the search results
-                filtered_search_results = self.cross_engine_filter.filter_results(
-                    all_search_results,
-                    query,
-                    reorder=self.filter_reorder,
-                    reindex=self.filter_reindex,
-                    start_index=existing_link_count,  # Start indexing after existing links
-                )
+                    # Get the current link count (for indexing)
+                    existing_link_count = len(self.all_links_of_system)
 
-                links = extract_links_from_search_results(filtered_search_results)
-                self.all_links_of_system.extend(links)
+                    # Filter the search results
+                    filtered_search_results = self.cross_engine_filter.filter_results(
+                        iteration_search_results,
+                        query,
+                        reorder=self.filter_reorder,
+                        reindex=self.filter_reindex,
+                        start_index=existing_link_count,  # Start indexing after existing links
+                    )
+
+                    links = extract_links_from_search_results(filtered_search_results)
+                    self.all_links_of_system.extend(links)
+
+                    self._update_progress(
+                        f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                        iteration_progress_base + 50,
+                        {
+                            "phase": "filtering_complete",
+                            "iteration": iteration,
+                            "links_count": len(self.all_links_of_system),
+                        },
+                    )
 
+                    # Use filtered results for analysis
+                    iteration_search_results = filtered_search_results
+                else:
+                    # Just extract links without filtering
+                    links = extract_links_from_search_results(iteration_search_results)
+                    self.all_links_of_system.extend(links)
+
+                # Add to all search results
+                all_search_results.extend(iteration_search_results)
+
+                # Create a finding for this iteration's results
+                if self.include_text_content and iteration_search_results:
+                    # For iteration > 1 with knowledge accumulation, use follow-up analysis
+                    if iteration > 1 and iterations_to_run > 2:
+                        citation_result = self.citation_handler.analyze_followup(
+                            query,
+                            iteration_search_results,
+                            current_knowledge,
+                            len(self.all_links_of_system) - len(links),
+                        )
+                    else:
+                        # For first iteration or without knowledge accumulation, use initial analysis
+                        citation_result = self.citation_handler.analyze_initial(
+                            query, iteration_search_results
+                        )
+
+                    if citation_result:
+                        # Create a finding for this iteration
+                        iteration_content = citation_result["content"]
+
+                        # Update current knowledge if iterations > 2
+                        if iterations_to_run > 2:
+                            if current_knowledge:
+                                current_knowledge = f"{current_knowledge}\n\n## FINDINGS FROM ITERATION {iteration}:\n\n{iteration_content}"
+                            else:
+                                current_knowledge = iteration_content
+
+                        finding = {
+                            "phase": f"Iteration {iteration}",
+                            "content": iteration_content,
+                            "question": query,
+                            "search_results": iteration_search_results,
+                            "documents": citation_result.get("documents", []),
+                        }
+                        findings.append(finding)
+
+                        # Add documents to repository
+                        if "documents" in citation_result:
+                            self.findings_repository.add_documents(
+                                citation_result["documents"]
+                            )
+
+                # Mark iteration as complete
+                iteration_progress = 5 + iteration * (70 / iterations_to_run)
                 self._update_progress(
-                    f"Filtered from {len(all_search_results)} to {len(filtered_search_results)} results",
-                    70,
-                    {
-                        "phase": "filtering_complete",
-                        "links_count": len(self.all_links_of_system),
-                    },
+                    f"Completed iteration {iteration}/{iterations_to_run}",
+                    iteration_progress,
+                    {"phase": "iteration_complete", "iteration": iteration},
                 )
 
-                # Use filtered results for analysis
-                all_search_results = filtered_search_results
+            # Final synthesis after all iterations
+            self._update_progress(
+                "Generating final synthesis", 80, {"phase": "synthesis"}
+            )
 
-            # Now when we use the citation handler, ensure we're using all_search_results:
+            # Handle final synthesis based on include_text_content flag
             if self.include_text_content:
-                # Use citation handler for analysis of all results together
-                citation_result = self.citation_handler.analyze_initial(
-                    query, all_search_results
-                )
-
-                if citation_result:
-                    synthesized_content = citation_result["content"]
-                    finding = {
+                # Generate a final synthesis from all search results
+                if iterations_to_run > 1:
+                    final_citation_result = self.citation_handler.analyze_initial(
+                        query, all_search_results
+                    )
+                    # Add null check for final_citation_result
+                    if final_citation_result:
+                        synthesized_content = final_citation_result["content"]
+                    else:
+                        synthesized_content = (
+                            "No relevant results found in final synthesis."
+                        )
+                else:
+                    # For single iteration, use the content from findings
+                    synthesized_content = (
+                        findings[0]["content"]
+                        if findings
+                        else "No relevant results found."
+                    )
+                # Add a final synthesis finding
+                final_finding = {
                     "phase": "Final synthesis",
                     "content": synthesized_content,
                     "question": query,
                     "search_results": all_search_results,
-                    "documents": citation_result.get("documents", []),
+                    "documents": [],
                 }
-                    findings.append(finding)
-
-                    # Transfer questions to repository
-                    self.findings_repository.set_questions_by_iteration(
-                        self.questions_by_iteration
-                    )
-
-                    # Format findings
-                    formatted_findings = self.findings_repository.format_findings_to_text(
-                        findings, synthesized_content
-                    )
-
-                    # Add documents to repository
-                    if "documents" in citation_result:
-                        self.findings_repository.add_documents(citation_result["documents"])
-                else:
-                    synthesized_content = "No relevant results found."
-                    formatted_findings = synthesized_content
-                    finding = {
-                        "phase": "Error",
-                        "content": "No relevant results found.",
-                        "question": query,
-                        "search_results": all_search_results,
-                        "documents": [],
-                    }
-                    findings.append(finding)
+                findings.append(final_finding)
 
             else:
                 # Skip LLM analysis, just format the raw search results
                 synthesized_content = "LLM analysis skipped"
-                finding = {
+                final_finding = {
                     "phase": "Raw search results",
                     "content": "LLM analysis was skipped. Displaying raw search results with links.",
                     "question": query,
                     "search_results": all_search_results,
                     "documents": [],
                 }
-                findings.append(finding)
+                findings.append(final_finding)
 
-                # Transfer questions to repository
-                self.findings_repository.set_questions_by_iteration(
-                    self.questions_by_iteration
-                )
+            # Transfer questions to repository
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
 
-                # Format findings without synthesis
-                formatted_findings = self.findings_repository.format_findings_to_text(
-                    findings, "Raw search results (LLM analysis skipped)"
-                )
+            # Format findings
+            formatted_findings = self.findings_repository.format_findings_to_text(
+                findings, synthesized_content
+            )
 
         except Exception as e:
             import traceback
@@ -305,8 +427,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
         return {
             "findings": findings,
-            "iterations": 1,
-            "questions": self.questions_by_iteration,
+            "iterations": iterations_to_run,
+            "questions_by_iteration": self.questions_by_iteration,
             "formatted_findings": formatted_findings,
             "current_knowledge": synthesized_content,
         }
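Both versions of this strategy fan searches out with a thread pool and consume them with as_completed, so one slow engine never blocks the others; the new code simply runs that fan-out once per iteration and budgets progress as 5% for setup, 70% split evenly across iterations, and the rest for synthesis. A self-contained sketch of the pattern, with a stub standing in for self.search.run:

    import concurrent.futures

    def run_search(question: str) -> dict:
        # Stub for self.search.run(q); a real engine returns a result list.
        return {"question": question, "results": [f"hit for {question!r}"]}

    questions = ["main query", "follow-up 1", "follow-up 2"]

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(questions)) as executor:
        futures = [executor.submit(run_search, q) for q in questions]
        for i, future in enumerate(concurrent.futures.as_completed(futures)):
            result = future.result()
            # Completion order, not submission order, drives the
            # per-search progress callbacks in the strategy above.
            print(f"{i + 1}/{len(questions)} done: {result['question']}")

One detail worth flagging: the debug line added here concatenates the raw get_db_setting("search.iterations") value into a string, which raises TypeError if the setting comes back as an int; the new source-based strategy below wraps the same value in str().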
--- /dev/null
+++ b/local_deep_research/advanced_search_system/strategies/source_based_strategy.py
@@ -0,0 +1,407 @@
+import concurrent.futures
+import logging
+from typing import Dict
+
+from ...citation_handler import CitationHandler
+from ...config.llm_config import get_llm
+from ...config.search_config import get_search
+from ...utilities.db_utils import get_db_setting
+from ...utilities.search_utilities import extract_links_from_search_results
+from ..filters.cross_engine_filter import CrossEngineFilter
+from ..findings.repository import FindingsRepository
+from ..questions.standard_question import StandardQuestionGenerator
+from .base_strategy import BaseSearchStrategy
+
+logger = logging.getLogger(__name__)
+
+
+class SourceBasedSearchStrategy(BaseSearchStrategy):
+    """
+    Source-based search strategy that generates questions based on search results and
+    defers content analysis until final synthesis.
+    """
+
+    def __init__(
+        self,
+        search=None,
+        model=None,
+        citation_handler=None,
+        include_text_content: bool = True,
+        use_cross_engine_filter: bool = True,
+        filter_reorder: bool = True,
+        filter_reindex: bool = True,
+        filter_max_results: int = 20,
+    ):
+        """Initialize with optional dependency injection for testing."""
+        super().__init__()
+        self.search = search or get_search()
+        self.model = model or get_llm()
+        self.progress_callback = None
+        self.all_links_of_system = list()
+        self.all_search_results = []
+        self.questions_by_iteration = {}
+        self.include_text_content = include_text_content
+        self.use_cross_engine_filter = use_cross_engine_filter
+        self.filter_reorder = filter_reorder
+        self.filter_reindex = filter_reindex
+
+        # Initialize the cross-engine filter
+        self.cross_engine_filter = CrossEngineFilter(
+            model=self.model,
+            max_results=filter_max_results,
+            default_reorder=filter_reorder,
+            default_reindex=filter_reindex,
+        )
+
+        # Set include_full_content on the search engine if it supports it
+        if hasattr(self.search, "include_full_content"):
+            self.search.include_full_content = include_text_content
+
+        # Use provided citation_handler or create one
+        self.citation_handler = citation_handler or CitationHandler(self.model)
+
+        # Initialize components
+        self.question_generator = StandardQuestionGenerator(self.model)
+        self.findings_repository = FindingsRepository(self.model)
+
+    def _format_search_results_as_context(self, search_results):
+        """Format search results into context for question generation."""
+        context_snippets = []
+
+        for i, result in enumerate(
+            search_results[:10]
+        ):  # Limit to prevent context overflow
+            title = result.get("title", "Untitled")
+            snippet = result.get("snippet", "")
+            url = result.get("link", "")
+
+            if snippet:
+                context_snippets.append(
+                    f"Source {i + 1}: {title}\nURL: {url}\nSnippet: {snippet}"
+                )
+
+        return "\n\n".join(context_snippets)
+
+    def analyze_topic(self, query: str) -> Dict:
+        """
+        Analyze a topic using source-based search strategy.
+        """
+        logger.info(f"Starting source-based research on topic: {query}")
+
+        findings = []
+        self.all_search_results = []
+
+        # Track all search results across iterations
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
+
+        self._update_progress(
+            "Initializing source-based research",
+            5,
+            {
+                "phase": "init",
+                "strategy": "source-based",
+                "include_text_content": self.include_text_content,
+            },
+        )
+
+        # Check search engine
+        if not self._validate_search_engine():
+            return {
+                "findings": [],
+                "iterations": 0,
+                "questions_by_iteration": {},
+                "formatted_findings": "Error: Unable to conduct research without a search engine.",
+                "current_knowledge": "",
+                "error": "No search engine available",
+            }
+
+        # Determine number of iterations to run
+        iterations_to_run = get_db_setting("search.iterations")
+        logger.debug("Selected amount of iterations: " + str(iterations_to_run))
+        iterations_to_run = int(iterations_to_run)
+        try:
+            # Run each iteration
+            for iteration in range(1, iterations_to_run + 1):
+                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
+
+                self._update_progress(
+                    f"Starting iteration {iteration}/{iterations_to_run}",
+                    iteration_progress_base,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
+
+                # Step 1: Generate or use questions
+                self._update_progress(
+                    f"Generating search questions for iteration {iteration}",
+                    iteration_progress_base + 5,
+                    {"phase": "question_generation", "iteration": iteration},
+                )
+
+                # For first iteration, use initial query
+                if iteration == 1:
+                    # Generate questions for first iteration
+                    source_context = self._format_search_results_as_context(
+                        self.all_search_results
+                    )
+                    context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
+
+                    # Always include the original query for the first iteration
+                    if query not in questions:
+                        all_questions = [query] + questions
+                    else:
+                        all_questions = questions
+
+                    self.questions_by_iteration[iteration] = all_questions
+                    logger.info(
+                        f"Using questions for iteration {iteration}: {all_questions}"
+                    )
+                else:
+                    # For subsequent iterations, generate questions based on previous search results
+                    source_context = self._format_search_results_as_context(
+                        self.all_search_results
+                    )
+                    if iteration != 1:
+                        context = f"""Previous search results:\n{source_context}\n\nIteration: {iteration} of {iterations_to_run}"""
+                    elif iterations_to_run == 1:
+                        context = ""
+                    else:
+                        context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                    # Use standard question generator with search results as context
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
+
+                    # Use only the new questions for this iteration's searches
+                    all_questions = questions
+
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
+                    )
+
+                # Step 2: Run all searches in parallel for this iteration
+                self._update_progress(
+                    f"Running parallel searches for iteration {iteration}",
+                    iteration_progress_base + 10,
+                    {"phase": "parallel_search", "iteration": iteration},
+                )
+
+                # Function for thread pool
+                def search_question(q):
+                    try:
+                        result = self.search.run(q)
+                        return {"question": q, "results": result or []}
+                    except Exception as e:
+                        logger.error(f"Error searching for '{q}': {str(e)}")
+                        return {"question": q, "results": [], "error": str(e)}
+
+                # Run searches in parallel
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=len(all_questions)
+                ) as executor:
+                    futures = [
+                        executor.submit(search_question, q) for q in all_questions
+                    ]
+                    iteration_search_dict = {}
+                    iteration_search_results = []
+
+                    # Process results as they complete
+                    for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)
+                    ):
+                        result_dict = future.result()
+                        question = result_dict["question"]
+                        search_results = result_dict["results"]
+                        iteration_search_dict[question] = search_results
+
+                        self._update_progress(
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            iteration_progress_base
+                            + 10
+                            + ((i + 1) / len(all_questions) * 30),
+                            {
+                                "phase": "search_complete",
+                                "iteration": iteration,
+                                "result_count": len(search_results),
+                                "question": question,
+                            },
+                        )
+
+                        # Collect all search results for this iteration
+                        iteration_search_results.extend(search_results)
+
+                # Step 3: Apply cross-engine filtering if enabled
+                if self.use_cross_engine_filter:
+                    self._update_progress(
+                        f"Filtering search results for iteration {iteration}",
+                        iteration_progress_base + 45,
+                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                    )
+
+                    # Get the current link count (for indexing)
+                    existing_link_count = len(self.all_links_of_system)
+
+                    # Filter the search results
+                    filtered_search_results = self.cross_engine_filter.filter_results(
+                        iteration_search_results,
+                        query,
+                        reorder=self.filter_reorder,
+                        reindex=self.filter_reindex,
+                        start_index=existing_link_count,  # Start indexing after existing links
+                    )
+
+                    links = extract_links_from_search_results(filtered_search_results)
+                    self.all_links_of_system.extend(links)
+
+                    self._update_progress(
+                        f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                        iteration_progress_base + 50,
+                        {
+                            "phase": "filtering_complete",
+                            "iteration": iteration,
+                            "links_count": len(self.all_links_of_system),
+                        },
+                    )
+
+                    # Use filtered results
+                    iteration_search_results = filtered_search_results
+                else:
+                    # Just extract links without filtering
+                    links = extract_links_from_search_results(iteration_search_results)
+                    self.all_links_of_system.extend(links)
+
+                # Add to all search results
+                self.all_search_results.extend(iteration_search_results)
+
+                # Create a lightweight finding for this iteration's search metadata (no text content)
+                finding = {
+                    "phase": f"Iteration {iteration}",
+                    "content": f"Searched with {len(all_questions)} questions, found {len(iteration_search_results)} results.",
+                    "question": query,
+                    "search_results": iteration_search_results,
+                    "documents": [],
+                }
+                findings.append(finding)
+
+                # Mark iteration as complete
+                iteration_progress = 5 + iteration * (70 / iterations_to_run)
+                self._update_progress(
+                    f"Completed iteration {iteration}/{iterations_to_run}",
+                    iteration_progress,
+                    {"phase": "iteration_complete", "iteration": iteration},
+                )
+
+            # Final filtering of all accumulated search results
+            self._update_progress(
+                "Performing final filtering of all results",
+                80,
+                {"phase": "final_filtering"},
+            )
+
+            # Apply final cross-engine filtering to all accumulated results if enabled
+            if self.use_cross_engine_filter:
+                final_filtered_results = self.cross_engine_filter.filter_results(
+                    self.all_search_results,
+                    query,
+                    reorder=True,  # Always reorder in final filtering
+                    reindex=False,  # Always reindex in final filtering
+                    max_results=int(get_db_setting("search.final_max_results") or 30),
+                )
+            else:
+                final_filtered_results = self.all_search_results
+            self._update_progress(
+                f"Filtered from {len(self.all_search_results)} to {len(final_filtered_results)} results",
+                iteration_progress_base + 85,
+                {
+                    "phase": "filtering_complete",
+                    "iteration": iteration,
+                    "links_count": len(self.all_links_of_system),
+                },
+            )
+            # Final synthesis after all iterations
+            self._update_progress(
+                "Generating final synthesis", 90, {"phase": "synthesis"}
+            )
+
+            total_citation_count = len(self.all_links_of_system)
+
+            # Final synthesis
+            final_citation_result = self.citation_handler.analyze_followup(
+                query,
+                final_filtered_results,
+                previous_knowledge="",  # Empty string as we don't need previous knowledge here
+                nr_of_links=total_citation_count,
+            )
+
+            # Add null check for final_citation_result
+            if final_citation_result:
+                synthesized_content = final_citation_result["content"]
+                documents = final_citation_result.get("documents", [])
+            else:
+                synthesized_content = "No relevant results found in final synthesis."
+                documents = []
+
+            # Add a final synthesis finding
+            final_finding = {
+                "phase": "Final synthesis",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": final_filtered_results,
+                "documents": documents,
+            }
+            findings.append(final_finding)
+
+            # Add documents to repository
+            self.findings_repository.add_documents(documents)
+
+            # Transfer questions to repository
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
+
+            # Format findings
+            formatted_findings = self.findings_repository.format_findings_to_text(
+                findings, synthesized_content
+            )
+
+        except Exception as e:
+            import traceback
+
+            error_msg = f"Error in research process: {str(e)}"
+            logger.error(error_msg)
+            logger.error(traceback.format_exc())
+            synthesized_content = f"Error: {str(e)}"
+            formatted_findings = f"Error: {str(e)}"
+            finding = {
+                "phase": "Error",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": [],
+                "documents": [],
+            }
+            findings.append(finding)
+
+        self._update_progress("Research complete", 100, {"phase": "complete"})
+
+        return {
+            "findings": findings,
+            "iterations": iterations_to_run,
+            "questions_by_iteration": self.questions_by_iteration,
+            "formatted_findings": formatted_findings,
+            "current_knowledge": synthesized_content,
+        }
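analyze_topic returns the same dictionary shape on success, on early validation failure, and on error, which keeps callers simple. A usage sketch, assuming an LLM and search engine are already configured so the get_llm() and get_search() constructor defaults resolve:

    # Sketch only: requires a configured LLM and search engine.
    from local_deep_research.advanced_search_system.strategies.source_based_strategy import (
        SourceBasedSearchStrategy,
    )

    strategy = SourceBasedSearchStrategy(include_text_content=True)
    result = strategy.analyze_topic("What is retrieval-augmented generation?")

    print(result["iterations"])              # number of iterations run
    print(result["questions_by_iteration"])  # {1: [...], 2: [...], ...}
    print(result["formatted_findings"][:200])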
--- a/local_deep_research/citation_handler.py
+++ b/local_deep_research/citation_handler.py
@@ -82,18 +82,18 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
         formatted_sources = self._format_sources(documents)
         # Add fact-checking step
         fact_check_prompt = f"""Analyze these sources for factual consistency:
-        1. Cross-reference major claims between sources
-        2. Identify and flag any contradictions
-        3. Verify basic facts (dates, company names, ownership)
-        4. Note when sources disagree
+1. Cross-reference major claims between sources
+2. Identify and flag any contradictions
+3. Verify basic facts (dates, company names, ownership)
+4. Note when sources disagree
 
-        Previous Knowledge:
-        {previous_knowledge}
+Previous Knowledge:
+{previous_knowledge}
 
-        New Sources:
-        {formatted_sources}
+New Sources:
+{formatted_sources}
 
-        Return any inconsistencies or conflicts found."""
+Return any inconsistencies or conflicts found."""
         if get_db_setting(
             "general.enable_fact_checking", settings.general.enable_fact_checking
         ):
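The two prompt hunks in this file appear to be whitespace-only changes, and that matters: Python triple-quoted strings keep leading indentation verbatim, so a prompt indented to match the surrounding code ships that indentation to the model. A small illustration of the language behavior (not the package's code):

    def indented() -> str:
        return """Analyze:
                1. Cross-reference claims"""

    def dedented() -> str:
        return "Analyze:\n1. Cross-reference claims"

    # The indented version carries the code's leading whitespace into the
    # prompt text; the dedented version does not.
    assert "    1." in indented()
    assert "\n1." in dedented()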
@@ -104,16 +104,15 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
 
         prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.
 
-        Previous Knowledge:
-        {previous_knowledge}
+Previous Knowledge:
+{previous_knowledge}
 
-        Question: {question}
+Question: {question}
 
-        New Sources:
-        {formatted_sources}
-        Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
-        Provide a detailed answer with citations. Example format: "According to [1], ..."
-        """
+New Sources:
+{formatted_sources}
+Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
+Provide a detailed answer with citations. Example format: "According to [1], ..." """
 
         response = self.llm.invoke(prompt)
 
--- a/local_deep_research/search_system.py
+++ b/local_deep_research/search_system.py
@@ -13,6 +13,9 @@ from .advanced_search_system.strategies.parallel_search_strategy import (
     ParallelSearchStrategy,
 )
 from .advanced_search_system.strategies.rapid_search_strategy import RapidSearchStrategy
+from .advanced_search_system.strategies.source_based_strategy import (
+    SourceBasedSearchStrategy,
+)
 from .advanced_search_system.strategies.standard_strategy import StandardSearchStrategy
 from .citation_handler import CitationHandler
 from .config.config_files import settings
@@ -31,7 +34,7 @@ class AdvancedSearchSystem:
 
     def __init__(
         self,
-        strategy_name: str = "parallel",
+        strategy_name: str = "source-based",
         include_text_content: bool = True,
         use_cross_engine_filter: bool = True,
         llm: BaseChatModel | None = None,
@@ -76,6 +79,14 @@ class AdvancedSearchSystem:
         if strategy_name.lower() == "iterdrag":
             logger.info("Creating IterDRAGStrategy instance")
             self.strategy = IterDRAGStrategy(model=self.model, search=self.search)
+        elif strategy_name.lower() == "source-based":
+            logger.info("Creating SourceBasedSearchStrategy instance")
+            self.strategy = SourceBasedSearchStrategy(
+                model=self.model,
+                search=self.search,
+                include_text_content=include_text_content,
+                use_cross_engine_filter=use_cross_engine_filter,
+            )
         elif strategy_name.lower() == "parallel":
             logger.info("Creating ParallelSearchStrategy instance")
             self.strategy = ParallelSearchStrategy(
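With this hunk, "source-based" is both selectable and the new default strategy. A minimal sketch of constructing the system either way (requires the same configured LLM and search engine as above):

    from local_deep_research.search_system import AdvancedSearchSystem

    system = AdvancedSearchSystem()  # now defaults to strategy_name="source-based"
    legacy = AdvancedSearchSystem(strategy_name="parallel")  # the 0.2.2 default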
--- a/local_deep_research/web/services/research_service.py
+++ b/local_deep_research/web/services/research_service.py
@@ -690,6 +690,10 @@ def run_research_process(
             # Handle error
             error_message = f"Research failed: {str(e)}"
             logger.error(error_message)
+            import traceback
+
+            logger.error("Exception occurred:" + str(traceback.print_exc()))
+
             try:
                 # Check for common Ollama error patterns in the exception and provide more user-friendly errors
                 user_friendly_error = str(e)
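A caveat on the added logging: traceback.print_exc() prints the traceback to stderr and returns None, so the message logged here is always the literal string "Exception occurred:None". The usual way to get the traceback as text for a logger is traceback.format_exc(); a sketch of that pattern (not what 0.2.3 ships):

    import logging
    import traceback

    logger = logging.getLogger(__name__)

    try:
        raise ValueError("demo failure")
    except ValueError:
        # format_exc() returns the traceback as a string instead of
        # printing it to stderr and returning None.
        logger.error("Exception occurred: %s", traceback.format_exc())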
--- a/local_deep_research-0.2.2.dist-info/METADATA
+++ b/local_deep_research-0.2.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: local-deep-research
-Version: 0.2.2
+Version: 0.2.3
 Summary: AI-powered research assistant with deep, iterative analysis using LLMs and web searches
 Author-Email: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>, HashedViking <6432677+HashedViking@users.noreply.github.com>
 License: MIT License
--- a/local_deep_research-0.2.2.dist-info/RECORD
+++ b/local_deep_research-0.2.3.dist-info/RECORD
@@ -1,7 +1,7 @@
-local_deep_research-0.2.2.dist-info/METADATA,sha256=MgFc30qd-f-kk07M_jDRZ7HAq8MzL92pDLxQ34YYQMU,19797
-local_deep_research-0.2.2.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
-local_deep_research-0.2.2.dist-info/entry_points.txt,sha256=GcXS501Rjh-P80S8db7hnrQ23mS_Jg27PwpVQVO77as,113
-local_deep_research-0.2.2.dist-info/licenses/LICENSE,sha256=Qg2CaTdu6SWnSqk1_JtgBPp_Da-LdqJDhT1Vt1MUc5s,1072
+local_deep_research-0.2.3.dist-info/METADATA,sha256=46N1CYIqxccMSv3Iaq-Tm8gEFtnuc1ATUJqfM720HkE,19797
+local_deep_research-0.2.3.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
+local_deep_research-0.2.3.dist-info/entry_points.txt,sha256=GcXS501Rjh-P80S8db7hnrQ23mS_Jg27PwpVQVO77as,113
+local_deep_research-0.2.3.dist-info/licenses/LICENSE,sha256=Qg2CaTdu6SWnSqk1_JtgBPp_Da-LdqJDhT1Vt1MUc5s,1072
 local_deep_research/__init__.py,sha256=tczbsYNZQqfPAuVtz6OFyo-uUqjNQLelEIT2G7mPTwA,870
 local_deep_research/__main__.py,sha256=LIxK5iS6aLAKMFBDpUS3V-jDcxchqi3eSUsI2jAZUXk,371
 local_deep_research/advanced_search_system/__init__.py,sha256=sGusMj4eFIrhXR6QbOM16UDKB6aI-iS4IFivKWpMlh0,234
@@ -21,8 +21,9 @@ local_deep_research/advanced_search_system/repositories/__init__.py,sha256=cCjAR
 local_deep_research/advanced_search_system/strategies/__init__.py,sha256=upbslnB6Ns8RJ0-b1bH74-f5gZbo7evpx1dRrKEkzHA,35
 local_deep_research/advanced_search_system/strategies/base_strategy.py,sha256=cK5DqvsjGlFyqKRtpl0-dI6cip32UIbGS8eqsuL9SjI,3781
 local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py,sha256=eKCyxNVRnN7pOr-8LEzREbRkHX6ffa9hmjGwBYHHDDc,18129
-local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py,sha256=n-UVHHpyRFtMmPdaDQ30wE2V839CWGrLOM-cVLtRUrE,12396
+local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py,sha256=dA5KgS5G_1O82MLhWx1UOZi5P4c7hqWdQPRdtt1B49U,19006
 local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py,sha256=fiLTqCfpyoNlP_rRZB96gdi3KoOkCWk-Nw5fb7E9an4,10389
+local_deep_research/advanced_search_system/strategies/source_based_strategy.py,sha256=PW5gHhpayon3d716Ooo02UITkoxfBGvgzrm7kFITWWo,17312
 local_deep_research/advanced_search_system/strategies/standard_strategy.py,sha256=FbZAHiRAhfFCtA46Im0KxF5QNzursiz0SqhimvNiaXs,12747
 local_deep_research/advanced_search_system/tools/__init__.py,sha256=73jLuCKigwc9lJQ0uD3_F16dgCg4pL-F2cwC6tk9-oc,30
 local_deep_research/advanced_search_system/tools/base_tool.py,sha256=jEs4eroCvo0dHP_uF-5kLiQP7OfkD1YzNAD650a8Ktk,2865
@@ -32,7 +33,7 @@ local_deep_research/advanced_search_system/tools/search_tools/__init__.py,sha256
 local_deep_research/api/__init__.py,sha256=-tJQp7Qm1aPg6fgfuw-w9dfNo8GzrJLOy2i3dG8Drl8,441
 local_deep_research/api/research_functions.py,sha256=8Q_Rzfc0Qj2oLxzvFJIA4ms10uQC0a5SBHkIkSoPcw4,10908
 local_deep_research/app.py,sha256=U_92UX0dpVAQoaXciVNy_By_AyDEWGlXSeTwFpohALQ,155
-local_deep_research/citation_handler.py,sha256=KdfwHqSewPyP2OrxEGu9o15pJtFDYLUsLwOTHkQe8I8,4564
+local_deep_research/citation_handler.py,sha256=NoEvnpf7jqCAJX6H-H8i2Hz69CVPW6UBg12cBRYtVdA,4396
 local_deep_research/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 local_deep_research/config/config_files.py,sha256=k6ptAKIvqGrhnRsfRjT9uD2xBLAiD0vqXoYxggF5mik,10041
 local_deep_research/config/llm_config.py,sha256=KhuDbxjndU939rMCKmeYDctsx7fRoXMoQRv1AgtZKI4,16536
@@ -45,7 +46,7 @@ local_deep_research/defaults/search_engines.toml,sha256=XBnqCxzFvXa1HoKLcb_Jg4EG
 local_deep_research/main.py,sha256=umGmaQmW7bpx27wUAgSNjNr4oSHV6mDX5hoyfb22HEY,7033
 local_deep_research/migrate_db.py,sha256=S1h6Bv0OJdRW4BaH7MIMrUXBRV_yqgH2T6LVOZKTQjI,4634
 local_deep_research/report_generator.py,sha256=-G3KDEbsuU3PdxDfuo5v28DIX7RE1yJCCBU2KgRbNzI,9084
-local_deep_research/search_system.py,sha256=MqaG435RzllyHlVuT7eCc_wC8_rCA4RLW7F5NDp9kxE,7108
+local_deep_research/search_system.py,sha256=YmXu9ui-aB5kGb9rqQWUb7qSvd-iHfp3PvRenPwCdDA,7604
 local_deep_research/setup_data_dir.py,sha256=7MJa2MMdDUnktJVHwMpyNL2079-qylpIyyLpVbF5AUY,1134
 local_deep_research/test_migration.py,sha256=cXY9WbpxLslNEa1vFwLMvcvKBbUe7Wosm--AqmPIPYM,6459
 local_deep_research/utilities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -68,7 +69,7 @@ local_deep_research/web/routes/api_routes.py,sha256=S0UdCmfm0v1GEM4UiSbI0PE3xUOx
 local_deep_research/web/routes/history_routes.py,sha256=6a_8nX349viuvi1zP5S7BaPPpAh133eTi1NVWO545A8,12622
 local_deep_research/web/routes/research_routes.py,sha256=JlzaP1z-7XAP3E0nkEjLIfYj_NKf5qDcrjxBmUouAhM,23492
 local_deep_research/web/routes/settings_routes.py,sha256=rEvvFCVWJ80zchnzXBv9SAnDXMvDPLGDjSUfLRlCCi0,60012
-local_deep_research/web/services/research_service.py,sha256=sxvW4oNLiiKgQ8w0SblefzMmk8EEaNNOGd8oC96j85E,39556
+local_deep_research/web/services/research_service.py,sha256=0tFx3wactXhZjFuZDHC3aAFgpDTtjfm_c-1HsZLxaos,39656
 local_deep_research/web/services/resource_service.py,sha256=yKgOC6GEOmHqRoGzwf52e19UaGCCS1DbDbOIXgWGvGc,4378
 local_deep_research/web/services/settings_manager.py,sha256=ybnhSlByuKA2oJPElN2WI8bh-ZzC6lP08x0Gsz8Ycbk,24310
 local_deep_research/web/services/settings_service.py,sha256=1XHvNBNs9gzor2AxOEDrqL-JsKyXKk5izCnoXAV78u8,5064
@@ -132,4 +133,4 @@ local_deep_research/web_search_engines/engines/search_engine_wikipedia.py,sha256
 local_deep_research/web_search_engines/search_engine_base.py,sha256=PLU_sAWhWKTOQWcv32GINuhLdIwB0sEQy-pp9oG9Ggo,9835
 local_deep_research/web_search_engines/search_engine_factory.py,sha256=mkIf6F-8-aooS47iqb8SanJ9shnl0UOVia8hr2xX0b0,12751
 local_deep_research/web_search_engines/search_engines_config.py,sha256=GmwpCT6vfeq1wrdr1R-zu6WRQ5XxyE7921HPsgGm3gI,2771
-local_deep_research-0.2.2.dist-info/RECORD,,
+local_deep_research-0.2.3.dist-info/RECORD,,