local-deep-research 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -80,7 +80,7 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
     def analyze_topic(self, query: str) -> Dict:
         """
-        Parallel implementation that generates questions and searches all at once.
+        Analyze a topic using parallel search, supporting multiple iterations.
 
         Args:
             query: The research query to analyze
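The docstring change above captures the substantive change in this release: `analyze_topic` no longer runs a single question-generation-and-search pass but loops over a configurable number of search iterations. The loop added further down in this diff maps those iterations onto a fixed band of the progress bar via `iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)`. A standalone illustration of that arithmetic (the iteration count of 3 is just an example value; in the diff it comes from `get_db_setting("search.iterations")`):

```python
# Illustration of the progress math used by the new iteration loop:
# iterations share the 5-75% band of the progress bar, leaving headroom
# for setup (0-5%) and final synthesis/formatting (75-100%).
iterations_to_run = 3  # example value

for iteration in range(1, iterations_to_run + 1):
    start = 5 + (iteration - 1) * (70 / iterations_to_run)
    end = 5 + iteration * (70 / iterations_to_run)
    print(f"iteration {iteration}: {start:.1f}% -> {end:.1f}%")

# iteration 1: 5.0% -> 28.3%
# iteration 2: 28.3% -> 51.7%
# iteration 3: 51.7% -> 75.0%
```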
@@ -89,6 +89,11 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
        findings = []
        all_search_results = []
+        current_knowledge = ""
+
+        # Track all search results across iterations
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
 
        self._update_progress(
            "Initializing parallel research",
@@ -105,184 +110,301 @@ class ParallelSearchStrategy(BaseSearchStrategy):
            return {
                "findings": [],
                "iterations": 0,
-                "questions": {},
+                "questions_by_iteration": {},
                "formatted_findings": "Error: Unable to conduct research without a search engine.",
                "current_knowledge": "",
                "error": "No search engine available",
            }
 
+        # Determine number of iterations to run
+        iterations_to_run = get_db_setting("search.iterations")
+        logger.debug("Selected amount of iterations: " + iterations_to_run)
+        iterations_to_run = int(iterations_to_run)
        try:
-            # Step 1: Generate questions first
-            self._update_progress(
-                "Generating search questions", 10, {"phase": "question_generation"}
-            )
+            # Run each iteration
+            for iteration in range(1, iterations_to_run + 1):
+                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
 
-            # Generate 3 additional questions (plus the main query = 4 total)
-            questions = self.question_generator.generate_questions(
-                current_knowledge="",  # No knowledge accumulation
-                query=query,
-                questions_per_iteration=int(
-                    get_db_setting("search.questions_per_iteration")
-                ),  # 3 additional questions
-                questions_by_iteration={},
-            )
+                self._update_progress(
+                    f"Starting iteration {iteration}/{iterations_to_run}",
+                    iteration_progress_base,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
 
-            # Add the original query as the first question
-            all_questions = [query] + questions
+                # Step 1: Generate questions
+                self._update_progress(
+                    f"Generating search questions for iteration {iteration}",
+                    iteration_progress_base + 5,
+                    {"phase": "question_generation", "iteration": iteration},
+                )
 
-            # Store in questions_by_iteration
-            self.questions_by_iteration[0] = questions
-            logger.info(f"Generated questions: {questions}")
+                # For first iteration, generate initial questions
+                # For subsequent iterations, generate follow-up questions
+                logger.info("Starting to generate questions")
+                if iteration == 1:
+                    # Generate additional questions (plus the main query)
+                    if iterations_to_run > 1:
+                        context = f"""Iteration: {1} of {iterations_to_run}"""
+                    else:
+                        context = ""
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-            # Step 2: Run all searches in parallel
-            self._update_progress(
-                "Running parallel searches for all questions",
-                20,
-                {"phase": "parallel_search"},
-            )
+                    # Add the original query as the first question
+                    all_questions = [query] + questions
 
-            # Function for thread pool
-            def search_question(q):
-                try:
-                    result = self.search.run(q)
-                    return {"question": q, "results": result or []}
-                except Exception as e:
-                    logger.error(f"Error searching for '{q}': {str(e)}")
-                    return {"question": q, "results": [], "error": str(e)}
-
-            # Run searches in parallel
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=len(all_questions)
-            ) as executor:
-                futures = [executor.submit(search_question, q) for q in all_questions]
-                all_search_dict = {}
-
-                # Process results as they complete
-                for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                    result_dict = future.result()
-                    question = result_dict["question"]
-                    search_results = result_dict["results"]
-                    all_search_dict[question] = search_results
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
+                    )
+                else:
+                    # Get past questions from all previous iterations
+                    past_questions = []
+                    for prev_iter in range(1, iteration):
+                        if prev_iter in self.questions_by_iteration:
+                            past_questions.extend(
+                                self.questions_by_iteration[prev_iter]
+                            )
+
+                    # Generate follow-up questions based on accumulated knowledge if iterations > 2
+                    use_knowledge = iterations_to_run > 2
+                    knowledge_for_questions = current_knowledge if use_knowledge else ""
+                    context = f"""Current Knowledge: {knowledge_for_questions}
+                    Iteration: {iteration} of {iterations_to_run}"""
+
+                    # Generate questions
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-                    self._update_progress(
-                        f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
-                        20 + ((i + 1) / len(all_questions) * 40),
-                        {
-                            "phase": "search_complete",
-                            "result_count": len(search_results),
-                            "question": question,
-                        },
+                    # Use only the new questions for this iteration's searches
+                    all_questions = questions
+
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
                    )
 
-                    # Extract and save links
-                    if not self.use_cross_engine_filter:
-                        links = extract_links_from_search_results(search_results)
-                        self.all_links_of_system.extend(links)
-                        all_search_results.extend(search_results)
+                # Step 2: Run all searches in parallel for this iteration
+                self._update_progress(
+                    f"Running parallel searches for iteration {iteration}",
+                    iteration_progress_base + 10,
+                    {"phase": "parallel_search", "iteration": iteration},
+                )
 
-            # Step 3: Analysis of collected search results
-            self._update_progress(
-                "Analyzing all collected search results",
-                70,
-                {"phase": "final_analysis"},
-            )
-            if self.use_cross_engine_filter:
+                # Function for thread pool
+                def search_question(q):
+                    try:
+                        result = self.search.run(q)
+                        return {"question": q, "results": result or []}
+                    except Exception as e:
+                        logger.error(f"Error searching for '{q}': {str(e)}")
+                        return {"question": q, "results": [], "error": str(e)}
+
+                # Run searches in parallel
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=len(all_questions)
+                ) as executor:
+                    futures = [
+                        executor.submit(search_question, q) for q in all_questions
+                    ]
+                    iteration_search_dict = {}
+                    iteration_search_results = []
+
+                    # Process results as they complete
+                    for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)
+                    ):
+                        result_dict = future.result()
+                        question = result_dict["question"]
+                        search_results = result_dict["results"]
+                        iteration_search_dict[question] = search_results
+
+                        self._update_progress(
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            iteration_progress_base
+                            + 10
+                            + ((i + 1) / len(all_questions) * 30),
+                            {
+                                "phase": "search_complete",
+                                "iteration": iteration,
+                                "result_count": len(search_results),
+                                "question": question,
+                            },
+                        )
+
+                        # Collect all search results for this iteration
+                        iteration_search_results.extend(search_results)
+
+                # Step 3: Filter and analyze results for this iteration
                self._update_progress(
-                    "Filtering search results across engines",
-                    65,
-                    {"phase": "cross_engine_filtering"},
+                    f"Analyzing results for iteration {iteration}",
+                    iteration_progress_base + 45,
+                    {"phase": "iteration_analysis", "iteration": iteration},
                )
 
-                # Get the current link count (for indexing)
-                existing_link_count = len(self.all_links_of_system)
+                # Apply cross-engine filtering if enabled
+                if self.use_cross_engine_filter:
+                    self._update_progress(
+                        f"Filtering search results for iteration {iteration}",
+                        iteration_progress_base + 45,
+                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                    )
 
-                # Filter the search results
-                filtered_search_results = self.cross_engine_filter.filter_results(
-                    all_search_results,
-                    query,
-                    reorder=self.filter_reorder,
-                    reindex=self.filter_reindex,
-                    start_index=existing_link_count,  # Start indexing after existing links
-                )
+                    # Get the current link count (for indexing)
+                    existing_link_count = len(self.all_links_of_system)
 
-                links = extract_links_from_search_results(filtered_search_results)
-                self.all_links_of_system.extend(links)
+                    # Filter the search results
+                    filtered_search_results = self.cross_engine_filter.filter_results(
+                        iteration_search_results,
+                        query,
+                        reorder=self.filter_reorder,
+                        reindex=self.filter_reindex,
+                        start_index=existing_link_count,  # Start indexing after existing links
+                    )
+
+                    links = extract_links_from_search_results(filtered_search_results)
+                    self.all_links_of_system.extend(links)
+
+                    self._update_progress(
+                        f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                        iteration_progress_base + 50,
+                        {
+                            "phase": "filtering_complete",
+                            "iteration": iteration,
+                            "links_count": len(self.all_links_of_system),
+                        },
+                    )
 
+                    # Use filtered results for analysis
+                    iteration_search_results = filtered_search_results
+                else:
+                    # Just extract links without filtering
+                    links = extract_links_from_search_results(iteration_search_results)
+                    self.all_links_of_system.extend(links)
+
+                # Add to all search results
+                all_search_results.extend(iteration_search_results)
+
+                # Create a finding for this iteration's results
+                if self.include_text_content and iteration_search_results:
+                    # For iteration > 1 with knowledge accumulation, use follow-up analysis
+                    if iteration > 1 and iterations_to_run > 2:
+                        citation_result = self.citation_handler.analyze_followup(
+                            query,
+                            iteration_search_results,
+                            current_knowledge,
+                            len(self.all_links_of_system) - len(links),
+                        )
+                    else:
+                        # For first iteration or without knowledge accumulation, use initial analysis
+                        citation_result = self.citation_handler.analyze_initial(
+                            query, iteration_search_results
+                        )
+
+                    if citation_result:
+                        # Create a finding for this iteration
+                        iteration_content = citation_result["content"]
+
+                        # Update current knowledge if iterations > 2
+                        if iterations_to_run > 2:
+                            if current_knowledge:
+                                current_knowledge = f"{current_knowledge}\n\n## FINDINGS FROM ITERATION {iteration}:\n\n{iteration_content}"
+                            else:
+                                current_knowledge = iteration_content
+
+                        finding = {
+                            "phase": f"Iteration {iteration}",
+                            "content": iteration_content,
+                            "question": query,
+                            "search_results": iteration_search_results,
+                            "documents": citation_result.get("documents", []),
+                        }
+                        findings.append(finding)
+
+                        # Add documents to repository
+                        if "documents" in citation_result:
+                            self.findings_repository.add_documents(
+                                citation_result["documents"]
+                            )
+
+                # Mark iteration as complete
+                iteration_progress = 5 + iteration * (70 / iterations_to_run)
                self._update_progress(
-                    f"Filtered from {len(all_search_results)} to {len(filtered_search_results)} results",
-                    70,
-                    {
-                        "phase": "filtering_complete",
-                        "links_count": len(self.all_links_of_system),
-                    },
+                    f"Completed iteration {iteration}/{iterations_to_run}",
+                    iteration_progress,
+                    {"phase": "iteration_complete", "iteration": iteration},
                )
 
-                # Use filtered results for analysis
-                all_search_results = filtered_search_results
+            # Final synthesis after all iterations
+            self._update_progress(
+                "Generating final synthesis", 80, {"phase": "synthesis"}
+            )
 
-            # Now when we use the citation handler, ensure we're using all_search_results:
+            # Handle final synthesis based on include_text_content flag
            if self.include_text_content:
-                # Use citation handler for analysis of all results together
-                citation_result = self.citation_handler.analyze_initial(
-                    query, all_search_results
-                )
-
-                if citation_result:
-                    synthesized_content = citation_result["content"]
-                    finding = {
+                # Generate a final synthesis from all search results
+                if iterations_to_run > 1:
+                    final_citation_result = self.citation_handler.analyze_initial(
+                        query, all_search_results
+                    )
+                    # Add null check for final_citation_result
+                    if final_citation_result:
+                        synthesized_content = final_citation_result["content"]
+                    else:
+                        synthesized_content = (
+                            "No relevant results found in final synthesis."
+                        )
+                else:
+                    # For single iteration, use the content from findings
+                    synthesized_content = (
+                        findings[0]["content"]
+                        if findings
+                        else "No relevant results found."
+                    )
+                # Add a final synthesis finding
+                final_finding = {
                    "phase": "Final synthesis",
                    "content": synthesized_content,
                    "question": query,
                    "search_results": all_search_results,
-                        "documents": citation_result.get("documents", []),
+                    "documents": [],
                }
-                    findings.append(finding)
-
-                    # Transfer questions to repository
-                    self.findings_repository.set_questions_by_iteration(
-                        self.questions_by_iteration
-                    )
-
-                    # Format findings
-                    formatted_findings = self.findings_repository.format_findings_to_text(
-                        findings, synthesized_content
-                    )
-
-                    # Add documents to repository
-                    if "documents" in citation_result:
-                        self.findings_repository.add_documents(citation_result["documents"])
-                else:
-                    synthesized_content = "No relevant results found."
-                    formatted_findings = synthesized_content
-                    finding = {
-                        "phase": "Error",
-                        "content": "No relevant results found.",
-                        "question": query,
-                        "search_results": all_search_results,
-                        "documents": [],
-                    }
-                    findings.append(finding)
+                findings.append(final_finding)
 
            else:
                # Skip LLM analysis, just format the raw search results
                synthesized_content = "LLM analysis skipped"
-                finding = {
+                final_finding = {
                    "phase": "Raw search results",
                    "content": "LLM analysis was skipped. Displaying raw search results with links.",
                    "question": query,
                    "search_results": all_search_results,
                    "documents": [],
                }
-                findings.append(finding)
+                findings.append(final_finding)
 
-                # Transfer questions to repository
-                self.findings_repository.set_questions_by_iteration(
-                    self.questions_by_iteration
-                )
+            # Transfer questions to repository
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
 
-                # Format findings without synthesis
-                formatted_findings = self.findings_repository.format_findings_to_text(
-                    findings, "Raw search results (LLM analysis skipped)"
-                )
+            # Format findings
+            formatted_findings = self.findings_repository.format_findings_to_text(
+                findings, synthesized_content
+            )
 
        except Exception as e:
            import traceback
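The heart of this hunk is unchanged in spirit from 0.2.0: each question is submitted to a `concurrent.futures.ThreadPoolExecutor` and results are consumed with `as_completed`, but the collected results are now scoped per iteration (`iteration_search_results`) instead of accumulating into one flat pass. A self-contained sketch of that fan-out pattern; `run_search` is a hypothetical stand-in for `self.search.run`:

```python
import concurrent.futures

def run_search(question: str) -> list[dict]:
    """Stand-in for self.search.run(question); returns fake results."""
    return [{"question": question, "link": "https://example.org"}]

def search_all(questions: list[str]) -> dict[str, list[dict]]:
    """Fan out one search per question; collect results as they complete."""
    results: dict[str, list[dict]] = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(questions)) as executor:
        # as_completed yields futures in completion order, not submission
        # order, which is why the diff carries the question inside each
        # result dict (here, a future-to-question map serves the same purpose).
        future_to_question = {executor.submit(run_search, q): q for q in questions}
        for future in concurrent.futures.as_completed(future_to_question):
            question = future_to_question[future]
            try:
                results[question] = future.result() or []
            except Exception:
                # Mirror the diff: a failed search yields an empty result list.
                results[question] = []
    return results

print(search_all(["main query", "follow-up question 1", "follow-up question 2"]))
```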
@@ -305,8 +427,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
        return {
            "findings": findings,
-            "iterations": 1,
-            "questions": self.questions_by_iteration,
+            "iterations": iterations_to_run,
+            "questions_by_iteration": self.questions_by_iteration,
            "formatted_findings": formatted_findings,
            "current_knowledge": synthesized_content,
        }
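Note the two changed keys in the return value: `iterations` now reports the configured iteration count rather than a hard-coded `1`, and `questions` has become `questions_by_iteration`, a dict keyed by iteration number. Callers written against 0.2.0 need updating accordingly; a hypothetical consumer, assuming `strategy` is an already-configured `ParallelSearchStrategy` instance:

```python
# Hypothetical caller; `strategy` construction is omitted and assumed.
result = strategy.analyze_topic("impact of solar storms on satellites")

print(f"ran {result['iterations']} iteration(s)")
for iteration, questions in result["questions_by_iteration"].items():
    print(f"iteration {iteration}: {len(questions)} generated question(s)")

print(result["formatted_findings"])
```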