local-deep-research 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -80,7 +80,7 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
     def analyze_topic(self, query: str) -> Dict:
         """
-        Parallel implementation that generates questions and searches all at once.
+        Analyze a topic using parallel search, supporting multiple iterations.
 
         Args:
             query: The research query to analyze
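The docstring change above captures the substantive change in this release: `analyze_topic` no longer runs a single question-generation-and-search pass but loops over a configurable number of search iterations. The loop added further down in this diff maps those iterations onto a fixed band of the progress bar via `iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)`. A standalone illustration of that arithmetic (the iteration count of 3 is just an example value; in the diff it comes from `get_db_setting("search.iterations")`):

```python
# Illustration of the progress math used by the new iteration loop:
# iterations share the 5-75% band of the progress bar, leaving headroom
# for setup (0-5%) and final synthesis/formatting (75-100%).
iterations_to_run = 3  # example value

for iteration in range(1, iterations_to_run + 1):
    start = 5 + (iteration - 1) * (70 / iterations_to_run)
    end = 5 + iteration * (70 / iterations_to_run)
    print(f"iteration {iteration}: {start:.1f}% -> {end:.1f}%")

# iteration 1: 5.0% -> 28.3%
# iteration 2: 28.3% -> 51.7%
# iteration 3: 51.7% -> 75.0%
```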
@@ -89,6 +89,11 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
        findings = []
        all_search_results = []
+        current_knowledge = ""
+
+        # Track all search results across iterations
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
 
        self._update_progress(
            "Initializing parallel research",
@@ -105,184 +110,301 @@ class ParallelSearchStrategy(BaseSearchStrategy):
            return {
                "findings": [],
                "iterations": 0,
-                "questions": {},
+                "questions_by_iteration": {},
                "formatted_findings": "Error: Unable to conduct research without a search engine.",
                "current_knowledge": "",
                "error": "No search engine available",
            }
 
+        # Determine number of iterations to run
+        iterations_to_run = get_db_setting("search.iterations")
+        logger.debug("Selected amount of iterations: " + iterations_to_run)
+        iterations_to_run = int(iterations_to_run)
        try:
-            # Step 1: Generate questions first
-            self._update_progress(
-                "Generating search questions", 10, {"phase": "question_generation"}
-            )
+            # Run each iteration
+            for iteration in range(1, iterations_to_run + 1):
+                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
 
-            # Generate 3 additional questions (plus the main query = 4 total)
-            questions = self.question_generator.generate_questions(
-                current_knowledge="",  # No knowledge accumulation
-                query=query,
-                questions_per_iteration=int(
-                    get_db_setting("search.questions_per_iteration")
-                ),  # 3 additional questions
-                questions_by_iteration={},
-            )
+                self._update_progress(
+                    f"Starting iteration {iteration}/{iterations_to_run}",
+                    iteration_progress_base,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
 
-            # Add the original query as the first question
-            all_questions = [query] + questions
+                # Step 1: Generate questions
+                self._update_progress(
+                    f"Generating search questions for iteration {iteration}",
+                    iteration_progress_base + 5,
+                    {"phase": "question_generation", "iteration": iteration},
+                )
 
-            # Store in questions_by_iteration
-            self.questions_by_iteration[0] = questions
-            logger.info(f"Generated questions: {questions}")
+                # For first iteration, generate initial questions
+                # For subsequent iterations, generate follow-up questions
+                logger.info("Starting to generate questions")
+                if iteration == 1:
+                    # Generate additional questions (plus the main query)
+                    if iterations_to_run > 1:
+                        context = f"""Iteration: {1} of {iterations_to_run}"""
+                    else:
+                        context = ""
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-            # Step 2: Run all searches in parallel
-            self._update_progress(
-                "Running parallel searches for all questions",
-                20,
-                {"phase": "parallel_search"},
-            )
+                    # Add the original query as the first question
+                    all_questions = [query] + questions
 
-            # Function for thread pool
-            def search_question(q):
-                try:
-                    result = self.search.run(q)
-                    return {"question": q, "results": result or []}
-                except Exception as e:
-                    logger.error(f"Error searching for '{q}': {str(e)}")
-                    return {"question": q, "results": [], "error": str(e)}
-
-            # Run searches in parallel
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=len(all_questions)
-            ) as executor:
-                futures = [executor.submit(search_question, q) for q in all_questions]
-                all_search_dict = {}
-
-                # Process results as they complete
-                for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                    result_dict = future.result()
-                    question = result_dict["question"]
-                    search_results = result_dict["results"]
-                    all_search_dict[question] = search_results
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
+                    )
+                else:
+                    # Get past questions from all previous iterations
+                    past_questions = []
+                    for prev_iter in range(1, iteration):
+                        if prev_iter in self.questions_by_iteration:
+                            past_questions.extend(
+                                self.questions_by_iteration[prev_iter]
+                            )
+
+                    # Generate follow-up questions based on accumulated knowledge if iterations > 2
+                    use_knowledge = iterations_to_run > 2
+                    knowledge_for_questions = current_knowledge if use_knowledge else ""
+                    context = f"""Current Knowledge: {knowledge_for_questions}
+                    Iteration: {iteration} of {iterations_to_run}"""
+
+                    # Generate questions
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-                    self._update_progress(
-                        f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
-                        20 + ((i + 1) / len(all_questions) * 40),
-                        {
-                            "phase": "search_complete",
-                            "result_count": len(search_results),
-                            "question": question,
-                        },
+                    # Use only the new questions for this iteration's searches
+                    all_questions = questions
+
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
                    )
 
-                    # Extract and save links
-                    if not self.use_cross_engine_filter:
-                        links = extract_links_from_search_results(search_results)
-                        self.all_links_of_system.extend(links)
-                        all_search_results.extend(search_results)
+                # Step 2: Run all searches in parallel for this iteration
+                self._update_progress(
+                    f"Running parallel searches for iteration {iteration}",
+                    iteration_progress_base + 10,
+                    {"phase": "parallel_search", "iteration": iteration},
+                )
 
-            # Step 3: Analysis of collected search results
-            self._update_progress(
-                "Analyzing all collected search results",
-                70,
-                {"phase": "final_analysis"},
-            )
-            if self.use_cross_engine_filter:
+                # Function for thread pool
+                def search_question(q):
+                    try:
+                        result = self.search.run(q)
+                        return {"question": q, "results": result or []}
+                    except Exception as e:
+                        logger.error(f"Error searching for '{q}': {str(e)}")
+                        return {"question": q, "results": [], "error": str(e)}
+
+                # Run searches in parallel
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=len(all_questions)
+                ) as executor:
+                    futures = [
+                        executor.submit(search_question, q) for q in all_questions
+                    ]
+                    iteration_search_dict = {}
+                    iteration_search_results = []
+
+                    # Process results as they complete
+                    for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)
+                    ):
+                        result_dict = future.result()
+                        question = result_dict["question"]
+                        search_results = result_dict["results"]
+                        iteration_search_dict[question] = search_results
+
+                        self._update_progress(
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            iteration_progress_base
+                            + 10
+                            + ((i + 1) / len(all_questions) * 30),
+                            {
+                                "phase": "search_complete",
+                                "iteration": iteration,
+                                "result_count": len(search_results),
+                                "question": question,
+                            },
+                        )
+
+                        # Collect all search results for this iteration
+                        iteration_search_results.extend(search_results)
+
+                # Step 3: Filter and analyze results for this iteration
                self._update_progress(
-                    "Filtering search results across engines",
-                    65,
-                    {"phase": "cross_engine_filtering"},
+                    f"Analyzing results for iteration {iteration}",
+                    iteration_progress_base + 45,
+                    {"phase": "iteration_analysis", "iteration": iteration},
                )
 
-                # Get the current link count (for indexing)
-                existing_link_count = len(self.all_links_of_system)
+                # Apply cross-engine filtering if enabled
+                if self.use_cross_engine_filter:
+                    self._update_progress(
+                        f"Filtering search results for iteration {iteration}",
+                        iteration_progress_base + 45,
+                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                    )
 
-                # Filter the search results
-                filtered_search_results = self.cross_engine_filter.filter_results(
-                    all_search_results,
-                    query,
-                    reorder=self.filter_reorder,
-                    reindex=self.filter_reindex,
-                    start_index=existing_link_count,  # Start indexing after existing links
-                )
+                    # Get the current link count (for indexing)
+                    existing_link_count = len(self.all_links_of_system)
 
-                links = extract_links_from_search_results(filtered_search_results)
-                self.all_links_of_system.extend(links)
+                    # Filter the search results
+                    filtered_search_results = self.cross_engine_filter.filter_results(
+                        iteration_search_results,
+                        query,
+                        reorder=self.filter_reorder,
+                        reindex=self.filter_reindex,
+                        start_index=existing_link_count,  # Start indexing after existing links
+                    )
+
+                    links = extract_links_from_search_results(filtered_search_results)
+                    self.all_links_of_system.extend(links)
+
+                    self._update_progress(
+                        f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                        iteration_progress_base + 50,
+                        {
+                            "phase": "filtering_complete",
+                            "iteration": iteration,
+                            "links_count": len(self.all_links_of_system),
+                        },
+                    )
 
+                    # Use filtered results for analysis
+                    iteration_search_results = filtered_search_results
+                else:
+                    # Just extract links without filtering
+                    links = extract_links_from_search_results(iteration_search_results)
+                    self.all_links_of_system.extend(links)
+
+                # Add to all search results
+                all_search_results.extend(iteration_search_results)
+
+                # Create a finding for this iteration's results
+                if self.include_text_content and iteration_search_results:
+                    # For iteration > 1 with knowledge accumulation, use follow-up analysis
+                    if iteration > 1 and iterations_to_run > 2:
+                        citation_result = self.citation_handler.analyze_followup(
+                            query,
+                            iteration_search_results,
+                            current_knowledge,
+                            len(self.all_links_of_system) - len(links),
+                        )
+                    else:
+                        # For first iteration or without knowledge accumulation, use initial analysis
+                        citation_result = self.citation_handler.analyze_initial(
+                            query, iteration_search_results
+                        )
+
+                    if citation_result:
+                        # Create a finding for this iteration
+                        iteration_content = citation_result["content"]
+
+                        # Update current knowledge if iterations > 2
+                        if iterations_to_run > 2:
+                            if current_knowledge:
+                                current_knowledge = f"{current_knowledge}\n\n## FINDINGS FROM ITERATION {iteration}:\n\n{iteration_content}"
+                            else:
+                                current_knowledge = iteration_content
+
+                        finding = {
+                            "phase": f"Iteration {iteration}",
+                            "content": iteration_content,
+                            "question": query,
+                            "search_results": iteration_search_results,
+                            "documents": citation_result.get("documents", []),
+                        }
+                        findings.append(finding)
+
+                        # Add documents to repository
+                        if "documents" in citation_result:
+                            self.findings_repository.add_documents(
+                                citation_result["documents"]
+                            )
+
+                # Mark iteration as complete
+                iteration_progress = 5 + iteration * (70 / iterations_to_run)
                self._update_progress(
-                    f"Filtered from {len(all_search_results)} to {len(filtered_search_results)} results",
-                    70,
-                    {
-                        "phase": "filtering_complete",
-                        "links_count": len(self.all_links_of_system),
-                    },
+                    f"Completed iteration {iteration}/{iterations_to_run}",
+                    iteration_progress,
+                    {"phase": "iteration_complete", "iteration": iteration},
                )
 
-                # Use filtered results for analysis
-                all_search_results = filtered_search_results
+            # Final synthesis after all iterations
+            self._update_progress(
+                "Generating final synthesis", 80, {"phase": "synthesis"}
+            )
 
-            # Now when we use the citation handler, ensure we're using all_search_results:
+            # Handle final synthesis based on include_text_content flag
            if self.include_text_content:
-                # Use citation handler for analysis of all results together
-                citation_result = self.citation_handler.analyze_initial(
-                    query, all_search_results
-                )
-
-                if citation_result:
-                    synthesized_content = citation_result["content"]
-                    finding = {
+                # Generate a final synthesis from all search results
+                if iterations_to_run > 1:
+                    final_citation_result = self.citation_handler.analyze_initial(
+                        query, all_search_results
+                    )
+                    # Add null check for final_citation_result
+                    if final_citation_result:
+                        synthesized_content = final_citation_result["content"]
+                    else:
+                        synthesized_content = (
+                            "No relevant results found in final synthesis."
+                        )
+                else:
+                    # For single iteration, use the content from findings
+                    synthesized_content = (
+                        findings[0]["content"]
+                        if findings
+                        else "No relevant results found."
+                    )
+                # Add a final synthesis finding
+                final_finding = {
                    "phase": "Final synthesis",
                    "content": synthesized_content,
                    "question": query,
                    "search_results": all_search_results,
-                        "documents": citation_result.get("documents", []),
+                    "documents": [],
                }
-                    findings.append(finding)
-
-                    # Transfer questions to repository
-                    self.findings_repository.set_questions_by_iteration(
-                        self.questions_by_iteration
-                    )
-
-                    # Format findings
-                    formatted_findings = self.findings_repository.format_findings_to_text(
-                        findings, synthesized_content
-                    )
-
-                    # Add documents to repository
-                    if "documents" in citation_result:
-                        self.findings_repository.add_documents(citation_result["documents"])
-                else:
-                    synthesized_content = "No relevant results found."
-                    formatted_findings = synthesized_content
-                    finding = {
-                        "phase": "Error",
-                        "content": "No relevant results found.",
-                        "question": query,
-                        "search_results": all_search_results,
-                        "documents": [],
-                    }
-                    findings.append(finding)
+                findings.append(final_finding)
 
            else:
                # Skip LLM analysis, just format the raw search results
                synthesized_content = "LLM analysis skipped"
-                finding = {
+                final_finding = {
                    "phase": "Raw search results",
                    "content": "LLM analysis was skipped. Displaying raw search results with links.",
                    "question": query,
                    "search_results": all_search_results,
                    "documents": [],
                }
-                findings.append(finding)
+                findings.append(final_finding)
 
-                # Transfer questions to repository
-                self.findings_repository.set_questions_by_iteration(
-                    self.questions_by_iteration
-                )
+            # Transfer questions to repository
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
 
-                # Format findings without synthesis
-                formatted_findings = self.findings_repository.format_findings_to_text(
-                    findings, "Raw search results (LLM analysis skipped)"
-                )
+            # Format findings
+            formatted_findings = self.findings_repository.format_findings_to_text(
+                findings, synthesized_content
+            )
 
        except Exception as e:
            import traceback
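The heart of this hunk is unchanged in spirit from 0.2.0: each question is submitted to a `concurrent.futures.ThreadPoolExecutor` and results are consumed with `as_completed`, but the collected results are now scoped per iteration (`iteration_search_results`) instead of accumulating into one flat pass. A self-contained sketch of that fan-out pattern; `run_search` is a hypothetical stand-in for `self.search.run`:

```python
import concurrent.futures

def run_search(question: str) -> list[dict]:
    """Stand-in for self.search.run(question); returns fake results."""
    return [{"question": question, "link": "https://example.org"}]

def search_all(questions: list[str]) -> dict[str, list[dict]]:
    """Fan out one search per question; collect results as they complete."""
    results: dict[str, list[dict]] = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(questions)) as executor:
        # as_completed yields futures in completion order, not submission
        # order, which is why the diff carries the question inside each
        # result dict (here, a future-to-question map serves the same purpose).
        future_to_question = {executor.submit(run_search, q): q for q in questions}
        for future in concurrent.futures.as_completed(future_to_question):
            question = future_to_question[future]
            try:
                results[question] = future.result() or []
            except Exception:
                # Mirror the diff: a failed search yields an empty result list.
                results[question] = []
    return results

print(search_all(["main query", "follow-up question 1", "follow-up question 2"]))
```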
@@ -305,8 +427,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
        return {
            "findings": findings,
-            "iterations": 1,
-            "questions": self.questions_by_iteration,
+            "iterations": iterations_to_run,
+            "questions_by_iteration": self.questions_by_iteration,
            "formatted_findings": formatted_findings,
            "current_knowledge": synthesized_content,
        }
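Note the two changed keys in the return value: `iterations` now reports the configured iteration count rather than a hard-coded `1`, and `questions` has become `questions_by_iteration`, a dict keyed by iteration number. Callers written against 0.2.0 need updating accordingly; a hypothetical consumer, assuming `strategy` is an already-configured `ParallelSearchStrategy` instance:

```python
# Hypothetical caller; `strategy` construction is omitted and assumed.
result = strategy.analyze_topic("impact of solar storms on satellites")

print(f"ran {result['iterations']} iteration(s)")
for iteration, questions in result["questions_by_iteration"].items():
    print(f"iteration {iteration}: {len(questions)} generated question(s)")

print(result["formatted_findings"])
```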