local-deep-research 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. local_deep_research/__init__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +5 -1
  3. local_deep_research/advanced_search_system/strategies/base_strategy.py +5 -2
  4. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +23 -16
  5. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +273 -144
  6. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +4 -3
  7. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +402 -0
  8. local_deep_research/advanced_search_system/strategies/standard_strategy.py +8 -4
  9. local_deep_research/api/research_functions.py +0 -46
  10. local_deep_research/citation_handler.py +16 -20
  11. local_deep_research/config/llm_config.py +25 -68
  12. local_deep_research/config/search_config.py +8 -21
  13. local_deep_research/defaults/default_settings.json +3814 -0
  14. local_deep_research/search_system.py +46 -32
  15. local_deep_research/utilities/db_utils.py +22 -3
  16. local_deep_research/utilities/search_utilities.py +10 -7
  17. local_deep_research/web/app.py +3 -23
  18. local_deep_research/web/app_factory.py +1 -25
  19. local_deep_research/web/database/migrations.py +20 -418
  20. local_deep_research/web/routes/settings_routes.py +75 -364
  21. local_deep_research/web/services/research_service.py +47 -43
  22. local_deep_research/web/services/settings_manager.py +108 -315
  23. local_deep_research/web/services/settings_service.py +3 -56
  24. local_deep_research/web/static/js/components/research.js +1 -1
  25. local_deep_research/web/static/js/components/settings.js +16 -4
  26. local_deep_research/web/static/js/research_form.js +106 -0
  27. local_deep_research/web/templates/pages/research.html +3 -2
  28. local_deep_research/web_search_engines/engines/meta_search_engine.py +13 -18
  29. local_deep_research/web_search_engines/engines/search_engine_local.py +11 -2
  30. local_deep_research/web_search_engines/engines/search_engine_local_all.py +7 -11
  31. local_deep_research/web_search_engines/search_engine_factory.py +12 -64
  32. local_deep_research/web_search_engines/search_engines_config.py +123 -64
  33. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/METADATA +16 -1
  34. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/RECORD +37 -38
  35. local_deep_research/config/config_files.py +0 -245
  36. local_deep_research/defaults/local_collections.toml +0 -53
  37. local_deep_research/defaults/main.toml +0 -80
  38. local_deep_research/defaults/search_engines.toml +0 -291
  39. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/WHEEL +0 -0
  40. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/entry_points.txt +0 -0
  41. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/advanced_search_system/strategies/source_based_strategy.py
@@ -0,0 +1,402 @@
+ import concurrent.futures
+ import logging
+ from typing import Dict
+
+ from ...citation_handler import CitationHandler
+ from ...config.llm_config import get_llm
+ from ...config.search_config import get_search
+ from ...utilities.db_utils import get_db_setting
+ from ..filters.cross_engine_filter import CrossEngineFilter
+ from ..findings.repository import FindingsRepository
+ from ..questions.standard_question import StandardQuestionGenerator
+ from .base_strategy import BaseSearchStrategy
+
+ logger = logging.getLogger(__name__)
+
+
+ class SourceBasedSearchStrategy(BaseSearchStrategy):
+     """
+     Source-based search strategy that generates questions based on search results and
+     defers content analysis until final synthesis.
+     """
+
+     def __init__(
+         self,
+         search=None,
+         model=None,
+         citation_handler=None,
+         include_text_content: bool = True,
+         use_cross_engine_filter: bool = True,
+         filter_reorder: bool = True,
+         filter_reindex: bool = True,
+         cross_engine_max_results: int = None,
+         all_links_of_system=None,
+     ):
+         """Initialize with optional dependency injection for testing."""
+         # Pass the links list to the parent class
+         super().__init__(all_links_of_system=all_links_of_system)
+         self.search = search or get_search()
+         self.model = model or get_llm()
+         self.progress_callback = None
+
+         self.questions_by_iteration = {}
+         self.include_text_content = include_text_content
+         self.use_cross_engine_filter = use_cross_engine_filter
+         self.filter_reorder = filter_reorder
+         self.filter_reindex = filter_reindex
+
+         # Get cross_engine_max_results from database if not provided
+         if cross_engine_max_results is None:
+             cross_engine_max_results = get_db_setting(
+                 "search.cross_engine_max_results", 100
+             )
+
+         # Initialize the cross-engine filter
+         self.cross_engine_filter = CrossEngineFilter(
+             model=self.model,
+             max_results=cross_engine_max_results,
+             default_reorder=filter_reorder,
+             default_reindex=filter_reindex,
+         )
+
+         # Set include_full_content on the search engine if it supports it
+         if hasattr(self.search, "include_full_content"):
+             self.search.include_full_content = include_text_content
+
+         # Use provided citation_handler or create one
+         self.citation_handler = citation_handler or CitationHandler(self.model)
+
+         # Initialize components
+         self.question_generator = StandardQuestionGenerator(self.model)
+         self.findings_repository = FindingsRepository(self.model)
+
+     def _format_search_results_as_context(self, search_results):
+         """Format search results into context for question generation."""
+         context_snippets = []
+
+         for i, result in enumerate(
+             search_results[:10]
+         ):  # Limit to prevent context overflow
+             title = result.get("title", "Untitled")
+             snippet = result.get("snippet", "")
+             url = result.get("link", "")
+
+             if snippet:
+                 context_snippets.append(
+                     f"Source {i + 1}: {title}\nURL: {url}\nSnippet: {snippet}"
+                 )
+
+         return "\n\n".join(context_snippets)
+
+     def analyze_topic(self, query: str) -> Dict:
+         """
+         Analyze a topic using source-based search strategy.
+         """
+         logger.info(f"Starting source-based research on topic: {query}")
+         accumulated_search_results_across_all_iterations = (
+             []
+         )  # tracking links across iterations but not global
+         findings = []
+         total_citation_count_before_this_search = len(self.all_links_of_system)
+
+         self._update_progress(
+             "Initializing source-based research",
+             5,
+             {
+                 "phase": "init",
+                 "strategy": "source-based",
+                 "include_text_content": self.include_text_content,
+             },
+         )
+
+         # Check search engine
+         if not self._validate_search_engine():
+             return {
+                 "findings": [],
+                 "iterations": 0,
+                 "questions_by_iteration": {},
+                 "formatted_findings": "Error: Unable to conduct research without a search engine.",
+                 "current_knowledge": "",
+                 "error": "No search engine available",
+             }
+
+         # Determine number of iterations to run
+         iterations_to_run = get_db_setting("search.iterations")
+         logger.debug("Selected amount of iterations: " + str(iterations_to_run))
+         iterations_to_run = int(iterations_to_run)
+         try:
+             filtered_search_results = []
+             total_citation_count_before_this_search = len(self.all_links_of_system)
+             # Run each iteration
+             for iteration in range(1, iterations_to_run + 1):
+                 iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
+
+                 self._update_progress(
+                     f"Starting iteration {iteration}/{iterations_to_run}",
+                     iteration_progress_base,
+                     {"phase": f"iteration_{iteration}", "iteration": iteration},
+                 )
+
+                 # Step 1: Generate or use questions
+                 self._update_progress(
+                     f"Generating search questions for iteration {iteration}",
+                     iteration_progress_base + 5,
+                     {"phase": "question_generation", "iteration": iteration},
+                 )
+
+                 # For first iteration, use initial query
+                 if iteration == 1:
+                     # Generate questions for first iteration
+                     context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                     questions = self.question_generator.generate_questions(
+                         current_knowledge=context,
+                         query=query,
+                         questions_per_iteration=int(
+                             get_db_setting("search.questions_per_iteration")
+                         ),
+                         questions_by_iteration=self.questions_by_iteration,
+                     )
+
+                     # Always include the original query for the first iteration
+                     if query not in questions:
+                         all_questions = [query] + questions
+                     else:
+                         all_questions = questions
+
+                     self.questions_by_iteration[iteration] = all_questions
+                     logger.info(
+                         f"Using questions for iteration {iteration}: {all_questions}"
+                     )
+                 else:
+                     # For subsequent iterations, generate questions based on previous search results
+                     source_context = self._format_search_results_as_context(
+                         filtered_search_results
+                     )
+                     if iteration != 1:
+                         context = f"""Previous search results:\n{source_context}\n\nIteration: {iteration} of {iterations_to_run}"""
+                     elif iterations_to_run == 1:
+                         context = ""
+                     else:
+                         context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                     # Use standard question generator with search results as context
+                     questions = self.question_generator.generate_questions(
+                         current_knowledge=context,
+                         query=query,
+                         questions_per_iteration=int(
+                             get_db_setting("search.questions_per_iteration")
+                         ),
+                         questions_by_iteration=self.questions_by_iteration,
+                     )
+
+                     # Use only the new questions for this iteration's searches
+                     all_questions = questions
+
+                     # Store in questions_by_iteration
+                     self.questions_by_iteration[iteration] = questions
+                     logger.info(
+                         f"Generated questions for iteration {iteration}: {questions}"
+                     )
+
+                 # Step 2: Run all searches in parallel for this iteration
+                 self._update_progress(
+                     f"Running parallel searches for iteration {iteration}",
+                     iteration_progress_base + 10,
+                     {"phase": "parallel_search", "iteration": iteration},
+                 )
+
+                 # Function for thread pool
+                 def search_question(q):
+                     try:
+                         result = self.search.run(q)
+                         return {"question": q, "results": result or []}
+                     except Exception as e:
+                         logger.error(f"Error searching for '{q}': {str(e)}")
+                         return {"question": q, "results": [], "error": str(e)}
+
+                 # Run searches in parallel
+                 with concurrent.futures.ThreadPoolExecutor(
+                     max_workers=len(all_questions)
+                 ) as executor:
+                     futures = [
+                         executor.submit(search_question, q) for q in all_questions
+                     ]
+                     iteration_search_dict = {}
+                     iteration_search_results = []
+
+                     # Process results as they complete
+                     for i, future in enumerate(
+                         concurrent.futures.as_completed(futures)
+                     ):
+                         result_dict = future.result()
+                         question = result_dict["question"]
+                         search_results = result_dict["results"]
+                         iteration_search_dict[question] = search_results
+
+                         self._update_progress(
+                             f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                             iteration_progress_base
+                             + 10
+                             + ((i + 1) / len(all_questions) * 30),
+                             {
+                                 "phase": "search_complete",
+                                 "iteration": iteration,
+                                 "result_count": len(search_results),
+                                 "question": question,
+                             },
+                         )
+
+                         iteration_search_results.extend(search_results)
+
+                 if False and self.use_cross_engine_filter:
+                     self._update_progress(
+                         f"Filtering search results for iteration {iteration}",
+                         iteration_progress_base + 45,
+                         {"phase": "cross_engine_filtering", "iteration": iteration},
+                     )
+
+                     existing_link_count = len(self.all_links_of_system)
+                     logger.info(f"Existing link count: {existing_link_count}")
+                     filtered_search_results = self.cross_engine_filter.filter_results(
+                         iteration_search_results,
+                         query,
+                         reorder=True,
+                         reindex=True,
+                         start_index=existing_link_count,  # Start indexing after existing links
+                     )
+
+                     self._update_progress(
+                         f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                         iteration_progress_base + 50,
+                         {
+                             "phase": "filtering_complete",
+                             "iteration": iteration,
+                             "links_count": len(self.all_links_of_system),
+                         },
+                     )
+                 else:
+                     # Use the search results as they are
+                     filtered_search_results = iteration_search_results
+
+                 # Use filtered results
+                 accumulated_search_results_across_all_iterations.extend(
+                     filtered_search_results
+                 )
+
+                 # Create a lightweight finding for this iteration's search metadata (no text content)
+                 finding = {
+                     "phase": f"Iteration {iteration}",
+                     "content": f"Searched with {len(all_questions)} questions, found {len(filtered_search_results)} results.",
+                     "question": query,
+                     "documents": [],
+                 }
+                 findings.append(finding)
+
+                 # Mark iteration as complete
+                 iteration_progress = 5 + iteration * (70 / iterations_to_run)
+                 self._update_progress(
+                     f"Completed iteration {iteration}/{iterations_to_run}",
+                     iteration_progress,
+                     {"phase": "iteration_complete", "iteration": iteration},
+                 )
+
+             # Do we need this filter?
+             if self.use_cross_engine_filter:
+                 # Final filtering of all accumulated search results
+                 self._update_progress(
+                     "Performing final filtering of all results",
+                     80,
+                     {"phase": "final_filtering"},
+                 )
+                 final_filtered_results = self.cross_engine_filter.filter_results(
+                     accumulated_search_results_across_all_iterations,
+                     query,
+                     reorder=True,  # Always reorder in final filtering
+                     reindex=True,  # Always reindex in final filtering
+                     max_results=int(get_db_setting("search.final_max_results") or 100),
+                     start_index=len(self.all_links_of_system),
+                 )
+                 self._update_progress(
+                     f"Filtered from {len(accumulated_search_results_across_all_iterations)} to {len(final_filtered_results)} results",
+                     iteration_progress_base + 85,
+                     {
+                         "phase": "filtering_complete",
+                         "iteration": iteration,
+                         "links_count": len(self.all_links_of_system),
+                     },
+                 )
+             else:
+                 final_filtered_results = filtered_search_results
+             # links = extract_links_from_search_results()
+             self.all_links_of_system.extend(final_filtered_results)
+
+             # Final synthesis after all iterations
+             self._update_progress(
+                 "Generating final synthesis", 90, {"phase": "synthesis"}
+             )
+
+             # Final synthesis
+             final_citation_result = self.citation_handler.analyze_followup(
+                 query,
+                 final_filtered_results,
+                 previous_knowledge="",  # Empty string as we don't need previous knowledge here
+                 nr_of_links=total_citation_count_before_this_search,
+             )
+
+             # Add null check for final_citation_result
+             if final_citation_result:
+                 synthesized_content = final_citation_result["content"]
+                 documents = final_citation_result.get("documents", [])
+             else:
+                 synthesized_content = "No relevant results found in final synthesis."
+                 documents = []
+
+             # Add a final synthesis finding
+             final_finding = {
+                 "phase": "Final synthesis",
+                 "content": synthesized_content,
+                 "question": query,
+                 "search_results": self.all_links_of_system,
+                 "documents": documents,
+             }
+             findings.append(final_finding)
+
+             # Add documents to repository
+             self.findings_repository.add_documents(documents)
+
+             # Transfer questions to repository
+             self.findings_repository.set_questions_by_iteration(
+                 self.questions_by_iteration
+             )
+
+             # Format findings
+             formatted_findings = self.findings_repository.format_findings_to_text(
+                 findings, synthesized_content
+             )
+
+         except Exception as e:
+             import traceback
+
+             error_msg = f"Error in research process: {str(e)}"
+             logger.error(error_msg)
+             logger.error(traceback.format_exc())
+             synthesized_content = f"Error: {str(e)}"
+             formatted_findings = f"Error: {str(e)}"
+             finding = {
+                 "phase": "Error",
+                 "content": synthesized_content,
+                 "question": query,
+                 "search_results": [],
+                 "documents": [],
+             }
+             findings.append(finding)
+
+         self._update_progress("Research complete", 100, {"phase": "complete"})
+
+         return {
+             "findings": findings,
+             "iterations": iterations_to_run,
+             "questions_by_iteration": self.questions_by_iteration,
+             "formatted_findings": formatted_findings,
+             "current_knowledge": synthesized_content,
+             "all_links_of_system": self.all_links_of_system,
+         }
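
The new SourceBasedSearchStrategy is the headline addition of 0.3.0. Based only on the constructor and analyze_topic shown above, a minimal usage sketch might look like the following; the query string and printed fields are illustrative, and the LLM and search engine are assumed to come from the package's get_llm()/get_search() defaults:

    from local_deep_research.advanced_search_system.strategies.source_based_strategy import (
        SourceBasedSearchStrategy,
    )

    # Construct with defaults: get_search()/get_llm() supply the engine and model.
    strategy = SourceBasedSearchStrategy(
        include_text_content=True,     # forwarded to the engine's include_full_content
        use_cross_engine_filter=True,  # enables the final cross-engine filtering pass
    )

    result = strategy.analyze_topic("example research question")

    # analyze_topic() returns a plain dict; these keys appear in the diff above.
    print(result["formatted_findings"])
    print(len(result["all_links_of_system"]), "links collected")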
local_deep_research/advanced_search_system/strategies/standard_strategy.py
@@ -3,7 +3,6 @@ import logging
  from typing import Dict
 
  from ...citation_handler import CitationHandler
- from ...config.config_files import settings
  from ...config.llm_config import get_llm
  from ...config.search_config import get_search
  from ...utilities.db_utils import get_db_setting
@@ -20,11 +19,17 @@ logger = logging.getLogger(__name__)
  class StandardSearchStrategy(BaseSearchStrategy):
      """Standard iterative search strategy that generates follow-up questions."""
 
-     def __init__(self, search=None, model=None, citation_handler=None):
+     def __init__(
+         self, search=None, model=None, citation_handler=None, all_links_of_system=None
+     ):
          """Initialize with optional dependency injection for testing."""
+         super().__init__(all_links_of_system=all_links_of_system)
          self.search = search or get_search()
          self.model = model or get_llm()
+
+         # Get iterations setting
          self.max_iterations = int(get_db_setting("search.iterations"))
+
          self.questions_per_iteration = int(
              get_db_setting("search.questions_per_iteration")
          )
@@ -43,7 +48,6 @@ class StandardSearchStrategy(BaseSearchStrategy):
 
          # Initialize other attributes
          self.progress_callback = None
-         self.all_links_of_system = list()
 
      def _update_progress(
          self, message: str, progress_percent: int = None, metadata: dict = None
@@ -117,7 +121,7 @@ Iteration: {iteration + 1} of {total_iterations}"""
             question_count = len(questions)
             knowledge_accumulation = get_db_setting(
                 "general.knowledge_accumulation",
-                settings.general.knowledge_accumulation,
+                "ITERATION",
             )
             for q_idx, question in enumerate(questions):
                 question_progress_base = iteration_progress_base + (
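
The "ITERATION" literal above is one instance of a pattern that runs through this release: defaults that previously lived in config_files.py and the removed TOML files (main.toml, search_engines.toml, local_collections.toml) now sit inline as fallbacks to database-backed lookups. A minimal sketch of the pattern, assuming get_db_setting(key, default) returns the stored value when present and the fallback otherwise:

    from local_deep_research.utilities.db_utils import get_db_setting

    # The stored setting wins; the second argument replaces the old TOML default.
    knowledge_accumulation = get_db_setting("general.knowledge_accumulation", "ITERATION")

    # Calls without a fallback (as in the strategies above) assume the key is set.
    iterations = int(get_db_setting("search.iterations"))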
local_deep_research/api/research_functions.py
@@ -4,11 +4,8 @@ Provides programmatic access to search and research capabilities.
  """
 
  import logging
- import os
  from typing import Any, Callable, Dict, Optional
 
- import toml
-
  from ..config.llm_config import get_llm
  from ..config.search_config import get_search
  from ..report_generator import IntegratedReportGenerator
@@ -279,46 +276,3 @@ def analyze_documents(
      logger.info(f"Analysis saved to {output_file}")
 
      return analysis_result
-
-
- def get_available_search_engines() -> Dict[str, str]:
-     """
-     Get a dictionary of available search engines.
-
-     Returns:
-         Dictionary mapping engine names to descriptions
-     """
-
-     from ..web_search_engines.search_engine_factory import get_available_engines
-
-     engines = get_available_engines()
-
-     # Add some descriptions for common engines
-     descriptions = {
-         "auto": "Automatic selection based on query type",
-         "wikipedia": "Wikipedia articles and general knowledge",
-         "arxiv": "Scientific papers and research",
-         "pubmed": "Medical and biomedical literature",
-         "semantic_scholar": "Academic papers across all fields",
-         "github": "Code repositories and technical documentation",
-         "local_all": "All local document collections",
-     }
-
-     return {engine: descriptions.get(engine, "Search engine") for engine in engines}
-
-
- def get_available_collections() -> Dict[str, Dict[str, Any]]:
-     """
-     Get a dictionary of available local document collections.
-
-     Returns:
-         Dictionary mapping collection names to their configuration
-     """
-
-     from ..config.config_files import LOCAL_COLLECTIONS_FILE
-
-     if os.path.exists(LOCAL_COLLECTIONS_FILE):
-         collections = toml.load(LOCAL_COLLECTIONS_FILE)
-         return collections
-
-     return {}
local_deep_research/citation_handler.py
@@ -4,7 +4,6 @@ from typing import Any, Dict, List, Union
 
  from langchain_core.documents import Document
 
- from .config.config_files import settings
  from .utilities.db_utils import get_db_setting
 
 
@@ -82,21 +81,19 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
          formatted_sources = self._format_sources(documents)
          # Add fact-checking step
          fact_check_prompt = f"""Analyze these sources for factual consistency:
-         1. Cross-reference major claims between sources
-         2. Identify and flag any contradictions
-         3. Verify basic facts (dates, company names, ownership)
-         4. Note when sources disagree
+ 1. Cross-reference major claims between sources
+ 2. Identify and flag any contradictions
+ 3. Verify basic facts (dates, company names, ownership)
+ 4. Note when sources disagree
 
-         Previous Knowledge:
-         {previous_knowledge}
+ Previous Knowledge:
+ {previous_knowledge}
 
-         New Sources:
-         {formatted_sources}
+ New Sources:
+ {formatted_sources}
 
  Return any inconsistencies or conflicts found."""
-         if get_db_setting(
-             "general.enable_fact_checking", settings.general.enable_fact_checking
-         ):
+         if get_db_setting("general.enable_fact_checking", True):
              fact_check_response = self.llm.invoke(fact_check_prompt).content
 
          else:
@@ -104,16 +101,15 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
 
          prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.
 
-         Previous Knowledge:
-         {previous_knowledge}
+ Previous Knowledge:
+ {previous_knowledge}
 
-         Question: {question}
+ Question: {question}
 
-         New Sources:
-         {formatted_sources}
-         Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
-         Provide a detailed answer with citations. Example format: "According to [1], ..."
-         """
+ New Sources:
+ {formatted_sources}
+ Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
+ Provide a detailed answer with citations. Example format: "According to [1], ..." """
 
          response = self.llm.invoke(prompt)
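
The prompt reflow above leaves the handler's interface unchanged. Going by the call site in source_based_strategy.py earlier in this diff, analyze_followup takes the query, a list of result dicts, prior knowledge, and a citation-number offset, and the null check there implies a dict result with "content" and "documents" keys. A hedged sketch (the sample search results below are hypothetical):

    from local_deep_research.citation_handler import CitationHandler
    from local_deep_research.config.llm_config import get_llm

    handler = CitationHandler(get_llm())

    # Hypothetical results in the title/link/snippet shape the strategies consume.
    search_results = [
        {"title": "Example Source", "link": "https://example.org", "snippet": "..."}
    ]

    citation_result = handler.analyze_followup(
        "example query",
        search_results,
        previous_knowledge="",  # the source-based strategy passes an empty string
        nr_of_links=0,          # offset so citation numbers continue across searches
    )
    if citation_result:
        answer = citation_result["content"]              # cited synthesis
        documents = citation_result.get("documents", [])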