local-deep-research 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,6 +145,19 @@ def init_config_files():
     if not os.path.exists(search_engines_file) and os.path.exists(default_engines):
         shutil.copyfile(default_engines, search_engines_file)
         logger.info(f"Created search_engines.toml at {search_engines_file}")
+
+    # Create .env.template if it doesn't exist
+    env_template_file = CONFIG_DIR / ".env.template"
+    if not env_template_file.exists():
+        shutil.copy(defaults_dir / ".env.template", env_template_file)
+        logger.info(f"Created .env.template at {env_template_file}")
+
+    # Optionally create an empty .env file if it doesn't exist
+    env_file = CONFIG_DIR / ".env"
+    if not env_file.exists():
+        with open(env_file, "w") as f:
+            f.write("# Add your environment variables here\n")
+        logger.info(f"Created empty .env file at {env_file}")
 except Exception as e:
     logger.error(f"Error initializing Windows config files: {e}")
 else:
@@ -183,7 +196,17 @@ def init_config_files():
     if not search_engines_file.exists():
         shutil.copy(defaults_dir / "search_engines.toml", search_engines_file)
         logger.info(f"Created search_engines.toml at {search_engines_file}")
+    env_template_file = CONFIG_DIR / ".env.template"
+    if not env_template_file.exists():
+        shutil.copy(defaults_dir / ".env.template", env_template_file)
+        logger.info(f"Created .env.template at {env_template_file}")
 
+    # Optionally create an empty .env file if it doesn't exist
+    env_file = CONFIG_DIR / ".env"
+    if not env_file.exists():
+        with open(env_file, "w") as f:
+            f.write("# Add your environment variables here\n")
+        logger.info(f"Created empty .env file at {env_file}")
     secrets_file = CONFIG_DIR / ".secrets.toml"
     if not secrets_file.exists():
         with open(secrets_file, "w") as f:
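
Both platform branches now bootstrap a `.env.template` and an empty `.env` next to the TOML files. For orientation, a minimal sketch of how such a file can be picked up, assuming a dynaconf-style settings stack (the settings.toml/.secrets.toml names and `LDR` prefix are inferred from the README notes further down, not code from the package):

```python
# Minimal sketch, not package code: consuming the generated config files.
from dynaconf import Dynaconf

settings = Dynaconf(
    settings_files=["settings.toml", ".secrets.toml"],  # TOML files created above
    load_dotenv=True,       # also reads the .env created in CONFIG_DIR
    envvar_prefix="LDR",    # assumed prefix, matching the LDR_WEB__PORT example below
)
```
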
@@ -12,78 +12,50 @@ from langchain_community.llms import VLLM
 from local_deep_research.config import settings
 import os
 import logging
-from enum import Enum, auto
 
 # Initialize environment
 logger = logging.getLogger(__name__)
 
-# Provider enum
-class ModelProvider(Enum):
-    OLLAMA = auto()
-    OPENAI = auto()
-    ANTHROPIC = auto()
-    VLLM = auto()
-    OPENAI_ENDPOINT = auto()
-    NONE = auto()
-
-# ================================
-# USER CONFIGURATION SECTION
-# ================================
-
-# Set your preferred model provider here
-DEFAULT_PROVIDER = ModelProvider.OLLAMA  # Change this to your preferred provider
-
-# Set your default model name here
-DEFAULT_MODEL = "gemma3:12b"  # Your default model
-
-# Set default model parameters
-DEFAULT_TEMPERATURE = 0.7
-MAX_TOKENS = 30000
-
-# Server URLs
-OPENAI_ENDPOINT_URL = "https://openrouter.ai/api/v1"  # For OpenRouter or compatible services
-OLLAMA_BASE_URL = "http://localhost:11434"  # URL for Ollama server
-
-
-
+# Valid provider options
+VALID_PROVIDERS = ["ollama", "openai", "anthropic", "vllm", "openai_endpoint", "lmstudio", "llamacpp", "none"]
 
 # ================================
 # LLM FUNCTIONS
 # ================================
 
-
-
-
-
 def get_llm(model_name=None, temperature=None, provider=None):
     """
     Get LLM instance based on model name and provider.
 
     Args:
-        model_name: Name of the model to use (if None, uses DEFAULT_MODEL)
-        temperature: Model temperature (if None, uses DEFAULT_TEMPERATURE)
-        provider: Provider to use (if None, uses DEFAULT_PROVIDER)
+        model_name: Name of the model to use (if None, uses settings.llm.model)
+        temperature: Model temperature (if None, uses settings.llm.temperature)
+        provider: Provider to use (if None, uses settings.llm.provider)
 
     Returns:
         A LangChain LLM instance
     """
+    # Use settings values for parameters if not provided
     if model_name is None:
-        model_name = DEFAULT_MODEL
+        model_name = settings.llm.model
 
     if temperature is None:
-        temperature = DEFAULT_TEMPERATURE
+        temperature = settings.llm.temperature
 
     if provider is None:
-        provider = DEFAULT_PROVIDER
+        provider = settings.llm.provider.lower()
+        if provider not in VALID_PROVIDERS:
+            logger.error(f"Invalid provider in settings: {provider}")
+            raise ValueError(f"Invalid provider: {provider}. Must be one of: {VALID_PROVIDERS}")
 
     # Common parameters for all models
     common_params = {
         "temperature": temperature,
-        "max_tokens": MAX_TOKENS,
+        "max_tokens": settings.llm.max_tokens,
     }
 
     # Handle different providers
-    if provider == ModelProvider.ANTHROPIC:
+    if provider == "anthropic":
         api_key = settings.get('ANTHROPIC_API_KEY', '')
         if not api_key:
             logger.warning("ANTHROPIC_API_KEY not found. Falling back to default model.")
@@ -93,7 +65,7 @@ def get_llm(model_name=None, temperature=None, provider=None):
             model=model_name, anthropic_api_key=api_key, **common_params
         )
 
-    elif provider == ModelProvider.OPENAI:
+    elif provider == "openai":
         api_key = settings.get('OPENAI_API_KEY', '')
         if not api_key:
             logger.warning("OPENAI_API_KEY not found. Falling back to default model.")
@@ -101,21 +73,24 @@ def get_llm(model_name=None, temperature=None, provider=None):
 
         return ChatOpenAI(model=model_name, api_key=api_key, **common_params)
 
-    elif provider == ModelProvider.OPENAI_ENDPOINT:
-        api_key = settings.OPENAI_ENDPOINT_API_KEY
+    elif provider == "openai_endpoint":
+        api_key = settings.get('OPENAI_ENDPOINT_API_KEY', '')
 
         if not api_key:
             logger.warning("OPENAI_ENDPOINT_API_KEY not found. Falling back to default model.")
             return get_fallback_model(temperature)
 
+        # Get endpoint URL from settings
+        openai_endpoint_url = settings.llm.openai_endpoint_url
+
         return ChatOpenAI(
             model=model_name,
             api_key=api_key,
-            openai_api_base=OPENAI_ENDPOINT_URL,
+            openai_api_base=openai_endpoint_url,
             **common_params
         )
 
-    elif provider == ModelProvider.VLLM:
+    elif provider == "vllm":
         try:
             return VLLM(
                 model=model_name,
@@ -130,19 +105,60 @@ def get_llm(model_name=None, temperature=None, provider=None):
             logger.warning("Falling back.")
             return get_fallback_model(temperature)
 
-    elif provider == ModelProvider.OLLAMA:
+    elif provider == "ollama":
         try:
             # Use the configurable Ollama base URL
-            base_url = settings.get('OLLAMA_BASE_URL', OLLAMA_BASE_URL)
+            base_url = settings.get('OLLAMA_BASE_URL', settings.llm.get('ollama_base_url', 'http://localhost:11434'))
             return ChatOllama(model=model_name, base_url=base_url, **common_params)
         except Exception as e:
             logger.error(f"Error loading Ollama model: {e}")
             return get_fallback_model(temperature)
 
+    elif provider == "lmstudio":
+
+        # LM Studio supports OpenAI API format, so we can use ChatOpenAI directly
+        lmstudio_url = settings.llm.get('lmstudio_url', "http://localhost:1234")
+
+        return ChatOpenAI(
+            model=model_name,
+            api_key="lm-studio",  # LM Studio doesn't require a real API key
+            base_url=f"{lmstudio_url}/v1",  # Use the configured URL with /v1 endpoint
+            temperature=temperature,
+            max_tokens=settings.llm.max_tokens
+        )
+
+
+    elif provider == "llamacpp":
+
+        # Import LlamaCpp
+        from langchain_community.llms import LlamaCpp
+
+        # Get LlamaCpp model path from settings
+        model_path = settings.llm.get('llamacpp_model_path', "")
+        if not model_path:
+            logger.error("llamacpp_model_path not set in settings")
+            raise ValueError("llamacpp_model_path not set in settings.toml")
+
+        # Get additional LlamaCpp parameters
+        n_gpu_layers = settings.llm.get('llamacpp_n_gpu_layers', 1)
+        n_batch = settings.llm.get('llamacpp_n_batch', 512)
+        f16_kv = settings.llm.get('llamacpp_f16_kv', True)
+
+        # Create LlamaCpp instance
+        return LlamaCpp(
+            model_path=model_path,
+            temperature=temperature,
+            max_tokens=settings.llm.max_tokens,
+            n_gpu_layers=n_gpu_layers,
+            n_batch=n_batch,
+            f16_kv=f16_kv,
+            verbose=True
+        )
+
     else:
         return get_fallback_model(temperature)
 
-def get_fallback_model(temperature=DEFAULT_TEMPERATURE):
+def get_fallback_model(temperature=None):
     """Create a dummy model for when no providers are available"""
     from langchain_community.llms.fake import FakeListLLM
     return FakeListLLM(
@@ -169,6 +185,12 @@ def get_available_provider_types():
     if is_openai_endpoint_available():
         providers["openai_endpoint"] = "OpenAI-compatible Endpoint"
 
+    if is_lmstudio_available():
+        providers["lmstudio"] = "LM Studio (local models)"
+
+    if is_llamacpp_available():
+        providers["llamacpp"] = "LlamaCpp (local models)"
+
     # Check for VLLM capability
     try:
         import torch
@@ -183,9 +205,6 @@ def get_available_provider_types():
 
     return providers
 
-
-
-
 # ================================
 # HELPER FUNCTIONS
 # ================================
@@ -193,7 +212,7 @@ def get_available_provider_types():
 def is_openai_available():
     """Check if OpenAI is available"""
     try:
-        api_key = settings.api_keys.get('OPENAI_API_KEY', '')
+        api_key = settings.get('OPENAI_API_KEY', '')
         return bool(api_key)
     except:
         return False
@@ -201,18 +220,15 @@ def is_openai_available():
 def is_anthropic_available():
     """Check if Anthropic is available"""
     try:
-        api_key = settings.api_keys.get('ANTHROPIC_API_KEY', '')
+        api_key = settings.get('ANTHROPIC_API_KEY', '')
         return bool(api_key)
     except:
         return False
 
-
-
 def is_openai_endpoint_available():
     """Check if OpenAI endpoint is available"""
-
     try:
-        api_key = settings.OPENAI_ENDPOINT_API_KEY
+        api_key = settings.get('OPENAI_ENDPOINT_API_KEY', '')
         return bool(api_key)
     except:
         return False
@@ -221,7 +237,7 @@ def is_ollama_available():
     """Check if Ollama is running"""
     try:
         import requests
-        base_url = settings.get('OLLAMA_BASE_URL', OLLAMA_BASE_URL)
+        base_url = settings.get('OLLAMA_BASE_URL', settings.llm.get('ollama_base_url', 'http://localhost:11434'))
         response = requests.get(f"{base_url}/api/tags", timeout=1.0)
         return response.status_code == 200
     except:
@@ -236,34 +252,35 @@ def is_vllm_available():
     except ImportError:
         return False
 
+def is_lmstudio_available():
+    """Check if LM Studio is available"""
+    try:
+        import requests
+        lmstudio_url = settings.llm.get('lmstudio_url', 'http://localhost:1234')
+        # LM Studio typically uses OpenAI-compatible endpoints
+        response = requests.get(f"{lmstudio_url}/v1/models", timeout=1.0)
+        return response.status_code == 200
+    except:
+        return False
+
+def is_llamacpp_available():
+    """Check if LlamaCpp is available and configured"""
+    try:
+        from langchain_community.llms import LlamaCpp
+        model_path = settings.llm.get('llamacpp_model_path', '')
+        return bool(model_path) and os.path.exists(model_path)
+    except:
+        return False
+
 def get_available_providers():
     """Get dictionary of available providers"""
-    providers = {}
-
-    if is_ollama_available():
-        providers[ModelProvider.OLLAMA] = "Ollama (local models)"
-
-    if is_openai_available():
-        providers[ModelProvider.OPENAI] = "OpenAI API"
-
-    if is_anthropic_available():
-        providers[ModelProvider.ANTHROPIC] = "Anthropic API"
-
-    if is_openai_endpoint_available():
-        providers[ModelProvider.OPENAI_ENDPOINT] = "OpenAI-compatible Endpoint"
-
-    if is_vllm_available():
-        providers[ModelProvider.VLLM] = "VLLM (local models)"
-
-    if not providers:
-        providers[ModelProvider.NONE] = "No model providers available"
-
-    return providers
+    return get_available_provider_types()
 
 # Log which providers are available
 AVAILABLE_PROVIDERS = get_available_providers()
-logger.info(f"Available providers: {[p.name for p in AVAILABLE_PROVIDERS.keys()]}")
+logger.info(f"Available providers: {list(AVAILABLE_PROVIDERS.keys())}")
 
 # Check if selected provider is available
-if DEFAULT_PROVIDER not in AVAILABLE_PROVIDERS and DEFAULT_PROVIDER != ModelProvider.NONE:
-    logger.warning(f"Selected provider {DEFAULT_PROVIDER.name} is not available.")
+selected_provider = settings.llm.provider.lower()
+if selected_provider not in AVAILABLE_PROVIDERS and selected_provider != "none":
+    logger.warning(f"Selected provider {selected_provider} is not available.")
@@ -1,6 +1,3 @@
-# Default local document collections configuration
-# Each collection functions as an independent search engine
-
 # Project Documents Collection
 [project_docs]
 name = "Project Documents"
@@ -15,6 +12,9 @@ max_filtered_results = 5
 chunk_size = 1000
 chunk_overlap = 200
 cache_dir = "__CACHE_DIR__/local_search/project_docs"
+strengths = ["project documentation", "specifications", "internal documents"]
+weaknesses = ["no external information", "limited to organizational knowledge"]
+reliability = 0.9
 
 # Research Papers Collection
 [research_papers]
@@ -30,6 +30,9 @@ max_filtered_results = 5
 chunk_size = 800
 chunk_overlap = 150
 cache_dir = "__CACHE_DIR__/local_search/research_papers"
+strengths = ["academic research", "scientific papers", "scholarly content"]
+weaknesses = ["potentially outdated", "limited to collected papers"]
+reliability = 0.85
 
 # Personal Notes Collection
 [personal_notes]
@@ -44,4 +47,7 @@ max_results = 30
 max_filtered_results = 10
 chunk_size = 500
 chunk_overlap = 100
-cache_dir = "__CACHE_DIR__/local_search/personal_notes"
+cache_dir = "__CACHE_DIR__/local_search/personal_notes"
+strengths = ["personal knowledge", "notes", "private documents"]
+weaknesses = ["subjective content", "informal information"]
+reliability = 0.75
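
Each collection now carries the same `strengths`/`weaknesses`/`reliability` metadata as the built-in web engines, which is what lets the meta search engine rank local collections alongside them (see the meta_search_engine.py hunk below). A standalone toy rendering of how these fields become a prompt line, mirroring that code:

```python
# Toy example mirroring MetaSearchEngine's description building.
cfg = {
    "strengths": ["project documentation", "specifications", "internal documents"],
    "weaknesses": ["no external information", "limited to organizational knowledge"],
    "reliability": 0.9,
}
line = (f"- PROJECT_DOCS: Good for {', '.join(cfg['strengths'][:3])}. "
        f"Weaknesses: {', '.join(cfg['weaknesses'][:2])}. "
        f"Reliability: {cfg['reliability'] * 100:.0f}%")
print(line)
```
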
@@ -1,23 +1,41 @@
 
 # Main configuration for Local Deep Research
 
+[web]
+port = 5000
+host = "0.0.0.0"
+debug = true
+
+[llm]
+# LLM provider (one of: ollama, openai, anthropic, vllm, openai_endpoint, lmstudio, llamacpp)
+provider = "ollama"
+# Model name
+model = "gemma3:12b"
+# Temperature
+temperature = 0.7
+# Maximum tokens
+max_tokens = 30000
+# OpenAI-compatible endpoint URL
+openai_endpoint_url = "https://openrouter.ai/api/v1"
+# LM Studio URL (default: http://localhost:1234)
+lmstudio_url = "http://localhost:1234"
+# LlamaCpp model path
+llamacpp_model_path = ""
+# LlamaCpp parameters
+llamacpp_n_gpu_layers = 1
+llamacpp_n_batch = 512
+llamacpp_f16_kv = true
+
 [general]
 # Directory for research outputs (relative to user data directory)
 output_dir = "research_outputs"
-
 # Knowledge accumulation approach (NONE, QUESTION, or ITERATION)
 knowledge_accumulation = "ITERATION"
-
 # Maximum context size for knowledge accumulation
 knowledge_accumulation_context_limit = 2000000
-
 # Enable fact checking (experimental, works better with large LLMs)
 enable_fact_checking = false
 
-[web]
-port = 5000
-host = "0.0.0.0"
-debug = true
 
 [search]
 # Search tool to use (auto, wikipedia, arxiv, duckduckgo, serp, google_pse, etc.)
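
Per the updated README below, any of these keys can also be overridden per process with `LDR_`-prefixed environment variables, where a double underscore separates the table name from the key. For example (the `[web]` override is documented; the `[llm]` one is an assumed analogue of the same pattern):

```python
# Environment-variable overrides of settings.toml values.
import os

os.environ["LDR_WEB__PORT"] = "8080"          # documented: overrides [web] port
os.environ["LDR_LLM__PROVIDER"] = "lmstudio"  # assumed analogue for [llm] provider
```
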
@@ -0,0 +1,29 @@
+# Sound Files for Notifications
+
+This directory contains sound files used for notifications in the Deep Research application.
+
+## Required Files
+
+1. `success.mp3` - Played when research completes successfully
+2. `error.mp3` - Played when research fails or encounters an error
+
+## Sound Sources
+
+You can download copyright-free sound files from these sources:
+
+- [Freesound](https://freesound.org/)
+- [Free Sound Library](https://www.freesoundslibrary.com/)
+
+## Recommended Sounds
+
+### Success Sound
+- [Success Sound by grunz](https://freesound.org/people/grunz/sounds/109662/)
+- Direct download: https://freesound.org/data/previews/109/109662_945474-lq.mp3
+
+### Error Sound
+- [Error Sound by Autistic Lucario](https://freesound.org/people/Autistic%20Lucario/sounds/142608/)
+- Direct download: https://freesound.org/data/previews/142/142608_1840739-lq.mp3
+
+## Usage
+
+The application will automatically use these sounds when research tasks complete or fail, but only when the browser tab is not in focus.
@@ -91,14 +91,23 @@ class MetaSearchEngine(BaseSearchEngine):
         if not self.available_engines:
             logger.warning("No search engines available")
             return []
-
-        # Create engine descriptions for the prompt
-        engine_descriptions = "\n".join([
-            f"- {name.upper()}: Good for {', '.join(SEARCH_ENGINES[name]['strengths'][:3])}. "
-            f"Weaknesses: {', '.join(SEARCH_ENGINES[name]['weaknesses'][:2])}. "
-            f"Reliability: {SEARCH_ENGINES[name]['reliability']*100:.0f}%"
-            for name in self.available_engines
-        ])
+        engine_descriptions = []
+        for name in self.available_engines:
+            logger.info(f"Processing search engine: {name}")
+            try:
+                description = f"- {name.upper()}: Good for {', '.join(SEARCH_ENGINES[name]['strengths'][:3])}. " \
+                              f"Weaknesses: {', '.join(SEARCH_ENGINES[name]['weaknesses'][:2])}. " \
+                              f"Reliability: {SEARCH_ENGINES[name]['reliability']*100:.0f}%"
+                engine_descriptions.append(description)
+            except KeyError as e:
+                logger.error(f"Missing key for engine {name}: {e}")
+                # Add a basic description for engines with missing configuration
+                engine_descriptions.append(f"- {name.upper()}: General purpose search engine.")
+            except Exception as e:
+                logger.error(f"Error processing engine {name}: {e}")
+                engine_descriptions.append(f"- {name.upper()}: General purpose search engine.")
+
+        engine_descriptions = "\n".join(engine_descriptions)
 
         prompt = f"""Analyze this search query and rank the available search engines in order of most to least appropriate for answering it.
 
@@ -10,6 +10,7 @@ import logging
 import re
 import pickle
 
+from faiss import normalize_L2
 from langchain_core.language_models import BaseLLM
 from langchain_community.document_loaders import (
     PyPDFLoader,
@@ -23,6 +24,7 @@ from langchain_community.document_loaders import (
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores.utils import DistanceStrategy
 from langchain_community.embeddings import (
     HuggingFaceEmbeddings,
     OllamaEmbeddings,
@@ -136,7 +138,8 @@ class LocalEmbeddingManager:
         vector_store = FAISS.load_local(
             str(vector_store_path),
             self.embeddings,
-            allow_dangerous_deserialization=True
+            allow_dangerous_deserialization=True,
+            normalize_L2=True
         )
 
         # Add this code to show document count
@@ -175,6 +178,10 @@ class LocalEmbeddingManager:
 
     def _get_folder_hash(self, folder_path: str) -> str:
         """Generate a hash for a folder based on its path"""
+        # Strip trailing slashes if we have them.
+        if folder_path.endswith("/"):
+            folder_path = folder_path[:-1]
+
         return hashlib.md5(folder_path.encode()).hexdigest()
 
     def _get_index_path(self, folder_path: str) -> Path:
@@ -205,6 +212,32 @@ class LocalEmbeddingManager:
             return True
 
         return False
+
+    def _check_config_changed(self, folder_path: str) -> bool:
+        """
+        Checks if the embedding configuration for a folder has been changed
+        since it was last indexed.
+        """
+        folder_hash = self._get_folder_hash(folder_path)
+
+        if folder_hash not in self.indexed_folders:
+            # It hasn't been indexed at all. That's a new configuration,
+            # technically.
+            return True
+
+        embedding_config = self.indexed_folders[folder_hash]
+        chunk_size = embedding_config.get("chunk_size", 0)
+        chunk_overlap = embedding_config.get("chunk_overlap", 0)
+        embedding_model = embedding_config.get("embedding_model", "")
+
+        if (chunk_size, chunk_overlap, embedding_model) != (
+            self.chunk_size, self.chunk_overlap, self.embedding_model
+        ):
+            logger.info(
+                "Embedding configuration has changed, re-indexing folder."
+            )
+            return True
+        return False
 
     def get_file_loader(self, file_path: str) -> Optional[BaseLoader]:
         """Get an appropriate document loader for a file based on its extension"""
@@ -257,9 +290,10 @@ class LocalEmbeddingManager:
         folder_str = str(folder_path)
         folder_hash = self._get_folder_hash(folder_str)
         index_path = self._get_index_path(folder_str)
-
+
         # Check if folder needs to be reindexed
-        if not force_reindex and not self._check_folder_modified(folder_str):
+        if (not force_reindex and not self._check_folder_modified(folder_str)
+                and not self._check_config_changed(folder_str)):
             logger.info(f"Folder {folder_path} has not been modified since last indexing")
 
             # Load the vector store from disk if not already loaded
@@ -268,7 +302,8 @@ class LocalEmbeddingManager:
                 self.vector_stores[folder_hash] = FAISS.load_local(
                     str(index_path),
                     self.embeddings,
-                    allow_dangerous_deserialization=True
+                    allow_dangerous_deserialization=True,
+                    normalize_L2=True,
                 )
                 logger.info(f"Loaded index for {folder_path} from disk")
             except Exception as e:
@@ -328,7 +363,11 @@ class LocalEmbeddingManager:
 
         # Create vector store
         logger.info(f"Creating vector store with {len(splits)} chunks")
-        vector_store = FAISS.from_documents(splits, self.embeddings)
+        vector_store = FAISS.from_documents(
+            splits,
+            self.embeddings,
+            normalize_L2=True
+        )
 
         # Save the vector store to disk
         logger.info(f"Saving index to {index_path}")
@@ -421,7 +460,8 @@ class LocalEmbeddingManager:
             self.vector_stores[folder_hash] = FAISS.load_local(
                 str(index_path),
                 self.embeddings,
-                allow_dangerous_deserialization=True
+                allow_dangerous_deserialization=True,
+                normalize_L2=True
             )
         except Exception as e:
             logger.error(f"Error loading index for {folder_path}: {e}")
@@ -431,14 +471,14 @@ class LocalEmbeddingManager:
         vector_store = self.vector_stores[folder_hash]
 
         try:
-            docs_with_scores = vector_store.similarity_search_with_score(query, k=limit)
+            docs_with_scores = (
+                vector_store.similarity_search_with_relevance_scores(
+                    query,
+                    k=limit
+                )
+            )
 
-            for doc, score in docs_with_scores:
-                # Convert score from distance to similarity (lower distance = higher similarity)
-                # FAISS cosine distance is in [0, 2], where 0 is identical and 2 is opposite
-                # Convert to a similarity score in [0, 1]
-                similarity = 1.0 - (score / 2.0)
-
+            for doc, similarity in docs_with_scores:
                 # Skip results below the threshold
                 if similarity < score_threshold:
                     continue
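
This simplification is sound because every index is now built and loaded with `normalize_L2=True`: for unit-norm vectors, squared L2 distance and cosine similarity are tied by d² = 2(1 − s), and `similarity_search_with_relevance_scores` returns scores already mapped into [0, 1], so the old manual conversion drops out. A quick numeric check of the identity:

```python
# Sanity check of d^2 = 2 * (1 - s) for unit-norm vectors; numbers are arbitrary.
import numpy as np

a = np.array([3.0, 4.0]); a /= np.linalg.norm(a)
b = np.array([4.0, 3.0]); b /= np.linalg.norm(b)

d_squared = np.sum((a - b) ** 2)  # squared L2 distance between unit vectors
s = 1.0 - d_squared / 2.0         # implied cosine similarity
assert np.isclose(s, a @ b)       # both come out to 0.96
```
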
@@ -491,7 +531,7 @@ class LocalSearchEngine(BaseSearchEngine):
 
     def __init__(
         self,
-        folder_paths: List[str],
+        paths: List[str],
         llm: Optional[BaseLLM] = None,
         max_results: int = 10,
         max_filtered_results: Optional[int] = None,
@@ -509,7 +549,7 @@ class LocalSearchEngine(BaseSearchEngine):
         Initialize the local search engine.
 
         Args:
-            folder_paths: List of folder paths to search in
+            paths: List of folder paths to search in
             llm: Language model for relevance filtering
             max_results: Maximum number of results to return
             max_filtered_results: Maximum results after filtering
@@ -527,21 +567,21 @@ class LocalSearchEngine(BaseSearchEngine):
         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
 
         # Validate folder paths
-        self.folder_paths = folder_paths
+        self.folder_paths = paths
         self.valid_folder_paths = []
-        for path in folder_paths:
+        for path in paths:
             if os.path.exists(path) and os.path.isdir(path):
                 self.valid_folder_paths.append(path)
             else:
                 logger.warning(f"Folder not found or is not a directory: {path}")
 
         # If no valid folders, log a clear message
-        if not self.valid_folder_paths and folder_paths:
-            logger.warning(f"No valid folders found among: {folder_paths}")
+        if not self.valid_folder_paths and paths:
+            logger.warning(f"No valid folders found among: {paths}")
             logger.warning("This search engine will return no results until valid folders are configured")
 
         self.max_results = max_results
-        self.collections = collections or {"default": {"paths": folder_paths, "description": "Default collection"}}
+        self.collections = collections or {"default": {"paths": paths, "description": "Default collection"}}
 
         # Initialize the embedding manager with only valid folders
         self.embedding_manager = LocalEmbeddingManager(
@@ -885,7 +925,7 @@ class LocalSearchEngine(BaseSearchEngine):
         cache_dir = config_dict.get("cache_dir", ".cache/local_search")
 
         return cls(
-            folder_paths=folder_paths,
+            paths=folder_paths,
             collections=collections,
             llm=llm,
             max_results=max_results,
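
Note the breaking rename in the constructor signature: direct callers must now pass `paths=` instead of `folder_paths=`, while `from_config` (above) keeps accepting the old config key and maps it through. Roughly, assuming the package and its embedding dependencies are installed (the folder is a placeholder):

```python
# Hypothetical direct construction; "/data/my_docs" is a placeholder path.
from local_deep_research.web_search_engines.engines.search_engine_local import (
    LocalSearchEngine,
)

engine = LocalSearchEngine(paths=["/data/my_docs"], max_results=5)
```
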
@@ -4,11 +4,13 @@ Search engine that searches across all local collections
 
 import logging
 from typing import Dict, List, Any, Optional
+
+import toml
 from langchain_core.language_models import BaseLLM
 
 from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
 from local_deep_research.web_search_engines.search_engine_factory import create_search_engine
-from local_deep_research import config
+from local_deep_research.config import LOCAL_COLLECTIONS_FILE
 
 # Setup logging
 logger = logging.getLogger(__name__)
@@ -18,7 +20,7 @@ class LocalAllSearchEngine(BaseSearchEngine):
     Search engine that searches across all local document collections.
     Acts as a meta search engine specifically for local collections.
     """
-
+
     def __init__(
         self,
         llm: Optional[BaseLLM] = None,
@@ -41,9 +43,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
         # Find all local collection search engines
         self.local_engines = {}
         try:
-            from local_collections import LOCAL_COLLECTIONS
-
-            for collection_id, collection in LOCAL_COLLECTIONS.items():
+            local_collections = toml.load(LOCAL_COLLECTIONS_FILE)
+
+            for collection_id, collection in local_collections.items():
                 if not collection.get("enabled", True):
                     continue
 
@@ -6,7 +6,7 @@ import logging
 import os
 import toml
 from pathlib import Path
-from local_deep_research.config import CONFIG_DIR
+from local_deep_research.config import CONFIG_DIR, LOCAL_COLLECTIONS_FILE
 logger = logging.getLogger(__name__)
 
 
@@ -41,13 +41,30 @@ if 'auto' in SEARCH_ENGINES and 'meta' not in SEARCH_ENGINES:
     SEARCH_ENGINES['meta'] = SEARCH_ENGINES['auto']
 
 # Register local document collections
-try:
-    from local_deep_research.local_collections import register_local_collections
-    register_local_collections(SEARCH_ENGINES)
-    logger.info(f"Registered local document collections as search engines")
-except ImportError:
-    logger.info("No local collections configuration found. Local document search is disabled.")
 
+if os.path.exists(LOCAL_COLLECTIONS_FILE):
+    try:
+        local_collections_data = toml.load(LOCAL_COLLECTIONS_FILE)
+
+        for collection, config in local_collections_data.items():
+            # Create a new dictionary with required search engine fields
+            engine_config = {
+                "module_path": "local_deep_research.web_search_engines.engines.search_engine_local",
+                "class_name": "LocalSearchEngine",
+                "default_params": config,
+                "requires_llm": True
+            }
+
+            # Copy these specific fields to the top level if they exist
+            for field in ["strengths", "weaknesses", "reliability", "description"]:
+                if field in config:
+                    engine_config[field] = config[field]
+
+            SEARCH_ENGINES[collection] = engine_config
+
+        logger.info(f"Registered local document collections as search engines")
+    except Exception as e:
+        logger.error(f"Error loading local collections from TOML file: {e}")
 # Ensure the meta search engine is still available at the end if it exists
 if 'auto' in SEARCH_ENGINES:
     meta_config = SEARCH_ENGINES["auto"]
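
After this block runs, each collection from local_collections.toml appears in `SEARCH_ENGINES` as an ordinary engine entry. Schematically, for the `project_docs` collection defined earlier (the `default_params` dict is abbreviated here):

```python
# Schematic shape of a registered collection; default_params abbreviated.
SEARCH_ENGINES["project_docs"] = {
    "module_path": "local_deep_research.web_search_engines.engines.search_engine_local",
    "class_name": "LocalSearchEngine",
    "default_params": {"name": "Project Documents", "chunk_size": 1000},
    "requires_llm": True,
    "strengths": ["project documentation", "specifications", "internal documents"],
    "weaknesses": ["no external information", "limited to organizational knowledge"],
    "reliability": 0.9,
}
```
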
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: local-deep-research
-Version: 0.1.18
+Version: 0.1.20
 Summary: AI-powered research assistant with deep, iterative analysis using LLMs and web searches
 Author-email: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>, HashedViking <6432677+HashedViking@users.noreply.github.com>
 License: MIT License
@@ -71,6 +71,14 @@ Dynamic: license-file
 
 A powerful AI-powered research assistant that performs deep, iterative analysis using multiple LLMs and web searches. The system can be run locally for privacy or configured to use cloud-based LLMs for enhanced capabilities.
 
+<div align="center">
+  <a href="https://www.youtube.com/watch?v=0ISreg9q0p0">
+    <img src="https://img.youtube.com/vi/0ISreg9q0p0/0.jpg" alt="Local Deep Research">
+    <br>
+    <span>▶️ Watch Video</span>
+  </a>
+</div>
+
 ## Quick Start
 
 ```bash
@@ -143,6 +151,8 @@ The package automatically creates and manages configuration files in your user d
 
 ### Default Configuration Files
 
+If you prefer environment variables, please refer to this file: https://github.com/LearningCircuit/local-deep-research/blob/main/docs/env_configuration.md
+
 When you first run the tool, it creates these configuration files:
 
 | File | Purpose |
@@ -215,6 +225,10 @@ The `OPENAI_ENDPOINT` provider can access any service with an OpenAI-compatible
 
 The system includes multiple search engines. Some require API keys:
 
+Use the .env file in the config folder if .secrets.toml doesn't work.
+
+You can also override other settings via environment variables; e.g., to override the [web] port setting in settings.toml, use: **LDR_WEB__PORT=8080**
+
 ```toml
 # Add to .secrets.toml
 SERP_API_KEY = "your-serpapi-key-here"  # For Google results via SerpAPI
@@ -1,13 +1,13 @@
 local_deep_research/__init__.py,sha256=pfHzjzYc6Szo8VCNLtFZRXyAlEz7CViY7r2fH9O7yms,584
 local_deep_research/citation_handler.py,sha256=v_fwTy-2XvUuoH3OQRzmBrvaiN7mBk8jbNfySslmt5g,4357
-local_deep_research/config.py,sha256=PAruLZutlrjkGOKrv49hk8U4q9JPWWgAKLiY8Ukpsks,8572
+local_deep_research/config.py,sha256=3g8-QPMrxoIMjHvyjSJBFUELmAIyOQFHApUnd8p50a8,9881
 local_deep_research/main.py,sha256=uQXtGQ6LtZNd5Qw63D5ke4Q_LjYimouWVSUknVsk3JQ,3645
 local_deep_research/report_generator.py,sha256=EvaArnWirMgg42fMzmZeJczoEYujEbJ2ryHHYuuoXx8,8058
 local_deep_research/search_system.py,sha256=yY3BEzX68vdtUcYF9h6lC3yVao0YA_NSBj6W3-RwlKk,15459
 local_deep_research/defaults/__init__.py,sha256=2Vvlkl-gmP_qPYWegE4JBgummypogl3VXrQ1XzptFDU,1381
-local_deep_research/defaults/llm_config.py,sha256=T03pntyNtOk1fvu-RZ-iEoh7L2D2hcICr8usIPpULuo,7870
-local_deep_research/defaults/local_collections.toml,sha256=_edVWVHrhunMfazjejhJlGPRkHKKIP51qQtNkMgNEiA,1406
-local_deep_research/defaults/main.toml,sha256=DLhFq88vdE2_psLaWhPV9BWPixqTHvR2Rllaj_rmjJ4,1403
+local_deep_research/defaults/llm_config.py,sha256=7wTIugVYD_ypG7Xwvu3DBt0yO8TWBf_drOIQOSOkdQQ,9628
+local_deep_research/defaults/local_collections.toml,sha256=zNa03PVnFrZ757JdZOuW6QDxkOc6ep5tG8baGBrMmXM,1778
+local_deep_research/defaults/main.toml,sha256=6Lzbc5sVLxMwu83bLBp_tpYOZgmtThCfPL1L42eTGro,1939
 local_deep_research/defaults/search_engines.toml,sha256=TYkOqVaZq9JPawz4fIPyGdkAtYa4t8F9H50VY-wv2ak,8101
 local_deep_research/utilties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 local_deep_research/utilties/enums.py,sha256=TVAZiu9szNbdacfb7whgaQJJlSk7oYByADaAierD4CE,229
@@ -18,6 +18,9 @@ local_deep_research/web/__init__.py,sha256=3oHMatNu8r24FBtpojriIVbHYOVSHj4Q-quyc
 local_deep_research/web/app.py,sha256=5_VLNdhJOqdgacucglUdS_lVURNgYNbXhK9vME6JmzA,72431
 local_deep_research/web/static/css/styles.css,sha256=_26yBV1fKM51Dfv67CxKSbK8aeoYK5Tl7b2TPs5Whuo,24641
 local_deep_research/web/static/js/app.js,sha256=GPncdWpw2YNTs56JY-0tjTTr9JnX-fIZSZX0agwKZMU,172813
+local_deep_research/web/static/sounds/README.md,sha256=yNfVJIpKoSHSdAEj-lpxkjGy8F-OMStXCiIo1fY5I-0,1003
+local_deep_research/web/static/sounds/error.mp3,sha256=OM3K-pDxkPDCcptqb7c4bIwkHTQa7cLREs4xdYAODPs,3177
+local_deep_research/web/static/sounds/success.mp3,sha256=8EJRxWER-dt6vG6X6GDK3DNb8zoNa_1eDzusYJVcWLI,11818
 local_deep_research/web/templates/api_keys_config.html,sha256=jA8Y-nfUGJ1dTvbw2jK_8xPy2x6UG_5gHpbrTJAex2g,3527
 local_deep_research/web/templates/collections_config.html,sha256=Dci7KumXBON8rAXRX8TVjgqS-bbht7d6aQiedDUnxQ0,3560
 local_deep_research/web/templates/index.html,sha256=IW4cU5NgXVFXF6BxMhLuFzwkte_iYmLo3DQssxuYLZw,17490
@@ -29,27 +32,27 @@ local_deep_research/web/templates/settings_dashboard.html,sha256=De-v1KNdVvkXme5
 local_deep_research/web_search_engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 local_deep_research/web_search_engines/search_engine_base.py,sha256=QmhfjuHK2deomh8tARghKuYnF-5t3wwBB661odS2VtU,8065
 local_deep_research/web_search_engines/search_engine_factory.py,sha256=Sld6bYTwcyTxgVLx04t00sD7vfJhSHFOl6iiGJ08ZUE,11118
-local_deep_research/web_search_engines/search_engines_config.py,sha256=bNCuR09NOk5cjnKIgDQfhPipqmvDKeE7WP_6p8LLZf0,1979
+local_deep_research/web_search_engines/search_engines_config.py,sha256=5C0tCmy_Jpv1YHLZLlyS7h5B2XToYcWPAaBDEOsxMo0,2739
 local_deep_research/web_search_engines/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 local_deep_research/web_search_engines/engines/full_search.py,sha256=BuOz8dX-XocazCG7gGBKFnIY99FZtNFI0-Wq3fhsfp4,4689
-local_deep_research/web_search_engines/engines/meta_search_engine.py,sha256=Zky4sowCortEaIj1pBU0sKuggXr5izkubgrD7cM8IOQ,11485
+local_deep_research/web_search_engines/engines/meta_search_engine.py,sha256=0zU_L5thHzAmAx-BDsV6QNnSk10CfJ3quCRGjfC_Ys0,12130
 local_deep_research/web_search_engines/engines/search_engine_arxiv.py,sha256=cf8OzhSzE1zqaiOZ6EFQGy_6hTCJMaTysYd8rs1KJNU,15408
 local_deep_research/web_search_engines/engines/search_engine_brave.py,sha256=J242byUGG5ROQ_bh-mU292_t7Q7m20_9O0r1w5z6d9A,9688
 local_deep_research/web_search_engines/engines/search_engine_ddg.py,sha256=qK2i65dbPtr_ppoKPU_YA0mDqM_sDAvN6ZztvdFjsCk,4910
 local_deep_research/web_search_engines/engines/search_engine_github.py,sha256=qqipsw2ycjlRbR6mmMmxzGU3LEcFDJJJ7Ez7xUgWjRM,26768
 local_deep_research/web_search_engines/engines/search_engine_google_pse.py,sha256=YkXvBmgcqTImCxuyy6580SGRAvImGc6SzInXZgo1kNE,11294
 local_deep_research/web_search_engines/engines/search_engine_guardian.py,sha256=MW4WIwtNAwcpdigNXronyezAxr50EIZTV1NMedrAv2o,23912
-local_deep_research/web_search_engines/engines/search_engine_local.py,sha256=rfmPiA9DVmjbaB3KQtlq7s6BRMgHRgzP7AhktZNDw2M,36772
-local_deep_research/web_search_engines/engines/search_engine_local_all.py,sha256=CRNcxBzNd9kanyIJYaUDB7qfXYxVCvd4L2mX8jL73v0,5955
+local_deep_research/web_search_engines/engines/search_engine_local.py,sha256=qKgiohPL8oyvpT6S6jSmNFuR_vuNVVVqO7O4gwliLqw,37981
+local_deep_research/web_search_engines/engines/search_engine_local_all.py,sha256=7s7MHuFZTR28bDTxRUj19pzKv7Xzc5SG3yhtGG957eg,5981
 local_deep_research/web_search_engines/engines/search_engine_pubmed.py,sha256=MayfzM2R0XoI7cpXlG1XJ1ktfTN_6H-Xs9RmD89UAao,39236
 local_deep_research/web_search_engines/engines/search_engine_searxng.py,sha256=GMy6qDMSaVBtjWRm48XBu6TjLAy1HfcO2EFTwr8S9rk,18048
 local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py,sha256=6VMymjFJ7pyV2nv5dRfFofXgg0kG82rkwbICVnNDNH4,23352
 local_deep_research/web_search_engines/engines/search_engine_serpapi.py,sha256=XikEYnM-pAaR70VeAJ28lbqpRzCj4bCA9xY29taTV8g,9215
 local_deep_research/web_search_engines/engines/search_engine_wayback.py,sha256=astAvSLajDZ6rwgthJ3iBcHSWuDSYPO7uilIxaJhXmU,18132
 local_deep_research/web_search_engines/engines/search_engine_wikipedia.py,sha256=KSGJECbEcxZpVK-PhYsTCtzedSK0l1AjQmvGtx8KBks,9799
-local_deep_research-0.1.18.dist-info/licenses/LICENSE,sha256=Qg2CaTdu6SWnSqk1_JtgBPp_Da-LdqJDhT1Vt1MUc5s,1072
-local_deep_research-0.1.18.dist-info/METADATA,sha256=6F15QvEwfPPqHVEGRUCPEYVXKoR8PvbXOwzM9Vemem8,15013
-local_deep_research-0.1.18.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-local_deep_research-0.1.18.dist-info/entry_points.txt,sha256=u-Y6Z3MWtR3dmsTDFYhXyfkPv7mALUA7YAnY4Fi1XDs,97
-local_deep_research-0.1.18.dist-info/top_level.txt,sha256=h6-uVE_wSuLOcoWwT9szhX23mBWufu77MqmM25UfbCY,20
-local_deep_research-0.1.18.dist-info/RECORD,,
+local_deep_research-0.1.20.dist-info/licenses/LICENSE,sha256=Qg2CaTdu6SWnSqk1_JtgBPp_Da-LdqJDhT1Vt1MUc5s,1072
+local_deep_research-0.1.20.dist-info/METADATA,sha256=29URKDSkO8eCFRa5NkCoPIZ_lHYH5xOeK8ORQp5-v6k,15608
+local_deep_research-0.1.20.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+local_deep_research-0.1.20.dist-info/entry_points.txt,sha256=u-Y6Z3MWtR3dmsTDFYhXyfkPv7mALUA7YAnY4Fi1XDs,97
+local_deep_research-0.1.20.dist-info/top_level.txt,sha256=h6-uVE_wSuLOcoWwT9szhX23mBWufu77MqmM25UfbCY,20
+local_deep_research-0.1.20.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (77.0.3)
+Generator: setuptools (78.0.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 