local-deep-research 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- local_deep_research/citation_handler.py +0 -2
- local_deep_research/config.py +1 -4
- local_deep_research/defaults/llm_config.py +2 -2
- local_deep_research/defaults/main.toml +3 -3
- local_deep_research/defaults/search_engines.toml +2 -2
- local_deep_research/report_generator.py +1 -5
- local_deep_research/search_system.py +9 -10
- local_deep_research/utilties/search_utilities.py +3 -4
- local_deep_research/web_search_engines/engines/full_search.py +9 -8
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -14
- local_deep_research/web_search_engines/engines/search_engine_brave.py +10 -9
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -2
- local_deep_research/web_search_engines/engines/search_engine_local.py +1 -1
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +102 -661
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +9 -8
- local_deep_research/web_search_engines/search_engine_base.py +6 -15
- local_deep_research-0.1.17.dist-info/METADATA +393 -0
- {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/RECORD +22 -24
- local_deep_research/local_collections.py +0 -141
- local_deep_research/web_search_engines/full_search.py +0 -254
- local_deep_research-0.1.15.dist-info/METADATA +0 -346
- {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/WHEEL +0 -0
- {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/top_level.txt +0 -0
```diff
--- a/local_deep_research/local_collections.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# local_collections.py
-"""
-Configuration file for local document collections.
-Each collection functions as an independent search engine.
-"""
-
-import os
-from typing import Dict, Any
-
-# Registry of local document collections
-# Each collection appears as a separate search engine in the main configuration
-LOCAL_COLLECTIONS = {
-    # Project Documents Collection
-    "project_docs": {
-        "name": "Project Documents",
-        "description": "Project documentation and specifications",
-        "paths": [os.path.abspath("./local_search_files/project_documents")],
-        "enabled": True,
-        "embedding_model": "all-MiniLM-L6-v2",
-        "embedding_device": "cpu",
-        "embedding_model_type": "sentence_transformers",
-        "max_results": 20,
-        "max_filtered_results": 5,
-        "chunk_size": 1000,
-        "chunk_overlap": 200,
-        "cache_dir": ".cache/local_search/project_docs"
-    },
-
-    # Research Papers Collection
-    "research_papers": {
-        "name": "Research Papers",
-        "description": "Academic research papers and articles",
-        "paths": [os.path.abspath("local_search_files/research_papers")],
-        "enabled": True,
-        "embedding_model": "all-MiniLM-L6-v2",
-        "embedding_device": "cpu",
-        "embedding_model_type": "sentence_transformers",
-        "max_results": 20,
-        "max_filtered_results": 5,
-        "chunk_size": 800, # Smaller chunks for academic content
-        "chunk_overlap": 150,
-        "cache_dir": ".cache/local_search/research_papers"
-    },
-
-    # Personal Notes Collection
-    "personal_notes": {
-        "name": "Personal Notes",
-        "description": "Personal notes and documents",
-        "paths": [os.path.abspath("./local_search_files/personal_notes")],
-        "enabled": True,
-        "embedding_model": "all-MiniLM-L6-v2",
-        "embedding_device": "cpu",
-        "embedding_model_type": "sentence_transformers",
-        "max_results": 30,
-        "max_filtered_results": 10,
-        "chunk_size": 500, # Smaller chunks for notes
-        "chunk_overlap": 100,
-        "cache_dir": ".cache/local_search/personal_notes"
-    }
-}
-
-# Configuration for local search integration
-LOCAL_SEARCH_CONFIG = {
-    # General embedding options
-    "DEFAULT_EMBEDDING_MODEL": "all-MiniLM-L6-v2",
-    "DEFAULT_EMBEDDING_DEVICE": "cpu", # "cpu" or "cuda" for GPU acceleration
-    "DEFAULT_EMBEDDING_MODEL_TYPE": "sentence_transformers", # or "ollama"
-
-    # Ollama settings (only used if model type is "ollama")
-    # Note: You must run 'ollama pull nomic-embed-text' first if using Ollama for embeddings
-    "OLLAMA_BASE_URL": "http://localhost:11434",
-    "OLLAMA_EMBEDDING_MODEL": "nomic-embed-text",
-
-    # Default indexing options
-    "FORCE_REINDEX": True, # Force reindexing on startup
-    "CACHE_DIR": ".cache/local_search", # Base directory for cache
-}
-
-def register_local_collections(search_engines_dict: Dict[str, Any]) -> None:
-    """
-    Register all enabled local collections as search engines.
-
-    Args:
-        search_engines_dict: The main search engines dictionary to update
-    """
-    for collection_id, collection in LOCAL_COLLECTIONS.items():
-        print(collection_id, collection)
-        if collection.get("enabled", True):
-            # Skip if already defined (don't override)
-            if collection_id in search_engines_dict:
-                continue
-
-            # Validate paths exist
-            paths = collection.get("paths", [])
-            valid_paths = []
-            for path in paths:
-                if os.path.exists(path) and os.path.isdir(path):
-                    valid_paths.append(path)
-                else:
-                    print(f"Warning: Collection '{collection_id}' contains non-existent folder: {path}")
-
-            # Log warning if no valid paths
-            if not valid_paths and paths:
-                print(f"Warning: Collection '{collection_id}' has no valid folders. It will be registered but won't return results.")
-
-            # Create a search engine entry for this collection
-            search_engines_dict[collection_id] = {
-                "module_path": "local_deep_research.web_search_engines.engines.search_engine_local",
-                "class_name": "LocalSearchEngine",
-                "requires_api_key": False,
-                "reliability": 0.9, # High reliability for local documents
-                "strengths": ["personal documents", "offline access",
-                              collection.get("description", "local documents")],
-                "weaknesses": ["requires indexing", "limited to specific folders"],
-                "default_params": {
-                    "folder_paths": collection.get("paths", []),
-                    "embedding_model": collection.get(
-                        "embedding_model",
-                        LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_MODEL"]
-                    ),
-                    "embedding_device": collection.get(
-                        "embedding_device",
-                        LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_DEVICE"]
-                    ),
-                    "embedding_model_type": collection.get(
-                        "embedding_model_type",
-                        LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_MODEL_TYPE"]
-                    ),
-                    "chunk_size": collection.get("chunk_size", 1000),
-                    "chunk_overlap": collection.get("chunk_overlap", 200),
-                    "cache_dir": collection.get(
-                        "cache_dir",
-                        f"{LOCAL_SEARCH_CONFIG['CACHE_DIR']}/{collection_id}"
-                    ),
-                    "max_results": collection.get("max_results", 20),
-                    "max_filtered_results": collection.get("max_filtered_results", 5),
-                    "collection_name": collection.get("name", collection_id),
-                    "collection_description": collection.get("description", "")
-                },
-                "requires_llm": True
-            }
```
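The removed `local_collections.py` registered each enabled collection as its own search engine, skipping ids already present in the registry. Below is a minimal, self-contained sketch of that registration pattern; the trimmed `LOCAL_COLLECTIONS` dict and the `search_engines` variable are illustrative stand-ins, not the package's real configuration. Note that in 0.1.17 this module is gone entirely, so any code importing `local_deep_research.local_collections` will break.

```python
from typing import Any, Dict

# Illustrative, trimmed stand-in for the removed LOCAL_COLLECTIONS registry.
LOCAL_COLLECTIONS: Dict[str, Any] = {
    "project_docs": {"name": "Project Documents", "enabled": True},
    "research_papers": {"name": "Research Papers", "enabled": True},
}

def register_local_collections(search_engines_dict: Dict[str, Any]) -> None:
    """Add one engine entry per enabled collection, never overriding existing ids."""
    for collection_id, collection in LOCAL_COLLECTIONS.items():
        if not collection.get("enabled", True) or collection_id in search_engines_dict:
            continue
        search_engines_dict[collection_id] = {
            "module_path": "local_deep_research.web_search_engines.engines.search_engine_local",
            "class_name": "LocalSearchEngine",
            "requires_api_key": False,
            "requires_llm": True,
            "default_params": {"collection_name": collection.get("name", collection_id)},
        }

search_engines: Dict[str, Any] = {}   # stand-in for the package's engine registry
register_local_collections(search_engines)
print(sorted(search_engines))         # ['project_docs', 'research_papers']
```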
```diff
--- a/local_deep_research/web_search_engines/full_search.py
+++ /dev/null
@@ -1,254 +0,0 @@
-import justext
-from langchain_community.document_loaders import AsyncChromiumLoader
-from langchain_community.document_transformers import BeautifulSoupTransformer
-from langchain_core.language_models import BaseLLM
-from typing import List, Dict, Any, Optional, Union
-import json
-import os
-from .utilties.search_utilities import remove_think_tags
-from datetime import datetime
-from local_deep_research import config
-
-class FullSearchResults:
-    """
-    Enhanced web content retrieval class that works with the BaseSearchEngine architecture.
-    Can be used as a wrapper around web-based search engines like DuckDuckGo and SerpAPI.
-    """
-
-    def __init__(
-        self,
-        llm: BaseLLM,
-        web_search,
-        output_format: str = "list",
-        language: str = "English",
-        max_results: int = 10,
-        region: str = "wt-wt",
-        time: str = "y",
-        safesearch: str = "Moderate"
-    ):
-        """
-        Initialize the full search results processor.
-
-        Args:
-            llm: Language model instance for relevance filtering
-            web_search: Web search engine instance that provides initial results
-            output_format: Format of output ('list' or other formats)
-            language: Language for content processing
-            max_results: Maximum number of search results
-            region: Search region
-            time: Time period for search results
-            safesearch: Safe search setting
-        """
-        self.llm = llm
-        self.output_format = output_format
-        self.language = language
-        self.max_results = max_results
-        self.region = region
-        self.time = time
-        self.safesearch = safesearch
-        self.web_search = web_search
-        os.environ["USER_AGENT"] = "Local Deep Research/1.0"
-
-        self.bs_transformer = BeautifulSoupTransformer()
-        self.tags_to_extract = ["p", "div", "span"]
-
-    def run(self, query: str) -> List[Dict[str, Any]]:
-        """
-        Legacy method that performs a full search in one step.
-        Respects config parameters:
-        - SEARCH_SNIPPETS_ONLY: If True, only returns snippets without full content
-        - SKIP_RELEVANCE_FILTER: If True, returns all results without filtering
-
-        Args:
-            query: The search query
-
-        Returns:
-            List of search results with full content (unless SEARCH_SNIPPETS_ONLY is True)
-        """
-        # Phase 1: Get search results from the web search engine
-        previews = self._get_previews(query)
-        if not previews:
-            return []
-
-        # Phase 2: Filter URLs using LLM (unless SKIP_RELEVANCE_FILTER is True)
-        if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
-            relevant_items = previews
-            print("Skipping relevance filtering as per config")
-        else:
-            relevant_items = self._filter_relevant_items(previews, query)
-            if not relevant_items:
-                return []
-
-        # Phase 3: Get full content for relevant items (unless SEARCH_SNIPPETS_ONLY is True)
-        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
-            print("Returning snippet-only results as per config")
-            return relevant_items
-        else:
-            results = self._get_full_content(relevant_items)
-            return results
-
-    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
-        """
-        Get preview information from the web search engine.
-
-        Args:
-            query: The search query
-
-        Returns:
-            List of preview dictionaries
-        """
-        try:
-            # Get search results from the web search engine
-            search_results = self.web_search.invoke(query)
-
-            if not isinstance(search_results, list):
-                print("Error: Expected search results in list format")
-                return []
-
-            # Return the results as previews
-            return search_results
-
-        except Exception as e:
-            print(f"Error getting previews: {e}")
-            return []
-
-    def _filter_relevant_items(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
-        """
-        Filter previews for relevance using LLM.
-
-        Args:
-            previews: List of preview dictionaries
-            query: The original search query
-
-        Returns:
-            List of relevant preview dictionaries
-        """
-        # Skip filtering if disabled in config or no previews
-        if not config.QUALITY_CHECK_DDG_URLS or not previews:
-            return previews
-
-        # Format for LLM evaluation
-        now = datetime.now()
-        current_time = now.strftime("%Y-%m-%d")
-        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
-1. Timeliness (today: {current_time})
-2. Factual accuracy (cross-reference major claims)
-3. Source reliability (prefer official company websites, established news outlets)
-4. Direct relevance to query: {query}
-
-URLs to evaluate:
-{json.dumps(previews, indent=2)}
-
-Return a JSON array of indices (0-based) for sources that meet ALL criteria.
-ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
-Example response: \n[0, 2, 4]\n\n"""
-
-        try:
-            # Get LLM's evaluation
-            response = self.llm.invoke(prompt)
-
-            # Extract JSON array from response
-            response_text = remove_think_tags(response.content)
-            # Clean up response to handle potential formatting issues
-            response_text = response_text.strip()
-
-            # Find the first occurrence of '[' and the last occurrence of ']'
-            start_idx = response_text.find('[')
-            end_idx = response_text.rfind(']')
-
-            if start_idx >= 0 and end_idx > start_idx:
-                array_text = response_text[start_idx:end_idx+1]
-                good_indices = json.loads(array_text)
-
-                # Return only the results with good indices
-                return [r for i, r in enumerate(previews) if i in good_indices]
-            else:
-                print("Could not find JSON array in response, returning all previews")
-                return previews
-
-        except Exception as e:
-            print(f"URL filtering error: {e}")
-            # Fall back to returning all previews on error
-            return previews
-
-    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        Get full content for the relevant items by retrieving and processing web pages.
-
-        Args:
-            relevant_items: List of relevant preview dictionaries
-
-        Returns:
-            List of result dictionaries with full content
-        """
-        nr_full_text = 0
-
-        # Extract URLs from relevant items
-        urls = [item.get("link") for item in relevant_items if item.get("link")]
-
-        if not urls:
-            print("\n === NO VALID LINKS ===\n")
-            return relevant_items
-
-        try:
-            # Download the full HTML pages for filtered URLs
-            loader = AsyncChromiumLoader(urls)
-            html_docs = loader.load()
-
-            # Process the HTML using BeautifulSoupTransformer
-            full_docs = self.bs_transformer.transform_documents(
-                html_docs, tags_to_extract=self.tags_to_extract
-            )
-
-            # Remove boilerplate from each document
-            url_to_content = {}
-            for doc in full_docs:
-                nr_full_text += 1
-                source = doc.metadata.get("source")
-                if source:
-                    cleaned_text = self._remove_boilerplate(doc.page_content)
-                    url_to_content[source] = cleaned_text
-
-            # Attach the cleaned full content to each result
-            results = []
-            for item in relevant_items:
-                new_item = item.copy()
-                link = item.get("link")
-                new_item["full_content"] = url_to_content.get(link, None)
-                results.append(new_item)
-
-            print(f"FULL SEARCH WITH FILTERED URLS - Full text retrieved: {nr_full_text}")
-            return results
-
-        except Exception as e:
-            print(f"Error retrieving full content: {e}")
-            # Return original items if full content retrieval fails
-            return relevant_items
-
-    def _remove_boilerplate(self, html: str) -> str:
-        """
-        Remove boilerplate content from HTML.
-
-        Args:
-            html: HTML content
-
-        Returns:
-            Cleaned text content
-        """
-        if not html or not html.strip():
-            return ""
-        try:
-            paragraphs = justext.justext(html, justext.get_stoplist(self.language))
-            cleaned = "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
-            return cleaned
-        except Exception as e:
-            print(f"Error removing boilerplate: {e}")
-            return html
-
-    def invoke(self, query: str) -> List[Dict[str, Any]]:
-        """Compatibility method for LangChain tools"""
-        return self.run(query)
-
-    def __call__(self, query: str) -> List[Dict[str, Any]]:
-        """Make the class callable like a function"""
-        return self.invoke(query)
```
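The removed `web_search_engines/full_search.py` wrapped a web search engine and an LLM into a three-phase pipeline: fetch preview results, filter them for relevance with the LLM, then retrieve and clean the full pages. Below is a minimal sketch of that interaction using stub collaborators; the stub classes and the `run_pipeline` helper are illustrative only and not part of the package. The surviving `engines/full_search.py` module (modified +9 −8 in this diff) appears to supersede the removed copy.

```python
import json
from typing import Any, Dict, List

class StubWebSearch:
    """Illustrative stand-in for a search engine whose .invoke() returns preview dicts."""
    def invoke(self, query: str) -> List[Dict[str, Any]]:
        return [{"title": "Example", "link": "https://example.org", "snippet": "..."}]

class StubResponse:
    def __init__(self, content: str) -> None:
        self.content = content

class StubLLM:
    """Illustrative stand-in for an LLM; the removed class asked it for a bare JSON array of indices."""
    def invoke(self, prompt: str) -> StubResponse:
        return StubResponse("[0]")

def run_pipeline(web_search: StubWebSearch, llm: StubLLM, query: str) -> List[Dict[str, Any]]:
    previews = web_search.invoke(query)            # phase 1: preview results
    keep = json.loads(llm.invoke(query).content)   # phase 2: LLM relevance filter
    relevant = [p for i, p in enumerate(previews) if i in keep]
    for item in relevant:                          # phase 3: full-page content
        # FullSearchResults fetched pages with AsyncChromiumLoader and stripped
        # boilerplate with justext here; the stub records a placeholder instead.
        item["full_content"] = None
    return relevant

print(run_pipeline(StubWebSearch(), StubLLM(), "test query"))
```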