local-deep-research 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. local_deep_research/citation_handler.py +0 -2
  2. local_deep_research/config.py +1 -4
  3. local_deep_research/defaults/llm_config.py +2 -2
  4. local_deep_research/defaults/main.toml +3 -3
  5. local_deep_research/defaults/search_engines.toml +2 -2
  6. local_deep_research/report_generator.py +1 -5
  7. local_deep_research/search_system.py +9 -10
  8. local_deep_research/utilties/search_utilities.py +3 -4
  9. local_deep_research/web_search_engines/engines/full_search.py +9 -8
  10. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -14
  11. local_deep_research/web_search_engines/engines/search_engine_brave.py +10 -9
  12. local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -2
  13. local_deep_research/web_search_engines/engines/search_engine_local.py +1 -1
  14. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +102 -661
  15. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +9 -8
  16. local_deep_research/web_search_engines/search_engine_base.py +6 -15
  17. local_deep_research-0.1.17.dist-info/METADATA +393 -0
  18. {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/RECORD +22 -24
  19. local_deep_research/local_collections.py +0 -141
  20. local_deep_research/web_search_engines/full_search.py +0 -254
  21. local_deep_research-0.1.15.dist-info/METADATA +0 -346
  22. {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/WHEEL +0 -0
  23. {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/entry_points.txt +0 -0
  24. {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/licenses/LICENSE +0 -0
  25. {local_deep_research-0.1.15.dist-info → local_deep_research-0.1.17.dist-info}/top_level.txt +0 -0
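The same file-level comparison can be reproduced locally from the two wheels using only the Python standard library. The sketch below is a minimal reproduction, not part of the package; the wheel filenames are assumptions about artifacts downloaded beforehand (e.g. with pip download local-deep-research==0.1.15 --no-deps and the same for 0.1.17).

# Minimal reproduction sketch (not part of the package). Assumes both wheels
# are present in the working directory under their standard filenames.
import difflib
import zipfile

OLD = "local_deep_research-0.1.15-py3-none-any.whl"  # assumed local path
NEW = "local_deep_research-0.1.17-py3-none-any.whl"  # assumed local path

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_files, new_files = set(old_whl.namelist()), set(new_whl.namelist())
    print("Removed:", sorted(old_files - new_files))
    print("Added:  ", sorted(new_files - old_files))

    # Unified diff for every file that exists in both wheels
    for name in sorted(old_files & new_files):
        old_lines = old_whl.read(name).decode("utf-8", errors="replace").splitlines()
        new_lines = new_whl.read(name).decode("utf-8", errors="replace").splitlines()
        for line in difflib.unified_diff(old_lines, new_lines,
                                         fromfile=f"0.1.15/{name}",
                                         tofile=f"0.1.17/{name}", lineterm=""):
            print(line)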
local_deep_research/local_collections.py +0 -141
@@ -1,141 +0,0 @@
- # local_collections.py
- """
- Configuration file for local document collections.
- Each collection functions as an independent search engine.
- """
-
- import os
- from typing import Dict, Any
-
- # Registry of local document collections
- # Each collection appears as a separate search engine in the main configuration
- LOCAL_COLLECTIONS = {
-     # Project Documents Collection
-     "project_docs": {
-         "name": "Project Documents",
-         "description": "Project documentation and specifications",
-         "paths": [os.path.abspath("./local_search_files/project_documents")],
-         "enabled": True,
-         "embedding_model": "all-MiniLM-L6-v2",
-         "embedding_device": "cpu",
-         "embedding_model_type": "sentence_transformers",
-         "max_results": 20,
-         "max_filtered_results": 5,
-         "chunk_size": 1000,
-         "chunk_overlap": 200,
-         "cache_dir": ".cache/local_search/project_docs"
-     },
-
-     # Research Papers Collection
-     "research_papers": {
-         "name": "Research Papers",
-         "description": "Academic research papers and articles",
-         "paths": [os.path.abspath("local_search_files/research_papers")],
-         "enabled": True,
-         "embedding_model": "all-MiniLM-L6-v2",
-         "embedding_device": "cpu",
-         "embedding_model_type": "sentence_transformers",
-         "max_results": 20,
-         "max_filtered_results": 5,
-         "chunk_size": 800,  # Smaller chunks for academic content
-         "chunk_overlap": 150,
-         "cache_dir": ".cache/local_search/research_papers"
-     },
-
-     # Personal Notes Collection
-     "personal_notes": {
-         "name": "Personal Notes",
-         "description": "Personal notes and documents",
-         "paths": [os.path.abspath("./local_search_files/personal_notes")],
-         "enabled": True,
-         "embedding_model": "all-MiniLM-L6-v2",
-         "embedding_device": "cpu",
-         "embedding_model_type": "sentence_transformers",
-         "max_results": 30,
-         "max_filtered_results": 10,
-         "chunk_size": 500,  # Smaller chunks for notes
-         "chunk_overlap": 100,
-         "cache_dir": ".cache/local_search/personal_notes"
-     }
- }
-
- # Configuration for local search integration
- LOCAL_SEARCH_CONFIG = {
-     # General embedding options
-     "DEFAULT_EMBEDDING_MODEL": "all-MiniLM-L6-v2",
-     "DEFAULT_EMBEDDING_DEVICE": "cpu",  # "cpu" or "cuda" for GPU acceleration
-     "DEFAULT_EMBEDDING_MODEL_TYPE": "sentence_transformers",  # or "ollama"
-
-     # Ollama settings (only used if model type is "ollama")
-     # Note: You must run 'ollama pull nomic-embed-text' first if using Ollama for embeddings
-     "OLLAMA_BASE_URL": "http://localhost:11434",
-     "OLLAMA_EMBEDDING_MODEL": "nomic-embed-text",
-
-     # Default indexing options
-     "FORCE_REINDEX": True,  # Force reindexing on startup
-     "CACHE_DIR": ".cache/local_search",  # Base directory for cache
- }
-
- def register_local_collections(search_engines_dict: Dict[str, Any]) -> None:
-     """
-     Register all enabled local collections as search engines.
-
-     Args:
-         search_engines_dict: The main search engines dictionary to update
-     """
-     for collection_id, collection in LOCAL_COLLECTIONS.items():
-         print(collection_id, collection)
-         if collection.get("enabled", True):
-             # Skip if already defined (don't override)
-             if collection_id in search_engines_dict:
-                 continue
-
-             # Validate paths exist
-             paths = collection.get("paths", [])
-             valid_paths = []
-             for path in paths:
-                 if os.path.exists(path) and os.path.isdir(path):
-                     valid_paths.append(path)
-                 else:
-                     print(f"Warning: Collection '{collection_id}' contains non-existent folder: {path}")
-
-             # Log warning if no valid paths
-             if not valid_paths and paths:
-                 print(f"Warning: Collection '{collection_id}' has no valid folders. It will be registered but won't return results.")
-
-             # Create a search engine entry for this collection
-             search_engines_dict[collection_id] = {
-                 "module_path": "local_deep_research.web_search_engines.engines.search_engine_local",
-                 "class_name": "LocalSearchEngine",
-                 "requires_api_key": False,
-                 "reliability": 0.9,  # High reliability for local documents
-                 "strengths": ["personal documents", "offline access",
-                               collection.get("description", "local documents")],
-                 "weaknesses": ["requires indexing", "limited to specific folders"],
-                 "default_params": {
-                     "folder_paths": collection.get("paths", []),
-                     "embedding_model": collection.get(
-                         "embedding_model",
-                         LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_MODEL"]
-                     ),
-                     "embedding_device": collection.get(
-                         "embedding_device",
-                         LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_DEVICE"]
-                     ),
-                     "embedding_model_type": collection.get(
-                         "embedding_model_type",
-                         LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_MODEL_TYPE"]
-                     ),
-                     "chunk_size": collection.get("chunk_size", 1000),
-                     "chunk_overlap": collection.get("chunk_overlap", 200),
-                     "cache_dir": collection.get(
-                         "cache_dir",
-                         f"{LOCAL_SEARCH_CONFIG['CACHE_DIR']}/{collection_id}"
-                     ),
-                     "max_results": collection.get("max_results", 20),
-                     "max_filtered_results": collection.get("max_filtered_results", 5),
-                     "collection_name": collection.get("name", collection_id),
-                     "collection_description": collection.get("description", "")
-                 },
-                 "requires_llm": True
-             }
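For context on what 0.1.17 drops, the sketch below shows how the removed register_local_collections helper was meant to be called. It is a minimal usage sketch against the 0.1.15 wheel only (the module no longer exists in 0.1.17); the search_engines dict is a hypothetical stand-in for the registry the package passes in.

# Minimal usage sketch (0.1.15 only).
from local_deep_research.local_collections import (
    LOCAL_COLLECTIONS,
    register_local_collections,
)

search_engines = {}  # hypothetical stand-in for the main search engine registry
register_local_collections(search_engines)

# Every enabled collection is now registered under its collection id and
# points at LocalSearchEngine with per-collection default_params.
for collection_id in LOCAL_COLLECTIONS:
    entry = search_engines.get(collection_id)
    if entry:
        print(collection_id, entry["class_name"], entry["default_params"]["folder_paths"])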
local_deep_research/web_search_engines/full_search.py +0 -254
@@ -1,254 +0,0 @@
- import justext
- from langchain_community.document_loaders import AsyncChromiumLoader
- from langchain_community.document_transformers import BeautifulSoupTransformer
- from langchain_core.language_models import BaseLLM
- from typing import List, Dict, Any, Optional, Union
- import json
- import os
- from .utilties.search_utilities import remove_think_tags
- from datetime import datetime
- from local_deep_research import config
-
- class FullSearchResults:
-     """
-     Enhanced web content retrieval class that works with the BaseSearchEngine architecture.
-     Can be used as a wrapper around web-based search engines like DuckDuckGo and SerpAPI.
-     """
-
-     def __init__(
-         self,
-         llm: BaseLLM,
-         web_search,
-         output_format: str = "list",
-         language: str = "English",
-         max_results: int = 10,
-         region: str = "wt-wt",
-         time: str = "y",
-         safesearch: str = "Moderate"
-     ):
-         """
-         Initialize the full search results processor.
-
-         Args:
-             llm: Language model instance for relevance filtering
-             web_search: Web search engine instance that provides initial results
-             output_format: Format of output ('list' or other formats)
-             language: Language for content processing
-             max_results: Maximum number of search results
-             region: Search region
-             time: Time period for search results
-             safesearch: Safe search setting
-         """
-         self.llm = llm
-         self.output_format = output_format
-         self.language = language
-         self.max_results = max_results
-         self.region = region
-         self.time = time
-         self.safesearch = safesearch
-         self.web_search = web_search
-         os.environ["USER_AGENT"] = "Local Deep Research/1.0"
-
-         self.bs_transformer = BeautifulSoupTransformer()
-         self.tags_to_extract = ["p", "div", "span"]
-
-     def run(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Legacy method that performs a full search in one step.
-         Respects config parameters:
-         - SEARCH_SNIPPETS_ONLY: If True, only returns snippets without full content
-         - SKIP_RELEVANCE_FILTER: If True, returns all results without filtering
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of search results with full content (unless SEARCH_SNIPPETS_ONLY is True)
-         """
-         # Phase 1: Get search results from the web search engine
-         previews = self._get_previews(query)
-         if not previews:
-             return []
-
-         # Phase 2: Filter URLs using LLM (unless SKIP_RELEVANCE_FILTER is True)
-         if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
-             relevant_items = previews
-             print("Skipping relevance filtering as per config")
-         else:
-             relevant_items = self._filter_relevant_items(previews, query)
-             if not relevant_items:
-                 return []
-
-         # Phase 3: Get full content for relevant items (unless SEARCH_SNIPPETS_ONLY is True)
-         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
-             print("Returning snippet-only results as per config")
-             return relevant_items
-         else:
-             results = self._get_full_content(relevant_items)
-             return results
-
-     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Get preview information from the web search engine.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of preview dictionaries
-         """
-         try:
-             # Get search results from the web search engine
-             search_results = self.web_search.invoke(query)
-
-             if not isinstance(search_results, list):
-                 print("Error: Expected search results in list format")
-                 return []
-
-             # Return the results as previews
-             return search_results
-
-         except Exception as e:
-             print(f"Error getting previews: {e}")
-             return []
-
-     def _filter_relevant_items(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
-         """
-         Filter previews for relevance using LLM.
-
-         Args:
-             previews: List of preview dictionaries
-             query: The original search query
-
-         Returns:
-             List of relevant preview dictionaries
-         """
-         # Skip filtering if disabled in config or no previews
-         if not config.QUALITY_CHECK_DDG_URLS or not previews:
-             return previews
-
-         # Format for LLM evaluation
-         now = datetime.now()
-         current_time = now.strftime("%Y-%m-%d")
-         prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
- 1. Timeliness (today: {current_time})
- 2. Factual accuracy (cross-reference major claims)
- 3. Source reliability (prefer official company websites, established news outlets)
- 4. Direct relevance to query: {query}
-
- URLs to evaluate:
- {json.dumps(previews, indent=2)}
-
- Return a JSON array of indices (0-based) for sources that meet ALL criteria.
- ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
- Example response: \n[0, 2, 4]\n\n"""
-
-         try:
-             # Get LLM's evaluation
-             response = self.llm.invoke(prompt)
-
-             # Extract JSON array from response
-             response_text = remove_think_tags(response.content)
-             # Clean up response to handle potential formatting issues
-             response_text = response_text.strip()
-
-             # Find the first occurrence of '[' and the last occurrence of ']'
-             start_idx = response_text.find('[')
-             end_idx = response_text.rfind(']')
-
-             if start_idx >= 0 and end_idx > start_idx:
-                 array_text = response_text[start_idx:end_idx+1]
-                 good_indices = json.loads(array_text)
-
-                 # Return only the results with good indices
-                 return [r for i, r in enumerate(previews) if i in good_indices]
-             else:
-                 print("Could not find JSON array in response, returning all previews")
-                 return previews
-
-         except Exception as e:
-             print(f"URL filtering error: {e}")
-             # Fall back to returning all previews on error
-             return previews
-
-     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Get full content for the relevant items by retrieving and processing web pages.
-
-         Args:
-             relevant_items: List of relevant preview dictionaries
-
-         Returns:
-             List of result dictionaries with full content
-         """
-         nr_full_text = 0
-
-         # Extract URLs from relevant items
-         urls = [item.get("link") for item in relevant_items if item.get("link")]
-
-         if not urls:
-             print("\n === NO VALID LINKS ===\n")
-             return relevant_items
-
-         try:
-             # Download the full HTML pages for filtered URLs
-             loader = AsyncChromiumLoader(urls)
-             html_docs = loader.load()
-
-             # Process the HTML using BeautifulSoupTransformer
-             full_docs = self.bs_transformer.transform_documents(
-                 html_docs, tags_to_extract=self.tags_to_extract
-             )
-
-             # Remove boilerplate from each document
-             url_to_content = {}
-             for doc in full_docs:
-                 nr_full_text += 1
-                 source = doc.metadata.get("source")
-                 if source:
-                     cleaned_text = self._remove_boilerplate(doc.page_content)
-                     url_to_content[source] = cleaned_text
-
-             # Attach the cleaned full content to each result
-             results = []
-             for item in relevant_items:
-                 new_item = item.copy()
-                 link = item.get("link")
-                 new_item["full_content"] = url_to_content.get(link, None)
-                 results.append(new_item)
-
-             print(f"FULL SEARCH WITH FILTERED URLS - Full text retrieved: {nr_full_text}")
-             return results
-
-         except Exception as e:
-             print(f"Error retrieving full content: {e}")
-             # Return original items if full content retrieval fails
-             return relevant_items
-
-     def _remove_boilerplate(self, html: str) -> str:
-         """
-         Remove boilerplate content from HTML.
-
-         Args:
-             html: HTML content
-
-         Returns:
-             Cleaned text content
-         """
-         if not html or not html.strip():
-             return ""
-         try:
-             paragraphs = justext.justext(html, justext.get_stoplist(self.language))
-             cleaned = "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
-             return cleaned
-         except Exception as e:
-             print(f"Error removing boilerplate: {e}")
-             return html
-
-     def invoke(self, query: str) -> List[Dict[str, Any]]:
-         """Compatibility method for LangChain tools"""
-         return self.run(query)
-
-     def __call__(self, query: str) -> List[Dict[str, Any]]:
-         """Make the class callable like a function"""
-         return self.invoke(query)
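The removed wrapper above was driven through its run()/invoke() methods. The sketch below is a minimal usage example against the 0.1.15 wheel only; the LLM and the web_search object are assumptions (ChatOllama and the model name are placeholders, and StubWebSearch is a hypothetical stand-in for a DuckDuckGo/SerpAPI-style engine that returns a list of dicts with a "link" key, which is all _get_previews expects).

# Minimal usage sketch (0.1.15 only).
from langchain_ollama import ChatOllama  # assumed backend; any LangChain chat model works

from local_deep_research.web_search_engines.full_search import FullSearchResults


class StubWebSearch:
    """Hypothetical stand-in for a DuckDuckGo/SerpAPI-style engine."""

    def invoke(self, query):
        return [{"title": "Example Domain", "link": "https://example.com", "snippet": query}]


full_search = FullSearchResults(
    llm=ChatOllama(model="mistral"),  # assumed model name
    web_search=StubWebSearch(),
    max_results=10,
)

# run() previews, optionally LLM-filters, then attaches "full_content" per result
for item in full_search.run("what is example.com used for"):
    print(item["link"], (item.get("full_content") or "")[:80])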