local-deep-research 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. local_deep_research-0.1.0/LICENSE +21 -0
  2. local_deep_research-0.1.0/MANIFEST.in +5 -0
  3. local_deep_research-0.1.0/PKG-INFO +328 -0
  4. local_deep_research-0.1.0/README.md +266 -0
  5. local_deep_research-0.1.0/pyproject.toml +63 -0
  6. local_deep_research-0.1.0/requirements.txt +25 -0
  7. local_deep_research-0.1.0/setup.cfg +4 -0
  8. local_deep_research-0.1.0/src/local_deep_research/__init__.py +24 -0
  9. local_deep_research-0.1.0/src/local_deep_research/citation_handler.py +113 -0
  10. local_deep_research-0.1.0/src/local_deep_research/config.py +166 -0
  11. local_deep_research-0.1.0/src/local_deep_research/defaults/__init__.py +44 -0
  12. local_deep_research-0.1.0/src/local_deep_research/defaults/llm_config.py +269 -0
  13. local_deep_research-0.1.0/src/local_deep_research/defaults/local_collections.toml +47 -0
  14. local_deep_research-0.1.0/src/local_deep_research/defaults/main.toml +57 -0
  15. local_deep_research-0.1.0/src/local_deep_research/defaults/search_engines.toml +244 -0
  16. local_deep_research-0.1.0/src/local_deep_research/local_collections.py +141 -0
  17. local_deep_research-0.1.0/src/local_deep_research/main.py +113 -0
  18. local_deep_research-0.1.0/src/local_deep_research/report_generator.py +206 -0
  19. local_deep_research-0.1.0/src/local_deep_research/search_system.py +241 -0
  20. local_deep_research-0.1.0/src/local_deep_research/utilties/__init__.py +0 -0
  21. local_deep_research-0.1.0/src/local_deep_research/utilties/enums.py +9 -0
  22. local_deep_research-0.1.0/src/local_deep_research/utilties/llm_utils.py +116 -0
  23. local_deep_research-0.1.0/src/local_deep_research/utilties/search_utilities.py +115 -0
  24. local_deep_research-0.1.0/src/local_deep_research/utilties/setup_utils.py +6 -0
  25. local_deep_research-0.1.0/src/local_deep_research/web/__init__.py +2 -0
  26. local_deep_research-0.1.0/src/local_deep_research/web/app.py +1209 -0
  27. local_deep_research-0.1.0/src/local_deep_research/web/static/css/styles.css +1008 -0
  28. local_deep_research-0.1.0/src/local_deep_research/web/static/js/app.js +2078 -0
  29. local_deep_research-0.1.0/src/local_deep_research/web/templates/api_keys_config.html +82 -0
  30. local_deep_research-0.1.0/src/local_deep_research/web/templates/collections_config.html +90 -0
  31. local_deep_research-0.1.0/src/local_deep_research/web/templates/index.html +312 -0
  32. local_deep_research-0.1.0/src/local_deep_research/web/templates/llm_config.html +120 -0
  33. local_deep_research-0.1.0/src/local_deep_research/web/templates/main_config.html +89 -0
  34. local_deep_research-0.1.0/src/local_deep_research/web/templates/search_engines_config.html +154 -0
  35. local_deep_research-0.1.0/src/local_deep_research/web/templates/settings.html +519 -0
  36. local_deep_research-0.1.0/src/local_deep_research/web/templates/settings_dashboard.html +207 -0
  37. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/__init__.py +0 -0
  38. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/__init__.py +0 -0
  39. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/full_search.py +128 -0
  40. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  41. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  42. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  43. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  44. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  45. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  46. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  47. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  48. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  49. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  50. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  51. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  52. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  53. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  54. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/full_search.py +254 -0
  55. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/search_engine_base.py +197 -0
  56. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  57. local_deep_research-0.1.0/src/local_deep_research/web_search_engines/search_engines_config.py +54 -0
  58. local_deep_research-0.1.0/src/local_deep_research.egg-info/PKG-INFO +328 -0
  59. local_deep_research-0.1.0/src/local_deep_research.egg-info/SOURCES.txt +62 -0
  60. local_deep_research-0.1.0/src/local_deep_research.egg-info/dependency_links.txt +1 -0
  61. local_deep_research-0.1.0/src/local_deep_research.egg-info/entry_points.txt +3 -0
  62. local_deep_research-0.1.0/src/local_deep_research.egg-info/requires.txt +26 -0
  63. local_deep_research-0.1.0/src/local_deep_research.egg-info/top_level.txt +1 -0
  64. local_deep_research-0.1.0/tests/test_google_pse.py +206 -0
local_deep_research-0.1.0/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 LearningCircuit
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
local_deep_research-0.1.0/MANIFEST.in
@@ -0,0 +1,5 @@
+ include LICENSE
+ include README.md
+ include requirements.txt
+ recursive-include src/local_deep_research/web/templates *
+ recursive-include src/local_deep_research/web/static *
local_deep_research-0.1.0/PKG-INFO
@@ -0,0 +1,328 @@
+ Metadata-Version: 2.2
+ Name: local-deep-research
+ Version: 0.1.0
+ Summary: AI-powered research assistant with deep, iterative analysis using LLMs and web searches
+ Author-email: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>, HashedViking <6432677+HashedViking@users.noreply.github.com>
+ License: MIT License
+
+ Copyright (c) 2025 LearningCircuit
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Homepage, https://github.com/LearningCircuit/local-deep-research
+ Project-URL: Bug Tracker, https://github.com/LearningCircuit/local-deep-research/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: langchain>=0.3.18
+ Requires-Dist: langchain-community>=0.3.17
+ Requires-Dist: langchain-core>=0.3.34
+ Requires-Dist: langchain-ollama>=0.2.3
+ Requires-Dist: langchain-openai>=0.3.5
+ Requires-Dist: langchain_anthropic>=0.3.7
+ Requires-Dist: duckduckgo_search>=7.3.2
+ Requires-Dist: python-dateutil>=2.9.0
+ Requires-Dist: typing_extensions>=4.12.2
+ Requires-Dist: justext
+ Requires-Dist: playwright
+ Requires-Dist: beautifulsoup4
+ Requires-Dist: flask>=2.0.1
+ Requires-Dist: flask-cors>=3.0.10
+ Requires-Dist: flask-socketio>=5.1.1
+ Requires-Dist: sqlalchemy>=1.4.23
+ Requires-Dist: wikipedia
+ Requires-Dist: arxiv>=1.4.3
+ Requires-Dist: PyPDF2>=2.0.0
+ Requires-Dist: sentence-transformers
+ Requires-Dist: faiss-cpu
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: pydantic-settings>=2.0.0
+ Requires-Dist: toml>=0.10.2
+ Requires-Dist: platformdirs>=3.0.0
+ Requires-Dist: dynaconf
+
+ # Local Deep Research
+
+ A powerful AI-powered research assistant that performs deep, iterative analysis using multiple LLMs and web searches. The system can be run locally for privacy or configured to use cloud-based LLMs for enhanced capabilities.
+
+ ## Features
+
+ - 🔍 **Advanced Research Capabilities**
+   - Automated deep research with intelligent follow-up questions
+   - Citation tracking and source verification
+   - Multi-iteration analysis for comprehensive coverage
+   - Full webpage content analysis (not just snippets)
+
+ - 🤖 **Flexible LLM Support**
+   - Local AI processing with Ollama models
+   - Cloud LLM support (Claude, GPT)
+   - Supports all LangChain models
+   - Configurable model selection based on needs
+
+ - 📊 **Rich Output Options**
+   - Detailed research findings with citations
+   - Comprehensive research reports
+   - Quick summaries for rapid insights
+   - Source tracking and verification
+
+ - 🔒 **Privacy-Focused**
+   - Runs entirely on your machine when using local models
+   - Configurable search settings
+   - Transparent data handling
+
+ - 🌐 **Enhanced Search Integration**
+   - **Auto-selection of search sources**: The "auto" search engine intelligently analyzes your query and selects the most appropriate search engine based on its content
+   - Wikipedia integration for factual knowledge
+   - arXiv integration for scientific papers and academic research
+   - PubMed integration for biomedical literature and medical research
+   - DuckDuckGo integration for web searches (may experience rate limiting)
+   - SerpAPI integration for Google search results (requires API key)
+   - **Google Programmable Search Engine** integration for custom search experiences (requires API key)
+   - The Guardian integration for news articles and journalism (requires API key)
+   - **Local RAG search for private documents** - search your own documents with vector embeddings
+   - Full webpage content retrieval
+   - Source filtering and validation
+   - Configurable search parameters
+
+ - 📑 **Local Document Search (RAG)**
+   - Vector embedding-based search of your local documents
+   - Create custom document collections for different topics
+   - Privacy-preserving - your documents stay on your machine
+   - Intelligent chunking and retrieval
+   - Compatible with various document formats (PDF, text, markdown, etc.)
+   - Automatic integration with meta-search for unified queries
+
+ ## Example Research: Fusion Energy Developments
+
+ The repository includes complete research examples demonstrating the tool's capabilities. For instance, our [fusion energy research analysis](https://github.com/LearningCircuit/local-deep-research/blob/main/examples/fusion-energy-research-developments.md) provides a comprehensive overview of:
+
+ - Latest scientific breakthroughs in fusion research (2022-2025)
+ - Private sector funding developments exceeding $6 billion
+ - Expert projections for commercial fusion energy timelines
+ - Regulatory frameworks being developed for fusion deployment
+ - Technical challenges that must be overcome for commercial viability
+
+ This example showcases the system's ability to perform multiple research iterations, follow evidence trails across scientific and commercial domains, and synthesize information from diverse sources while maintaining proper citation.
+
+ ## Installation
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/local-deep-research.git
+ cd local-deep-research
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ playwright install
+ ```
+
+ 3. Install Ollama (for local models):
+ ```bash
+ # Install Ollama from https://ollama.ai, then pull the default model
+ ollama pull mistral  # Default model - many models work well; choose one that fits your hardware (ideally in GPU memory)
+ ```
+
+ 4. Configure environment variables:
+ ```bash
+ # Copy the template
+ cp .env.template .env
+
+ # Edit .env with your API keys (if using cloud LLMs)
+ ANTHROPIC_API_KEY=your-api-key-here # For Claude
+ OPENAI_API_KEY=your-openai-key-here # For GPT models
+ GUARDIAN_API_KEY=your-guardian-api-key-here # For The Guardian search
+ ```
+
+ ## Usage
+ Terminal usage (not recommended):
+ ```bash
+ python main.py
+ ```
+
+ ### Web Interface
+
+ The project includes a web interface for a more user-friendly experience:
+
+ ```bash
+ python app.py
+ ```
+
+ This will start a local web server, accessible at `http://127.0.0.1:5000` in your browser.
+
+ #### Web Interface Features:
+
+ - **Dashboard**: Intuitive interface for starting and managing research queries
+ - **Real-time Updates**: Track research progress with live updates
+ - **Research History**: Access and manage past research queries
+ - **PDF Export**: Download completed research reports as PDF documents
+ - **Research Management**: Terminate ongoing research processes or delete past records
+
+ ![Web Interface](./web1.png)
+ ![Web Interface](./web2.png)
+
+ ### Configuration
+
+ **Please report your best settings in issues so we can improve the default settings.**
+
+ Key settings in `config.py`:
+ ```python
+ # LLM Configuration
+ DEFAULT_MODEL = "mistral"  # Change based on your needs
+ DEFAULT_TEMPERATURE = 0.7
+ MAX_TOKENS = 8000
+
+ # Search Configuration
+ MAX_SEARCH_RESULTS = 40
+ SEARCH_REGION = "us-en"
+ TIME_PERIOD = "y"
+ SAFE_SEARCH = True
+ SEARCH_SNIPPETS_ONLY = False
+
+ # Choose search tool: "wiki", "arxiv", "duckduckgo", "guardian", "serp", "local_all", or "auto"
+ search_tool = "auto"  # "auto" will intelligently select the best search engine for your query
+ ```
+
+ ## Local Document Search (RAG)
+
+ The system includes powerful local document search capabilities using Retrieval-Augmented Generation (RAG). This allows you to search and retrieve content from your own document collections.
+
+ ### Setting Up Local Collections
+
+ Create a file named `local_collections.py` in the project root directory:
+
+ ```python
+ # local_collections.py
+ import os
+ from typing import Dict, Any
+
+ # Registry of local document collections
+ LOCAL_COLLECTIONS = {
+     # Research Papers Collection
+     "research_papers": {
+         "name": "Research Papers",
+         "description": "Academic research papers and articles",
+         "paths": [os.path.abspath("local_search_files/research_papers")],  # Use absolute paths
+         "enabled": True,
+         "embedding_model": "all-MiniLM-L6-v2",
+         "embedding_device": "cpu",
+         "embedding_model_type": "sentence_transformers",
+         "max_results": 20,
+         "max_filtered_results": 5,
+         "chunk_size": 800,  # Smaller chunks for academic content
+         "chunk_overlap": 150,
+         "cache_dir": ".cache/local_search/research_papers"
+     },
+
+     # Personal Notes Collection
+     "personal_notes": {
+         "name": "Personal Notes",
+         "description": "Personal notes and documents",
+         "paths": [os.path.abspath("local_search_files/personal_notes")],  # Use absolute paths
+         "enabled": True,
+         "embedding_model": "all-MiniLM-L6-v2",
+         "embedding_device": "cpu",
+         "embedding_model_type": "sentence_transformers",
+         "max_results": 30,
+         "max_filtered_results": 10,
+         "chunk_size": 500,  # Smaller chunks for notes
+         "chunk_overlap": 100,
+         "cache_dir": ".cache/local_search/personal_notes"
+     }
+ }
+ ```
+
+ Create directories for your collections:
+
+ ```bash
+ mkdir -p local_search_files/research_papers
+ mkdir -p local_search_files/personal_notes
+ ```
+
+ Add your documents to these folders, and the system will automatically index them and make them available for searching.
+
+ ### Using Local Search
+
+ You can use local search in several ways:
+
+ 1. **Auto-selection**: Set `search_tool = "auto"` in `config.py` and the system will automatically use your local collections when appropriate for the query.
+
+ 2. **Explicit Selection**: Set `search_tool = "research_papers"` to search only that specific collection.
+
+ 3. **Search All Local Collections**: Set `search_tool = "local_all"` to search across all your local document collections.
+
+ 4. **Query Syntax**: Use `collection:collection_name your query` to target a specific collection within a query.
+
+ ### Search Engine Options
+
+ The system supports multiple search engines that can be selected by changing the `search_tool` variable in `config.py`:
+
+ - **Auto** (`auto`): Intelligent search engine selector that analyzes your query and chooses the most appropriate source (Wikipedia, arXiv, local collections, etc.)
+ - **Wikipedia** (`wiki`): Best for general knowledge, facts, and overview information
+ - **arXiv** (`arxiv`): Great for scientific and academic research, accessing preprints and papers
+ - **PubMed** (`pubmed`): Excellent for biomedical literature, medical research, and health information
+ - **DuckDuckGo** (`duckduckgo`): General web search that doesn't require an API key
+ - **The Guardian** (`guardian`): Quality journalism and news articles (requires an API key)
+ - **SerpAPI** (`serp`): Google search results (requires an API key)
+ - **Google Programmable Search Engine** (`google_pse`): Custom search experiences with control over search scope and domains (requires API key and search engine ID)
+ - **Local Collections**: Any collections defined in your `local_collections.py` file
+
+ > **Note:** The "auto" option will intelligently select the best search engine based on your query. For example, if you ask about physics research papers, it might select arXiv or your research_papers collection, while if you ask about current events, it might select The Guardian or DuckDuckGo.
+
+ > **Support Free Knowledge:** If you frequently use the search engines in this tool, please consider making a donation to these organizations. They provide valuable services and rely on user support to maintain their operations:
+ > - [Donate to Wikipedia](https://donate.wikimedia.org)
+ > - [Support The Guardian](https://support.theguardian.com)
+ > - [Support arXiv](https://arxiv.org/about/give)
+ > - [Donate to DuckDuckGo](https://duckduckgo.com/donations)
+ > - [Support PubMed/NCBI](https://www.nlm.nih.gov/pubs/donations/donations.html)
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - Built with [Ollama](https://ollama.ai) for local AI processing
+ - Search powered by multiple sources:
+   - [Wikipedia](https://www.wikipedia.org/) for factual knowledge (default search engine)
+   - [arXiv](https://arxiv.org/) for scientific papers
+   - [PubMed](https://pubmed.ncbi.nlm.nih.gov/) for biomedical literature
+   - [DuckDuckGo](https://duckduckgo.com) for web search
+   - [The Guardian](https://www.theguardian.com/) for quality journalism
+   - [SerpAPI](https://serpapi.com) for Google search results (requires API key)
+ - Built on the [LangChain](https://github.com/hwchase17/langchain) framework
+ - Uses [justext](https://github.com/miso-belica/justext) for content extraction
+ - Uses [Playwright](https://playwright.dev) for web content retrieval
+ - Uses [FAISS](https://github.com/facebookresearch/faiss) for vector similarity search
+ - Uses [sentence-transformers](https://github.com/UKPLab/sentence-transformers) for embeddings
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ 1. Fork the repository
+ 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
+ 5. Open a Pull Request
+
+ ## Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=LearningCircuit/local-deep-research&type=Date)](https://www.star-history.com/#LearningCircuit/local-deep-research&Date)
local_deep_research-0.1.0/README.md
@@ -0,0 +1,266 @@
+ # Local Deep Research
+
+ A powerful AI-powered research assistant that performs deep, iterative analysis using multiple LLMs and web searches. The system can be run locally for privacy or configured to use cloud-based LLMs for enhanced capabilities.
+
+ ## Features
+
+ - 🔍 **Advanced Research Capabilities**
+   - Automated deep research with intelligent follow-up questions
+   - Citation tracking and source verification
+   - Multi-iteration analysis for comprehensive coverage
+   - Full webpage content analysis (not just snippets)
+
+ - 🤖 **Flexible LLM Support**
+   - Local AI processing with Ollama models
+   - Cloud LLM support (Claude, GPT)
+   - Supports all LangChain models
+   - Configurable model selection based on needs
+
+ - 📊 **Rich Output Options**
+   - Detailed research findings with citations
+   - Comprehensive research reports
+   - Quick summaries for rapid insights
+   - Source tracking and verification
+
+ - 🔒 **Privacy-Focused**
+   - Runs entirely on your machine when using local models
+   - Configurable search settings
+   - Transparent data handling
+
+ - 🌐 **Enhanced Search Integration**
+   - **Auto-selection of search sources**: The "auto" search engine intelligently analyzes your query and selects the most appropriate search engine based on its content
+   - Wikipedia integration for factual knowledge
+   - arXiv integration for scientific papers and academic research
+   - PubMed integration for biomedical literature and medical research
+   - DuckDuckGo integration for web searches (may experience rate limiting)
+   - SerpAPI integration for Google search results (requires API key)
+   - **Google Programmable Search Engine** integration for custom search experiences (requires API key)
+   - The Guardian integration for news articles and journalism (requires API key)
+   - **Local RAG search for private documents** - search your own documents with vector embeddings
+   - Full webpage content retrieval
+   - Source filtering and validation
+   - Configurable search parameters
+
+ - 📑 **Local Document Search (RAG)**
+   - Vector embedding-based search of your local documents
+   - Create custom document collections for different topics
+   - Privacy-preserving - your documents stay on your machine
+   - Intelligent chunking and retrieval
+   - Compatible with various document formats (PDF, text, markdown, etc.)
+   - Automatic integration with meta-search for unified queries
+
+ ## Example Research: Fusion Energy Developments
+
+ The repository includes complete research examples demonstrating the tool's capabilities. For instance, our [fusion energy research analysis](https://github.com/LearningCircuit/local-deep-research/blob/main/examples/fusion-energy-research-developments.md) provides a comprehensive overview of:
+
+ - Latest scientific breakthroughs in fusion research (2022-2025)
+ - Private sector funding developments exceeding $6 billion
+ - Expert projections for commercial fusion energy timelines
+ - Regulatory frameworks being developed for fusion deployment
+ - Technical challenges that must be overcome for commercial viability
+
+ This example showcases the system's ability to perform multiple research iterations, follow evidence trails across scientific and commercial domains, and synthesize information from diverse sources while maintaining proper citation.
+
+ ## Installation
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/local-deep-research.git
+ cd local-deep-research
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ playwright install
+ ```
+
+ 3. Install Ollama (for local models):
+ ```bash
+ # Install Ollama from https://ollama.ai, then pull the default model
+ ollama pull mistral  # Default model - many models work well; choose one that fits your hardware (ideally in GPU memory)
+ ```
+
+ 4. Configure environment variables:
+ ```bash
+ # Copy the template
+ cp .env.template .env
+
+ # Edit .env with your API keys (if using cloud LLMs)
+ ANTHROPIC_API_KEY=your-api-key-here # For Claude
+ OPENAI_API_KEY=your-openai-key-here # For GPT models
+ GUARDIAN_API_KEY=your-guardian-api-key-here # For The Guardian search
+ ```
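
With the steps above done, the wiring can be smoke-tested directly against the LangChain packages listed in `requirements.txt`. This is only a sketch: the model identifiers are placeholders rather than defaults shipped by this package, and both cloud clients read their keys from the environment variables set in `.env`.

```python
# Sketch only - checks that the models configured above are reachable.
# Model names are placeholders, not values defined by this package.
from langchain_ollama import ChatOllama        # local model pulled via `ollama pull mistral`
from langchain_anthropic import ChatAnthropic  # reads ANTHROPIC_API_KEY from the environment
from langchain_openai import ChatOpenAI        # reads OPENAI_API_KEY from the environment

local_llm = ChatOllama(model="mistral", temperature=0.7)
print(local_llm.invoke("Say 'ok' if you can hear me.").content)

# Uncomment if the corresponding keys are set in .env:
# claude = ChatAnthropic(model="claude-3-opus-20240229")
# gpt = ChatOpenAI(model="gpt-4o")
```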
+
+ ## Usage
+ Terminal usage (not recommended):
+ ```bash
+ python main.py
+ ```
+
+ ### Web Interface
+
+ The project includes a web interface for a more user-friendly experience:
+
+ ```bash
+ python app.py
+ ```
+
+ This will start a local web server, accessible at `http://127.0.0.1:5000` in your browser.
+
+ #### Web Interface Features:
+
+ - **Dashboard**: Intuitive interface for starting and managing research queries
+ - **Real-time Updates**: Track research progress with live updates
+ - **Research History**: Access and manage past research queries
+ - **PDF Export**: Download completed research reports as PDF documents
+ - **Research Management**: Terminate ongoing research processes or delete past records
+
+ ![Web Interface](./web1.png)
+ ![Web Interface](./web2.png)
+
+ ### Configuration
+
+ **Please report your best settings in issues so we can improve the default settings.**
+
+ Key settings in `config.py`:
+ ```python
+ # LLM Configuration
+ DEFAULT_MODEL = "mistral"  # Change based on your needs
+ DEFAULT_TEMPERATURE = 0.7
+ MAX_TOKENS = 8000
+
+ # Search Configuration
+ MAX_SEARCH_RESULTS = 40
+ SEARCH_REGION = "us-en"
+ TIME_PERIOD = "y"
+ SAFE_SEARCH = True
+ SEARCH_SNIPPETS_ONLY = False
+
+ # Choose search tool: "wiki", "arxiv", "duckduckgo", "guardian", "serp", "local_all", or "auto"
+ search_tool = "auto"  # "auto" will intelligently select the best search engine for your query
+ ```
+
+ ## Local Document Search (RAG)
+
+ The system includes powerful local document search capabilities using Retrieval-Augmented Generation (RAG). This allows you to search and retrieve content from your own document collections.
+
+ ### Setting Up Local Collections
+
+ Create a file named `local_collections.py` in the project root directory:
+
+ ```python
+ # local_collections.py
+ import os
+ from typing import Dict, Any
+
+ # Registry of local document collections
+ LOCAL_COLLECTIONS = {
+     # Research Papers Collection
+     "research_papers": {
+         "name": "Research Papers",
+         "description": "Academic research papers and articles",
+         "paths": [os.path.abspath("local_search_files/research_papers")],  # Use absolute paths
+         "enabled": True,
+         "embedding_model": "all-MiniLM-L6-v2",
+         "embedding_device": "cpu",
+         "embedding_model_type": "sentence_transformers",
+         "max_results": 20,
+         "max_filtered_results": 5,
+         "chunk_size": 800,  # Smaller chunks for academic content
+         "chunk_overlap": 150,
+         "cache_dir": ".cache/local_search/research_papers"
+     },
+
+     # Personal Notes Collection
+     "personal_notes": {
+         "name": "Personal Notes",
+         "description": "Personal notes and documents",
+         "paths": [os.path.abspath("local_search_files/personal_notes")],  # Use absolute paths
+         "enabled": True,
+         "embedding_model": "all-MiniLM-L6-v2",
+         "embedding_device": "cpu",
+         "embedding_model_type": "sentence_transformers",
+         "max_results": 30,
+         "max_filtered_results": 10,
+         "chunk_size": 500,  # Smaller chunks for notes
+         "chunk_overlap": 100,
+         "cache_dir": ".cache/local_search/personal_notes"
+     }
+ }
+ ```
+
+ Create directories for your collections:
+
+ ```bash
+ mkdir -p local_search_files/research_papers
+ mkdir -p local_search_files/personal_notes
+ ```
+
+ Add your documents to these folders, and the system will automatically index them and make them available for searching.
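
The indexing itself is handled by the package's local search engine (`search_engine_local.py` in the file list above). The sketch below is not that implementation; it only illustrates how a collection's settings plausibly map onto the `sentence-transformers` and `faiss-cpu` dependencies, with a hypothetical input file:

```python
# Illustrative sketch, not the package's actual indexing code. Shows how
# embedding_model, chunk_size and chunk_overlap from LOCAL_COLLECTIONS
# could drive a sentence-transformers + FAISS index.
import faiss
from sentence_transformers import SentenceTransformer

def chunk_text(text, size=800, overlap=150):
    # Sliding-window chunking matching the chunk_size/chunk_overlap settings.
    step = size - overlap
    return [text[i:i + size] for i in range(0, max(len(text) - overlap, 1), step)]

model = SentenceTransformer("all-MiniLM-L6-v2")  # embedding_model from the config
text = open("local_search_files/research_papers/example.txt").read()  # hypothetical file
chunks = chunk_text(text)
embeddings = model.encode(chunks, normalize_embeddings=True)

index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product on unit vectors = cosine
index.add(embeddings)

query = model.encode(["commercial fusion timelines"], normalize_embeddings=True)
scores, ids = index.search(query, 5)  # top-5 matching chunks
print([chunks[i] for i in ids[0]])
```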
+
+ ### Using Local Search
+
+ You can use local search in several ways:
+
+ 1. **Auto-selection**: Set `search_tool = "auto"` in `config.py` and the system will automatically use your local collections when appropriate for the query.
+
+ 2. **Explicit Selection**: Set `search_tool = "research_papers"` to search only that specific collection.
+
+ 3. **Search All Local Collections**: Set `search_tool = "local_all"` to search across all your local document collections.
+
+ 4. **Query Syntax**: Use `collection:collection_name your query` to target a specific collection within a query (examples follow this list).
+
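For example, with the two collections defined earlier, queries could look like the following (illustrative strings only; the collection names are the keys of `LOCAL_COLLECTIONS`):

```python
# Illustrative query strings for the collection: prefix from item 4.
queries = [
    "collection:research_papers recent advances in plasma confinement",
    "collection:personal_notes meeting notes on the fusion project",
    "current projections for commercial fusion",  # no prefix: routed by search_tool
]
```
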
+ ### Search Engine Options
+
+ The system supports multiple search engines that can be selected by changing the `search_tool` variable in `config.py`:
+
+ - **Auto** (`auto`): Intelligent search engine selector that analyzes your query and chooses the most appropriate source (Wikipedia, arXiv, local collections, etc.)
+ - **Wikipedia** (`wiki`): Best for general knowledge, facts, and overview information
+ - **arXiv** (`arxiv`): Great for scientific and academic research, accessing preprints and papers
+ - **PubMed** (`pubmed`): Excellent for biomedical literature, medical research, and health information
+ - **DuckDuckGo** (`duckduckgo`): General web search that doesn't require an API key
+ - **The Guardian** (`guardian`): Quality journalism and news articles (requires an API key)
+ - **SerpAPI** (`serp`): Google search results (requires an API key)
+ - **Google Programmable Search Engine** (`google_pse`): Custom search experiences with control over search scope and domains (requires API key and search engine ID)
+ - **Local Collections**: Any collections defined in your `local_collections.py` file
+
+ > **Note:** The "auto" option will intelligently select the best search engine based on your query. For example, if you ask about physics research papers, it might select arXiv or your research_papers collection, while if you ask about current events, it might select The Guardian or DuckDuckGo.
+
+ > **Support Free Knowledge:** If you frequently use the search engines in this tool, please consider making a donation to these organizations. They provide valuable services and rely on user support to maintain their operations:
+ > - [Donate to Wikipedia](https://donate.wikimedia.org)
+ > - [Support The Guardian](https://support.theguardian.com)
+ > - [Support arXiv](https://arxiv.org/about/give)
+ > - [Donate to DuckDuckGo](https://duckduckgo.com/donations)
+ > - [Support PubMed/NCBI](https://www.nlm.nih.gov/pubs/donations/donations.html)
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - Built with [Ollama](https://ollama.ai) for local AI processing
+ - Search powered by multiple sources:
+   - [Wikipedia](https://www.wikipedia.org/) for factual knowledge (default search engine)
+   - [arXiv](https://arxiv.org/) for scientific papers
+   - [PubMed](https://pubmed.ncbi.nlm.nih.gov/) for biomedical literature
+   - [DuckDuckGo](https://duckduckgo.com) for web search
+   - [The Guardian](https://www.theguardian.com/) for quality journalism
+   - [SerpAPI](https://serpapi.com) for Google search results (requires API key)
+ - Built on the [LangChain](https://github.com/hwchase17/langchain) framework
+ - Uses [justext](https://github.com/miso-belica/justext) for content extraction
+ - Uses [Playwright](https://playwright.dev) for web content retrieval
+ - Uses [FAISS](https://github.com/facebookresearch/faiss) for vector similarity search
+ - Uses [sentence-transformers](https://github.com/UKPLab/sentence-transformers) for embeddings
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ 1. Fork the repository
+ 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
+ 5. Open a Pull Request
+
+ ## Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=LearningCircuit/local-deep-research&type=Date)](https://www.star-history.com/#LearningCircuit/local-deep-research&Date)
local_deep_research-0.1.0/pyproject.toml
@@ -0,0 +1,63 @@
+ [build-system]
+ requires = ["setuptools>=42", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "local-deep-research"
+ version = "0.1.0"
+ description = "AI-powered research assistant with deep, iterative analysis using LLMs and web searches"
+ readme = "README.md"
+ requires-python = ">=3.8"
+ license = {file = "LICENSE"}
+ authors = [
+     {name = "LearningCircuit", email = "185559241+LearningCircuit@users.noreply.github.com"},
+     {name = "HashedViking", email = "6432677+HashedViking@users.noreply.github.com"}
+ ]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+ ]
+ dependencies = [
+     "langchain>=0.3.18",
+     "langchain-community>=0.3.17",
+     "langchain-core>=0.3.34",
+     "langchain-ollama>=0.2.3",
+     "langchain-openai>=0.3.5",
+     "langchain_anthropic>=0.3.7",
+     "duckduckgo_search>=7.3.2",
+     "python-dateutil>=2.9.0",
+     "typing_extensions>=4.12.2",
+     "justext",
+     "playwright",
+     "beautifulsoup4",
+     "flask>=2.0.1",
+     "flask-cors>=3.0.10",
+     "flask-socketio>=5.1.1",
+     "sqlalchemy>=1.4.23",
+     "wikipedia",
+     "arxiv>=1.4.3",
+     "PyPDF2>=2.0.0",
+     "sentence-transformers",
+     "faiss-cpu",
+     "pydantic>=2.0.0",
+     "pydantic-settings>=2.0.0",
+     "toml>=0.10.2",
+     "platformdirs>=3.0.0",
+     "dynaconf"
+ ]
+
+ [project.urls]
+ "Homepage" = "https://github.com/LearningCircuit/local-deep-research"
+ "Bug Tracker" = "https://github.com/LearningCircuit/local-deep-research/issues"
+
+ [project.scripts]
+ ldr = "local_deep_research.main:main"
+ ldr-web = "local_deep_research.web.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+
+ [tool.setuptools.package-data]
+ "local_deep_research.web" = ["templates/*", "static/*", "static/**/*"]
+ "local_deep_research.defaults" = ["*.toml", "*.py"]