local-deep-research 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research-0.1.0/LICENSE +21 -0
- local_deep_research-0.1.0/MANIFEST.in +5 -0
- local_deep_research-0.1.0/PKG-INFO +328 -0
- local_deep_research-0.1.0/README.md +266 -0
- local_deep_research-0.1.0/pyproject.toml +63 -0
- local_deep_research-0.1.0/requirements.txt +25 -0
- local_deep_research-0.1.0/setup.cfg +4 -0
- local_deep_research-0.1.0/src/local_deep_research/__init__.py +24 -0
- local_deep_research-0.1.0/src/local_deep_research/citation_handler.py +113 -0
- local_deep_research-0.1.0/src/local_deep_research/config.py +166 -0
- local_deep_research-0.1.0/src/local_deep_research/defaults/__init__.py +44 -0
- local_deep_research-0.1.0/src/local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research-0.1.0/src/local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research-0.1.0/src/local_deep_research/defaults/main.toml +57 -0
- local_deep_research-0.1.0/src/local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research-0.1.0/src/local_deep_research/local_collections.py +141 -0
- local_deep_research-0.1.0/src/local_deep_research/main.py +113 -0
- local_deep_research-0.1.0/src/local_deep_research/report_generator.py +206 -0
- local_deep_research-0.1.0/src/local_deep_research/search_system.py +241 -0
- local_deep_research-0.1.0/src/local_deep_research/utilties/__init__.py +0 -0
- local_deep_research-0.1.0/src/local_deep_research/utilties/enums.py +9 -0
- local_deep_research-0.1.0/src/local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research-0.1.0/src/local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research-0.1.0/src/local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research-0.1.0/src/local_deep_research/web/__init__.py +2 -0
- local_deep_research-0.1.0/src/local_deep_research/web/app.py +1209 -0
- local_deep_research-0.1.0/src/local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research-0.1.0/src/local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/index.html +312 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/settings.html +519 -0
- local_deep_research-0.1.0/src/local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research-0.1.0/src/local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0/src/local_deep_research.egg-info/PKG-INFO +328 -0
- local_deep_research-0.1.0/src/local_deep_research.egg-info/SOURCES.txt +62 -0
- local_deep_research-0.1.0/src/local_deep_research.egg-info/dependency_links.txt +1 -0
- local_deep_research-0.1.0/src/local_deep_research.egg-info/entry_points.txt +3 -0
- local_deep_research-0.1.0/src/local_deep_research.egg-info/requires.txt +26 -0
- local_deep_research-0.1.0/src/local_deep_research.egg-info/top_level.txt +1 -0
- local_deep_research-0.1.0/tests/test_google_pse.py +206 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 LearningCircuit
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,328 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: local-deep-research
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: AI-powered research assistant with deep, iterative analysis using LLMs and web searches
|
5
|
+
Author-email: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>, HashedViking <6432677+HashedViking@users.noreply.github.com>
|
6
|
+
License: MIT License
|
7
|
+
|
8
|
+
Copyright (c) 2025 LearningCircuit
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
18
|
+
copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26
|
+
SOFTWARE.
|
27
|
+
|
28
|
+
Project-URL: Homepage, https://github.com/LearningCircuit/local-deep-research
|
29
|
+
Project-URL: Bug Tracker, https://github.com/LearningCircuit/local-deep-research/issues
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
31
|
+
Classifier: License :: OSI Approved :: MIT License
|
32
|
+
Classifier: Operating System :: OS Independent
|
33
|
+
Requires-Python: >=3.8
|
34
|
+
Description-Content-Type: text/markdown
|
35
|
+
License-File: LICENSE
|
36
|
+
Requires-Dist: langchain>=0.3.18
|
37
|
+
Requires-Dist: langchain-community>=0.3.17
|
38
|
+
Requires-Dist: langchain-core>=0.3.34
|
39
|
+
Requires-Dist: langchain-ollama>=0.2.3
|
40
|
+
Requires-Dist: langchain-openai>=0.3.5
|
41
|
+
Requires-Dist: langchain_anthropic>=0.3.7
|
42
|
+
Requires-Dist: duckduckgo_search>=7.3.2
|
43
|
+
Requires-Dist: python-dateutil>=2.9.0
|
44
|
+
Requires-Dist: typing_extensions>=4.12.2
|
45
|
+
Requires-Dist: justext
|
46
|
+
Requires-Dist: playwright
|
47
|
+
Requires-Dist: beautifulsoup4
|
48
|
+
Requires-Dist: flask>=2.0.1
|
49
|
+
Requires-Dist: flask-cors>=3.0.10
|
50
|
+
Requires-Dist: flask-socketio>=5.1.1
|
51
|
+
Requires-Dist: sqlalchemy>=1.4.23
|
52
|
+
Requires-Dist: wikipedia
|
53
|
+
Requires-Dist: arxiv>=1.4.3
|
54
|
+
Requires-Dist: PyPDF2>=2.0.0
|
55
|
+
Requires-Dist: sentence-transformers
|
56
|
+
Requires-Dist: faiss-cpu
|
57
|
+
Requires-Dist: pydantic>=2.0.0
|
58
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
59
|
+
Requires-Dist: toml>=0.10.2
|
60
|
+
Requires-Dist: platformdirs>=3.0.0
|
61
|
+
Requires-Dist: dynaconf
|
62
|
+
|
63
|
+
# Local Deep Research
|
64
|
+
|
65
|
+
A powerful AI-powered research assistant that performs deep, iterative analysis using multiple LLMs and web searches. The system can be run locally for privacy or configured to use cloud-based LLMs for enhanced capabilities.
|
66
|
+
|
67
|
+
## Features
|
68
|
+
|
69
|
+
- 🔍 **Advanced Research Capabilities**
|
70
|
+
- Automated deep research with intelligent follow-up questions
|
71
|
+
- Citation tracking and source verification
|
72
|
+
- Multi-iteration analysis for comprehensive coverage
|
73
|
+
- Full webpage content analysis (not just snippets)
|
74
|
+
|
75
|
+
- 🤖 **Flexible LLM Support**
|
76
|
+
- Local AI processing with Ollama models
|
77
|
+
- Cloud LLM support (Claude, GPT)
|
78
|
+
- Supports all Langchain models
|
79
|
+
- Configurable model selection based on needs
|
80
|
+
|
81
|
+
- 📊 **Rich Output Options**
|
82
|
+
- Detailed research findings with citations
|
83
|
+
- Comprehensive research reports
|
84
|
+
- Quick summaries for rapid insights
|
85
|
+
- Source tracking and verification
|
86
|
+
|
87
|
+
- 🔒 **Privacy-Focused**
|
88
|
+
- Runs entirely on your machine when using local models
|
89
|
+
- Configurable search settings
|
90
|
+
- Transparent data handling
|
91
|
+
|
92
|
+
- 🌐 **Enhanced Search Integration**
|
93
|
+
- **Auto-selection of search sources**: The "auto" search engine intelligently analyzes your query and selects the most appropriate search engine based on the query content
|
94
|
+
- Wikipedia integration for factual knowledge
|
95
|
+
- arXiv integration for scientific papers and academic research
|
96
|
+
- PubMed integration for biomedical literature and medical research
|
97
|
+
- DuckDuckGo integration for web searches (may experience rate limiting)
|
98
|
+
- SerpAPI integration for Google search results (requires API key)
|
99
|
+
- **Google Programmable Search Engine** integration for custom search experiences (requires API key)
|
100
|
+
- The Guardian integration for news articles and journalism (requires API key)
|
101
|
+
- **Local RAG search for private documents** - search your own documents with vector embeddings
|
102
|
+
- Full webpage content retrieval
|
103
|
+
- Source filtering and validation
|
104
|
+
- Configurable search parameters
|
105
|
+
|
106
|
+
- 📑 **Local Document Search (RAG)**
|
107
|
+
- Vector embedding-based search of your local documents
|
108
|
+
- Create custom document collections for different topics
|
109
|
+
- Privacy-preserving - your documents stay on your machine
|
110
|
+
- Intelligent chunking and retrieval
|
111
|
+
- Compatible with various document formats (PDF, text, markdown, etc.)
|
112
|
+
- Automatic integration with meta-search for unified queries
|
113
|
+
|
114
|
+
## Example Research: Fusion Energy Developments
|
115
|
+
|
116
|
+
The repository includes complete research examples demonstrating the tool's capabilities. For instance, our [fusion energy research analysis](https://github.com/LearningCircuit/local-deep-research/blob/main/examples/fusion-energy-research-developments.md) provides a comprehensive overview of:
|
117
|
+
|
118
|
+
- Latest scientific breakthroughs in fusion research (2022-2025)
|
119
|
+
- Private sector funding developments exceeding $6 billion
|
120
|
+
- Expert projections for commercial fusion energy timelines
|
121
|
+
- Regulatory frameworks being developed for fusion deployment
|
122
|
+
- Technical challenges that must be overcome for commercial viability
|
123
|
+
|
124
|
+
This example showcases the system's ability to perform multiple research iterations, follow evidence trails across scientific and commercial domains, and synthesize information from diverse sources while maintaining proper citation.
|
125
|
+
|
126
|
+
## Installation
|
127
|
+
|
128
|
+
1. Clone the repository:
|
129
|
+
```bash
|
130
|
+
git clone https://github.com/LearningCircuit/local-deep-research.git
|
131
|
+
cd local-deep-research
|
132
|
+
```
|
133
|
+
|
134
|
+
2. Install dependencies:
|
135
|
+
```bash
|
136
|
+
pip install -r requirements.txt
|
137
|
+
playwright install
|
138
|
+
```
|
139
|
+
|
140
|
+
3. Install Ollama (for local models):
|
141
|
+
```bash
|
142
|
+
# Install Ollama from https://ollama.ai
|
143
|
+
ollama pull mistral # Default model; many other models work well - choose the best one that fits your hardware (ideally one that fits in GPU memory)
|
144
|
+
```
|
145
|
+
|
146
|
+
4. Configure environment variables:
|
147
|
+
```bash
|
148
|
+
# Copy the template
|
149
|
+
cp .env.template .env
|
150
|
+
|
151
|
+
# Edit .env with your API keys (if using cloud LLMs)
|
152
|
+
ANTHROPIC_API_KEY=your-api-key-here # For Claude
|
153
|
+
OPENAI_API_KEY=your-openai-key-here # For GPT models
|
154
|
+
GUARDIAN_API_KEY=your-guardian-api-key-here # For The Guardian search
|
155
|
+
```
|
156
|
+
|
157
|
+
## Usage
|
158
|
+
Terminal usage (not recommended):
|
159
|
+
```bash
|
160
|
+
python main.py
|
161
|
+
```
|
162
|
+
|
163
|
+
### Web Interface
|
164
|
+
|
165
|
+
The project includes a web interface for a more user-friendly experience:
|
166
|
+
|
167
|
+
```bash
|
168
|
+
python app.py
|
169
|
+
```
|
170
|
+
|
171
|
+
This will start a local web server, accessible at `http://127.0.0.1:5000` in your browser.
|
172
|
+
|
173
|
+
#### Web Interface Features:
|
174
|
+
|
175
|
+
- **Dashboard**: Intuitive interface for starting and managing research queries
|
176
|
+
- **Real-time Updates**: Track research progress with live updates
|
177
|
+
- **Research History**: Access and manage past research queries
|
178
|
+
- **PDF Export**: Download completed research reports as PDF documents
|
179
|
+
- **Research Management**: Terminate ongoing research processes or delete past records
|
180
|
+
|
181
|
+

|
182
|
+

|
183
|
+
### Configuration
|
184
|
+
**Please report your best settings in issues so we can improve the default settings.**
|
185
|
+
|
186
|
+
Key settings in `config.py`:
|
187
|
+
```python
|
188
|
+
# LLM Configuration
|
189
|
+
DEFAULT_MODEL = "mistral" # Change based on your needs
|
190
|
+
DEFAULT_TEMPERATURE = 0.7
|
191
|
+
MAX_TOKENS = 8000
|
192
|
+
|
193
|
+
# Search Configuration
|
194
|
+
MAX_SEARCH_RESULTS = 40
|
195
|
+
SEARCH_REGION = "us-en"
|
196
|
+
TIME_PERIOD = "y"
|
197
|
+
SAFE_SEARCH = True
|
198
|
+
SEARCH_SNIPPETS_ONLY = False
|
199
|
+
|
200
|
+
# Choose search tool: "wiki", "arxiv", "pubmed", "duckduckgo", "guardian", "serp", "google_pse", "local_all", or "auto"
|
201
|
+
search_tool = "auto" # "auto" will intelligently select the best search engine for your query
|
202
|
+
```
|
203
|
+
|
204
|
+
## Local Document Search (RAG)
|
205
|
+
|
206
|
+
The system includes powerful local document search capabilities using Retrieval-Augmented Generation (RAG). This allows you to search and retrieve content from your own document collections.
|
207
|
+
|
208
|
+
### Setting Up Local Collections
|
209
|
+
|
210
|
+
Create a file named `local_collections.py` in the project root directory:
|
211
|
+
|
212
|
+
```python
|
213
|
+
# local_collections.py
|
214
|
+
import os
|
215
|
+
from typing import Dict, Any
|
216
|
+
|
217
|
+
# Registry of local document collections
|
218
|
+
LOCAL_COLLECTIONS = {
|
219
|
+
# Research Papers Collection
|
220
|
+
"research_papers": {
|
221
|
+
"name": "Research Papers",
|
222
|
+
"description": "Academic research papers and articles",
|
223
|
+
"paths": [os.path.abspath("local_search_files/research_papers")], # Use absolute paths
|
224
|
+
"enabled": True,
|
225
|
+
"embedding_model": "all-MiniLM-L6-v2",
|
226
|
+
"embedding_device": "cpu",
|
227
|
+
"embedding_model_type": "sentence_transformers",
|
228
|
+
"max_results": 20,
|
229
|
+
"max_filtered_results": 5,
|
230
|
+
"chunk_size": 800, # Smaller chunks for academic content
|
231
|
+
"chunk_overlap": 150,
|
232
|
+
"cache_dir": ".cache/local_search/research_papers"
|
233
|
+
},
|
234
|
+
|
235
|
+
# Personal Notes Collection
|
236
|
+
"personal_notes": {
|
237
|
+
"name": "Personal Notes",
|
238
|
+
"description": "Personal notes and documents",
|
239
|
+
"paths": [os.path.abspath("local_search_files/personal_notes")], # Use absolute paths
|
240
|
+
"enabled": True,
|
241
|
+
"embedding_model": "all-MiniLM-L6-v2",
|
242
|
+
"embedding_device": "cpu",
|
243
|
+
"embedding_model_type": "sentence_transformers",
|
244
|
+
"max_results": 30,
|
245
|
+
"max_filtered_results": 10,
|
246
|
+
"chunk_size": 500, # Smaller chunks for notes
|
247
|
+
"chunk_overlap": 100,
|
248
|
+
"cache_dir": ".cache/local_search/personal_notes"
|
249
|
+
}
|
250
|
+
}
|
251
|
+
```
|
252
|
+
|
253
|
+
Create directories for your collections:
|
254
|
+
|
255
|
+
```bash
|
256
|
+
mkdir -p local_search_files/research_papers
|
257
|
+
mkdir -p local_search_files/personal_notes
|
258
|
+
```
|
259
|
+
|
260
|
+
Add your documents to these folders, and the system will automatically index them and make them available for searching.
|
261
|
+
|
262
|
+
### Using Local Search
|
263
|
+
|
264
|
+
You can use local search in several ways:
|
265
|
+
|
266
|
+
1. **Auto-selection**: Set `search_tool = "auto"` in `config.py` and the system will automatically use your local collections when appropriate for the query.
|
267
|
+
|
268
|
+
2. **Explicit Selection**: Set `search_tool = "research_papers"` to search only that specific collection.
|
269
|
+
|
270
|
+
3. **Search All Local Collections**: Set `search_tool = "local_all"` to search across all your local document collections.
|
271
|
+
|
272
|
+
4. **Query Syntax**: Use `collection:collection_name your query` to target a specific collection within a query.
|
273
|
+
|
274
|
+
### Search Engine Options
|
275
|
+
|
276
|
+
The system supports multiple search engines that can be selected by changing the `search_tool` variable in `config.py`:
|
277
|
+
|
278
|
+
- **Auto** (`auto`): Intelligent search engine selector that analyzes your query and chooses the most appropriate source (Wikipedia, arXiv, local collections, etc.)
|
279
|
+
- **Wikipedia** (`wiki`): Best for general knowledge, facts, and overview information
|
280
|
+
- **arXiv** (`arxiv`): Great for scientific and academic research, accessing preprints and papers
|
281
|
+
- **PubMed** (`pubmed`): Excellent for biomedical literature, medical research, and health information
|
282
|
+
- **DuckDuckGo** (`duckduckgo`): General web search that doesn't require an API key
|
283
|
+
- **The Guardian** (`guardian`): Quality journalism and news articles (requires an API key)
|
284
|
+
- **SerpAPI** (`serp`): Google search results (requires an API key)
|
285
|
+
- **Google Programmable Search Engine** (`google_pse`): Custom search experiences with control over search scope and domains (requires API key and search engine ID)
|
286
|
+
- **Local Collections**: Any collections defined in your `local_collections.py` file
|
287
|
+
|
288
|
+
> **Note:** The "auto" option will intelligently select the best search engine based on your query. For example, if you ask about physics research papers, it might select arXiv or your research_papers collection, while if you ask about current events, it might select The Guardian or DuckDuckGo.
|
289
|
+
|
290
|
+
> **Support Free Knowledge:** If you frequently use the search engines in this tool, please consider making a donation to these organizations. They provide valuable services and rely on user support to maintain their operations:
|
291
|
+
> - [Donate to Wikipedia](https://donate.wikimedia.org)
|
292
|
+
> - [Support The Guardian](https://support.theguardian.com)
|
293
|
+
> - [Support arXiv](https://arxiv.org/about/give)
|
294
|
+
> - [Donate to DuckDuckGo](https://duckduckgo.com/donations)
|
295
|
+
> - [Support PubMed/NCBI](https://www.nlm.nih.gov/pubs/donations/donations.html)
|
296
|
+
|
297
|
+
## License
|
298
|
+
|
299
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
300
|
+
|
301
|
+
## Acknowledgments
|
302
|
+
- Built with [Ollama](https://ollama.ai) for local AI processing
|
303
|
+
- Search powered by multiple sources:
|
304
|
+
- [Wikipedia](https://www.wikipedia.org/) for factual knowledge (default search engine)
|
305
|
+
- [arXiv](https://arxiv.org/) for scientific papers
|
306
|
+
- [PubMed](https://pubmed.ncbi.nlm.nih.gov/) for biomedical literature
|
307
|
+
- [DuckDuckGo](https://duckduckgo.com) for web search
|
308
|
+
- [The Guardian](https://www.theguardian.com/) for quality journalism
|
309
|
+
- [SerpAPI](https://serpapi.com) for Google search results (requires API key)
|
310
|
+
- Built on [LangChain](https://github.com/hwchase17/langchain) framework
|
311
|
+
- Uses [justext](https://github.com/miso-belica/justext) for content extraction
|
312
|
+
- [Playwright](https://playwright.dev) for web content retrieval
|
313
|
+
- Uses [FAISS](https://github.com/facebookresearch/faiss) for vector similarity search
|
314
|
+
- Uses [sentence-transformers](https://github.com/UKPLab/sentence-transformers) for embeddings
|
315
|
+
|
316
|
+
## Contributing
|
317
|
+
|
318
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
319
|
+
|
320
|
+
1. Fork the repository
|
321
|
+
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
322
|
+
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
323
|
+
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
324
|
+
5. Open a Pull Request
|
325
|
+
|
326
|
+
## Star History
|
327
|
+
|
328
|
+
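[![Star History Chart](https://api.star-history.com/svg?repos=LearningCircuit/local-deep-research&type=Date)](https://www.star-history.com/#LearningCircuit/local-deep-research&Date)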
|
@@ -0,0 +1,266 @@
|
|
1
|
+
# Local Deep Research
|
2
|
+
|
3
|
+
A powerful AI-powered research assistant that performs deep, iterative analysis using multiple LLMs and web searches. The system can be run locally for privacy or configured to use cloud-based LLMs for enhanced capabilities.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
- 🔍 **Advanced Research Capabilities**
|
8
|
+
- Automated deep research with intelligent follow-up questions
|
9
|
+
- Citation tracking and source verification
|
10
|
+
- Multi-iteration analysis for comprehensive coverage
|
11
|
+
- Full webpage content analysis (not just snippets)
|
12
|
+
|
13
|
+
- 🤖 **Flexible LLM Support**
|
14
|
+
- Local AI processing with Ollama models
|
15
|
+
- Cloud LLM support (Claude, GPT)
|
16
|
+
- Supports all Langchain models
|
17
|
+
- Configurable model selection based on needs
|
18
|
+
|
19
|
+
- 📊 **Rich Output Options**
|
20
|
+
- Detailed research findings with citations
|
21
|
+
- Comprehensive research reports
|
22
|
+
- Quick summaries for rapid insights
|
23
|
+
- Source tracking and verification
|
24
|
+
|
25
|
+
- 🔒 **Privacy-Focused**
|
26
|
+
- Runs entirely on your machine when using local models
|
27
|
+
- Configurable search settings
|
28
|
+
- Transparent data handling
|
29
|
+
|
30
|
+
- 🌐 **Enhanced Search Integration**
|
31
|
+
- **Auto-selection of search sources**: The "auto" search engine intelligently analyzes your query and selects the most appropriate search engine based on the query content
|
32
|
+
- Wikipedia integration for factual knowledge
|
33
|
+
- arXiv integration for scientific papers and academic research
|
34
|
+
- PubMed integration for biomedical literature and medical research
|
35
|
+
- DuckDuckGo integration for web searches (may experience rate limiting)
|
36
|
+
- SerpAPI integration for Google search results (requires API key)
|
37
|
+
- **Google Programmable Search Engine** integration for custom search experiences (requires API key)
|
38
|
+
- The Guardian integration for news articles and journalism (requires API key)
|
39
|
+
- **Local RAG search for private documents** - search your own documents with vector embeddings
|
40
|
+
- Full webpage content retrieval
|
41
|
+
- Source filtering and validation
|
42
|
+
- Configurable search parameters
|
43
|
+
|
44
|
+
- 📑 **Local Document Search (RAG)**
|
45
|
+
- Vector embedding-based search of your local documents
|
46
|
+
- Create custom document collections for different topics
|
47
|
+
- Privacy-preserving - your documents stay on your machine
|
48
|
+
- Intelligent chunking and retrieval
|
49
|
+
- Compatible with various document formats (PDF, text, markdown, etc.)
|
50
|
+
- Automatic integration with meta-search for unified queries
|
51
|
+
|
52
|
+
## Example Research: Fusion Energy Developments
|
53
|
+
|
54
|
+
The repository includes complete research examples demonstrating the tool's capabilities. For instance, our [fusion energy research analysis](https://github.com/LearningCircuit/local-deep-research/blob/main/examples/fusion-energy-research-developments.md) provides a comprehensive overview of:
|
55
|
+
|
56
|
+
- Latest scientific breakthroughs in fusion research (2022-2025)
|
57
|
+
- Private sector funding developments exceeding $6 billion
|
58
|
+
- Expert projections for commercial fusion energy timelines
|
59
|
+
- Regulatory frameworks being developed for fusion deployment
|
60
|
+
- Technical challenges that must be overcome for commercial viability
|
61
|
+
|
62
|
+
This example showcases the system's ability to perform multiple research iterations, follow evidence trails across scientific and commercial domains, and synthesize information from diverse sources while maintaining proper citation.
|
63
|
+
|
64
|
+
## Installation
|
65
|
+
|
66
|
+
1. Clone the repository:
|
67
|
+
```bash
|
68
|
+
git clone https://github.com/LearningCircuit/local-deep-research.git
|
69
|
+
cd local-deep-research
|
70
|
+
```
|
71
|
+
|
72
|
+
2. Install dependencies:
|
73
|
+
```bash
|
74
|
+
pip install -r requirements.txt
|
75
|
+
playwright install
|
76
|
+
```
|
77
|
+
|
78
|
+
3. Install Ollama (for local models):
|
79
|
+
```bash
|
80
|
+
# Install Ollama from https://ollama.ai
|
81
|
+
ollama pull mistral # Default model; many other models work well - choose the best one that fits your hardware (ideally one that fits in GPU memory)
|
82
|
+
```
|
83
|
+
|
84
|
+
4. Configure environment variables:
|
85
|
+
```bash
|
86
|
+
# Copy the template
|
87
|
+
cp .env.template .env
|
88
|
+
|
89
|
+
# Edit .env with your API keys (if using cloud LLMs)
|
90
|
+
ANTHROPIC_API_KEY=your-api-key-here # For Claude
|
91
|
+
OPENAI_API_KEY=your-openai-key-here # For GPT models
|
92
|
+
GUARDIAN_API_KEY=your-guardian-api-key-here # For The Guardian search
|
93
|
+
```
|
94
|
+
|
95
|
+
## Usage
|
96
|
+
Terminal usage (not recommended):
|
97
|
+
```bash
|
98
|
+
python main.py
|
99
|
+
```
|
100
|
+
|
101
|
+
### Web Interface
|
102
|
+
|
103
|
+
The project includes a web interface for a more user-friendly experience:
|
104
|
+
|
105
|
+
```bash
|
106
|
+
python app.py
|
107
|
+
```
|
108
|
+
|
109
|
+
This will start a local web server, accessible at `http://127.0.0.1:5000` in your browser.
|
110
|
+
|
111
|
+
#### Web Interface Features:
|
112
|
+
|
113
|
+
- **Dashboard**: Intuitive interface for starting and managing research queries
|
114
|
+
- **Real-time Updates**: Track research progress with live updates
|
115
|
+
- **Research History**: Access and manage past research queries
|
116
|
+
- **PDF Export**: Download completed research reports as PDF documents
|
117
|
+
- **Research Management**: Terminate ongoing research processes or delete past records
|
118
|
+
|
119
|
+

|
120
|
+

|
121
|
+
### Configuration
|
122
|
+
**Please report your best settings in issues so we can improve the default settings.**
|
123
|
+
|
124
|
+
Key settings in `config.py`:
|
125
|
+
```python
|
126
|
+
# LLM Configuration
|
127
|
+
DEFAULT_MODEL = "mistral" # Change based on your needs
|
128
|
+
DEFAULT_TEMPERATURE = 0.7
|
129
|
+
MAX_TOKENS = 8000
|
130
|
+
|
131
|
+
# Search Configuration
|
132
|
+
MAX_SEARCH_RESULTS = 40
|
133
|
+
SEARCH_REGION = "us-en"
|
134
|
+
TIME_PERIOD = "y"
|
135
|
+
SAFE_SEARCH = True
|
136
|
+
SEARCH_SNIPPETS_ONLY = False
|
137
|
+
|
138
|
+
# Choose search tool: "wiki", "arxiv", "pubmed", "duckduckgo", "guardian", "serp", "google_pse", "local_all", or "auto"
|
139
|
+
search_tool = "auto" # "auto" will intelligently select the best search engine for your query
|
140
|
+
```
|
141
|
+
|
142
|
+
## Local Document Search (RAG)
|
143
|
+
|
144
|
+
The system includes powerful local document search capabilities using Retrieval-Augmented Generation (RAG). This allows you to search and retrieve content from your own document collections.
|
145
|
+
|
146
|
+
### Setting Up Local Collections
|
147
|
+
|
148
|
+
Create a file named `local_collections.py` in the project root directory:
|
149
|
+
|
150
|
+
```python
|
151
|
+
# local_collections.py
|
152
|
+
import os
|
153
|
+
from typing import Dict, Any
|
154
|
+
|
155
|
+
# Registry of local document collections
|
156
|
+
LOCAL_COLLECTIONS = {
|
157
|
+
# Research Papers Collection
|
158
|
+
"research_papers": {
|
159
|
+
"name": "Research Papers",
|
160
|
+
"description": "Academic research papers and articles",
|
161
|
+
"paths": [os.path.abspath("local_search_files/research_papers")], # Use absolute paths
|
162
|
+
"enabled": True,
|
163
|
+
"embedding_model": "all-MiniLM-L6-v2",
|
164
|
+
"embedding_device": "cpu",
|
165
|
+
"embedding_model_type": "sentence_transformers",
|
166
|
+
"max_results": 20,
|
167
|
+
"max_filtered_results": 5,
|
168
|
+
"chunk_size": 800, # Smaller chunks for academic content
|
169
|
+
"chunk_overlap": 150,
|
170
|
+
"cache_dir": ".cache/local_search/research_papers"
|
171
|
+
},
|
172
|
+
|
173
|
+
# Personal Notes Collection
|
174
|
+
"personal_notes": {
|
175
|
+
"name": "Personal Notes",
|
176
|
+
"description": "Personal notes and documents",
|
177
|
+
"paths": [os.path.abspath("local_search_files/personal_notes")], # Use absolute paths
|
178
|
+
"enabled": True,
|
179
|
+
"embedding_model": "all-MiniLM-L6-v2",
|
180
|
+
"embedding_device": "cpu",
|
181
|
+
"embedding_model_type": "sentence_transformers",
|
182
|
+
"max_results": 30,
|
183
|
+
"max_filtered_results": 10,
|
184
|
+
"chunk_size": 500, # Smaller chunks for notes
|
185
|
+
"chunk_overlap": 100,
|
186
|
+
"cache_dir": ".cache/local_search/personal_notes"
|
187
|
+
}
|
188
|
+
}
|
189
|
+
```
|
190
|
+
|
191
|
+
Create directories for your collections:
|
192
|
+
|
193
|
+
```bash
|
194
|
+
mkdir -p local_search_files/research_papers
|
195
|
+
mkdir -p local_search_files/personal_notes
|
196
|
+
```
|
197
|
+
|
198
|
+
Add your documents to these folders, and the system will automatically index them and make them available for searching.
|
199
|
+
|
200
|
+
### Using Local Search
|
201
|
+
|
202
|
+
You can use local search in several ways:
|
203
|
+
|
204
|
+
1. **Auto-selection**: Set `search_tool = "auto"` in `config.py` and the system will automatically use your local collections when appropriate for the query.
|
205
|
+
|
206
|
+
2. **Explicit Selection**: Set `search_tool` to a collection's key, e.g. `search_tool = "research_papers"`, to search only that specific collection.
|
207
|
+
|
208
|
+
3. **Search All Local Collections**: Set `search_tool = "local_all"` to search across all your local document collections.
|
209
|
+
|
210
|
+
4. **Query Syntax**: Use `collection:collection_name your query` to target a specific collection within a query.
|
211
|
+
|
212
|
+
### Search Engine Options
|
213
|
+
|
214
|
+
The system supports multiple search engines that can be selected by changing the `search_tool` variable in `config.py`:
|
215
|
+
|
216
|
+
- **Auto** (`auto`): Intelligent search engine selector that analyzes your query and chooses the most appropriate source (Wikipedia, arXiv, local collections, etc.)
|
217
|
+
- **Wikipedia** (`wiki`): Best for general knowledge, facts, and overview information
|
218
|
+
- **arXiv** (`arxiv`): Great for scientific and academic research, accessing preprints and papers
|
219
|
+
- **PubMed** (`pubmed`): Excellent for biomedical literature, medical research, and health information
|
220
|
+
- **DuckDuckGo** (`duckduckgo`): General web search that doesn't require an API key
|
221
|
+
- **The Guardian** (`guardian`): Quality journalism and news articles (requires an API key)
|
222
|
+
- **SerpAPI** (`serp`): Google search results (requires an API key)
|
223
|
+
- **Google Programmable Search Engine** (`google_pse`): Custom search experiences with control over search scope and domains (requires API key and search engine ID)
|
224
|
+
- **Local Collections**: Any collections defined in your `local_collections.py` file
|
225
|
+
|
226
|
+
> **Note:** The "auto" option will intelligently select the best search engine based on your query. For example, if you ask about physics research papers, it might select arXiv or your research_papers collection, while if you ask about current events, it might select The Guardian or DuckDuckGo.
|
227
|
+
|
228
|
+
> **Support Free Knowledge:** If you frequently use the search engines in this tool, please consider making a donation to these organizations. They provide valuable services and rely on user support to maintain their operations:
|
229
|
+
> - [Donate to Wikipedia](https://donate.wikimedia.org)
|
230
|
+
> - [Support The Guardian](https://support.theguardian.com)
|
231
|
+
> - [Support arXiv](https://arxiv.org/about/give)
|
232
|
+
> - [Donate to DuckDuckGo](https://duckduckgo.com/donations)
|
233
|
+
> - [Support PubMed/NCBI](https://www.nlm.nih.gov/pubs/donations/donations.html)
|
234
|
+
|
235
|
+
## License
|
236
|
+
|
237
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
238
|
+
|
239
|
+
## Acknowledgments
|
240
|
+
- Built with [Ollama](https://ollama.ai) for local AI processing
|
241
|
+
- Search powered by multiple sources:
|
242
|
+
- [Wikipedia](https://www.wikipedia.org/) for factual knowledge (default search engine)
|
243
|
+
- [arXiv](https://arxiv.org/) for scientific papers
|
244
|
+
- [PubMed](https://pubmed.ncbi.nlm.nih.gov/) for biomedical literature
|
245
|
+
- [DuckDuckGo](https://duckduckgo.com) for web search
|
246
|
+
- [The Guardian](https://www.theguardian.com/) for quality journalism
|
247
|
+
- [SerpAPI](https://serpapi.com) for Google search results (requires API key)
|
248
|
+
- Built on [LangChain](https://github.com/hwchase17/langchain) framework
|
249
|
+
- Uses [justext](https://github.com/miso-belica/justext) for content extraction
|
250
|
+
- Uses [Playwright](https://playwright.dev) for web content retrieval
|
251
|
+
- Uses [FAISS](https://github.com/facebookresearch/faiss) for vector similarity search
|
252
|
+
- Uses [sentence-transformers](https://github.com/UKPLab/sentence-transformers) for embeddings
|
253
|
+
|
254
|
+
## Contributing
|
255
|
+
|
256
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
257
|
+
|
258
|
+
1. Fork the repository
|
259
|
+
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
260
|
+
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
261
|
+
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
262
|
+
5. Open a Pull Request
|
263
|
+
|
264
|
+
## Star History
|
265
|
+
|
266
|
+
[Star History Chart](https://www.star-history.com/#LearningCircuit/local-deep-research&Date)
|
@@ -0,0 +1,63 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=42", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "local-deep-research"
|
7
|
+
version = "0.1.0"
|
8
|
+
description = "AI-powered research assistant with deep, iterative analysis using LLMs and web searches"
|
9
|
+
readme = "README.md"
|
10
|
+
requires-python = ">=3.8"
|
11
|
+
license = {file = "LICENSE"}
|
12
|
+
authors = [
|
13
|
+
{name = "LearningCircuit", email = "185559241+LearningCircuit@users.noreply.github.com"},
|
14
|
+
{name = "HashedViking", email = "6432677+HashedViking@users.noreply.github.com"}
|
15
|
+
]
|
16
|
+
classifiers = [
|
17
|
+
"Programming Language :: Python :: 3",
|
18
|
+
"License :: OSI Approved :: MIT License",
|
19
|
+
"Operating System :: OS Independent",
|
20
|
+
]
|
21
|
+
dependencies = [
|
22
|
+
"langchain>=0.3.18",
|
23
|
+
"langchain-community>=0.3.17",
|
24
|
+
"langchain-core>=0.3.34",
|
25
|
+
"langchain-ollama>=0.2.3",
|
26
|
+
"langchain-openai>=0.3.5",
|
27
|
+
"langchain_anthropic>=0.3.7",
|
28
|
+
"duckduckgo_search>=7.3.2",
|
29
|
+
"python-dateutil>=2.9.0",
|
30
|
+
"typing_extensions>=4.12.2",
|
31
|
+
"justext",
|
32
|
+
"playwright",
|
33
|
+
"beautifulsoup4",
|
34
|
+
"flask>=2.0.1",
|
35
|
+
"flask-cors>=3.0.10",
|
36
|
+
"flask-socketio>=5.1.1",
|
37
|
+
"sqlalchemy>=1.4.23",
|
38
|
+
"wikipedia",
|
39
|
+
"arxiv>=1.4.3",
|
40
|
+
"PyPDF2>=2.0.0",
|
41
|
+
"sentence-transformers",
|
42
|
+
"faiss-cpu",
|
43
|
+
"pydantic>=2.0.0",
|
44
|
+
"pydantic-settings>=2.0.0",
|
45
|
+
"toml>=0.10.2",
|
46
|
+
"platformdirs>=3.0.0",
|
47
|
+
"dynaconf"
|
48
|
+
]
|
49
|
+
|
50
|
+
[project.urls]
|
51
|
+
"Homepage" = "https://github.com/LearningCircuit/local-deep-research"
|
52
|
+
"Bug Tracker" = "https://github.com/LearningCircuit/local-deep-research/issues"
|
53
|
+
|
54
|
+
[project.scripts]
|
55
|
+
ldr = "local_deep_research.main:main"
|
56
|
+
ldr-web = "local_deep_research.web.app:main"
|
57
|
+
|
58
|
+
[tool.setuptools]
|
59
|
+
include-package-data = true
|
60
|
+
|
61
|
+
[tool.setuptools.package-data]
|
62
|
+
"local_deep_research.web" = ["templates/*", "static/*", "static/**/*"]
|
63
|
+
"local_deep_research.defaults" = ["*.toml", "*.py"]
|