local-deep-research 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. local_deep_research/__init__.py +24 -0
  2. local_deep_research/citation_handler.py +113 -0
  3. local_deep_research/config.py +166 -0
  4. local_deep_research/defaults/__init__.py +44 -0
  5. local_deep_research/defaults/llm_config.py +269 -0
  6. local_deep_research/defaults/local_collections.toml +47 -0
  7. local_deep_research/defaults/main.toml +57 -0
  8. local_deep_research/defaults/search_engines.toml +244 -0
  9. local_deep_research/local_collections.py +141 -0
  10. local_deep_research/main.py +113 -0
  11. local_deep_research/report_generator.py +206 -0
  12. local_deep_research/search_system.py +241 -0
  13. local_deep_research/utilties/__init__.py +0 -0
  14. local_deep_research/utilties/enums.py +9 -0
  15. local_deep_research/utilties/llm_utils.py +116 -0
  16. local_deep_research/utilties/search_utilities.py +115 -0
  17. local_deep_research/utilties/setup_utils.py +6 -0
  18. local_deep_research/web/__init__.py +2 -0
  19. local_deep_research/web/app.py +1209 -0
  20. local_deep_research/web/static/css/styles.css +1008 -0
  21. local_deep_research/web/static/js/app.js +2078 -0
  22. local_deep_research/web/templates/api_keys_config.html +82 -0
  23. local_deep_research/web/templates/collections_config.html +90 -0
  24. local_deep_research/web/templates/index.html +312 -0
  25. local_deep_research/web/templates/llm_config.html +120 -0
  26. local_deep_research/web/templates/main_config.html +89 -0
  27. local_deep_research/web/templates/search_engines_config.html +154 -0
  28. local_deep_research/web/templates/settings.html +519 -0
  29. local_deep_research/web/templates/settings_dashboard.html +207 -0
  30. local_deep_research/web_search_engines/__init__.py +0 -0
  31. local_deep_research/web_search_engines/engines/__init__.py +0 -0
  32. local_deep_research/web_search_engines/engines/full_search.py +128 -0
  33. local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  34. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  35. local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  36. local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  37. local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  38. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  39. local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  40. local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  41. local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  42. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  43. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  44. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  45. local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  46. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  47. local_deep_research/web_search_engines/full_search.py +254 -0
  48. local_deep_research/web_search_engines/search_engine_base.py +197 -0
  49. local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  50. local_deep_research/web_search_engines/search_engines_config.py +54 -0
  51. local_deep_research-0.1.0.dist-info/LICENSE +21 -0
  52. local_deep_research-0.1.0.dist-info/METADATA +328 -0
  53. local_deep_research-0.1.0.dist-info/RECORD +56 -0
  54. local_deep_research-0.1.0.dist-info/WHEEL +5 -0
  55. local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
  56. local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_wayback.py
@@ -0,0 +1,474 @@
+ import requests
+ import logging
+ from typing import Dict, List, Any, Optional, Tuple
+ from langchain_core.language_models import BaseLLM
+ import re
+ import time
+ from datetime import datetime
+
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
+ from local_deep_research import config
+ import justext
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class WaybackSearchEngine(BaseSearchEngine):
+     """
+     Internet Archive Wayback Machine search engine implementation
+     Provides access to historical versions of web pages
+     """
+
+     def __init__(self,
+                  max_results: int = 10,
+                  max_snapshots_per_url: int = 3,
+                  llm: Optional[BaseLLM] = None,
+                  language: str = "English",
+                  max_filtered_results: Optional[int] = None,
+                  closest_only: bool = False):
+         """
+         Initialize the Wayback Machine search engine.
+
+         Args:
+             max_results: Maximum number of search results
+             max_snapshots_per_url: Maximum snapshots to retrieve per URL
+             llm: Language model for relevance filtering
+             language: Language for content processing
+             max_filtered_results: Maximum number of results to keep after filtering
+             closest_only: If True, only retrieves the closest snapshot for each URL
+         """
+         # Initialize the BaseSearchEngine with the LLM and max_filtered_results
+         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
+
+         self.max_results = max_results
+         self.max_snapshots_per_url = max_snapshots_per_url
+         self.language = language
+         self.closest_only = closest_only
+
+         # API endpoints
+         self.available_api = "https://archive.org/wayback/available"
+         self.cdx_api = "https://web.archive.org/cdx/search/cdx"
+
+     def _extract_urls_from_query(self, query: str) -> List[str]:
+         """
+         Extract URLs from a query string or interpret as an URL if possible.
+         For non-URL queries, use a DuckDuckGo search to find relevant URLs.
+
+         Args:
+             query: The search query or URL
+
+         Returns:
+             List of URLs to search in the Wayback Machine
+         """
+         # Check if the query is already a URL
+         url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
+         urls = url_pattern.findall(query)
+
+         if urls:
+             logger.info(f"Found {len(urls)} URLs in query")
+             return urls
+
+         # Check if query is a domain without http prefix
+         domain_pattern = re.compile(r'^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$')
+         if domain_pattern.match(query):
+             logger.info(f"Query appears to be a domain: {query}")
+             return [f"http://{query}"]
+
+         # For non-URL queries, use DuckDuckGo to find relevant URLs
+         logger.info(f"Query is not a URL, using DuckDuckGo to find relevant URLs")
+         try:
+             # Import DuckDuckGo search engine
+             from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
+             ddg = DuckDuckGoSearchAPIWrapper(max_results=5)
+             results = ddg.results(query)
+
+             # Extract URLs from results
+             ddg_urls = [result.get("link") for result in results if result.get("link")]
+             if ddg_urls:
+                 logger.info(f"Found {len(ddg_urls)} URLs from DuckDuckGo search")
+                 return ddg_urls
+         except Exception as e:
+             logger.error(f"Error using DuckDuckGo for URL discovery: {e}")
+
+         # Fallback: treat the query as a potential domain or path
+         if "/" in query and "." in query:
+             logger.info(f"Treating query as a partial URL: {query}")
+             return [f"http://{query}"]
+         elif "." in query:
+             logger.info(f"Treating query as a domain: {query}")
+             return [f"http://{query}"]
+
+         # Return empty list if nothing worked
+         logger.warning(f"Could not extract any URLs from query: {query}")
+         return []
+
+     def _format_timestamp(self, timestamp: str) -> str:
+         """Format Wayback Machine timestamp into readable date"""
+         if len(timestamp) < 14:
+             return timestamp
+
+         try:
+             year = timestamp[0:4]
+             month = timestamp[4:6]
+             day = timestamp[6:8]
+             hour = timestamp[8:10]
+             minute = timestamp[10:12]
+             second = timestamp[12:14]
+             return f"{year}-{month}-{day} {hour}:{minute}:{second}"
+         except:
+             return timestamp
+
+     def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
+         """
+         Get snapshots from the Wayback Machine for a specific URL.
+
+         Args:
+             url: URL to get snapshots for
+
+         Returns:
+             List of snapshot dictionaries
+         """
+         snapshots = []
+
+         try:
+             if self.closest_only:
+                 # Get only the closest snapshot
+                 response = requests.get(
+                     self.available_api,
+                     params={"url": url}
+                 )
+                 data = response.json()
+
+                 if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
+                     snapshot = data["archived_snapshots"]["closest"]
+                     snapshot_url = snapshot["url"]
+                     timestamp = snapshot["timestamp"]
+
+                     snapshots.append({
+                         "timestamp": timestamp,
+                         "formatted_date": self._format_timestamp(timestamp),
+                         "url": snapshot_url,
+                         "original_url": url,
+                         "available": snapshot.get("available", True),
+                         "status": snapshot.get("status", "200")
+                     })
+             else:
+                 # Get multiple snapshots using CDX API
+                 response = requests.get(
+                     self.cdx_api,
+                     params={
+                         "url": url,
+                         "output": "json",
+                         "fl": "timestamp,original,statuscode,mimetype",
+                         "collapse": "timestamp:4", # Group by year
+                         "limit": self.max_snapshots_per_url
+                     }
+                 )
+
+                 # Check if response is valid JSON
+                 data = response.json()
+
+                 # First item is the header
+                 if len(data) > 1:
+                     headers = data[0]
+                     for item in data[1:]:
+                         snapshot = dict(zip(headers, item))
+                         timestamp = snapshot.get("timestamp", "")
+
+                         wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
+
+                         snapshots.append({
+                             "timestamp": timestamp,
+                             "formatted_date": self._format_timestamp(timestamp),
+                             "url": wayback_url,
+                             "original_url": url,
+                             "available": True,
+                             "status": snapshot.get("statuscode", "200")
+                         })
+
+                     # Limit to max snapshots per URL
+                     snapshots = snapshots[:self.max_snapshots_per_url]
+
+         except Exception as e:
+             logger.error(f"Error getting Wayback snapshots for {url}: {e}")
+
+         return snapshots
+
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information for Wayback Machine snapshots.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries
+         """
+         logger.info(f"Getting Wayback Machine previews for query: {query}")
+
+         # Extract URLs from query
+         urls = self._extract_urls_from_query(query)
+
+         if not urls:
+             logger.warning(f"No URLs found in query: {query}")
+             return []
+
+         # Get snapshots for each URL
+         all_snapshots = []
+         for url in urls:
+             snapshots = self._get_wayback_snapshots(url)
+             all_snapshots.extend(snapshots)
+
+             # Respect rate limits
+             if len(urls) > 1:
+                 time.sleep(0.5)
+
+         # Format as previews
+         previews = []
+         for snapshot in all_snapshots:
+             preview = {
+                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
+                 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})",
+                 "link": snapshot["url"],
+                 "snippet": f"Archived version from {snapshot['formatted_date']}",
+                 "original_url": snapshot["original_url"],
+                 "timestamp": snapshot["timestamp"],
+                 "formatted_date": snapshot["formatted_date"]
+             }
+             previews.append(preview)
+
+         logger.info(f"Found {len(previews)} Wayback Machine snapshots")
+         return previews
+
+     def _remove_boilerplate(self, html: str) -> str:
+         """
+         Remove boilerplate content from HTML.
+
+         Args:
+             html: HTML content
+
+         Returns:
+             Cleaned text content
+         """
+         if not html or not html.strip():
+             return ""
+         try:
+             paragraphs = justext.justext(html, justext.get_stoplist(self.language))
+             cleaned = "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
+             return cleaned
+         except Exception as e:
+             logger.error(f"Error removing boilerplate: {e}")
+             return html
+
+     def _get_wayback_content(self, url: str) -> Tuple[str, str]:
+         """
+         Retrieve content from a Wayback Machine URL.
+
+         Args:
+             url: Wayback Machine URL
+
+         Returns:
+             Tuple of (raw_html, cleaned_text)
+         """
+         try:
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Local Deep Research Bot; research project)"
+             }
+             response = requests.get(url, headers=headers, timeout=10)
+             raw_html = response.text
+
+             # Clean the HTML
+             cleaned_text = self._remove_boilerplate(raw_html)
+
+             return raw_html, cleaned_text
+         except Exception as e:
+             logger.error(f"Error retrieving content from {url}: {e}")
+             return "", f"Error retrieving content: {str(e)}"
+
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant Wayback Machine snapshots.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         # Check if we should add full content
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             logger.info("Snippet-only mode, skipping full content retrieval")
+             return relevant_items
+
+         logger.info(f"Getting full content for {len(relevant_items)} Wayback Machine snapshots")
+
+         results = []
+         for item in relevant_items:
+             wayback_url = item.get("link")
+             if not wayback_url:
+                 results.append(item)
+                 continue
+
+             logger.info(f"Retrieving content from {wayback_url}")
+
+             try:
+                 # Retrieve content
+                 raw_html, full_content = self._get_wayback_content(wayback_url)
+
+                 # Add full content to the result
+                 result = item.copy()
+                 result["raw_html"] = raw_html
+                 result["full_content"] = full_content
+
+                 results.append(result)
+
+                 # Brief pause for rate limiting
+                 time.sleep(0.5)
+             except Exception as e:
+                 logger.error(f"Error processing {wayback_url}: {e}")
+                 results.append(item)
+
+         return results
+
+     def search_by_url(self, url: str, max_snapshots: int = None) -> List[Dict[str, Any]]:
+         """
+         Search for archived versions of a specific URL.
+
+         Args:
+             url: The URL to search for archives
+             max_snapshots: Maximum number of snapshots to return
+
+         Returns:
+             List of snapshot dictionaries
+         """
+         max_snapshots = max_snapshots or self.max_snapshots_per_url
+
+         snapshots = self._get_wayback_snapshots(url)
+         previews = []
+
+         for snapshot in snapshots[:max_snapshots]:
+             preview = {
+                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
+                 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})",
+                 "link": snapshot["url"],
+                 "snippet": f"Archived version from {snapshot['formatted_date']}",
+                 "original_url": snapshot["original_url"],
+                 "timestamp": snapshot["timestamp"],
+                 "formatted_date": snapshot["formatted_date"]
+             }
+             previews.append(preview)
+
+         # Get full content if not in snippets-only mode
+         if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+             return self._get_full_content(previews)
+
+         return previews
+
+     def search_by_date_range(self, url: str, start_date: str, end_date: str) -> List[Dict[str, Any]]:
+         """
+         Search for archived versions of a URL within a date range.
+
+         Args:
+             url: The URL to search for archives
+             start_date: Start date in format YYYYMMDD
+             end_date: End date in format YYYYMMDD
+
+         Returns:
+             List of snapshot dictionaries
+         """
+         try:
+             # Use CDX API with date range
+             response = requests.get(
+                 self.cdx_api,
+                 params={
+                     "url": url,
+                     "output": "json",
+                     "fl": "timestamp,original,statuscode,mimetype",
+                     "from": start_date,
+                     "to": end_date,
+                     "limit": self.max_snapshots_per_url
+                 }
+             )
+
+             # Process response
+             data = response.json()
+
+             # First item is the header
+             if len(data) <= 1:
+                 return []
+
+             headers = data[0]
+             snapshots = []
+
+             for item in data[1:]:
+                 snapshot = dict(zip(headers, item))
+                 timestamp = snapshot.get("timestamp", "")
+
+                 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
+
+                 snapshots.append({
+                     "id": f"{timestamp}_{url}",
+                     "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
+                     "link": wayback_url,
+                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                     "original_url": url,
+                     "timestamp": timestamp,
+                     "formatted_date": self._format_timestamp(timestamp)
+                 })
+
+             # Get full content if not in snippets-only mode
+             if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+                 return self._get_full_content(snapshots)
+
+             return snapshots
+
+         except Exception as e:
+             logger.error(f"Error searching date range for {url}: {e}")
+             return []
+
+     def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]:
+         """
+         Get the most recent snapshot of a URL.
+
+         Args:
+             url: The URL to get the latest snapshot for
+
+         Returns:
+             Dictionary with snapshot information or None if not found
+         """
+         try:
+             response = requests.get(
+                 self.available_api,
+                 params={"url": url}
+             )
+             data = response.json()
+
+             if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
+                 snapshot = data["archived_snapshots"]["closest"]
+                 timestamp = snapshot["timestamp"]
+                 wayback_url = snapshot["url"]
+
+                 result = {
+                     "id": f"{timestamp}_{url}",
+                     "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})",
+                     "link": wayback_url,
+                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                     "original_url": url,
+                     "timestamp": timestamp,
+                     "formatted_date": self._format_timestamp(timestamp)
+                 }
+
+                 # Get full content if not in snippets-only mode
+                 if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+                     raw_html, full_content = self._get_wayback_content(wayback_url)
+                     result["raw_html"] = raw_html
+                     result["full_content"] = full_content
+
+                 return result
+
+             return None
+
+         except Exception as e:
+             logger.error(f"Error getting latest snapshot for {url}: {e}")
+             return None
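
For reference, a minimal usage sketch of the engine added in this file, assuming the wheel above is installed and archive.org is reachable; the llm relevance filter and max_filtered_results are left at their defaults, and example.com is only an illustrative target URL:

    from local_deep_research.web_search_engines.engines.search_engine_wayback import WaybackSearchEngine

    # Engine that keeps at most 3 snapshots per URL, no LLM filtering
    engine = WaybackSearchEngine(max_results=10, max_snapshots_per_url=3)

    # Archived copies of a specific URL (full content is fetched unless
    # config.SEARCH_SNIPPETS_ONLY is set)
    for s in engine.search_by_url("http://example.com"):
        print(s["formatted_date"], s["link"])

    # Snapshots within a date range (YYYYMMDD, per search_by_date_range)
    older = engine.search_by_date_range("http://example.com", "20150101", "20161231")

    # Most recent snapshot, or None if the URL was never archived
    latest = engine.get_latest_snapshot("http://example.com")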