local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. local_deep_research/__init__.py +23 -22
  2. local_deep_research/__main__.py +16 -0
  3. local_deep_research/advanced_search_system/__init__.py +7 -0
  4. local_deep_research/advanced_search_system/filters/__init__.py +8 -0
  5. local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
  6. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
  7. local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
  8. local_deep_research/advanced_search_system/findings/repository.py +452 -0
  9. local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
  10. local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
  11. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
  12. local_deep_research/advanced_search_system/questions/__init__.py +1 -0
  13. local_deep_research/advanced_search_system/questions/base_question.py +64 -0
  14. local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
  15. local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
  16. local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
  17. local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
  18. local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
  19. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
  20. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
  21. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
  22. local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
  23. local_deep_research/advanced_search_system/tools/__init__.py +1 -0
  24. local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
  25. local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
  26. local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
  27. local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
  28. local_deep_research/api/__init__.py +5 -5
  29. local_deep_research/api/research_functions.py +154 -160
  30. local_deep_research/app.py +8 -0
  31. local_deep_research/citation_handler.py +25 -16
  32. local_deep_research/{config.py → config/config_files.py} +102 -110
  33. local_deep_research/config/llm_config.py +472 -0
  34. local_deep_research/config/search_config.py +77 -0
  35. local_deep_research/defaults/__init__.py +10 -5
  36. local_deep_research/defaults/main.toml +2 -2
  37. local_deep_research/defaults/search_engines.toml +60 -34
  38. local_deep_research/main.py +121 -19
  39. local_deep_research/migrate_db.py +147 -0
  40. local_deep_research/report_generator.py +87 -45
  41. local_deep_research/search_system.py +153 -283
  42. local_deep_research/setup_data_dir.py +35 -0
  43. local_deep_research/test_migration.py +178 -0
  44. local_deep_research/utilities/__init__.py +0 -0
  45. local_deep_research/utilities/db_utils.py +49 -0
  46. local_deep_research/{utilties → utilities}/enums.py +2 -2
  47. local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
  48. local_deep_research/utilities/search_utilities.py +242 -0
  49. local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
  50. local_deep_research/web/__init__.py +0 -1
  51. local_deep_research/web/app.py +86 -1709
  52. local_deep_research/web/app_factory.py +289 -0
  53. local_deep_research/web/database/README.md +70 -0
  54. local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
  55. local_deep_research/web/database/migrations.py +447 -0
  56. local_deep_research/web/database/models.py +117 -0
  57. local_deep_research/web/database/schema_upgrade.py +107 -0
  58. local_deep_research/web/models/database.py +294 -0
  59. local_deep_research/web/models/settings.py +94 -0
  60. local_deep_research/web/routes/api_routes.py +559 -0
  61. local_deep_research/web/routes/history_routes.py +354 -0
  62. local_deep_research/web/routes/research_routes.py +715 -0
  63. local_deep_research/web/routes/settings_routes.py +1583 -0
  64. local_deep_research/web/services/research_service.py +947 -0
  65. local_deep_research/web/services/resource_service.py +149 -0
  66. local_deep_research/web/services/settings_manager.py +669 -0
  67. local_deep_research/web/services/settings_service.py +187 -0
  68. local_deep_research/web/services/socket_service.py +210 -0
  69. local_deep_research/web/static/css/custom_dropdown.css +277 -0
  70. local_deep_research/web/static/css/settings.css +1223 -0
  71. local_deep_research/web/static/css/styles.css +525 -48
  72. local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
  73. local_deep_research/web/static/js/components/detail.js +348 -0
  74. local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
  75. local_deep_research/web/static/js/components/fallback/ui.js +215 -0
  76. local_deep_research/web/static/js/components/history.js +487 -0
  77. local_deep_research/web/static/js/components/logpanel.js +949 -0
  78. local_deep_research/web/static/js/components/progress.js +1107 -0
  79. local_deep_research/web/static/js/components/research.js +1865 -0
  80. local_deep_research/web/static/js/components/results.js +766 -0
  81. local_deep_research/web/static/js/components/settings.js +3981 -0
  82. local_deep_research/web/static/js/components/settings_sync.js +106 -0
  83. local_deep_research/web/static/js/main.js +226 -0
  84. local_deep_research/web/static/js/services/api.js +253 -0
  85. local_deep_research/web/static/js/services/audio.js +31 -0
  86. local_deep_research/web/static/js/services/formatting.js +119 -0
  87. local_deep_research/web/static/js/services/pdf.js +622 -0
  88. local_deep_research/web/static/js/services/socket.js +882 -0
  89. local_deep_research/web/static/js/services/ui.js +546 -0
  90. local_deep_research/web/templates/base.html +72 -0
  91. local_deep_research/web/templates/components/custom_dropdown.html +47 -0
  92. local_deep_research/web/templates/components/log_panel.html +32 -0
  93. local_deep_research/web/templates/components/mobile_nav.html +22 -0
  94. local_deep_research/web/templates/components/settings_form.html +299 -0
  95. local_deep_research/web/templates/components/sidebar.html +21 -0
  96. local_deep_research/web/templates/pages/details.html +73 -0
  97. local_deep_research/web/templates/pages/history.html +51 -0
  98. local_deep_research/web/templates/pages/progress.html +57 -0
  99. local_deep_research/web/templates/pages/research.html +139 -0
  100. local_deep_research/web/templates/pages/results.html +59 -0
  101. local_deep_research/web/templates/settings_dashboard.html +78 -192
  102. local_deep_research/web/utils/__init__.py +0 -0
  103. local_deep_research/web/utils/formatters.py +76 -0
  104. local_deep_research/web_search_engines/engines/full_search.py +18 -16
  105. local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
  106. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
  107. local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
  108. local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
  109. local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
  110. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
  111. local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
  112. local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
  113. local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
  114. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
  115. local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
  116. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
  117. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
  118. local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
  119. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
  120. local_deep_research/web_search_engines/search_engine_base.py +174 -99
  121. local_deep_research/web_search_engines/search_engine_factory.py +192 -102
  122. local_deep_research/web_search_engines/search_engines_config.py +22 -15
  123. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
  124. local_deep_research-0.2.2.dist-info/RECORD +135 -0
  125. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
  126. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
  127. local_deep_research/defaults/llm_config.py +0 -338
  128. local_deep_research/utilties/search_utilities.py +0 -114
  129. local_deep_research/web/static/js/app.js +0 -3763
  130. local_deep_research/web/templates/api_keys_config.html +0 -82
  131. local_deep_research/web/templates/collections_config.html +0 -90
  132. local_deep_research/web/templates/index.html +0 -348
  133. local_deep_research/web/templates/llm_config.html +0 -120
  134. local_deep_research/web/templates/main_config.html +0 -89
  135. local_deep_research/web/templates/search_engines_config.html +0 -154
  136. local_deep_research/web/templates/settings.html +0 -519
  137. local_deep_research-0.1.26.dist-info/RECORD +0 -61
  138. local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
  139. /local_deep_research/{utilties → config}/__init__.py +0 -0
  140. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
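Note on the restructuring visible in this list: the flat config.py module is split into a config/ package (config_files.py, llm_config.py, search_config.py), and the misspelled utilties package is renamed to utilities. A minimal before/after sketch of the import pattern, based only on the paths shown above and the SEARCH_SNIPPETS_ONLY flag that the reworked engines actually check (see the search_engine_wayback.py diff below); other attribute names are not shown in this diff:

    # 0.1.26: search settings lived on the flat config module.
    # from local_deep_research import config
    # snippets_only = getattr(config, "SEARCH_SNIPPETS_ONLY", False)

    # 0.2.2: config is a package; search settings move to config/search_config.py.
    from local_deep_research.config import search_config

    snippets_only = getattr(search_config, "SEARCH_SNIPPETS_ONLY", False)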
@@ -1,35 +1,38 @@
-import requests
 import logging
-from typing import Dict, List, Any, Optional, Tuple
-from langchain_core.language_models import BaseLLM
 import re
 import time
-from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
 
-from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
-from local_deep_research import config
 import justext
+import requests
+from langchain_core.language_models import BaseLLM
+
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class WaybackSearchEngine(BaseSearchEngine):
     """
     Internet Archive Wayback Machine search engine implementation
     Provides access to historical versions of web pages
     """
-
-    def __init__(self,
-                 max_results: int = 10,
-                 max_snapshots_per_url: int = 3,
-                 llm: Optional[BaseLLM] = None,
-                 language: str = "English",
-                 max_filtered_results: Optional[int] = None,
-                 closest_only: bool = False):
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        max_snapshots_per_url: int = 3,
+        llm: Optional[BaseLLM] = None,
+        language: str = "English",
+        max_filtered_results: Optional[int] = None,
+        closest_only: bool = False,
+    ):
         """
         Initialize the Wayback Machine search engine.
-
+
         Args:
             max_results: Maximum number of search results
             max_snapshots_per_url: Maximum snapshots to retrieve per URL
@@ -39,48 +42,54 @@ class WaybackSearchEngine(BaseSearchEngine):
             closest_only: If True, only retrieves the closest snapshot for each URL
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
         self.max_snapshots_per_url = max_snapshots_per_url
         self.language = language
         self.closest_only = closest_only
-
+
         # API endpoints
         self.available_api = "https://archive.org/wayback/available"
         self.cdx_api = "https://web.archive.org/cdx/search/cdx"
-
+
     def _extract_urls_from_query(self, query: str) -> List[str]:
         """
         Extract URLs from a query string or interpret as an URL if possible.
         For non-URL queries, use a DuckDuckGo search to find relevant URLs.
-
+
         Args:
             query: The search query or URL
-
+
         Returns:
             List of URLs to search in the Wayback Machine
         """
         # Check if the query is already a URL
-        url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
+        url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
         urls = url_pattern.findall(query)
-
+
         if urls:
             logger.info(f"Found {len(urls)} URLs in query")
             return urls
-
+
         # Check if query is a domain without http prefix
-        domain_pattern = re.compile(r'^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$')
+        domain_pattern = re.compile(r"^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$")
         if domain_pattern.match(query):
             logger.info(f"Query appears to be a domain: {query}")
             return [f"http://{query}"]
-
+
         # For non-URL queries, use DuckDuckGo to find relevant URLs
-        logger.info(f"Query is not a URL, using DuckDuckGo to find relevant URLs")
+        logger.info("Query is not a URL, using DuckDuckGo to find relevant URLs")
         try:
             # Import DuckDuckGo search engine
             from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
-            ddg = DuckDuckGoSearchAPIWrapper(max_results=5)
-            results = ddg.results(query)
-
+
+            # Use max_results from parent class, but limit to 5 for URL discovery
+            url_search_limit = min(5, self.max_results)
+            ddg = DuckDuckGoSearchAPIWrapper(max_results=url_search_limit)
+            # Pass max_results as a positional argument
+            results = ddg.results(query, url_search_limit)
+
             # Extract URLs from results
             ddg_urls = [result.get("link") for result in results if result.get("link")]
             if ddg_urls:
@@ -88,7 +97,7 @@ class WaybackSearchEngine(BaseSearchEngine):
                 return ddg_urls
         except Exception as e:
             logger.error(f"Error using DuckDuckGo for URL discovery: {e}")
-
+
         # Fallback: treat the query as a potential domain or path
         if "/" in query and "." in query:
             logger.info(f"Treating query as a partial URL: {query}")
@@ -96,16 +105,16 @@ class WaybackSearchEngine(BaseSearchEngine):
         elif "." in query:
             logger.info(f"Treating query as a domain: {query}")
             return [f"http://{query}"]
-
+
         # Return empty list if nothing worked
         logger.warning(f"Could not extract any URLs from query: {query}")
         return []
-
+
     def _format_timestamp(self, timestamp: str) -> str:
         """Format Wayback Machine timestamp into readable date"""
         if len(timestamp) < 14:
             return timestamp
-
+
         try:
             year = timestamp[0:4]
             month = timestamp[4:6]
@@ -114,43 +123,45 @@ class WaybackSearchEngine(BaseSearchEngine):
             minute = timestamp[10:12]
             second = timestamp[12:14]
             return f"{year}-{month}-{day} {hour}:{minute}:{second}"
-        except:
+        except Exception:
             return timestamp
-
+
     def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
         """
         Get snapshots from the Wayback Machine for a specific URL.
-
+
         Args:
             url: URL to get snapshots for
-
+
         Returns:
             List of snapshot dictionaries
         """
         snapshots = []
-
+
         try:
             if self.closest_only:
                 # Get only the closest snapshot
-                response = requests.get(
-                    self.available_api,
-                    params={"url": url}
-                )
+                response = requests.get(self.available_api, params={"url": url})
                 data = response.json()
-
-                if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
+
+                if (
+                    "archived_snapshots" in data
+                    and "closest" in data["archived_snapshots"]
+                ):
                     snapshot = data["archived_snapshots"]["closest"]
                     snapshot_url = snapshot["url"]
                     timestamp = snapshot["timestamp"]
-
-                    snapshots.append({
-                        "timestamp": timestamp,
-                        "formatted_date": self._format_timestamp(timestamp),
-                        "url": snapshot_url,
-                        "original_url": url,
-                        "available": snapshot.get("available", True),
-                        "status": snapshot.get("status", "200")
-                    })
+
+                    snapshots.append(
+                        {
+                            "timestamp": timestamp,
+                            "formatted_date": self._format_timestamp(timestamp),
+                            "url": snapshot_url,
+                            "original_url": url,
+                            "available": snapshot.get("available", True),
+                            "status": snapshot.get("status", "200"),
+                        }
+                    )
             else:
                 # Get multiple snapshots using CDX API
                 response = requests.get(
@@ -160,68 +171,70 @@ class WaybackSearchEngine(BaseSearchEngine):
                         "output": "json",
                         "fl": "timestamp,original,statuscode,mimetype",
                         "collapse": "timestamp:4",  # Group by year
-                        "limit": self.max_snapshots_per_url
-                    }
+                        "limit": self.max_snapshots_per_url,
+                    },
                 )
-
+
                 # Check if response is valid JSON
                 data = response.json()
-
+
                 # First item is the header
                 if len(data) > 1:
                     headers = data[0]
                     for item in data[1:]:
                         snapshot = dict(zip(headers, item))
                         timestamp = snapshot.get("timestamp", "")
-
+
                         wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
-
-                        snapshots.append({
-                            "timestamp": timestamp,
-                            "formatted_date": self._format_timestamp(timestamp),
-                            "url": wayback_url,
-                            "original_url": url,
-                            "available": True,
-                            "status": snapshot.get("statuscode", "200")
-                        })
-
+
+                        snapshots.append(
+                            {
+                                "timestamp": timestamp,
+                                "formatted_date": self._format_timestamp(timestamp),
+                                "url": wayback_url,
+                                "original_url": url,
+                                "available": True,
+                                "status": snapshot.get("statuscode", "200"),
+                            }
+                        )
+
                 # Limit to max snapshots per URL
-                snapshots = snapshots[:self.max_snapshots_per_url]
-
+                snapshots = snapshots[: self.max_snapshots_per_url]
+
         except Exception as e:
             logger.error(f"Error getting Wayback snapshots for {url}: {e}")
-
+
         return snapshots
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for Wayback Machine snapshots.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting Wayback Machine previews for query: {query}")
-
+
         # Extract URLs from query
         urls = self._extract_urls_from_query(query)
-
+
         if not urls:
             logger.warning(f"No URLs found in query: {query}")
             return []
-
+
         # Get snapshots for each URL
         all_snapshots = []
         for url in urls:
             snapshots = self._get_wayback_snapshots(url)
             all_snapshots.extend(snapshots)
-
+
             # Respect rate limits
             if len(urls) > 1:
                 time.sleep(0.5)
-
+
         # Format as previews
         previews = []
         for snapshot in all_snapshots:
@@ -232,20 +245,20 @@ class WaybackSearchEngine(BaseSearchEngine):
                 "snippet": f"Archived version from {snapshot['formatted_date']}",
                 "original_url": snapshot["original_url"],
                 "timestamp": snapshot["timestamp"],
-                "formatted_date": snapshot["formatted_date"]
+                "formatted_date": snapshot["formatted_date"],
             }
             previews.append(preview)
-
+
         logger.info(f"Found {len(previews)} Wayback Machine snapshots")
         return previews
-
+
     def _remove_boilerplate(self, html: str) -> str:
         """
         Remove boilerplate content from HTML.
-
+
         Args:
             html: HTML content
-
+
         Returns:
             Cleaned text content
         """
@@ -258,14 +271,14 @@ class WaybackSearchEngine(BaseSearchEngine):
         except Exception as e:
             logger.error(f"Error removing boilerplate: {e}")
             return html
-
+
     def _get_wayback_content(self, url: str) -> Tuple[str, str]:
         """
         Retrieve content from a Wayback Machine URL.
-
+
         Args:
             url: Wayback Machine URL
-
+
         Returns:
             Tuple of (raw_html, cleaned_text)
         """
@@ -275,76 +288,85 @@ class WaybackSearchEngine(BaseSearchEngine):
             }
             response = requests.get(url, headers=headers, timeout=10)
             raw_html = response.text
-
+
             # Clean the HTML
             cleaned_text = self._remove_boilerplate(raw_html)
-
+
             return raw_html, cleaned_text
         except Exception as e:
             logger.error(f"Error retrieving content from {url}: {e}")
             return "", f"Error retrieving content: {str(e)}"
-
-    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant Wayback Machine snapshots.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
-        logger.info(f"Getting full content for {len(relevant_items)} Wayback Machine snapshots")
-
+
+        logger.info(
+            f"Getting full content for {len(relevant_items)} Wayback Machine snapshots"
+        )
+
         results = []
         for item in relevant_items:
             wayback_url = item.get("link")
             if not wayback_url:
                 results.append(item)
                 continue
-
+
             logger.info(f"Retrieving content from {wayback_url}")
-
+
             try:
                 # Retrieve content
                 raw_html, full_content = self._get_wayback_content(wayback_url)
-
+
                 # Add full content to the result
                 result = item.copy()
                 result["raw_html"] = raw_html
                 result["full_content"] = full_content
-
+
                 results.append(result)
-
+
                 # Brief pause for rate limiting
                 time.sleep(0.5)
             except Exception as e:
                 logger.error(f"Error processing {wayback_url}: {e}")
                 results.append(item)
-
+
         return results
-
-    def search_by_url(self, url: str, max_snapshots: int = None) -> List[Dict[str, Any]]:
+
+    def search_by_url(
+        self, url: str, max_snapshots: int = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for archived versions of a specific URL.
-
+
         Args:
             url: The URL to search for archives
             max_snapshots: Maximum number of snapshots to return
-
+
         Returns:
             List of snapshot dictionaries
         """
         max_snapshots = max_snapshots or self.max_snapshots_per_url
-
+
         snapshots = self._get_wayback_snapshots(url)
         previews = []
-
+
         for snapshot in snapshots[:max_snapshots]:
             preview = {
                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
@@ -353,25 +375,30 @@ class WaybackSearchEngine(BaseSearchEngine):
                 "snippet": f"Archived version from {snapshot['formatted_date']}",
                 "original_url": snapshot["original_url"],
                 "timestamp": snapshot["timestamp"],
-                "formatted_date": snapshot["formatted_date"]
+                "formatted_date": snapshot["formatted_date"],
             }
             previews.append(preview)
-
+
         # Get full content if not in snippets-only mode
-        if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+        if (
+            not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            or not search_config.SEARCH_SNIPPETS_ONLY
+        ):
             return self._get_full_content(previews)
-
+
         return previews
-
-    def search_by_date_range(self, url: str, start_date: str, end_date: str) -> List[Dict[str, Any]]:
+
+    def search_by_date_range(
+        self, url: str, start_date: str, end_date: str
+    ) -> List[Dict[str, Any]]:
         """
         Search for archived versions of a URL within a date range.
-
+
         Args:
             url: The URL to search for archives
             start_date: Start date in format YYYYMMDD
             end_date: End date in format YYYYMMDD
-
+
         Returns:
             List of snapshot dictionaries
         """
@@ -385,68 +412,70 @@ class WaybackSearchEngine(BaseSearchEngine):
                     "fl": "timestamp,original,statuscode,mimetype",
                     "from": start_date,
                     "to": end_date,
-                    "limit": self.max_snapshots_per_url
-                }
+                    "limit": self.max_snapshots_per_url,
+                },
             )
-
+
             # Process response
             data = response.json()
-
+
             # First item is the header
             if len(data) <= 1:
                 return []
-
+
             headers = data[0]
             snapshots = []
-
+
             for item in data[1:]:
                 snapshot = dict(zip(headers, item))
                 timestamp = snapshot.get("timestamp", "")
-
+
                 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
-
-                snapshots.append({
-                    "id": f"{timestamp}_{url}",
-                    "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
-                    "link": wayback_url,
-                    "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
-                    "original_url": url,
-                    "timestamp": timestamp,
-                    "formatted_date": self._format_timestamp(timestamp)
-                })
-
+
+                snapshots.append(
+                    {
+                        "id": f"{timestamp}_{url}",
+                        "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
+                        "link": wayback_url,
+                        "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                        "original_url": url,
+                        "timestamp": timestamp,
+                        "formatted_date": self._format_timestamp(timestamp),
+                    }
+                )
+
             # Get full content if not in snippets-only mode
-            if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+            if (
+                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                or not search_config.SEARCH_SNIPPETS_ONLY
+            ):
                 return self._get_full_content(snapshots)
-
+
             return snapshots
-
+
         except Exception as e:
             logger.error(f"Error searching date range for {url}: {e}")
             return []
-
+
     def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]:
         """
         Get the most recent snapshot of a URL.
-
+
         Args:
             url: The URL to get the latest snapshot for
-
+
         Returns:
             Dictionary with snapshot information or None if not found
         """
         try:
-            response = requests.get(
-                self.available_api,
-                params={"url": url}
-            )
+            response = requests.get(self.available_api, params={"url": url})
             data = response.json()
-
+
             if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
                 snapshot = data["archived_snapshots"]["closest"]
                 timestamp = snapshot["timestamp"]
                 wayback_url = snapshot["url"]
-
+
                 result = {
                     "id": f"{timestamp}_{url}",
                     "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})",
@@ -454,19 +483,22 @@ class WaybackSearchEngine(BaseSearchEngine):
                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
                     "original_url": url,
                     "timestamp": timestamp,
-                    "formatted_date": self._format_timestamp(timestamp)
+                    "formatted_date": self._format_timestamp(timestamp),
                 }
-
+
                 # Get full content if not in snippets-only mode
-                if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+                if (
+                    not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                    or not search_config.SEARCH_SNIPPETS_ONLY
+                ):
                     raw_html, full_content = self._get_wayback_content(wayback_url)
                     result["raw_html"] = raw_html
                     result["full_content"] = full_content
-
+
                 return result
-
+
             return None
-
+
         except Exception as e:
             logger.error(f"Error getting latest snapshot for {url}: {e}")
-            return None
+            return None
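Beyond the formatter-driven churn (double-quoted strings, trailing commas, reflowed calls), the substantive changes in this file are: relative imports of search_config in place of the flat config module, the bare except: narrowed to except Exception:, and DuckDuckGo URL discovery that now caps itself at min(5, max_results) and passes the limit to ddg.results() explicitly instead of always requesting five. A short usage sketch against the 0.2.2 signatures shown above; the import path follows the file list, and running it performs live HTTP requests to archive.org:

    from local_deep_research.web_search_engines.engines.search_engine_wayback import (
        WaybackSearchEngine,
    )

    # Arguments per the new __init__ signature; llm is optional, so this
    # sketch runs without a configured language model.
    engine = WaybackSearchEngine(max_results=5, max_snapshots_per_url=2)

    # Archived snapshots of a specific URL (CDX API, collapsed by year).
    for snap in engine.search_by_url("http://example.com"):
        print(snap["formatted_date"], snap["link"])

    # Snapshots within a date range; dates are YYYYMMDD strings per the docstring.
    snaps = engine.search_by_date_range("http://example.com", "20200101", "20201231")

    # Closest available snapshot via the archive.org "available" API, or None.
    latest = engine.get_latest_snapshot("http://example.com")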