local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
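The most disruptive change for downstream imports is the reorganization of the configuration and utility modules: the old top-level config module is split into a config package (config_files, llm_config, search_config), and the misspelled utilties package is renamed to utilities. A minimal sketch of the resulting import changes, assuming only the module paths listed above (the enums import is illustrative):

# 0.1.26
from local_deep_research import config
from local_deep_research.utilties import enums

# 0.2.2
from local_deep_research.config import config_files, llm_config, search_config
from local_deep_research.utilities import enums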
local_deep_research/web_search_engines/engines/search_engine_wayback.py

@@ -1,35 +1,38 @@
-import requests
 import logging
-from typing import Dict, List, Any, Optional, Tuple
-from langchain_core.language_models import BaseLLM
 import re
 import time
-from
+from typing import Any, Dict, List, Optional, Tuple
 
-from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
-from local_deep_research import config
 import justext
+import requests
+from langchain_core.language_models import BaseLLM
+
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class WaybackSearchEngine(BaseSearchEngine):
     """
     Internet Archive Wayback Machine search engine implementation
     Provides access to historical versions of web pages
     """
-
-    def __init__(
-
-
-
-
-
-
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        max_snapshots_per_url: int = 3,
+        llm: Optional[BaseLLM] = None,
+        language: str = "English",
+        max_filtered_results: Optional[int] = None,
+        closest_only: bool = False,
+    ):
         """
         Initialize the Wayback Machine search engine.
-
+
         Args:
             max_results: Maximum number of search results
             max_snapshots_per_url: Maximum snapshots to retrieve per URL
@@ -39,48 +42,54 @@ class WaybackSearchEngine(BaseSearchEngine):
             closest_only: If True, only retrieves the closest snapshot for each URL
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
         self.max_snapshots_per_url = max_snapshots_per_url
         self.language = language
         self.closest_only = closest_only
-
+
         # API endpoints
         self.available_api = "https://archive.org/wayback/available"
         self.cdx_api = "https://web.archive.org/cdx/search/cdx"
-
+
     def _extract_urls_from_query(self, query: str) -> List[str]:
         """
         Extract URLs from a query string or interpret as an URL if possible.
         For non-URL queries, use a DuckDuckGo search to find relevant URLs.
-
+
         Args:
             query: The search query or URL
-
+
         Returns:
             List of URLs to search in the Wayback Machine
         """
         # Check if the query is already a URL
-        url_pattern = re.compile(r
+        url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
         urls = url_pattern.findall(query)
-
+
         if urls:
             logger.info(f"Found {len(urls)} URLs in query")
             return urls
-
+
         # Check if query is a domain without http prefix
-        domain_pattern = re.compile(r
+        domain_pattern = re.compile(r"^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$")
         if domain_pattern.match(query):
             logger.info(f"Query appears to be a domain: {query}")
             return [f"http://{query}"]
-
+
         # For non-URL queries, use DuckDuckGo to find relevant URLs
-        logger.info(
+        logger.info("Query is not a URL, using DuckDuckGo to find relevant URLs")
         try:
             # Import DuckDuckGo search engine
             from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
-
-
-
+
+            # Use max_results from parent class, but limit to 5 for URL discovery
+            url_search_limit = min(5, self.max_results)
+            ddg = DuckDuckGoSearchAPIWrapper(max_results=url_search_limit)
+            # Pass max_results as a positional argument
+            results = ddg.results(query, url_search_limit)
+
             # Extract URLs from results
             ddg_urls = [result.get("link") for result in results if result.get("link")]
             if ddg_urls:
@@ -88,7 +97,7 @@ class WaybackSearchEngine(BaseSearchEngine):
                 return ddg_urls
         except Exception as e:
             logger.error(f"Error using DuckDuckGo for URL discovery: {e}")
-
+
         # Fallback: treat the query as a potential domain or path
         if "/" in query and "." in query:
             logger.info(f"Treating query as a partial URL: {query}")
@@ -96,16 +105,16 @@ class WaybackSearchEngine(BaseSearchEngine):
         elif "." in query:
             logger.info(f"Treating query as a domain: {query}")
             return [f"http://{query}"]
-
+
         # Return empty list if nothing worked
         logger.warning(f"Could not extract any URLs from query: {query}")
         return []
-
+
     def _format_timestamp(self, timestamp: str) -> str:
         """Format Wayback Machine timestamp into readable date"""
         if len(timestamp) < 14:
             return timestamp
-
+
         try:
             year = timestamp[0:4]
             month = timestamp[4:6]
@@ -114,43 +123,45 @@ class WaybackSearchEngine(BaseSearchEngine):
             minute = timestamp[10:12]
             second = timestamp[12:14]
             return f"{year}-{month}-{day} {hour}:{minute}:{second}"
-        except:
+        except Exception:
             return timestamp
-
+
     def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
         """
         Get snapshots from the Wayback Machine for a specific URL.
-
+
         Args:
             url: URL to get snapshots for
-
+
         Returns:
             List of snapshot dictionaries
         """
         snapshots = []
-
+
         try:
             if self.closest_only:
                 # Get only the closest snapshot
-                response = requests.get(
-                    self.available_api,
-                    params={"url": url}
-                )
+                response = requests.get(self.available_api, params={"url": url})
                 data = response.json()
-
-                if
+
+                if (
+                    "archived_snapshots" in data
+                    and "closest" in data["archived_snapshots"]
+                ):
                     snapshot = data["archived_snapshots"]["closest"]
                     snapshot_url = snapshot["url"]
                     timestamp = snapshot["timestamp"]
-
-                    snapshots.append(
-
-
-
-
-
-
-
+
+                    snapshots.append(
+                        {
+                            "timestamp": timestamp,
+                            "formatted_date": self._format_timestamp(timestamp),
+                            "url": snapshot_url,
+                            "original_url": url,
+                            "available": snapshot.get("available", True),
+                            "status": snapshot.get("status", "200"),
+                        }
+                    )
             else:
                 # Get multiple snapshots using CDX API
                 response = requests.get(
@@ -160,68 +171,70 @@ class WaybackSearchEngine(BaseSearchEngine):
                         "output": "json",
                         "fl": "timestamp,original,statuscode,mimetype",
                         "collapse": "timestamp:4",  # Group by year
-                        "limit": self.max_snapshots_per_url
-                    }
+                        "limit": self.max_snapshots_per_url,
+                    },
                 )
-
+
                 # Check if response is valid JSON
                 data = response.json()
-
+
                 # First item is the header
                 if len(data) > 1:
                     headers = data[0]
                     for item in data[1:]:
                         snapshot = dict(zip(headers, item))
                         timestamp = snapshot.get("timestamp", "")
-
+
                         wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
-
-                        snapshots.append(
-
-
-
-
-
-
-
-
+
+                        snapshots.append(
+                            {
+                                "timestamp": timestamp,
+                                "formatted_date": self._format_timestamp(timestamp),
+                                "url": wayback_url,
+                                "original_url": url,
+                                "available": True,
+                                "status": snapshot.get("statuscode", "200"),
+                            }
+                        )
+
             # Limit to max snapshots per URL
-            snapshots = snapshots[:self.max_snapshots_per_url]
-
+            snapshots = snapshots[: self.max_snapshots_per_url]
+
         except Exception as e:
             logger.error(f"Error getting Wayback snapshots for {url}: {e}")
-
+
         return snapshots
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for Wayback Machine snapshots.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting Wayback Machine previews for query: {query}")
-
+
         # Extract URLs from query
         urls = self._extract_urls_from_query(query)
-
+
         if not urls:
             logger.warning(f"No URLs found in query: {query}")
             return []
-
+
         # Get snapshots for each URL
         all_snapshots = []
         for url in urls:
             snapshots = self._get_wayback_snapshots(url)
             all_snapshots.extend(snapshots)
-
+
             # Respect rate limits
             if len(urls) > 1:
                 time.sleep(0.5)
-
+
         # Format as previews
         previews = []
         for snapshot in all_snapshots:
@@ -232,20 +245,20 @@ class WaybackSearchEngine(BaseSearchEngine):
                 "snippet": f"Archived version from {snapshot['formatted_date']}",
                 "original_url": snapshot["original_url"],
                 "timestamp": snapshot["timestamp"],
-                "formatted_date": snapshot["formatted_date"]
+                "formatted_date": snapshot["formatted_date"],
             }
             previews.append(preview)
-
+
         logger.info(f"Found {len(previews)} Wayback Machine snapshots")
         return previews
-
+
     def _remove_boilerplate(self, html: str) -> str:
         """
         Remove boilerplate content from HTML.
-
+
         Args:
             html: HTML content
-
+
         Returns:
             Cleaned text content
         """
@@ -258,14 +271,14 @@ class WaybackSearchEngine(BaseSearchEngine):
         except Exception as e:
             logger.error(f"Error removing boilerplate: {e}")
             return html
-
+
     def _get_wayback_content(self, url: str) -> Tuple[str, str]:
         """
         Retrieve content from a Wayback Machine URL.
-
+
         Args:
             url: Wayback Machine URL
-
+
         Returns:
             Tuple of (raw_html, cleaned_text)
         """
@@ -275,76 +288,85 @@ class WaybackSearchEngine(BaseSearchEngine):
             }
             response = requests.get(url, headers=headers, timeout=10)
             raw_html = response.text
-
+
             # Clean the HTML
             cleaned_text = self._remove_boilerplate(raw_html)
-
+
             return raw_html, cleaned_text
         except Exception as e:
             logger.error(f"Error retrieving content from {url}: {e}")
             return "", f"Error retrieving content: {str(e)}"
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant Wayback Machine snapshots.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
-        logger.info(
-
+
+        logger.info(
+            f"Getting full content for {len(relevant_items)} Wayback Machine snapshots"
+        )
+
         results = []
         for item in relevant_items:
             wayback_url = item.get("link")
             if not wayback_url:
                 results.append(item)
                 continue
-
+
             logger.info(f"Retrieving content from {wayback_url}")
-
+
             try:
                 # Retrieve content
                 raw_html, full_content = self._get_wayback_content(wayback_url)
-
+
                 # Add full content to the result
                 result = item.copy()
                 result["raw_html"] = raw_html
                 result["full_content"] = full_content
-
+
                 results.append(result)
-
+
                 # Brief pause for rate limiting
                 time.sleep(0.5)
             except Exception as e:
                 logger.error(f"Error processing {wayback_url}: {e}")
                 results.append(item)
-
+
         return results
-
-    def search_by_url(
+
+    def search_by_url(
+        self, url: str, max_snapshots: int = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for archived versions of a specific URL.
-
+
         Args:
             url: The URL to search for archives
             max_snapshots: Maximum number of snapshots to return
-
+
         Returns:
             List of snapshot dictionaries
         """
         max_snapshots = max_snapshots or self.max_snapshots_per_url
-
+
         snapshots = self._get_wayback_snapshots(url)
         previews = []
-
+
         for snapshot in snapshots[:max_snapshots]:
             preview = {
                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
@@ -353,25 +375,30 @@ class WaybackSearchEngine(BaseSearchEngine):
                 "snippet": f"Archived version from {snapshot['formatted_date']}",
                 "original_url": snapshot["original_url"],
                 "timestamp": snapshot["timestamp"],
-                "formatted_date": snapshot["formatted_date"]
+                "formatted_date": snapshot["formatted_date"],
             }
             previews.append(preview)
-
+
         # Get full content if not in snippets-only mode
-        if
+        if (
+            not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            or not search_config.SEARCH_SNIPPETS_ONLY
+        ):
             return self._get_full_content(previews)
-
+
         return previews
-
-    def search_by_date_range(
+
+    def search_by_date_range(
+        self, url: str, start_date: str, end_date: str
+    ) -> List[Dict[str, Any]]:
         """
         Search for archived versions of a URL within a date range.
-
+
         Args:
             url: The URL to search for archives
             start_date: Start date in format YYYYMMDD
             end_date: End date in format YYYYMMDD
-
+
         Returns:
             List of snapshot dictionaries
         """
@@ -385,68 +412,70 @@ class WaybackSearchEngine(BaseSearchEngine):
                     "fl": "timestamp,original,statuscode,mimetype",
                     "from": start_date,
                     "to": end_date,
-                    "limit": self.max_snapshots_per_url
-                }
+                    "limit": self.max_snapshots_per_url,
+                },
             )
-
+
             # Process response
             data = response.json()
-
+
            # First item is the header
             if len(data) <= 1:
                 return []
-
+
             headers = data[0]
             snapshots = []
-
+
             for item in data[1:]:
                 snapshot = dict(zip(headers, item))
                 timestamp = snapshot.get("timestamp", "")
-
+
                 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
-
-                snapshots.append(
-
-
-
-
-
-
-
-
-
+
+                snapshots.append(
+                    {
+                        "id": f"{timestamp}_{url}",
+                        "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
+                        "link": wayback_url,
+                        "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                        "original_url": url,
+                        "timestamp": timestamp,
+                        "formatted_date": self._format_timestamp(timestamp),
+                    }
+                )
+
             # Get full content if not in snippets-only mode
-            if
+            if (
+                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                or not search_config.SEARCH_SNIPPETS_ONLY
+            ):
                 return self._get_full_content(snapshots)
-
+
             return snapshots
-
+
         except Exception as e:
             logger.error(f"Error searching date range for {url}: {e}")
             return []
-
+
     def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]:
         """
         Get the most recent snapshot of a URL.
-
+
         Args:
             url: The URL to get the latest snapshot for
-
+
         Returns:
             Dictionary with snapshot information or None if not found
         """
         try:
-            response = requests.get(
-                self.available_api,
-                params={"url": url}
-            )
+            response = requests.get(self.available_api, params={"url": url})
             data = response.json()
-
+
             if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
                 snapshot = data["archived_snapshots"]["closest"]
                 timestamp = snapshot["timestamp"]
                 wayback_url = snapshot["url"]
-
+
                 result = {
                     "id": f"{timestamp}_{url}",
                     "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})",
@@ -454,19 +483,22 @@ class WaybackSearchEngine(BaseSearchEngine):
                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
                     "original_url": url,
                     "timestamp": timestamp,
-                    "formatted_date": self._format_timestamp(timestamp)
+                    "formatted_date": self._format_timestamp(timestamp),
                 }
-
+
                 # Get full content if not in snippets-only mode
-                if
+                if (
+                    not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                    or not search_config.SEARCH_SNIPPETS_ONLY
+                ):
                     raw_html, full_content = self._get_wayback_content(wayback_url)
                     result["raw_html"] = raw_html
                     result["full_content"] = full_content
-
+
                 return result
-
+
             return None
-
+
         except Exception as e:
             logger.error(f"Error getting latest snapshot for {url}: {e}")
-            return None
+            return None