@mmmbuto/masix 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -14
- package/install.js +53 -27
- package/package.json +4 -3
- package/packages/plugin-base/codex-backend/0.1.4/SHA256SUMS +3 -0
- package/packages/plugin-base/codex-backend/0.1.4/codex-backend-android-aarch64-termux.pkg +0 -0
- package/packages/plugin-base/codex-backend/0.1.4/codex-backend-linux-x86_64.pkg +0 -0
- package/packages/plugin-base/codex-backend/0.1.4/codex-backend-macos-aarch64.pkg +0 -0
- package/packages/plugin-base/codex-backend/0.1.4/manifest.json +33 -0
- package/packages/plugin-base/codex-backend/CHANGELOG.md +17 -0
- package/packages/plugin-base/codex-backend/README.md +33 -0
- package/packages/plugin-base/codex-backend/source/Cargo.toml +25 -0
- package/packages/plugin-base/codex-backend/source/README-PACKAGE.txt +54 -0
- package/packages/plugin-base/codex-backend/source/plugin.manifest.json +103 -0
- package/packages/plugin-base/codex-backend/source/src/error.rs +60 -0
- package/packages/plugin-base/codex-backend/source/src/exec.rs +436 -0
- package/packages/plugin-base/codex-backend/source/src/http_backend.rs +1198 -0
- package/packages/plugin-base/codex-backend/source/src/lib.rs +328 -0
- package/packages/plugin-base/codex-backend/source/src/patch.rs +767 -0
- package/packages/plugin-base/codex-backend/source/src/policy.rs +297 -0
- package/packages/plugin-base/codex-backend/source/src/tools.rs +72 -0
- package/packages/plugin-base/codex-backend/source/src/workspace.rs +433 -0
- package/packages/plugin-base/codex-tools/0.1.3/SHA256SUMS +3 -0
- package/packages/plugin-base/codex-tools/0.1.3/codex-tools-android-aarch64-termux.pkg +0 -0
- package/packages/plugin-base/codex-tools/0.1.3/codex-tools-linux-x86_64.pkg +0 -0
- package/packages/plugin-base/codex-tools/0.1.3/codex-tools-macos-aarch64.pkg +0 -0
- package/packages/plugin-base/codex-tools/0.1.3/manifest.json +33 -0
- package/packages/plugin-base/codex-tools/CHANGELOG.md +17 -0
- package/packages/plugin-base/codex-tools/README.md +33 -0
- package/packages/plugin-base/codex-tools/source/Cargo.toml +23 -0
- package/packages/plugin-base/codex-tools/source/plugin.manifest.json +124 -0
- package/packages/plugin-base/codex-tools/source/src/main.rs +995 -0
- package/packages/plugin-base/discovery/0.2.4/SHA256SUMS +3 -0
- package/packages/plugin-base/discovery/0.2.4/discovery-android-aarch64-termux.pkg +0 -0
- package/packages/plugin-base/discovery/0.2.4/discovery-linux-x86_64.pkg +0 -0
- package/packages/plugin-base/discovery/0.2.4/discovery-macos-aarch64.pkg +0 -0
- package/packages/plugin-base/discovery/0.2.4/manifest.json +31 -0
- package/packages/plugin-base/discovery/CHANGELOG.md +17 -0
- package/packages/plugin-base/discovery/README.md +48 -0
- package/packages/plugin-base/discovery/source/Cargo.toml +14 -0
- package/packages/plugin-base/discovery/source/plugin.manifest.json +30 -0
- package/packages/plugin-base/discovery/source/src/main.rs +2570 -0
- package/prebuilt/masix +0 -0
|
@@ -0,0 +1,2570 @@
|
|
|
1
|
+
use anyhow::{anyhow, Result};
|
|
2
|
+
use clap::{Parser, Subcommand};
|
|
3
|
+
use scraper::{Html, Selector};
|
|
4
|
+
use serde::{Deserialize, Serialize};
|
|
5
|
+
use std::cmp::Ordering;
|
|
6
|
+
use std::collections::{HashMap, HashSet};
|
|
7
|
+
use std::io::{self, BufRead, Write};
|
|
8
|
+
use std::sync::OnceLock;
|
|
9
|
+
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
|
10
|
+
use tokio::sync::Mutex;
|
|
11
|
+
|
|
12
|
+
// Fallback SearXNG instances used when no endpoint is configured/overridden.
const DEFAULT_SEARXNG_URLS: &[&str] = &[
    "https://search.inetol.net",
    "https://searx.work",
    "https://search.privacyredirect.com",
];
// Hard cap (bytes/chars — confirm against web_fetch_page, not in view) on
// extracted page content returned to the caller.
const MAX_WEB_CONTENT: usize = 15_000;
// Retry budget per SearXNG query.
const SEARX_RETRIES: usize = 3;
// How many SearXNG endpoints are queried concurrently per search.
const MAX_SEARX_PARALLEL_ENDPOINTS: usize = 3;
// Upper bound on endpoints accepted from user configuration.
const MAX_ENDPOINTS_FROM_CONFIG: usize = 8;
// Network timeouts, in seconds.
const DEFAULT_SEARCH_TIMEOUT_SECS: u64 = 15;
const DEFAULT_FETCH_TIMEOUT_SECS: u64 = 20;
// Cap on raw results requested from any single provider before ranking.
const MAX_PROVIDER_RESULTS: usize = 40;
// Crate version, surfaced for diagnostics/manifest purposes.
const MODULE_VERSION: &str = env!("CARGO_PKG_VERSION");
// Desktop-browser UA string; some providers block obvious bot agents.
const BROWSER_USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36";
// (country code, language tag, Google News "ceid"-style region:lang) triples —
// presumably fed to the news provider; confirm against google_news_search.
const NEWS_REGIONS: &[(&str, &str, &str)] = &[
    ("US", "en-US", "US:en"),
    ("GB", "en-GB", "GB:en"),
    ("IT", "it-IT", "IT:it"),
    ("DE", "de-DE", "DE:de"),
    ("ES", "es-ES", "ES:es"),
    ("JP", "ja", "JP:ja"),
];
|
|
35
|
+
|
|
36
|
+
/// Top-level CLI for the discovery plugin binary.
#[derive(Parser)]
#[command(name = "masix-plugin-discovery")]
#[command(about = "External discovery module for MasiX (web search + fetch)")]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}
|
|
43
|
+
|
|
44
|
+
// Subcommands map 1:1 onto the MCP tools exposed by `serve-mcp`.
// NOTE: the `///` comments below are user-visible clap help text — they are
// runtime behavior, not internal documentation.
#[derive(Subcommand)]
enum Commands {
    /// Search the web using autonomous multi-engine broker
    WebSearch {
        query: String,
        #[arg(short, long, default_value_t = 5)]
        max_results: usize,
        // Optional SearXNG base URL overriding the configured/default list.
        #[arg(long)]
        endpoint: Option<String>,
        // Emit raw JSON instead of the human-readable listing.
        #[arg(short, long)]
        json: bool,
    },
    /// Fetch and extract text content from a web page
    WebFetch { url: String },
    /// Search torrent metadata pages (lawful content only)
    TorrentSearch {
        query: String,
        #[arg(short, long, default_value_t = 5)]
        max_results: usize,
        #[arg(long)]
        endpoint: Option<String>,
        // Follow each result and try to pull magnet links (on by default).
        #[arg(long, default_value_t = true)]
        with_magnets: bool,
        #[arg(short, long)]
        json: bool,
    },
    /// Extract magnet links from a web page URL
    TorrentExtract {
        url: String,
        #[arg(short, long, default_value_t = 5)]
        max_links: usize,
        #[arg(short, long)]
        json: bool,
    },
    /// Print plugin metadata (draft)
    Manifest,
    /// Run MCP server over stdio (JSON-RPC)
    ServeMcp,
}
|
|
83
|
+
|
|
84
|
+
/// Top-level JSON payload of a SearXNG search response.
#[derive(Debug, Deserialize)]
struct SearxResponse {
    // Missing `results` deserializes as an empty list instead of erroring.
    #[serde(default)]
    results: Vec<SearxResultRaw>,
}

/// One raw hit inside a SearXNG response; every field tolerates absence.
#[derive(Debug, Deserialize)]
struct SearxResultRaw {
    #[serde(default)]
    title: String,
    #[serde(default)]
    url: String,
    // NOTE(review): `alias = "content"` duplicates the field's own name and is
    // a no-op — looks like a leftover from a renamed field; confirm.
    #[serde(default, alias = "content")]
    content: String,
    // Upstream engine label reported by SearXNG (empty string when absent).
    #[serde(default)]
    engine: String,
}
|
|
101
|
+
|
|
102
|
+
/// Normalised search hit shared by all providers; serialized for JSON output.
#[derive(Debug, Serialize, Clone)]
struct SearchResult {
    title: String,
    url: String,
    content: String,
    engine: String,
    // NOTE(review): `serde(default)` is a Deserialize-side attribute and is
    // inert on this Serialize-only struct; only skip_serializing_if matters.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    provider: String,
    #[serde(default, skip_serializing_if = "String::is_empty")]
    source_domain: String,
    // Ranking score; omitted from output when absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    score: Option<f64>,
    // SearXNG endpoint that produced the hit, when applicable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    endpoint: Option<String>,
}

/// Search hit for torrent metadata pages, optionally with extracted magnets.
#[derive(Debug, Serialize)]
struct TorrentSearchResult {
    title: String,
    url: String,
    content: String,
    engine: String,
    // Dropped from the JSON entirely when no magnet links were extracted.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    magnet_links: Vec<String>,
}
|
|
127
|
+
|
|
128
|
+
/// Incoming JSON-RPC 2.0 request line (MCP stdio transport).
#[derive(Debug, Deserialize)]
struct JsonRpcRequest {
    // Kept for wire completeness; the version string is never inspected.
    #[allow(dead_code)]
    jsonrpc: String,
    // `None` marks a notification (client expects no response).
    id: Option<serde_json::Value>,
    method: String,
    // Absent params default to JSON `null`.
    #[serde(default)]
    params: serde_json::Value,
}

/// Outgoing JSON-RPC 2.0 response; at most one of `result`/`error` is emitted.
#[derive(Debug, Serialize)]
struct JsonRpcResponse {
    jsonrpc: String,
    id: Option<serde_json::Value>,
    #[serde(skip_serializing_if = "Option::is_none")]
    result: Option<serde_json::Value>,
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<JsonRpcError>,
}

/// JSON-RPC error object; codes used in this file are -32700 and -32601.
#[derive(Debug, Serialize)]
struct JsonRpcError {
    code: i32,
    message: String,
}
|
|
153
|
+
|
|
154
|
+
/// MCP tool descriptor advertised via `tools/list`.
#[derive(Debug, Serialize)]
struct ToolDefinition {
    name: String,
    description: String,
    // JSON Schema describing the tool's accepted arguments.
    input_schema: serde_json::Value,
}

/// Payload returned from `tools/call`; `is_error` is omitted when false.
#[derive(Debug, Serialize)]
struct ToolResult {
    content: Vec<ToolContent>,
    #[serde(skip_serializing_if = "is_false")]
    is_error: bool,
}
|
|
167
|
+
|
|
168
|
+
/// Serde `skip_serializing_if` predicate: true exactly when the flag is unset.
fn is_false(v: &bool) -> bool {
    !*v
}
|
|
171
|
+
|
|
172
|
+
/// One content item of a tool result; every constructor in this file uses
/// `content_type == "text"`.
#[derive(Debug, Serialize)]
struct ToolContent {
    // Serialized as "type" — `type` is a Rust keyword, hence the rename.
    #[serde(rename = "type")]
    content_type: String,
    text: String,
}
|
|
178
|
+
|
|
179
|
+
/// Rolling health counters for one SearXNG endpoint.
#[derive(Debug, Clone, Default)]
struct EndpointHealth {
    successes: u32,
    failures: u32,
    // Instant until which the endpoint is skipped — assumed to be a Unix
    // timestamp in seconds; TODO confirm against the code that writes it
    // (not visible in this chunk).
    cooldown_until: u64,
    last_error: Option<String>,
}

/// Outcome of querying one provider: its items, or an error description.
#[derive(Debug, Clone)]
struct SearchProviderReport {
    provider: &'static str,
    items: Vec<SearchResult>,
    error: Option<String>,
}
|
|
193
|
+
|
|
194
|
+
/// Response envelope for the Wikipedia search API — field names match the
/// MediaWiki `action=query&list=search` shape; confirm against the
/// wikipedia_search provider (not in view).
#[derive(Debug, Deserialize)]
struct WikipediaResponse {
    #[serde(default)]
    query: Option<WikipediaQuery>,
}

#[derive(Debug, Deserialize)]
struct WikipediaQuery {
    #[serde(default)]
    search: Vec<WikipediaEntry>,
}

/// One Wikipedia search hit.
#[derive(Debug, Deserialize)]
struct WikipediaEntry {
    #[serde(default)]
    title: String,
    // Excerpt text as delivered by the API.
    #[serde(default)]
    snippet: String,
    #[serde(default)]
    pageid: u64,
}
|
|
215
|
+
|
|
216
|
+
/// Envelope for an archive.org-style search response — presumably the
/// advancedsearch API; confirm against the consuming provider (not in view).
#[derive(Debug, Deserialize)]
struct ArchiveResponse {
    // Defaults to an empty inner response when the key is missing.
    #[serde(default)]
    response: ArchiveInnerResponse,
}

#[derive(Debug, Deserialize, Default)]
struct ArchiveInnerResponse {
    #[serde(default)]
    docs: Vec<ArchiveDoc>,
}

/// One document record from the archive search.
#[derive(Debug, Deserialize, Default)]
struct ArchiveDoc {
    #[serde(default)]
    identifier: String,
    #[serde(default)]
    title: Option<String>,
    // Kept as raw Value — upstream may send a string or an array here;
    // TODO confirm with the consuming code (not in view).
    #[serde(default)]
    description: Option<serde_json::Value>,
}
|
|
237
|
+
|
|
238
|
+
static ENDPOINT_HEALTH: OnceLock<Mutex<HashMap<String, EndpointHealth>>> = OnceLock::new();
|
|
239
|
+
|
|
240
|
+
fn endpoint_health_store() -> &'static Mutex<HashMap<String, EndpointHealth>> {
|
|
241
|
+
ENDPOINT_HEALTH.get_or_init(|| Mutex::new(HashMap::new()))
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/// CLI entry point: parse arguments and dispatch to the requested subcommand.
#[tokio::main]
async fn main() -> Result<()> {
    let cli = Cli::parse();
    match cli.command {
        Commands::WebSearch {
            query,
            max_results,
            endpoint,
            json,
        } => {
            // The broker fans the query out to several engines; `endpoint`
            // pins a specific SearXNG instance when provided.
            let results = broker_web_search(endpoint.as_deref(), &query, max_results).await?;
            if json {
                println!("{}", serde_json::to_string_pretty(&results)?);
            } else if results.is_empty() {
                println!("No results found.");
            } else {
                // Human-readable rendering; empty fields get placeholder labels.
                for (i, item) in results.iter().enumerate() {
                    println!(
                        "{}. {} [{} | {}]\n {}\n {}\n",
                        i + 1,
                        if item.title.trim().is_empty() {
                            "(untitled)"
                        } else {
                            item.title.trim()
                        },
                        if item.engine.trim().is_empty() {
                            "unknown"
                        } else {
                            item.engine.trim()
                        },
                        if item.provider.trim().is_empty() {
                            "broker"
                        } else {
                            item.provider.trim()
                        },
                        item.url.trim(),
                        item.content.trim()
                    );
                }
            }
        }
        Commands::WebFetch { url } => {
            println!("{}", web_fetch_page(&url).await?);
        }
        Commands::TorrentSearch {
            query,
            max_results,
            endpoint,
            with_magnets,
            json,
        } => {
            // NOTE(review): max_results is clamped to 1..=20 here but not for
            // WebSearch above — broker_web_search clamps internally; confirm
            // the asymmetry is intentional.
            let results = torrent_search(
                endpoint.as_deref(),
                &query,
                max_results.min(20).max(1),
                with_magnets,
            )
            .await?;
            if json {
                println!("{}", serde_json::to_string_pretty(&results)?);
            } else if results.is_empty() {
                println!("No torrent results found.");
            } else {
                for (idx, item) in results.iter().enumerate() {
                    println!(
                        "{}. {} [{}]\n {}\n {}\n",
                        idx + 1,
                        if item.title.trim().is_empty() {
                            "(untitled)"
                        } else {
                            item.title.trim()
                        },
                        if item.engine.trim().is_empty() {
                            "unknown"
                        } else {
                            item.engine.trim()
                        },
                        item.url.trim(),
                        item.content.trim()
                    );
                    // Magnet links follow each result, indented, when present.
                    if !item.magnet_links.is_empty() {
                        for magnet in &item.magnet_links {
                            println!(" magnet: {}", magnet);
                        }
                        println!();
                    }
                }
            }
        }
        Commands::TorrentExtract {
            url,
            max_links,
            json,
        } => {
            let links = extract_magnet_links(&url, max_links.min(20).max(1)).await?;
            if json {
                println!("{}", serde_json::to_string_pretty(&links)?);
            } else if links.is_empty() {
                println!("No magnet links found.");
            } else {
                for (idx, link) in links.iter().enumerate() {
                    println!("{}. {}", idx + 1, link);
                }
            }
        }
        Commands::Manifest => {
            // Manifest JSON is embedded at compile time from the crate root.
            println!("{}", include_str!("../plugin.manifest.json"));
        }
        Commands::ServeMcp => {
            run_mcp_server().await?;
        }
    }
    Ok(())
}
|
|
358
|
+
|
|
359
|
+
/// Serve MCP over stdio: one JSON-RPC request per input line, one JSON
/// response per output line, until stdin closes.
///
/// NOTE(review): stdin is read with blocking `std::io` inside an async fn.
/// This works because the server is the runtime's only task, but tokio's
/// async stdin would be safer if more tasks are ever added.
async fn run_mcp_server() -> Result<()> {
    let stdin = io::stdin();
    let mut stdout = io::stdout();

    for line in stdin.lock().lines() {
        let line = line?;
        let line = line.trim();
        if line.is_empty() {
            continue;
        }

        // Malformed JSON yields a -32700 Parse error response (with no id,
        // since we could not read one) and the loop continues.
        let request: JsonRpcRequest = match serde_json::from_str(line) {
            Ok(req) => req,
            Err(e) => {
                let response = JsonRpcResponse {
                    jsonrpc: "2.0".to_string(),
                    id: None,
                    result: None,
                    error: Some(JsonRpcError {
                        code: -32700,
                        message: format!("Parse error: {}", e),
                    }),
                };
                writeln!(stdout, "{}", serde_json::to_string(&response)?)?;
                stdout.flush()?;
                continue;
            }
        };

        let response = handle_mcp_request(&request).await;
        // Flush per message so the client never waits on buffered output.
        writeln!(stdout, "{}", serde_json::to_string(&response)?)?;
        stdout.flush()?;
    }

    Ok(())
}
|
|
395
|
+
|
|
396
|
+
/// Route one JSON-RPC request to its MCP handler.
///
/// Tool-level failures are returned as *successful* responses whose
/// `ToolResult` has `is_error: true`; only an unknown method produces a
/// JSON-RPC error object (-32601).
async fn handle_mcp_request(request: &JsonRpcRequest) -> JsonRpcResponse {
    match request.method.as_str() {
        "initialize" => JsonRpcResponse {
            jsonrpc: "2.0".to_string(),
            id: request.id.clone(),
            result: Some(serde_json::json!({
                "protocolVersion": "2024-11-05",
                "capabilities": {
                    "tools": {}
                },
                "serverInfo": {
                    "name": "masix-discovery",
                    "version": env!("CARGO_PKG_VERSION")
                }
            })),
            error: None,
        },
        // NOTE(review): this arm answers a notification with a response that
        // has neither result nor error — strict JSON-RPC 2.0 expects no reply
        // to notifications at all, and run_mcp_server still writes this line.
        // Confirm that connected clients tolerate it.
        "notifications/initialized" => JsonRpcResponse {
            jsonrpc: "2.0".to_string(),
            id: None,
            result: None,
            error: None,
        },
        "tools/list" => {
            let tools = get_tool_definitions();
            JsonRpcResponse {
                jsonrpc: "2.0".to_string(),
                id: request.id.clone(),
                result: Some(serde_json::json!({ "tools": tools })),
                error: None,
            }
        }
        "tools/call" => {
            let params = &request.params;
            // Missing name falls through to handle_tool_call's "Unknown tool".
            let tool_name = params.get("name").and_then(|v| v.as_str()).unwrap_or("");

            let arguments = params
                .get("arguments")
                .cloned()
                .unwrap_or(serde_json::json!({}));

            match handle_tool_call(tool_name, arguments).await {
                Ok(result) => JsonRpcResponse {
                    jsonrpc: "2.0".to_string(),
                    id: request.id.clone(),
                    result: Some(serde_json::to_value(result).unwrap_or(serde_json::json!({}))),
                    error: None,
                },
                // Errors are folded into an is_error ToolResult, not a
                // JSON-RPC error — the call itself "succeeded" at the
                // protocol level.
                Err(e) => JsonRpcResponse {
                    jsonrpc: "2.0".to_string(),
                    id: request.id.clone(),
                    result: Some(
                        serde_json::to_value(ToolResult {
                            content: vec![ToolContent {
                                content_type: "text".to_string(),
                                text: format!("Error: {}", e),
                            }],
                            is_error: true,
                        })
                        .unwrap_or(serde_json::json!({})),
                    ),
                    error: None,
                },
            }
        }
        _ => JsonRpcResponse {
            jsonrpc: "2.0".to_string(),
            id: request.id.clone(),
            result: None,
            error: Some(JsonRpcError {
                code: -32601,
                message: format!("Method not found: {}", request.method),
            }),
        },
    }
}
|
|
472
|
+
|
|
473
|
+
/// Static catalogue of MCP tools advertised via `tools/list`.
///
/// Keep names and schemas in sync with the dispatch arms in
/// `handle_tool_call` — there is no compile-time link between the two.
fn get_tool_definitions() -> Vec<ToolDefinition> {
    vec![
        ToolDefinition {
            name: "web_search".to_string(),
            description:
                "Search the web using autonomous multi-engine broker (SearXNG + direct sources)."
                    .to_string(),
            input_schema: serde_json::json!({
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query"
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum number of results (default: 5, max: 20)",
                        "default": 5
                    }
                },
                "required": ["query"]
            }),
        },
        ToolDefinition {
            name: "web_fetch".to_string(),
            description: "Fetch and extract text content from a web page".to_string(),
            input_schema: serde_json::json!({
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "URL to fetch"
                    }
                },
                "required": ["url"]
            }),
        },
        ToolDefinition {
            name: "torrent_search".to_string(),
            description:
                "Search torrent metadata pages (lawful use only), with optional magnet extraction"
                    .to_string(),
            input_schema: serde_json::json!({
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search query for torrent metadata"
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum number of results (default: 5, max: 20)",
                        "default": 5
                    },
                    "with_magnets": {
                        "type": "boolean",
                        "description": "Try to extract magnet links from each result URL (default: true)",
                        "default": true
                    }
                },
                "required": ["query"]
            }),
        },
        ToolDefinition {
            name: "torrent_extract".to_string(),
            description: "Extract magnet links from a page URL".to_string(),
            input_schema: serde_json::json!({
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "Page URL or magnet URL"
                    },
                    "max_links": {
                        "type": "integer",
                        "description": "Maximum links to return (default: 5, max: 20)",
                        "default": 5
                    }
                },
                "required": ["url"]
            }),
        },
    ]
}
|
|
557
|
+
|
|
558
|
+
async fn handle_tool_call(name: &str, arguments: serde_json::Value) -> Result<ToolResult> {
|
|
559
|
+
match name {
|
|
560
|
+
"web_search" => {
|
|
561
|
+
let query = arguments
|
|
562
|
+
.get("query")
|
|
563
|
+
.and_then(|v| v.as_str())
|
|
564
|
+
.ok_or_else(|| anyhow!("Missing 'query' parameter"))?;
|
|
565
|
+
let max_results = arguments
|
|
566
|
+
.get("max_results")
|
|
567
|
+
.and_then(|v| v.as_u64())
|
|
568
|
+
.unwrap_or(5) as usize;
|
|
569
|
+
|
|
570
|
+
let results = broker_web_search(None, query, max_results.min(20).max(1)).await?;
|
|
571
|
+
let text = serde_json::to_string_pretty(&results)?;
|
|
572
|
+
|
|
573
|
+
Ok(ToolResult {
|
|
574
|
+
content: vec![ToolContent {
|
|
575
|
+
content_type: "text".to_string(),
|
|
576
|
+
text,
|
|
577
|
+
}],
|
|
578
|
+
is_error: false,
|
|
579
|
+
})
|
|
580
|
+
}
|
|
581
|
+
"web_fetch" => {
|
|
582
|
+
let url = arguments
|
|
583
|
+
.get("url")
|
|
584
|
+
.and_then(|v| v.as_str())
|
|
585
|
+
.ok_or_else(|| anyhow!("Missing 'url' parameter"))?;
|
|
586
|
+
|
|
587
|
+
let content = web_fetch_page(url).await?;
|
|
588
|
+
|
|
589
|
+
Ok(ToolResult {
|
|
590
|
+
content: vec![ToolContent {
|
|
591
|
+
content_type: "text".to_string(),
|
|
592
|
+
text: content,
|
|
593
|
+
}],
|
|
594
|
+
is_error: false,
|
|
595
|
+
})
|
|
596
|
+
}
|
|
597
|
+
"torrent_search" => {
|
|
598
|
+
let query = arguments
|
|
599
|
+
.get("query")
|
|
600
|
+
.and_then(|v| v.as_str())
|
|
601
|
+
.ok_or_else(|| anyhow!("Missing 'query' parameter"))?;
|
|
602
|
+
let max_results = arguments
|
|
603
|
+
.get("max_results")
|
|
604
|
+
.and_then(|v| v.as_u64())
|
|
605
|
+
.unwrap_or(5) as usize;
|
|
606
|
+
let with_magnets = arguments
|
|
607
|
+
.get("with_magnets")
|
|
608
|
+
.and_then(|v| v.as_bool())
|
|
609
|
+
.unwrap_or(true);
|
|
610
|
+
|
|
611
|
+
let results =
|
|
612
|
+
torrent_search(None, query, max_results.min(20).max(1), with_magnets).await?;
|
|
613
|
+
let text = serde_json::to_string_pretty(&results)?;
|
|
614
|
+
|
|
615
|
+
Ok(ToolResult {
|
|
616
|
+
content: vec![ToolContent {
|
|
617
|
+
content_type: "text".to_string(),
|
|
618
|
+
text,
|
|
619
|
+
}],
|
|
620
|
+
is_error: false,
|
|
621
|
+
})
|
|
622
|
+
}
|
|
623
|
+
"torrent_extract" => {
|
|
624
|
+
let url = arguments
|
|
625
|
+
.get("url")
|
|
626
|
+
.and_then(|v| v.as_str())
|
|
627
|
+
.ok_or_else(|| anyhow!("Missing 'url' parameter"))?;
|
|
628
|
+
let max_links = arguments
|
|
629
|
+
.get("max_links")
|
|
630
|
+
.and_then(|v| v.as_u64())
|
|
631
|
+
.unwrap_or(5) as usize;
|
|
632
|
+
|
|
633
|
+
let links = extract_magnet_links(url, max_links.min(20).max(1)).await?;
|
|
634
|
+
let text = serde_json::to_string_pretty(&links)?;
|
|
635
|
+
|
|
636
|
+
Ok(ToolResult {
|
|
637
|
+
content: vec![ToolContent {
|
|
638
|
+
content_type: "text".to_string(),
|
|
639
|
+
text,
|
|
640
|
+
}],
|
|
641
|
+
is_error: false,
|
|
642
|
+
})
|
|
643
|
+
}
|
|
644
|
+
_ => Err(anyhow!("Unknown tool: {}", name)),
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
async fn broker_web_search(
|
|
649
|
+
endpoint_override: Option<&str>,
|
|
650
|
+
query: &str,
|
|
651
|
+
max_results: usize,
|
|
652
|
+
) -> Result<Vec<SearchResult>> {
|
|
653
|
+
let max_results = max_results.min(20).max(1);
|
|
654
|
+
let endpoints = resolve_searx_endpoints(endpoint_override);
|
|
655
|
+
let mut reports = collect_provider_reports(&endpoints, query, max_results).await;
|
|
656
|
+
let mut merged = Vec::new();
|
|
657
|
+
for report in &reports {
|
|
658
|
+
merged.extend(report.items.clone());
|
|
659
|
+
}
|
|
660
|
+
let ranked = rank_and_dedup(merged, max_results, query);
|
|
661
|
+
if !ranked.is_empty() {
|
|
662
|
+
return Ok(ranked);
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
// Retry once with a simplified keyword-only query, useful when LLM sends
|
|
666
|
+
// long instruction-style prompts instead of search terms.
|
|
667
|
+
if let Some(relaxed_query) = relax_search_query(query) {
|
|
668
|
+
if relaxed_query != query {
|
|
669
|
+
let retry_reports =
|
|
670
|
+
collect_provider_reports(&endpoints, &relaxed_query, max_results).await;
|
|
671
|
+
let mut retry_merged = Vec::new();
|
|
672
|
+
for report in &retry_reports {
|
|
673
|
+
retry_merged.extend(report.items.clone());
|
|
674
|
+
}
|
|
675
|
+
let retry_ranked = rank_and_dedup(retry_merged, max_results, &relaxed_query);
|
|
676
|
+
if !retry_ranked.is_empty() {
|
|
677
|
+
return Ok(retry_ranked);
|
|
678
|
+
}
|
|
679
|
+
reports.extend(retry_reports);
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
// Second retry using topic-focused keywords for geopolitical/news prompts.
|
|
684
|
+
if let Some(topic_query) = topic_focus_query(query) {
|
|
685
|
+
if topic_query != query {
|
|
686
|
+
let topic_reports =
|
|
687
|
+
collect_provider_reports(&endpoints, &topic_query, max_results).await;
|
|
688
|
+
let mut topic_merged = Vec::new();
|
|
689
|
+
for report in &topic_reports {
|
|
690
|
+
topic_merged.extend(report.items.clone());
|
|
691
|
+
}
|
|
692
|
+
let topic_ranked = rank_and_dedup(topic_merged, max_results, &topic_query);
|
|
693
|
+
if !topic_ranked.is_empty() {
|
|
694
|
+
return Ok(topic_ranked);
|
|
695
|
+
}
|
|
696
|
+
reports.extend(topic_reports);
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
if let Ok(probe_results) = direct_domain_probe(query, max_results).await {
|
|
701
|
+
if !probe_results.is_empty() {
|
|
702
|
+
return Ok(probe_results);
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
let mut provider_errors = Vec::new();
|
|
707
|
+
for report in reports {
|
|
708
|
+
if let Some(err) = report.error {
|
|
709
|
+
provider_errors.push(format!("{}: {}", report.provider, err));
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
if provider_errors.is_empty() {
|
|
714
|
+
Err(anyhow!(
|
|
715
|
+
"multi-engine search returned no results for query '{}'; providers returned empty sets",
|
|
716
|
+
query
|
|
717
|
+
))
|
|
718
|
+
} else {
|
|
719
|
+
Err(anyhow!(
|
|
720
|
+
"multi-engine search failed for query '{}': {}",
|
|
721
|
+
query,
|
|
722
|
+
provider_errors.join(" | ")
|
|
723
|
+
))
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
async fn collect_provider_reports(
|
|
728
|
+
endpoints: &[String],
|
|
729
|
+
query: &str,
|
|
730
|
+
max_results: usize,
|
|
731
|
+
) -> Vec<SearchProviderReport> {
|
|
732
|
+
let searx_future = searx_search_broker(
|
|
733
|
+
endpoints,
|
|
734
|
+
query,
|
|
735
|
+
(max_results * 3).min(MAX_PROVIDER_RESULTS),
|
|
736
|
+
MAX_SEARX_PARALLEL_ENDPOINTS,
|
|
737
|
+
);
|
|
738
|
+
let wiki_future = wikipedia_search(query, (max_results / 2).max(3).min(MAX_PROVIDER_RESULTS));
|
|
739
|
+
let news_future = google_news_search(query, (max_results / 2).max(3).min(MAX_PROVIDER_RESULTS));
|
|
740
|
+
let brave_future = brave_html_search(query, (max_results / 2).max(3).min(MAX_PROVIDER_RESULTS));
|
|
741
|
+
let duckduckgo_future =
|
|
742
|
+
duckduckgo_html_search(query, (max_results / 2).max(3).min(MAX_PROVIDER_RESULTS));
|
|
743
|
+
let bing_future = bing_rss_search(query, (max_results / 2).max(3).min(MAX_PROVIDER_RESULTS));
|
|
744
|
+
|
|
745
|
+
let (searx, wiki, news, brave, duckduckgo, bing) = tokio::join!(
|
|
746
|
+
searx_future,
|
|
747
|
+
wiki_future,
|
|
748
|
+
news_future,
|
|
749
|
+
brave_future,
|
|
750
|
+
duckduckgo_future,
|
|
751
|
+
bing_future
|
|
752
|
+
);
|
|
753
|
+
|
|
754
|
+
vec![
|
|
755
|
+
as_provider_report("searx", searx),
|
|
756
|
+
as_provider_report("wikipedia", wiki),
|
|
757
|
+
as_provider_report("news-rss", news),
|
|
758
|
+
as_provider_report("brave-html", brave),
|
|
759
|
+
as_provider_report("duckduckgo", duckduckgo),
|
|
760
|
+
as_provider_report("bing-rss", bing),
|
|
761
|
+
]
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
fn relax_search_query(query: &str) -> Option<String> {
|
|
765
|
+
let stopwords: HashSet<&'static str> = [
|
|
766
|
+
"fai",
|
|
767
|
+
"ricerca",
|
|
768
|
+
"ricercare",
|
|
769
|
+
"cerca",
|
|
770
|
+
"cercare",
|
|
771
|
+
"correlate",
|
|
772
|
+
"correlato",
|
|
773
|
+
"analizza",
|
|
774
|
+
"analisi",
|
|
775
|
+
"eventuali",
|
|
776
|
+
"possibili",
|
|
777
|
+
"breve",
|
|
778
|
+
"riassunto",
|
|
779
|
+
"sintesi",
|
|
780
|
+
"su",
|
|
781
|
+
"sul",
|
|
782
|
+
"sulla",
|
|
783
|
+
"sulle",
|
|
784
|
+
"con",
|
|
785
|
+
"per",
|
|
786
|
+
"tra",
|
|
787
|
+
"fra",
|
|
788
|
+
"del",
|
|
789
|
+
"della",
|
|
790
|
+
"delle",
|
|
791
|
+
"degli",
|
|
792
|
+
"dei",
|
|
793
|
+
"e",
|
|
794
|
+
"ed",
|
|
795
|
+
"in",
|
|
796
|
+
"di",
|
|
797
|
+
"da",
|
|
798
|
+
"a",
|
|
799
|
+
"il",
|
|
800
|
+
"lo",
|
|
801
|
+
"la",
|
|
802
|
+
"gli",
|
|
803
|
+
"le",
|
|
804
|
+
"the",
|
|
805
|
+
"and",
|
|
806
|
+
"for",
|
|
807
|
+
"with",
|
|
808
|
+
"from",
|
|
809
|
+
"into",
|
|
810
|
+
]
|
|
811
|
+
.into_iter()
|
|
812
|
+
.collect();
|
|
813
|
+
|
|
814
|
+
let mut keywords = Vec::new();
|
|
815
|
+
for raw in query.split_whitespace() {
|
|
816
|
+
let cleaned = raw
|
|
817
|
+
.trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-')
|
|
818
|
+
.to_lowercase();
|
|
819
|
+
if cleaned.len() < 3 {
|
|
820
|
+
continue;
|
|
821
|
+
}
|
|
822
|
+
if stopwords.contains(cleaned.as_str()) {
|
|
823
|
+
continue;
|
|
824
|
+
}
|
|
825
|
+
keywords.push(cleaned);
|
|
826
|
+
if keywords.len() >= 10 {
|
|
827
|
+
break;
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
if keywords.len() < 3 {
|
|
832
|
+
let fallback = query
|
|
833
|
+
.split_whitespace()
|
|
834
|
+
.map(|s| s.trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-'))
|
|
835
|
+
.filter(|s| s.len() >= 3)
|
|
836
|
+
.take(8)
|
|
837
|
+
.collect::<Vec<_>>();
|
|
838
|
+
if fallback.is_empty() {
|
|
839
|
+
return None;
|
|
840
|
+
}
|
|
841
|
+
return Some(fallback.join(" "));
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
Some(keywords.join(" "))
|
|
845
|
+
}
|
|
846
|
+
|
|
847
|
+
/// Build a topic-focused query by keeping priority geopolitical/news terms.
///
/// Tokens are trimmed, lowercased, and dropped below three characters. Terms
/// from the priority list are selected first, in list order (max 7); if fewer
/// than three match, remaining tokens pad the selection up to seven. Returns
/// `None` when no token survives the length filter.
fn topic_focus_query(query: &str) -> Option<String> {
    const PRIORITY: &[&str] = &[
        "israele", "israel", "gaza", "hamas", "guerra", "war", "iran", "libano", "hezbollah",
        "blocco", "blocchi", "navale", "navali", "mar", "mare", "italia", "italy", "europa",
        "europe",
    ];

    let tokens: Vec<String> = query
        .split_whitespace()
        .map(|raw| {
            raw.trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-')
                .to_lowercase()
        })
        .filter(|t| t.len() >= 3)
        .collect();

    if tokens.is_empty() {
        return None;
    }

    // Priority terms first, in PRIORITY order regardless of input order.
    let mut selected: Vec<String> = PRIORITY
        .iter()
        .filter(|p| tokens.iter().any(|t| t == *p))
        .take(7)
        .map(|p| p.to_string())
        .collect();

    // Pad with remaining tokens when priority coverage is thin.
    if selected.len() < 3 {
        for t in tokens {
            if selected.len() >= 7 {
                break;
            }
            if !selected.contains(&t) {
                selected.push(t);
            }
        }
    }

    if selected.is_empty() {
        None
    } else {
        Some(selected.join(" "))
    }
}
|
|
910
|
+
|
|
911
|
+
fn as_provider_report(
|
|
912
|
+
provider: &'static str,
|
|
913
|
+
result: Result<Vec<SearchResult>>,
|
|
914
|
+
) -> SearchProviderReport {
|
|
915
|
+
match result {
|
|
916
|
+
Ok(items) => SearchProviderReport {
|
|
917
|
+
provider,
|
|
918
|
+
items,
|
|
919
|
+
error: None,
|
|
920
|
+
},
|
|
921
|
+
Err(e) => SearchProviderReport {
|
|
922
|
+
provider,
|
|
923
|
+
items: Vec::new(),
|
|
924
|
+
error: Some(e.to_string()),
|
|
925
|
+
},
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
/// Deduplicates, scores, sorts, and truncates the merged results from all
/// providers, returning at most `max_results` entries best-first.
///
/// Scoring combines a per-provider trust weight, small bonuses for having an
/// engine label and a substantive snippet, a query-overlap relevance term,
/// a penalty for zero relevance, and a growing penalty for repeated domains
/// (to diversify sources). Results whose title/content/URL share no token
/// with the query are dropped entirely after sorting.
fn rank_and_dedup(
    results: Vec<SearchResult>,
    max_results: usize,
    query: &str,
) -> Vec<SearchResult> {
    let query_tokens = query_tokens(query);
    let mut deduped = Vec::with_capacity(results.len());
    let mut seen_urls = HashSet::new();

    // Pass 1: drop blank URLs and duplicates (by normalized URL key), and
    // backfill a missing source_domain from the URL.
    for mut item in results {
        if item.url.trim().is_empty() {
            continue;
        }
        // normalize_url_key presumably canonicalizes scheme/host/trailing
        // slash so near-identical links collapse — defined elsewhere.
        let key = normalize_url_key(&item.url);
        if key.is_empty() || !seen_urls.insert(key) {
            continue;
        }
        if item.source_domain.trim().is_empty() {
            item.source_domain = source_domain(&item.url);
        }
        deduped.push(item);
    }

    // Pass 2: score each survivor. domain_counts is updated as we go, so
    // the Nth result from the same domain (in input order) pays a larger
    // diversity penalty — this is order-sensitive by design.
    let mut domain_counts: HashMap<String, u32> = HashMap::new();
    for item in &mut deduped {
        let domain = if item.source_domain.trim().is_empty() {
            "unknown".to_string()
        } else {
            item.source_domain.clone()
        };

        // How many earlier results already came from this domain.
        let count = *domain_counts.get(&domain).unwrap_or(&0);
        let mut score = 1.0;

        // Provider trust weights (searx highest, unknown providers lowest).
        score += match item.provider.as_str() {
            p if p.starts_with("searx") => 0.9,
            p if p.starts_with("brave-html") => 0.82,
            "wikipedia" => 0.75,
            p if p.starts_with("news-rss") => 0.7,
            p if p.starts_with("duckduckgo") => 0.65,
            p if p.starts_with("bing-rss") => 0.55,
            p if p.starts_with("direct-probe") => 0.62,
            _ => 0.5,
        };

        // Small bonuses: a named engine and a non-trivial snippet.
        if !item.engine.trim().is_empty() {
            score += 0.15;
        }
        if item.content.len() > 80 {
            score += 0.15;
        }
        // Relevance: token overlap between the query and title+content+URL.
        let combined = format!("{} {} {}", item.title, item.content, item.url);
        let relevance = query_overlap_score(&combined, &query_tokens);
        score += relevance;
        if relevance <= 0.01 {
            score -= 0.3;
        }
        // Diversity penalty for domain repetition, plus a nudge down for
        // results whose domain could not be determined.
        score -= 0.18 * count as f64;
        if domain == "unknown" {
            score -= 0.1;
        }

        // Round to two decimals for stable display/comparison.
        item.score = Some((score * 100.0).round() / 100.0);
        domain_counts.insert(domain, count + 1);
    }

    // Sort best-first; ties broken deterministically by domain then title.
    deduped.sort_by(|a, b| {
        let sb = b.score.unwrap_or(0.0);
        let sa = a.score.unwrap_or(0.0);
        sb.partial_cmp(&sa)
            .unwrap_or(Ordering::Equal)
            .then_with(|| a.source_domain.cmp(&b.source_domain))
            .then_with(|| a.title.cmp(&b.title))
    });

    // Final filter: when the query produced tokens, drop results with no
    // overlap at all (relevance recomputed here on the same combined text).
    if !query_tokens.is_empty() {
        deduped.retain(|item| {
            let combined = format!("{} {} {}", item.title, item.content, item.url);
            query_overlap_score(&combined, &query_tokens) > 0.01
        });
    }

    deduped.truncate(max_results);
    deduped
}
|
|
1014
|
+
|
|
1015
|
+
/// Tokenizes `query` into up to 12 unique, lowercase tokens.
///
/// Each whitespace-separated word is trimmed of surrounding punctuation
/// (keeping '_' and '-') and lowercased; tokens shorter than three bytes
/// and duplicates are discarded. Order of first occurrence is preserved.
fn query_tokens(query: &str) -> Vec<String> {
    let mut tokens: Vec<String> = Vec::new();
    for word in query.split_whitespace() {
        if tokens.len() >= 12 {
            break;
        }
        let normalized = word
            .trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-')
            .to_lowercase();
        if normalized.len() >= 3 && !tokens.contains(&normalized) {
            tokens.push(normalized);
        }
    }
    tokens
}
|
|
1033
|
+
|
|
1034
|
+
/// Fraction of `query_tokens` that occur (case-insensitive substring match)
/// in `text`, scaled by 0.9 so full overlap scores 0.9. Returns 0.0 when
/// there are no query tokens.
fn query_overlap_score(text: &str, query_tokens: &[String]) -> f64 {
    if query_tokens.is_empty() {
        return 0.0;
    }
    let haystack = text.to_lowercase();
    let hits = query_tokens
        .iter()
        .filter(|token| haystack.contains(token.as_str()))
        .count();
    hits as f64 / query_tokens.len() as f64 * 0.9
}
|
|
1047
|
+
|
|
1048
|
+
/// Fans `query` out to the healthiest SearXNG endpoints in parallel and
/// merges their results.
///
/// Endpoints are ordered by `rank_endpoints`; the top
/// `max_parallel_endpoints` are queried concurrently via spawned tasks,
/// updating the shared endpoint-health store on each success or failure.
/// If the parallel pass yields nothing, the remaining endpoints are probed
/// sequentially until one returns results. Errors only when every endpoint
/// failed, with all per-endpoint errors joined into the message.
async fn searx_search_broker(
    endpoints: &[String],
    query: &str,
    max_results_per_endpoint: usize,
    max_parallel_endpoints: usize,
) -> Result<Vec<SearchResult>> {
    let ordered = rank_endpoints(endpoints).await;
    // At least one endpoint, never more than are available.
    let cap = max_parallel_endpoints.max(1).min(ordered.len().max(1));
    let selected: Vec<String> = ordered.iter().take(cap).cloned().collect();

    // Spawn one task per selected endpoint; each returns (endpoint, result)
    // so the health store can be updated after the join.
    let mut tasks = Vec::with_capacity(selected.len());
    for endpoint in selected {
        let q = query.to_string();
        tasks.push(tokio::spawn(async move {
            let out = web_search_single_endpoint(&endpoint, &q, max_results_per_endpoint).await;
            (endpoint, out)
        }));
    }

    let mut merged = Vec::new();
    let mut errors = Vec::new();

    for task in tasks {
        match task.await {
            Ok((endpoint, Ok(results))) => {
                mark_endpoint_success(&endpoint).await;
                merged.extend(results);
            }
            Ok((endpoint, Err(err))) => {
                mark_endpoint_failure(&endpoint, &err.to_string()).await;
                errors.push(format!("{} -> {}", endpoint, err));
            }
            Err(err) => {
                // Task panicked or was cancelled; no endpoint to penalize.
                errors.push(format!("join error: {}", err));
            }
        }
    }

    if merged.is_empty() {
        // Fallback pass: if caller provided more endpoints than current parallel cap,
        // probe the remaining ones sequentially before failing.
        for endpoint in ordered.iter().skip(cap).cloned() {
            match web_search_single_endpoint(&endpoint, query, max_results_per_endpoint).await {
                Ok(results) => {
                    mark_endpoint_success(&endpoint).await;
                    merged.extend(results);
                    // Stop at the first endpoint that actually produced data.
                    if !merged.is_empty() {
                        break;
                    }
                }
                Err(err) => {
                    mark_endpoint_failure(&endpoint, &err.to_string()).await;
                    errors.push(format!("{} -> {}", endpoint, err));
                }
            }
        }
    }

    if merged.is_empty() {
        let details = if errors.is_empty() {
            "unknown failure".to_string()
        } else {
            errors.join(" | ")
        };
        return Err(anyhow!("all searx endpoints failed: {}", details));
    }

    Ok(merged)
}
|
|
1117
|
+
|
|
1118
|
+
/// Normalizes, deduplicates, and orders SearXNG endpoints by recorded health.
///
/// Each endpoint is trimmed and stripped of a trailing '/'; empties and
/// duplicates are dropped. Score per endpoint: +2 per recorded success,
/// -1 per failure, and a flat -20 while still inside its failure cooldown
/// window. Sorted best-first; the sort is stable, so equal scores keep
/// their input order.
async fn rank_endpoints(endpoints: &[String]) -> Vec<String> {
    let mut unique = Vec::new();
    let mut seen = HashSet::new();
    for raw in endpoints {
        let normalized = raw.trim().trim_end_matches('/').to_string();
        if normalized.is_empty() {
            continue;
        }
        if seen.insert(normalized.clone()) {
            unique.push(normalized);
        }
    }

    let now = now_unix_secs();
    // Lock is held only for the scoring pass below.
    let map = endpoint_health_store().lock().await;

    let mut ranked: Vec<(String, i64)> = unique
        .into_iter()
        .map(|endpoint| {
            // Endpoints never seen before get default (zeroed) stats.
            let stat = map.get(&endpoint).cloned().unwrap_or_default();
            let cooldown_penalty = if stat.cooldown_until > now { 20 } else { 0 };
            let score = (stat.successes as i64 * 2) - (stat.failures as i64) - cooldown_penalty;
            (endpoint, score)
        })
        .collect();

    ranked.sort_by(|a, b| b.1.cmp(&a.1));
    ranked.into_iter().map(|x| x.0).collect()
}
|
|
1147
|
+
|
|
1148
|
+
async fn mark_endpoint_success(endpoint: &str) {
|
|
1149
|
+
let mut map = endpoint_health_store().lock().await;
|
|
1150
|
+
let entry = map.entry(endpoint.to_string()).or_default();
|
|
1151
|
+
entry.successes = entry.successes.saturating_add(1);
|
|
1152
|
+
entry.cooldown_until = 0;
|
|
1153
|
+
entry.last_error = None;
|
|
1154
|
+
}
|
|
1155
|
+
|
|
1156
|
+
async fn mark_endpoint_failure(endpoint: &str, error: &str) {
|
|
1157
|
+
let mut map = endpoint_health_store().lock().await;
|
|
1158
|
+
let entry = map.entry(endpoint.to_string()).or_default();
|
|
1159
|
+
entry.failures = entry.failures.saturating_add(1);
|
|
1160
|
+
let lower = error.to_lowercase();
|
|
1161
|
+
let cooldown = if lower.contains("429") { 120 } else { 30 };
|
|
1162
|
+
entry.cooldown_until = now_unix_secs().saturating_add(cooldown);
|
|
1163
|
+
entry.last_error = Some(error.to_string());
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1166
|
+
/// Queries a single SearXNG endpoint's JSON API, retrying transient failures.
///
/// Performs up to `SEARX_RETRIES` attempts with status-aware backoff
/// (`backoff_for_attempt`). On a successful response, drops results with
/// empty URLs, truncates to `max_results` (capped at
/// `MAX_PROVIDER_RESULTS`), and maps the remainder into `SearchResult`s
/// tagged with a `searx:<host>` provider label. Non-retryable HTTP statuses
/// and exhausted retries return an error describing the last failure.
async fn web_search_single_endpoint(
    endpoint: &str,
    query: &str,
    max_results: usize,
) -> Result<Vec<SearchResult>> {
    let base = endpoint.trim_end_matches('/');
    let url = format!(
        "{}/search?q={}&format=json&language=en-US",
        base,
        url_encode(query)
    );
    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} searx", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;

    let mut last_error: Option<String> = None;

    for attempt in 1..=SEARX_RETRIES {
        match client
            .get(&url)
            .header("Accept", "application/json")
            .header("Accept-Language", "en-US,en;q=0.8")
            .send()
            .await
        {
            Ok(response) => {
                let status = response.status();
                if status.is_success() {
                    // A body that isn't valid SearXNG JSON aborts immediately
                    // (no retry) via `?`.
                    let mut parsed: SearxResponse = response
                        .json()
                        .await
                        .map_err(|e| anyhow!("Invalid SearXNG JSON from {}: {}", endpoint, e))?;
                    parsed.results.retain(|r| !r.url.trim().is_empty());
                    parsed
                        .results
                        .truncate(max_results.min(MAX_PROVIDER_RESULTS));

                    let provider = format!("searx:{}", endpoint_host(endpoint));
                    let mapped = parsed
                        .results
                        .into_iter()
                        .map(|item| SearchResult {
                            title: item.title,
                            url: item.url.clone(),
                            content: item.content,
                            engine: item.engine,
                            provider: provider.clone(),
                            source_domain: source_domain(&item.url),
                            score: None,
                            endpoint: Some(endpoint.to_string()),
                        })
                        .collect::<Vec<_>>();
                    return Ok(mapped);
                }

                // Non-2xx: capture a snippet of the body for diagnostics.
                let body = response.text().await.unwrap_or_default();
                let snippet = summarize_for_error(&body);
                let err = format!(
                    "SearX endpoint {} HTTP {} (attempt {}/{}): {}",
                    endpoint, status, attempt, SEARX_RETRIES, snippet
                );

                // Retry only on retryable statuses and when attempts remain;
                // backoff scales with the status (429 backs off longest).
                if should_retry_status(status) && attempt < SEARX_RETRIES {
                    last_error = Some(err);
                    tokio::time::sleep(backoff_for_attempt(attempt, status.as_u16())).await;
                    continue;
                }

                return Err(anyhow!(err));
            }
            Err(e) => {
                // Transport-level error (DNS, connect, timeout, ...): always
                // retryable while attempts remain.
                let err = format!(
                    "SearX endpoint {} request error (attempt {}/{}): {}",
                    endpoint, attempt, SEARX_RETRIES, e
                );
                if attempt < SEARX_RETRIES {
                    last_error = Some(err);
                    tokio::time::sleep(backoff_for_attempt(attempt, 0)).await;
                    continue;
                }
                return Err(anyhow!(err));
            }
        }
    }

    // Only reachable if the loop body never returned (e.g. SEARX_RETRIES == 0).
    Err(anyhow!(
        "SearX endpoint {} failed: {}",
        endpoint,
        last_error.unwrap_or_else(|| "unknown error".to_string())
    ))
}
|
|
1258
|
+
|
|
1259
|
+
fn should_retry_status(status: reqwest::StatusCode) -> bool {
|
|
1260
|
+
matches!(
|
|
1261
|
+
status.as_u16(),
|
|
1262
|
+
429 | 500 | 502 | 503 | 504 | 520 | 521 | 522 | 523 | 524
|
|
1263
|
+
)
|
|
1264
|
+
}
|
|
1265
|
+
|
|
1266
|
+
/// Exponential backoff delay before retry `attempt` (1-based), with a base
/// chosen from the HTTP status that triggered the retry: 1200 ms for 429,
/// 700 ms for common 5xx, 500 ms otherwise. The doubling multiplier
/// (2^(attempt-1)) is capped at 6.
fn backoff_for_attempt(attempt: usize, status: u16) -> Duration {
    let base_ms: u64 = if status == 429 {
        1200
    } else if matches!(status, 500 | 502 | 503 | 504) {
        700
    } else {
        500
    };
    let exponent = attempt.saturating_sub(1) as u32;
    let multiplier = 2_u64.pow(exponent).min(6);
    Duration::from_millis(base_ms * multiplier)
}
|
|
1275
|
+
|
|
1276
|
+
fn endpoint_host(endpoint: &str) -> String {
|
|
1277
|
+
reqwest::Url::parse(endpoint)
|
|
1278
|
+
.ok()
|
|
1279
|
+
.and_then(|u| u.host_str().map(|s| s.to_string()))
|
|
1280
|
+
.unwrap_or_else(|| "unknown".to_string())
|
|
1281
|
+
}
|
|
1282
|
+
|
|
1283
|
+
/// Queries the English Wikipedia search API (`action=query&list=search`).
///
/// Requests up to `max_results` hits (`srlimit` clamped to 1..=20) and maps
/// each one to a `SearchResult` pointing at the canonical
/// `?curid=<pageid>` URL. Snippets come back as HTML, so tags and XML
/// entities are stripped before use. Errors on HTTP failure or invalid JSON.
async fn wikipedia_search(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} wikipedia", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;

    let url = format!(
        "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={}&utf8=&format=json&srlimit={}",
        url_encode(query),
        max_results.min(20).max(1)
    );

    let response = client
        .get(&url)
        .header("Accept", "application/json")
        .send()
        .await?;

    if !response.status().is_success() {
        return Err(anyhow!(
            "wikipedia provider failed: HTTP {}",
            response.status()
        ));
    }

    let parsed: WikipediaResponse = response.json().await?;
    let entries = parsed
        .query
        .map(|q| q.search)
        .unwrap_or_default()
        .into_iter()
        // pageid == 0 would produce a broken curid URL; skip such entries.
        .filter(|item| item.pageid > 0)
        .take(max_results.min(MAX_PROVIDER_RESULTS))
        .map(|item| {
            let page_url = format!("https://en.wikipedia.org/?curid={}", item.pageid);
            SearchResult {
                title: item.title,
                url: page_url.clone(),
                // Snippet is HTML with entity escapes; reduce to plain text.
                content: strip_html_tags(&decode_xml_entities(&item.snippet)),
                engine: "wikipedia-api".to_string(),
                provider: "wikipedia".to_string(),
                source_domain: source_domain(&page_url),
                score: None,
                endpoint: None,
            }
        })
        .collect::<Vec<_>>();

    Ok(entries)
}
|
|
1333
|
+
|
|
1334
|
+
/// Queries Google News RSS across all configured `NEWS_REGIONS` in parallel
/// and merges the per-region results.
///
/// Each region gets a budget of roughly `2 * max_results / regions` items
/// (at least 2, capped at `MAX_PROVIDER_RESULTS`). Regional failures are
/// collected and surfaced only when *every* region returned nothing.
async fn google_news_search(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    // Over-fetch (2x) so downstream dedup/ranking has material to cut.
    let per_region = ((max_results.saturating_mul(2)) / NEWS_REGIONS.len())
        .max(2)
        .min(MAX_PROVIDER_RESULTS);

    let mut tasks = Vec::with_capacity(NEWS_REGIONS.len());
    for (country, hl, ceid) in NEWS_REGIONS {
        let q = query.to_string();
        // Copy the &'static str fields out so the spawned task is 'static.
        let country = *country;
        let hl = *hl;
        let ceid = *ceid;
        tasks.push(tokio::spawn(async move {
            let out = google_news_region_search(&q, country, hl, ceid, per_region).await;
            (country, out)
        }));
    }

    let mut merged = Vec::new();
    let mut errors = Vec::new();
    for task in tasks {
        match task.await {
            Ok((_, Ok(items))) => merged.extend(items),
            Ok((country, Err(err))) => errors.push(format!("{}: {}", country, err)),
            Err(err) => errors.push(format!("join error: {}", err)),
        }
    }

    if merged.is_empty() {
        if errors.is_empty() {
            Err(anyhow!("news-rss provider returned no results"))
        } else {
            Err(anyhow!(
                "news-rss providers failed for query '{}': {}",
                query,
                errors.join(" | ")
            ))
        }
    } else {
        Ok(merged)
    }
}
|
|
1375
|
+
|
|
1376
|
+
/// Fetches the Google News RSS feed for a single region (`hl`/`gl`/`ceid`)
/// and maps its items to `SearchResult`s.
///
/// The response body is parsed as RSS first, and as Atom if RSS parsing
/// yields no items. Provider/engine labels embed the region's country code
/// so merged results stay traceable to their source feed.
async fn google_news_region_search(
    query: &str,
    country: &str,
    hl: &str,
    ceid: &str,
    max_results: usize,
) -> Result<Vec<SearchResult>> {
    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} news-rss", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;

    let url = format!(
        "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}",
        url_encode(query),
        url_encode(hl),
        url_encode(country),
        url_encode(ceid)
    );

    let response = client.get(&url).send().await?;
    if !response.status().is_success() {
        return Err(anyhow!(
            "news-rss provider {} failed: HTTP {}",
            country,
            response.status()
        ));
    }

    let body = response.text().await?;
    let limit = max_results.min(MAX_PROVIDER_RESULTS).max(1);
    let mut items = parse_rss_items(&body, limit);

    // Some feeds come back as Atom rather than RSS; try the other parser.
    if items.is_empty() {
        items = parse_atom_entries(&body, limit);
    }

    let engine = format!("google-news-rss-{}", country.to_lowercase());
    let provider = format!("news-rss:{}", country);

    Ok(items
        .into_iter()
        .map(|item| SearchResult {
            title: item.title,
            url: item.link.clone(),
            content: item.description,
            engine: engine.clone(),
            provider: provider.clone(),
            source_domain: source_domain(&item.link),
            score: None,
            endpoint: None,
        })
        .collect::<Vec<_>>())
}
|
|
1430
|
+
|
|
1431
|
+
/// Scrapes the Brave Search HTML results page for `query`.
///
/// Sends a browser-like request to `search.brave.com`, parses the returned
/// HTML with `scraper`, and extracts title / URL / description from each
/// `div.snippet[data-type="web"]` result card. Returns an error on HTTP
/// failure, on selector-compilation failure, or when no results were found.
/// NOTE(review): the CSS selectors track Brave's current markup and will
/// silently stop matching if that markup changes.
async fn brave_html_search(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let client = reqwest::Client::builder()
        .user_agent(BROWSER_USER_AGENT)
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;

    let url = format!(
        "https://search.brave.com/search?q={}&source=web",
        url_encode(query)
    );
    let response = client
        .get(&url)
        .header("Accept-Language", "en-US,en;q=0.8")
        .header("Accept-Encoding", "gzip, deflate")
        .send()
        .await?;
    if !response.status().is_success() {
        return Err(anyhow!(
            "brave-html provider failed: HTTP {}",
            response.status()
        ));
    }

    let body = response.text().await?;
    let doc = Html::parse_document(&body);
    let snippet_selector = Selector::parse("div.snippet[data-type=\"web\"]")
        .map_err(|e| anyhow!("brave snippet selector error: {}", e))?;
    let link_selector =
        Selector::parse("a.l1[href]").map_err(|e| anyhow!("brave link selector error: {}", e))?;
    let title_selector =
        Selector::parse("div.title").map_err(|e| anyhow!("brave title selector error: {}", e))?;
    let desc_selector = Selector::parse("div.snippet-description")
        .map_err(|e| anyhow!("brave description selector error: {}", e))?;

    let mut results = Vec::new();
    let limit = max_results.min(MAX_PROVIDER_RESULTS).max(1);
    for snippet in doc.select(&snippet_selector) {
        if results.len() >= limit {
            break;
        }
        // A result card without a primary link is unusable; skip it.
        let Some(link) = snippet.select(&link_selector).next() else {
            continue;
        };
        let href = link.value().attr("href").unwrap_or("").trim();
        // Only absolute http(s) destinations; relative/internal links are skipped.
        if !(href.starts_with("http://") || href.starts_with("https://")) {
            continue;
        }

        // Prefer the card's title node; fall back to the anchor text, and
        // finally to a fixed placeholder so the result is never titleless.
        let title = snippet
            .select(&title_selector)
            .next()
            .map(|n| n.text().collect::<Vec<_>>().join(" ").trim().to_string())
            .filter(|v| !v.is_empty())
            .or_else(|| {
                let fallback = link.text().collect::<Vec<_>>().join(" ").trim().to_string();
                if fallback.is_empty() {
                    None
                } else {
                    Some(fallback)
                }
            })
            .unwrap_or_else(|| "Result from Brave Search".to_string());

        let content = snippet
            .select(&desc_selector)
            .next()
            .map(|n| n.text().collect::<Vec<_>>().join(" ").trim().to_string())
            .unwrap_or_default();

        results.push(SearchResult {
            title,
            url: href.to_string(),
            content,
            engine: "brave-html".to_string(),
            provider: "brave-html".to_string(),
            source_domain: source_domain(href),
            score: None,
            endpoint: Some("https://search.brave.com/search".to_string()),
        });
    }

    if results.is_empty() {
        Err(anyhow!("brave-html provider returned no results"))
    } else {
        Ok(results)
    }
}
|
|
1518
|
+
|
|
1519
|
+
/// Searches DuckDuckGo, preferring the lightweight lite endpoint and
/// falling back to the classic `/html` page.
///
/// Lite pass: pairs `a.result-link` anchors with `td.result-snippet` cells
/// by position (assumes one snippet cell per result row — TODO confirm
/// against the lite markup). HTML pass: `a.result__a` titles with a sibling
/// `.result__snippet`; if that yields nothing, a last-resort scan over every
/// `a[href]` keeps links whose anchor text is at least 12 characters.
/// Redirect hrefs are resolved via `normalize_duckduckgo_href`. Errors when
/// every strategy produced zero results.
async fn duckduckgo_html_search(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} duckduckgo", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;
    let target = max_results.min(MAX_PROVIDER_RESULTS).max(1);

    // Preferred path: DuckDuckGo lite endpoint is more stable than /html in headless contexts.
    let lite_url = format!(
        "https://lite.duckduckgo.com/lite/?q={}&kl=wt-wt",
        url_encode(query)
    );
    if let Ok(response) = client
        .get(&lite_url)
        .header("User-Agent", BROWSER_USER_AGENT)
        .header("Accept-Language", "en-US,en;q=0.8")
        .send()
        .await
    {
        if response.status().is_success() {
            let body = response.text().await.unwrap_or_default();
            let doc = Html::parse_document(&body);
            let title_selector = Selector::parse("a.result-link")
                .map_err(|e| anyhow!("duckduckgo lite selector error: {}", e))?;
            let snippet_selector = Selector::parse("td.result-snippet")
                .map_err(|e| anyhow!("duckduckgo lite snippet selector error: {}", e))?;
            // Collect snippets up front; they are matched to links by index.
            let snippets = doc
                .select(&snippet_selector)
                .map(|n| n.text().collect::<Vec<_>>().join(" ").trim().to_string())
                .collect::<Vec<_>>();

            let mut results = Vec::new();
            for (idx, element) in doc.select(&title_selector).enumerate() {
                if results.len() >= target {
                    break;
                }
                let raw_href = element.value().attr("href").unwrap_or("").trim();
                let url = normalize_duckduckgo_href(raw_href);
                if url.is_empty() {
                    continue;
                }
                let title = element
                    .text()
                    .collect::<Vec<_>>()
                    .join(" ")
                    .trim()
                    .to_string();
                if title.is_empty() {
                    continue;
                }
                let content = snippets.get(idx).cloned().unwrap_or_default();
                results.push(SearchResult {
                    title,
                    url: url.clone(),
                    content,
                    engine: "duckduckgo-lite".to_string(),
                    provider: "duckduckgo".to_string(),
                    source_domain: source_domain(&url),
                    score: None,
                    endpoint: Some("https://lite.duckduckgo.com/lite".to_string()),
                });
            }
            if !results.is_empty() {
                return Ok(results);
            }
        }
    }

    // Fallback path: classic /html results page.
    let url = format!(
        "https://duckduckgo.com/html/?q={}&kl=wt-wt",
        url_encode(query)
    );
    let response = client
        .get(&url)
        .header("User-Agent", BROWSER_USER_AGENT)
        .header("Accept-Language", "en-US,en;q=0.8")
        .send()
        .await?;
    if !response.status().is_success() {
        return Err(anyhow!(
            "duckduckgo provider failed: HTTP {}",
            response.status()
        ));
    }

    let body = response.text().await?;
    let doc = Html::parse_document(&body);
    let title_selector =
        Selector::parse("a.result__a").map_err(|e| anyhow!("duckduckgo selector error: {}", e))?;
    let snippet_selector = Selector::parse(".result__snippet")
        .map_err(|e| anyhow!("duckduckgo snippet selector error: {}", e))?;
    let fallback_link_selector =
        Selector::parse("a[href]").map_err(|e| anyhow!("duckduckgo fallback selector: {}", e))?;

    let mut results = Vec::new();

    for element in doc.select(&title_selector) {
        if results.len() >= target {
            break;
        }
        let raw_href = element.value().attr("href").unwrap_or("").trim();
        let url = normalize_duckduckgo_href(raw_href);
        if url.is_empty() {
            continue;
        }
        let title = element
            .text()
            .collect::<Vec<_>>()
            .join(" ")
            .trim()
            .to_string();
        if title.is_empty() {
            continue;
        }

        // The snippet lives in the title anchor's parent container.
        let content = element
            .parent()
            .and_then(scraper::ElementRef::wrap)
            .and_then(|parent| parent.select(&snippet_selector).next())
            .map(|n| n.text().collect::<Vec<_>>().join(" ").trim().to_string())
            .unwrap_or_default();

        results.push(SearchResult {
            title,
            url: url.clone(),
            content,
            engine: "duckduckgo-html".to_string(),
            provider: "duckduckgo".to_string(),
            source_domain: source_domain(&url),
            score: None,
            endpoint: Some("https://duckduckgo.com/html".to_string()),
        });
    }

    // Last resort: scan all anchors; the 12-character minimum filters out
    // navigation chrome in favor of result-like link text.
    if results.is_empty() {
        for link in doc.select(&fallback_link_selector) {
            if results.len() >= target {
                break;
            }
            let raw_href = link.value().attr("href").unwrap_or("").trim();
            let url = normalize_duckduckgo_href(raw_href);
            if url.is_empty() {
                continue;
            }
            let title = link.text().collect::<Vec<_>>().join(" ").trim().to_string();
            if title.len() < 12 {
                continue;
            }
            results.push(SearchResult {
                title,
                url: url.clone(),
                content: String::new(),
                engine: "duckduckgo-html-fallback".to_string(),
                provider: "duckduckgo".to_string(),
                source_domain: source_domain(&url),
                score: None,
                endpoint: Some("https://duckduckgo.com/html".to_string()),
            });
        }
    }

    if results.is_empty() {
        Err(anyhow!("duckduckgo provider returned no results"))
    } else {
        Ok(results)
    }
}
|
|
1686
|
+
|
|
1687
|
+
fn normalize_duckduckgo_href(raw_href: &str) -> String {
|
|
1688
|
+
let href = raw_href.trim();
|
|
1689
|
+
if href.is_empty() {
|
|
1690
|
+
return String::new();
|
|
1691
|
+
}
|
|
1692
|
+
if href.starts_with("http://") || href.starts_with("https://") {
|
|
1693
|
+
return href.to_string();
|
|
1694
|
+
}
|
|
1695
|
+
if href.starts_with("/l/?") || href.starts_with("//duckduckgo.com/l/?") {
|
|
1696
|
+
let canonical = if let Some(stripped) = href.strip_prefix("//duckduckgo.com") {
|
|
1697
|
+
format!("https://duckduckgo.com{}", stripped)
|
|
1698
|
+
} else {
|
|
1699
|
+
format!("https://duckduckgo.com{}", href)
|
|
1700
|
+
};
|
|
1701
|
+
if let Ok(url) = reqwest::Url::parse(&canonical) {
|
|
1702
|
+
for (k, v) in url.query_pairs() {
|
|
1703
|
+
if k == "uddg" {
|
|
1704
|
+
let out = v.to_string();
|
|
1705
|
+
if out.starts_with("http://") || out.starts_with("https://") {
|
|
1706
|
+
return out;
|
|
1707
|
+
}
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
}
|
|
1711
|
+
}
|
|
1712
|
+
String::new()
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
/// Queries Bing's RSS output (`format=rss`) for `query` and maps the feed
/// items to `SearchResult`s.
///
/// Falls back to Atom parsing when RSS parsing yields nothing, and errors
/// when both parsers come up empty or the HTTP request fails.
async fn bing_rss_search(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} bing-rss", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;
    let url = format!(
        "https://www.bing.com/search?q={}&format=rss&setlang=en-US&cc=us",
        url_encode(query)
    );

    let response = client.get(&url).send().await?;
    if !response.status().is_success() {
        return Err(anyhow!(
            "bing-rss provider failed: HTTP {}",
            response.status()
        ));
    }

    let body = response.text().await?;
    let limit = max_results.min(MAX_PROVIDER_RESULTS).max(1);
    let mut items = parse_rss_items(&body, limit);
    // The feed may be Atom-flavored; retry with the Atom parser.
    if items.is_empty() {
        items = parse_atom_entries(&body, limit);
    }
    if items.is_empty() {
        return Err(anyhow!("bing-rss provider returned no results"));
    }

    Ok(items
        .into_iter()
        .map(|item| SearchResult {
            title: item.title,
            url: item.link.clone(),
            content: item.description,
            engine: "bing-rss".to_string(),
            provider: "bing-rss".to_string(),
            source_domain: source_domain(&item.link),
            score: None,
            endpoint: Some("https://www.bing.com/search?format=rss".to_string()),
        })
        .collect::<Vec<_>>())
}
|
|
1757
|
+
|
|
1758
|
+
/// Heuristic fallback provider: guesses candidate homepages by combining the
/// query's leading tokens with common TLDs, probes each URL, and returns
/// lightweight results built from the fetched pages (at most
/// `min(max_results, 3)` hits). Probe failures are silently skipped.
async fn direct_domain_probe(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    // Keep only lowercase tokens usable as domain labels (>= 3 chars,
    // ASCII alphanumerics or '-').
    let mut tokens = query
        .split_whitespace()
        .map(|raw| {
            raw.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_')
                .to_lowercase()
        })
        .filter(|t| t.len() >= 3 && t.chars().all(|c| c.is_ascii_alphanumeric() || c == '-'))
        .collect::<Vec<_>>();
    // NOTE(review): Vec::dedup only removes *consecutive* duplicates; a
    // repeated token separated by another word survives — confirm intended.
    tokens.dedup();
    if tokens.is_empty() {
        return Ok(Vec::new());
    }

    // Domain stems: the first token alone, plus first+second fused.
    let mut stems = Vec::new();
    stems.push(tokens[0].clone());
    if tokens.len() >= 2 {
        stems.push(format!("{}{}", tokens[0], tokens[1]));
    }
    stems.truncate(2);

    // Cross every stem with a small set of common TLDs.
    let tlds = ["com", "org", "net", "io", "it"];
    let mut candidates = Vec::new();
    for stem in stems {
        for tld in tlds {
            candidates.push(format!("https://{}.{}", stem, tld));
        }
    }

    let client = reqwest::Client::builder()
        .user_agent(BROWSER_USER_AGENT)
        .timeout(Duration::from_secs(8))
        .build()?;

    let mut out = Vec::new();
    for url in candidates {
        if out.len() >= max_results.min(3) {
            break;
        }
        // DNS errors, timeouts, and non-2xx responses just skip the candidate.
        let response = match client.get(&url).send().await {
            Ok(r) => r,
            Err(_) => continue,
        };
        if !response.status().is_success() {
            continue;
        }
        let body = response.text().await.unwrap_or_default();
        let title = extract_html_title(&body).unwrap_or_else(|| url.clone());
        let snippet = extract_text_snippet_from_html(&body, 260);
        out.push(SearchResult {
            title,
            url: url.clone(),
            content: snippet,
            engine: "direct-domain-probe".to_string(),
            provider: "direct-probe".to_string(),
            source_domain: source_domain(&url),
            score: None,
            endpoint: None,
        });
    }

    Ok(out)
}
|
|
1821
|
+
|
|
1822
|
+
fn extract_html_title(html: &str) -> Option<String> {
|
|
1823
|
+
let lower = html.to_lowercase();
|
|
1824
|
+
let start = lower.find("<title>")?;
|
|
1825
|
+
let end = lower[start + 7..].find("</title>")?;
|
|
1826
|
+
let raw = &html[start + 7..start + 7 + end];
|
|
1827
|
+
let title = decode_xml_entities(raw).trim().to_string();
|
|
1828
|
+
if title.is_empty() {
|
|
1829
|
+
None
|
|
1830
|
+
} else {
|
|
1831
|
+
Some(title)
|
|
1832
|
+
}
|
|
1833
|
+
}
|
|
1834
|
+
|
|
1835
|
+
/// Extracts a plain-text snippet from an HTML document by trying content
/// containers in priority order (article, main, .content, #content, body);
/// at the first selector that yields any text, the longest match is kept,
/// whitespace-collapsed, and truncated to `max_chars`.
fn extract_text_snippet_from_html(html: &str, max_chars: usize) -> String {
    if html.trim().is_empty() {
        return String::new();
    }
    let document = Html::parse_document(html);
    // Most specific container first; "body" is the catch-all.
    let selectors = ["article", "main", ".content", "#content", "body"];
    let mut best = String::new();
    for selector_str in selectors {
        let Ok(selector) = Selector::parse(selector_str) else {
            continue;
        };
        for element in document.select(&selector) {
            // Collapse the element's text nodes into single-spaced prose.
            let text = element
                .text()
                .collect::<Vec<_>>()
                .join(" ")
                .split_whitespace()
                .collect::<Vec<_>>()
                .join(" ");
            if text.len() > best.len() {
                best = text;
            }
        }
        // Stop at the first selector level that produced any text.
        if !best.is_empty() {
            break;
        }
    }
    truncate_text(best.trim(), max_chars)
}
|
|
1864
|
+
|
|
1865
|
+
/// A single entry parsed from an RSS `<item>` or Atom `<entry>`.
#[derive(Debug)]
struct FeedItem {
    // Human-readable entry title (entity-decoded, whitespace-collapsed).
    title: String,
    // URL of the entry as given by the feed.
    link: String,
    // Cleaned summary/body text.
    description: String,
}
|
|
1871
|
+
|
|
1872
|
+
/// Minimal, dependency-free RSS parser: walks `<item>…</item>` blocks and
/// pulls out title, link, and description (falling back to
/// `<content:encoded>`). Items without a link are dropped; parsing stops
/// once `limit` items are collected or the markup runs out.
fn parse_rss_items(xml: &str, limit: usize) -> Vec<FeedItem> {
    let mut items = Vec::new();
    let mut rest = xml;

    while let Some(start_idx) = rest.find("<item") {
        let after_start = &rest[start_idx..];
        // End of the opening tag (also handles `<item attr="...">`).
        let Some(open_end) = after_start.find('>') else {
            break;
        };
        let content_start = start_idx + open_end + 1;
        let Some(close_rel) = rest[content_start..].find("</item>") else {
            break;
        };
        let content_end = content_start + close_rel;
        let chunk = &rest[content_start..content_end];

        let title = extract_xml_tag(chunk, "title").unwrap_or_default();
        let link = extract_xml_tag(chunk, "link").unwrap_or_default();
        let description = extract_xml_tag(chunk, "description")
            .or_else(|| extract_xml_tag(chunk, "content:encoded"))
            .unwrap_or_default();

        // An item without a link cannot be turned into a result.
        if !link.trim().is_empty() {
            items.push(FeedItem {
                title: clean_feed_text(&title),
                link: clean_feed_text(&link),
                description: clean_feed_text(&description),
            });
        }

        // Continue scanning after this item's closing tag.
        rest = &rest[content_end + "</item>".len()..];
        if items.len() >= limit {
            break;
        }
    }

    items
}
|
|
1910
|
+
|
|
1911
|
+
/// Minimal Atom parser counterpart to `parse_rss_items`: walks
/// `<entry>…</entry>` blocks, taking title, summary (or content), and the
/// first usable `<link href=…>`. Entries without a link are dropped;
/// parsing stops at `limit` entries.
fn parse_atom_entries(xml: &str, limit: usize) -> Vec<FeedItem> {
    let mut entries = Vec::new();
    let mut rest = xml;

    while let Some(start_idx) = rest.find("<entry") {
        let after_start = &rest[start_idx..];
        // End of the opening tag (also handles `<entry attr="...">`).
        let Some(open_end) = after_start.find('>') else {
            break;
        };
        let content_start = start_idx + open_end + 1;
        let Some(close_rel) = rest[content_start..].find("</entry>") else {
            break;
        };
        let content_end = content_start + close_rel;
        let chunk = &rest[content_start..content_end];

        let title = extract_xml_tag(chunk, "title").unwrap_or_default();
        let description = extract_xml_tag(chunk, "summary")
            .or_else(|| extract_xml_tag(chunk, "content"))
            .unwrap_or_default();

        // Atom links live in attributes, not element text.
        let link = extract_atom_link(chunk).unwrap_or_default();

        if !link.trim().is_empty() {
            entries.push(FeedItem {
                title: clean_feed_text(&title),
                link: clean_feed_text(&link),
                description: clean_feed_text(&description),
            });
        }

        rest = &rest[content_end + "</entry>".len()..];
        if entries.len() >= limit {
            break;
        }
    }

    entries
}
|
|
1950
|
+
|
|
1951
|
+
fn extract_atom_link(chunk: &str) -> Option<String> {
|
|
1952
|
+
let mut cursor = chunk;
|
|
1953
|
+
while let Some(idx) = cursor.find("<link") {
|
|
1954
|
+
let tail = &cursor[idx..];
|
|
1955
|
+
let end = tail.find('>')?;
|
|
1956
|
+
let tag = &tail[..=end];
|
|
1957
|
+
if let Some(href) = extract_attr(tag, "href") {
|
|
1958
|
+
if !href.trim().is_empty() {
|
|
1959
|
+
return Some(href);
|
|
1960
|
+
}
|
|
1961
|
+
}
|
|
1962
|
+
cursor = &tail[end + 1..];
|
|
1963
|
+
}
|
|
1964
|
+
None
|
|
1965
|
+
}
|
|
1966
|
+
|
|
1967
|
+
/// Reads a double-quoted attribute value (`attr="value"`) out of a raw tag
/// string. Returns `None` when the attribute is absent or unterminated.
fn extract_attr(tag: &str, attr: &str) -> Option<String> {
    let marker = format!("{}=\"", attr);
    let after = tag.find(&marker)? + marker.len();
    let value_len = tag[after..].find('"')?;
    Some(tag[after..after + value_len].to_string())
}
|
|
1973
|
+
|
|
1974
|
+
/// Returns the raw text between `<tag>` and `</tag>` for the first such
/// pair found in `chunk`, or `None` when either delimiter is missing.
/// Only matches bare opening tags with no attributes.
fn extract_xml_tag(chunk: &str, tag: &str) -> Option<String> {
    let open = format!("<{}>", tag);
    let close = format!("</{}>", tag);

    let body_start = chunk.find(&open)? + open.len();
    let body_len = chunk[body_start..].find(&close)?;
    Some(chunk[body_start..body_start + body_len].to_string())
}
|
|
1987
|
+
|
|
1988
|
+
fn clean_feed_text(value: &str) -> String {
|
|
1989
|
+
let mut out = value.trim().to_string();
|
|
1990
|
+
out = out.replace("<![CDATA[", "").replace("]]>", "");
|
|
1991
|
+
out = decode_xml_entities(&out);
|
|
1992
|
+
out = strip_html_tags(&out);
|
|
1993
|
+
out.split_whitespace().collect::<Vec<_>>().join(" ")
|
|
1994
|
+
}
|
|
1995
|
+
|
|
1996
|
+
/// Decodes the handful of XML/HTML entities that commonly appear in feed
/// payloads.
///
/// As written, every `.replace` mapped a character to itself (e.g.
/// `"&"` → `"&"`) — the entity names had been lost, making the function a
/// no-op. Restored to the named/numeric entities. `&amp;` is decoded LAST
/// so a double-encoded sequence such as `&amp;lt;` yields `&lt;` rather
/// than being decoded twice to `<`.
fn decode_xml_entities(input: &str) -> String {
    input
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#x27;", "'")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}
|
|
2005
|
+
|
|
2006
|
+
/// Removes anything between `<` and `>` (inclusive), keeping only the text
/// outside of tags. Not a real HTML parser: a `>` inside an attribute
/// value will terminate the tag early.
fn strip_html_tags(input: &str) -> String {
    let mut inside_tag = false;
    input
        .chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
|
|
2019
|
+
|
|
2020
|
+
fn resolve_searx_endpoints(endpoint_override: Option<&str>) -> Vec<String> {
|
|
2021
|
+
if let Some(single) = endpoint_override.map(str::trim).filter(|s| !s.is_empty()) {
|
|
2022
|
+
return vec![single.trim_end_matches('/').to_string()];
|
|
2023
|
+
}
|
|
2024
|
+
|
|
2025
|
+
if let Ok(list) = std::env::var("MASIX_DISCOVERY_SEARXNG_URLS") {
|
|
2026
|
+
let parsed = parse_endpoint_list(&list);
|
|
2027
|
+
if !parsed.is_empty() {
|
|
2028
|
+
return parsed;
|
|
2029
|
+
}
|
|
2030
|
+
}
|
|
2031
|
+
|
|
2032
|
+
if let Ok(single) = std::env::var("MASIX_DISCOVERY_SEARXNG_URL") {
|
|
2033
|
+
let parsed = parse_endpoint_list(&single);
|
|
2034
|
+
if !parsed.is_empty() {
|
|
2035
|
+
return parsed;
|
|
2036
|
+
}
|
|
2037
|
+
}
|
|
2038
|
+
|
|
2039
|
+
DEFAULT_SEARXNG_URLS
|
|
2040
|
+
.iter()
|
|
2041
|
+
.map(|v| v.trim_end_matches('/').to_string())
|
|
2042
|
+
.collect()
|
|
2043
|
+
}
|
|
2044
|
+
|
|
2045
|
+
fn parse_endpoint_list(raw: &str) -> Vec<String> {
|
|
2046
|
+
let mut out = Vec::new();
|
|
2047
|
+
let mut seen = HashSet::new();
|
|
2048
|
+
|
|
2049
|
+
for token in raw.split(',') {
|
|
2050
|
+
let value = token.trim().trim_end_matches('/').to_string();
|
|
2051
|
+
if value.is_empty() {
|
|
2052
|
+
continue;
|
|
2053
|
+
}
|
|
2054
|
+
if !(value.starts_with("https://") || value.starts_with("http://")) {
|
|
2055
|
+
continue;
|
|
2056
|
+
}
|
|
2057
|
+
if seen.insert(value.clone()) {
|
|
2058
|
+
out.push(value);
|
|
2059
|
+
}
|
|
2060
|
+
if out.len() >= MAX_ENDPOINTS_FROM_CONFIG {
|
|
2061
|
+
break;
|
|
2062
|
+
}
|
|
2063
|
+
}
|
|
2064
|
+
|
|
2065
|
+
out
|
|
2066
|
+
}
|
|
2067
|
+
|
|
2068
|
+
/// Produces a short, single-line summary of a response body for error
/// messages: whitespace collapsed, capped at 180 bytes (cut at a char
/// boundary), with "..." appended when truncated. Empty bodies become
/// "(empty body)".
fn summarize_for_error(body: &str) -> String {
    let mut text = body.split_whitespace().collect::<Vec<_>>().join(" ");
    if text.len() > 180 {
        // `String::truncate` panics when the cut is not a char boundary
        // (possible with non-ASCII bodies), so back up to the nearest
        // boundary at or below 180 bytes first.
        let mut cut = 180;
        while !text.is_char_boundary(cut) {
            cut -= 1;
        }
        text.truncate(cut);
        text.push_str("...");
    }
    if text.is_empty() {
        "(empty body)".to_string()
    } else {
        text
    }
}
|
|
2081
|
+
|
|
2082
|
+
/// Builds a canonical lowercase key for URL deduplication: parses the URL,
/// drops any fragment, strips a trailing slash, lowercases, and folds
/// http into https so scheme-only variants collapse to one key.
/// Unparseable input falls back to trimmed+lowercased text.
fn normalize_url_key(url: &str) -> String {
    match reqwest::Url::parse(url) {
        Ok(mut parsed) => {
            parsed.set_fragment(None);
            parsed
                .as_str()
                .trim_end_matches('/')
                .to_lowercase()
                // NOTE(review): this replaces "http://" anywhere in the
                // string, including inside query parameter values —
                // presumably only the scheme prefix was intended; confirm.
                .replace("http://", "https://")
        }
        Err(_) => url.trim().to_lowercase(),
    }
}
|
|
2095
|
+
|
|
2096
|
+
fn source_domain(url: &str) -> String {
|
|
2097
|
+
reqwest::Url::parse(url)
|
|
2098
|
+
.ok()
|
|
2099
|
+
.and_then(|parsed| parsed.host_str().map(|s| s.to_string()))
|
|
2100
|
+
.unwrap_or_else(|| "unknown".to_string())
|
|
2101
|
+
}
|
|
2102
|
+
|
|
2103
|
+
/// Current wall-clock time as whole seconds since the Unix epoch; returns
/// 0 if the system clock reports a time before the epoch.
fn now_unix_secs() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
|
|
2109
|
+
|
|
2110
|
+
async fn web_fetch_page(url: &str) -> Result<String> {
|
|
2111
|
+
let client = reqwest::Client::builder()
|
|
2112
|
+
.user_agent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36")
|
|
2113
|
+
.timeout(Duration::from_secs(DEFAULT_FETCH_TIMEOUT_SECS))
|
|
2114
|
+
.build()?;
|
|
2115
|
+
let response = client.get(url).send().await?;
|
|
2116
|
+
if !response.status().is_success() {
|
|
2117
|
+
return Err(anyhow!("fetch failed: HTTP {}", response.status()));
|
|
2118
|
+
}
|
|
2119
|
+
let html = response.text().await?;
|
|
2120
|
+
let document = Html::parse_document(&html);
|
|
2121
|
+
|
|
2122
|
+
let selectors = [
|
|
2123
|
+
"article", "main", ".content", "#content", ".post", ".article", "body",
|
|
2124
|
+
];
|
|
2125
|
+
let mut text_content = String::new();
|
|
2126
|
+
for selector_str in selectors {
|
|
2127
|
+
if let Ok(selector) = Selector::parse(selector_str) {
|
|
2128
|
+
for element in document.select(&selector) {
|
|
2129
|
+
let text = element
|
|
2130
|
+
.text()
|
|
2131
|
+
.collect::<String>()
|
|
2132
|
+
.split_whitespace()
|
|
2133
|
+
.collect::<Vec<_>>()
|
|
2134
|
+
.join(" ");
|
|
2135
|
+
if text.len() > text_content.len() {
|
|
2136
|
+
text_content = text;
|
|
2137
|
+
}
|
|
2138
|
+
}
|
|
2139
|
+
if !text_content.is_empty() {
|
|
2140
|
+
break;
|
|
2141
|
+
}
|
|
2142
|
+
}
|
|
2143
|
+
}
|
|
2144
|
+
|
|
2145
|
+
if text_content.is_empty() {
|
|
2146
|
+
text_content = "(no readable text extracted)".to_string();
|
|
2147
|
+
}
|
|
2148
|
+
if text_content.len() > MAX_WEB_CONTENT {
|
|
2149
|
+
text_content.truncate(MAX_WEB_CONTENT);
|
|
2150
|
+
text_content.push_str("... [truncated]");
|
|
2151
|
+
}
|
|
2152
|
+
|
|
2153
|
+
let title = Selector::parse("title")
|
|
2154
|
+
.ok()
|
|
2155
|
+
.and_then(|sel| document.select(&sel).next())
|
|
2156
|
+
.map(|e| e.text().collect::<String>())
|
|
2157
|
+
.unwrap_or_else(|| "N/A".to_string());
|
|
2158
|
+
|
|
2159
|
+
Ok(format!(
|
|
2160
|
+
"Title: {}\nURL: {}\n\n{}",
|
|
2161
|
+
title.trim(),
|
|
2162
|
+
url,
|
|
2163
|
+
text_content
|
|
2164
|
+
))
|
|
2165
|
+
}
|
|
2166
|
+
|
|
2167
|
+
/// Federated torrent search: queries Nyaa, the SearXNG broker, and
/// archive.org concurrently, merges and deduplicates the results, and
/// optionally enriches SearX hits with magnet links scraped from their
/// detail pages. Fails only when the merged set is empty.
async fn torrent_search(
    endpoint_override: Option<&str>,
    query: &str,
    max_results: usize,
    with_magnets: bool,
) -> Result<Vec<TorrentSearchResult>> {
    // SearX gets a query biased toward torrents; Nyaa/archive take the raw
    // query (falling back to the normalized one when it is empty).
    let normalized_query = normalize_torrent_query(query);
    let direct_query = query.trim();
    let direct_query = if direct_query.is_empty() {
        normalized_query.as_str()
    } else {
        direct_query
    };
    let endpoints = resolve_searx_endpoints(endpoint_override);

    // Over-fetch (2x) per provider so deduplication still leaves enough.
    let searx_future = searx_search_broker(
        &endpoints,
        &normalized_query,
        (max_results * 2).min(MAX_PROVIDER_RESULTS),
        MAX_SEARX_PARALLEL_ENDPOINTS,
    );
    let nyaa_future = nyaa_torrent_search(direct_query, max_results * 2);
    let archive_future = archive_torrent_search(direct_query, max_results * 2);

    // Run all three providers concurrently.
    let (searx, nyaa, archive) = tokio::join!(searx_future, nyaa_future, archive_future);

    let mut output = Vec::new();
    let mut provider_errors = Vec::new();

    match nyaa {
        Ok(mut items) => output.append(&mut items),
        Err(e) => provider_errors.push(format!("nyaa: {}", e)),
    }

    match searx {
        Ok(items) => {
            for result in items {
                // Magnet extraction fetches each result page; best-effort.
                let magnet_links = if with_magnets {
                    extract_magnet_links(&result.url, 3)
                        .await
                        .unwrap_or_default()
                } else {
                    Vec::new()
                };
                output.push(TorrentSearchResult {
                    title: result.title,
                    url: result.url,
                    content: result.content,
                    engine: if result.engine.trim().is_empty() {
                        "searx".to_string()
                    } else {
                        result.engine
                    },
                    magnet_links,
                });
            }
        }
        Err(e) => provider_errors.push(format!("searx: {}", e)),
    }

    match archive {
        Ok(mut items) => output.append(&mut items),
        Err(e) => provider_errors.push(format!("archive: {}", e)),
    }

    output = dedup_torrent_results(output, with_magnets);
    // Final cap: between 1 and 20 results.
    output.truncate(max_results.min(20).max(1));

    if output.is_empty() {
        if provider_errors.is_empty() {
            Err(anyhow!("torrent search returned no results"))
        } else {
            Err(anyhow!(
                "torrent search failed across providers: {}",
                provider_errors.join(" | ")
            ))
        }
    } else {
        Ok(output)
    }
}
|
|
2248
|
+
|
|
2249
|
+
fn dedup_torrent_results(
|
|
2250
|
+
mut results: Vec<TorrentSearchResult>,
|
|
2251
|
+
with_magnets: bool,
|
|
2252
|
+
) -> Vec<TorrentSearchResult> {
|
|
2253
|
+
let mut seen = HashSet::new();
|
|
2254
|
+
let mut deduped = Vec::with_capacity(results.len());
|
|
2255
|
+
|
|
2256
|
+
for mut item in results.drain(..) {
|
|
2257
|
+
let key = normalize_url_key(&item.url);
|
|
2258
|
+
if key.is_empty() || !seen.insert(key) {
|
|
2259
|
+
continue;
|
|
2260
|
+
}
|
|
2261
|
+
if !with_magnets {
|
|
2262
|
+
item.magnet_links.clear();
|
|
2263
|
+
}
|
|
2264
|
+
deduped.push(item);
|
|
2265
|
+
}
|
|
2266
|
+
|
|
2267
|
+
deduped.sort_by(|a, b| {
|
|
2268
|
+
torrent_engine_rank(&b.engine)
|
|
2269
|
+
.cmp(&torrent_engine_rank(&a.engine))
|
|
2270
|
+
.then_with(|| b.magnet_links.len().cmp(&a.magnet_links.len()))
|
|
2271
|
+
.then_with(|| a.title.cmp(&b.title))
|
|
2272
|
+
});
|
|
2273
|
+
|
|
2274
|
+
deduped
|
|
2275
|
+
}
|
|
2276
|
+
|
|
2277
|
+
/// Ranks a provider engine name for result ordering; higher sorts first.
/// nyaa > searx > archive > everything else (checked in that order).
fn torrent_engine_rank(engine: &str) -> u8 {
    let lowered = engine.to_lowercase();
    match () {
        _ if lowered.contains("nyaa") => 4,
        _ if lowered.contains("searx") => 3,
        _ if lowered.contains("archive") => 2,
        _ => 1,
    }
}
|
|
2289
|
+
|
|
2290
|
+
/// Scrapes Nyaa's HTML search results. Each table row yields a detail-page
/// URL (from the first `/view/` anchor), any magnet links found in the
/// row, and a pipe-joined summary of the non-empty cells. Errors on HTTP
/// failure, selector-parse failure, or when no rows parse.
async fn nyaa_torrent_search(query: &str, max_results: usize) -> Result<Vec<TorrentSearchResult>> {
    let url = format!("https://nyaa.si/?f=0&c=0_0&q={}", url_encode(query));
    let client = reqwest::Client::builder()
        .user_agent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36")
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;
    let response = client.get(&url).send().await?;
    if !response.status().is_success() {
        return Err(anyhow!("Nyaa provider failed: HTTP {}", response.status()));
    }
    let html = response.text().await?;
    let document = Html::parse_document(&html);

    // Static selectors; a parse failure here is a programming bug, so it
    // is surfaced as an error rather than skipped.
    let row_selector = Selector::parse("table tbody tr")
        .map_err(|e| anyhow!("Invalid Nyaa row selector: {}", e))?;
    let title_selector = Selector::parse("td[colspan=\"2\"] a[href^=\"/view/\"]")
        .map_err(|e| anyhow!("Invalid Nyaa title selector: {}", e))?;
    let magnet_selector = Selector::parse("a[href^=\"magnet:?\"]")
        .map_err(|e| anyhow!("Invalid Nyaa magnet selector: {}", e))?;
    let cell_selector =
        Selector::parse("td").map_err(|e| anyhow!("Invalid Nyaa cell selector: {}", e))?;

    let mut output = Vec::new();
    for row in document.select(&row_selector) {
        if output.len() >= max_results.min(MAX_PROVIDER_RESULTS) {
            break;
        }

        // First /view/ anchor in the row provides title + detail URL.
        let mut title = String::new();
        let mut detail_url = String::new();
        for link in row.select(&title_selector) {
            if let Some(href) = link.value().attr("href") {
                title = link.text().collect::<String>().trim().to_string();
                detail_url = format!("https://nyaa.si{}", href);
                break;
            }
        }
        if detail_url.is_empty() {
            continue;
        }

        // Collect every valid magnet link in the row.
        let mut magnets = Vec::new();
        for a in row.select(&magnet_selector) {
            if let Some(href) = a.value().attr("href") {
                let cleaned = sanitize_magnet(href);
                if cleaned.starts_with("magnet:?") {
                    magnets.push(cleaned);
                }
            }
        }

        // Cell texts (size, date, seeders, …) become the summary line.
        let mut columns_text = row
            .select(&cell_selector)
            .map(|td| td.text().collect::<String>())
            .collect::<Vec<_>>();
        columns_text.retain(|v| !v.trim().is_empty());
        let summary = columns_text.join(" | ");
        let summary = summary.split_whitespace().collect::<Vec<_>>().join(" ");

        output.push(TorrentSearchResult {
            title: if title.is_empty() {
                "Nyaa result".to_string()
            } else {
                title
            },
            url: detail_url,
            content: if summary.is_empty() {
                "Result from Nyaa index".to_string()
            } else {
                truncate_text(&summary, 700)
            },
            engine: "nyaa-html".to_string(),
            magnet_links: magnets,
        });
    }

    if output.is_empty() {
        return Err(anyhow!("Nyaa provider returned no results"));
    }

    Ok(output)
}
|
|
2372
|
+
|
|
2373
|
+
/// Queries archive.org's advanced-search JSON API and maps each returned
/// document to a details-page result. No magnet links are produced —
/// archive items are reached through their detail URLs. Errors on HTTP
/// failure or an empty result set.
async fn archive_torrent_search(
    query: &str,
    max_results: usize,
) -> Result<Vec<TorrentSearchResult>> {
    let url = format!(
        "https://archive.org/advancedsearch.php?q={}&fl[]=identifier,title,description&rows={}&page=1&output=json",
        url_encode(query),
        max_results.min(MAX_PROVIDER_RESULTS).max(1)
    );
    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} archive", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;
    let response = client.get(&url).send().await?;
    if !response.status().is_success() {
        return Err(anyhow!(
            "Archive provider failed: HTTP {}",
            response.status()
        ));
    }

    let payload: ArchiveResponse = response.json().await?;
    let mut output = Vec::new();
    for doc in payload
        .response
        .docs
        .into_iter()
        .take(max_results.min(MAX_PROVIDER_RESULTS))
    {
        // The identifier is required to build the details URL.
        if doc.identifier.trim().is_empty() {
            continue;
        }

        let title = doc
            .title
            .unwrap_or_else(|| format!("archive.org item {}", doc.identifier));
        let description = truncate_text(&archive_description_to_string(doc.description), 700);
        let item_url = format!("https://archive.org/details/{}", doc.identifier);

        output.push(TorrentSearchResult {
            title,
            url: item_url,
            content: if description.is_empty() {
                "Result from archive.org".to_string()
            } else {
                description
            },
            engine: "archive-search".to_string(),
            magnet_links: Vec::new(),
        });
    }

    if output.is_empty() {
        return Err(anyhow!("Archive provider returned no results"));
    }

    Ok(output)
}
|
|
2431
|
+
|
|
2432
|
+
/// Flattens an archive.org `description` field — which may be a string, an
/// array of strings, or absent — into one space-joined string. Any other
/// JSON shape yields an empty string.
fn archive_description_to_string(value: Option<serde_json::Value>) -> String {
    match value {
        Some(serde_json::Value::String(text)) => text,
        Some(serde_json::Value::Array(parts)) => parts
            .iter()
            .filter_map(|part| part.as_str())
            .collect::<Vec<_>>()
            .join(" "),
        _ => String::new(),
    }
}
|
|
2447
|
+
|
|
2448
|
+
/// Returns `value` unchanged when it has at most `max_chars` characters;
/// otherwise the first `max_chars` characters followed by "...".
/// Counts Unicode scalar values, so it never cuts a character in half.
fn truncate_text(value: &str, max_chars: usize) -> String {
    if value.chars().count() <= max_chars {
        return value.to_string();
    }
    let mut truncated: String = value.chars().take(max_chars).collect();
    truncated.push_str("...");
    truncated
}
|
|
2462
|
+
|
|
2463
|
+
/// Ensures a query targets torrent content: empty input becomes
/// "torrent", queries already mentioning "torrent" (any case) pass
/// through, and anything else gets " torrent" appended.
fn normalize_torrent_query(query: &str) -> String {
    let trimmed = query.trim();
    if trimmed.is_empty() {
        return "torrent".to_string();
    }
    if trimmed.to_lowercase().contains("torrent") {
        return trimmed.to_string();
    }
    format!("{} torrent", trimmed)
}
|
|
2473
|
+
|
|
2474
|
+
/// Collects up to `max_links` unique magnet URIs for a result URL.
/// A magnet URI passed directly is returned as-is (sanitized). Otherwise
/// the page is fetched and scanned: anchor `href`s first, then a raw-text
/// scan of the HTML as a fallback. Non-2xx responses yield an empty list
/// rather than an error — magnet enrichment is best-effort.
async fn extract_magnet_links(url: &str, max_links: usize) -> Result<Vec<String>> {
    if url.trim_start().starts_with("magnet:?") {
        return Ok(vec![sanitize_magnet(url)]);
    }

    let client = reqwest::Client::builder()
        .user_agent(format!("MasiXDiscovery/{} torrent", MODULE_VERSION))
        .timeout(Duration::from_secs(DEFAULT_SEARCH_TIMEOUT_SECS))
        .build()?;
    let response = client.get(url).send().await?;
    if !response.status().is_success() {
        return Ok(Vec::new());
    }
    let html = response.text().await.unwrap_or_default();
    let document = Html::parse_document(&html);
    // This selector is a valid literal; `.ok()` just avoids an unwrap.
    let selector = Selector::parse("a[href^=\"magnet:?\"]").ok();

    let mut seen = HashSet::new();
    let mut output = Vec::new();

    if let Some(sel) = selector {
        for element in document.select(&sel) {
            if let Some(link) = element.value().attr("href") {
                let cleaned = sanitize_magnet(link);
                if cleaned.starts_with("magnet:?")
                    && seen.insert(cleaned.clone())
                    && output.len() < max_links
                {
                    output.push(cleaned);
                }
            }
            if output.len() >= max_links {
                break;
            }
        }
    }

    // Fallback: some pages embed magnets in scripts/text, not anchors.
    if output.len() < max_links {
        for candidate in extract_magnet_links_from_text(&html) {
            if seen.insert(candidate.clone()) {
                output.push(candidate);
            }
            if output.len() >= max_links {
                break;
            }
        }
    }

    Ok(output)
}
|
|
2524
|
+
|
|
2525
|
+
fn extract_magnet_links_from_text(text: &str) -> Vec<String> {
|
|
2526
|
+
let mut links = Vec::new();
|
|
2527
|
+
let mut idx = 0usize;
|
|
2528
|
+
while idx < text.len() {
|
|
2529
|
+
let Some(found) = text[idx..].find("magnet:?") else {
|
|
2530
|
+
break;
|
|
2531
|
+
};
|
|
2532
|
+
let start = idx + found;
|
|
2533
|
+
let remainder = &text[start..];
|
|
2534
|
+
let end_rel = remainder
|
|
2535
|
+
.find(|c: char| c.is_whitespace() || matches!(c, '"' | '\'' | '<' | '>'))
|
|
2536
|
+
.unwrap_or(remainder.len());
|
|
2537
|
+
let raw = &remainder[..end_rel];
|
|
2538
|
+
let candidate = sanitize_magnet(raw);
|
|
2539
|
+
if candidate.starts_with("magnet:?") {
|
|
2540
|
+
links.push(candidate);
|
|
2541
|
+
}
|
|
2542
|
+
idx = start + end_rel;
|
|
2543
|
+
}
|
|
2544
|
+
links
|
|
2545
|
+
}
|
|
2546
|
+
|
|
2547
|
+
/// Cleans a magnet link scraped from HTML: trims whitespace and wrapping
/// quotes, and decodes `&amp;` back to `&`.
///
/// As written, the replace mapped `"&"` to `"&"` — a no-op; the `&amp;`
/// entity name had been lost. Restored so HTML-escaped hrefs do not keep
/// literal `&amp;` separators, which would break the magnet's query
/// parameters.
fn sanitize_magnet(value: &str) -> String {
    value
        .trim()
        .trim_matches('"')
        .trim_matches('\'')
        .replace("&amp;", "&")
}
|
|
2554
|
+
|
|
2555
|
+
/// Percent-encodes `value` for use in a URL query string: RFC 3986
/// unreserved bytes pass through untouched, every other byte becomes
/// `%XX` with uppercase hex digits.
fn url_encode(value: &str) -> String {
    use std::fmt::Write as _;

    let mut encoded = String::with_capacity(value.len());
    for &byte in value.as_bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                encoded.push(byte as char);
            }
            other => {
                let _ = write!(&mut encoded, "%{:02X}", other);
            }
        }
    }
    encoded
}
|