pi-research 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -250
- package/lib/page-fetch-adapter.js +311 -64
- package/lib/research-policy.js +36 -15
- package/lib/research-profiles.json +4 -0
- package/lib/research.js +15 -6
- package/lib/router-annotation.js +192 -0
- package/lib/router-structured-features.js +134 -0
- package/lib/tiny-router.js +338 -0
- package/lib/web-research.js +171 -10
- package/package.json +2 -2
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { spawn } from "node:child_process";
|
|
1
|
+
import { spawn as nodeSpawn, spawnSync } from "node:child_process";
|
|
2
|
+
import { existsSync } from "node:fs";
|
|
2
3
|
import { fileURLToPath } from "node:url";
|
|
3
4
|
import path from "node:path";
|
|
4
5
|
|
|
@@ -28,6 +29,13 @@ const DYNAMIC_PATTERNS = [
|
|
|
28
29
|
/id=["']root["']/i,
|
|
29
30
|
];
|
|
30
31
|
|
|
32
|
+
let spawnProcess = nodeSpawn;
|
|
33
|
+
let daemonState = null;
|
|
34
|
+
let daemonSequence = 0;
|
|
35
|
+
let exitHookInstalled = false;
|
|
36
|
+
let runtimeStatus = null;
|
|
37
|
+
const DAEMON_IDLE_TIMEOUT_MS = 3000;
|
|
38
|
+
|
|
31
39
|
function stripHtml(value) {
|
|
32
40
|
return String(value || "")
|
|
33
41
|
.replace(/<script[\s\S]*?<\/script>/gi, " ")
|
|
@@ -71,35 +79,62 @@ export function chooseScraplingMode(input) {
|
|
|
71
79
|
return assessPageAttempt(input).mode;
|
|
72
80
|
}
|
|
73
81
|
|
|
74
|
-
function
|
|
82
|
+
function pythonDaemonScript() {
|
|
75
83
|
return String.raw`
|
|
76
84
|
import asyncio
|
|
85
|
+
import atexit
|
|
77
86
|
import json
|
|
78
87
|
import os
|
|
79
88
|
import sys
|
|
80
89
|
|
|
81
90
|
root = sys.argv[1]
|
|
82
|
-
mode = sys.argv[2]
|
|
83
|
-
url = sys.argv[3]
|
|
84
|
-
payload = json.loads(sys.argv[4])
|
|
85
|
-
|
|
86
91
|
sys.path.insert(0, root)
|
|
87
92
|
|
|
88
|
-
|
|
89
|
-
from scrapling.fetchers import AsyncFetcher, DynamicFetcher, StealthyFetcher
|
|
93
|
+
from scrapling.fetchers import AsyncFetcher, AsyncDynamicSession, AsyncStealthySession, ProxyRotator
|
|
90
94
|
|
|
91
|
-
|
|
92
|
-
kwargs = {}
|
|
93
|
-
if timeout:
|
|
94
|
-
kwargs["timeout"] = timeout
|
|
95
|
+
sessions = {}
|
|
95
96
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
else:
|
|
101
|
-
response = StealthyFetcher.fetch(url, **kwargs)
|
|
97
|
+
def session_key(mode, proxy_rotation):
|
|
98
|
+
if not proxy_rotation:
|
|
99
|
+
return mode
|
|
100
|
+
return f"{mode}:{json.dumps(proxy_rotation, sort_keys=True)}"
|
|
102
101
|
|
|
102
|
+
async def build_session(mode, payload):
|
|
103
|
+
proxy_rotation = payload.get("proxyRotation") or []
|
|
104
|
+
key = session_key(mode, proxy_rotation)
|
|
105
|
+
session = sessions.get(key)
|
|
106
|
+
if session is not None:
|
|
107
|
+
return session
|
|
108
|
+
|
|
109
|
+
kwargs = {
|
|
110
|
+
"headless": True,
|
|
111
|
+
"disable_resources": True,
|
|
112
|
+
"network_idle": True,
|
|
113
|
+
"timeout": payload.get("timeout") or 30000,
|
|
114
|
+
}
|
|
115
|
+
if proxy_rotation:
|
|
116
|
+
kwargs["proxy_rotator"] = ProxyRotator(proxy_rotation)
|
|
117
|
+
|
|
118
|
+
session = AsyncDynamicSession(**kwargs) if mode == "dynamic" else AsyncStealthySession(**kwargs)
|
|
119
|
+
await session.start()
|
|
120
|
+
sessions[key] = session
|
|
121
|
+
return session
|
|
122
|
+
|
|
123
|
+
async def cleanup():
|
|
124
|
+
for session in sessions.values():
|
|
125
|
+
try:
|
|
126
|
+
await session.close()
|
|
127
|
+
except Exception:
|
|
128
|
+
pass
|
|
129
|
+
sessions.clear()
|
|
130
|
+
|
|
131
|
+
def cleanup_sync():
|
|
132
|
+
asyncio.get_event_loop().run_until_complete(cleanup())
|
|
133
|
+
|
|
134
|
+
atexit.register(cleanup_sync)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def normalize_response(response, fallback_url):
|
|
103
138
|
headers = {}
|
|
104
139
|
raw_headers = getattr(response, "headers", None)
|
|
105
140
|
if hasattr(raw_headers, "items"):
|
|
@@ -120,77 +155,289 @@ async def main():
|
|
|
120
155
|
elif not isinstance(body, str):
|
|
121
156
|
body = str(body or "")
|
|
122
157
|
|
|
123
|
-
|
|
158
|
+
return {
|
|
124
159
|
"ok": True,
|
|
125
|
-
"url": getattr(response, "url",
|
|
160
|
+
"url": getattr(response, "url", fallback_url),
|
|
126
161
|
"status": getattr(response, "status", 200),
|
|
127
162
|
"contentType": headers.get("content-type", ""),
|
|
128
163
|
"body": body,
|
|
129
164
|
"headers": headers,
|
|
130
165
|
}
|
|
131
|
-
print(json.dumps(out))
|
|
132
166
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
167
|
+
|
|
168
|
+
async def handle_job(job):
|
|
169
|
+
mode = job.get("mode")
|
|
170
|
+
url = job.get("url")
|
|
171
|
+
payload = job.get("payload") or {}
|
|
172
|
+
timeout = payload.get("timeout") or 30000
|
|
173
|
+
proxy = payload.get("proxy")
|
|
174
|
+
|
|
175
|
+
kwargs = {"timeout": timeout}
|
|
176
|
+
if proxy:
|
|
177
|
+
kwargs["proxy"] = proxy
|
|
178
|
+
|
|
179
|
+
if mode == "async":
|
|
180
|
+
response = await AsyncFetcher.get(url, **kwargs)
|
|
181
|
+
else:
|
|
182
|
+
session = await build_session(mode, payload)
|
|
183
|
+
response = await session.fetch(url, **kwargs)
|
|
184
|
+
|
|
185
|
+
out = normalize_response(response, url)
|
|
186
|
+
out["id"] = job.get("id")
|
|
187
|
+
return out
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
async def main():
|
|
191
|
+
print(json.dumps({"type": "ready"}), flush=True)
|
|
192
|
+
|
|
193
|
+
for raw_line in sys.stdin:
|
|
194
|
+
line = raw_line.strip()
|
|
195
|
+
if not line:
|
|
196
|
+
continue
|
|
197
|
+
try:
|
|
198
|
+
job = json.loads(line)
|
|
199
|
+
except Exception as exc:
|
|
200
|
+
print(json.dumps({"type": "error", "ok": False, "error": str(exc)}), flush=True)
|
|
201
|
+
continue
|
|
202
|
+
|
|
203
|
+
if job.get("type") == "shutdown":
|
|
204
|
+
break
|
|
205
|
+
|
|
206
|
+
try:
|
|
207
|
+
out = await handle_job(job)
|
|
208
|
+
except Exception as exc:
|
|
209
|
+
out = {"id": job.get("id"), "ok": False, "error": str(exc), "type": exc.__class__.__name__}
|
|
210
|
+
print(json.dumps(out), flush=True)
|
|
211
|
+
|
|
212
|
+
await cleanup()
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
asyncio.run(main())
|
|
138
216
|
`;
|
|
139
217
|
}
|
|
140
218
|
|
|
219
|
+
/**
 * Consume a chunk of daemon stdout and dispatch every complete line.
 *
 * The daemon speaks newline-delimited JSON. A `{"type": "ready"}` message
 * marks the daemon as ready and resolves the startup promise; any other object
 * is matched by `id` against `state.pending` and settles that job
 * (`null` when the daemon reported `ok: false`).
 *
 * @param {object} state - Daemon state (`stdoutBuffer`, `pending`, `ready`, `resolveReady`).
 * @param {Buffer|string} chunk - Raw data emitted by the child's stdout.
 */
function handleDaemonStdout(state, chunk) {
  state.stdoutBuffer += String(chunk || "");

  let newlineIndex = state.stdoutBuffer.indexOf("\n");
  while (newlineIndex !== -1) {
    const line = state.stdoutBuffer.slice(0, newlineIndex).trim();
    state.stdoutBuffer = state.stdoutBuffer.slice(newlineIndex + 1);
    // Look up the next terminator now so every `continue` below stays correct.
    newlineIndex = state.stdoutBuffer.indexOf("\n");
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Not JSON (e.g. stray library output on stdout): ignore the line.
      continue;
    }

    // Valid JSON that is not an object (`null`, numbers, strings) is not a
    // protocol message; reading `.type` on `null` would throw inside the
    // stream listener.
    if (parsed === null || typeof parsed !== "object") continue;

    if (parsed.type === "ready") {
      state.ready = true;
      state.resolveReady?.(state);
      continue;
    }

    const pending = state.pending.get(parsed.id);
    if (!pending) continue;
    state.pending.delete(parsed.id);
    pending.cleanup?.();
    pending.resolve(parsed.ok ? parsed : null);
    scheduleDaemonIdleStop(state);
  }
}
|
|
248
|
+
|
|
249
|
+
/**
 * Abandon everything still waiting on a daemon: cancel its idle timer, run
 * each pending job's cleanup hook, resolve each job with `null`, and empty
 * the pending map.
 *
 * @param {object} state - Daemon state holding `idleTimer` and the `pending` map.
 */
function failDaemonState(state) {
  const { idleTimer, pending: waitingJobs } = state;
  if (idleTimer) clearTimeout(idleTimer);

  waitingJobs.forEach((job) => {
    job.cleanup?.();
    job.resolve(null);
  });
  waitingJobs.clear();
}
|
|
257
|
+
|
|
258
|
+
/**
 * (Re)arm the idle shutdown timer for a daemon.
 *
 * Any previously armed timer is cancelled first. While jobs are still pending
 * no new timer is armed. Otherwise a timer is started that stops the daemon
 * after DAEMON_IDLE_TIMEOUT_MS, provided this state is still the active daemon
 * and no job arrived in the meantime. The timer is unref'd where supported.
 *
 * @param {object} state - Daemon state holding `idleTimer` and the `pending` map.
 */
function scheduleDaemonIdleStop(state) {
  if (state.idleTimer) clearTimeout(state.idleTimer);

  const hasInFlightJobs = state.pending.size > 0;
  if (hasInFlightJobs) return;

  const stopIfStillIdle = () => {
    const isActiveDaemon = daemonState === state;
    if (isActiveDaemon && state.pending.size === 0) void stopScraplingDaemon();
  };

  const timer = setTimeout(stopIfStillIdle, DAEMON_IDLE_TIMEOUT_MS);
  state.idleTimer = timer;
  timer.unref?.();
}
|
|
266
|
+
|
|
267
|
+
/**
 * Pick the Python interpreter used to run the scrapling daemon.
 *
 * Resolution order:
 *   1. the `PYTHON` environment variable, when set;
 *   2. a `.venv-scrapling` virtual environment in the current working
 *      directory (POSIX layout `bin/python`, then Windows layout
 *      `Scripts/python.exe`);
 *   3. `python3` from the PATH.
 *
 * @returns {string} Path or command name of the interpreter.
 */
function resolvePythonExecutable() {
  if (process.env.PYTHON) return process.env.PYTHON;

  const venvRoot = path.join(process.cwd(), ".venv-scrapling");
  const candidates = [
    path.join(venvRoot, "bin", "python"),
    // Virtual environments created on Windows place the interpreter here.
    path.join(venvRoot, "Scripts", "python.exe"),
  ];
  return candidates.find((candidate) => existsSync(candidate)) ?? "python3";
}
|
|
272
|
+
|
|
273
|
+
/**
 * Build the environment for Python child processes: a copy of the current
 * environment whose PYTHONPATH starts with SCRAPLING_ROOT, followed by any
 * PYTHONPATH that was already set.
 *
 * @returns {object} Environment variables for the child process.
 */
function daemonEnv() {
  const searchPath = [SCRAPLING_ROOT, process.env.PYTHONPATH]
    .filter((entry) => Boolean(entry))
    .join(path.delimiter);

  return Object.assign({}, process.env, { PYTHONPATH: searchPath });
}
|
|
279
|
+
|
|
280
|
+
/**
 * Check once whether the Python runtime can import everything the scrapling
 * daemon needs (lxml, patchright, playwright, scrapling) and cache the verdict
 * in `runtimeStatus`.
 *
 * @returns {{ok: boolean, python: string, error?: string}} Cached or freshly
 *   computed status; `error` is only present when `ok` is false.
 */
function validateScraplingRuntime() {
  if (runtimeStatus) return runtimeStatus;

  const python = resolvePythonExecutable();
  const probe = spawnSync(python, ["-c", "import sys; sys.path.insert(0, sys.argv[1]); import lxml, patchright, playwright, scrapling; print('OK')", SCRAPLING_ROOT], {
    env: daemonEnv(),
    encoding: "utf8",
    timeout: 15000,
  });

  if (probe.status === 0) {
    runtimeStatus = { ok: true, python };
    return runtimeStatus;
  }

  // `probe.error` is set when the interpreter could not be spawned (ENOENT) or
  // the probe timed out; in those cases stderr/stdout carry no useful cause,
  // so surface the spawn error before falling back to process output.
  const detail = probe.error?.message
    || probe.stderr
    || probe.stdout
    || `scrapling runtime check failed with status ${probe.status ?? probe.signal ?? "unknown"}`;

  runtimeStatus = { ok: false, python, error: String(detail).trim() };
  return runtimeStatus;
}
|
|
298
|
+
|
|
299
|
+
/**
 * Public accessor for the scrapling runtime check.
 *
 * @returns {object} Whatever `validateScraplingRuntime()` reports.
 */
export function getScraplingRuntimeStatus() {
  const status = validateScraplingRuntime();
  return status;
}
|
|
302
|
+
|
|
303
|
+
/**
 * Register, at most once, a process `exit` listener that sends SIGKILL to the
 * child of whichever daemon is active at exit time.
 */
function ensureExitHook() {
  if (!exitHookInstalled) {
    exitHookInstalled = true;

    const killDaemonOnExit = () => {
      const child = daemonState?.child;
      child?.kill?.("SIGKILL");
    };
    process.once("exit", killDaemonOnExit);
  }
}
|
|
310
|
+
|
|
311
|
+
/**
 * Return the running scrapling daemon, starting it when necessary.
 *
 * A ready daemon is returned immediately; a daemon that is still starting
 * yields the same startup promise to every caller. Otherwise the runtime is
 * validated, the Python daemon is spawned, and the returned promise settles
 * when the daemon announces readiness (or rejects if it fails first).
 *
 * @returns {Promise<object>} Resolves with the daemon state object.
 * @throws {Error} When the scrapling runtime is unavailable or the daemon
 *   exits or errors before becoming ready.
 */
async function ensureScraplingDaemon() {
  if (daemonState?.ready) return daemonState;
  if (daemonState?.readyPromise) return daemonState.readyPromise;

  ensureExitHook();
  const runtime = validateScraplingRuntime();
  if (!runtime.ok) throw new Error(runtime.error || "scrapling runtime unavailable");
  const child = spawnProcess(runtime.python, ["-c", pythonDaemonScript(), SCRAPLING_ROOT], {
    env: daemonEnv(),
    stdio: ["pipe", "pipe", "pipe"],
  });

  const state = {
    child,
    pending: new Map(),
    stdoutBuffer: "",
    stderrBuffer: "",
    ready: false,
    readyPromise: null,
    resolveReady: null,
    rejectReady: null,
    idleTimer: null,
  };

  state.readyPromise = new Promise((resolve, reject) => {
    state.resolveReady = resolve;
    state.rejectReady = reject;
  });

  // Shared teardown: reject a startup that is still in progress, fail every
  // in-flight job, and forget this daemon if it is still the active one.
  const retire = (makeStartupError) => {
    if (!state.ready) state.rejectReady?.(makeStartupError());
    failDaemonState(state);
    if (daemonState === state) daemonState = null;
  };

  child.stdout.on("data", (chunk) => handleDaemonStdout(state, chunk));
  child.stderr.on("data", (chunk) => {
    state.stderrBuffer += String(chunk || "");
    // Keep only the tail so a chatty daemon cannot grow the buffer unbounded.
    if (state.stderrBuffer.length > 20_000) state.stderrBuffer = state.stderrBuffer.slice(-20_000);
  });
  // Writes to a dead daemon fail asynchronously (EPIPE) as an 'error' event on
  // stdin; without a listener that event would crash the host process. A
  // broken pipe makes the daemon unusable, so retire it and make sure it dies.
  child.stdin?.on?.("error", (error) => {
    retire(() => error);
    try {
      child.kill?.("SIGKILL");
    } catch {
      // The process is already gone; nothing left to do.
    }
  });
  child.on("error", (error) => retire(() => error));
  child.on("close", (code) => {
    retire(() => new Error(`scrapling daemon exited before ready (${code ?? "unknown"})`));
  });

  daemonState = state;
  return state.readyPromise;
}
|
|
359
|
+
|
|
360
|
+
/**
 * Translate fetch configuration into the payload sent to the daemon.
 *
 * The timeout is `pageTimeoutMs` (30 s when unset or falsy); in "stealthy"
 * mode a truthy `stealthTimeoutMs` takes precedence. `proxy` falls back to
 * `null`, and `proxyRotation` is only forwarded when it is a non-empty array.
 *
 * @param {string} mode - Fetch mode requested from the daemon.
 * @param {object} [config] - Research configuration.
 * @returns {{timeout: number, proxy: *, proxyRotation: Array|null}}
 */
function requestPayload(mode, config = {}) {
  const defaultTimeoutMs = 30_000;

  let timeout;
  if (mode === "stealthy" && config.stealthTimeoutMs) {
    timeout = config.stealthTimeoutMs;
  } else {
    timeout = config.pageTimeoutMs || defaultTimeoutMs;
  }

  const rotation = config.proxyRotation;
  const hasRotation = Array.isArray(rotation) && rotation.length > 0;

  return {
    timeout,
    proxy: config.proxy ? config.proxy : null,
    proxyRotation: hasRotation ? rotation : null,
  };
}
|
|
369
|
+
|
|
141
370
|
/**
 * Fetch a page through the scrapling daemon.
 *
 * The job is written to the daemon's stdin as one JSON line and resolved when
 * the daemon answers with the matching id. The function never rejects: a
 * missing mode, an unavailable daemon, a failed write, a daemon-side failure,
 * or an aborted signal all yield `null`.
 *
 * @param {string} url - Page to fetch.
 * @param {string} mode - Scrapling mode; falsy means "do not fetch".
 * @param {AbortSignal} [signal] - Optional cancellation signal.
 * @param {object} [config] - Research configuration (timeouts, proxies).
 * @returns {Promise<object|null>} The daemon's response object, or `null`.
 */
export async function fetchWithScrapling(url, mode, signal, config = {}) {
  if (!mode) return null;
  // A request cancelled before it starts must not spawn or wake the daemon.
  if (signal?.aborted) return null;

  let state;
  try {
    state = await ensureScraplingDaemon();
  } catch {
    return null;
  }
  const id = `job-${++daemonSequence}`;
  const payload = requestPayload(mode, config);

  return await new Promise((resolve) => {
    let settled = false;
    const finish = (value) => {
      if (settled) return;
      settled = true;
      resolve(signal?.aborted ? null : value);
    };

    const cleanup = () => {
      signal?.removeEventListener("abort", abort);
    };

    // Drop the job and let the daemon go idle again. Used for both abort and
    // write failure so the idle timer (cleared below) is always re-armed.
    const abandon = () => {
      state.pending.delete(id);
      cleanup();
      scheduleDaemonIdleStop(state);
      finish(null);
    };
    const abort = () => abandon();

    // A job is about to be in flight: the daemon must not idle out under it.
    if (state.idleTimer) clearTimeout(state.idleTimer);
    state.pending.set(id, { resolve: finish, cleanup });

    if (signal) {
      // The signal may have fired while the daemon was starting.
      if (signal.aborted) return abort();
      signal.addEventListener("abort", abort, { once: true });
    }

    try {
      state.child.stdin.write(`${JSON.stringify({ id, url, mode, payload })}\n`);
    } catch {
      abandon();
    }
  });
}
|
|
191
417
|
|
|
418
|
+
/**
 * Stop the active daemon, if any: detach it from module state, resolve all of
 * its pending jobs with `null`, and SIGKILL the child process. Errors raised
 * while killing are ignored.
 *
 * @returns {Promise<void>}
 */
export async function stopScraplingDaemon() {
  const state = daemonState;
  if (!state) return;

  daemonState = null;
  failDaemonState(state);

  try {
    state.child.kill("SIGKILL");
  } catch {
    // ignore
  }
}
|
|
429
|
+
|
|
430
|
+
/**
 * Test hook: replace the function used to spawn the daemon process.
 *
 * @param {Function} [factory] - Replacement spawn function; a falsy value
 *   restores the real `node:child_process` spawn.
 */
export function setScraplingSpawnForTests(factory) {
  spawnProcess = factory ? factory : nodeSpawn;
}
|
|
433
|
+
|
|
434
|
+
/**
 * Test hook: overwrite the cached runtime check result.
 *
 * @param {object|null} status - Value stored as the cached status; a falsy
 *   value makes the next runtime validation probe again.
 */
export function setScraplingRuntimeStatusForTests(status) {
  runtimeStatus = status;
}
|
|
437
|
+
|
|
192
438
|
export const pageFetchAdapter = {
|
|
193
439
|
assessPageAttempt,
|
|
194
440
|
chooseScraplingMode,
|
|
195
441
|
fetchWithScrapling,
|
|
442
|
+
stopScraplingDaemon,
|
|
196
443
|
};
|
package/lib/research-policy.js
CHANGED
|
@@ -35,7 +35,7 @@ export const DOMAIN_AUTHORITY_RULES = {
|
|
|
35
35
|
},
|
|
36
36
|
};
|
|
37
37
|
|
|
38
|
-
const PLACEHOLDER_PATTERNS = [
|
|
38
|
+
export const PLACEHOLDER_PATTERNS = [
|
|
39
39
|
/cloudflare/i,
|
|
40
40
|
/access denied/i,
|
|
41
41
|
/temporarily unavailable/i,
|
|
@@ -119,6 +119,20 @@ export function pageQualitySignals({ title = "", text = "", status = 200, conten
|
|
|
119
119
|
};
|
|
120
120
|
}
|
|
121
121
|
|
|
122
|
+
/**
 * Decide whether a fetched page carries enough real content to be used.
 *
 * A page is usable when it has text, its quality signals (taken from
 * `page.quality` or computed on demand) are not blocked, placeholder or weak,
 * and its plain-text length reaches the minimum (`config.minPageText`, or the
 * weak-page policy limit when unset).
 *
 * @param {object} page - Fetched page (`text`, optional `quality`, `title`, `url`, ...).
 * @param {object} [config] - Optional `query` and `minPageText`.
 * @returns {boolean} True when the page content is usable.
 */
export function isUsableContent(page, config = {}) {
  const hasText = Boolean(page && page.text);
  if (!hasText) return false;

  let signals = page.quality;
  if (!signals) {
    signals = pageQualitySignals({
      title: page.title,
      text: page.text,
      url: page.url,
      query: config.query || "",
      status: page.fetchStatus ?? 200,
      contentType: page.contentType || "text/html",
    });
  }

  const requiredLength = config.minPageText ?? WEAK_PAGE_POLICY.weakTextLimit;
  if (signals.blocked || signals.placeholder || signals.weak) return false;
  return signals.plainLength >= requiredLength;
}
|
|
135
|
+
|
|
122
136
|
export function sourceAuthorityProfile({ url = "", title = "", text = "", query = "", domain = "" } = {}) {
|
|
123
137
|
const hostname = normalizeHostname(url);
|
|
124
138
|
const resolvedDomain = resolvePolicyDomain(query, domain);
|
|
@@ -147,42 +161,49 @@ export function sourceAuthorityProfile({ url = "", title = "", text = "", query
|
|
|
147
161
|
return { sourceType: null, authoritative: false, domainBoost: 0, reasons: [] };
|
|
148
162
|
}
|
|
149
163
|
|
|
150
|
-
|
|
164
|
+
/**
 * Build the `-site:` suffix that keeps already-seen hosts out of a follow-up
 * search query.
 *
 * @param {string[]} [seenUrls] - URLs that were already retrieved.
 * @returns {string} A leading space followed by one `-site:<host>` operator
 *   per distinct hostname, or an empty string when there is nothing to exclude.
 */
function followUpSiteExclusions(seenUrls = []) {
  const uniqueHosts = new Set();
  for (const hostname of seenUrls.map((url) => normalizeHostname(url))) {
    if (hostname) uniqueHosts.add(hostname);
  }
  if (uniqueHosts.size === 0) return "";

  const operators = Array.from(uniqueHosts, (hostname) => `-site:${hostname}`);
  return ` ${operators.join(" ")}`;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Build follow-up search queries aimed at authoritative sources for the
 * policy domain resolved from the query (or the explicit domain).
 *
 * @param {string} [query] - Original research query.
 * @param {string} [explicitDomain] - Policy domain override.
 * @param {object} [options] - Optional `seenUrls`; their hosts are excluded
 *   from the follow-up queries with `-site:` operators.
 * @returns {string[]} Two candidate queries, most preferred first.
 */
export function buildAuthorityFollowUpQueries(query = "", explicitDomain = "", options = {}) {
  const resolvedDomain = resolvePolicyDomain(query, explicitDomain);
  const base = baseQuery(query);
  // `options?.` tolerates an explicit null, which the default value does not cover.
  const exclusions = followUpSiteExclusions(options?.seenUrls);

  switch (resolvedDomain) {
    case "security":
      return [`${base} cve advisory vendor${exclusions}`, `${base} nvd cisa mitre${exclusions}`];
    case "vendor-status":
      return [`${base} status page incident${exclusions}`, `${base} official outage status${exclusions}`];
    case "package-registry":
      return [`${base} npm pypi crates readme${exclusions}`, `${base} official package docs${exclusions}`];
    case "github": {
      // The second query is pinned to github.com; also excluding that host
      // would make the query self-contradictory and return nothing.
      const pinnedExclusions = exclusions
        .split(" ")
        .filter((token) => token !== "-site:github.com" && token !== "-site:www.github.com")
        .join(" ");
      return [`${base} github readme releases${exclusions}`, `${base} site:github.com readme docs${pinnedExclusions}`];
    }
    case "papers":
      return [`${base} arxiv doi publisher${exclusions}`, `${base} semanticscholar arxiv doi${exclusions}`];
    default:
      return [`${base} official docs${exclusions}`, `${base} documentation reference${exclusions}`];
  }
}
|
|
169
189
|
|
|
170
|
-
export function buildConflictFollowUpQueries(query = "", explicitDomain = "") {
|
|
190
|
+
/**
 * Build follow-up search queries meant to resolve conflicting findings, tuned
 * to the policy domain resolved from the query (or the explicit domain).
 *
 * @param {string} [query] - Original research query.
 * @param {string} [explicitDomain] - Policy domain override.
 * @param {object} [options] - Optional `seenUrls` whose hosts are excluded.
 * @returns {string[]} Two candidate queries, most preferred first.
 */
export function buildConflictFollowUpQueries(query = "", explicitDomain = "", options = {}) {
  const resolvedDomain = resolvePolicyDomain(query, explicitDomain);
  const base = baseQuery(query);
  const exclusions = followUpSiteExclusions(options.seenUrls);

  const suffixesByDomain = new Map([
    ["security", ["vendor advisory official", "cve mitigation official"]],
    ["vendor-status", ["incident status official", "status page postmortem"]],
    ["package-registry", ["release notes changelog", "maintainer docs"]],
    ["github", ["github releases readme", "canonical repo docs"]],
    ["papers", ["arxiv doi compare", "publisher abstract official"]],
  ]);
  const fallbackSuffixes = ["official docs support status", "official comparison reference"];

  const suffixes = suffixesByDomain.get(resolvedDomain) ?? fallbackSuffixes;
  return suffixes.map((suffix) => `${base} ${suffix}${exclusions}`);
}
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
"resultsPerQuery": 4,
|
|
7
7
|
"maxPages": 3,
|
|
8
8
|
"pageTimeoutMs": 6000,
|
|
9
|
+
"stealthTimeoutMs": 30000,
|
|
9
10
|
"pageTextLimit": 4000,
|
|
10
11
|
"minPageText": 300,
|
|
11
12
|
"useJinaFallback": true,
|
|
@@ -23,6 +24,7 @@
|
|
|
23
24
|
"resultsPerQuery": 5,
|
|
24
25
|
"maxPages": 8,
|
|
25
26
|
"pageTimeoutMs": 10000,
|
|
27
|
+
"stealthTimeoutMs": 40000,
|
|
26
28
|
"pageTextLimit": 8000,
|
|
27
29
|
"minPageText": 300,
|
|
28
30
|
"useJinaFallback": true,
|
|
@@ -40,6 +42,7 @@
|
|
|
40
42
|
"resultsPerQuery": 5,
|
|
41
43
|
"maxPages": 6,
|
|
42
44
|
"pageTimeoutMs": 10000,
|
|
45
|
+
"stealthTimeoutMs": 40000,
|
|
43
46
|
"pageTextLimit": 8000,
|
|
44
47
|
"minPageText": 300,
|
|
45
48
|
"useJinaFallback": true,
|
|
@@ -57,6 +60,7 @@
|
|
|
57
60
|
"resultsPerQuery": 5,
|
|
58
61
|
"maxPages": 6,
|
|
59
62
|
"pageTimeoutMs": 10000,
|
|
63
|
+
"stealthTimeoutMs": 40000,
|
|
60
64
|
"pageTextLimit": 8000,
|
|
61
65
|
"minPageText": 300,
|
|
62
66
|
"useJinaFallback": true,
|
package/lib/research.js
CHANGED
|
@@ -459,7 +459,7 @@ export function detectConflictSignals(pages) {
|
|
|
459
459
|
return { detected: false, reason: null, conflictSummary: "", conflictingSourcePairs: [] };
|
|
460
460
|
}
|
|
461
461
|
|
|
462
|
-
export function detectResearchGaps(query, pages) {
|
|
462
|
+
export function detectResearchGaps(query, pages, options = {}) {
|
|
463
463
|
const hasAuthoritativeSource = pages.some((page) => {
|
|
464
464
|
const scored = scoreSourceEntry(page, query || "");
|
|
465
465
|
return Boolean(page.authoritative || scored.authoritative);
|
|
@@ -468,7 +468,7 @@ export function detectResearchGaps(query, pages) {
|
|
|
468
468
|
return {
|
|
469
469
|
detected: true,
|
|
470
470
|
reason: "Retrieved pages lack an authoritative docs or README source.",
|
|
471
|
-
followupQuery: buildAuthorityFollowUpQueries(query)[0] || `${queryBase(query)} official docs`,
|
|
471
|
+
followupQuery: buildAuthorityFollowUpQueries(query, "", options)[0] || `${queryBase(query)} official docs`,
|
|
472
472
|
missingAspects: ["authoritative sources"],
|
|
473
473
|
};
|
|
474
474
|
}
|
|
@@ -476,12 +476,21 @@ export function detectResearchGaps(query, pages) {
|
|
|
476
476
|
return { detected: false, reason: null, followupQuery: null, missingAspects: [] };
|
|
477
477
|
}
|
|
478
478
|
|
|
479
|
-
export function buildFollowUpQuery(query, pages) {
|
|
479
|
+
/**
 * Choose the single follow-up query for a research round.
 *
 * Conflicting pages take priority (conflict-resolution query), then detected
 * research gaps (the gap's own follow-up query); otherwise a generic
 * "clarification" authority query is produced.
 *
 * @param {string} query - Original research query.
 * @param {object[]} pages - Pages retrieved so far.
 * @param {object} [options] - Forwarded to the query builders (e.g. `seenUrls`).
 * @returns {string} The follow-up query.
 */
export function buildFollowUpQuery(query, pages, options = {}) {
  const { detected: hasConflict } = detectConflictSignals(pages);
  if (hasConflict) {
    const conflictQuery = buildConflictFollowUpQueries(query, "", options)[0];
    return conflictQuery || `${queryBase(query)} official docs support status`;
  }

  const gaps = detectResearchGaps(query, pages, options);
  if (gaps.detected) {
    return gaps.followupQuery;
  }

  const clarificationQuery = buildAuthorityFollowUpQueries(`${queryBase(query)} clarification`, "", options)[0];
  return clarificationQuery || `${queryBase(query)} clarification official docs`;
}
|
|
486
|
+
|
|
487
|
+
/**
 * Build a follow-up query for an explicit next action.
 *
 * @param {string} query - Original research query.
 * @param {string} action - One of `need_conflict_resolution`, `need_authority`,
 *   `need_recency`, `need_version_context`, `need_primary_source`; any other
 *   value yields a generic "clarification" authority query.
 * @param {object} [options] - Forwarded to the query builders (e.g. `seenUrls`).
 * @returns {string} The follow-up query.
 */
export function buildActionBasedFollowUpQuery(query, action, options = {}) {
  switch (action) {
    case "need_conflict_resolution": {
      const conflictQuery = buildConflictFollowUpQueries(query, "", options)[0];
      return conflictQuery || `${queryBase(query)} official docs support status`;
    }
    case "need_authority": {
      const authorityQuery = buildAuthorityFollowUpQueries(query, "", options)[0];
      return authorityQuery || `${queryBase(query)} official docs`;
    }
    case "need_recency":
      return `${queryBase(query)} latest`;
    case "need_version_context":
      return `${queryBase(query)} version diff`;
    case "need_primary_source":
      return `${queryBase(query)} source announcement`;
    default: {
      const clarificationQuery = buildAuthorityFollowUpQueries(`${queryBase(query)} clarification`, "", options)[0];
      return clarificationQuery || `${queryBase(query)} clarification official docs`;
    }
  }
}
|
|
486
495
|
|
|
487
496
|
function queryTermsForFactCheck(text) {
|