@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// extractors/consensus.mjs
|
|
4
|
+
// Navigate consensus.app, submit query, extract research-grounded answer + paper sources.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// node extractors/consensus.mjs "<query>" [--tab <prefix>]
|
|
8
|
+
//
|
|
9
|
+
// Output (stdout): JSON { query, url, answer, sources }
|
|
10
|
+
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
+
//
|
|
12
|
+
// Language-agnostic: all DOM selectors target structure, data attributes, and
|
|
13
|
+
// URL fragments, never English text. The .CSV button text comes from a
|
|
14
|
+
// developer-set constant in the source (".CSV" with a leading period, not
|
|
15
|
+
// localized), the answer container uses the Tailwind "prose" class, and paper
|
|
16
|
+
// links are matched by their /papers/ URL fragment. Should work in any locale.
|
|
17
|
+
|
|
18
|
+
import {
|
|
19
|
+
buildEnvelope,
|
|
20
|
+
cdp,
|
|
21
|
+
cdpWithInput,
|
|
22
|
+
formatAnswer,
|
|
23
|
+
getOrOpenTab,
|
|
24
|
+
handleError,
|
|
25
|
+
jitter,
|
|
26
|
+
logStage,
|
|
27
|
+
outputJson,
|
|
28
|
+
parseArgs,
|
|
29
|
+
prepareArgs,
|
|
30
|
+
TIMING,
|
|
31
|
+
validateQuery,
|
|
32
|
+
waitForSelector,
|
|
33
|
+
} from "./common.mjs";
|
|
34
|
+
import { ensureChrome } from "../src/search/chrome.mjs";
|
|
35
|
+
import { dismissConsent } from "./consent.mjs";
|
|
36
|
+
|
|
37
|
+
// All structural selectors — no English text matching except for stable
|
|
38
|
+
// developer-set constants (CSV button text, Load more text) that are
|
|
39
|
+
// set in the source code and don't change with locale.
|
|
40
|
+
// CSS selectors used to drive the Consensus UI. Frozen so that no helper can
// accidentally mutate the shared table at runtime.
//
// NOTE(review): `submitButton` and `exportButton` key on aria-label values
// that are English strings ("Submit search", "Export"). Presumably these are
// not localized by Consensus — confirm before relying on non-English locales.
const SELECTORS = Object.freeze({
  // Query textarea.
  input: 'textarea[name="newMessage"]',
  // Button that submits the query typed into `input`.
  submitButton: 'button[aria-label="Submit search"]',
  // Tailwind Typography container — set by the developer's CSS framework,
  // not by user-facing text. The answer H1 is the only H1 inside this div.
  answerContainer: 'div[class*="prose"]',
  // Each paper card in the references list carries this data-testid.
  // Distinguishes top-level paper cards from inline citation links
  // (which are bare <a> elements without the testid).
  paperCard: 'a[data-testid="search-result"]',
  // Button that opens the Export dropdown (which contains the .CSV option).
  exportButton: 'button[aria-label="Export"]',
});
|
|
52
|
+
|
|
53
|
+
// ============================================================================
|
|
54
|
+
// Sign-in wall detection
|
|
55
|
+
// ============================================================================
|
|
56
|
+
//
|
|
57
|
+
// Anonymous Consensus searches get redirected to the sign-up flow. The
|
|
58
|
+
// redirect is structural (URL pattern), so detection works in any locale.
|
|
59
|
+
// The runner surfaces this as a "needs human verification" error so the
|
|
60
|
+
// user can sign in via the visible Chrome window. Once signed in, the
|
|
61
|
+
// session cookies persist in the GreedySearch Chrome profile for future
|
|
62
|
+
// headless runs.
|
|
63
|
+
|
|
64
|
+
// Returns true when the tab's current URL looks like the Consensus sign-up
// redirect (contains "/sign-up/" or a "redirect_url=" parameter). Any failure
// of the CDP eval is treated as "not on the wall".
async function detectSignUpWall(tab) {
  const probe = `(() => {
    const url = document.location.href || '';
    return url.indexOf('/sign-up/') !== -1 || url.indexOf('redirect_url=') !== -1;
  })()`;

  let verdict;
  try {
    verdict = await cdp(["eval", tab, probe]);
  } catch {
    // Best-effort probe: an unreachable tab counts as "no wall detected".
    verdict = "false";
  }
  return verdict === "true";
}
|
|
72
|
+
|
|
73
|
+
// Inspects the page URL, title and body text for markers of a stale Clerk
// auth session and returns { stale, url, title, text } (text is capped at
// 500 characters). If the eval or the JSON parse fails, reports a non-stale
// result with empty fields.
async function detectStaleClerkSession(tab) {
  const probe = `(() => {
    const url = document.location.href || '';
    const title = document.title || '';
    const text = document.body?.innerText || '';
    const stale =
      url.includes('clerk.consensus.app') ||
      title.includes('clerk.consensus.app') ||
      text.includes('session-token-expired') ||
      text.includes('refresh_request_origin_azp_mismatch') ||
      (text.includes('HTTP ERROR 405') && text.includes('This page isn'));
    return JSON.stringify({ stale, url, title, text: text.slice(0, 500) });
  })()`;

  try {
    const raw = await cdp(["eval", tab, probe], 5000);
    return JSON.parse(raw);
  } catch {
    return { stale: false, url: "", title: "", text: "" };
  }
}
|
|
92
|
+
|
|
93
|
+
// Clears all storage types for the two Consensus origins via the CDP
// Storage.clearDataForOrigin method, one origin at a time. A failure for one
// origin is logged to stderr and does not stop the other from being cleared.
async function clearConsensusAuthStorage(tab) {
  const origins = ["https://consensus.app", "https://clerk.consensus.app"];

  for (const origin of origins) {
    const params = JSON.stringify({ origin, storageTypes: "all" });
    try {
      await cdp(["evalraw", tab, "Storage.clearDataForOrigin", params]);
    } catch (e) {
      console.error(
        `[consensus] Warning: failed to clear stale auth storage for ${origin}: ${e.message}`,
      );
    }
  }
}
|
|
110
|
+
|
|
111
|
+
// Attempts to recover from a stale Clerk/Consensus auth state.
//
// Returns false when no stale state is detected, true when it was detected
// and gone after clearing per-origin storage and re-navigating. If the stale
// state persists, marks `env` as blocked by sign-in / needing a human and
// throws.
async function recoverStaleClerkSession(tab, env, startTime) {
  const { stale, title, url } = await detectStaleClerkSession(tab);
  if (!stale) {
    return false;
  }

  logStage(env, "auth-storage-reset", startTime);
  const where = title || url;
  console.error(
    `[consensus] Detected stale Clerk/Consensus auth state (${where}) — clearing per-origin storage and retrying navigation`,
  );
  env.fallbackUsed = "clear-stale-consensus-auth";

  await clearConsensusAuthStorage(tab);
  await cdp(["nav", tab, "https://consensus.app/"], 20000);
  // Short pause after navigation before re-probing the page.
  await new Promise((resolve) => {
    setTimeout(resolve, 900);
  });

  const recheck = await detectStaleClerkSession(tab);
  if (!recheck.stale) {
    return true;
  }

  env.blockedBy = "signin";
  env.verificationResult = "needs-human";
  throw new Error(
    "Consensus auth session is stale — visible Chrome is open. Please sign in again, then rerun the search.",
  );
}
|
|
134
|
+
|
|
135
|
+
// ============================================================================
|
|
136
|
+
// Typing helper
|
|
137
|
+
// ============================================================================
|
|
138
|
+
|
|
139
|
+
// Focuses the Consensus query textarea, types `text` into it, and verifies
// that the textarea now holds at least 80% of the expected character count.
// Throws when the verification fails.
async function typeIntoConsensus(tab, text) {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Click to focus (more reliable than eval focus for textareas).
  await cdp(["click", tab, SELECTORS.input]);
  await pause(jitter(200));

  // The text goes through stdin so long queries do not hit ENAMETOOLONG
  // when the cdp.mjs process is spawned on Windows.
  await cdpWithInput(["type", tab, "--stdin"], text);
  await pause(jitter(300));

  // Confirm the textarea actually received the text.
  const minLength = Math.floor(text.length * 0.8);
  const lengthCheck = `(document.querySelector('${SELECTORS.input}')?.value || '').length >= ${minLength}`;
  const accepted = await cdp(["eval", tab, lengthCheck]);
  if (accepted === "true") {
    return;
  }
  throw new Error(
    "Consensus input did not accept text — input verification failed",
  );
}
|
|
161
|
+
|
|
162
|
+
// ============================================================================
|
|
163
|
+
// XHR interceptor for /api/papers/details/
|
|
164
|
+
// ============================================================================
|
|
165
|
+
//
|
|
166
|
+
// The "Export → .CSV" button in Consensus's UI fetches paper details from
|
|
167
|
+
// /api/papers/details/ (a POST returning JSON) and converts them to CSV
|
|
168
|
+
// client-side. We intercept that POST response directly via an XHR
|
|
169
|
+
// monkey-patch, avoiding the file-download dance. The response carries
|
|
170
|
+
// rich metadata: title, authors, year, journal, doi, citation_count,
|
|
171
|
+
// abstract_takeaway, badges, open_access_pdf_url, etc. — much more than
|
|
172
|
+
// the DOM cards expose.
|
|
173
|
+
//
|
|
174
|
+
// Note: once installed, the interceptor records every successful
// /api/papers/details/ response, including any partial-page fetches made
// before the export. The .CSV response (requested after Load More has
// expanded the list) is the one that contains the full set of references
// for the query.
|
|
177
|
+
|
|
178
|
+
// Patches XMLHttpRequest in the page so that every HTTP 200 response from a
// URL containing "/api/papers/details" is parsed as JSON and appended to
// window.__papersDetailsResps (parse failures go to
// window.__papersDetailsErrors). The patch is applied at most once per page,
// guarded by window.__pdiInstalled. Returns true when the patch is in place.
async function installPapersDetailsInterceptor(tab) {
  const patchScript = `(() => {
    if (window.__pdiInstalled) return 'already';
    window.__pdiInstalled = true;
    const _origOpen = XMLHttpRequest.prototype.open;
    const _origSend = XMLHttpRequest.prototype.send;
    XMLHttpRequest.prototype.open = function(method, url) {
      this.__url = String(url);
      return _origOpen.apply(this, arguments);
    };
    XMLHttpRequest.prototype.send = function(body) {
      if (this.__url && this.__url.indexOf('/api/papers/details') !== -1) {
        this.addEventListener('load', function() {
          if (this.status === 200) {
            try {
              const parsed = JSON.parse(this.responseText);
              // Stack responses: the last one wins (the .CSV
              // request fires after Load More has settled).
              window.__papersDetailsResps = window.__papersDetailsResps || [];
              window.__papersDetailsResps.push({
                at: Date.now(),
                count: Object.keys(parsed?.paperDetailsListByPaperId || {}).length,
                data: parsed,
              });
            } catch (e) {
              window.__papersDetailsErrors = window.__papersDetailsErrors || [];
              window.__papersDetailsErrors.push(String(e.message || e));
            }
          }
        });
      }
      return _origSend.apply(this, arguments);
    };
    return 'installed';
  })()`;

  const status = await cdp(["eval", tab, patchScript]);
  return ["installed", "already"].includes(status);
}
|
|
216
|
+
|
|
217
|
+
// Polls the page (in a single eval) for /api/papers/details/ responses
// captured in window.__papersDetailsResps and returns the parsed result:
//   { ok: true, count, data }        — a response was captured
//   { ok: false, reason: 'timeout' } — nothing was captured before the deadline
//
// Parameters:
//   tab       — CDP tab identifier passed through to `cdp`.
//   timeoutMs — maximum time to poll inside the page (default 10000).
//   settleMs  — quiet period (default 1500): the poll only resolves once no
//               new response has been recorded for this long, measured from
//               the later of poll start and the newest captured response.
//               Pass 0 to resolve as soon as any response exists.
//
// Why the quiet period: responses captured before this call (e.g. earlier
// partial-page fetches) would otherwise satisfy the poll immediately, before
// the response we are actually waiting for has landed. Among the captured
// responses, the one with the most papers is returned (ties go to the most
// recent), so a full-list response is preferred over a partial page.
async function fetchPapersDetailsResponse(
  tab,
  timeoutMs = 10000,
  settleMs = 1500,
) {
  const quietMs = Math.max(0, Number(settleMs) || 0);
  const code = `new Promise((resolve) => {
    const _startedAt = Date.now();
    const _deadline = _startedAt + ${timeoutMs};
    const _quietMs = ${quietMs};
    function _largest(resps) {
      let best = resps[0];
      for (const r of resps) {
        if ((r.count || 0) >= (best.count || 0)) best = r;
      }
      return best;
    }
    function _check() {
      const resps = window.__papersDetailsResps || [];
      const now = Date.now();
      if (resps.length > 0) {
        const newestAt = resps[resps.length - 1].at || 0;
        const quietSince = Math.max(_startedAt, newestAt);
        if (now - quietSince >= _quietMs || now >= _deadline) {
          const best = _largest(resps);
          resolve(JSON.stringify({ ok: true, count: best.count, data: best.data }));
          return;
        }
      } else if (now >= _deadline) {
        resolve(JSON.stringify({ ok: false, reason: 'timeout' }));
        return;
      }
      setTimeout(_check, 200);
    }
    _check();
  })`;
  // Give the CDP call headroom beyond the in-page deadline.
  const result = await cdp(["eval", tab, code], timeoutMs + 5000);
  return JSON.parse(result);
}
|
|
242
|
+
|
|
243
|
+
// ============================================================================
|
|
244
|
+
// Load More — wait for all references to be fetched
|
|
245
|
+
// ============================================================================
|
|
246
|
+
//
|
|
247
|
+
// Consensus streams the answer in two phases:
|
|
248
|
+
// 1. The prose summary appears first.
|
|
249
|
+
// 2. The references list loads after a brief pause, then "Load more results"
|
|
250
|
+
// fetches subsequent pages of papers.
|
|
251
|
+
// We click Load More until it disappears, then wait briefly for the last
|
|
252
|
+
// batch to settle before clicking Export → .CSV.
|
|
253
|
+
|
|
254
|
+
// Clicks the "load more" style button under the references list until it is
// no longer found or `maxClicks` clicks have been made. Returns the number of
// clicks that actually hit a button.
//
// Parameters:
//   tab       — CDP tab identifier.
//   maxClicks — upper bound on clicks (default 8).
//
// The button is located and clicked in one eval, so the button cannot
// disappear between a separate presence check and the click, and only real
// clicks are counted.
async function expandReferences(tab, maxClicks = 8) {
  // Wait for the first paper cards to render. Without this gate the loop can
  // run before the button is in the DOM and exit on its first check, leaving
  // only the initial set of references.
  const ready = await waitForSelector(
    tab,
    SELECTORS.paperCard,
    20000,
    500,
  ).catch(() => false);
  if (!ready) {
    console.error(
      "[consensus] Warning: no paper cards appeared within 20s — Load More will be skipped",
    );
    return 0;
  }
  // Brief settle so the Load More button has time to mount after the
  // initial cards render.
  await new Promise((r) => setTimeout(r, 800));

  // The button is matched by visible text across all <button> elements.
  // NOTE: these patterns are English-only.
  const clickLoadMore = `(() => {
    const pattern = /load more|more results|show more/i;
    const btn = Array.from(document.querySelectorAll('button'))
      .find((b) => pattern.test((b.innerText || '').trim()));
    if (!btn) return 'none';
    btn.click();
    return 'clicked';
  })()`;

  let clicks = 0;
  while (clicks < maxClicks) {
    const outcome = await cdp(["eval", tab, clickLoadMore]);
    if (outcome !== "clicked") break;
    clicks++;
    // 1.5s between clicks: each batch needs time to render.
    await new Promise((r) => setTimeout(r, 1500));
  }
  return clicks;
}
|
|
314
|
+
|
|
315
|
+
// ============================================================================
|
|
316
|
+
// CSV download via .CSV button → /api/papers/details/ interception
|
|
317
|
+
// ============================================================================
|
|
318
|
+
|
|
319
|
+
// Opens the Export menu and clicks its ".CSV" option.
//
// Returns:
//   "clicked"   — the .CSV option was found and clicked
//   "no-export" — neither the Export button nor a .CSV option was found
//   "no-csv"    — the Export button was clicked but no .CSV option appeared
//
// The Export step's outcome is kept so that a missing Export button is
// reported as such instead of being masked as "no-csv".
async function clickExportCsv(tab) {
  // Scroll the Export button into view before clicking, to avoid
  // stale-layout issues when the references list pushes it off-screen.
  const opened = await cdp([
    "eval",
    tab,
    `(() => {
      const btn = document.querySelector('${SELECTORS.exportButton}');
      if (!btn) return 'no-export';
      btn.scrollIntoView({ block: 'center' });
      btn.click();
      return 'opened';
    })()`,
  ]);
  // 600ms for the dropdown animation/portal to mount.
  await new Promise((r) => setTimeout(r, 600));
  // The option is matched by the literal ".CSV" in the button text. The
  // lookup still runs when Export was not found, in case the option is
  // already on the page.
  const clicked = await cdp([
    "eval",
    tab,
    `(() => {
      const btn = Array.from(document.querySelectorAll('button'))
        .find(b => /\\.CSV/.test((b.innerText || '').trim()));
      if (!btn) return 'no-csv';
      btn.click();
      return 'clicked';
    })()`,
  ]);
  if (clicked !== "clicked" && opened === "no-export") {
    return "no-export";
  }
  return clicked;
}
|
|
353
|
+
|
|
354
|
+
// ============================================================================
|
|
355
|
+
// Build sources from the API response
|
|
356
|
+
// ============================================================================
|
|
357
|
+
|
|
358
|
+
// Builds a https://doi.org/ URL from a DOI value. A leading resolver URL
// ("https://doi.org/", "http://dx.doi.org/") or "doi:" prefix is stripped
// first so the resolver host is never doubled.
function toDoiUrl(doi) {
  const bare = String(doi)
    .trim()
    .replace(/^(?:https?:\/\/(?:dx\.)?doi\.org\/|doi:\s*)/i, "");
  return `https://doi.org/${bare}`;
}

// Maps a study_type badge value to its display tag, or null when the value
// is missing or not a string (so a malformed badge cannot throw).
function studyTypeTag(studyType) {
  if (typeof studyType !== "string" || studyType === "") return null;
  switch (studyType) {
    case "rct":
      return "RCT";
    case "meta_analysis":
      return "META-ANALYSIS";
    case "systematic_review":
      return "SYSTEMATIC REVIEW";
    default:
      return studyType.toUpperCase();
  }
}

// Converts a captured /api/papers/details/ payload into the extractor's
// source list.
//
// Parameters:
//   respData — parsed response; papers are read from
//              respData.paperDetailsListByPaperId (an object keyed by id).
//              A missing/nullish payload yields an empty list.
//
// Returns an array of
//   { title, url, rank, authors, year, journal, doi, citation_count,
//     snippet, tags }
// in the key order of the payload, with rank starting at 1.
function buildSourcesFromApi(respData) {
  const byId = respData?.paperDetailsListByPaperId || {};

  return Object.keys(byId).map((id, index) => {
    const p = byId[id] || {};
    const urlSlug = p.url_slug || "";
    const paperId = p.paper_id || p.hash_paper_id || id;

    // URL preference: Consensus detail page (needs url_slug), then the
    // provider URL, then the DOI resolver, then an id-only Consensus URL.
    let url;
    if (urlSlug) {
      url = `https://consensus.app/papers/${urlSlug}/${paperId}/`;
    } else if (p.provider_url) {
      url = p.provider_url;
    } else if (p.doi) {
      url = toDoiUrl(p.doi);
    } else {
      url = `https://consensus.app/paper/${paperId}`;
    }

    const badges = p.badges || {};
    const tags = [];
    const typeTag = studyTypeTag(badges.study_type);
    if (typeTag) tags.push(typeTag);
    if (badges.rigorous_journal) tags.push("RIGOROUS JOURNAL");
    if (badges.very_rigorous_journal) tags.push("VERY RIGOROUS JOURNAL");
    if (badges.highly_cited_paper) tags.push("HIGHLY CITED");
    if (badges.large_human_trial) tags.push("LARGE HUMAN TRIAL");
    if (p.is_retracted) tags.push("RETRACTED");
    if (p.open_access_pdf_url) tags.push("OPEN ACCESS");

    return {
      title: p.title || "",
      url,
      rank: index + 1,
      authors: Array.isArray(p.authors) ? p.authors : [],
      year: p.year || null,
      journal: p.journal || p.publisher_name || null,
      // The doi field is passed through as received; only the URL is normalized.
      doi: p.doi || null,
      citation_count: p.citation_count || 0,
      snippet: p.abstract_takeaway || "",
      tags,
    };
  });
}
|
|
408
|
+
|
|
409
|
+
// ============================================================================
|
|
410
|
+
// DOM fallback for sources
|
|
411
|
+
// ============================================================================
|
|
412
|
+
|
|
413
|
+
// Reads the paper cards currently in the DOM and returns one
// { title, url, rank, snippet, tags } entry per distinct card href, in
// document order. Cards without an href are skipped.
async function extractSourcesFromDom(tab) {
  const scrapeScript = `(() => {
    const cards = Array.from(document.querySelectorAll('${SELECTORS.paperCard}'));
    const sources = [];
    const seen = new Set();
    for (const card of cards) {
      const url = card.href || '';
      if (!url || seen.has(url)) continue;
      seen.add(url);
      const rankText = card.querySelector('span[data-testid="tag"]')?.innerText?.trim() || '';
      const rank = parseInt(rankText, 10) || null;
      const title = (card.querySelector('h2')?.innerText || '').trim();
      // Key takeaway: the span after the "KEY TAKEAWAY" label
      const takeawaySpan = card.querySelector('span.sm-normal');
      const snippet = takeawaySpan
        ? takeawaySpan.innerText.replace(/^KEY TAKEAWAY\\s*[·\\-]\\s*/i, '').trim()
        : '';
      // Tags: collect chip text from meta-analysis-tag, journal tags, etc.
      const tags = Array.from(card.querySelectorAll('span[data-testid$="-tag"]'))
        .map(t => t.innerText?.trim())
        .filter(Boolean);
      sources.push({ title, url, rank, snippet, tags });
    }
    return JSON.stringify(sources);
  })()`;

  const serialized = await cdp(["eval", tab, scrapeScript], 10000);
  return JSON.parse(serialized);
}
|
|
441
|
+
|
|
442
|
+
// ============================================================================
|
|
443
|
+
// Answer extraction
|
|
444
|
+
// ============================================================================
|
|
445
|
+
|
|
446
|
+
// Returns the trimmed inner text of the first element matching the answer
// container selector, or an empty string when there is none.
async function extractAnswer(tab) {
  const readScript = `(() => {
    const prose = document.querySelector('${SELECTORS.answerContainer}');
    return prose?.innerText?.trim() || '';
  })()`;

  const text = await cdp(["eval", tab, readScript], 10000);
  return text ? text : "";
}
|
|
453
|
+
|
|
454
|
+
// ============================================================================
|
|
455
|
+
// Main
|
|
456
|
+
// ============================================================================
|
|
457
|
+
|
|
458
|
+
const USAGE =
  'Usage: node extractors/consensus.mjs "<query>" [--tab <prefix>]\n';

// Entry point: drives a Consensus search end to end and prints the result.
//
// Flow: parse args → ensure Chrome → open/reuse a tab → navigate (unless the
// tab is already on consensus.app) → recover stale auth / dismiss consent →
// type and submit the query → bail out on the sign-up wall → wait for the
// answer → expand references → trigger Export → .CSV and read the captured
// API response → extract the answer → merge API and DOM sources → emit JSON.
//
// Errors raised inside the try block are passed to handleError together with
// the envelope built from `env`.
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short } = parseArgs(args);
  const startTime = Date.now();
  const mode =
    process.env.GREEDY_SEARCH_VISIBLE === "1" ? "visible" : "headless";
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Diagnostic state that is reported through buildEnvelope(env).
  const env = {
    engine: "consensus",
    mode,
    clipboardEmpty: null,
    fallbackUsed: null,
    blockedBy: null,
    verificationResult: null,
    inputReady: null,
  };

  try {
    // Default to headless unless the caller explicitly set visible mode.
    if (
      process.env.GREEDY_SEARCH_VISIBLE !== "1" &&
      process.env.GREEDY_SEARCH_ALWAYS_VISIBLE !== "1"
    ) {
      process.env.GREEDY_SEARCH_HEADLESS = "1";
    }
    await ensureChrome();

    if (!tabPrefix) await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    // Skip navigation if tab was pre-seeded to consensus.app
    const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    let onConsensus = false;
    try {
      const host = new URL(currentUrl).hostname.toLowerCase();
      onConsensus = host === "consensus.app" || host.endsWith(".consensus.app");
    } catch {
      // Unparseable or empty URL: treat as "not on consensus.app" and navigate.
    }

    if (!onConsensus) {
      logStage(env, "nav", startTime);
      await cdp(["nav", tab, "https://consensus.app/"], 20000);
      await sleep(600);
    }
    await recoverStaleClerkSession(tab, env, startTime);
    await dismissConsent(tab, cdp);
    // Skip handleVerification: consensus.app has no Cloudflare/Turnstile
    // challenge, but the verify detector matches "human" inside suggested-
    // search chip text and false-positives into clicking a different query.
    // Anonymous users hit the /sign-up/ wall which we detect explicitly
    // after submit.

    logStage(env, "input-wait", startTime);
    const inputReady = await waitForSelector(tab, SELECTORS.input, 15000, 400);
    env.inputReady = inputReady;
    if (!inputReady) {
      // The input may be missing because of a stale auth page; try one
      // recovery pass before giving up.
      const recovered = await recoverStaleClerkSession(tab, env, startTime);
      if (!recovered) {
        throw new Error(
          "Consensus input not found — page may not have loaded or is in unexpected state",
        );
      }
      const retryInputReady = await waitForSelector(
        tab,
        SELECTORS.input,
        15000,
        400,
      );
      env.inputReady = retryInputReady;
      if (!retryInputReady) {
        throw new Error(
          "Consensus input not found after stale auth recovery — page may not have loaded or is in unexpected state",
        );
      }
      await dismissConsent(tab, cdp);
    }
    await sleep(jitter(TIMING.postClick));

    logStage(env, "type-and-submit", startTime);
    await typeIntoConsensus(tab, query);
    await sleep(jitter(TIMING.postType));
    await cdp([
      "eval",
      tab,
      `document.querySelector('${SELECTORS.submitButton}')?.click()`,
    ]);

    // Fast-fail if Consensus redirected to the sign-up wall. The page
    // navigates from / to /search/.../.../ for a signed-in user, but
    // anonymous users get bounced to /sign-up/?redirect_url=... We check
    // after a short settle so the URL has time to update.
    await sleep(2000);
    if (await detectSignUpWall(tab)) {
      // Mark the run the same way the stale-session path in
      // recoverStaleClerkSession does: blocked by sign-in, needs a human.
      env.blockedBy = "signin";
      env.verificationResult = "needs-human";
      throw new Error(
        "Consensus requires sign-in — please sign in or create a free account in the visible browser window. Once signed in, cookies persist for future runs.",
      );
    }

    logStage(env, "answer-wait", startTime);
    await waitForSelector(tab, SELECTORS.answerContainer, 30000, 500);

    // Install the XHR interceptor BEFORE Load More clicks. Each Load
    // More triggers its own /api/papers/details/ call (partial page),
    // so every such response is captured from here on. Which captured
    // response is used is decided by fetchPapersDetailsResponse below.
    // This also covers the corner case where the query has fewer than
    // 20 papers — .CSV still works on whatever is visible.
    await installPapersDetailsInterceptor(tab);

    logStage(env, "expand-refs", startTime);
    const clicks = await expandReferences(tab, 8);
    if (clicks === 0) {
      console.error(
        "[consensus] Note: 'Load more results' button not present (initial page has all references)",
      );
    } else {
      console.error(
        `[consensus] Clicked 'Load more results' ${clicks} time(s) to expand references`,
      );
    }
    // Brief settle for the last batch to render fully.
    await sleep(1500);

    logStage(env, "csv-click", startTime);
    const csvResult = await clickExportCsv(tab);
    if (csvResult !== "clicked") {
      console.error(
        `[consensus] Export → .CSV click did not register (${csvResult}) — falling back to DOM`,
      );
    }

    logStage(env, "wait-csv-resp", startTime);
    const csvResp = await fetchPapersDetailsResponse(tab, 12000);
    let sources = [];
    let sourcePath = "dom";
    if (csvResp.ok) {
      sources = buildSourcesFromApi(csvResp.data);
      sourcePath = "api-intercept";
      console.error(
        `[consensus] Captured /api/papers/details/ response with ${csvResp.count} paper(s)`,
      );
    } else {
      console.error(
        `[consensus] /api/papers/details/ response not captured (${csvResp.reason}) — falling back to DOM cards`,
      );
    }

    logStage(env, "extract", startTime);
    const answer = await extractAnswer(tab);
    if (!answer) {
      throw new Error("No answer extracted — Consensus may not have responded");
    }

    // DOM fallback: if the API interception yielded no sources, use the DOM
    // cards; otherwise append only the DOM cards whose URL the API list does
    // not already contain, so no visible reference is lost.
    if (sources.length === 0) {
      sources = await extractSourcesFromDom(tab);
    } else {
      const domSources = await extractSourcesFromDom(tab);
      const apiUrls = new Set(sources.map((s) => s.url));
      for (const ds of domSources) {
        if (!apiUrls.has(ds.url)) sources.push(ds);
      }
    }

    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    env.durationMs = Date.now() - startTime;
    env.sourcePath = sourcePath;
    logStage(env, "done", startTime);

    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
      _envelope: buildEnvelope(env),
    });
  } catch (e) {
    env.durationMs = Date.now() - startTime;
    handleError(e, buildEnvelope(env));
  }
}

main();
|