@apmantza/greedysearch-pi 1.9.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +80 -1
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +197 -68
- package/extractors/bing-copilot.mjs +42 -4
- package/extractors/chatgpt.mjs +436 -0
- package/extractors/common.mjs +155 -21
- package/extractors/consensus.mjs +655 -0
- package/extractors/gemini.mjs +335 -217
- package/extractors/logically.mjs +567 -0
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +7 -3
- package/skills/greedy-search/skill.md +9 -3
- package/src/fetcher.mjs +8 -1
- package/src/formatters/results.ts +163 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +150 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/recovery.mjs +51 -45
- package/src/search/research.mjs +1059 -61
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +124 -52
- package/src/tools/shared.ts +187 -186
- package/src/types.ts +110 -104
- package/test.mjs +377 -6
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// extractors/chatgpt.mjs
|
|
4
|
+
// Navigate chatgpt.com, submit query, wait for answer, extract answer + sources.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// node extractors/chatgpt.mjs "<query>" [--tab <prefix>]
|
|
8
|
+
//
|
|
9
|
+
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
+
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
+
|
|
12
|
+
import {
|
|
13
|
+
buildEnvelope,
|
|
14
|
+
cdp,
|
|
15
|
+
cdpWithInput,
|
|
16
|
+
formatAnswer,
|
|
17
|
+
getOrOpenTab,
|
|
18
|
+
handleError,
|
|
19
|
+
injectClipboardInterceptor,
|
|
20
|
+
jitter,
|
|
21
|
+
logStage,
|
|
22
|
+
outputJson,
|
|
23
|
+
parseArgs,
|
|
24
|
+
parseSourcesFromMarkdown,
|
|
25
|
+
parseSourcesFromMarkdownRefStyle,
|
|
26
|
+
prepareArgs,
|
|
27
|
+
validateQuery,
|
|
28
|
+
waitForSelector,
|
|
29
|
+
waitForStreamComplete,
|
|
30
|
+
} from "./common.mjs";
|
|
31
|
+
import { dismissConsent, handleVerification } from "./consent.mjs";
|
|
32
|
+
|
|
33
|
+
// Name of the window property that the clipboard interceptor writes into.
const GLOBAL_VAR = "__chatgptClipboard";
// CSS selector for the ProseMirror prompt editor.
const PROSE_SELECTOR = "div.ProseMirror";
// CSS selector for the composer's send button.
const SEND_SELECTOR = "button[data-testid=\"send-button\"]";
// CSS selector for the per-turn "copy" action button.
const COPY_SELECTOR = "button[data-testid=\"copy-turn-action-button\"]";
|
|
37
|
+
|
|
38
|
+
// ============================================================================
|
|
39
|
+
// ChatGPT-specific helpers
|
|
40
|
+
// ============================================================================
|
|
41
|
+
|
|
42
|
+
/**
 * Focus the prompt editor, type the query, and click the send button.
 *
 * @param {string} tab - Tab identifier passed through to the cdp helper.
 * @param {string} query - Text to type into the editor.
 * @throws {Error} When the send button is not present in the page.
 */
async function typeAndSubmit(tab, query) {
	// Randomised pause between UI actions (duration comes from jitter()).
	const pause = (baseMs) =>
		new Promise((resolve) => setTimeout(resolve, jitter(baseMs)));

	// Give the ProseMirror editor focus before typing.
	await cdp(["click", tab, PROSE_SELECTOR]);
	await pause(200);

	// Type via CDP (sends Input.insertText). The query travels over stdin so
	// long synthesis prompts do not hit Windows command-line length limits.
	await cdpWithInput(["type", tab, "--stdin"], query);
	await pause(300);

	// Click the send button from inside the page and report what happened.
	const clickSend = `
		(() => {
			const btn = document.querySelector('${SEND_SELECTOR}');
			if (!btn) return 'no-send';
			btn.click();
			return 'ok';
		})()
	`;
	const outcome = await cdp(["eval", tab, clickSend]);
	if (outcome === "no-send") {
		throw new Error("ChatGPT send button not found");
	}
	await pause(300);
}
|
|
66
|
+
|
|
67
|
+
/**
 * In-page expression used as the `selector` for waitForStreamComplete (and
 * by the node-side poll). It evaluates to an element or null:
 *
 *   - null when the page has no user message yet, or when no assistant
 *     message with non-empty innerText follows the last user message;
 *   - otherwise the assistant message with the LONGEST innerText among
 *     those that come after the last user message.
 *
 * Anchoring on the last user message skips chatgpt.com's static
 * pre-rendered greeting card (`data-turn-start-message="true"`, present on
 * the homepage before any conversation), so a short real answer such as
 * "Hello! 👋" is not confused with that placeholder.
 */
const CHATGPT_RESPONSE_SELECTOR = String.raw`(() => {
	const all = document.querySelectorAll('[data-message-author-role]');
	let lastUserIdx = -1;
	for (let i = 0; i < all.length; i++) {
		if (all[i].getAttribute('data-message-author-role') === 'user') lastUserIdx = i;
	}
	if (lastUserIdx < 0) return null;
	let bestEl = null;
	let bestLen = 0;
	for (let i = lastUserIdx + 1; i < all.length; i++) {
		if (all[i].getAttribute('data-message-author-role') === 'assistant') {
			const len = (all[i].innerText || '').length;
			if (len > bestLen) { bestLen = len; bestEl = all[i]; }
		}
	}
	return bestEl;
})()`;
|
|
92
|
+
|
|
93
|
+
/**
 * Wait for ChatGPT's response to finish streaming by delegating to the
 * shared waitForStreamComplete helper, pointed at the selector that skips
 * the static homepage greeting card.
 *
 * minLength is 1 so that any non-empty response counts as "started";
 * a higher threshold would make very short answers (e.g. "Hello! 👋")
 * wait out the whole timeout.
 *
 * @param {string} tab - Tab identifier passed through to the helper.
 * @param {number} [timeoutMs=20000] - Forwarded as the helper's `timeout`.
 * @returns {Promise<*>} Whatever waitForStreamComplete resolves with
 *   (the caller in this file stores it as the assistant text length).
 */
async function waitForResponse(tab, timeoutMs = 20000) {
	const streamOptions = {
		timeout: timeoutMs,
		interval: 600,
		stableRounds: 3,
		minLength: 1,
		selector: CHATGPT_RESPONSE_SELECTOR,
	};
	return waitForStreamComplete(tab, streamOptions);
}
|
|
109
|
+
|
|
110
|
+
/**
 * Node-side fallback for ChatGPT stream completion, used when the
 * in-browser poll fails (per the caller, typically a timeout under
 * background-tab throttling in `all` mode). Polls the same
 * greeting-card-skipping selector through short, independent eval calls so
 * the connection is free between polls.
 *
 * The response counts as complete once the same non-zero innerText length
 * has been observed on three consecutive successful re-checks.
 *
 * A failed or timed-out eval is treated as "no sample" and skipped: it
 * neither resets the stability counter nor overwrites the last observed
 * length, so a transient failure cannot make the function report 0 after
 * text was already seen.
 *
 * @param {string} tab - Tab identifier passed through to the cdp helper.
 * @param {number} [maxMs=15000] - Overall polling budget in milliseconds.
 * @returns {Promise<number>} The stable length, or the last observed length
 *   (possibly 0) when the budget runs out first.
 */
async function pollForResponseNodeSide(tab, maxMs = 15000) {
	const pollIntervalMs = 1200;
	const requiredStableRounds = 3;
	const evalTimeoutMs = 4000;
	const lengthProbe = `${CHATGPT_RESPONSE_SELECTOR}?.innerText?.length ?? 0`;

	const deadline = Date.now() + maxMs;
	let lastLen = 0;
	let stableRounds = 0;

	while (Date.now() < deadline) {
		// null marks a failed poll; it must not be mistaken for "length 0".
		const raw = await cdp(["eval", tab, lengthProbe], evalTimeoutMs).catch(
			() => null,
		);
		if (raw !== null) {
			const len = Number.parseInt(raw, 10) || 0;
			if (len >= 1 && len === lastLen) {
				stableRounds++;
				if (stableRounds >= requiredStableRounds) return len;
			} else {
				lastLen = len;
				stableRounds = 0;
			}
		}

		// Never sleep past the deadline: no further poll would follow anyway.
		const remainingMs = deadline - Date.now();
		if (remainingMs <= 0) break;
		await new Promise((r) =>
			setTimeout(r, Math.min(pollIntervalMs, remainingMs)),
		);
	}
	return lastLen;
}
|
|
138
|
+
|
|
139
|
+
/**
 * DOM fallback: read the answer text and up to 10 unique links straight
 * from the assistant message that follows the last user message.
 *
 * @param {string} tab - Tab identifier passed through to the cdp helper.
 * @returns {Promise<{answer: string, sources: Array<{title: string, url: string}>, skipped?: string}>}
 *   `skipped` is 'no-user-message', 'no-assistant-response', or
 *   'parse-error' when no answer could be read.
 */
async function extractAnswerFromDom(tab) {
	const domProbe = String.raw`
		(() => {
			// Find the assistant message that comes AFTER the last user message,
			// not the absolute last assistant element. The chatgpt.com homepage
			// has a static pre-rendered greeting card that renders as a
			// [data-message-author-role="assistant"] element with
			// data-turn-start-message="true" — it must be skipped or the
			// static "Hello! How can I help you today?" placeholder gets
			// returned as the answer to a query the assistant never answered.
			const all = Array.from(document.querySelectorAll('[data-message-author-role]'));
			let lastUserIdx = -1;
			for (let i = 0; i < all.length; i++) {
				if (all[i].getAttribute('data-message-author-role') === 'user') {
					lastUserIdx = i;
				}
			}
			if (lastUserIdx < 0) {
				// No user message at all — page is still on the homepage.
				return JSON.stringify({
					answer: '',
					sources: [],
					skipped: 'no-user-message',
				});
			}
			let assistant = null;
			for (let i = lastUserIdx + 1; i < all.length; i++) {
				if (all[i].getAttribute('data-message-author-role') === 'assistant') {
					assistant = all[i];
				}
			}
			if (!assistant) {
				return JSON.stringify({
					answer: '',
					sources: [],
					skipped: 'no-assistant-response',
				});
			}
			const answer = (assistant.innerText || assistant.textContent || '').trim();
			const seen = new Set();
			const sources = [];
			for (const link of assistant.querySelectorAll('a[href]')) {
				const url = link.href;
				if (!url || seen.has(url)) continue;
				seen.add(url);
				const title = (link.innerText || link.textContent || '').replace(/\s+/g, ' ').trim();
				sources.push({ title, url });
				if (sources.length >= 10) break;
			}
			return JSON.stringify({ answer, sources });
		})()
	`;

	// The page returns a JSON string; anything unparsable becomes an explicit
	// 'parse-error' result instead of an exception.
	const payload = await cdp(["eval", tab, domProbe]);
	try {
		return JSON.parse(payload);
	} catch {
		return { answer: "", sources: [], skipped: "parse-error" };
	}
}
|
|
201
|
+
|
|
202
|
+
/**
 * Click the copy button that belongs to the assistant message after the
 * last user message.
 *
 * Picking the absolute last copy button on the page would hit the USER
 * message's button whenever the assistant turn has no button of its own yet
 * (still streaming, or the button has not rendered), which would capture the
 * user's query as the "answer". When the assistant turn has no copy button
 * we therefore click NOTHING; the empty clipboard then routes the caller to
 * the DOM fallback.
 *
 * @param {string} tab - Tab identifier passed through to the cdp helper.
 * @returns {Promise<string>} Result of the eval; the in-page code yields
 *   'clicked', 'no-user', or 'no-assistant-copy'.
 */
async function clickAssistantCopyButton(tab) {
	return cdp([
		"eval",
		tab,
		`(() => {
			const all = document.querySelectorAll('[data-message-author-role]');
			let lastUserIdx = -1;
			for (let i = 0; i < all.length; i++) {
				if (all[i].getAttribute('data-message-author-role') === 'user') lastUserIdx = i;
			}
			if (lastUserIdx < 0) return 'no-user';
			let assistantCopy = null;
			for (let i = lastUserIdx + 1; i < all.length; i++) {
				if (all[i].getAttribute('data-message-author-role') === 'assistant') {
					const btn = all[i].querySelector('${COPY_SELECTOR}');
					if (btn) assistantCopy = btn;
				}
			}
			if (assistantCopy) { assistantCopy.click(); return 'clicked'; }
			return 'no-assistant-copy';
		})()`,
	]);
}

/** Read the text captured by the clipboard interceptor ('' when nothing was captured). */
async function readCapturedClipboard(tab) {
	return cdp(["eval", tab, `window.${GLOBAL_VAR} || ''`]);
}

/**
 * Extract the assistant's answer and its sources.
 *
 * Strategy: click the assistant turn's copy button and read the intercepted
 * clipboard text; retry once; then fall back to reading the DOM.
 *
 * When every strategy comes back empty this returns an empty answer together
 * with the DOM fallback's `skipped` reason instead of throwing a generic
 * "clipboard empty" error, so the caller can report WHY there is no answer
 * (query never submitted vs. assistant never replied).
 *
 * @param {string} tab - Tab identifier passed through to the cdp helper.
 * @param {object} env - Mutable diagnostics object; `clipboardEmpty` and
 *   `fallbackUsed` are updated here.
 * @returns {Promise<{answer: string, sources: Array<{title: string, url: string}>, skipped?: string|null}>}
 */
async function extractAnswer(tab, env) {
	const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

	await clickAssistantCopyButton(tab);
	await sleep(600);

	let answer = await readCapturedClipboard(tab);
	env.clipboardEmpty = !answer;

	// Retry once if clipboard is empty — the assistant message may have
	// finished streaming and the copy button may have rendered in the
	// meantime.
	if (!answer) {
		console.error("[chatgpt] Clipboard empty, retrying in 2s...");
		await clickAssistantCopyButton(tab);
		await sleep(2000);
		answer = await readCapturedClipboard(tab);
		env.clipboardEmpty = !answer;
	}

	let domFallback = null;
	if (!answer) {
		domFallback = await extractAnswerFromDom(tab);
		answer = domFallback?.answer;
		env.fallbackUsed = answer ? "dom" : null;
	}

	if (!answer) {
		// Hand the reason back to the caller rather than masking it behind a
		// clipboard-specific error message.
		return { answer: "", sources: [], skipped: domFallback?.skipped ?? null };
	}

	// Merge sources from DOM links and from reference-style / inline markdown
	// links in the answer text. First occurrence of a URL wins, so the DOM
	// fallback's entries take precedence when it was used.
	const sourceMap = new Map();
	const candidates = [
		...(domFallback?.sources || []),
		...parseSourcesFromMarkdownRefStyle(answer),
		...parseSourcesFromMarkdown(answer),
	];
	for (const s of candidates) {
		if (s?.url && !sourceMap.has(s.url)) sourceMap.set(s.url, s);
	}
	const sources = Array.from(sourceMap.values()).slice(0, 10);

	return { answer: answer.trim(), sources };
}
|
|
298
|
+
|
|
299
|
+
// ============================================================================
|
|
300
|
+
// Main
|
|
301
|
+
// ============================================================================
|
|
302
|
+
|
|
303
|
+
const USAGE = 'Usage: node extractors/chatgpt.mjs "<query>" [--tab <prefix>]\n';

/**
 * CLI entry point: open (or reuse) a tab, make sure it is on chatgpt.com,
 * submit the query, wait for the streamed response, and print the result
 * as JSON on stdout. Every failure is logged to stderr with the last
 * recorded stage and passed to handleError together with the envelope.
 */
async function main() {
	const cliArgs = await prepareArgs(process.argv.slice(2));
	validateQuery(cliArgs, USAGE);
	const { query, tabPrefix, short } = parseArgs(cliArgs);

	const startTime = Date.now();
	const env = {
		engine: "chatgpt",
		mode: process.env.GREEDY_SEARCH_VISIBLE === "1" ? "visible" : "headless",
		clipboardEmpty: null,
		fallbackUsed: null,
		blockedBy: null,
		verificationResult: null,
		inputReady: null,
	};

	const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
	const stage = (name) => logStage(env, name, startTime);

	// Error text for each reason the DOM fallback can report.
	const noAnswerMessages = new Map([
		["no-user-message", "ChatGPT still on homepage — query was not submitted"],
		[
			"no-assistant-response",
			"ChatGPT did not return an assistant response after submit",
		],
	]);

	try {
		if (!tabPrefix) await cdp(["list"]);
		const tab = await getOrOpenTab(tabPrefix);
		const readHref = () => cdp(["eval", tab, "document.location.href"]);

		// Navigate only when the tab is not already on chatgpt.com.
		const currentUrl = await readHref().catch(() => "");
		let host = null;
		try {
			host = new URL(currentUrl).hostname.toLowerCase();
		} catch {
			// Unparsable URL: treated the same as "not on ChatGPT".
		}
		if (host !== "chatgpt.com") {
			stage("nav");
			await cdp(["nav", tab, "https://chatgpt.com"], 20000);
			await pause(600);
		}

		stage("consent");
		await dismissConsent(tab, cdp);
		stage("verification");
		await handleVerification(tab, cdp, 10000);

		stage("input-wait");
		const inputReady = await waitForSelector(tab, PROSE_SELECTOR, 8000, 400);
		env.inputReady = inputReady;
		if (!inputReady) {
			// Tell a login wall apart from any other unexpected page state.
			const bodyText = await cdp([
				"eval",
				tab,
				`document.body?.innerText || ''`,
			]).catch(() => "");
			const looksLikeLoginWall =
				/sign in|log in|sign up|\u03a3\u03cd\u03bd\u03b4\u03b5\u03c3\u03b7|login/i.test(
					bodyText,
				);
			throw new Error(
				looksLikeLoginWall
					? "ChatGPT requires sign-in — please sign in in the visible browser window"
					: "ChatGPT input not found — page may be blocked or in unexpected state",
			);
		}

		stage("clipboard-inject");
		await injectClipboardInterceptor(tab, GLOBAL_VAR);
		stage("type-and-submit");
		await typeAndSubmit(tab, query);

		stage("stream-wait");
		// First try the in-browser poll (one long evaluate call). If that
		// fails — e.g. the response is still streaming past 20s under tab
		// throttling in `all` mode — switch to node-side polls that release
		// the WebSocket between calls. Together they stay well within the
		// engine's 80s outer budget.
		let asstLen = 0;
		try {
			asstLen = await waitForResponse(tab, 20000);
		} catch {
			stage("stream-poll-fallback");
			asstLen = await pollForResponseNodeSide(tab, 15000);
		}
		env.assistantTextLen = asstLen;
		if (asstLen < 1) {
			console.error(
				"[chatgpt] Warning: assistant response may not have completed",
			);
		}

		stage("extract");
		const { answer, sources, skipped } = await extractAnswer(tab, env);
		// An empty answer must surface as a clear error so the caller never
		// consumes the static homepage greeting card ("Hello! How can I help
		// you today?") as if it were a real response.
		if (!answer) {
			env.blockedBy = "no-response";
			env.skipped = skipped || null;
			throw new Error(
				noAnswerMessages.get(skipped) ??
					"ChatGPT returned no answer — assistant never responded",
			);
		}
		stage("done");

		const finalUrl = await readHref().catch(() => "https://chatgpt.com");
		env.durationMs = Date.now() - startTime;
		outputJson({
			query,
			url: finalUrl,
			answer: formatAnswer(answer, short),
			sources,
			_envelope: buildEnvelope(env),
		});
	} catch (e) {
		env.durationMs = Date.now() - startTime;
		console.error(
			`[chatgpt] error during stage '${env.lastStage || "unknown"}': ${e.message}`,
		);
		handleError(e, buildEnvelope(env));
	}
}
|
|
435
|
+
|
|
436
|
+
main();
|
package/extractors/common.mjs
CHANGED
|
@@ -20,10 +20,18 @@ const CDP = join(__dir, "..", "bin", "cdp.mjs");
|
|
|
20
20
|
* @returns {Promise<string>} Command output
|
|
21
21
|
*/
|
|
22
22
|
export function cdp(args, timeoutMs = 30000) {
	// Same as cdpWithInput, but with no stdin payload for the child process.
	const noInput = null;
	return cdpWithInput(args, noInput, timeoutMs);
}
|
|
25
|
+
|
|
26
|
+
export function cdpWithInput(args, input = null, timeoutMs = 30000) {
|
|
23
27
|
return new Promise((resolve, reject) => {
|
|
24
28
|
const proc = spawn(process.execPath, [CDP, ...args], {
|
|
25
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
29
|
+
stdio: [input == null ? "ignore" : "pipe", "pipe", "pipe"],
|
|
26
30
|
});
|
|
31
|
+
if (input != null) {
|
|
32
|
+
proc.stdin.write(input);
|
|
33
|
+
proc.stdin.end();
|
|
34
|
+
}
|
|
27
35
|
let out = "";
|
|
28
36
|
let err = "";
|
|
29
37
|
proc.stdout.on("data", (d) => (out += d));
|
|
@@ -67,8 +75,20 @@ export async function getOrOpenTab(tabPrefix) {
|
|
|
67
75
|
const { targetId } = JSON.parse(raw);
|
|
68
76
|
await cdp(["list"]); // refresh cache
|
|
69
77
|
const tid = targetId.slice(0, 8);
|
|
70
|
-
// Inject stealth patches for anti-detection coverage (both headless + visible)
|
|
71
|
-
|
|
78
|
+
// Inject stealth patches for anti-detection coverage (both headless + visible).
|
|
79
|
+
// MUST be awaited: the daemon processes commands concurrently, so a
|
|
80
|
+
// fire-and-forget registration races the next Page.navigate and the
|
|
81
|
+
// script may not be in place when the new document is created.
|
|
82
|
+
// Sites like consensus.app use this race to detect automation — the
|
|
83
|
+
// script's Navigator/webdriver overrides are absent on first paint,
|
|
84
|
+
// fingerprinting fires, and the user is bounced to a sign-up wall.
|
|
85
|
+
try {
|
|
86
|
+
await injectHeadlessStealth(tid);
|
|
87
|
+
} catch (e) {
|
|
88
|
+
process.stderr.write(
|
|
89
|
+
`[getOrOpenTab] stealth injection failed: ${e.message}\n`,
|
|
90
|
+
);
|
|
91
|
+
}
|
|
72
92
|
return tid;
|
|
73
93
|
}
|
|
74
94
|
|
|
@@ -84,25 +104,42 @@ export async function getOrOpenTab(tabPrefix) {
|
|
|
84
104
|
*/
|
|
85
105
|
export async function injectClipboardInterceptor(tab, globalVar) {
	// In-page script: resets window[globalVar] to null, then wraps
	// navigator.clipboard.writeText / write so that copied plain text is
	// mirrored into window[globalVar]. The original methods are still called
	// when they exist, but their failures are swallowed so the page does not
	// show a "failed to copy" toast after the text was already captured.
	const interceptorScript = `
		(() => {
			window.${globalVar} = null;
			const _clipboard = navigator.clipboard;
			if (!_clipboard) return;
			const _origWriteText = typeof _clipboard.writeText === 'function'
				? _clipboard.writeText.bind(_clipboard)
				: null;
			const _origWrite = typeof _clipboard.write === 'function'
				? _clipboard.write.bind(_clipboard)
				: null;

			_clipboard.writeText = function(text) {
				window.${globalVar} = String(text ?? '');
				if (!_origWriteText) return Promise.resolve();
				// The OS/browser clipboard write may be denied in automated Chrome or
				// when the tab is not focused. We only need the captured text; returning
				// a resolved promise prevents the page from surfacing a misleading
				// "failed to copy" toast after our interceptor already succeeded.
				return Promise.resolve(_origWriteText(text)).catch(() => undefined);
			};

			_clipboard.write = async function(items) {
				try {
					for (const item of items || []) {
						if (item.types && item.types.includes('text/plain')) {
							const blob = await item.getType('text/plain');
							window.${globalVar} = await blob.text();
							break;
						}
					}
				} catch(e) {}
				if (!_origWrite) return undefined;
				try { return await _origWrite(items); }
				catch (_) { return undefined; }
			};
		})();
	`;
	await cdp(["eval", tab, interceptorScript]);
}
|
|
@@ -379,6 +416,79 @@ export function parseSourcesFromMarkdown(text) {
|
|
|
379
416
|
return results;
|
|
380
417
|
}
|
|
381
418
|
|
|
419
|
+
/**
 * Linear-time "is this a non-empty ASCII digit string?" check.
 * Equivalent to /^\d+$/ without the regex — used to keep the
 * parseSourcesFromMarkdownRefStyle inline scan free of any regex
 * (SonarCloud hotspot js:S5852).
 * @param {string} s
 * @returns {boolean} true only for a non-empty string made solely of 0-9
 */
function isAllDigits(s) {
	if (typeof s !== "string" || s.length === 0) return false;
	for (let k = 0; k < s.length; k++) {
		const c = s.charCodeAt(k);
		if (c < 48 || c > 57) return false; // outside '0'..'9'
	}
	return true;
}

/**
 * Parse reference-style markdown links: `[text][num]` in the body, resolved
 * against `[num]: url "title"` definitions that start a line.
 * ChatGPT uses this format for its inline citations.
 *
 * Only the two-bracket form `[text][num]` is recognised; a bare `[num]` is
 * not treated as a citation. Each URL is reported once, in order of first
 * use. The title is the link text, falling back to the definition's quoted
 * title when the link text is empty.
 *
 * @param {string} text - Markdown text
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdownRefStyle(text) {
	if (typeof text !== "string" || text.length === 0) return [];
	const results = [];

	// Collect all reference definitions: [num]: url "title"
	const refMap = new Map();
	const refRegex = /^\[(\d+)\]:\s*(https?:\/\/[^\s"]+)(?:\s+"([^"]*)")?/gm;
	for (const m of text.matchAll(refRegex)) {
		refMap.set(m[1], { url: m[2], title: m[3] || "" });
	}
	// Without definitions nothing can resolve; skip the scan entirely.
	if (refMap.size === 0) return results;

	// Find inline references [text][num] with a single forward scan via
	// indexOf — avoids the ReDoS-prone /\[([^\]]*)\]\[(\d+)\]/g pattern
	// (SonarCloud hotspot js:S5852) while matching the same spans.
	const seenUrls = new Set();
	let cursor = 0;
	while (cursor < text.length) {
		const open = text.indexOf("[", cursor);
		if (open === -1) break;
		const close = text.indexOf("]", open + 1);
		if (close === -1) break;

		if (text[close + 1] !== "[") {
			// Every "[" between open and close shares this same "]" and would
			// fail the same way, so jump past it. This keeps the scan linear
			// on inputs like "[[[[[[...]".
			cursor = close + 1;
			continue;
		}

		const close2 = text.indexOf("]", close + 2);
		if (close2 === -1) break;

		const numStr = text.slice(close + 2, close2);
		if (!isAllDigits(numStr)) {
			// "[a][b]" is not a citation, but "[b]" may still open one
			// ("[a][b][1]"), so resume at the second bracket instead of
			// skipping over it.
			cursor = close + 1;
			continue;
		}

		const ref = refMap.get(numStr);
		if (ref && !seenUrls.has(ref.url)) {
			seenUrls.add(ref.url);
			results.push({
				title: text.slice(open + 1, close).trim() || ref.title || "",
				url: ref.url,
			});
		}
		cursor = close2 + 1;
	}

	return results;
}
|
|
491
|
+
|
|
382
492
|
// ============================================================================
|
|
383
493
|
// Timing constants
|
|
384
494
|
// ============================================================================
|
|
@@ -658,6 +768,26 @@ export function outputJson(data) {
|
|
|
658
768
|
process.stdout.write(`${JSON.stringify(data, null, 2)}\n`);
|
|
659
769
|
}
|
|
660
770
|
|
|
771
|
+
/**
 * Record the extractor's current stage on `env` and announce it on stderr.
 *
 * Sets `env.lastStage`, appends `{ stage, at }` to `env.stages` (creating
 * the array when it is missing or not an array), and writes
 * `[engine] stage: <name>` — followed by ` (+<ms>ms)` when a start time is
 * given — via console.error. Does nothing when `env` is not an object.
 *
 * @param {object} env - The mutable env object the extractor is filling in.
 * @param {string} stage - Short stage name (e.g. "nav", "type-and-submit").
 * @param {number} [startTime] - Optional extractor start time (ms since
 *   epoch) used for the elapsed-ms suffix.
 */
export function logStage(env, stage, startTime = null) {
	if (env === null || typeof env !== "object") return;

	let elapsedSuffix = "";
	if (startTime) {
		elapsedSuffix = ` (+${Date.now() - startTime}ms)`;
	}

	env.lastStage = stage;
	const history = Array.isArray(env.stages) ? env.stages : (env.stages = []);
	history.push({ stage, at: Date.now() });

	const label = env.engine || "extractor";
	console.error(`[${label}] stage: ${stage}${elapsedSuffix}`);
}
|
|
790
|
+
|
|
661
791
|
/**
|
|
662
792
|
* Build a lightweight result envelope from data already collected during extraction.
|
|
663
793
|
* Zero additional CDP calls — everything here is already known.
|
|
@@ -673,6 +803,8 @@ export function buildEnvelope({
|
|
|
673
803
|
verificationResult = null,
|
|
674
804
|
inputReady = null,
|
|
675
805
|
durationMs = null,
|
|
806
|
+
lastStage = null,
|
|
807
|
+
stages = null,
|
|
676
808
|
} = {}) {
|
|
677
809
|
return {
|
|
678
810
|
engine,
|
|
@@ -683,6 +815,8 @@ export function buildEnvelope({
|
|
|
683
815
|
verificationResult,
|
|
684
816
|
inputReady,
|
|
685
817
|
durationMs,
|
|
818
|
+
lastStage,
|
|
819
|
+
stages,
|
|
686
820
|
};
|
|
687
821
|
}
|
|
688
822
|
|