@apmantza/greedysearch-pi 1.9.2 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39):
  1. package/CHANGELOG.md +132 -2
  2. package/README.md +82 -47
  3. package/bin/cdp.mjs +1153 -1108
  4. package/bin/launch.mjs +9 -0
  5. package/bin/search.mjs +318 -81
  6. package/extractors/bing-copilot.mjs +48 -18
  7. package/extractors/chatgpt.mjs +553 -0
  8. package/extractors/common.mjs +213 -22
  9. package/extractors/consensus.mjs +655 -0
  10. package/extractors/consent.mjs +182 -18
  11. package/extractors/gemini.mjs +350 -217
  12. package/extractors/google-ai.mjs +129 -128
  13. package/extractors/logically.mjs +629 -0
  14. package/extractors/perplexity.mjs +547 -217
  15. package/extractors/selectors.mjs +3 -2
  16. package/extractors/semantic-scholar.mjs +219 -0
  17. package/package.json +8 -4
  18. package/skills/greedy-search/skill.md +20 -12
  19. package/src/fetcher.mjs +23 -1
  20. package/src/formatters/results.ts +185 -128
  21. package/src/search/browser-lifecycle.mjs +27 -5
  22. package/src/search/challenge-detect.mjs +205 -0
  23. package/src/search/chrome.mjs +653 -590
  24. package/src/search/constants.mjs +155 -39
  25. package/src/search/engines.mjs +114 -76
  26. package/src/search/fetch-source.mjs +566 -451
  27. package/src/search/pdf.mjs +68 -0
  28. package/src/search/progress.mjs +145 -0
  29. package/src/search/recovery.mjs +73 -45
  30. package/src/search/research.mjs +1419 -62
  31. package/src/search/scale-aware.mjs +93 -0
  32. package/src/search/simple-research.mjs +520 -0
  33. package/src/search/sources.mjs +52 -22
  34. package/src/search/synthesis-runner.mjs +105 -26
  35. package/src/search/synthesis.mjs +286 -246
  36. package/src/tools/greedy-search-handler.ts +129 -59
  37. package/src/tools/shared.ts +312 -186
  38. package/src/types.ts +110 -104
  39. package/test.mjs +537 -18
@@ -19,11 +19,76 @@ const CDP = join(__dir, "..", "bin", "cdp.mjs");
19
19
  * @param {number} [timeoutMs=30000] - Timeout in milliseconds
20
20
  * @returns {Promise<string>} Command output
21
21
  */
22
// Allowlist of valid CDP subcommands that bin/cdp.mjs accepts. Used by
// cdpSafeArgv() to reject unknown subcommands before they reach spawn().
// Mirrors the commands advertised in bin/cdp.mjs help output.
const VALID_CDP_COMMANDS = new Set([
  "list",
  "snap",
  "eval",
  "shot",
  "html",
  "nav",
  "net",
  "click",
  "clickxy",
  "type",
  "loadall",
  "evalraw",
  "browse",
  "stop",
  "--tab",
]);

/**
 * Validate an argv array destined for bin/cdp.mjs.
 *
 * Checks performed:
 *  - `args` is a non-empty array;
 *  - `args[0]` is a string naming either the special "test" command or a
 *    member of VALID_CDP_COMMANDS;
 *  - every element is a string that contains no NUL byte.
 *
 * Shell metacharacters are intentionally NOT rejected: subcommands such as
 * `eval` carry arbitrary JavaScript source as a positional argument, and the
 * CLI is spawned with an argv array (no shell), so those characters are
 * never interpreted. The checks here are defense-in-depth only.
 *
 * @param {unknown} args - Candidate argv (subcommand followed by its operands).
 * @returns {string[]} A new array holding the validated arguments.
 * @throws {Error} If `args` is not a non-empty array, the subcommand is
 *   unknown, or any element is not a NUL-free string.
 */
function cdpSafeArgv(args) {
  if (!Array.isArray(args) || args.length === 0) {
    throw new Error("cdp: args must be a non-empty array");
  }
  // Type-check the subcommand first so a non-string reports the real problem
  // instead of a confusing "unknown subcommand '[object Object]'".
  const command = validateArg(args[0], 0);
  // "test" is let through without being part of the advertised allowlist.
  if (command !== "test" && !VALID_CDP_COMMANDS.has(command)) {
    throw new Error(`cdp: unknown subcommand '${command}'`);
  }
  return args.map((value, index) => validateArg(value, index));
}

/**
 * Validate a single argv element.
 *
 * @param {unknown} value - The element to check.
 * @param {number} index - Its position in argv (used in error messages only).
 * @returns {string} `value` unchanged when it is acceptable.
 * @throws {Error} If `value` is not a string or contains a NUL byte.
 */
function validateArg(value, index) {
  if (typeof value !== "string") {
    throw new Error(
      `cdp: argv[${index}] must be a string (got ${typeof value})`,
    );
  }
  if (value.includes("\0")) {
    throw new Error(`cdp: argv[${index}] contains a null byte`);
  }
  return value;
}
77
+
22
78
  export function cdp(args, timeoutMs = 30000) {
79
+ return cdpWithInput(args, null, timeoutMs);
80
+ }
81
+
82
+ export function cdpWithInput(args, input = null, timeoutMs = 30000) {
83
+ const safeArgs = cdpSafeArgv(args);
23
84
  return new Promise((resolve, reject) => {
24
- const proc = spawn(process.execPath, [CDP, ...args], {
25
- stdio: ["ignore", "pipe", "pipe"],
85
+ const proc = spawn(process.execPath, [CDP, ...safeArgs], {
86
+ stdio: [input == null ? "ignore" : "pipe", "pipe", "pipe"],
26
87
  });
88
+ if (input != null) {
89
+ proc.stdin.write(input);
90
+ proc.stdin.end();
91
+ }
27
92
  let out = "";
28
93
  let err = "";
29
94
  proc.stdout.on("data", (d) => (out += d));
@@ -67,8 +132,20 @@ export async function getOrOpenTab(tabPrefix) {
67
132
  const { targetId } = JSON.parse(raw);
68
133
  await cdp(["list"]); // refresh cache
69
134
  const tid = targetId.slice(0, 8);
70
- // Inject stealth patches for anti-detection coverage (both headless + visible)
71
- injectHeadlessStealth(tid).catch(() => {});
135
+ // Inject stealth patches for anti-detection coverage (both headless + visible).
136
+ // MUST be awaited: the daemon processes commands concurrently, so a
137
+ // fire-and-forget registration races the next Page.navigate and the
138
+ // script may not be in place when the new document is created.
139
+ // Sites like consensus.app use this race to detect automation — the
140
+ // script's Navigator/webdriver overrides are absent on first paint,
141
+ // fingerprinting fires, and the user is bounced to a sign-up wall.
142
+ try {
143
+ await injectHeadlessStealth(tid);
144
+ } catch (e) {
145
+ process.stderr.write(
146
+ `[getOrOpenTab] stealth injection failed: ${e.message}\n`,
147
+ );
148
+ }
72
149
  return tid;
73
150
  }
74
151
 
@@ -84,25 +161,42 @@ export async function getOrOpenTab(tabPrefix) {
84
161
  */
85
162
  export async function injectClipboardInterceptor(tab, globalVar) {
86
163
  const code = `
87
- window.${globalVar} = null;
88
- const _origWriteText = navigator.clipboard.writeText.bind(navigator.clipboard);
89
- navigator.clipboard.writeText = function(text) {
90
- window.${globalVar} = text;
91
- return _origWriteText(text);
92
- };
93
- const _origWrite = navigator.clipboard.write.bind(navigator.clipboard);
94
- navigator.clipboard.write = async function(items) {
95
- try {
96
- for (const item of items) {
97
- if (item.types && item.types.includes('text/plain')) {
98
- const blob = await item.getType('text/plain');
99
- window.${globalVar} = await blob.text();
100
- break;
164
+ (() => {
165
+ window.${globalVar} = null;
166
+ const _clipboard = navigator.clipboard;
167
+ if (!_clipboard) return;
168
+ const _origWriteText = typeof _clipboard.writeText === 'function'
169
+ ? _clipboard.writeText.bind(_clipboard)
170
+ : null;
171
+ const _origWrite = typeof _clipboard.write === 'function'
172
+ ? _clipboard.write.bind(_clipboard)
173
+ : null;
174
+
175
+ _clipboard.writeText = function(text) {
176
+ window.${globalVar} = String(text ?? '');
177
+ if (!_origWriteText) return Promise.resolve();
178
+ // The OS/browser clipboard write may be denied in automated Chrome or
179
+ // when the tab is not focused. We only need the captured text; returning
180
+ // a resolved promise prevents the page from surfacing a misleading
181
+ // "failed to copy" toast after our interceptor already succeeded.
182
+ return Promise.resolve(_origWriteText(text)).catch(() => undefined);
183
+ };
184
+
185
+ _clipboard.write = async function(items) {
186
+ try {
187
+ for (const item of items || []) {
188
+ if (item.types && item.types.includes('text/plain')) {
189
+ const blob = await item.getType('text/plain');
190
+ window.${globalVar} = await blob.text();
191
+ break;
192
+ }
101
193
  }
102
- }
103
- } catch(e) {}
104
- return _origWrite(items);
105
- };
194
+ } catch(e) {}
195
+ if (!_origWrite) return undefined;
196
+ try { return await _origWrite(items); }
197
+ catch (_) { return undefined; }
198
+ };
199
+ })();
106
200
  `;
107
201
  await cdp(["eval", tab, code]);
108
202
  }
@@ -379,6 +473,79 @@ export function parseSourcesFromMarkdown(text) {
379
473
  return results;
380
474
  }
381
475
 
476
/**
 * Linear-time "is this a non-empty digit string?" check.
 * Equivalent to /^\d+$/ without the regex — used to keep the
 * parseSourcesFromMarkdownRefStyle inline scan free of any regex
 * (SonarCloud hotspot js:S5852).
 * @param {string} s
 * @returns {boolean} true only when `s` is non-empty and every UTF-16 code
 *   unit is an ASCII digit.
 */
function isAllDigits(s) {
  if (!s) return false;
  for (let k = 0; k < s.length; k++) {
    const c = s.charCodeAt(k);
    // 48..57 are the char codes of "0".."9".
    if (c < 48 || c > 57) return false;
  }
  return true;
}

/**
 * Parse reference-style markdown links: [text][num] with [num]: url "title" at bottom.
 * ChatGPT uses this format for its inline citations.
 *
 * Only inline references of the form `[text][num]` whose `num` has a matching
 * `[num]: http(s)://…` definition are returned. Sources are reported in order
 * of first inline use and de-duplicated by URL. The title is the trimmed link
 * text, falling back to the definition's quoted title, then to "".
 *
 * @param {string} text - Markdown text
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdownRefStyle(text) {
  if (!text) return [];

  // Collect all reference definitions: [num]: url "title"
  const refMap = new Map();
  const refRegex = /^\[(\d+)\]:\s*(https?:\/\/[^\s"]+)(?:\s+"([^"]*)")?/gm;
  let m;
  while ((m = refRegex.exec(text)) !== null) {
    refMap.set(m[1], { url: m[2], title: m[3] || "" });
  }
  // Without definitions no inline reference can resolve; skip the scan.
  if (refMap.size === 0) return [];

  const results = [];
  const seenUrls = new Set();

  // Find inline references [text][num] with an indexOf-based scan instead of
  // the ReDoS-prone /\[([^\]]*)\]\[(\d+)\]/g pattern (SonarCloud hotspot
  // js:S5852). The cursor always moves past a "]" that has been examined, so
  // every character is visited a bounded number of times.
  let cursor = 0;
  while (cursor < text.length) {
    const open = text.indexOf("[", cursor);
    if (open === -1) break;
    const close = text.indexOf("]", open + 1);
    if (close === -1) break;

    if (text[close + 1] !== "[") {
      // Every "[" between `open` and `close` has this same first "]", so none
      // of them can start a reference either. Jumping past `close` (rather
      // than to open + 1) keeps inputs like "[[[[[[…]" linear.
      cursor = close + 1;
      continue;
    }

    const close2 = text.indexOf("]", close + 2);
    if (close2 === -1) break;

    const numStr = text.slice(close + 2, close2);
    if (!isAllDigits(numStr)) {
      // The second bracket is not a numeric label, but it may itself be the
      // link text of a real reference ("[a][b][1]"), so resume at it.
      cursor = close + 1;
      continue;
    }

    const ref = refMap.get(numStr);
    if (ref && !seenUrls.has(ref.url)) {
      seenUrls.add(ref.url);
      const inner = text.slice(open + 1, close);
      results.push({
        title: inner.trim() || ref.title || "",
        url: ref.url,
      });
    }
    cursor = close2 + 1;
  }

  return results;
}
548
+
382
549
  // ============================================================================
383
550
  // Timing constants
384
551
  // ============================================================================
@@ -658,6 +825,26 @@ export function outputJson(data) {
658
825
  process.stdout.write(`${JSON.stringify(data, null, 2)}\n`);
659
826
  }
660
827
 
828
/**
 * Record the current extractor stage for debugging and timeout diagnostics.
 *
 * Sets `env.lastStage`, appends `{ stage, at }` to `env.stages` (creating the
 * array when it is missing or not an array), and writes
 * `[engine] stage: <name> (+<ms>ms)` to stderr via console.error. The
 * `(+<ms>ms)` suffix is only emitted when a truthy `startTime` is given.
 * A single clock reading is used for both the logged elapsed time and the
 * recorded `at` value so the two always agree.
 *
 * Does nothing when `env` is not a non-null object.
 *
 * @param {object} env - The mutable env object the extractor is filling in.
 *   `env.engine` is used as the log prefix, defaulting to "extractor".
 * @param {string} stage - Short, snake_case stage name (e.g. "nav", "type", "stream").
 * @param {number} [startTime] - Optional extractor start time (epoch ms) for elapsed-ms logging.
 * @returns {void}
 */
export function logStage(env, stage, startTime = null) {
  if (!env || typeof env !== "object") return;
  const now = Date.now();
  const elapsed = startTime ? ` (+${now - startTime}ms)` : "";
  env.lastStage = stage;
  if (!Array.isArray(env.stages)) env.stages = [];
  env.stages.push({ stage, at: now });
  const engine = env.engine || "extractor";
  console.error(`[${engine}] stage: ${stage}${elapsed}`);
}
847
+
661
848
  /**
662
849
  * Build a lightweight result envelope from data already collected during extraction.
663
850
  * Zero additional CDP calls — everything here is already known.
@@ -673,6 +860,8 @@ export function buildEnvelope({
673
860
  verificationResult = null,
674
861
  inputReady = null,
675
862
  durationMs = null,
863
+ lastStage = null,
864
+ stages = null,
676
865
  } = {}) {
677
866
  return {
678
867
  engine,
@@ -683,6 +872,8 @@ export function buildEnvelope({
683
872
  verificationResult,
684
873
  inputReady,
685
874
  durationMs,
875
+ lastStage,
876
+ stages,
686
877
  };
687
878
  }
688
879