@apmantza/greedysearch-pi 1.8.2 → 1.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +10 -1
- package/extractors/perplexity.mjs +3 -1
- package/package.json +1 -1
- package/src/reddit.mjs +210 -0
- package/src/search/fetch-source.mjs +262 -230
package/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
 # Changelog
 
+## v1.8.3 (2026-04-24)
+
+### Fixes
+- **Perplexity extraction fixed** — The copy button selector was returning the first matching button ("Copy question") instead of the answer copy button. Changed `.find()` to `.filter().pop()` to get the last matching button, which correctly copies the answer text. Fixes `--full` flag returning only the query text instead of the full answer.
+
+### Features
+- **Reddit JSON API support** — Reddit post URLs now use Reddit's public `.json` API instead of HTML scraping. Gets structured post data + top comments with nesting. Falls back to HTTP fetch if API fails.
+
 ## v1.8.2 (2026-04-20)
 
 ### Cross-Platform Testing
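For context on the Reddit entry above: appending `.json` to a Reddit post URL returns a two-element array (the post listing, then the comments listing), which is what the new `src/reddit.mjs` parses. A minimal sketch of that shape, using a hypothetical post URL:

// Hypothetical URL; the response layout is Reddit's public .json format.
const postUrl = "https://www.reddit.com/r/node/comments/abc123/example_post/";
const res = await fetch(postUrl.replace(/\/?$/, ".json"), {
  headers: { accept: "application/json" },
});
const [postListing, commentsListing] = await res.json();
const post = postListing.data.children[0].data; // title, author, subreddit, selftext, score
const comments = commentsListing.data.children; // entries with kind "t1" are comments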
package/README.md
CHANGED
@@ -57,11 +57,20 @@ node ~/.pi/agent/git/GreedySearch-pi/bin/launch.mjs --kill
 - Chrome
 - Node.js 20.11.0+ (22+ recommended)
 
+## Source fetching
+
+When using `depth: "standard"` or `depth: "deep"`, source content is fetched and synthesized:
+
+- **Reddit** — Uses Reddit's public `.json` API for posts and comments (no scraping)
+- **GitHub** — Uses GitHub REST API for repos, READMEs, and file trees
+- **General web** — Mozilla Readability extraction with browser fallback for bot-blocked pages
+- **Metadata** — title, author/byline, site name, publish date, language, excerpt
+
 ## Project layout
 
 - `bin/` - runtime CLIs (`search.mjs`, `launch.mjs`, `cdp.mjs`, `coding-task.mjs`)
 - `extractors/` - engine-specific automation
-- `src/` - ranking/fetching/formatting internals
+- `src/` - ranking/fetching/formatting internals (includes `reddit.mjs`, `github.mjs`, `fetcher.mjs`)
 - `skills/` - Pi skill metadata
 
 ## Testing
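The metadata bullet above corresponds to the normalized record the fetcher returns for each source. The field names below come from `src/search/fetch-source.mjs` in this release; the values are invented for illustration:

// Illustrative record only; values are made up, field names match fetch-source.mjs.
const exampleRecord = {
  url: "https://example.com/article",
  finalUrl: "https://example.com/article",
  status: 200,
  contentType: "text/markdown",
  title: "Example article",
  byline: "Jane Doe",
  siteName: "Example",
  lang: "en",
  publishedTime: "2026-04-01T00:00:00.000Z",
  snippet: "First ~320 characters of the extracted content...",
  content: "...full extracted markdown/text...",
  contentChars: 5210,
  source: "http", // or "reddit-api", "github-api", "browser"
  duration: 842,
};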
package/extractors/perplexity.mjs
CHANGED
@@ -36,7 +36,9 @@ const GLOBAL_VAR = "__pplxClipboard";
 function findCopyButtonJsExpression() {
   // Perplexity uses SVG icons via <use xlink:href="#pplx-icon-copy">
   // This works across all locales since it doesn't depend on aria-label text
-  return `Array.from(document.querySelectorAll('button')).find(b => b.innerHTML.includes('#pplx-icon-copy'))`;
+  // Use .pop() to get the last matching button (the answer copy button),
+  // not the first one which is the question copy button
+  return `Array.from(document.querySelectorAll('button')).filter(b => b.innerHTML.includes('#pplx-icon-copy')).pop()`;
 }
 
 // ============================================================================
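To see why the one-line change above matters: per the changelog, the Perplexity page contains a copy button for the question and another for the answer, both carrying the same `#pplx-icon-copy` SVG reference. A reduced sketch (hypothetical data, not the real DOM) of how the old and new expressions differ:

// Hypothetical stand-in for the two matching buttons on the page.
const buttons = [
  { label: "Copy question", innerHTML: '<use xlink:href="#pplx-icon-copy">' },
  { label: "Copy answer", innerHTML: '<use xlink:href="#pplx-icon-copy">' },
];
const hasCopyIcon = (b) => b.innerHTML.includes("#pplx-icon-copy");
buttons.find(hasCopyIcon).label;         // "Copy question" (what the old selector picked)
buttons.filter(hasCopyIcon).pop().label; // "Copy answer" (what v1.8.3 picks)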
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@apmantza/greedysearch-pi",
-  "version": "1.8.2",
+  "version": "1.8.3",
   "description": "Pi extension: multi-engine AI search (Perplexity, Bing Copilot, Google AI) via browser automation -- NO API KEYS needed. Extracts answers with sources, optional Gemini synthesis. Grounded AI answers from real browser interactions.",
   "type": "module",
   "keywords": [
package/src/reddit.mjs
ADDED
|
@@ -0,0 +1,210 @@
+// src/reddit.mjs - Reddit content fetching via public JSON API
+// Reddit exposes structured data by appending .json to any URL
+
+const REDDIT_HEADERS = {
+  "user-agent": "GreedySearch/1.0 (Research Bot)",
+  accept: "application/json",
+};
+
+/**
+ * Parse a Reddit URL to check if it's a post, comment, or user profile
+ * @param {string} url
+ * @returns {{type: 'post'|'user'|'other', cleanUrl: string} | null}
+ */
+export function parseRedditUrl(url) {
+  try {
+    const parsed = new URL(url);
+    const hostname = parsed.hostname.toLowerCase();
+
+    // Support reddit.com, old.reddit.com, www.reddit.com
+    if (!hostname.endsWith("reddit.com")) {
+      return null;
+    }
+
+    const pathname = parsed.pathname;
+
+    // User profile: /u/username or /user/username
+    if (pathname.match(/^\/(u|user)\/[^/]+\/?$/i)) {
+      return { type: "user", cleanUrl: normalizeRedditUrl(url) };
+    }
+
+    // Post: /r/subreddit/comments/xxxx/...
+    if (pathname.match(/^\/r\/[^/]+\/comments\/[^/]+/i)) {
+      return { type: "post", cleanUrl: normalizeRedditUrl(url) };
+    }
+
+    return null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Normalize Reddit URL (remove query params, fragments)
+ * @param {string} url
+ * @returns {string}
+ */
+function normalizeRedditUrl(url) {
+  try {
+    const parsed = new URL(url);
+    // Reconstruct without query/fragment
+    return `${parsed.protocol}//${parsed.hostname}${parsed.pathname}`;
+  } catch {
+    return url;
+  }
+}
+
+/**
+ * Fetch Reddit content via the .json API
+ * @param {string} url - Reddit URL (will have .json appended)
+ * @param {number} maxChars - Max characters for content
+ * @returns {Promise<FetchResult>}
+ */
+export async function fetchRedditContent(url, maxChars = 8000) {
+  const start = Date.now();
+
+  try {
+    // Append .json to get API response
+    const jsonUrl = url.replace(/\/?$/, ".json");
+
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), 15000);
+
+    const response = await fetch(jsonUrl, {
+      headers: REDDIT_HEADERS,
+      signal: controller.signal,
+    });
+
+    clearTimeout(timeoutId);
+
+    if (!response.ok) {
+      throw new Error(`Reddit API ${response.status}`);
+    }
+
+    const data = await response.json();
+
+    // data[0] = post listing, data[1] = comments listing
+    if (!Array.isArray(data) || data.length < 1) {
+      throw new Error("Invalid Reddit API response structure");
+    }
+
+    const postListing = data[0];
+    const commentsListing = data[1];
+
+    // Extract post data
+    const post = postListing?.data?.children?.[0]?.data;
+    if (!post) {
+      throw new Error("No post data in Reddit response");
+    }
+
+    // Format as markdown
+    const markdown = formatRedditPost(post, commentsListing, maxChars);
+
+    return {
+      ok: true,
+      url,
+      finalUrl: url,
+      status: 200,
+      contentType: "text/markdown",
+      lastModified: "",
+      title: post.title || "Reddit Post",
+      byline: `u/${post.author}`,
+      siteName: `r/${post.subreddit}`,
+      lang: "en",
+      publishedTime: new Date(post.created_utc * 1000).toISOString(),
+      excerpt: post.selftext?.slice(0, 300).replace(/\n/g, " ") || "",
+      markdown,
+      contentLength: markdown.length,
+      needsBrowser: false,
+      duration: Date.now() - start,
+    };
+  } catch (error) {
+    return {
+      ok: false,
+      url,
+      finalUrl: url,
+      status: 0,
+      error: `Reddit fetch failed: ${error.message}`,
+      needsBrowser: false,
+      duration: Date.now() - start,
+    };
+  }
+}
+
+/**
+ * Format Reddit post and comments as clean markdown
+ * @param {object} post - Reddit post data
+ * @param {object|null} commentsListing - Comments listing data
+ * @param {number} maxChars - Max characters
+ * @returns {string}
+ */
+function formatRedditPost(post, commentsListing, maxChars) {
+  let md = "";
+
+  // Post header
+  md += `# ${post.title}\n\n`;
+  md += `**Subreddit:** r/${post.subreddit} | **Author:** u/${post.author} | **Score:** ${post.score}\n\n`;
+
+  // Post body (selftext) or link
+  if (post.selftext) {
+    md += post.selftext;
+    md += "\n\n";
+  } else if (post.url && !post.url.includes("reddit.com")) {
+    // External link post
+    md += `**Link:** ${post.url}\n\n`;
+  }
+
+  // Comments section
+  if (commentsListing?.data?.children?.length > 0) {
+    md += "---\n\n## Comments\n\n";
+    const comments = commentsListing.data.children
+      .filter((c) => c.kind === "t1") // t1 = comment
+      .slice(0, 10); // Top 10 comments
+
+    for (const comment of comments) {
+      md += formatComment(comment.data, 0);
+      md += "\n";
+    }
+  }
+
+  // Trim to maxChars while keeping structure
+  if (md.length > maxChars) {
+    md = md.slice(0, maxChars).trim() + "\n\n... (truncated)";
+  }
+
+  return md;
+}
+
+/**
+ * Format a single comment with nesting
+ * @param {object} comment - Reddit comment data
+ * @param {number} depth - Nesting depth
+ * @returns {string}
+ */
+function formatComment(comment, depth) {
+  if (
+    !comment ||
+    comment.body === "[deleted]" ||
+    comment.body === "[removed]"
+  ) {
+    return "";
+  }
+
+  const indent = "> ".repeat(depth);
+  let md = "";
+
+  md += `${indent}**u/${comment.author}** (${comment.score} pts)\n`;
+  md += `${indent}${comment.body.replace(/\n/g, "\n" + indent)}\n`;
+
+  // Handle nested replies (limit depth to 3)
+  if (depth < 3 && comment.replies?.data?.children) {
+    const replies = comment.replies.data.children.filter(
+      (r) => r.kind === "t1",
+    );
+    for (const reply of replies.slice(0, 5)) {
+      md += "\n" + formatComment(reply.data, depth + 1);
+    }
+  }
+
+  return md;
+}
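A usage sketch for the new module (not part of the package; the post URL is hypothetical and the import path assumes the package root):

import { fetchRedditContent, parseRedditUrl } from "./src/reddit.mjs";

const url = "https://www.reddit.com/r/javascript/comments/abc123/some_post/";
if (parseRedditUrl(url)?.type === "post") {
  const result = await fetchRedditContent(url, 8000);
  if (result.ok) {
    console.log(result.title, result.siteName, result.byline);
    console.log(result.markdown.slice(0, 200)); // post plus top comments as markdown
  } else {
    console.error(result.error); // callers fall back to plain HTTP fetching
  }
}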
package/src/search/fetch-source.mjs
CHANGED
@@ -1,230 +1,262 @@
-// src/search/fetch-source.mjs — HTTP and browser-based source content fetching
-//
-// Extracted from search.mjs. Uses fetchSourceHttp from src/fetcher.mjs
-// with browser fallback via CDP, plus GitHub content fetching.
-
-import { spawn } from "node:child_process";
-import { tmpdir } from "node:os";
-import { join } from "node:path";
-import { fetchSourceHttp, shouldUseBrowser } from "../fetcher.mjs";
-import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
… (the remaining 220 removed lines, including the rest of the old import block, are truncated in this diff view)
+// src/search/fetch-source.mjs — HTTP and browser-based source content fetching
+//
+// Extracted from search.mjs. Uses fetchSourceHttp from src/fetcher.mjs
+// with browser fallback via CDP, plus GitHub content fetching.
+
+import { spawn } from "node:child_process";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { fetchSourceHttp, shouldUseBrowser } from "../fetcher.mjs";
+import { fetchGitHubContent, parseGitHubUrl } from "../github.mjs";
+import { fetchRedditContent, parseRedditUrl } from "../reddit.mjs";
+import { trimContentHeadTail } from "../utils/content.mjs";
+import { cdp, closeTab, closeTabs, openNewTab } from "./chrome.mjs";
+import { SOURCE_FETCH_CONCURRENCY } from "./constants.mjs";
+import { trimText } from "./sources.mjs";
+
+export async function fetchSourceContent(url, maxChars = 8000) {
+  const start = Date.now();
+
+  // Check if it's a GitHub URL
+  if (parseGitHubUrl(url)) {
+    const parsed = parseGitHubUrl(url);
+    if (
+      parsed &&
+      (parsed.type === "root" ||
+        parsed.type === "tree" ||
+        (parsed.type === "blob" && !parsed.path?.includes(".")))
+    ) {
+      const ghResult = await fetchGitHubContent(url);
+      if (ghResult.ok) {
+        const content = trimContentHeadTail(ghResult.content, maxChars);
+        return {
+          url,
+          finalUrl: url,
+          status: 200,
+          contentType: "text/markdown",
+          lastModified: "",
+          title: ghResult.title,
+          snippet: content.slice(0, 320),
+          content,
+          contentChars: content.length,
+          source: "github-api",
+          ...(ghResult.tree && { tree: ghResult.tree }),
+          duration: Date.now() - start,
+        };
+      }
+      process.stderr.write(
+        `[greedysearch] GitHub API fetch failed, trying HTTP: ${ghResult.error}\n`,
+      );
+    }
+  }
+
+  // Check if it's a Reddit URL (posts and comments)
+  const redditInfo = parseRedditUrl(url);
+  if (redditInfo?.type === "post") {
+    process.stderr.write(
+      `[greedysearch] Using Reddit JSON API for: ${url.slice(0, 60)}...\n`,
+    );
+    const redditResult = await fetchRedditContent(url, maxChars);
+    if (redditResult.ok) {
+      const content = trimContentHeadTail(redditResult.markdown, maxChars);
+      return {
+        url,
+        finalUrl: redditResult.finalUrl,
+        status: redditResult.status,
+        contentType: "text/markdown",
+        lastModified: redditResult.lastModified || "",
+        publishedTime: redditResult.publishedTime || "",
+        byline: redditResult.byline || "",
+        siteName: redditResult.siteName || "",
+        lang: redditResult.lang || "",
+        title: redditResult.title,
+        snippet: redditResult.excerpt,
+        content,
+        contentChars: content.length,
+        source: "reddit-api",
+        duration: Date.now() - start,
+      };
+    }
+    process.stderr.write(
+      `[greedysearch] Reddit API fetch failed, falling back to HTTP: ${redditResult.error}\n`,
+    );
+  }
+
+  // Try HTTP first
+  const httpResult = await fetchSourceHttp(url, { timeoutMs: 15000 });
+
+  if (httpResult.ok) {
+    const content = trimContentHeadTail(httpResult.markdown, maxChars);
+    return {
+      url,
+      finalUrl: httpResult.finalUrl,
+      status: httpResult.status,
+      contentType: "text/markdown",
+      lastModified: httpResult.lastModified || "",
+      publishedTime: httpResult.publishedTime || "",
+      byline: httpResult.byline || "",
+      siteName: httpResult.siteName || "",
+      lang: httpResult.lang || "",
+      title: httpResult.title,
+      snippet: httpResult.excerpt,
+      content,
+      contentChars: content.length,
+      source: "http",
+      duration: Date.now() - start,
+    };
+  }
+
+  // HTTP failed — fall back to browser
+  process.stderr.write(
+    `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
+  );
+  return await fetchSourceContentBrowser(url, maxChars);
+}
+
+async function fetchSourceContentBrowser(url, maxChars = 8000) {
+  const start = Date.now();
+  const tab = await openNewTab();
+
+  try {
+    await cdp(["nav", tab, url], 30000);
+    await new Promise((r) => setTimeout(r, 1500));
+
+    const content = await cdp([
+      "eval",
+      tab,
+      `
+      (function(){
+        var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
+        var text = (el || document.body).innerText;
+        return JSON.stringify({
+          title: document.title,
+          content: text.replace(/\\s+/g, ' ').trim(),
+          url: location.href
+        });
+      })()
+      `,
+    ]);
+
+    const parsed = JSON.parse(content);
+    const finalContent = trimContentHeadTail(parsed.content, maxChars);
+
+    return {
+      url,
+      finalUrl: parsed.url || url,
+      status: 200,
+      contentType: "text/plain",
+      lastModified: "",
+      title: parsed.title,
+      snippet: trimText(finalContent, 320),
+      content: finalContent,
+      contentChars: finalContent.length,
+      source: "browser",
+      duration: Date.now() - start,
+    };
+  } catch (error) {
+    return {
+      url,
+      title: "",
+      content: null,
+      snippet: "",
+      contentChars: 0,
+      error: error.message,
+      source: "browser",
+      duration: Date.now() - start,
+    };
+  } finally {
+    await closeTab(tab);
+  }
+}
+
+export async function fetchMultipleSources(
+  sources,
+  maxSources = 5,
+  maxChars = 8000,
+  concurrency = SOURCE_FETCH_CONCURRENCY,
+) {
+  const toFetch = sources.slice(0, maxSources);
+  if (toFetch.length === 0) return [];
+
+  const workerCount = Math.min(
+    toFetch.length,
+    Math.max(1, parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY),
+  );
+
+  process.stderr.write(
+    `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
+  );
+
+  const fetched = new Array(toFetch.length);
+  let nextIndex = 0;
+  let completed = 0;
+
+  async function worker() {
+    while (true) {
+      const index = nextIndex++;
+      if (index >= toFetch.length) return;
+
+      const s = toFetch[index];
+      const url = s.canonicalUrl || s.url;
+      process.stderr.write(
+        `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
+      );
+
+      const result = await fetchSourceContent(url, maxChars);
+      fetched[index] = {
+        id: s.id,
+        ...result,
+      };
+
+      if (result.content && result.content.length > 100) {
+        process.stderr.write(
+          `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
+        );
+      } else if (result.error) {
+        process.stderr.write(`[greedysearch] ✗ ${result.error.slice(0, 80)}\n`);
+      }
+
+      completed += 1;
+      process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
+    }
+  }
+
+  await Promise.all(Array.from({ length: workerCount }, () => worker()));
+
+  // Log summary
+  const successful = fetched.filter((f) => f.content && f.content.length > 100);
+  const httpCount = fetched.filter((f) => f.source === "http").length;
+  const browserCount = fetched.filter((f) => f.source === "browser").length;
+
+  process.stderr.write(
+    `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
+      `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
+  );
+
+  return fetched;
+}
+
+export async function fetchTopSource(url) {
+  const tab = await openNewTab();
+  await cdp(["list"]); // refresh cache
+  try {
+    await cdp(["nav", tab, url], 30000);
+    await new Promise((r) => setTimeout(r, 1500));
+    const content = await cdp([
+      "eval",
+      tab,
+      `
+      (function(){
+        var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
+        var text = (el || document.body).innerText;
+        return text.replace(/\\s+/g, ' ').trim();
+      })()
+      `,
+    ]);
+    return { url, content };
+  } catch (e) {
+    return { url, content: null, error: e.message };
+  } finally {
+    await closeTab(tab);
+  }
+}
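A closing usage sketch for the rewritten module (illustrative only; the source objects and URLs are hypothetical, while the function signatures are the ones shown in the diff above):

import { fetchMultipleSources, fetchSourceContent } from "./src/search/fetch-source.mjs";

// Single source: GitHub and Reddit post URLs are routed to their APIs first,
// everything else goes through HTTP extraction with a browser fallback.
const one = await fetchSourceContent("https://www.reddit.com/r/node/comments/abc123/post/");
console.log(one.source, one.contentChars); // e.g. "reddit-api", 4096

// Batch: hypothetical search results with id/url fields, fetched with a small worker pool.
const sources = [
  { id: "s1", url: "https://example.com/article" },
  { id: "s2", url: "https://github.com/example/example-repo" },
];
const fetched = await fetchMultipleSources(sources, 5, 8000);
for (const f of fetched) {
  console.log(f.id, f.source, f.title);
}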