@apmantza/greedysearch-pi 1.8.3 → 1.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/bin/launch.mjs +366 -366
- package/bin/search.mjs +388 -388
- package/extractors/common.mjs +291 -291
- package/extractors/gemini.mjs +146 -146
- package/extractors/google-ai.mjs +125 -125
- package/extractors/perplexity.mjs +147 -147
- package/extractors/selectors.mjs +54 -54
- package/index.ts +256 -278
- package/package.json +1 -1
- package/src/github.mjs +237 -237
- package/src/reddit.mjs +210 -210
- package/src/search/chrome.mjs +222 -222
- package/src/search/constants.mjs +37 -37
- package/src/search/defaults.mjs +14 -14
- package/src/search/engines.mjs +62 -62
- package/src/search/fetch-source.mjs +262 -262
- package/src/search/output.mjs +58 -58
- package/src/search/sources.mjs +445 -445
- package/src/search/synthesis-runner.mjs +63 -63
- package/src/search/synthesis.mjs +223 -223
- package/src/tools/deep-research-handler.ts +36 -36
- package/src/tools/greedy-search-handler.ts +53 -57
- package/src/tools/shared.ts +135 -130
- package/src/types.ts +103 -103
- package/test.mjs +423 -377
package/src/reddit.mjs
CHANGED
|
@@ -1,210 +1,210 @@
|
|
|
1
|
-
// src/reddit.mjs - Reddit content fetching via public JSON API
|
|
2
|
-
// Reddit exposes structured data by appending .json to any URL
|
|
3
|
-
|
|
4
|
-
/**
 * Headers attached to every request against Reddit's JSON endpoints:
 * an identifying user agent and an explicit request for JSON.
 */
const REDDIT_HEADERS = {
  "user-agent": "GreedySearch/1.0 (Research Bot)",
  "accept": "application/json",
};
|
|
8
|
-
|
|
9
|
-
/**
 * Classify a URL as a Reddit post or a Reddit user profile.
 *
 * Only `reddit.com` itself and its subdomains (`www.`, `old.`, ...) are
 * accepted; look-alike hosts such as `notreddit.com` are rejected.
 *
 * @param {string} url - Absolute URL to inspect
 * @returns {{type: 'post'|'user', cleanUrl: string} | null} The page kind plus
 *   the URL as returned by `normalizeRedditUrl`, or `null` when the URL cannot
 *   be parsed, is not on Reddit, or is neither a post nor a user profile
 */
export function parseRedditUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  const hostname = parsed.hostname.toLowerCase();

  // Exact or dot-anchored match only: a bare endsWith("reddit.com") would
  // also accept unrelated domains such as "notreddit.com".
  if (hostname !== "reddit.com" && !hostname.endsWith(".reddit.com")) {
    return null;
  }

  const { pathname } = parsed;

  // User profile: /u/username or /user/username (optional trailing slash)
  if (/^\/(u|user)\/[^/]+\/?$/i.test(pathname)) {
    return { type: "user", cleanUrl: normalizeRedditUrl(url) };
  }

  // Post: /r/subreddit/comments/<id>/... (anything may follow the id)
  if (/^\/r\/[^/]+\/comments\/[^/]+/i.test(pathname)) {
    return { type: "post", cleanUrl: normalizeRedditUrl(url) };
  }

  return null;
}
|
|
41
|
-
|
|
42
|
-
/**
 * Reduce a URL to its scheme, host name and path.
 *
 * The query string and fragment are dropped; because the result is rebuilt
 * from `hostname`, an explicit port or embedded credentials are dropped too.
 * Input that cannot be parsed as a URL is handed back untouched.
 *
 * @param {string} url - URL to normalize
 * @returns {string} The stripped URL, or `url` itself when parsing fails
 */
function normalizeRedditUrl(url) {
  let parts;
  try {
    parts = new URL(url);
  } catch {
    return url;
  }

  const { protocol, hostname, pathname } = parts;
  return [protocol, "//", hostname, pathname].join("");
}
|
|
56
|
-
|
|
57
|
-
/**
 * Build the `.json` endpoint for a Reddit page URL.
 *
 * The suffix is applied to the path only, so an existing query string is kept
 * intact and the fragment is dropped. Throws if `url` is not an absolute URL.
 *
 * @param {string} url - Reddit page URL
 * @returns {string} URL of the JSON representation of that page
 */
function buildRedditJsonUrl(url) {
  const target = new URL(url);
  const path = target.pathname.replace(/\/+$/, "");
  target.pathname = path.endsWith(".json") ? path : `${path}.json`;
  target.hash = "";
  return target.toString();
}

/**
 * Fetch a Reddit post through the public `.json` API and convert it to markdown.
 *
 * Never rejects: any failure (bad URL, network error, timeout, non-2xx status,
 * unexpected payload) is reported as a result object with `ok: false`,
 * `status: 0` and an `error` message.
 *
 * @param {string} url - Reddit URL (`.json` is appended to its path)
 * @param {number} maxChars - Max characters for the markdown content
 * @returns {Promise<FetchResult>} Success result with post metadata and
 *   markdown, or a failure result describing what went wrong
 */
export async function fetchRedditContent(url, maxChars = 8000) {
  const start = Date.now();
  const requestTimeoutMs = 15000;

  try {
    const jsonUrl = buildRedditJsonUrl(url);

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), requestTimeoutMs);

    let data;
    try {
      const response = await fetch(jsonUrl, {
        headers: REDDIT_HEADERS,
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Reddit API ${response.status}`);
      }

      // Still inside the timeout window so a stalled body is aborted too.
      data = await response.json();
    } finally {
      // Always release the timer, otherwise a failed request keeps it
      // pending for the full timeout.
      clearTimeout(timeoutId);
    }

    // data[0] = post listing, data[1] = comments listing
    if (!Array.isArray(data) || data.length < 1) {
      throw new Error("Invalid Reddit API response structure");
    }

    const [postListing, commentsListing] = data;

    const post = postListing?.data?.children?.[0]?.data;
    if (!post) {
      throw new Error("No post data in Reddit response");
    }

    const markdown = formatRedditPost(post, commentsListing, maxChars);

    // A missing/invalid timestamp must not fail the fetch: toISOString()
    // throws a RangeError on an invalid Date.
    const publishedTime = Number.isFinite(post.created_utc)
      ? new Date(post.created_utc * 1000).toISOString()
      : "";

    return {
      ok: true,
      url,
      finalUrl: url,
      status: 200,
      contentType: "text/markdown",
      lastModified: "",
      title: post.title || "Reddit Post",
      byline: `u/${post.author}`,
      siteName: `r/${post.subreddit}`,
      lang: "en",
      publishedTime,
      excerpt: post.selftext?.slice(0, 300).replace(/\n/g, " ") || "",
      markdown,
      contentLength: markdown.length,
      needsBrowser: false,
      duration: Date.now() - start,
    };
  } catch (error) {
    return {
      ok: false,
      url,
      finalUrl: url,
      status: 0,
      error: `Reddit fetch failed: ${error?.message ?? String(error)}`,
      needsBrowser: false,
      duration: Date.now() - start,
    };
  }
}
|
|
133
|
-
|
|
134
|
-
/**
 * Render a Reddit post and its top-level comments as markdown.
 *
 * Layout: title heading, a subreddit/author/score line, then the self text
 * (or, for a link post pointing outside reddit.com, the link), followed by a
 * "Comments" section. That section is only emitted when at least one of the
 * first 10 `t1` (comment) children actually renders; deleted/removed comments
 * and non-comment entries such as "more" stubs contribute nothing.
 *
 * @param {object} post - Reddit post data (`title`, `subreddit`, `author`,
 *   `score`, `selftext`, `url` are read)
 * @param {object|null} commentsListing - Comments listing data
 * @param {number} maxChars - Max characters kept before the truncation marker
 * @returns {string} Markdown; when longer than `maxChars` it is cut there and
 *   "... (truncated)" is appended
 */
function formatRedditPost(post, commentsListing, maxChars) {
  // Post header
  let md = `# ${post.title}\n\n`;
  md += `**Subreddit:** r/${post.subreddit} | **Author:** u/${post.author} | **Score:** ${post.score}\n\n`;

  // Post body (selftext) or, for external link posts, the link itself
  if (post.selftext) {
    md += `${post.selftext}\n\n`;
  } else if (post.url && !post.url.includes("reddit.com")) {
    md += `**Link:** ${post.url}\n\n`;
  }

  // Comments section
  const children = commentsListing?.data?.children;
  if (Array.isArray(children)) {
    const rendered = children
      .filter((child) => child?.kind === "t1") // t1 = comment
      .slice(0, 10) // Top 10 comments
      .map((child) => formatComment(child.data, 0))
      .filter((text) => text !== ""); // deleted/removed comments render empty

    // Skip the heading entirely when nothing survived the filtering.
    if (rendered.length > 0) {
      md += "---\n\n## Comments\n\n";
      md += rendered.map((text) => `${text}\n`).join("");
    }
  }

  // Trim to maxChars, flagging that content was cut
  if (md.length > maxChars) {
    md = `${md.slice(0, maxChars).trim()}\n\n... (truncated)`;
  }

  return md;
}
|
|
177
|
-
|
|
178
|
-
/**
 * Render one comment, and up to five `t1` replies per level, as markdown.
 *
 * Each nesting level is prefixed with one more "> " so replies appear as
 * nested block quotes. Replies are followed until depth 3 has been rendered.
 * Comments that are missing, deleted/removed, or have no string body render
 * as the empty string and add no separator to their parent.
 *
 * @param {object} comment - Reddit comment data (`author`, `score`, `body`,
 *   `replies` are read)
 * @param {number} depth - Nesting depth of this comment (0 = top level)
 * @returns {string} Markdown for the comment and its replies, or "" when the
 *   comment is not renderable
 */
function formatComment(comment, depth) {
  const body = comment?.body;
  if (typeof body !== "string" || body === "[deleted]" || body === "[removed]") {
    return "";
  }

  const indent = "> ".repeat(depth);

  let md = `${indent}**u/${comment.author}** (${comment.score} pts)\n`;
  // Re-apply the quote prefix after every line break of a multi-line body.
  md += `${indent}${body.replace(/\n/g, `\n${indent}`)}\n`;

  // Handle nested replies (limit depth to 3). `replies` is not always a
  // listing object, hence the optional chaining and array check.
  const children = comment.replies?.data?.children;
  if (depth < 3 && Array.isArray(children)) {
    const replies = children.filter((reply) => reply?.kind === "t1").slice(0, 5);
    for (const reply of replies) {
      const rendered = formatComment(reply.data, depth + 1);
      // Skip deleted/removed replies entirely instead of leaving a blank line.
      if (rendered !== "") {
        md += `\n${rendered}`;
      }
    }
  }

  return md;
}
|
|
1
|
+
// src/reddit.mjs - Reddit content fetching via public JSON API
|
|
2
|
+
// Reddit exposes structured data by appending .json to any URL
|
|
3
|
+
|
|
4
|
+
/**
 * Headers attached to every request against Reddit's JSON endpoints:
 * an identifying user agent and an explicit request for JSON.
 */
const REDDIT_HEADERS = {
  "user-agent": "GreedySearch/1.0 (Research Bot)",
  "accept": "application/json",
};
|
|
8
|
+
|
|
9
|
+
/**
 * Classify a URL as a Reddit post or a Reddit user profile.
 *
 * Only `reddit.com` itself and its subdomains (`www.`, `old.`, ...) are
 * accepted; look-alike hosts such as `notreddit.com` are rejected.
 *
 * @param {string} url - Absolute URL to inspect
 * @returns {{type: 'post'|'user', cleanUrl: string} | null} The page kind plus
 *   the URL as returned by `normalizeRedditUrl`, or `null` when the URL cannot
 *   be parsed, is not on Reddit, or is neither a post nor a user profile
 */
export function parseRedditUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  const hostname = parsed.hostname.toLowerCase();

  // Exact or dot-anchored match only: a bare endsWith("reddit.com") would
  // also accept unrelated domains such as "notreddit.com".
  if (hostname !== "reddit.com" && !hostname.endsWith(".reddit.com")) {
    return null;
  }

  const { pathname } = parsed;

  // User profile: /u/username or /user/username (optional trailing slash)
  if (/^\/(u|user)\/[^/]+\/?$/i.test(pathname)) {
    return { type: "user", cleanUrl: normalizeRedditUrl(url) };
  }

  // Post: /r/subreddit/comments/<id>/... (anything may follow the id)
  if (/^\/r\/[^/]+\/comments\/[^/]+/i.test(pathname)) {
    return { type: "post", cleanUrl: normalizeRedditUrl(url) };
  }

  return null;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Reduce a URL to its scheme, host name and path.
 *
 * The query string and fragment are dropped; because the result is rebuilt
 * from `hostname`, an explicit port or embedded credentials are dropped too.
 * Input that cannot be parsed as a URL is handed back untouched.
 *
 * @param {string} url - URL to normalize
 * @returns {string} The stripped URL, or `url` itself when parsing fails
 */
function normalizeRedditUrl(url) {
  let parts;
  try {
    parts = new URL(url);
  } catch {
    return url;
  }

  const { protocol, hostname, pathname } = parts;
  return [protocol, "//", hostname, pathname].join("");
}
|
|
56
|
+
|
|
57
|
+
/**
 * Build the `.json` endpoint for a Reddit page URL.
 *
 * The suffix is applied to the path only, so an existing query string is kept
 * intact and the fragment is dropped. Throws if `url` is not an absolute URL.
 *
 * @param {string} url - Reddit page URL
 * @returns {string} URL of the JSON representation of that page
 */
function buildRedditJsonUrl(url) {
  const target = new URL(url);
  const path = target.pathname.replace(/\/+$/, "");
  target.pathname = path.endsWith(".json") ? path : `${path}.json`;
  target.hash = "";
  return target.toString();
}

/**
 * Fetch a Reddit post through the public `.json` API and convert it to markdown.
 *
 * Never rejects: any failure (bad URL, network error, timeout, non-2xx status,
 * unexpected payload) is reported as a result object with `ok: false`,
 * `status: 0` and an `error` message.
 *
 * @param {string} url - Reddit URL (`.json` is appended to its path)
 * @param {number} maxChars - Max characters for the markdown content
 * @returns {Promise<FetchResult>} Success result with post metadata and
 *   markdown, or a failure result describing what went wrong
 */
export async function fetchRedditContent(url, maxChars = 8000) {
  const start = Date.now();
  const requestTimeoutMs = 15000;

  try {
    const jsonUrl = buildRedditJsonUrl(url);

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), requestTimeoutMs);

    let data;
    try {
      const response = await fetch(jsonUrl, {
        headers: REDDIT_HEADERS,
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Reddit API ${response.status}`);
      }

      // Still inside the timeout window so a stalled body is aborted too.
      data = await response.json();
    } finally {
      // Always release the timer, otherwise a failed request keeps it
      // pending for the full timeout.
      clearTimeout(timeoutId);
    }

    // data[0] = post listing, data[1] = comments listing
    if (!Array.isArray(data) || data.length < 1) {
      throw new Error("Invalid Reddit API response structure");
    }

    const [postListing, commentsListing] = data;

    const post = postListing?.data?.children?.[0]?.data;
    if (!post) {
      throw new Error("No post data in Reddit response");
    }

    const markdown = formatRedditPost(post, commentsListing, maxChars);

    // A missing/invalid timestamp must not fail the fetch: toISOString()
    // throws a RangeError on an invalid Date.
    const publishedTime = Number.isFinite(post.created_utc)
      ? new Date(post.created_utc * 1000).toISOString()
      : "";

    return {
      ok: true,
      url,
      finalUrl: url,
      status: 200,
      contentType: "text/markdown",
      lastModified: "",
      title: post.title || "Reddit Post",
      byline: `u/${post.author}`,
      siteName: `r/${post.subreddit}`,
      lang: "en",
      publishedTime,
      excerpt: post.selftext?.slice(0, 300).replace(/\n/g, " ") || "",
      markdown,
      contentLength: markdown.length,
      needsBrowser: false,
      duration: Date.now() - start,
    };
  } catch (error) {
    return {
      ok: false,
      url,
      finalUrl: url,
      status: 0,
      error: `Reddit fetch failed: ${error?.message ?? String(error)}`,
      needsBrowser: false,
      duration: Date.now() - start,
    };
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * Render a Reddit post and its top-level comments as markdown.
 *
 * Layout: title heading, a subreddit/author/score line, then the self text
 * (or, for a link post pointing outside reddit.com, the link), followed by a
 * "Comments" section. That section is only emitted when at least one of the
 * first 10 `t1` (comment) children actually renders; deleted/removed comments
 * and non-comment entries such as "more" stubs contribute nothing.
 *
 * @param {object} post - Reddit post data (`title`, `subreddit`, `author`,
 *   `score`, `selftext`, `url` are read)
 * @param {object|null} commentsListing - Comments listing data
 * @param {number} maxChars - Max characters kept before the truncation marker
 * @returns {string} Markdown; when longer than `maxChars` it is cut there and
 *   "... (truncated)" is appended
 */
function formatRedditPost(post, commentsListing, maxChars) {
  // Post header
  let md = `# ${post.title}\n\n`;
  md += `**Subreddit:** r/${post.subreddit} | **Author:** u/${post.author} | **Score:** ${post.score}\n\n`;

  // Post body (selftext) or, for external link posts, the link itself
  if (post.selftext) {
    md += `${post.selftext}\n\n`;
  } else if (post.url && !post.url.includes("reddit.com")) {
    md += `**Link:** ${post.url}\n\n`;
  }

  // Comments section
  const children = commentsListing?.data?.children;
  if (Array.isArray(children)) {
    const rendered = children
      .filter((child) => child?.kind === "t1") // t1 = comment
      .slice(0, 10) // Top 10 comments
      .map((child) => formatComment(child.data, 0))
      .filter((text) => text !== ""); // deleted/removed comments render empty

    // Skip the heading entirely when nothing survived the filtering.
    if (rendered.length > 0) {
      md += "---\n\n## Comments\n\n";
      md += rendered.map((text) => `${text}\n`).join("");
    }
  }

  // Trim to maxChars, flagging that content was cut
  if (md.length > maxChars) {
    md = `${md.slice(0, maxChars).trim()}\n\n... (truncated)`;
  }

  return md;
}
|
|
177
|
+
|
|
178
|
+
/**
 * Render one comment, and up to five `t1` replies per level, as markdown.
 *
 * Each nesting level is prefixed with one more "> " so replies appear as
 * nested block quotes. Replies are followed until depth 3 has been rendered.
 * Comments that are missing, deleted/removed, or have no string body render
 * as the empty string and add no separator to their parent.
 *
 * @param {object} comment - Reddit comment data (`author`, `score`, `body`,
 *   `replies` are read)
 * @param {number} depth - Nesting depth of this comment (0 = top level)
 * @returns {string} Markdown for the comment and its replies, or "" when the
 *   comment is not renderable
 */
function formatComment(comment, depth) {
  const body = comment?.body;
  if (typeof body !== "string" || body === "[deleted]" || body === "[removed]") {
    return "";
  }

  const indent = "> ".repeat(depth);

  let md = `${indent}**u/${comment.author}** (${comment.score} pts)\n`;
  // Re-apply the quote prefix after every line break of a multi-line body.
  md += `${indent}${body.replace(/\n/g, `\n${indent}`)}\n`;

  // Handle nested replies (limit depth to 3). `replies` is not always a
  // listing object, hence the optional chaining and array check.
  const children = comment.replies?.data?.children;
  if (depth < 3 && Array.isArray(children)) {
    const replies = children.filter((reply) => reply?.kind === "t1").slice(0, 5);
    for (const reply of replies) {
      const rendered = formatComment(reply.data, depth + 1);
      // Skip deleted/removed replies entirely instead of leaving a blank line.
      if (rendered !== "") {
        md += `\n${rendered}`;
      }
    }
  }

  return md;
}
|