pi-web-access 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/rsc-extract.ts ADDED
@@ -0,0 +1,338 @@
1
+ /**
2
+ * RSC Content Extractor
3
+ *
4
+ * Extracts readable content from Next.js React Server Components (RSC) flight payloads.
5
+ * RSC pages embed content as JSON in <script>self.__next_f.push([...])</script> tags.
6
+ */
7
+
8
/** Readable content recovered from a Next.js RSC flight payload. */
export interface RSCExtractResult {
  /** Text of the page's `<title>` up to the first `|`, trimmed; empty when no title is found. */
  title: string;
  /** Page body rendered as Markdown. */
  content: string;
}
12
+
13
/**
 * Extracts readable Markdown from the React Server Components (RSC) flight
 * payload that Next.js embeds in a page as
 * `<script>self.__next_f.push([1,"..."])</script>` tags.
 *
 * Steps:
 *  1. Decode every pushed string and split it into `id:payload` rows.
 *  2. Render chunk "23" (the usual main-content chunk) to Markdown, resolving
 *     `$L<id>` references to other chunks on the fly.
 *  3. If that yields 100 characters or fewer, fall back to rendering every
 *     other chunk, dropping short or "not found" fragments and de-duplicating.
 *
 * @param html Full HTML source of the page.
 * @returns The page title and Markdown content, or `null` when the page has
 *          no RSC payload or no substantial content could be recovered.
 */
export function extractRSCContent(html: string): RSCExtractResult | null {
  if (!html.includes("self.__next_f.push")) {
    return null;
  }

  // Tags that never carry article content. Built once per call rather than
  // once per visited element.
  const skipTags = new Set([
    "script", "style", "svg", "path", "circle", "link", "meta",
    "template", "button", "input", "nav", "footer", "aside",
  ]);

  // Parse all RSC chunks into a map
  const chunkMap = new Map<string, string>();
  // `(?:\s[^>]*)?` tolerates attributes on the tag (e.g. a CSP `nonce`), which
  // a bare `<script>` pattern would miss entirely.
  const scriptRegex = /<script(?:\s[^>]*)?>self\.__next_f\.push\(\[1,"([\s\S]*?)"\]\)<\/script>/g;

  for (const match of html.matchAll(scriptRegex)) {
    let content: string;
    try {
      // The captured text is the inside of a JS string literal; re-wrap it in
      // quotes so JSON.parse undoes the escaping.
      content = JSON.parse('"' + match[1] + '"');
    } catch {
      continue;
    }

    // Parse each line as "id:payload"
    // Lines are separated by \n, each line is one chunk
    // Chunk IDs are hex strings, typically 1-4 chars (supports up to 65535 chunks)
    for (const line of content.split("\n")) {
      if (!line.trim()) continue;

      const colonIdx = line.indexOf(":");
      if (colonIdx <= 0 || colonIdx > 4) continue;

      const id = line.slice(0, colonIdx);
      if (!/^[0-9a-f]+$/i.test(id)) continue;

      const payload = line.slice(colonIdx + 1);
      if (!payload) continue;

      // When an id shows up more than once, keep the longest payload.
      const existing = chunkMap.get(id);
      if (!existing || payload.length > existing.length) {
        chunkMap.set(id, payload);
      }
    }
  }

  if (chunkMap.size === 0) return null;

  // Extract title (text before the first "|")
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
  const title = titleMatch?.[1]?.split("|")[0]?.trim() || "";

  // Parse and cache parsed chunks
  const parsedCache = new Map<string, unknown>();

  /** Returns the JSON-parsed chunk for `id`, or null if missing, not a JSON array, or malformed. */
  function getParsedChunk(id: string): unknown | null {
    if (parsedCache.has(id)) return parsedCache.get(id);

    const chunk = chunkMap.get(id);
    if (!chunk || !chunk.startsWith("[")) {
      parsedCache.set(id, null);
      return null;
    }

    try {
      const parsed = JSON.parse(chunk);
      parsedCache.set(id, parsed);
      return parsed;
    } catch {
      parsedCache.set(id, null);
      return null;
    }
  }

  // Extract markdown from nodes, resolving refs on the fly
  type Node = unknown;
  // Ids of the references currently being expanded (the recursion stack);
  // used to break reference cycles.
  const visitedRefs = new Set<string>();

  /** Renders an RSC node (string, number, element tuple, or child array) to Markdown. */
  function extractNode(node: Node, ctx = { inTable: false, inCode: false }): string {
    if (node === null || node === undefined) return "";

    if (typeof node === "string") {
      // Check if it's a reference like "$L30"
      const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
      if (refMatch) {
        const refId = refMatch[1];
        if (visitedRefs.has(refId)) return ""; // Prevent cycles
        visitedRefs.add(refId);
        const refNode = getParsedChunk(refId);
        const result = refNode ? extractNode(refNode, ctx) : "";
        visitedRefs.delete(refId);
        return result;
      }
      // Filter out RSC-specific artifacts, but preserve content inside code blocks
      if (!ctx.inCode && (node === "$undefined" || node === "$" || /^\$[A-Z]/.test(node))) return "";
      return node.trim() ? node : "";
    }

    if (typeof node === "number") return String(node);
    if (typeof node === "boolean") return "";
    if (!Array.isArray(node)) return "";

    // RSC element: ["$", "tag", key, props]
    if (node[0] === "$" && typeof node[1] === "string") {
      const tag: string = node[1];
      const props = (node[3] || {}) as Record<string, unknown>;

      // Skip non-content
      if (skipTags.has(tag)) return "";

      // Component ref like $L25
      if (tag.startsWith("$L")) {
        const refId = tag.slice(2);
        if (visitedRefs.has(refId)) return "";

        // Check for heading components with baseId
        if (props.baseId && props.children) {
          const headingChildren = props.children;
          // Plain strings are used verbatim. Anything else (arrays, nested
          // elements) is rendered, because String() on those would produce
          // "$,span,,[object Object]"-style garbage.
          const headingText = typeof headingChildren === "string"
            ? headingChildren
            : extractNode(headingChildren as Node, ctx).trim();
          return `## ${headingText}\n\n`;
        }

        visitedRefs.add(refId);
        const refNode = getParsedChunk(refId);
        let result = "";
        if (refNode) {
          result = extractNode(refNode, ctx);
        } else if (props.children) {
          result = extractNode(props.children as Node, ctx);
        }
        visitedRefs.delete(refId);
        return result;
      }

      const children = props.children;

      // These tags render their children themselves, so handle them before
      // the generic child rendering to avoid a wasted traversal.
      if (tag === "table") return extractTable(node as unknown[]) + "\n";
      if (tag === "code" || tag === "pre") {
        const codeText = children ? extractNode(children as Node, { ...ctx, inCode: true }) : "";
        return tag === "code" ? `\`${codeText}\`` : "```\n" + codeText + "\n```\n\n";
      }

      const content = children ? extractNode(children as Node, ctx) : "";

      switch (tag) {
        case "h1": return `# ${content.trim()}\n\n`;
        case "h2": return `## ${content.trim()}\n\n`;
        case "h3": return `### ${content.trim()}\n\n`;
        case "h4": return `#### ${content.trim()}\n\n`;
        case "h5": return `##### ${content.trim()}\n\n`;
        case "h6": return `###### ${content.trim()}\n\n`;
        case "p": return ctx.inTable ? content : `${content.trim()}\n\n`;
        case "strong": case "b": return `**${content}**`;
        case "em": case "i": return `*${content}*`;
        case "li": return `- ${content.trim()}\n`;
        case "ul": case "ol": return content + "\n";
        case "blockquote": return `> ${content.trim()}\n\n`;
        case "thead": case "tbody": case "tr": case "th": case "td":
          return content;
        case "div":
          if (props.role === "alert" || props["data-slot"] === "alert") {
            return `> ${content.trim()}\n\n`;
          }
          return content;
        case "a": {
          const href = props.href as string | undefined;
          // In-page anchors ("#...") are rendered as plain text.
          return href && !href.startsWith("#") ? `[${content}](${href})` : content;
        }
        default: return content;
      }
    }

    // Array of child nodes
    return (node as Node[]).map(n => extractNode(n, ctx)).join("");
  }

  /** Renders a `["$", "table", key, props]` element as a Markdown pipe table. */
  function extractTable(tableNode: unknown[]): string {
    const props = (tableNode[3] || {}) as Record<string, unknown>;
    const rows: string[][] = [];
    let headerRowCount = 0;

    /** Collects rows from the table subtree; `isHeader` is true beneath `thead`. */
    function walkTable(node: unknown, isHeader = false): void {
      if (node === null || node === undefined) return;

      // Handle string refs
      if (typeof node === "string") {
        const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
        if (refMatch && !visitedRefs.has(refMatch[1])) {
          visitedRefs.add(refMatch[1]);
          const refNode = getParsedChunk(refMatch[1]);
          if (refNode) walkTable(refNode, isHeader);
          visitedRefs.delete(refMatch[1]);
        }
        return;
      }

      if (!Array.isArray(node)) return;

      // Only treat the array as an element when the tag really is a string;
      // a child list such as ["$", 5] would otherwise throw on
      // `tag.startsWith`.
      if (node[0] === "$" && typeof node[1] === "string") {
        const tag: string = node[1];
        const nodeProps = (node[3] || {}) as Record<string, unknown>;

        // Handle component refs
        if (tag.startsWith("$L")) {
          const refId = tag.slice(2);
          if (!visitedRefs.has(refId)) {
            visitedRefs.add(refId);
            const refNode = getParsedChunk(refId);
            if (refNode) walkTable(refNode, isHeader);
            visitedRefs.delete(refId);
          }
          return;
        }

        if (tag === "thead") walkTable(nodeProps.children, true);
        else if (tag === "tbody") walkTable(nodeProps.children, false);
        else if (tag === "tr") {
          const cells: string[] = [];
          walkCells(nodeProps.children, cells);
          if (cells.length > 0) {
            rows.push(cells);
            if (isHeader) headerRowCount++;
          }
        } else walkTable(nodeProps.children, isHeader);
      } else {
        for (const child of node) walkTable(child, isHeader);
      }
    }

    /** Appends the text of every `td`/`th` found under `node` to `cells`. */
    function walkCells(node: unknown, cells: string[]): void {
      if (node === null || node === undefined) return;

      // Handle string refs
      if (typeof node === "string") {
        const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
        if (refMatch && !visitedRefs.has(refMatch[1])) {
          visitedRefs.add(refMatch[1]);
          const refNode = getParsedChunk(refMatch[1]);
          if (refNode) walkCells(refNode, cells);
          visitedRefs.delete(refMatch[1]);
        }
        return;
      }

      if (!Array.isArray(node)) return;

      if (node[0] === "$" && (node[1] === "td" || node[1] === "th")) {
        const cellProps = (node[3] || {}) as Record<string, unknown>;
        const text = extractNode(cellProps.children, { inTable: true, inCode: false })
          .trim()
          .replace(/\n/g, " ")
          .replace(/\\/g, "\\\\") // Escape backslashes first
          .replace(/\|/g, "\\|"); // Then escape pipes
        cells.push(text);
      } else if (node[0] === "$" && typeof node[1] === "string" && (node[1] as string).startsWith("$L")) {
        // Component ref for a cell
        const refId = (node[1] as string).slice(2);
        if (!visitedRefs.has(refId)) {
          visitedRefs.add(refId);
          const refNode = getParsedChunk(refId);
          if (refNode) walkCells(refNode, cells);
          visitedRefs.delete(refId);
        }
      } else {
        for (const child of node) walkCells(child, cells);
      }
    }

    walkTable(props.children);
    if (rows.length === 0) return "";

    // Pad short rows so every row has the same number of columns.
    const colCount = Math.max(...rows.map(r => r.length));
    let md = "";
    for (let i = 0; i < rows.length; i++) {
      const row = rows[i].concat(Array(colCount - rows[i].length).fill(""));
      md += "| " + row.join(" | ") + " |\n";
      // The separator goes after the last header row, or after the first row
      // when the table has no header rows.
      if (i === headerRowCount - 1 || (headerRowCount === 0 && i === 0)) {
        md += "| " + Array(colCount).fill("---").join(" | ") + " |\n";
      }
    }
    return md;
  }

  // Process main content chunk (usually "23")
  const mainChunk = getParsedChunk("23");

  if (mainChunk) {
    const content = extractNode(mainChunk);
    if (content.trim().length > 100) {
      const cleaned = content
        .replace(/\n{3,}/g, "\n\n")
        .trim();
      return { title, content: cleaned };
    }
  }

  // Fallback: try other chunks
  const contentParts: { order: number; text: string }[] = [];

  for (const [id] of chunkMap) {
    if (id === "23") continue;
    const parsed = getParsedChunk(id);
    if (!parsed) continue;

    visitedRefs.clear();
    const text = extractNode(parsed);

    // Drop short fragments and anything that looks like a not-found page.
    if (text.trim().length > 50 &&
        !text.includes("page was not found") &&
        !text.includes("404")) {
      contentParts.push({ order: parseInt(id, 16), text: text.trim() });
    }
  }

  if (contentParts.length === 0) return null;

  // Order fragments by numeric chunk id.
  contentParts.sort((a, b) => a.order - b.order);

  // De-duplicate fragments that share their first 150 characters.
  const seen = new Set<string>();
  const uniqueParts: string[] = [];
  for (const part of contentParts) {
    const key = part.text.slice(0, 150);
    if (!seen.has(key)) {
      seen.add(key);
      uniqueParts.push(part.text);
    }
  }

  const content = uniqueParts.join("\n\n").replace(/\n{3,}/g, "\n\n").trim();
  return content.length > 100 ? { title, content } : null;
}
package/storage.ts ADDED
@@ -0,0 +1,71 @@
1
+ import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import type { ExtractedContent } from "./extract.js";
3
+ import type { SearchResult } from "./perplexity.js";
4
+
5
/** Maximum age, in milliseconds, of a stored result that may still be restored (one hour). */
const CACHE_TTL_MS = 1000 * 60 * 60;
6
+
7
/** Outcome of a single search query. */
export interface QueryResultData {
  /** The query text. */
  query: string;
  /** Answer text for the query. */
  answer: string;
  /** Search results associated with the query. */
  results: SearchResult[];
  /** Error message, or `null`. */
  error: string | null;
}
13
+
14
/** A stored search or fetch result, as kept in memory and restored from session entries. */
export interface StoredSearchData {
  /** Identifier of the stored result. */
  id: string;
  /** Which kind of operation produced the record. */
  type: "search" | "fetch";
  /** Creation time in milliseconds, compared against `Date.now()` when restoring. */
  timestamp: number;
  /** Per-query results; must be an array for "search" records to pass validation. */
  queries?: QueryResultData[];
  /** Extracted page contents; must be an array for "fetch" records to pass validation. */
  urls?: ExtractedContent[];
}
21
+
22
/** In-memory store of search/fetch results, keyed by result id. */
const storedResults: Map<string, StoredSearchData> = new Map();
23
+
24
/**
 * Builds an identifier from the current time and a random suffix, both in base 36.
 *
 * @returns The base-36 timestamp followed by up to six random base-36 characters.
 */
export function generateId(): string {
  const timePart = Date.now().toString(36);
  // Skip the leading "0." of the base-36 fraction and keep at most six digits.
  const randomPart = Math.random().toString(36).substring(2, 8);
  return `${timePart}${randomPart}`;
}
27
+
28
/**
 * Saves `data` in the in-memory store under `id`, replacing any existing entry.
 *
 * @param id Key to store the record under.
 * @param data Record to store.
 */
export function storeResult(id: string, data: StoredSearchData): void {
  storedResults.set(id, data);
}
31
+
32
/**
 * Looks up a stored result.
 *
 * @param id Key of the record.
 * @returns The stored record, or `null` when nothing is stored under `id`.
 */
export function getResult(id: string): StoredSearchData | null {
  const found = storedResults.get(id);
  if (found === undefined || found === null) {
    return null;
  }
  return found;
}
35
+
36
/**
 * Lists every stored result.
 *
 * @returns A new array of the stored records, in the map's iteration order.
 */
export function getAllResults(): StoredSearchData[] {
  return [...storedResults.values()];
}
39
+
40
/**
 * Removes a stored result.
 *
 * @param id Key of the record to remove.
 * @returns `true` if a record existed and was removed, otherwise `false`.
 */
export function deleteResult(id: string): boolean {
  const wasRemoved = storedResults.delete(id);
  return wasRemoved;
}
43
+
44
/** Empties the in-memory result store. */
export function clearResults(): void {
  storedResults.clear();
}
47
+
48
/**
 * Type guard that checks whether an arbitrary value has the shape of a
 * stored search/fetch record.
 *
 * A value passes when it is an object with a non-empty string `id`, a numeric
 * `timestamp`, and either `type: "search"` with an array `queries` or
 * `type: "fetch"` with an array `urls`.
 *
 * @param data Value to inspect.
 * @returns `true` when `data` satisfies the checks above.
 */
function isValidStoredData(data: unknown): data is StoredSearchData {
  if (typeof data !== "object" || data === null) return false;

  const candidate = data as Record<string, unknown>;
  const { id, type, timestamp } = candidate;

  if (typeof id !== "string" || id.length === 0) return false;
  if (typeof timestamp !== "number") return false;

  // The payload field that must be an array depends on the record type.
  switch (type) {
    case "search":
      return Array.isArray(candidate.queries);
    case "fetch":
      return Array.isArray(candidate.urls);
    default:
      return false;
  }
}
58
+
59
/**
 * Rebuilds the in-memory result store from the current session branch.
 *
 * The store is cleared first. Every "custom" entry whose `customType` is
 * "web-search-results" is then re-added if its data passes validation and is
 * younger than `CACHE_TTL_MS`.
 *
 * @param ctx Extension context whose session manager supplies the branch entries.
 */
export function restoreFromSession(ctx: ExtensionContext): void {
  storedResults.clear();
  const restoreTime = Date.now();

  for (const entry of ctx.sessionManager.getBranch()) {
    // Only entries written for web search results are of interest.
    if (entry.type !== "custom" || entry.customType !== "web-search-results") continue;

    const candidate = entry.data;
    if (!isValidStoredData(candidate)) continue;

    // Skip records that have outlived the cache TTL.
    const age = restoreTime - candidate.timestamp;
    if (age < CACHE_TTL_MS) {
      storedResults.set(candidate.id, candidate);
    }
  }
}