pi-read-page 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +233 -0
- package/extensions/pi-read-page.ts +11 -0
- package/package.json +65 -0
- package/src/browser/browser-manager.ts +329 -0
- package/src/browser/confidence.ts +167 -0
- package/src/browser/dom-preparer.ts +150 -0
- package/src/browser/extractor.ts +222 -0
- package/src/browser/user-action.ts +43 -0
- package/src/cache/cache.ts +265 -0
- package/src/security/url-policy.ts +345 -0
- package/src/tools/read-page.ts +636 -0
- package/src/types.ts +54 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
ConfidenceReport,
|
|
3
|
+
ExtractedPage,
|
|
4
|
+
HandoffReason,
|
|
5
|
+
UserActionDecision,
|
|
6
|
+
} from "../types";
|
|
7
|
+
|
|
8
|
+
/** Phrases (English and Chinese) that mark a captcha / human-verification page. */
const CAPTCHA_RE = /captcha|recaptcha|hcaptcha|verify you are human|human verification|are you human|请完成安全验证|人机验证/i;

/** Phrases typical of anti-bot interstitials and access-denied pages. */
const BLOCK_RE = /cloudflare|attention required|checking your browser|just a moment|access denied|temporarily blocked|访问受限|安全检查/i;

/** Calls to action shown when content is gated behind login or subscription. */
const LOGIN_WALL_RE = /sign in to continue|log in to continue|login to continue|please sign in|please log in|subscribe to continue|sign up to continue|join to view|登录后(查看|继续|阅读)|请登录(后)?(查看|继续|阅读)?/i;

/** Matches a URL containing a path segment that names a login/auth endpoint. */
const LOGIN_URL_RE = /\/(login|signin|sign-in|auth|oauth|session)(\/|$|[?#])/i;

/** Footer / navigation boilerplate that suggests no real article body was captured. */
const NAV_ONLY_RE = /home\s+about\s+contact|privacy policy|terms of service|all rights reserved/i;
|
|
17
|
+
|
|
18
|
+
/**
 * Scores how trustworthy an extraction looks.
 *
 * Starts at 100 and subtracts a fixed penalty for each warning sign found
 * (captcha/anti-bot/login-wall text, missing title or HTML, short body, low
 * word count, navigation boilerplate, missing metadata). Every penalty also
 * appends a machine-readable reason, in the order the checks run.
 *
 * @param page Extracted page without its `confidence` field.
 * @returns Score clamped to a minimum of 0, the level derived from it
 *   (>= 70 "high", >= 40 "medium", otherwise "low") and the collected reasons.
 */
export function assessConfidence(
  page: Omit<ExtractedPage, "confidence">,
): ConfidenceReport {
  const body = page.markdown.trim();
  const extractedHtml = page.contentHtml.trim();
  const heading = page.title.trim();
  // Only the first 10k characters of the body are scanned for marker phrases.
  const haystack =
    `${page.url}\n${heading}\n${body.slice(0, 10_000)}`.toLowerCase();
  const words = page.metadata.wordCount || estimateWordCount(body);
  const bodyLength = body.length;

  const reasons: string[] = [];
  let score = 100;
  const penalize = (points: number, reason: string): void => {
    score -= points;
    reasons.push(reason);
  };
  // Applies the penalty of the first tier whose limit the value falls below.
  const applyFirstTier = (
    value: number,
    tiers: ReadonlyArray<readonly [number, number, string]>,
  ): void => {
    const hit = tiers.find(([limit]) => value < limit);
    if (hit) penalize(hit[1], hit[2]);
  };

  if (CAPTCHA_RE.test(haystack)) {
    penalize(70, "captcha_or_human_verification_detected");
  }
  if (BLOCK_RE.test(haystack)) {
    penalize(70, "anti_bot_or_access_block_detected");
  }
  if (isLoginWall(page, bodyLength, words)) {
    penalize(65, "login_wall_detected");
  }
  if (!heading) {
    penalize(15, "missing_title");
  }
  if (!extractedHtml) {
    penalize(45, "missing_extracted_html");
  }

  applyFirstTier(bodyLength, [
    [100, 60, "markdown_too_short"],
    [500, 25, "markdown_short"],
    [1_500, 12, "markdown_somewhat_short"],
  ]);
  applyFirstTier(words, [
    [10, 45, "word_count_too_low"],
    [30, 15, "word_count_low"],
    [200, 8, "word_count_somewhat_low"],
  ]);

  if (NAV_ONLY_RE.test(haystack) && words < 250) {
    penalize(30, "looks_like_navigation_or_boilerplate");
  }

  const { description, published, author, site } = page.metadata;
  if (!(description || published || author || site)) {
    penalize(5, "little_metadata");
  }

  const boundedScore = Math.max(0, score);
  let level: ConfidenceReport["level"] = "low";
  if (boundedScore >= 70) {
    level = "high";
  } else if (boundedScore >= 40) {
    level = "medium";
  }
  return { level, score: boundedScore, reasons };
}
|
|
99
|
+
|
|
100
|
+
/**
 * Decides whether the user must intervene (captcha, block page, login) before
 * the page can be read.
 *
 * @param extracted Fully extracted page, including its confidence report.
 * @returns `required` is true when a handoff reason was detected; `reason` and
 *   `message` are undefined otherwise. The page's confidence is passed through.
 */
export function decideUserAction(extracted: ExtractedPage): UserActionDecision {
  const { confidence } = extracted;
  const handoff = detectActionableHandoff(extracted);

  if (handoff === undefined) {
    return {
      required: false,
      reason: undefined,
      message: undefined,
      confidence,
    };
  }

  return {
    required: true,
    reason: handoff.reason,
    message: handoff.message,
    confidence,
  };
}
|
|
109
|
+
|
|
110
|
+
/**
 * Finds the first reason the page needs manual user action, if any.
 *
 * Checks run in priority order: captcha, anti-bot block, login wall.
 *
 * @param page Extracted page to inspect.
 * @returns The handoff reason with a human-readable message, or `undefined`
 *   when no user action appears necessary.
 */
function detectActionableHandoff(
  page: ExtractedPage,
): { reason: HandoffReason; message: string } | undefined {
  const body = page.markdown.trim();
  const haystack =
    `${page.url}\n${page.title}\n${body.slice(0, 10_000)}`.toLowerCase();
  const words = page.metadata.wordCount || estimateWordCount(body);

  // Captcha/anti-bot/login handoff is only actionable when the extracted body
  // is thin. A long article that merely discusses these topics is not
  // user-actionable.
  const thin = words < 120 || body.length < 1_200;

  if (thin) {
    if (CAPTCHA_RE.test(haystack)) {
      return {
        reason: "captcha",
        message: "The page appears to require captcha or human verification.",
      };
    }
    if (BLOCK_RE.test(haystack)) {
      return {
        reason: "blocked",
        message:
          "The page appears to be blocked by an anti-bot or permission interstitial.",
      };
    }
  }

  // isLoginWall applies its own thin-content requirement.
  if (isLoginWall(page, body.length, words)) {
    return {
      reason: "login_required",
      message:
        "The page appears to require login, subscription access, or manual navigation.",
    };
  }

  return undefined;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Heuristically detects a login / subscription wall.
 *
 * A page counts as a login wall only when its content is thin (fewer than 120
 * words or fewer than 1,200 markdown characters) AND at least one signal is
 * present: a password input in the extracted HTML, login-wall wording in the
 * URL/title/first 10k markdown characters, or a login-like URL path.
 *
 * @param page URL, title, markdown and extracted HTML of the page.
 * @param markdownLength Length of the trimmed markdown.
 * @param wordCount Word count of the extracted content.
 */
function isLoginWall(
  page: Pick<ExtractedPage, "url" | "title" | "markdown" | "contentHtml">,
  markdownLength: number,
  wordCount: number,
): boolean {
  const haystack = `${page.url}\n${page.title}\n${page.markdown.slice(0, 10_000)}`;
  const loweredHtml = page.contentHtml.toLowerCase();

  const signals = [
    /<input\b[^>]*type=["']?password/i.test(loweredHtml),
    LOGIN_WALL_RE.test(haystack),
    LOGIN_URL_RE.test(page.url),
  ];
  const thin = wordCount < 120 || markdownLength < 1_200;

  return thin && signals.includes(true);
}
|
|
161
|
+
|
|
162
|
+
/**
 * Rough word count for mixed Latin / CJK text.
 *
 * ASCII alphanumeric runs (optionally joined by `-` or `'`) count as one word
 * each; characters in U+3400–U+9FFF count as half a word, rounded up.
 *
 * @param markdown Text to measure.
 * @returns Estimated number of words.
 */
function estimateWordCount(markdown: string): number {
  const latinTokens =
    markdown.match(/[A-Za-z0-9_]+(?:[-'][A-Za-z0-9_]+)*/g) ?? [];
  const cjkGlyphs = markdown.match(/[\u3400-\u9fff]/g) ?? [];
  return latinTokens.length + Math.ceil(cjkGlyphs.length / 2);
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
|
+
import type { Page } from "playwright-core";
|
|
3
|
+
|
|
4
|
+
/**
 * Copies the HTML of every open shadow root in the live page into a
 * `data-defuddle-shadow` attribute on its host element, so the content
 * survives serialization of the page HTML.
 *
 * Any stale `data-defuddle-shadow` attribute is removed first. This is
 * best-effort: if the in-page script fails, the error is ignored.
 *
 * @param page Playwright page to annotate.
 */
export async function flattenOpenShadowRoots(page: Page): Promise<void> {
  try {
    await page.evaluate(() => {
      for (const host of Array.from(document.querySelectorAll("*"))) {
        host.removeAttribute("data-defuddle-shadow");
        const shadowHtml = host.shadowRoot?.innerHTML;
        if (shadowHtml) {
          host.setAttribute("data-defuddle-shadow", shadowHtml);
        }
      }
    });
  } catch {
    // Best effort only: extraction continues without shadow DOM content.
  }
}
|
|
17
|
+
|
|
18
|
+
/**
 * Produces the two HTML variants used by the extractor.
 *
 * - `extractionHtml`: the page with captured shadow roots inlined and URLs
 *   made absolute; falls back to the input `html` if serialization is empty.
 * - `sanitizedHtml` / `fallbackDocument`: a re-parse of `extractionHtml` with
 *   `<script>`, `<style>`, `<noscript>` and inline `style` attributes removed.
 *
 * @param html Raw page HTML.
 * @param url Page URL, used as the base for resolving relative URLs.
 */
export function prepareHtmlForExtraction(
  html: string,
  url: string,
): {
  extractionHtml: string;
  sanitizedHtml: string;
  fallbackDocument: Document;
} {
  const extraction = parseDocument(html, url);
  inlineCapturedShadowRoots(extraction.document);
  absolutizeUrls(extraction.document, extraction.baseUrl);
  const extractionHtml = extraction.document.documentElement?.outerHTML || html;

  const fallback = parseDocument(extractionHtml, url);
  sanitizeDocumentForOutput(fallback.document);
  absolutizeUrls(fallback.document, fallback.baseUrl);
  const sanitizedHtml =
    fallback.document.documentElement?.outerHTML || extractionHtml;

  return {
    extractionHtml,
    sanitizedHtml,
    fallbackDocument: fallback.document,
  };
}
|
|
43
|
+
|
|
44
|
+
/**
 * Replaces each `data-defuddle-shadow` attribute with real DOM nodes parsed
 * from its HTML.
 *
 * Hosts whose tag name contains a hyphen and that have a parent are replaced
 * by a `<div>` holding the shadow content; any other host has its children
 * replaced by that content. The attribute is always removed.
 *
 * @param document Parsed document to modify in place.
 */
function inlineCapturedShadowRoots(document: Document): void {
  const hosts = Array.from(document.querySelectorAll("[data-defuddle-shadow]"));

  for (const host of hosts) {
    const captured = host.getAttribute("data-defuddle-shadow");
    host.removeAttribute("data-defuddle-shadow");
    if (!captured) continue;

    const parsed = parseHTML(`<html><body>${captured}</body></html>`);
    const imported = document.createDocumentFragment();
    Array.from(parsed.document.body?.childNodes ?? []).forEach((child) => {
      imported.appendChild(document.importNode(child, true));
    });

    const isHyphenatedTag = host.tagName.includes("-");
    if (isHyphenatedTag && host.parentNode) {
      const wrapper = document.createElement("div");
      wrapper.appendChild(imported);
      host.parentNode.replaceChild(wrapper, host);
    } else {
      host.textContent = "";
      host.appendChild(imported);
    }
  }
}
|
|
69
|
+
|
|
70
|
+
/**
 * Strips presentation and scripting from a document in place: removes every
 * `<script>`, `<style>` and `<noscript>` element, then drops the inline
 * `style` attribute from all remaining elements.
 *
 * @param document Parsed document to clean.
 */
function sanitizeDocumentForOutput(document: Document): void {
  const unwanted = Array.from(
    document.querySelectorAll("script, style, noscript"),
  );
  for (const node of unwanted) {
    node.remove();
  }

  const remaining = Array.from(document.querySelectorAll("*"));
  for (const node of remaining) {
    node.removeAttribute("style");
  }
}
|
|
78
|
+
|
|
79
|
+
/**
 * Parses HTML and determines the base URL for resolving relative links.
 *
 * When the document has a `<base href>`, it is resolved against `url`.
 * If that resolution fails (malformed `href`, or `url` is not an absolute
 * URL), the page URL is used as-is rather than aborting the extraction over
 * a bad tag in untrusted page markup.
 *
 * @param html HTML source to parse.
 * @param url URL the HTML was loaded from.
 * @returns The parsed document and the effective base URL.
 */
function parseDocument(
  html: string,
  url: string,
): { document: Document; baseUrl: string } {
  const { document } = parseHTML(html);
  const baseHref = document.querySelector("base[href]")?.getAttribute("href");

  let baseUrl = url;
  if (baseHref) {
    try {
      baseUrl = new URL(baseHref, url).href;
    } catch {
      // `new URL` throws on unparsable input; keep the page URL as the base.
    }
  }

  // linkedom's document type is structurally compatible but nominally
  // distinct from the DOM lib's Document, hence the double assertion.
  return { document: document as unknown as Document, baseUrl };
}
|
|
91
|
+
|
|
92
|
+
/**
 * Rewrites `src`, `href` and `srcset` attributes throughout the document so
 * they are resolved against `baseUrl`.
 *
 * @param document Parsed document to modify in place.
 * @param baseUrl Base URL for resolution.
 */
function absolutizeUrls(document: Document, baseUrl: string): void {
  const linked = Array.from(
    document.querySelectorAll("[src], [href], [srcset]"),
  );
  for (const node of linked) {
    absolutizeAttribute(node, "src", baseUrl);
    absolutizeAttribute(node, "href", baseUrl);
    absolutizeSrcset(node, baseUrl);
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Makes a single `src` or `href` attribute absolute.
 *
 * Values that already start with `http://`, `https://`, `data:` or `#` are
 * left untouched. Protocol-relative values (`//host/path`) are prefixed with
 * the base URL's protocol; everything else is resolved against `baseUrl`.
 * If URL parsing fails at any step the original value is kept. This includes
 * the protocol-relative branch, which previously threw on an invalid base.
 *
 * @param element Element whose attribute is rewritten in place.
 * @param attr Attribute to rewrite.
 * @param baseUrl Base URL for resolution.
 */
function absolutizeAttribute(
  element: Element,
  attr: "src" | "href",
  baseUrl: string,
): void {
  const value = element.getAttribute(attr);
  if (!value) return;

  const untouchedPrefixes = ["http://", "https://", "data:", "#"];
  if (untouchedPrefixes.some((prefix) => value.startsWith(prefix))) return;

  try {
    const resolved = value.startsWith("//")
      ? `${new URL(baseUrl).protocol}${value}`
      : new URL(value, baseUrl).href;
    element.setAttribute(attr, resolved);
  } catch {
    // Keep the original value if URL parsing fails.
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Makes every image candidate URL in a `srcset` attribute absolute.
 *
 * Candidates are tokenized the way the HTML srcset parser does: a URL is a
 * run of non-whitespace characters (so commas inside a URL, as in `data:`
 * URIs or CDN transform paths, are preserved), trailing commas on the URL end
 * the candidate, and any descriptors run up to the next comma. A plain
 * `split(",")` would cut such URLs apart and corrupt them.
 *
 * `data:` URLs are kept verbatim; a URL that fails to parse is also kept
 * as-is. The result is re-joined with ", " between candidates.
 *
 * @param element Element whose `srcset` is rewritten in place (no-op when the
 *   attribute is missing or empty).
 * @param baseUrl Base URL for resolution.
 */
function absolutizeSrcset(element: Element, baseUrl: string): void {
  const value = element.getAttribute("srcset");
  if (!value) return;

  const separator = /[\s,]/;
  const whitespace = /\s/;
  const rewritten: string[] = [];
  let index = 0;

  while (index < value.length) {
    // Skip whitespace and commas between candidates.
    while (index < value.length && separator.test(value.charAt(index))) {
      index += 1;
    }
    if (index >= value.length) break;

    // The URL extends to the next whitespace character.
    const urlStart = index;
    while (index < value.length && !whitespace.test(value.charAt(index))) {
      index += 1;
    }
    let src = value.slice(urlStart, index);
    let descriptors = "";

    if (src.endsWith(",")) {
      // A trailing comma terminates a candidate that has no descriptors.
      src = src.replace(/,+$/, "");
    } else {
      const descriptorStart = index;
      while (index < value.length && value.charAt(index) !== ",") {
        index += 1;
      }
      descriptors = value
        .slice(descriptorStart, index)
        .trim()
        .replace(/\s+/g, " ");
    }

    let absolute = src;
    if (!src.startsWith("data:")) {
      try {
        absolute = new URL(src, baseUrl).href;
      } catch {
        // Keep the original URL if it cannot be resolved.
      }
    }
    rewritten.push(descriptors ? `${absolute} ${descriptors}` : absolute);
  }

  element.setAttribute("srcset", rewritten.join(", "));
}
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import type { DefuddleOptions, DefuddleResponse } from "defuddle/full";
|
|
2
|
+
import type { Page } from "playwright-core";
|
|
3
|
+
import { createPolicyFetch } from "../security/url-policy";
|
|
4
|
+
import type { ExtractedPage, PageMetadata, UserActionDecision } from "../types";
|
|
5
|
+
import {
|
|
6
|
+
assessConfidence,
|
|
7
|
+
decideUserAction as decideUserActionFromConfidence,
|
|
8
|
+
} from "./confidence";
|
|
9
|
+
import {
|
|
10
|
+
flattenOpenShadowRoots,
|
|
11
|
+
prepareHtmlForExtraction,
|
|
12
|
+
} from "./dom-preparer";
|
|
13
|
+
|
|
14
|
+
const DEFAULT_PARSE_TIMEOUT_MS = 8_000;
|
|
15
|
+
|
|
16
|
+
type ParseMode = ExtractedPage["parseMode"];
|
|
17
|
+
type DefuddleNodeModule = typeof import("defuddle/node");
|
|
18
|
+
|
|
19
|
+
let defuddleNodeModulePromise: Promise<DefuddleNodeModule> | undefined;
|
|
20
|
+
|
|
21
|
+
/**
 * Loads `defuddle/node` on first use and reuses the same in-flight or
 * completed import promise on every later call.
 */
async function getDefuddleNodeModule(): Promise<DefuddleNodeModule> {
  if (defuddleNodeModulePromise == null) {
    defuddleNodeModulePromise = import("defuddle/node");
  }
  return defuddleNodeModulePromise;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Extracts the main content of the currently loaded page as Markdown.
 *
 * Steps: capture open shadow roots, serialize the page, prepare extraction
 * and sanitized HTML, run Defuddle, then fall back to the sanitized body and
 * a structured plain-text rendering when Defuddle returns no HTML / Markdown.
 * The result is finally scored by `assessConfidence`.
 *
 * @param page Playwright page that has already navigated to the target URL.
 * @returns The extracted page, including metadata, warnings and confidence.
 */
export async function extractMarkdown(page: Page): Promise<ExtractedPage> {
  await flattenOpenShadowRoots(page);

  const url = page.url();
  const rawHtml = await page.content();
  const { fallbackDocument, extractionHtml, sanitizedHtml } =
    prepareHtmlForExtraction(rawHtml, url);
  const { result, parseMode } = await parseWithDefuddle(extractionHtml, url);

  const contentHtml = result.content || fallbackDocument.body?.innerHTML || "";
  const markdown = (
    result.contentMarkdown || structuredTextFallback(contentHtml, url)
  ).trim();
  const textLength = cleanText(markdown).length;
  const metadata = buildMetadata(result);
  // This runs in Node, where there is no global `document`; the parsed
  // fallback document is the only source for the page's <title>.
  const title = cleanText(result.title || fallbackDocument.title || url);

  const withoutConfidence = {
    url,
    title,
    markdown,
    contentHtml,
    fullHtml: sanitizedHtml,
    textLength,
    capturedAt: new Date().toISOString(),
    extractor: "defuddle" as const,
    extraction: result.extractorType || result.debug?.contentSelector || "auto",
    parseMode,
    metadata,
    warnings: buildWarnings(result, markdown, textLength),
    debug: result.debug,
  };

  const confidence = assessConfidence(withoutConfidence);

  return {
    ...withoutConfidence,
    confidence,
  };
}
|
|
66
|
+
|
|
67
|
+
/**
 * Re-exports the confidence module's user-action decision under this
 * module's API.
 *
 * @param extracted Fully extracted page.
 * @returns The decision computed by the confidence module, unchanged.
 */
export function decideUserAction(extracted: ExtractedPage): UserActionDecision {
  const decision = decideUserActionFromConfidence(extracted);
  return decision;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Runs Defuddle over the prepared HTML.
 *
 * Behaviour is controlled by environment variables:
 * - `READ_PAGE_DEFUDDLE_ASYNC=1` enables Defuddle's async mode.
 * - `READ_PAGE_PARSE_TIMEOUT_MS` overrides the default parse timeout.
 * - `READ_PAGE_DEFUDDLE_DEBUG=1` turns on Defuddle debug output.
 *
 * If the first attempt throws or times out, parsing is retried once with
 * async mode off and no timeout; that result is tagged "sync-fallback".
 *
 * @param html Prepared extraction HTML.
 * @param url Page URL.
 * @returns The Defuddle response and the mode that produced it.
 */
async function parseWithDefuddle(
  html: string,
  url: string,
): Promise<{ result: DefuddleResponse; parseMode: ParseMode }> {
  const { env } = process;
  const asyncEnabled = env.READ_PAGE_DEFUDDLE_ASYNC === "1";
  const configuredTimeout = Number.parseInt(
    env.READ_PAGE_PARSE_TIMEOUT_MS || "",
    10,
  );
  // NaN and 0 both fall through to the default.
  const timeoutMs = configuredTimeout || DEFAULT_PARSE_TIMEOUT_MS;

  const options: DefuddleOptions = {
    url,
    debug: env.READ_PAGE_DEFUDDLE_DEBUG === "1",
    useAsync: asyncEnabled,
    includeReplies: "extractors",
    separateMarkdown: true,
    markdown: false,
    fetch: createPolicyFetch(),
  };

  const { Defuddle } = await getDefuddleNodeModule();

  try {
    const primary = await withTimeout(Defuddle(html, url, options), timeoutMs);
    const mode: ParseMode = asyncEnabled ? "async" : "sync";
    return { result: primary, parseMode: mode };
  } catch {
    const syncOptions: DefuddleOptions = {
      ...options,
      useAsync: false,
      separateMarkdown: true,
    };
    const retried = await Defuddle(html, url, syncOptions);
    return { result: retried, parseMode: "sync-fallback" };
  }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Maps a Defuddle response onto the package's `PageMetadata` shape, filling
 * in empty defaults for missing fields and whitespace-normalizing the
 * free-text ones (title, author, description, site).
 *
 * @param result Defuddle response.
 */
function buildMetadata(result: DefuddleResponse): PageMetadata {
  const tidy = (value?: string | null): string => cleanText(value || "");
  const orEmpty = (value?: string | null): string => value || "";

  return {
    title: tidy(result.title),
    author: tidy(result.author),
    description: tidy(result.description),
    domain: orEmpty(result.domain),
    favicon: orEmpty(result.favicon),
    image: orEmpty(result.image),
    published: orEmpty(result.published),
    site: tidy(result.site),
    language: orEmpty(result.language),
    wordCount: result.wordCount || 0,
    parseTime: result.parseTime || 0,
    schemaOrgData: result.schemaOrgData,
    metaTags: result.metaTags || [],
    variables: result.variables || {},
  };
}
|
|
122
|
+
|
|
123
|
+
/**
 * Collects human-readable warnings about gaps in the extraction.
 *
 * @param result Defuddle response that was used.
 * @param markdown Final Markdown (after any fallback).
 * @param textLength Length of the whitespace-normalized Markdown.
 * @returns Warnings in a fixed order; empty when nothing looks wrong.
 */
function buildWarnings(
  result: DefuddleResponse,
  markdown: string,
  textLength: number,
): string[] {
  const checks: Array<[boolean, string]> = [
    [
      !result.content,
      "Defuddle did not return extracted HTML; body fallback may have been used internally.",
    ],
    [
      !result.contentMarkdown,
      "Defuddle did not return Markdown; used structured plain-text fallback, so some formatting may be lost.",
    ],
    [!markdown.trim(), "No Markdown content extracted."],
    [
      textLength < 500,
      "Extracted text is short; the page may require login, captcha, or manual navigation.",
    ],
    [!result.title, "No title extracted."],
  ];

  return checks
    .filter(([triggered]) => triggered)
    .map(([, message]) => message);
}
|
|
147
|
+
|
|
148
|
+
/**
 * Renders extracted HTML as lightly structured plain text when Defuddle did
 * not supply Markdown.
 *
 * Headings, paragraphs, list items, `<pre>` and `<blockquote>` elements are
 * formatted one per block, separated by blank lines. Only the outermost
 * matching element of any nested group is rendered: its text already contains
 * the text of nested matches (`li > p`, `blockquote > p`, nested lists), so
 * rendering both would duplicate content. When no such elements exist, the
 * document's text content is returned with whitespace cleaned up.
 *
 * @param html Extracted content HTML (a fragment; it is wrapped in `<body>`).
 * @param url Page URL, used to resolve relative links during preparation.
 */
function structuredTextFallback(html: string, url: string): string {
  const { fallbackDocument } = prepareHtmlForExtraction(
    `<body>${html}</body>`,
    url,
  );
  const blockSelector = "h1,h2,h3,h4,h5,h6,p,li,pre,blockquote";
  const blocks = Array.from(
    fallbackDocument.querySelectorAll(blockSelector),
  ).filter(
    // Drop blocks that sit inside another matched block.
    (element) => !element.parentElement?.closest(blockSelector),
  );

  if (blocks.length === 0) {
    return cleanTextPreservingLines(
      fallbackDocument.body?.textContent ||
        fallbackDocument.documentElement?.textContent ||
        fallbackDocument.textContent ||
        "",
    );
  }

  return blocks
    .map((element) => formatBlock(element))
    .filter(Boolean)
    .join("\n\n")
    .trim();
}
|
|
172
|
+
|
|
173
|
+
/**
 * Formats one block element as Markdown-like text based on its tag:
 * `h1`–`h6` become `#` headings, `li` a `- ` bullet, `blockquote` gets `> `
 * on every line, `pre` is wrapped in a code fence, anything else is returned
 * as cleaned text.
 *
 * @param element Block element to render.
 * @returns The formatted text, or an empty string when the element has no text.
 */
function formatBlock(element: Element): string {
  const tagName = element.tagName.toLowerCase();
  const content = cleanTextPreservingLines(element.textContent || "");
  if (!content) return "";

  const heading = /^h([1-6])$/.exec(tagName);
  if (heading) {
    const level = Number(heading[1]);
    return `${"#".repeat(level)} ${content}`;
  }

  switch (tagName) {
    case "li":
      return `- ${content}`;
    case "blockquote": {
      const quoted = content.split("\n").map((line) => `> ${line}`);
      return quoted.join("\n");
    }
    case "pre":
      return `\`\`\`\n${content}\n\`\`\``;
    default:
      return content;
  }
}
|
|
189
|
+
|
|
190
|
+
/**
 * Collapses every run of whitespace (including newlines) into a single space
 * and removes leading/trailing whitespace.
 *
 * @param value Text to normalize.
 */
function cleanText(value: string): string {
  const tokens = value.trim().split(/\s+/);
  return tokens.join(" ");
}
|
|
193
|
+
|
|
194
|
+
/**
 * Normalizes whitespace while keeping line structure: CRLF becomes LF, runs
 * of spaces/tabs/form-feeds/vertical-tabs collapse to one space, three or
 * more consecutive newlines shrink to two, each line is trimmed, and the
 * whole result is trimmed.
 *
 * @param value Text to normalize.
 */
function cleanTextPreservingLines(value: string): string {
  const unixNewlines = value.replace(/\r\n/g, "\n");
  const singleSpaced = unixNewlines.replace(/[ \t\f\v]+/g, " ");
  const cappedBlankRuns = singleSpaced.replace(/\n{3,}/g, "\n\n");

  const trimmedLines: string[] = [];
  for (const line of cappedBlankRuns.split("\n")) {
    trimmedLines.push(line.trim());
  }
  return trimmedLines.join("\n").trim();
}
|
|
204
|
+
|
|
205
|
+
/**
 * Races a promise against a timer.
 *
 * @param promise Work to wait for.
 * @param timeoutMs Milliseconds before giving up.
 * @returns A promise that settles like `promise` if it finishes in time, or
 *   rejects with `Error("Timed out after <timeoutMs>ms")` otherwise. The
 *   timer is cleared as soon as either side settles.
 */
function withTimeout<T>(promise: Promise<T>, timeoutMs: number): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const expiry = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(
      () => reject(new Error(`Timed out after ${timeoutMs}ms`)),
      timeoutMs,
    );
  });

  return Promise.race([promise, expiry]).finally(() => {
    clearTimeout(timer);
  });
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import type { HandoffReason } from "../types";
|
|
3
|
+
|
|
4
|
+
/**
 * Asks the user to complete a manual step (login, captcha, navigation) in the
 * opened browser and waits for their confirmation.
 *
 * While waiting, a status line and a widget are shown under the "read-page"
 * key; both are cleared again whether the dialog resolves or throws.
 *
 * @param ctx Extension context providing the UI.
 * @param url URL the user needs to act on.
 * @param reason Why the handoff is needed.
 * @param message Explanation shown at the top of the confirmation dialog.
 * @param signal Optional abort signal forwarded to the dialog.
 * @returns The result of the confirmation dialog (15-minute timeout).
 * @throws Error when the context has no interactive UI.
 */
export async function waitForUserAction(
  ctx: ExtensionContext,
  url: string,
  reason: HandoffReason,
  message: string,
  signal?: AbortSignal,
): Promise<boolean> {
  if (!ctx.hasUI) {
    throw new Error(
      `No interactive UI available for required user action: ${reason}`,
    );
  }

  const statusKey = "read-page";
  const widgetLines = [
    "read-page needs user action.",
    `Reason: ${reason}`,
    `URL: ${url}`,
    "Finish the action in the opened browser, then confirm here.",
  ];
  const promptBody = [
    message,
    "",
    `URL: ${url}`,
    "",
    "Complete login / captcha / manual navigation in the opened browser.",
    "When the page is ready, return here and confirm.",
  ].join("\n");
  const fifteenMinutesMs = 15 * 60 * 1000;

  ctx.ui.setStatus(statusKey, "Waiting for browser action");
  ctx.ui.setWidget(statusKey, widgetLines);

  try {
    const confirmed = await ctx.ui.confirm(
      "read-page needs user action",
      promptBody,
      { signal, timeout: fifteenMinutesMs },
    );
    return confirmed;
  } finally {
    ctx.ui.setStatus(statusKey, undefined);
    ctx.ui.setWidget(statusKey, []);
  }
}
|