@pi-unipi/web-api 0.1.13 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -15
- package/package.json +9 -2
- package/skills/web/SKILL.md +54 -11
- package/src/engine/constants.ts +36 -0
- package/src/engine/dependencies.ts +145 -0
- package/src/engine/dom.ts +266 -0
- package/src/engine/extract.ts +642 -0
- package/src/engine/format.ts +306 -0
- package/src/engine/profiles.ts +102 -0
- package/src/engine/types.ts +169 -0
- package/src/index.ts +9 -2
- package/src/providers/base.ts +9 -1
- package/src/settings.ts +70 -4
- package/src/tools.ts +281 -24
- package/src/tui/progress.ts +168 -0
- package/src/tui/result.ts +173 -0
- package/src/tui/settings-dialog.ts +168 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @unipi/web-api — DOM Parsing
|
|
3
|
+
*
|
|
4
|
+
* Server-side DOM parsing using linkedom.
|
|
5
|
+
* Provides polyfills for defuddle compatibility.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { parseHTML as linkedomParseHTML } from "linkedom";
|
|
9
|
+
|
|
10
|
+
/**
 * Build a linkedom-backed DOM from an HTML string.
 *
 * The window produced by linkedom is run through `applyPolyfills` before it is
 * handed back, so the result is suitable for defuddle extraction.
 *
 * @param html - Markup to parse
 * @returns The parsed `document` together with its owning `window`
 */
export function parseHTML(html: string): {
  document: Document;
  window: Window & typeof globalThis;
} {
  const parsed = linkedomParseHTML(html);
  const win = parsed.window;

  // Patch the window in place before exposing it to callers.
  applyPolyfills(win);

  return { document: win.document, window: win };
}
|
|
31
|
+
|
|
32
|
+
/**
 * Fill in browser APIs that defuddle relies on but the given window may lack.
 *
 * Every API is installed only when it is currently missing, so an existing
 * implementation on the window is never replaced.
 *
 * @param window - Window object to patch in place
 */
function applyPolyfills(window: Window & typeof globalThis): void {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const win = window as any;

  // NodeList.forEach — index-based iteration through item()
  const nodeListCtor = win.NodeList;
  if (nodeListCtor && !nodeListCtor.prototype.forEach) {
    nodeListCtor.prototype.forEach = function (
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this: any,
      callback: (value: Node, key: number, parent: NodeList) => void,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      thisArg?: any,
    ): void {
      let index = 0;
      while (index < this.length) {
        callback.call(thisArg, this.item(index), index, this);
        index += 1;
      }
    };
  }

  const elementCtor = win.Element;
  const elementProto = elementCtor ? elementCtor.prototype : undefined;

  if (elementProto) {
    // Element.matches — true when the element is among the owner document's
    // matches for the selector; an element without an owner document never matches.
    if (!elementProto.matches) {
      elementProto.matches = function (
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        this: any,
        selector: string,
      ): boolean {
        const owner = this.ownerDocument;
        if (!owner) {
          return false;
        }
        const candidates = owner.querySelectorAll(selector);
        for (let i = 0; i < candidates.length; i++) {
          if (candidates[i] === this) {
            return true;
          }
        }
        return false;
      };
    }

    // Element.closest — climb parentElement links, starting with the element itself.
    if (!elementProto.closest) {
      elementProto.closest = function (this: Element, selector: string): Element | null {
        for (let current: Element | null = this; current; current = current.parentElement) {
          if (current.matches && current.matches(selector)) {
            return current;
          }
        }
        return null;
      };
    }
  }

  // Globals borrowed from the host runtime when the window does not define them.
  const hostFallbacks: Array<[string, unknown]> = [
    ["TextDecoder", globalThis.TextDecoder],
    ["TextEncoder", globalThis.TextEncoder],
    ["URL", globalThis.URL],
    ["console", console],
  ];
  for (const [name, implementation] of hostFallbacks) {
    if (typeof win[name] === "undefined") {
      win[name] = implementation;
    }
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Collect the visible text beneath a DOM element.
 *
 * Text nodes are visited in document order; each one is trimmed, empty results
 * are discarded, and the remaining pieces are joined with a single space.
 * Nodes that are neither text nor element nodes contribute nothing.
 *
 * @param element - Root element to extract from
 * @returns The space-joined text of all non-empty text nodes
 */
export function extractTextContent(element: Element): string {
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;

  // Returns the trimmed text pieces found under `node`, in document order.
  const collect = (node: Node): string[] => {
    if (node.nodeType === TEXT_NODE) {
      const trimmed = node.textContent?.trim();
      return trimmed ? [trimmed] : [];
    }
    if (node.nodeType === ELEMENT_NODE) {
      return Array.from(node.childNodes).flatMap((child) => collect(child));
    }
    return [];
  };

  return collect(element).join(" ");
}
|
|
129
|
+
|
|
130
|
+
/**
 * Convert a DOM element to markdown.
 * Basic implementation for fallback extraction.
 *
 * Supported constructs: headings (h1–h6), paragraphs, line breaks, links,
 * bold/italic/inline code, fenced code blocks, blockquotes, ordered and
 * unordered lists, and images. Any other element is treated as a transparent
 * container whose children are converted in place.
 *
 * Known limitations: inline elements are emitted on their own line, and the
 * text of a list item is flattened, so a list nested inside an `<li>` is
 * folded into that item's text.
 *
 * @param element - Root element to convert
 * @returns Markdown string with runs of blank lines collapsed and outer whitespace trimmed
 */
export function elementToMarkdown(element: Element): string {
  const lines: string[] = [];

  // Emit content as a block of its own, separated from neighbours by blank lines.
  const pushBlock = (...content: string[]): void => {
    lines.push("", ...content, "");
  };

  // Two spaces per nesting level; negative levels clamp to no indent.
  const indentFor = (level: number): string => "  ".repeat(Math.max(level, 0));

  function walk(node: Node, depth: number = 0): void {
    if (node.nodeType === 3) {
      // Text node
      const text = node.textContent?.trim();
      if (text) {
        lines.push(text);
      }
      return;
    }
    if (node.nodeType !== 1) {
      // Comments and other non-element nodes carry no markdown content.
      return;
    }

    const el = node as Element;
    const tag = el.tagName?.toLowerCase();

    switch (tag) {
      case "h1":
      case "h2":
      case "h3":
      case "h4":
      case "h5":
      case "h6": {
        // The digit in the tag name is the heading level.
        const level = Number(tag.charAt(1));
        pushBlock(`${"#".repeat(level)} ${extractTextContent(el)}`);
        break;
      }
      case "p":
        pushBlock(extractTextContent(el));
        break;
      case "br":
        lines.push("");
        break;
      case "a": {
        const href = el.getAttribute("href");
        const text = extractTextContent(el);
        if (href && text) {
          lines.push(`[${text}](${href})`);
        } else if (text) {
          lines.push(text);
        }
        break;
      }
      case "strong":
      case "b":
        lines.push(`**${extractTextContent(el)}**`);
        break;
      case "em":
      case "i":
        lines.push(`*${extractTextContent(el)}*`);
        break;
      case "code":
        lines.push(`\`${extractTextContent(el)}\``);
        break;
      case "pre": {
        // Use the raw text so indentation and line breaks inside the code
        // survive; extractTextContent would trim every text node and join
        // them with spaces. Only the surrounding newlines are dropped.
        const raw = el.textContent ?? extractTextContent(el);
        pushBlock("```", raw.replace(/^\n+|\n+$/g, ""), "```");
        break;
      }
      case "blockquote":
        lines.push("");
        for (const line of extractTextContent(el).split("\n")) {
          lines.push(`> ${line}`);
        }
        lines.push("");
        break;
      case "ul":
      case "ol": {
        lines.push("");
        let ordinal = 0;
        for (const child of Array.from(el.children)) {
          if (child.tagName?.toLowerCase() === "li") {
            // Items of a list at nesting level `depth` are indented by
            // `depth` levels, so a top-level list starts at column 0.
            ordinal += 1;
            const marker = tag === "ol" ? `${ordinal}.` : "-";
            lines.push(`${indentFor(depth)}${marker} ${extractTextContent(child)}`);
          } else {
            // Anything else (including a list nested directly in a list)
            // is converted one level deeper.
            walk(child, depth + 1);
          }
        }
        lines.push("");
        break;
      }
      case "li":
        // Reached only for an <li> that is not a direct child of a list
        // element (e.g. wrapped in a container); render it as a bullet at
        // the level of the enclosing list, if any.
        lines.push(`${indentFor(depth - 1)}- ${extractTextContent(el)}`);
        break;
      case "img": {
        const alt = el.getAttribute("alt") || "";
        const src = el.getAttribute("src") || "";
        // An image without a source has nothing to link to, so it is skipped.
        if (src) {
          lines.push(`![${alt}](${src})`);
        }
        break;
      }
      default:
        // Containers (div, section, article, main, header, footer, nav,
        // aside) and unknown elements - recurse into children
        for (const child of Array.from(el.childNodes)) {
          walk(child, depth);
        }
        break;
    }
  }

  walk(element);

  // Clean up multiple blank lines
  const joined = lines.join("\n");
  return joined.replace(/\n{3,}/g, "\n\n").trim();
}
|