@pi-unipi/web-api 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,266 @@
1
+ /**
2
+ * @unipi/web-api — DOM Parsing
3
+ *
4
+ * Server-side DOM parsing using linkedom.
5
+ * Provides polyfills for defuddle compatibility.
6
+ */
7
+
8
+ import { parseHTML as linkedomParseHTML } from "linkedom";
9
+
10
/**
 * Build a server-side DOM from an HTML string.
 *
 * The markup is handed to linkedom; the resulting window is then passed
 * through `applyPolyfills` before being returned, so callers receive a
 * window that has already been patched for defuddle.
 *
 * @param html - HTML source to parse
 * @returns The parsed `document` together with the `window` that owns it
 */
export function parseHTML(html: string): {
  document: Document;
  window: Window & typeof globalThis;
} {
  const parsed = linkedomParseHTML(html);
  const patchedWindow = parsed.window;

  // Patch the window in place before exposing it to callers.
  applyPolyfills(patchedWindow);

  return { document: patchedWindow.document, window: patchedWindow };
}
31
+
32
/**
 * Fill in browser APIs on a window object, in place, wherever they are absent.
 *
 * Nothing that already exists on the window is overwritten. The following
 * are installed only when missing:
 *  - `NodeList.prototype.forEach`
 *  - `Element.prototype.matches` and `Element.prototype.closest`
 *  - `TextDecoder`, `TextEncoder`, `URL` and `console` (copied from the host
 *    runtime's globals)
 *
 * @param window - Window object to patch
 */
function applyPolyfills(window: Window & typeof globalThis): void {
  // The window is probed for members that may or may not exist, so it is
  // treated as an untyped bag for the duration of this function.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const win = window as any;

  // --- NodeList.forEach ---------------------------------------------------
  const nodeListCtor = win.NodeList;
  if (nodeListCtor && !nodeListCtor.prototype.forEach) {
    nodeListCtor.prototype.forEach = function (
      this: NodeList,
      callback: (value: Node, key: number, parent: NodeList) => void,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      thisArg?: any,
    ): void {
      let index = 0;
      while (index < this.length) {
        callback.call(thisArg, this.item(index), index, this);
        index += 1;
      }
    };
  }

  // --- Element.matches / Element.closest ----------------------------------
  const elementCtor = win.Element;
  const elementProto = elementCtor && elementCtor.prototype;
  if (elementProto) {
    if (!elementProto.matches) {
      // An element matches when it appears among the results of running the
      // selector against its owner document.
      elementProto.matches = function (this: Element, selector: string): boolean {
        const owner = this.ownerDocument;
        if (!owner) return false;
        const candidates = owner.querySelectorAll(selector);
        return Array.prototype.indexOf.call(candidates, this) !== -1;
      };
    }

    if (!elementProto.closest) {
      // Climb from the element itself through its ancestors and return the
      // first one that matches.
      elementProto.closest = function (this: Element, selector: string): Element | null {
        for (let current: Element | null = this; current; current = current.parentElement) {
          if (current.matches && current.matches(selector)) {
            return current;
          }
        }
        return null;
      };
    }
  }

  // --- Globals borrowed from the host runtime -----------------------------
  for (const name of ["TextDecoder", "TextEncoder", "URL", "console"]) {
    if (typeof win[name] === "undefined") {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      win[name] = (globalThis as any)[name];
    }
  }
}
99
+
100
/**
 * Collect the text found beneath a DOM element.
 *
 * Nodes are visited in document order. Each text node is trimmed; non-empty
 * results are joined with a single space. Only element and text nodes are
 * considered; any other node type (and everything beneath it) is ignored.
 *
 * @param element - Root of the subtree to read
 * @returns The collected text, or an empty string when there is none
 */
export function extractTextContent(element: Element): string {
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;

  const pieces: string[] = [];
  // Explicit stack for a pre-order traversal; the next node to visit is on top.
  const pending: Node[] = [element];

  let current: Node | undefined;
  while ((current = pending.pop()) !== undefined) {
    if (current.nodeType === TEXT_NODE) {
      const trimmed = current.textContent?.trim();
      if (trimmed) {
        pieces.push(trimmed);
      }
      continue;
    }

    if (current.nodeType !== ELEMENT_NODE) {
      continue;
    }

    // Push children right-to-left so the leftmost child is popped first.
    const children = Array.from(current.childNodes);
    for (const child of children.reverse()) {
      pending.push(child);
    }
  }

  return pieces.join(" ");
}
129
+
130
/**
 * Convert a DOM element to markdown.
 * Basic implementation for fallback extraction.
 *
 * Handled elements: headings (h1–h6), paragraphs, line breaks, links,
 * bold/italic/inline code, preformatted blocks, blockquotes, ordered and
 * unordered lists, and images. Any other element is treated as a container
 * and its children are visited in order.
 *
 * Notes on behavior:
 *  - `<script>` and `<style>` subtrees are skipped so their source text does
 *    not end up in the output.
 *  - `<pre>` content is taken from `textContent` so newlines and indentation
 *    inside code blocks survive.
 *  - Items that are direct children of an `<ol>` are numbered from 1; all
 *    other list items use `-`.
 *  - Block-level elements (headings, paragraphs, blockquotes, list items)
 *    are flattened to plain text via `extractTextContent`, so inline markup
 *    nested inside them is not preserved.
 *  - Runs of three or more newlines are collapsed to a single blank line
 *    across the whole result, including inside fenced code blocks.
 *
 * @param element - Root element to convert
 * @returns Markdown string, trimmed of leading and trailing whitespace
 */
export function elementToMarkdown(element: Element): string {
  const lines: string[] = [];

  /** Emit the given lines as one block, padded with a blank line on each side. */
  const pushBlock = (...content: string[]): void => {
    lines.push("", ...content, "");
  };

  /** Emit `wrapper + text + wrapper`, skipping elements that contain no text. */
  const pushWrapped = (wrapper: string, el: Element): void => {
    const text = extractTextContent(el);
    if (text) {
      lines.push(`${wrapper}${text}${wrapper}`);
    }
  };

  /**
   * Visit one node and append its markdown to `lines`.
   *
   * @param node - Node to convert
   * @param depth - List nesting depth (0 outside of any list)
   * @param marker - Bullet or number used if `node` is a list item
   */
  function walk(node: Node, depth: number = 0, marker: string = "-"): void {
    if (node.nodeType === 3) {
      // Text node
      const text = node.textContent?.trim();
      if (text) {
        lines.push(text);
      }
      return;
    }

    if (node.nodeType !== 1) {
      // Comments, processing instructions, etc. carry no content.
      return;
    }

    const el = node as Element;
    const tag = el.tagName?.toLowerCase();

    switch (tag) {
      case "script":
      case "style":
        // Source code / CSS is not page content.
        break;
      case "h1":
      case "h2":
      case "h3":
      case "h4":
      case "h5":
      case "h6": {
        // The heading level is the digit in the tag name.
        const level = Number(tag.charAt(1));
        pushBlock(`${"#".repeat(level)} ${extractTextContent(el)}`);
        break;
      }
      case "p":
        pushBlock(extractTextContent(el));
        break;
      case "br":
        lines.push("");
        break;
      case "a": {
        const href = el.getAttribute("href");
        const text = extractTextContent(el);
        if (href && text) {
          lines.push(`[${text}](${href})`);
        } else if (text) {
          lines.push(text);
        }
        break;
      }
      case "strong":
      case "b":
        pushWrapped("**", el);
        break;
      case "em":
      case "i":
        pushWrapped("*", el);
        break;
      case "code":
        pushWrapped("`", el);
        break;
      case "pre": {
        // Keep the raw text: whitespace is significant inside <pre>. Only
        // leading blank lines and trailing whitespace are stripped.
        const code = (el.textContent ?? "").replace(/^(?:\r?\n)+/, "").trimEnd();
        pushBlock("```", code, "```");
        break;
      }
      case "blockquote":
        lines.push("");
        for (const line of extractTextContent(el).split("\n")) {
          lines.push(`> ${line}`);
        }
        lines.push("");
        break;
      case "ul":
      case "ol": {
        let ordinal = 0;
        lines.push("");
        for (const child of Array.from(el.children)) {
          const isItem = child.tagName?.toLowerCase() === "li";
          if (isItem) {
            ordinal += 1;
          }
          // Only real <li> children of an <ol> receive a number.
          walk(child, depth + 1, tag === "ol" && isItem ? `${ordinal}.` : "-");
        }
        lines.push("");
        break;
      }
      case "li": {
        const prefix = depth > 0 ? " " : "";
        lines.push(`${prefix}${marker} ${extractTextContent(el)}`);
        break;
      }
      case "img": {
        const alt = el.getAttribute("alt") || "";
        const src = el.getAttribute("src") || "";
        lines.push(`![${alt}](${src})`);
        break;
      }
      default:
        // Containers (div, section, article, ...) and unknown elements:
        // recurse into children.
        for (const child of Array.from(el.childNodes)) {
          walk(child, depth);
        }
        break;
    }
  }

  walk(element);

  // Clean up multiple blank lines
  return lines
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}