@wong2kim/wmux 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,520 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.extractMarkdown = extractMarkdown;
4
+ exports.extractStructuredData = extractStructuredData;
5
+ // ---------------------------------------------------------------------------
6
+ // Constants
7
+ // ---------------------------------------------------------------------------
8
/** Character budget applied to extracted markdown when the caller supplies none. */
const DEFAULT_MAX_LENGTH = 4000;
/**
 * CSS selectors for elements that are discarded before conversion —
 * typically page chrome rather than content.
 */
const NOISE_SELECTORS = [
    // Matched by tag name
    ...['script', 'style', 'noscript', 'nav', 'footer', 'header', 'aside', 'svg', 'iframe'],
    // Matched by ARIA landmark role
    ...['navigation', 'banner', 'complementary'].map((role) => `[role="${role}"]`),
    // Explicitly hidden from assistive technology
    '[aria-hidden="true"]',
];
25
/**
 * Convert one serialised DOM node, together with its subtree, into markdown.
 *
 * Text nodes have `type === 3` and a `text` string. Element nodes have
 * `type === 1`, a `tag` (compared against upper-case names such as 'H1'),
 * optional `attrs` and optional `children`. Any other node yields ''.
 *
 * @param {object} node Serialised node to convert.
 * @param {boolean} includeLinks When true, anchors become `[text](href)`;
 *   otherwise only their text is kept.
 * @param {boolean} includeImages When true, images become `![alt](src)`;
 *   otherwise they are dropped.
 * @returns {string} Markdown fragment for the node (may contain surplus
 *   newlines around block elements).
 */
function convertNode(node, includeLinks, includeImages) {
    if (node.type === 3) {
        // Text node — collapse runs of spaces/tabs, keep newlines
        return (node.text ?? '').replace(/[ \t]+/g, ' ');
    }
    if (node.type !== 1 || !node.tag)
        return '';
    const tag = node.tag;
    const children = node.children ?? [];
    const recurse = (child) => convertNode(child, includeLinks, includeImages);
    // Children are rendered on demand: several tags discard them (SCRIPT),
    // or render them differently (PRE, UL/OL), so eager rendering is wasted work.
    const renderChildren = () => children.map(recurse).join('');
    const heading = (level) => `\n\n${'#'.repeat(level)} ${renderChildren().trim()}\n\n`;
    // An empty inline element (e.g. an icon <i></i>) must not emit bare markers
    // such as "**", which would corrupt the surrounding markdown.
    const wrapInline = (marker) => {
        const body = renderChildren().trim();
        return body ? `${marker}${body}${marker}` : '';
    };
    // Continuation lines are indented to the marker width so nested lists and
    // multi-line content stay attached to their list item.
    const listItem = (marker, item) => {
        const pad = ' '.repeat(marker.length);
        const body = (item.children ?? [])
            .map(recurse)
            .join('')
            .trim()
            .replace(/\n(?=[^\n])/g, `\n${pad}`);
        return `${marker}${body}\n`;
    };
    // Verbatim text of a subtree: no markdown markers, no whitespace collapsing.
    const rawText = (n) => {
        if (n.type === 3)
            return n.text ?? '';
        if (n.tag === 'BR')
            return '\n';
        return (n.children ?? []).map(rawText).join('');
    };
    switch (tag) {
        // Headings
        case 'H1':
            return heading(1);
        case 'H2':
            return heading(2);
        case 'H3':
            return heading(3);
        case 'H4':
            return heading(4);
        case 'H5':
            return heading(5);
        case 'H6':
            return heading(6);
        // Paragraphs & generic containers
        case 'P':
            return `\n\n${renderChildren().trim()}\n\n`;
        case 'DIV':
        case 'SECTION':
        case 'ARTICLE':
        case 'MAIN':
            return `\n${renderChildren()}\n`;
        // Inline formatting
        case 'STRONG':
        case 'B':
            return wrapInline('**');
        case 'EM':
        case 'I':
            return wrapInline('*');
        case 'CODE':
            return wrapInline('`');
        case 'DEL':
        case 'S':
            return wrapInline('~~');
        // Line break / rule
        case 'BR':
            return '\n';
        case 'HR':
            return '\n\n---\n\n';
        // Links
        case 'A': {
            const text = renderChildren().trim();
            if (!text)
                return '';
            const href = node.attrs?.['href'] ?? '';
            // URL schemes are case-insensitive and may be preceded by whitespace
            const isScriptUrl = /^\s*javascript:/i.test(href);
            if (includeLinks && href && !isScriptUrl) {
                return `[${text}](${href})`;
            }
            return text;
        }
        // Images
        case 'IMG': {
            if (!includeImages)
                return '';
            const alt = node.attrs?.['alt'] ?? '';
            const src = node.attrs?.['src'] ?? '';
            return `![${alt}](${src})`;
        }
        // Lists — items are rendered here so ordered lists can be numbered
        case 'UL':
        case 'OL': {
            let position = 0;
            let body = '';
            for (const child of children) {
                if (child.type === 1 && child.tag === 'LI') {
                    position += 1;
                    body += listItem(tag === 'OL' ? `${position}. ` : '- ', child);
                }
                else {
                    body += recurse(child);
                }
            }
            return `\n${body}\n`;
        }
        // A list item met outside UL/OL falls back to a bullet
        case 'LI':
            return listItem('- ', node);
        // Blockquote
        case 'BLOCKQUOTE': {
            const lines = renderChildren().trim().split('\n');
            return '\n\n' + lines.map((l) => `> ${l}`).join('\n') + '\n\n';
        }
        // Pre-formatted / code blocks: emit the raw text so the fence does not
        // contain inline backticks and code indentation survives.
        case 'PRE': {
            const code = children
                .map(rawText)
                .join('')
                .replace(/^(?:[ \t]*\n)+/, '')
                .trimEnd();
            return `\n\n\`\`\`\n${code}\n\`\`\`\n\n`;
        }
        // Tables
        case 'TABLE':
            return `\n\n${convertTable(children, includeLinks, includeImages)}\n\n`;
        // Table sub-elements are normally consumed by convertTable; when met
        // on their own they pass their content through.
        case 'THEAD':
        case 'TBODY':
        case 'TFOOT':
        case 'TR':
        case 'TH':
        case 'TD':
            return renderChildren();
        // Ignore certain tags entirely
        case 'SCRIPT':
        case 'STYLE':
        case 'NOSCRIPT':
            return '';
        // Default — pass through inner text
        default:
            return renderChildren();
    }
}
136
+ // ---------------------------------------------------------------------------
137
+ // Table conversion
138
+ // ---------------------------------------------------------------------------
139
/**
 * Gather the child-node arrays of every TR found in `nodes`, in document
 * order. The search descends through non-TR nodes that have children but
 * never into a TR itself.
 *
 * @param {object[]} nodes Serialised nodes to search.
 * @returns {object[][]} One array of child nodes per row.
 */
function collectRows(nodes) {
    const gather = (list) => list.flatMap((entry) => {
        if (entry.tag === 'TR') {
            // Wrapped so flatMap keeps the row's children together as one element
            return [entry.children ?? []];
        }
        return entry.children ? gather(entry.children) : [];
    });
    return gather(nodes);
}
154
/**
 * Render a table cell as single-line markdown suitable for a table row.
 *
 * @param {object} cell Serialised cell node.
 * @param {boolean} includeLinks Forwarded to convertNode.
 * @param {boolean} includeImages Forwarded to convertNode.
 * @returns {string} Cell text with newlines flattened to spaces, trimmed,
 *   and literal pipes escaped.
 */
function cellText(cell, includeLinks, includeImages) {
    return convertNode(cell, includeLinks, includeImages)
        .replace(/\n/g, ' ')
        .trim()
        // A bare "|" would be read as a column separator and break the row
        .replace(/\|/g, '\\|');
}
159
/**
 * Convert the children of a TABLE node into a markdown table. The first
 * collected row is used as the header; shorter rows are padded with empty
 * cells to the widest row.
 *
 * @param {object[]} children Child nodes of the table element.
 * @param {boolean} includeLinks Forwarded to cellText.
 * @param {boolean} includeImages Forwarded to cellText.
 * @returns {string} Markdown table, or '' when there are no rows or no cells.
 */
function convertTable(children, includeLinks, includeImages) {
    const rows = collectRows(children);
    if (rows.length === 0)
        return '';
    const matrix = rows.map((cells) => cells.map((c) => cellText(c, includeLinks, includeImages)));
    // reduce instead of Math.max(...spread): spreading one argument per row
    // can exceed the engine's argument limit on very large tables.
    const colCount = matrix.reduce((widest, row) => Math.max(widest, row.length), 0);
    // Rows without any cells would otherwise yield a meaningless "|  |" table
    if (colCount === 0)
        return '';
    const formatRow = (cells) => `| ${cells.join(' | ')} |`;
    const padded = matrix.map((row) => [...row, ...Array(colCount - row.length).fill('')]);
    // First row is the header
    const [headerRow, ...bodyRows] = padded;
    return [
        formatRow(headerRow),
        formatRow(headerRow.map(() => '---')),
        ...bodyRows.map(formatRow),
    ].join('\n');
}
183
+ // ---------------------------------------------------------------------------
184
+ // Post-processing
185
+ // ---------------------------------------------------------------------------
186
/**
 * Tidy converter output and enforce the length budget.
 *
 * Trailing whitespace is removed from every line, runs of three or more
 * newlines are collapsed to two, and the whole text is trimmed. Text longer
 * than `maxLength` is cut and suffixed with a truncation marker (the marker
 * itself is not counted against the budget).
 *
 * @param {string} md Raw markdown.
 * @param {number} maxLength Maximum number of UTF-16 code units to keep.
 * @returns {string} Cleaned, possibly truncated markdown.
 */
function cleanMarkdown(md, maxLength) {
    let result = md
        // Trim line ends first: whitespace-only lines must become empty
        // before blank-line runs are collapsed, or they survive the collapse.
        .split('\n')
        .map((l) => l.trimEnd())
        .join('\n')
        // Collapse 3+ newlines into 2
        .replace(/\n{3,}/g, '\n\n')
        .trim();
    if (result.length > maxLength) {
        // Do not cut between the halves of a surrogate pair
        const lastUnit = result.charCodeAt(maxLength - 1);
        const splitsPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
        const cut = splitsPair ? maxLength - 1 : maxLength;
        result = result.slice(0, cut) + '\n... (truncated)';
    }
    return result;
}
200
+ // ---------------------------------------------------------------------------
201
+ // Browser-side serialisation function
202
+ // ---------------------------------------------------------------------------
203
/**
 * Returns a string that, when evaluated inside the browser, serialises the
 * DOM rooted at `rootSelector` into a JSON-safe tree structure.
 *
 * The script evaluates to `null` when the root cannot be found. Otherwise it
 * yields nested nodes of the form `{ type: 3, text }` for non-blank text and
 * `{ type: 1, tag, attrs, children }` for elements, where `attrs` carries
 * only `href`, `src` and `alt` when present.
 *
 * Descendants matching any noise selector are left out of the tree. They are
 * skipped during the walk rather than removed, so the page is not modified.
 *
 * @param {string|null} rootSelector CSS selector for the root element;
 *   `document.body` is used when falsy.
 * @param {string[]} noiseSelectors CSS selectors for subtrees to omit.
 * @returns {string} Source text of a self-invoking browser function.
 */
function buildSerialiseScript(rootSelector, noiseSelectors) {
    const rootLookup = rootSelector
        ? `document.querySelector(${JSON.stringify(rootSelector)})`
        : 'document.body';
    // The function body runs inside the browser context
    return `
(() => {
  const NOISE = ${JSON.stringify(noiseSelectors ?? [])};
  const root = ${rootLookup};
  if (!root) return null;

  // One combined selector; null when there is nothing to filter, because
  // matches('') would throw.
  const noiseSelector = NOISE.length > 0 ? NOISE.join(', ') : null;

  function serialise(node) {
    if (node.nodeType === 3) {
      const text = node.textContent || '';
      if (!text.trim()) return null;
      return { type: 3, text };
    }
    if (node.nodeType !== 1) return null;

    const el = node;
    // The root itself is always kept; only its descendants can be noise.
    if (noiseSelector !== null && el !== root && el.matches(noiseSelector)) return null;

    const tag = el.tagName;
    const attrs = {};
    if (el.hasAttribute('href')) attrs['href'] = el.getAttribute('href');
    if (el.hasAttribute('src')) attrs['src'] = el.getAttribute('src');
    if (el.hasAttribute('alt')) attrs['alt'] = el.getAttribute('alt');

    const children = [];
    for (const child of el.childNodes) {
      const s = serialise(child);
      if (s) children.push(s);
    }

    return { type: 1, tag, attrs, children };
  }

  return serialise(root);
})()
`;
}
252
+ // ---------------------------------------------------------------------------
253
+ // Public API
254
+ // ---------------------------------------------------------------------------
255
/**
 * Extract page content as clean markdown.
 *
 * The page is serialised in the browser with the noise selectors applied,
 * the resulting tree is converted to markdown, and the text is cleaned and
 * truncated to the length budget.
 *
 * @param page Object exposing an async `evaluate(script)` method.
 * @param options Optional settings: `maxLength` (default DEFAULT_MAX_LENGTH),
 *   `includeLinks` (default true), `includeImages` (default false) and
 *   `selector` (default null, meaning the whole body).
 * @returns Markdown text, or '' when the root could not be serialised.
 */
async function extractMarkdown(page, options) {
    const settings = {
        maxLength: options?.maxLength ?? DEFAULT_MAX_LENGTH,
        includeLinks: options?.includeLinks ?? true,
        includeImages: options?.includeImages ?? false,
        selector: options?.selector ?? null,
    };
    const tree = await page.evaluate(buildSerialiseScript(settings.selector, NOISE_SELECTORS));
    if (tree) {
        const markdown = convertNode(tree, settings.includeLinks, settings.includeImages);
        return cleanMarkdown(markdown, settings.maxLength);
    }
    return '';
}
274
+ // ---------------------------------------------------------------------------
275
+ // Structured data extraction
276
+ // ---------------------------------------------------------------------------
277
/**
 * Extract structured data from a page based on a set of field definitions.
 *
 * Uses heuristic DOM parsing (NOT LLM). Three strategies are tried in order —
 * tables, list items, then repeated elements — and the first one that yields
 * at least one record wins.
 *
 * @param page   Playwright Page instance
 * @param goal   Human-readable description of what to extract. Accepted for
 *               API compatibility; this function does not currently read it.
 * @param fields Mapping of field names to human descriptions, e.g.
 *               `{ title: "product name", price: "price in USD" }`.
 *               Only the keys are used. A null/undefined or empty mapping
 *               yields an empty result.
 * @returns Array of objects with keys matching `fields`
 */
async function extractStructuredData(page, goal, fields) {
    // Object.keys throws on null/undefined, so treat a missing mapping as empty
    const fieldNames = Object.keys(fields ?? {});
    if (fieldNames.length === 0)
        return [];
    const strategies = [
        extractFromTables,
        extractFromLists,
        extractFromRepeatedElements,
    ];
    for (const strategy of strategies) {
        // Sequential on purpose: a later strategy runs only if the earlier found nothing
        const records = await strategy(page, fieldNames);
        if (records.length > 0)
            return records;
    }
    return [];
}
310
+ // ---------------------------------------------------------------------------
311
+ // Table extraction
312
+ // ---------------------------------------------------------------------------
313
/**
 * Extract records from the first table on the page whose header row matches
 * at least one requested field and which yields at least one non-empty row.
 *
 * Header cells are matched to field names case-insensitively: exact match
 * first, then substring match in either direction. Fields without a matching
 * column, or rows too short to reach it, get `null`.
 *
 * @param page Object exposing an async `evaluate(fn, arg)` method; `fn` runs
 *   in the browser and must be self-contained.
 * @param {string[]} fieldNames Names of the fields to extract.
 * @returns {Promise<object[]>} Extracted records, or [] when no table matches.
 */
async function extractFromTables(page, fieldNames) {
    return await page.evaluate(({ fieldNames: names }) => {
        const cellValue = (cell) => (cell.textContent ?? '').trim();
        for (const table of document.querySelectorAll('table')) {
            // table.rows / row.cells cover only this table's own rows and
            // cells, so content of nested tables is not mixed in.
            const rows = [...table.rows];
            if (rows.length < 2)
                continue;
            // Extract headers from first row
            const headers = [...rows[0].cells].map((cell) => cellValue(cell).toLowerCase());
            if (headers.length === 0)
                continue;
            // Map requested field names to column indices
            const fieldToCol = new Map();
            for (const name of names) {
                const lower = name.toLowerCase();
                // Exact match first
                let idx = headers.indexOf(lower);
                if (idx === -1 && lower !== '') {
                    // Partial match. Empty headers are excluded: ''.includes
                    // semantics would make a blank header match every field.
                    idx = headers.findIndex((h) => h !== '' && (h.includes(lower) || lower.includes(h)));
                }
                if (idx !== -1) {
                    fieldToCol.set(name, idx);
                }
            }
            // Only tables that match at least one field are used
            if (fieldToCol.size === 0)
                continue;
            const results = [];
            for (const row of rows.slice(1)) {
                const cells = row.cells;
                const record = {};
                let hasValue = false;
                for (const name of names) {
                    const colIdx = fieldToCol.get(name);
                    if (colIdx !== undefined && colIdx < cells.length) {
                        const text = cellValue(cells[colIdx]);
                        record[name] = text;
                        if (text)
                            hasValue = true;
                    }
                    else {
                        record[name] = null;
                    }
                }
                // Rows with no non-empty value are dropped
                if (hasValue)
                    results.push(record);
            }
            if (results.length > 0)
                return results;
        }
        return [];
    }, { fieldNames });
}
373
+ // ---------------------------------------------------------------------------
374
+ // List extraction
375
+ // ---------------------------------------------------------------------------
376
/**
 * Extract records from the list (`ul`/`ol`) with the most direct `li`
 * children; the first such list wins ties and at least two items are needed.
 *
 * With a single field, each item's trimmed text is the value. With several
 * fields, values come from the item's direct child elements in order, or —
 * when those give nothing — from splitting the text on `|`, dashes, `:` or
 * `,`. Fields beyond the available segments get `null`. Items without text
 * are skipped.
 *
 * @param page Object exposing an async `evaluate(fn, arg)` method.
 * @param {string[]} fieldNames Names of the fields to extract.
 * @returns {Promise<object[]>} Extracted records, or [] when no list qualifies.
 */
async function extractFromLists(page, fieldNames) {
    return await page.evaluate(({ fieldNames: names }) => {
        const directItems = (list) => list.querySelectorAll(':scope > li');
        const trimmedText = (node) => (node.textContent ?? '').trim();
        // Pick the list with the most direct items
        let winner = null;
        let winnerSize = 0;
        document.querySelectorAll('ul, ol').forEach((candidate) => {
            const size = directItems(candidate).length;
            if (size > winnerSize) {
                winner = candidate;
                winnerSize = size;
            }
        });
        if (winner === null || winnerSize < 2)
            return [];
        const records = [];
        for (const li of directItems(winner)) {
            const fullText = trimmedText(li);
            if (fullText === '')
                continue;
            const record = {};
            if (names.length === 1) {
                // Single field — the whole item text is the value
                record[names[0]] = fullText;
                records.push(record);
                continue;
            }
            // Several fields — prefer the text of direct child elements
            let segments = [];
            if (li.querySelectorAll('*').length > 0) {
                segments = Array.from(li.children, trimmedText).filter((piece) => piece !== '');
            }
            if (segments.length === 0) {
                // Otherwise split the text on common delimiters
                segments = fullText.split(/\s*[|–—:,]\s*/).filter(Boolean);
            }
            names.forEach((name, position) => {
                record[name] = position < segments.length ? segments[position] : null;
            });
            records.push(record);
        }
        return records;
    }, { fieldNames });
}
430
+ // ---------------------------------------------------------------------------
431
+ // Repeated-element extraction (cards, grids, etc.)
432
+ // ---------------------------------------------------------------------------
433
/**
 * Extract records from repeated elements (cards, grid cells, ...).
 *
 * `div`, `li`, `article` and `section` elements are grouped by tag plus
 * exact class string. Groups with three or more members are tried from most
 * to least frequent, and the first group that yields a record wins.
 *
 * For each element, a field's value is looked up by keyword in the field
 * name (title/price/description/link/image hints). The first field of an
 * element that no lookup can fill receives the element's text (at most 200
 * characters); any further unfilled fields are `null`.
 *
 * @param page Object exposing an async `evaluate(fn, arg)` method; `fn` runs
 *   in the browser and must be self-contained.
 * @param {string[]} fieldNames Names of the fields to extract.
 * @returns {Promise<object[]>} Extracted records, or [] when nothing repeats.
 */
async function extractFromRepeatedElements(page, fieldNames) {
    return await page.evaluate(({ fieldNames: names }) => {
        const textOf = (node) => (node ? (node.textContent ?? '').trim() : null);
        // Keyword in the field name → how to read the value from an element.
        // Tried in order until one produces a non-empty value.
        const probes = [
            {
                hint: /title|name|heading/i,
                read: (el) => textOf(el.querySelector('h1, h2, h3, h4, h5, h6') ??
                    el.querySelector('[class*="title"], [class*="name"], [class*="heading"]')),
            },
            {
                hint: /price|cost|amount/i,
                read: (el) => textOf(el.querySelector('[class*="price"], [class*="cost"], [class*="amount"]')),
            },
            {
                hint: /desc|summary|text|content/i,
                read: (el) => textOf(el.querySelector('p, [class*="desc"], [class*="summary"], [class*="text"]')),
            },
            {
                hint: /link|url|href/i,
                read: (el) => el.querySelector('a[href]')?.getAttribute('href') ?? null,
            },
            {
                hint: /image|img|photo|src/i,
                read: (el) => el.querySelector('img[src]')?.getAttribute('src') ?? null,
            },
        ];
        // Count elements per "TAG.class string" to find repeated patterns
        const classCount = new Map();
        for (const el of document.querySelectorAll('div, li, article, section')) {
            const cls = el.className;
            if (typeof cls === 'string' && cls.trim()) {
                const key = el.tagName + '.' + cls.trim();
                classCount.set(key, (classCount.get(key) ?? 0) + 1);
            }
        }
        // Most repeated pattern first; patterns need 3+ occurrences
        const candidates = [...classCount.entries()]
            .filter(([, count]) => count >= 3)
            .sort((a, b) => b[1] - a[1]);
        for (const [tagClass] of candidates) {
            const dotIdx = tagClass.indexOf('.');
            const tag = tagClass.slice(0, dotIdx);
            const classes = tagClass.slice(dotIdx + 1).split(/\s+/).filter(Boolean);
            // Build selector: tag.class1.class2...
            const sel = tag.toLowerCase() + classes.map((c) => '.' + CSS.escape(c)).join('');
            let elements;
            try {
                elements = document.querySelectorAll(sel);
            }
            catch {
                // Best effort: a selector the engine rejects just disqualifies this pattern
                continue;
            }
            if (elements.length < 3)
                continue;
            const results = [];
            for (const el of elements) {
                const record = {};
                let hasValue = false;
                // The full-text fallback is granted once per element, so
                // several unmatched fields do not all repeat the same text.
                let fallbackUsed = false;
                for (const name of names) {
                    const lower = name.toLowerCase();
                    let value = null;
                    for (const { hint, read } of probes) {
                        if (value)
                            break;
                        if (hint.test(lower))
                            value = read(el);
                    }
                    if (!value && !fallbackUsed) {
                        value = (el.textContent ?? '').trim().slice(0, 200);
                        fallbackUsed = true;
                    }
                    record[name] = value || null;
                    if (value)
                        hasValue = true;
                }
                if (hasValue)
                    results.push(record);
            }
            if (results.length > 0)
                return results;
        }
        return [];
    }, { fieldNames });
}