@wong2kim/wmux 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -4
- package/dist/cli/cli/commands/browser.js +101 -77
- package/dist/cli/cli/index.js +6 -6
- package/dist/cli/shared/constants.js +3 -0
- package/dist/cli/shared/rpc.js +15 -4
- package/dist/mcp/mcp/index.js +41 -21
- package/dist/mcp/mcp/playwright/PlaywrightEngine.js +186 -0
- package/dist/mcp/mcp/playwright/anti-detection.js +58 -0
- package/dist/mcp/mcp/playwright/dom-intelligence.js +171 -0
- package/dist/mcp/mcp/playwright/human-typing.js +48 -0
- package/dist/mcp/mcp/playwright/markdown-extractor.js +520 -0
- package/dist/mcp/mcp/playwright/snapshot.js +261 -0
- package/dist/mcp/mcp/playwright/tools/extraction.js +143 -0
- package/dist/mcp/mcp/playwright/tools/file.js +274 -0
- package/dist/mcp/mcp/playwright/tools/inspection.js +395 -0
- package/dist/mcp/mcp/playwright/tools/interaction.js +387 -0
- package/dist/mcp/mcp/playwright/tools/navigation.js +183 -0
- package/dist/mcp/mcp/playwright/tools/state.js +410 -0
- package/dist/mcp/mcp/playwright/tools/utility.js +167 -0
- package/dist/mcp/mcp/playwright/tools/wait.js +111 -0
- package/dist/mcp/shared/constants.js +3 -0
- package/dist/mcp/shared/rpc.js +15 -4
- package/package.json +7 -4
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.extractMarkdown = extractMarkdown;
|
|
4
|
+
exports.extractStructuredData = extractStructuredData;
|
|
5
|
+
// ---------------------------------------------------------------------------
|
|
6
|
+
// Constants
|
|
7
|
+
// ---------------------------------------------------------------------------
|
|
8
|
+
/** Character budget used by extractMarkdown when no `maxLength` option is given. */
const DEFAULT_MAX_LENGTH = 4000;

/**
 * CSS selectors for elements that are excluded before the DOM is converted
 * to markdown: scripts/styles, page chrome and explicitly hidden content.
 */
const NOISE_SELECTORS = [
    // Non-rendered or embedded content
    'script',
    'style',
    'noscript',
    // Structural page chrome
    'nav',
    'footer',
    'header',
    'aside',
    // Graphics and embedded documents
    'svg',
    'iframe',
    // ARIA equivalents of the chrome elements above, plus hidden nodes
    '[role="navigation"]',
    '[role="banner"]',
    '[role="complementary"]',
    '[aria-hidden="true"]',
];
|
|
25
|
+
/**
 * Convert one serialised DOM node (and its subtree) into a markdown fragment.
 *
 * Nodes are plain objects: `{ type: 3, text }` for text and
 * `{ type: 1, tag, attrs, children }` for elements. Tags are matched against
 * upper-case names (`'H1'`, `'P'`, ...); anything else falls through to the
 * default branch and contributes only its children's text.
 *
 * @param {object} node Serialised node to convert.
 * @param {boolean} includeLinks Emit `[text](href)` for anchors; otherwise only the text.
 * @param {boolean} includeImages Emit `![alt](src)` for images; otherwise drop them.
 * @returns {string} Markdown for the node. Block-level output is padded with
 *   newlines and is expected to be normalised by a later clean-up pass.
 */
function convertNode(node, includeLinks, includeImages) {
    if (node.type === 3) {
        // Text node — collapse runs of spaces/tabs (newlines are kept)
        return (node.text ?? '').replace(/[ \t]+/g, ' ');
    }
    if (node.type !== 1 || !node.tag) {
        return '';
    }
    const tag = node.tag;
    const children = node.children ?? [];

    // Children are rendered on demand: branches such as IMG, PRE and TABLE do
    // not use the generic rendering, and computing it eagerly made nested
    // tables/lists re-render their subtree once per nesting level.
    const convertAll = (list) => list.map((c) => convertNode(c, includeLinks, includeImages)).join('');
    const renderChildren = () => convertAll(children);

    // Wrap inline content in a marker, emitting nothing for empty elements so
    // that `<b></b>` does not turn into a stray `****`.
    const wrapInline = (marker) => {
        const text = renderChildren().trim();
        return text ? `${marker}${text}${marker}` : '';
    };

    // Unformatted text of a subtree, used for code blocks where markdown
    // syntax and whitespace collapsing must not be applied.
    const rawText = (n) => {
        if (n.type === 3) {
            return n.text ?? '';
        }
        if (n.tag === 'BR') {
            return '\n';
        }
        return (n.children ?? []).map(rawText).join('');
    };

    switch (tag) {
        // Headings — the digit in the tag name is the heading level
        case 'H1':
        case 'H2':
        case 'H3':
        case 'H4':
        case 'H5':
        case 'H6':
            return `\n\n${'#'.repeat(Number(tag[1]))} ${renderChildren().trim()}\n\n`;

        // Paragraphs & generic containers
        case 'P':
            return `\n\n${renderChildren().trim()}\n\n`;
        case 'DIV':
        case 'SECTION':
        case 'ARTICLE':
        case 'MAIN':
            return `\n${renderChildren()}\n`;

        // Inline formatting
        case 'STRONG':
        case 'B':
            return wrapInline('**');
        case 'EM':
        case 'I':
            return wrapInline('*');
        case 'CODE':
            return wrapInline('`');
        case 'DEL':
        case 'S':
            return wrapInline('~~');

        // Line break / horizontal rule
        case 'BR':
            return '\n';
        case 'HR':
            return '\n\n---\n\n';

        // Links
        case 'A': {
            const text = renderChildren().trim();
            if (!text) {
                return '';
            }
            const href = node.attrs?.['href'] ?? '';
            // URL schemes are case-insensitive and may be preceded by whitespace
            const isScriptUrl = /^\s*javascript:/i.test(href);
            if (includeLinks && href && !isScriptUrl) {
                return `[${text}](${href})`;
            }
            return text;
        }

        // Images
        case 'IMG': {
            if (!includeImages) {
                return '';
            }
            const alt = node.attrs?.['alt'] ?? '';
            const src = node.attrs?.['src'] ?? '';
            // An image without a source cannot be referenced from markdown
            if (!src) {
                return '';
            }
            return `![${alt}](${src})`;
        }

        // Lists — ordered lists are numbered, unordered lists use dashes
        case 'UL':
        case 'OL': {
            let ordinal = 0;
            const items = children
                .map((child) => {
                    if (child.type === 1 && child.tag === 'LI') {
                        ordinal += 1;
                        const marker = tag === 'OL' ? `${ordinal}.` : '-';
                        return `${marker} ${convertAll(child.children ?? []).trim()}\n`;
                    }
                    return convertNode(child, includeLinks, includeImages);
                })
                .join('');
            return `\n${items}\n`;
        }
        // A list item reached outside of a UL/OL parent
        case 'LI':
            return `- ${renderChildren().trim()}\n`;

        // Blockquote — prefix every line of the rendered content
        case 'BLOCKQUOTE': {
            const lines = renderChildren().trim().split('\n');
            return '\n\n' + lines.map((l) => `> ${l}`).join('\n') + '\n\n';
        }

        // Pre-formatted / code blocks: take the raw text so nested <code> or
        // highlighting spans do not inject markdown syntax into the fence and
        // indentation is preserved. Only surrounding blank space is removed.
        case 'PRE': {
            const code = rawText(node).replace(/^\s*\n/, '').trimEnd();
            return `\n\n\`\`\`\n${code}\n\`\`\`\n\n`;
        }

        // Tables
        case 'TABLE':
            return `\n\n${convertTable(children, includeLinks, includeImages)}\n\n`;

        // Table sub-elements are normally consumed by convertTable; when met
        // on their own they just pass their content through.
        case 'THEAD':
        case 'TBODY':
        case 'TFOOT':
        case 'TR':
        case 'TH':
        case 'TD':
            return renderChildren();

        // Ignore certain tags entirely
        case 'SCRIPT':
        case 'STYLE':
        case 'NOSCRIPT':
            return '';

        // Default — pass through inner text
        default:
            return renderChildren();
    }
}
|
|
136
|
+
// ---------------------------------------------------------------------------
|
|
137
|
+
// Table conversion
|
|
138
|
+
// ---------------------------------------------------------------------------
|
|
139
|
+
/**
 * Gather the cell lists of every table row reachable from `nodes`.
 *
 * The tree is walked depth-first in document order. A node tagged `TR`
 * contributes its `children` array (or an empty array when it has none) and is
 * not descended into; any other node with children is searched recursively.
 *
 * @param {object[]} nodes Serialised nodes to search.
 * @returns {object[][]} One array of child nodes per row found.
 */
function collectRows(nodes) {
    return nodes.flatMap((node) => {
        if (node.tag === 'TR') {
            // Wrapped in an array so flatMap keeps the row as a single entry
            return [node.children ?? []];
        }
        return node.children ? collectRows(node.children) : [];
    });
}
|
|
154
|
+
/**
 * Render a table cell as single-line markdown suitable for a pipe table.
 *
 * @param {object} cell Serialised cell node.
 * @param {boolean} includeLinks Forwarded to convertNode.
 * @param {boolean} includeImages Forwarded to convertNode.
 * @returns {string} Cell content with newlines flattened to spaces, literal
 *   pipes escaped and surrounding whitespace removed.
 */
function cellText(cell, includeLinks, includeImages) {
    return convertNode(cell, includeLinks, includeImages)
        .replace(/\n/g, ' ')
        // An unescaped `|` would be read as a column separator by table parsers
        .replace(/\|/g, '\\|')
        .trim();
}
|
|
159
|
+
/**
 * Render the children of a TABLE node as a markdown pipe table.
 *
 * The first row found is used as the header row; shorter rows are padded with
 * empty cells so every line has the same number of columns.
 *
 * @param {object[]} children Child nodes of the table element.
 * @param {boolean} includeLinks Forwarded to cellText.
 * @param {boolean} includeImages Forwarded to cellText.
 * @returns {string} The table as newline-joined lines, or `''` when the table
 *   has no rows or none of its rows has a cell.
 */
function convertTable(children, includeLinks, includeImages) {
    const rows = collectRows(children);
    if (rows.length === 0) {
        return '';
    }
    const matrix = rows.map((cells) => cells.map((c) => cellText(c, includeLinks, includeImages)));

    // Widest row decides the column count. A fold is used instead of
    // Math.max(...spread) so very large tables cannot exceed the engine's
    // argument-count limit.
    const colCount = matrix.reduce((widest, row) => Math.max(widest, row.length), 0);
    if (colCount === 0) {
        // Rows without any cells would otherwise produce a malformed `|  |` table
        return '';
    }

    const toLine = (cells) => `| ${cells.join(' | ')} |`;
    const padded = matrix.map((row) => [...row, ...Array(colCount - row.length).fill('')]);

    // First row is the header, followed by the mandatory separator line
    const [header, ...body] = padded;
    return [toLine(header), toLine(header.map(() => '---')), ...body.map(toLine)].join('\n');
}
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
// Post-processing
|
|
185
|
+
// ---------------------------------------------------------------------------
|
|
186
|
+
/**
 * Normalise whitespace in generated markdown and enforce a length budget.
 *
 * @param {string} md Raw markdown.
 * @param {number} maxLength Maximum number of UTF-16 code units of content to
 *   keep. When the text is longer it is cut and `'\n... (truncated)'` is
 *   appended (the marker is not counted against the budget).
 * @returns {string} Cleaned, possibly truncated markdown.
 */
function cleanMarkdown(md, maxLength) {
    let result = md
        // Strip trailing whitespace first, so lines that contain only spaces
        // become empty and are seen by the blank-line collapse below
        .split('\n')
        .map((line) => line.trimEnd())
        .join('\n')
        // Collapse 3+ newlines into 2
        .replace(/\n{3,}/g, '\n\n')
        .trim();
    if (result.length > maxLength) {
        let cut = maxLength;
        // Do not leave a lone high surrogate at the cut point
        const lastUnit = result.charCodeAt(cut - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            cut -= 1;
        }
        result = result.slice(0, cut) + '\n... (truncated)';
    }
    return result;
}
|
|
200
|
+
// ---------------------------------------------------------------------------
|
|
201
|
+
// Browser-side serialisation function
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
/**
 * Build the source of an expression that, when evaluated inside the browser,
 * serialises the DOM rooted at `rootSelector` into a JSON-safe tree of
 * `{ type: 3, text }` and `{ type: 1, tag, attrs, children }` objects.
 *
 * Elements matching any noise selector are left out of the tree. The page
 * itself is not modified: noise elements are skipped while walking rather than
 * removed from the document, so later interactions with the page still see
 * them. The root element is always kept, even if it matches a noise selector.
 *
 * Whitespace-only text nodes are dropped, except inside `<pre>` where they
 * separate tokens and carry indentation.
 *
 * @param {string|null} rootSelector CSS selector of the subtree to serialise;
 *   a falsy value selects `document.body`.
 * @param {string[]} noiseSelectors CSS selectors of elements to exclude.
 * @returns {string} Script source; it evaluates to the tree, or to `null` when
 *   the root element cannot be found.
 */
function buildSerialiseScript(rootSelector, noiseSelectors) {
    // A single selector list lets each element be tested with one matches() call
    const noiseSelector = noiseSelectors.join(', ');
    const rootExpression = rootSelector
        ? `document.querySelector(${JSON.stringify(rootSelector)})`
        : 'document.body';
    // The function body runs inside the browser context
    return `
(() => {
  const NOISE = ${JSON.stringify(noiseSelector)};
  const root = ${rootExpression};
  if (!root) return null;

  function serialise(node, inPre) {
    if (node.nodeType === 3) {
      const text = node.textContent || '';
      if (!inPre && !text.trim()) return null;
      return { type: 3, text };
    }
    if (node.nodeType !== 1) return null;

    const el = node;
    // Skip noise elements instead of removing them from the live page
    if (el !== root && NOISE && el.matches(NOISE)) return null;

    const tag = el.tagName.toUpperCase();
    const attrs = {};
    for (const name of ['href', 'src', 'alt']) {
      if (el.hasAttribute(name)) attrs[name] = el.getAttribute(name);
    }

    const keepWhitespace = inPre || tag === 'PRE';
    const children = [];
    for (const child of el.childNodes) {
      const s = serialise(child, keepWhitespace);
      if (s) children.push(s);
    }

    return { type: 1, tag, attrs, children };
  }

  return serialise(root, false);
})()
`;
}
|
|
252
|
+
// ---------------------------------------------------------------------------
|
|
253
|
+
// Public API
|
|
254
|
+
// ---------------------------------------------------------------------------
|
|
255
|
+
/**
 * Render a page, or one subtree of it, as cleaned-up markdown.
 *
 * The DOM is serialised in the page via `page.evaluate`, with the elements
 * listed in NOISE_SELECTORS excluded, then converted and normalised here.
 *
 * @param page Object exposing `evaluate(script)`, e.g. a Playwright Page.
 * @param options Optional settings:
 *   - `maxLength`: length budget for the result (default DEFAULT_MAX_LENGTH)
 *   - `includeLinks`: keep link targets (default `true`)
 *   - `includeImages`: keep images (default `false`)
 *   - `selector`: CSS selector of the subtree to extract (default: whole body)
 * @returns Markdown text, or `''` when the serialisation yields nothing.
 */
async function extractMarkdown(page, options) {
    const settings = {
        maxLength: options?.maxLength ?? DEFAULT_MAX_LENGTH,
        includeLinks: options?.includeLinks ?? true,
        includeImages: options?.includeImages ?? false,
        selector: options?.selector ?? null,
    };
    const tree = await page.evaluate(buildSerialiseScript(settings.selector, NOISE_SELECTORS));
    return tree
        ? cleanMarkdown(convertNode(tree, settings.includeLinks, settings.includeImages), settings.maxLength)
        : '';
}
|
|
274
|
+
// ---------------------------------------------------------------------------
|
|
275
|
+
// Structured data extraction
|
|
276
|
+
// ---------------------------------------------------------------------------
|
|
277
|
+
/**
 * Extract structured records from a page for a set of requested fields.
 *
 * Uses heuristic DOM parsing (NOT an LLM). Three strategies are tried in
 * order and the first one that yields at least one record wins:
 *   1. `<table>` elements whose header row matches the field names
 *   2. items of the largest `<ul>`/`<ol>` list
 *   3. repeated elements sharing a tag and class list (cards, grids, ...)
 *
 * @param page   Playwright Page instance
 * @param goal   Human-readable description of what to extract. It is accepted
 *               for interface compatibility but is not consulted by any of
 *               the strategies in this function.
 * @param fields Mapping of field names to human descriptions, e.g.
 *               `{ title: "product name", price: "price in USD" }`. Only the
 *               keys are used; a missing or empty mapping yields `[]`.
 * @returns Array of objects with keys matching `fields`; `[]` when no
 *          strategy finds data.
 */
async function extractStructuredData(page, goal, fields) {
    // Tolerate a missing mapping instead of throwing from Object.keys
    const fieldNames = Object.keys(fields ?? {});
    if (fieldNames.length === 0) {
        return [];
    }
    const strategies = [extractFromTables, extractFromLists, extractFromRepeatedElements];
    // Strategies are fallbacks of one another, so they run one at a time
    for (const strategy of strategies) {
        const records = await strategy(page, fieldNames);
        if (records.length > 0) {
            return records;
        }
    }
    return [];
}
|
|
310
|
+
// ---------------------------------------------------------------------------
|
|
311
|
+
// Table extraction
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
/**
 * Strategy 1: read records from the first `<table>` whose header row matches
 * at least one requested field and that yields at least one non-empty record.
 *
 * Header cells (first row, `th` or `td`) are compared case-insensitively with
 * the field names: an exact match is preferred, otherwise a header that
 * contains the field name or is contained in it. Fields without a matching
 * column, or rows too short to have that column, get `null`.
 *
 * @param page Object exposing `evaluate(fn, arg)`, e.g. a Playwright Page.
 * @param {string[]} fieldNames Requested field names.
 * @returns {Promise<object[]>} Records keyed by field name, or `[]`.
 */
async function extractFromTables(page, fieldNames) {
    // The callback is handed to page.evaluate, so it is written to depend only
    // on its argument and on `document`, never on variables of this module.
    return await page.evaluate(({ fieldNames: names }) => {
        const textOf = (cell) => (cell.textContent ?? '').trim();

        for (const table of document.querySelectorAll('table')) {
            const rows = table.querySelectorAll('tr');
            if (rows.length < 2) {
                continue;
            }

            // Extract headers from first row
            const headers = Array.from(rows[0].querySelectorAll('th, td'), (cell) => textOf(cell).toLowerCase());
            if (headers.length === 0) {
                continue;
            }

            // Map requested field names to column indices
            const fieldToCol = new Map();
            for (const name of names) {
                const lower = name.toLowerCase();
                // Exact match first
                let idx = headers.indexOf(lower);
                if (idx === -1 && lower !== '') {
                    // Partial match. Blank headers are excluded: every string
                    // "includes" the empty string, so a blank header column
                    // would otherwise capture every unmatched field.
                    idx = headers.findIndex((h) => h !== '' && (h.includes(lower) || lower.includes(h)));
                }
                if (idx !== -1) {
                    fieldToCol.set(name, idx);
                }
            }

            // Only tables that match at least one field are read
            if (fieldToCol.size === 0) {
                continue;
            }

            const results = [];
            for (let i = 1; i < rows.length; i++) {
                const cells = rows[i].querySelectorAll('td, th');
                const record = {};
                let hasValue = false;
                for (const name of names) {
                    const colIdx = fieldToCol.get(name);
                    if (colIdx !== undefined && colIdx < cells.length) {
                        const text = textOf(cells[colIdx]);
                        record[name] = text;
                        if (text) {
                            hasValue = true;
                        }
                    }
                    else {
                        record[name] = null;
                    }
                }
                // Rows in which every requested field is empty are dropped
                if (hasValue) {
                    results.push(record);
                }
            }
            if (results.length > 0) {
                return results;
            }
        }
        return [];
    }, { fieldNames });
}
|
|
373
|
+
// ---------------------------------------------------------------------------
|
|
374
|
+
// List extraction
|
|
375
|
+
// ---------------------------------------------------------------------------
|
|
376
|
+
/**
 * Strategy 2: read records from the items of the largest `<ul>`/`<ol>` on the
 * page (by number of direct `<li>` children; at least two are required).
 *
 * With a single requested field each item's full text is used. With several
 * fields the item is split into segments — the text of its direct child
 * elements, or, when that gives nothing, its text split on `|`, `–`, `—`, `:`
 * and `,` — and segments are assigned to the fields in order; fields beyond
 * the last segment get `null`. Items that produce no segment are skipped.
 *
 * @param page Object exposing `evaluate(fn, arg)`, e.g. a Playwright Page.
 * @param {string[]} fieldNames Requested field names.
 * @returns {Promise<object[]>} Records keyed by field name, or `[]`.
 */
async function extractFromLists(page, fieldNames) {
    // The callback is handed to page.evaluate, so it is written to depend only
    // on its argument and on `document`, never on variables of this module.
    return await page.evaluate(({ fieldNames: names }) => {
        const textOf = (node) => (node.textContent ?? '').trim();

        // Find the largest list; on a tie the earliest one in the document wins
        let bestItems = null;
        for (const list of document.querySelectorAll('ul, ol')) {
            const items = list.querySelectorAll(':scope > li');
            if (items.length > (bestItems?.length ?? 0)) {
                bestItems = items;
            }
        }
        if (!bestItems || bestItems.length < 2) {
            return [];
        }

        const results = [];
        for (const item of bestItems) {
            const text = textOf(item);
            if (!text) {
                continue;
            }
            if (names.length === 1) {
                // Single field — map entire text
                results.push({ [names[0]]: text });
                continue;
            }

            // Multiple fields — prefer the text of direct child elements
            const segments = [];
            for (const child of item.children) {
                const t = textOf(child);
                if (t) {
                    segments.push(t);
                }
            }
            if (segments.length === 0) {
                // Split on common delimiters
                segments.push(...text.split(/\s*[|–—:,]\s*/).filter(Boolean));
            }
            if (segments.length === 0) {
                // Text made only of delimiters: nothing to map, so no record
                continue;
            }

            const record = {};
            names.forEach((name, i) => {
                record[name] = i < segments.length ? segments[i] : null;
            });
            results.push(record);
        }
        return results;
    }, { fieldNames });
}
|
|
430
|
+
// ---------------------------------------------------------------------------
|
|
431
|
+
// Repeated-element extraction (cards, grids, etc.)
|
|
432
|
+
// ---------------------------------------------------------------------------
|
|
433
|
+
/**
 * Strategy 3: read records from repeated elements (cards, grid cells, ...).
 *
 * `div`, `li`, `article` and `section` elements are grouped by tag name and
 * class list. Groups with at least three members are tried from the most to
 * the least frequent; the first group that yields a record wins.
 *
 * Within each element a field's value is looked up by matching the field name
 * against keyword families (title/name/heading, price/cost/amount,
 * desc/summary/text/content, link/url/href, image/img/photo/src) and querying
 * the corresponding child element. The first field of a record that finds
 * nothing this way receives the element's text (at most 200 characters);
 * further unmatched fields are `null`.
 *
 * @param page Object exposing `evaluate(fn, arg)`, e.g. a Playwright Page.
 * @param {string[]} fieldNames Requested field names.
 * @returns {Promise<object[]>} Records keyed by field name, or `[]`.
 */
async function extractFromRepeatedElements(page, fieldNames) {
    // The callback is handed to page.evaluate, so it is written to depend only
    // on its argument and on page globals (`document`, `CSS`), never on
    // variables of this module.
    return await page.evaluate(({ fieldNames: names }) => {
        const textOf = (node) => (node ? (node.textContent ?? '').trim() : null);

        // Keyword family -> how to pull the value out of a repeated element.
        // Rules are tried in this order until one produces a non-empty value.
        const rules = [
            {
                pattern: /title|name|heading/i,
                pick: (el) => textOf(el.querySelector('h1, h2, h3, h4, h5, h6') ??
                    el.querySelector('[class*="title"], [class*="name"], [class*="heading"]')),
            },
            {
                pattern: /price|cost|amount/i,
                pick: (el) => textOf(el.querySelector('[class*="price"], [class*="cost"], [class*="amount"]')),
            },
            {
                pattern: /desc|summary|text|content/i,
                pick: (el) => textOf(el.querySelector('p, [class*="desc"], [class*="summary"], [class*="text"]')),
            },
            {
                pattern: /link|url|href/i,
                pick: (el) => el.querySelector('a[href]')?.getAttribute('href') ?? null,
            },
            {
                pattern: /image|img|photo|src/i,
                pick: (el) => el.querySelector('img[src]')?.getAttribute('src') ?? null,
            },
        ];

        // Count elements per tag + class list. Class names are split on
        // whitespace so differently spaced class attributes share one group.
        const groups = new Map();
        for (const el of document.querySelectorAll('div, li, article, section')) {
            const cls = el.className;
            // className is not a string for every element type (e.g. SVG)
            if (typeof cls !== 'string' || !cls.trim()) {
                continue;
            }
            const classes = cls.trim().split(/\s+/);
            const key = `${el.tagName}.${classes.join(' ')}`;
            const group = groups.get(key);
            if (group) {
                group.count += 1;
            }
            else {
                groups.set(key, { tag: el.tagName, classes, count: 1 });
            }
        }

        // Most repeated pattern first; fewer than 3 occurrences is not a pattern
        const candidates = [...groups.values()]
            .filter((group) => group.count >= 3)
            .sort((a, b) => b.count - a.count);

        for (const { tag, classes } of candidates) {
            // Build selector: tag.class1.class2...
            const sel = tag.toLowerCase() + classes.map((c) => '.' + CSS.escape(c)).join('');
            let elements;
            try {
                elements = document.querySelectorAll(sel);
            }
            catch {
                // Best effort: a selector the engine rejects disqualifies this
                // candidate only; the remaining candidates are still tried.
                continue;
            }
            if (elements.length < 3) {
                continue;
            }

            const results = [];
            for (const el of elements) {
                const record = {};
                let hasValue = false;
                let fallbackUsed = false;
                for (const name of names) {
                    const lower = name.toLowerCase();
                    let value = null;
                    for (const rule of rules) {
                        if (value) {
                            break;
                        }
                        if (rule.pattern.test(lower)) {
                            value = rule.pick(el);
                        }
                    }
                    // Fallback: use full text for the first unmatched field only,
                    // so the same blob is not copied into every unknown field
                    if (!value && !fallbackUsed) {
                        value = (el.textContent ?? '').trim().slice(0, 200);
                        fallbackUsed = true;
                    }
                    record[name] = value || null;
                    if (value) {
                        hasValue = true;
                    }
                }
                if (hasValue) {
                    results.push(record);
                }
            }
            if (results.length > 0) {
                return results;
            }
        }
        return [];
    }, { fieldNames });
}
|