maxun-core 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +527 -70
- package/build/interpret.d.ts +2 -0
- package/build/interpret.js +23 -1
- package/package.json +1 -1
|
@@ -159,6 +159,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
159
159
|
* @returns {Array.<Object.<string, string>>}
|
|
160
160
|
*/
|
|
161
161
|
window.scrapeSchema = function (lists) {
|
|
162
|
+
// Generic object helpers: map and filter over an object's entries
|
|
162
163
|
function omap(object, f, kf = (x) => x) {
|
|
163
164
|
return Object.fromEntries(Object.entries(object)
|
|
164
165
|
.map(([k, v]) => [kf(k), f(v)]));
|
|
@@ -167,15 +168,121 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
167
168
|
return Object.fromEntries(Object.entries(object)
|
|
168
169
|
.filter(([k, v]) => f(k, v)));
|
|
169
170
|
}
|
|
171
|
+
/**
 * Resolves every element targeted by a field config.
 *
 * Three selector flavours are recognised:
 *  - plain CSS: queried directly against the top-level document;
 *  - `a :>> b`: each `:>>` steps into the document of the iframe(s) matched
 *    by the segment before it;
 *  - `a >> b`: each `>>` steps into the shadow root of the element(s) matched
 *    by the segment before it.
 *
 * @param {{selector: string}} config Field configuration.
 * @returns {Array<Element>} Matching elements; empty when nothing matches.
 */
function findAllElements(config) {
    const selector = config.selector;
    // ':>>' itself contains '>>', so one check rules out both delimiters.
    if (!selector.includes('>>')) {
        return Array.from(document.querySelectorAll(selector));
    }
    // Iframe traversal takes precedence when the iframe delimiter is present.
    if (selector.includes(':>>')) {
        const parts = selector.split(':>>').map((s) => s.trim());
        let currentElements = [document];
        for (let i = 0; i < parts.length; i++) {
            const part = parts[i];
            const isLast = i === parts.length - 1;
            const nextElements = [];
            for (const element of currentElements) {
                try {
                    // Prefer the iframe's document, then its window's document;
                    // fall back to the node itself (the top-level document on
                    // the first pass).
                    let doc = element.contentDocument;
                    if (!doc && element.contentWindow) {
                        doc = element.contentWindow.document;
                    }
                    if (!doc) {
                        doc = element;
                    }
                    const found = Array.from(doc.querySelectorAll(part));
                    // Intermediate segments only ever lead on through iframes.
                    nextElements.push(...(isLast ? found : found.filter((el) => el.tagName === 'IFRAME')));
                }
                catch (error) {
                    console.warn('Cannot access iframe content:', error, {
                        part,
                        element,
                        index: i
                    });
                }
            }
            if (nextElements.length === 0) {
                console.warn('No elements found for part:', part, 'at depth:', i);
                return [];
            }
            currentElements = nextElements;
        }
        return currentElements;
    }
    // Shadow DOM traversal.
    const parts = selector.split('>>').map((s) => s.trim());
    let currentElements = [document];
    parts.forEach((part, index) => {
        const isLast = index === parts.length - 1;
        const nextElements = [];
        for (const element of currentElements) {
            for (const foundEl of element.querySelectorAll(part)) {
                // Descend into a shadow root only when more segments follow.
                // A match for the final segment is a result in its own right,
                // even if it happens to be a shadow host.
                nextElements.push(!isLast && foundEl.shadowRoot ? foundEl.shadowRoot : foundEl);
            }
        }
        currentElements = nextElements;
    });
    return currentElements;
}
|
|
245
|
+
// Modified to handle iframe context for URL resolution
|
|
246
|
+
/**
 * Reads one value from an element.
 *
 * `href` and `src` are resolved to absolute URLs against the location of the
 * element's own document, falling back to the top window's origin.
 * `innerText` and `textContent` are returned trimmed. Any other attribute is
 * read with `getAttribute`, falling back to the trimmed `innerText` when the
 * attribute is missing or empty.
 *
 * @param {Element|null|undefined} element Element to read from.
 * @param {string} attribute Attribute or pseudo-attribute name.
 * @returns {string|null|undefined} The value; `null` for a missing element or
 *   a missing `href`/`src`; `undefined` when the requested text is not defined.
 */
function getElementValue(element, attribute) {
    if (!element) {
        return null;
    }
    const trimmed = (text) => (text === null || text === undefined ? undefined : text.trim());
    switch (attribute) {
        case 'href':
        case 'src': {
            const raw = element.getAttribute(attribute);
            if (!raw) {
                return null;
            }
            const ownerLocation = element.ownerDocument && element.ownerDocument.location;
            const baseURL = (ownerLocation && ownerLocation.href) || window.location.origin;
            try {
                return new URL(raw, baseURL).href;
            }
            catch (error) {
                // The URL constructor throws when the value cannot be resolved
                // against the base (e.g. a relative path in an about:srcdoc
                // frame). Keep the raw attribute rather than abort the scrape.
                return raw;
            }
        }
        case 'innerText':
            return trimmed(element.innerText);
        case 'textContent':
            return trimmed(element.textContent);
        default:
            return element.getAttribute(attribute) || trimmed(element.innerText);
    }
}
|
|
269
|
+
// Seed-field selection and minimal-bounding-element helpers
|
|
170
270
|
/**
 * Picks the "seed" field: the key whose selector matches the most elements.
 * When several fields tie, the first one in key order wins.
 *
 * @param {Object<string, {selector: string}>} listObj Field configs by key.
 * @returns {string|undefined} The seed key, or `undefined` for an empty object.
 */
function getSeedKey(listObj) {
    let seedKey;
    let maxLength = -Infinity;
    // Single pass: each selector is resolved once instead of once to find the
    // maximum and a second time to find which key produced it.
    for (const [key, config] of Object.entries(listObj)) {
        const count = findAllElements(config).length;
        // Strict '>' keeps the first key among ties.
        if (count > maxLength) {
            maxLength = count;
            seedKey = key;
        }
    }
    return seedKey;
}
|
|
274
|
+
// Find minimal bounding elements
|
|
174
275
|
function getMBEs(elements) {
|
|
175
276
|
return elements.map((element) => {
|
|
176
277
|
let candidate = element;
|
|
177
278
|
const isUniqueChild = (e) => elements
|
|
178
|
-
.filter((elem) => {
|
|
279
|
+
.filter((elem) => {
|
|
280
|
+
var _a;
|
|
281
|
+
// Handle both iframe and shadow DOM boundaries
|
|
282
|
+
const sameContext = elem.getRootNode() === e.getRootNode() &&
|
|
283
|
+
elem.ownerDocument === e.ownerDocument;
|
|
284
|
+
return sameContext && ((_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem));
|
|
285
|
+
})
|
|
179
286
|
.length === 1;
|
|
180
287
|
while (candidate && isUniqueChild(candidate)) {
|
|
181
288
|
candidate = candidate.parentNode;
|
|
@@ -184,28 +291,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
184
291
|
});
|
|
185
292
|
}
|
|
186
293
|
const seedName = getSeedKey(lists);
|
|
187
|
-
const seedElements =
|
|
294
|
+
const seedElements = findAllElements(lists[seedName]);
|
|
188
295
|
const MBEs = getMBEs(seedElements);
|
|
189
|
-
|
|
190
|
-
const elem =
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
296
|
+
const mbeResults = MBEs.map((mbe) => omap(lists, (config) => {
|
|
297
|
+
const elem = findAllElements(config)
|
|
298
|
+
.find((elem) => mbe.contains(elem));
|
|
299
|
+
return elem ? getElementValue(elem, config.attribute) : undefined;
|
|
300
|
+
}, (key) => key)) || [];
|
|
301
|
+
// If MBE approach didn't find all elements, try independent scraping
|
|
302
|
+
if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
|
|
303
|
+
// Fall back to independent scraping
|
|
304
|
+
const results = [];
|
|
305
|
+
const foundElements = new Map();
|
|
306
|
+
// Find all elements for each selector
|
|
307
|
+
Object.entries(lists).forEach(([key, config]) => {
|
|
308
|
+
const elements = findAllElements(config);
|
|
309
|
+
foundElements.set(key, elements);
|
|
310
|
+
});
|
|
311
|
+
// Create result objects for each found element
|
|
312
|
+
foundElements.forEach((elements, key) => {
|
|
313
|
+
elements.forEach((element, index) => {
|
|
314
|
+
if (!results[index]) {
|
|
315
|
+
results[index] = {};
|
|
316
|
+
}
|
|
317
|
+
results[index][key] = getElementValue(element, lists[key].attribute);
|
|
318
|
+
});
|
|
319
|
+
});
|
|
320
|
+
return results.filter(result => Object.keys(result).length > 0);
|
|
321
|
+
}
|
|
322
|
+
return mbeResults;
|
|
209
323
|
};
|
|
210
324
|
/**
|
|
211
325
|
* Scrapes multiple lists of similar items based on a template item.
|
|
@@ -218,67 +332,410 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
218
332
|
*/
|
|
219
333
|
window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
|
|
220
334
|
return __awaiter(this, void 0, void 0, function* () {
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
335
|
+
// Enhanced query function to handle both iframe and shadow DOM
|
|
336
|
+
/**
 * Finds the first element for a selector that may cross iframe (`:>>`) or
 * shadow DOM (`>>`) boundaries.
 *
 * Plain selectors go straight to `rootElement.querySelector`. Otherwise the
 * selector is split on the delimiters and walked segment by segment: an
 * IFRAME node is searched through its document; any other node is searched
 * in its own subtree, then in its shadow root, then in the shadow roots of
 * its direct children.
 *
 * @param {ParentNode} rootElement Node the search starts from.
 * @param {string} selector Selector, optionally containing `>>` / `:>>`.
 * @returns {Element|null} The match, or `null` when any segment fails.
 */
const queryElement = (rootElement, selector) => {
    // ':>>' contains '>>', so this single test covers both delimiters.
    const isPlainSelector = !selector.includes('>>');
    if (isPlainSelector) {
        return rootElement.querySelector(selector);
    }
    const segments = selector.split(/(?:>>|:>>)/).map((segment) => segment.trim());
    let node = rootElement;
    for (const segment of segments) {
        if (!node) {
            return null;
        }
        if (node.tagName === 'IFRAME') {
            try {
                const frameDocument = node.contentDocument || node.contentWindow.document;
                node = frameDocument.querySelector(segment);
            }
            catch (e) {
                console.warn('Cannot access iframe content:', e);
                return null;
            }
            continue;
        }
        let match = node.querySelector(segment);
        if (!match && node.shadowRoot) {
            match = node.shadowRoot.querySelector(segment);
        }
        if (!match) {
            // Last resort: shadow roots hosted by the node's direct children.
            for (const child of Array.from(node.children || [])) {
                if (!child.shadowRoot) {
                    continue;
                }
                match = child.shadowRoot.querySelector(segment);
                if (match) {
                    break;
                }
            }
        }
        node = match;
    }
    return node;
};
|
|
378
|
+
// Enhanced query all function for both contexts
|
|
379
|
+
/**
 * Finds all elements for a selector that may cross iframe (`:>>`) or shadow
 * DOM (`>>`) boundaries.
 *
 * Plain selectors return `rootElement.querySelectorAll(selector)` unchanged.
 * Otherwise the selector is split on the delimiters and every segment is
 * applied to all nodes found by the previous one: IFRAME nodes are searched
 * through their document; other nodes through their own subtree, their shadow
 * root and the shadow roots of their direct children.
 *
 * @param {ParentNode} rootElement Node the search starts from.
 * @param {string} selector Selector, optionally containing `>>` / `:>>`.
 * @returns {NodeList|Array<Element>} Matches in discovery order, without
 *   duplicates on the delimiter path.
 */
const queryElementAll = (rootElement, selector) => {
    // ':>>' contains '>>', so this single test covers both delimiters.
    if (!selector.includes('>>')) {
        return rootElement.querySelectorAll(selector);
    }
    const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
    let currentElements = [rootElement];
    for (const part of parts) {
        // A Set keeps first-seen order and drops repeats. Without it, nested
        // intermediate matches (an outer and an inner element that both
        // contain the same descendant) would report that descendant twice.
        const nextElements = new Set();
        const collect = (scope) => {
            for (const match of scope.querySelectorAll(part)) {
                nextElements.add(match);
            }
        };
        for (const element of currentElements) {
            if (element.tagName === 'IFRAME') {
                try {
                    collect(element.contentDocument || element.contentWindow.document);
                }
                catch (e) {
                    console.warn('Cannot access iframe content:', e);
                }
                continue;
            }
            if (element.querySelectorAll) {
                collect(element);
            }
            if (element.shadowRoot) {
                collect(element.shadowRoot);
            }
            for (const child of Array.from(element.children || [])) {
                if (child.shadowRoot) {
                    collect(child.shadowRoot);
                }
            }
        }
        currentElements = Array.from(nextElements);
    }
    return currentElements;
};
|
|
421
|
+
// Enhanced value extraction with context awareness
|
|
422
|
+
/**
 * Extracts a field value from an element.
 *
 * Order of precedence:
 *  1. if the element hosts a shadow root with non-blank text, that trimmed
 *     text is returned regardless of `attribute`;
 *  2. `innerText` / `innerHTML` return the trimmed property;
 *  3. `src` / `href` are resolved to absolute URLs against the location of
 *     the element's own document (falling back to the top window's origin);
 *  4. anything else is read with `getAttribute`.
 *
 * @param {Element|null|undefined} element Element to read from.
 * @param {string} attribute Attribute or pseudo-attribute name.
 * @returns {string|null} Extracted value; `null` when the element, URL
 *   attribute or plain attribute is absent.
 */
function extractValue(element, attribute) {
    if (!element) {
        return null;
    }
    if (element.shadowRoot) {
        const shadowContent = element.shadowRoot.textContent;
        if (shadowContent && shadowContent.trim()) {
            return shadowContent.trim();
        }
    }
    if (attribute === 'innerText') {
        // Not every element exposes innerText (SVG elements do not); fall
        // back to textContent instead of throwing on undefined.trim().
        const text = typeof element.innerText === 'string' ? element.innerText : element.textContent;
        return typeof text === 'string' ? text.trim() : '';
    }
    if (attribute === 'innerHTML') {
        return typeof element.innerHTML === 'string' ? element.innerHTML.trim() : '';
    }
    if (attribute === 'src' || attribute === 'href') {
        const attrValue = element.getAttribute(attribute);
        if (!attrValue) {
            return null;
        }
        const ownerLocation = element.ownerDocument && element.ownerDocument.location;
        const baseURL = (ownerLocation && ownerLocation.href) || window.location.origin;
        try {
            return new URL(attrValue, baseURL).href;
        }
        catch (error) {
            // Unresolvable against this base (e.g. about:srcdoc): keep the
            // raw attribute value rather than abort the whole scrape.
            return attrValue;
        }
    }
    return element.getAttribute(attribute);
}
|
|
447
|
+
// Enhanced table ancestor finding with context support
|
|
448
|
+
/**
 * Looks for the nearest table cell or row around an element.
 *
 * The element itself and up to MAX_DEPTH - 1 further steps are inspected.
 * For a node inside a shadow tree, its in-tree ancestors are inspected first
 * (within the same depth budget); if none is a cell or row the walk continues
 * from the shadow host, and that hop does not consume depth. An IFRAME node
 * is replaced by its document body.
 *
 * @param {Element} element Starting element.
 * @returns {{type: 'TD'|'TR', element: Element}|null} The cell or row found,
 *   or `null` when there is none in range or an iframe cannot be read.
 */
function findTableAncestor(element) {
    const MAX_DEPTH = 5;
    const asTableNode = (node) => {
        if (node.tagName === 'TD') {
            return { type: 'TD', element: node };
        }
        if (node.tagName === 'TR') {
            return { type: 'TR', element: node };
        }
        return null;
    };
    let currentElement = element;
    let depth = 0;
    while (currentElement && depth < MAX_DEPTH) {
        // Test the node before any boundary hop; hopping first meant a TD or
        // TR living inside a shadow tree could never be recognised.
        const own = asTableNode(currentElement);
        if (own) {
            return own;
        }
        const root = currentElement.getRootNode();
        if (root instanceof ShadowRoot) {
            let probe = currentElement.parentElement;
            for (let steps = depth + 1; probe && steps < MAX_DEPTH; steps++) {
                const inner = asTableNode(probe);
                if (inner) {
                    return inner;
                }
                probe = probe.parentElement;
            }
            // Nothing inside this shadow tree: continue from its host.
            currentElement = root.host;
            continue;
        }
        if (currentElement.tagName === 'IFRAME') {
            try {
                currentElement = currentElement.contentDocument.body;
            }
            catch (e) {
                return null;
            }
        }
        else {
            currentElement = currentElement.parentElement;
        }
        depth++;
    }
    return null;
}
|
|
480
|
+
// Helper function to get cell index
|
|
481
|
+
/**
 * Returns the position of a cell among its element siblings, i.e. the number
 * of element siblings that precede it.
 *
 * The same rule applies inside shadow trees. Counting the cell's position
 * among every `td` of the shadow root gives a flat index across all rows,
 * which is not a usable index into a single row's children.
 *
 * @param {Element} td Cell element.
 * @returns {number} Zero-based sibling index.
 */
function getCellIndex(td) {
    let index = 0;
    for (let sibling = td.previousElementSibling; sibling; sibling = sibling.previousElementSibling) {
        index++;
    }
    return index;
}
|
|
494
|
+
// Helper function to check for TH elements
|
|
495
|
+
/**
 * Tells whether any table field resolves, within `row`, to an element that is
 * a TH or sits inside one.
 *
 * For each field the element found by `queryElement(row, selector)` is walked
 * upwards until `row` is reached. The walk leaves a shadow tree through its
 * host once the top of that tree is reached, and replaces a light-DOM IFRAME
 * node by its document body.
 *
 * @param {Element} row Row to inspect.
 * @param {Object<string, {selector: string}>} tableFields Field configs.
 * @returns {boolean} `true` as soon as one field is found under a TH.
 */
function hasThElement(row, tableFields) {
    for (const { selector } of Object.values(tableFields)) {
        let current = queryElement(row, selector);
        while (current && current !== row) {
            // Test the node before crossing any boundary; hopping to the
            // shadow host first skipped every TH inside the shadow tree.
            if (current.tagName === 'TH') {
                return true;
            }
            const root = current.getRootNode();
            const insideShadow = root instanceof ShadowRoot;
            if (!insideShadow && current.tagName === 'IFRAME') {
                try {
                    current = current.contentDocument.body;
                }
                catch (e) {
                    break;
                }
                continue;
            }
            if (current.parentElement) {
                current = current.parentElement;
            }
            else {
                // Top of the tree: leave a shadow tree via its host, or stop.
                current = insideShadow ? root.host : null;
            }
        }
    }
    return false;
}
|
|
523
|
+
// Helper function to filter rows
|
|
524
|
+
/**
 * Decides which rows take part in extraction.
 *
 * If any row has a field that lives under a TH (per `hasThElement`), the rows
 * are returned untouched. Otherwise every row that contains a TH, either in
 * its own subtree or in its shadow root, is dropped.
 *
 * @param {Array<Element>} rows Candidate rows.
 * @param {Object} tableFields Field configs passed through to `hasThElement`.
 * @returns {Array<Element>} The same array, or a filtered copy.
 */
function filterRowsBasedOnTag(rows, tableFields) {
    const fieldTargetsHeader = rows.some((row) => hasThElement(row, tableFields));
    if (fieldTargetsHeader) {
        return rows;
    }
    const hasNoHeaderCell = (row) => {
        if (row.getElementsByTagName('TH').length !== 0) {
            return false;
        }
        return !row.shadowRoot || row.shadowRoot.querySelector('th') === null;
    };
    return rows.filter(hasNoHeaderCell);
}
|
|
538
|
+
// Class similarity comparison functions
|
|
539
|
+
/**
 * Jaccard similarity of two class lists: shared names divided by distinct
 * names overall. Duplicates within a list are ignored.
 *
 * @param {Iterable<string>} classList1 First class list.
 * @param {Iterable<string>} classList2 Second class list.
 * @returns {number} Value in [0, 1]; 0 when both lists are empty (the plain
 *   ratio would be 0/0, i.e. NaN).
 */
function calculateClassSimilarity(classList1, classList2) {
    const set1 = new Set(classList1);
    const set2 = new Set(classList2);
    let shared = 0;
    for (const name of set1) {
        if (set2.has(name)) {
            shared++;
        }
    }
    // |A ∪ B| = |A| + |B| - |A ∩ B|
    const unionSize = set1.size + set2.size - shared;
    return unionSize === 0 ? 0 : shared / unionSize;
}
|
|
546
|
+
// Enhanced similar elements finding with context support
|
|
547
|
+
/**
 * Finds elements with the same tag as `baseElement` whose class list is at
 * least `similarityThreshold` similar (per `calculateClassSimilarity`).
 *
 * Candidates come from the main document, from the shadow tree that contains
 * `baseElement` (plus its host's light DOM) when it lives in one, and from
 * every iframe of the main document whose content can be read. Each candidate
 * is considered once; `baseElement` itself is never returned.
 *
 * @param {Element} baseElement Reference element.
 * @param {number} [similarityThreshold=0.7] Minimum similarity, inclusive.
 * @returns {Array<Element>} Similar elements; empty when `baseElement` has no
 *   classes.
 */
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
    const baseClasses = Array.from(baseElement.classList);
    if (baseClasses.length === 0) {
        return [];
    }
    const tagName = baseElement.tagName;
    // A Set keeps discovery order and prevents an element reachable through
    // two sources from being reported twice.
    const candidates = new Set(document.getElementsByTagName(tagName));
    const root = baseElement.getRootNode();
    if (root instanceof ShadowRoot) {
        for (const element of root.host.getElementsByTagName(tagName)) {
            candidates.add(element);
        }
        // The host's light DOM does not include the shadow tree itself, so
        // the siblings of baseElement have to be collected from the shadow
        // root. ShadowRoot offers querySelectorAll but not
        // getElementsByTagName, hence the explicit tag filter.
        for (const element of root.querySelectorAll('*')) {
            if (element.tagName === tagName) {
                candidates.add(element);
            }
        }
    }
    for (const iframe of document.getElementsByTagName('iframe')) {
        try {
            const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
            for (const element of iframeDoc.getElementsByTagName(tagName)) {
                candidates.add(element);
            }
        }
        catch (e) {
            console.warn('Cannot access iframe content:', e);
        }
    }
    candidates.delete(baseElement);
    return Array.from(candidates).filter((element) => {
        const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
        return similarity >= similarityThreshold;
    });
}
|
|
577
|
+
// Main scraping logic with context support
|
|
578
|
+
let containers = queryElementAll(document, listSelector);
|
|
579
|
+
containers = Array.from(containers);
|
|
580
|
+
if (containers.length === 0)
|
|
581
|
+
return [];
|
|
582
|
+
if (limit > 1 && containers.length === 1) {
|
|
583
|
+
const baseContainer = containers[0];
|
|
584
|
+
const similarContainers = findSimilarElements(baseContainer);
|
|
585
|
+
if (similarContainers.length > 0) {
|
|
586
|
+
const newContainers = similarContainers.filter(container => !container.matches(listSelector));
|
|
587
|
+
containers = [...containers, ...newContainers];
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
// Classify fields per container. A field whose sample element has a TD/TR
// around it (per findTableAncestor) is a table field and is annotated with
// that context; every other field, including one with no sample element in
// the container, is a non-table field.
const containerFields = containers.map((container) => {
    const tableFields = {};
    const nonTableFields = {};
    for (const [label, field] of Object.entries(fields)) {
        const sampleElement = queryElement(container, field.selector);
        const ancestor = sampleElement ? findTableAncestor(sampleElement) : null;
        if (!ancestor) {
            nonTableFields[label] = field;
            continue;
        }
        // cellIndex is only meaningful for a TD context; -1 marks a TR one.
        const cellIndex = ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1;
        tableFields[label] = Object.assign({}, field, { tableContext: ancestor.type, cellIndex });
    }
    return { tableFields, nonTableFields };
});
|
|
612
|
+
const tableData = [];
const nonTableData = [];
// Process table data, one container at a time.
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
    const container = containers[containerIndex];
    const { tableFields } = containerFields[containerIndex];
    if (Object.keys(tableFields).length === 0) {
        continue;
    }
    // Walk up from the first table field's element to the enclosing TABLE
    // (or the container), crossing shadow hosts and iframes on the way.
    const firstField = Object.values(tableFields)[0];
    let tableContext = queryElement(container, firstField.selector);
    while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
        const root = tableContext.getRootNode();
        if (root instanceof ShadowRoot) {
            tableContext = root.host;
            continue;
        }
        if (tableContext.tagName === 'IFRAME') {
            try {
                tableContext = tableContext.contentDocument.body;
            }
            catch (e) {
                break;
            }
        }
        else {
            tableContext = tableContext.parentElement;
        }
    }
    if (!tableContext) {
        continue;
    }
    // Collect rows from the light DOM, the shadow root and, for an iframe,
    // its document.
    const rows = [];
    rows.push(...tableContext.getElementsByTagName('TR'));
    if (tableContext.shadowRoot) {
        // ShadowRoot has no getElementsByTagName; querySelectorAll is the
        // available equivalent.
        rows.push(...tableContext.shadowRoot.querySelectorAll('tr'));
    }
    if (tableContext.tagName === 'IFRAME') {
        try {
            const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
            rows.push(...iframeDoc.getElementsByTagName('TR'));
        }
        catch (e) {
            console.warn('Cannot access iframe rows:', e);
        }
    }
    const processedRows = filterRowsBasedOnTag(rows, tableFields);
    const rowCount = Math.min(processedRows.length, limit);
    for (let rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        const record = {};
        const currentRow = processedRows[rowIndex];
        for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
            let element = null;
            if (cellIndex >= 0) {
                // Locate the cell by column, in the light DOM first and then
                // in the row's shadow root.
                let td = currentRow.children[cellIndex];
                if (!td && currentRow.shadowRoot) {
                    const shadowCells = currentRow.shadowRoot.children;
                    if (shadowCells && shadowCells.length > cellIndex) {
                        td = shadowCells[cellIndex];
                    }
                }
                if (td) {
                    element = queryElement(td, selector);
                    // The selector may designate the cell itself.
                    if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
                        element = td;
                    }
                    if (!element) {
                        // Retry with whatever precedes the first '.'. For a
                        // selector starting with '.', that prefix is empty and
                        // an empty selector makes querySelector throw, so skip.
                        const tagOnlySelector = selector.split('.')[0];
                        if (tagOnlySelector.trim() !== '') {
                            element = queryElement(td, tagOnlySelector);
                        }
                    }
                    if (!element) {
                        // Descend through the first child that yields a value
                        // at each level; stop where no child does.
                        let currentElement = td;
                        while (currentElement && currentElement.children.length > 0) {
                            let foundContentChild = false;
                            for (const child of currentElement.children) {
                                if (extractValue(child, attribute)) {
                                    currentElement = child;
                                    foundContentChild = true;
                                    break;
                                }
                            }
                            if (!foundContentChild) {
                                break;
                            }
                        }
                        element = currentElement;
                    }
                }
            }
            else {
                element = queryElement(currentRow, selector);
            }
            if (element) {
                record[label] = extractValue(element, attribute);
            }
        }
        if (Object.keys(record).length > 0) {
            tableData.push(record);
        }
    }
}
|
|
716
|
+
// Process non-table data with both contexts support
|
|
717
|
+
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
718
|
+
if (nonTableData.length >= limit)
|
|
279
719
|
break;
|
|
720
|
+
const container = containers[containerIndex];
|
|
721
|
+
const { nonTableFields } = containerFields[containerIndex];
|
|
722
|
+
if (Object.keys(nonTableFields).length > 0) {
|
|
723
|
+
const record = {};
|
|
724
|
+
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
725
|
+
// Get the last part of the selector after any context delimiter
|
|
726
|
+
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
727
|
+
const element = queryElement(container, relativeSelector);
|
|
728
|
+
if (element) {
|
|
729
|
+
record[label] = extractValue(element, attribute);
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
if (Object.keys(record).length > 0) {
|
|
733
|
+
nonTableData.push(record);
|
|
734
|
+
}
|
|
280
735
|
}
|
|
281
736
|
}
|
|
737
|
+
// Merge and limit the results
|
|
738
|
+
const scrapedData = [...tableData, ...nonTableData];
|
|
282
739
|
return scrapedData;
|
|
283
740
|
});
|
|
284
741
|
};
|
package/build/interpret.d.ts
CHANGED
|
@@ -86,6 +86,8 @@ export default class Interpreter extends EventEmitter {
|
|
|
86
86
|
private carryOutSteps;
|
|
87
87
|
private handlePagination;
|
|
88
88
|
private getMatchingActionId;
|
|
89
|
+
private removeShadowSelectors;
|
|
90
|
+
private removeSpecialSelectors;
|
|
89
91
|
private runLoop;
|
|
90
92
|
private ensureScriptsLoaded;
|
|
91
93
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -568,10 +568,32 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
568
568
|
}
|
|
569
569
|
}
|
|
570
570
|
}
|
|
571
|
+
removeShadowSelectors(workflow) {
|
|
572
|
+
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
573
|
+
const step = workflow[actionId];
|
|
574
|
+
// Check if step has where and selectors
|
|
575
|
+
if (step.where && Array.isArray(step.where.selectors)) {
|
|
576
|
+
// Filter out selectors that contain ">>"
|
|
577
|
+
step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
return workflow;
|
|
581
|
+
}
|
|
582
|
+
removeSpecialSelectors(workflow) {
|
|
583
|
+
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
584
|
+
const step = workflow[actionId];
|
|
585
|
+
if (step.where && Array.isArray(step.where.selectors)) {
|
|
586
|
+
// Filter out if selector has EITHER ":>>" OR ">>"
|
|
587
|
+
step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
return workflow;
|
|
591
|
+
}
|
|
571
592
|
runLoop(p, workflow) {
|
|
572
593
|
var _a, _b;
|
|
573
594
|
return __awaiter(this, void 0, void 0, function* () {
|
|
574
|
-
|
|
595
|
+
let workflowCopy = JSON.parse(JSON.stringify(workflow));
|
|
596
|
+
workflowCopy = this.removeSpecialSelectors(workflowCopy);
|
|
575
597
|
// apply ad-blocker to the current page
|
|
576
598
|
try {
|
|
577
599
|
yield this.applyAdBlocker(p);
|