maxun-core 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +532 -51
- package/build/interpret.d.ts +2 -0
- package/build/interpret.js +60 -8
- package/build/utils/concurrency.d.ts +24 -24
- package/build/utils/concurrency.js +24 -24
- package/package.json +1 -1
|
@@ -159,6 +159,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
159
159
|
* @returns {Array.<Object.<string, string>>}
|
|
160
160
|
*/
|
|
161
161
|
window.scrapeSchema = function (lists) {
|
|
162
|
+
// Utility functions remain the same
|
|
162
163
|
function omap(object, f, kf = (x) => x) {
|
|
163
164
|
return Object.fromEntries(Object.entries(object)
|
|
164
165
|
.map(([k, v]) => [kf(k), f(v)]));
|
|
@@ -167,15 +168,121 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
167
168
|
return Object.fromEntries(Object.entries(object)
|
|
168
169
|
.filter(([k, v]) => f(k, v)));
|
|
169
170
|
}
|
|
171
|
+
/**
 * Resolves a field configuration to the DOM elements its selector matches.
 *
 * Three selector forms are handled:
 *  - a plain CSS selector, evaluated against the top-level `document`;
 *  - `a :>> b` (iframe traversal): every part except the last keeps only
 *    `<iframe>` matches, and the next part is evaluated inside their documents;
 *  - `a >> b` (shadow DOM traversal): each part is evaluated inside the
 *    matches of the previous part, entering a match's shadow root when it has one.
 *
 * @param {{selector: string}} config Field configuration; only `selector` is read.
 * @returns {Array.<Element>} Matching elements. Empty when any iframe traversal
 *   step matches nothing.
 */
function findAllElements(config) {
    const { selector } = config;
    // ':>>' itself contains '>>', so one check covers both delimiters.
    if (!selector.includes('>>')) {
        return Array.from(document.querySelectorAll(selector));
    }
    // Iframe traversal takes precedence over shadow DOM traversal.
    if (selector.includes(':>>')) {
        const parts = selector.split(':>>').map(s => s.trim());
        let currentElements = [document];
        for (let i = 0; i < parts.length; i++) {
            const part = parts[i];
            const isLast = i === parts.length - 1;
            const nextElements = [];
            for (const element of currentElements) {
                try {
                    // An iframe is searched through its document; the initial
                    // `document` entry has no contentDocument and is used as is.
                    const doc = element.contentDocument || element;
                    const found = Array.from(doc.querySelectorAll(part));
                    if (isLast) {
                        nextElements.push(...found);
                    }
                    else {
                        // Only iframes can be descended into by the next part.
                        nextElements.push(...found.filter(el => el.tagName === 'IFRAME'));
                    }
                }
                catch (error) {
                    console.warn('Cannot access iframe content:', error, {
                        part,
                        element,
                        index: i
                    });
                }
            }
            if (nextElements.length === 0) {
                console.warn('No elements found for part:', part, 'at depth:', i);
                return [];
            }
            currentElements = nextElements;
        }
        return currentElements;
    }
    // Shadow DOM traversal.
    const parts = selector.split('>>').map(s => s.trim());
    let currentElements = [document];
    parts.forEach((part, index) => {
        const isLast = index === parts.length - 1;
        const nextElements = [];
        for (const element of currentElements) {
            for (const foundEl of Array.from(element.querySelectorAll(part))) {
                // Intermediate matches are entered through their shadow root so the
                // next part can see inside them. A final match is returned as the
                // element itself, even if it happens to host a shadow root;
                // replacing it with its root would drop it from the result.
                if (!isLast && foundEl.shadowRoot) {
                    nextElements.push(foundEl.shadowRoot);
                }
                else {
                    nextElements.push(foundEl);
                }
            }
        }
        currentElements = nextElements;
    });
    return currentElements;
}
|
|
245
|
+
// Modified to handle iframe context for URL resolution
|
|
246
|
+
/**
 * Reads one value from an element.
 *
 * - `href` / `src`: the attribute resolved to an absolute URL, or `null` when
 *   the attribute is missing or empty.
 * - `innerText` / `textContent`: the trimmed property, or `undefined` when the
 *   property is null/undefined.
 * - anything else: the attribute value, falling back to the trimmed `innerText`.
 *
 * @param {Element|null|undefined} element Element to read from.
 * @param {string} attribute Attribute or property name to extract.
 * @returns {string|null|undefined} The extracted value; `null` when `element` is falsy.
 */
function getElementValue(element, attribute) {
    if (!element) {
        return null;
    }
    const trimmed = (text) => (text === null || text === undefined ? undefined : text.trim());
    // Resolve against the element's own document first (it may live in an iframe),
    // then against the top-level page. Documents such as about:srcdoc or
    // about:blank cannot act as a base URL and make `new URL` throw, which
    // previously aborted the whole scrape. If nothing resolves, the raw attribute
    // value is returned unchanged.
    const resolveUrl = (rawValue) => {
        if (!rawValue) {
            return null;
        }
        const ownerLocation = element.ownerDocument && element.ownerDocument.location;
        const baseCandidates = [
            () => (ownerLocation && ownerLocation.href) || window.location.origin,
            () => window.location.href,
        ];
        for (const getBase of baseCandidates) {
            try {
                return new URL(rawValue, getBase()).href;
            }
            catch (error) {
                // Unusable base or malformed value: try the next base.
            }
        }
        return rawValue;
    };
    switch (attribute) {
        case 'href':
        case 'src':
            return resolveUrl(element.getAttribute(attribute));
        case 'innerText':
            return trimmed(element.innerText);
        case 'textContent':
            return trimmed(element.textContent);
        default:
            return element.getAttribute(attribute) || trimmed(element.innerText);
    }
}
|
|
269
|
+
// Rest of the functions remain largely the same
|
|
170
270
|
/**
 * Picks the "seed" field: the key whose selector matches the most elements.
 * When several keys tie, the first one in the object's key order wins.
 *
 * Each selector is resolved exactly once, rather than once to compute the
 * maximum and a second time to find the key that reaches it.
 *
 * @param {Object.<string, {selector: string}>} listObj Field configurations by key.
 * @returns {string|undefined} The seed key, or `undefined` for an empty object.
 */
function getSeedKey(listObj) {
    let seedKey;
    let maxLength = -1;
    for (const [key, config] of Object.entries(listObj)) {
        const count = findAllElements(config).length;
        // Strictly greater keeps the earliest key on ties.
        if (count > maxLength) {
            maxLength = count;
            seedKey = key;
        }
    }
    return seedKey;
}
|
|
274
|
+
// Find minimal bounding elements
|
|
174
275
|
function getMBEs(elements) {
|
|
175
276
|
return elements.map((element) => {
|
|
176
277
|
let candidate = element;
|
|
177
278
|
const isUniqueChild = (e) => elements
|
|
178
|
-
.filter((elem) => {
|
|
279
|
+
.filter((elem) => {
|
|
280
|
+
var _a;
|
|
281
|
+
// Handle both iframe and shadow DOM boundaries
|
|
282
|
+
const sameContext = elem.getRootNode() === e.getRootNode() &&
|
|
283
|
+
elem.ownerDocument === e.ownerDocument;
|
|
284
|
+
return sameContext && ((_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem));
|
|
285
|
+
})
|
|
179
286
|
.length === 1;
|
|
180
287
|
while (candidate && isUniqueChild(candidate)) {
|
|
181
288
|
candidate = candidate.parentNode;
|
|
@@ -184,28 +291,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
184
291
|
});
|
|
185
292
|
}
|
|
186
293
|
const seedName = getSeedKey(lists);
|
|
187
|
-
const seedElements =
|
|
294
|
+
const seedElements = findAllElements(lists[seedName]);
|
|
188
295
|
const MBEs = getMBEs(seedElements);
|
|
189
|
-
|
|
190
|
-
const elem =
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
296
|
+
const mbeResults = MBEs.map((mbe) => omap(lists, (config) => {
|
|
297
|
+
const elem = findAllElements(config)
|
|
298
|
+
.find((elem) => mbe.contains(elem));
|
|
299
|
+
return elem ? getElementValue(elem, config.attribute) : undefined;
|
|
300
|
+
}, (key) => key)) || [];
|
|
301
|
+
// If MBE approach didn't find all elements, try independent scraping
|
|
302
|
+
if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
|
|
303
|
+
// Fall back to independent scraping
|
|
304
|
+
const results = [];
|
|
305
|
+
const foundElements = new Map();
|
|
306
|
+
// Find all elements for each selector
|
|
307
|
+
Object.entries(lists).forEach(([key, config]) => {
|
|
308
|
+
const elements = findAllElements(config);
|
|
309
|
+
foundElements.set(key, elements);
|
|
310
|
+
});
|
|
311
|
+
// Create result objects for each found element
|
|
312
|
+
foundElements.forEach((elements, key) => {
|
|
313
|
+
elements.forEach((element, index) => {
|
|
314
|
+
if (!results[index]) {
|
|
315
|
+
results[index] = {};
|
|
316
|
+
}
|
|
317
|
+
results[index][key] = getElementValue(element, lists[key].attribute);
|
|
318
|
+
});
|
|
319
|
+
});
|
|
320
|
+
return results.filter(result => Object.keys(result).length > 0);
|
|
321
|
+
}
|
|
322
|
+
return mbeResults;
|
|
209
323
|
};
|
|
210
324
|
/**
|
|
211
325
|
* Scrapes multiple lists of similar items based on a template item.
|
|
@@ -218,43 +332,410 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
218
332
|
*/
|
|
219
333
|
window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
|
|
220
334
|
return __awaiter(this, void 0, void 0, function* () {
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
335
|
+
// Enhanced query function to handle both iframe and shadow DOM
|
|
336
|
+
/**
 * Finds the first element matching `selector` below `rootElement`.
 *
 * A selector without `>>` / `:>>` is passed straight to `querySelector`.
 * Otherwise it is split on those delimiters and the segments are resolved one
 * after another: an `<iframe>` is searched through its document, any other
 * node through its own subtree, then its shadow root, then the shadow roots of
 * its direct children.
 *
 * @param {Element|Document} rootElement Node to start searching from.
 * @param {string} selector CSS selector, optionally with `>>` / `:>>` delimiters.
 * @returns {Element|null} The match, or `null` when a segment matches nothing
 *   or an iframe's document cannot be accessed.
 */
const queryElement = (rootElement, selector) => {
    const hasDelimiter = selector.includes('>>') || selector.includes(':>>');
    if (!hasDelimiter) {
        return rootElement.querySelector(selector);
    }
    const segments = selector.split(/(?:>>|:>>)/).map(part => part.trim());
    let node = rootElement;
    for (const segment of segments) {
        if (!node) {
            return null;
        }
        if (node.tagName === 'IFRAME') {
            try {
                const frameDocument = node.contentDocument || node.contentWindow.document;
                node = frameDocument.querySelector(segment);
            }
            catch (e) {
                console.warn('Cannot access iframe content:', e);
                return null;
            }
            continue;
        }
        // Light DOM first, then the node's own shadow root.
        let match = node.querySelector(segment);
        if (!match && node.shadowRoot) {
            match = node.shadowRoot.querySelector(segment);
        }
        // Last resort: shadow roots hosted by direct children.
        if (!match) {
            for (const child of Array.from(node.children || [])) {
                if (!child.shadowRoot) {
                    continue;
                }
                match = child.shadowRoot.querySelector(segment);
                if (match) {
                    break;
                }
            }
        }
        node = match;
    }
    return node;
};
|
|
378
|
+
// Enhanced query all function for both contexts
|
|
379
|
+
/**
 * Finds every element matching `selector` below `rootElement`.
 *
 * A selector without `>>` / `:>>` is passed straight to `querySelectorAll` and
 * its result is returned unchanged. Otherwise the selector is split on those
 * delimiters and each part is evaluated inside every match of the previous
 * part: iframes through their document, other nodes through their subtree,
 * their shadow root and the shadow roots of their direct children.
 *
 * Matches are de-duplicated per step, keeping first-seen order. Without this,
 * nested matches of one part (an element and one of its descendants) each
 * contribute the same descendants for the next part, and callers would process
 * the same element more than once.
 *
 * @param {Element|Document} rootElement Node to start searching from.
 * @param {string} selector CSS selector, optionally with `>>` / `:>>` delimiters.
 * @returns {NodeList|Array.<Element>} The matching elements.
 */
const queryElementAll = (rootElement, selector) => {
    // ':>>' itself contains '>>', so one check covers both delimiters.
    if (!selector.includes('>>')) {
        return rootElement.querySelectorAll(selector);
    }
    const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
    let currentElements = [rootElement];
    for (const part of parts) {
        const nextElements = new Set();
        const collect = (matches) => {
            for (const match of matches) {
                nextElements.add(match);
            }
        };
        for (const element of currentElements) {
            if (element.tagName === 'IFRAME') {
                try {
                    const iframeDoc = element.contentDocument || element.contentWindow.document;
                    collect(iframeDoc.querySelectorAll(part));
                }
                catch (e) {
                    // Inaccessible frame: skip it and keep the other matches.
                    console.warn('Cannot access iframe content:', e);
                }
                continue;
            }
            // Regular DOM elements
            if (element.querySelectorAll) {
                collect(element.querySelectorAll(part));
            }
            // The element's own shadow root
            if (element.shadowRoot) {
                collect(element.shadowRoot.querySelectorAll(part));
            }
            // Shadow roots hosted by direct children
            for (const child of Array.from(element.children || [])) {
                if (child.shadowRoot) {
                    collect(child.shadowRoot.querySelectorAll(part));
                }
            }
        }
        currentElements = Array.from(nextElements);
    }
    return currentElements;
};
|
|
421
|
+
// Enhanced value extraction with context awareness
|
|
422
|
+
/**
 * Extracts one value from an element.
 *
 * If the element hosts a shadow root with non-blank text, that trimmed text is
 * returned regardless of `attribute`. Otherwise:
 * - `innerText`: the trimmed rendered text. Elements that do not expose
 *   `innerText` (e.g. SVG elements) fall back to `textContent` instead of throwing.
 * - `innerHTML`: the trimmed markup.
 * - `src` / `href`: the attribute resolved to an absolute URL, or `null` when
 *   the attribute is missing or empty.
 * - anything else: the raw attribute value.
 *
 * @param {Element|null|undefined} element Element to read from.
 * @param {string} attribute Attribute or property name to extract.
 * @returns {string|null} The extracted value; `null` when `element` is falsy
 *   or nothing is available.
 */
function extractValue(element, attribute) {
    if (!element) {
        return null;
    }
    const trimOrNull = (text) => (text === null || text === undefined ? null : text.trim());
    // Resolve against the element's own document first (it may live in an iframe),
    // then against the top-level page. Documents such as about:srcdoc or
    // about:blank cannot act as a base URL and make `new URL` throw. If nothing
    // resolves, the raw attribute value is returned unchanged.
    const resolveUrl = (rawValue) => {
        if (!rawValue) {
            return null;
        }
        const ownerLocation = element.ownerDocument && element.ownerDocument.location;
        const baseCandidates = [
            () => (ownerLocation && ownerLocation.href) || window.location.origin,
            () => window.location.href,
        ];
        for (const getBase of baseCandidates) {
            try {
                return new URL(rawValue, getBase()).href;
            }
            catch (error) {
                // Unusable base or malformed value: try the next base.
            }
        }
        return rawValue;
    };
    // Check shadow root first
    if (element.shadowRoot) {
        const shadowContent = trimOrNull(element.shadowRoot.textContent);
        if (shadowContent) {
            return shadowContent;
        }
    }
    if (attribute === 'innerText') {
        const hasInnerText = element.innerText !== null && element.innerText !== undefined;
        return trimOrNull(hasInnerText ? element.innerText : element.textContent);
    }
    if (attribute === 'innerHTML') {
        return trimOrNull(element.innerHTML);
    }
    if (attribute === 'src' || attribute === 'href') {
        return resolveUrl(element.getAttribute(attribute));
    }
    return element.getAttribute(attribute);
}
|
|
447
|
+
// Enhanced table ancestor finding with context support
|
|
448
|
+
/**
 * Looks for the nearest table cell or row at or above `element`.
 *
 * The walk starts at `element` itself and takes at most MAX_DEPTH parent
 * steps. A node inside a shadow tree hops to the shadow host; the hop does not
 * count as a step.
 *
 * The TD/TR check runs before the shadow hop. Previously the hop came first,
 * so a cell that itself lives inside a shadow root was skipped in favour of
 * its host and never reported.
 *
 * @param {Element} element Element to start from.
 * @returns {{type: 'TD'|'TR', element: Element}|null} The first TD or TR found,
 *   or `null` when none is reached or an iframe's document cannot be read.
 */
function findTableAncestor(element) {
    const MAX_DEPTH = 5;
    let currentElement = element;
    let depth = 0;
    while (currentElement && depth < MAX_DEPTH) {
        if (currentElement.tagName === 'TD') {
            return { type: 'TD', element: currentElement };
        }
        if (currentElement.tagName === 'TR') {
            return { type: 'TR', element: currentElement };
        }
        // Leave the shadow tree through its host.
        const root = currentElement.getRootNode();
        if (root instanceof ShadowRoot) {
            currentElement = root.host;
            continue;
        }
        if (currentElement.tagName === 'IFRAME') {
            // NOTE(review): this moves into the frame's body rather than upwards;
            // kept as in the original. Confirm it is intended.
            try {
                currentElement = currentElement.contentDocument.body;
            }
            catch (e) {
                return null;
            }
        }
        else {
            currentElement = currentElement.parentElement;
        }
        depth++;
    }
    return null;
}
|
|
480
|
+
// Helper function to get cell index
|
|
481
|
+
/**
 * Computes the index of a table cell.
 *
 * For a cell whose root node is a ShadowRoot, the index is its position among
 * all `td` elements found in that shadow root. Otherwise it is the number of
 * element siblings that precede the cell.
 *
 * @param {Element} td The cell to locate.
 * @returns {number} Zero-based index; `-1` when a shadow-rooted cell is not
 *   among the root's `td` matches.
 */
function getCellIndex(td) {
    const root = td.getRootNode();
    if (root instanceof ShadowRoot) {
        return Array.from(root.querySelectorAll('td')).indexOf(td);
    }
    let position = 0;
    for (let previous = td.previousElementSibling; previous; previous = previous.previousElementSibling) {
        position += 1;
    }
    return position;
}
|
|
494
|
+
// Helper function to check for TH elements
|
|
495
|
+
/**
 * Tells whether any configured field of a row sits inside a header cell.
 *
 * For each field, the element selected within `row` is looked up and the walk
 * proceeds from it towards `row`: a node inside a shadow tree hops to its
 * host, an `<iframe>` is replaced by its document body, any other node by its
 * parent element. The walk for a field stops at `row`, at the top of the tree,
 * or when an iframe's document cannot be read.
 *
 * @param {Element} row The row being inspected.
 * @param {Object.<string, {selector: string}>} tableFields Field configurations by label.
 * @returns {boolean} `true` as soon as a TH is met on one of the walks.
 */
function hasThElement(row, tableFields) {
    const reachesTh = (start) => {
        let node = start;
        while (node && node !== row) {
            const root = node.getRootNode();
            if (root instanceof ShadowRoot) {
                node = root.host;
                continue;
            }
            if (node.tagName === 'TH') {
                return true;
            }
            if (node.tagName !== 'IFRAME') {
                node = node.parentElement;
                continue;
            }
            try {
                node = node.contentDocument.body;
            }
            catch (e) {
                // Frame document unavailable: give up on this field only.
                return false;
            }
        }
        return false;
    };
    return Object.values(tableFields).some(({ selector }) => {
        const element = queryElement(row, selector);
        return Boolean(element) && reachesTh(element);
    });
}
|
|
523
|
+
// Helper function to filter rows
|
|
524
|
+
/**
 * Decides which table rows take part in extraction.
 *
 * If any row has a configured field located in a header cell (per
 * `hasThElement`), the rows are returned untouched. Otherwise rows that
 * contain a TH, either as a descendant or inside the row's own shadow root,
 * are removed.
 *
 * @param {Array.<Element>} rows Candidate rows.
 * @param {Object.<string, {selector: string}>} tableFields Field configurations by label.
 * @returns {Array.<Element>} `rows` itself, or a filtered copy without header rows.
 */
function filterRowsBasedOnTag(rows, tableFields) {
    for (const candidate of rows) {
        if (hasThElement(candidate, tableFields)) {
            return rows;
        }
    }
    const lacksHeaderCell = (row) => {
        const noLightTh = row.getElementsByTagName('TH').length === 0;
        const noShadowTh = !row.shadowRoot || row.shadowRoot.querySelector('th') === null;
        return noLightTh && noShadowTh;
    };
    return rows.filter(row => lacksHeaderCell(row));
}
|
|
538
|
+
// Class similarity comparison functions
|
|
539
|
+
/**
 * Jaccard similarity of two class lists: the number of distinct shared names
 * divided by the number of distinct names overall.
 *
 * @param {Iterable.<string>} classList1 First list of class names.
 * @param {Iterable.<string>} classList2 Second list of class names.
 * @returns {number} A value in [0, 1]. Two empty lists yield 0 rather than
 *   NaN (0 / 0), so threshold comparisons stay well defined.
 */
function calculateClassSimilarity(classList1, classList2) {
    const set1 = new Set(classList1);
    const set2 = new Set(classList2);
    const unionSize = new Set([...set1, ...set2]).size;
    if (unionSize === 0) {
        return 0;
    }
    let sharedCount = 0;
    for (const name of set1) {
        if (set2.has(name)) {
            sharedCount++;
        }
    }
    return sharedCount / unionSize;
}
|
|
546
|
+
// Enhanced similar elements finding with context support
|
|
547
|
+
/**
 * Finds elements that share `baseElement`'s tag name and have a similar class
 * list, as measured by `calculateClassSimilarity`.
 *
 * Candidates come from the top-level document, from the shadow root that
 * contains `baseElement` (if any) together with that root's host, and from
 * the documents of the top-level document's iframes that can be accessed.
 *
 * The shadow-root step used to query the shadow host, which only reaches the
 * host's light DOM, so siblings inside the shadow tree were never considered.
 * The shadow root itself is now queried. Candidates are de-duplicated in
 * first-seen order.
 *
 * @param {Element} baseElement Reference element.
 * @param {number} [similarityThreshold=0.7] Minimum similarity to be kept.
 * @returns {Array.<Element>} Similar elements, never including `baseElement`.
 *   Empty when `baseElement` has no classes.
 */
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
    const baseClasses = Array.from(baseElement.classList);
    if (baseClasses.length === 0) {
        return [];
    }
    const tagName = baseElement.tagName;
    const candidates = new Set();
    const collect = (elements) => {
        for (const element of elements) {
            candidates.add(element);
        }
    };
    // Get elements from main document
    collect(document.getElementsByTagName(tagName));
    // Get elements from the shadow tree containing the base element
    const root = baseElement.getRootNode();
    if (root instanceof ShadowRoot) {
        collect(root.querySelectorAll(tagName));
        // The host's light DOM was searched before this fix; keep doing so.
        collect(root.host.getElementsByTagName(tagName));
    }
    // Get elements from iframes
    for (const iframe of document.getElementsByTagName('iframe')) {
        try {
            const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
            collect(iframeDoc.getElementsByTagName(tagName));
        }
        catch (e) {
            console.warn('Cannot access iframe content:', e);
        }
    }
    return Array.from(candidates).filter(element => {
        if (element === baseElement) {
            return false;
        }
        const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
        return similarity >= similarityThreshold;
    });
}
|
|
577
|
+
// Main scraping logic with context support
|
|
578
|
+
let containers = queryElementAll(document, listSelector);
|
|
579
|
+
containers = Array.from(containers);
|
|
580
|
+
if (containers.length === 0)
|
|
581
|
+
return [];
|
|
582
|
+
if (limit > 1 && containers.length === 1) {
|
|
583
|
+
const baseContainer = containers[0];
|
|
584
|
+
const similarContainers = findSimilarElements(baseContainer);
|
|
585
|
+
if (similarContainers.length > 0) {
|
|
586
|
+
const newContainers = similarContainers.filter(container => !container.matches(listSelector));
|
|
587
|
+
containers = [...containers, ...newContainers];
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
const containerFields = containers.map(() => ({
|
|
591
|
+
tableFields: {},
|
|
592
|
+
nonTableFields: {}
|
|
593
|
+
}));
|
|
594
|
+
// Classify fields
|
|
595
|
+
containers.forEach((container, containerIndex) => {
|
|
596
|
+
for (const [label, field] of Object.entries(fields)) {
|
|
597
|
+
const sampleElement = queryElement(container, field.selector);
|
|
598
|
+
if (sampleElement) {
|
|
599
|
+
const ancestor = findTableAncestor(sampleElement);
|
|
600
|
+
if (ancestor) {
|
|
601
|
+
containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 });
|
|
602
|
+
}
|
|
603
|
+
else {
|
|
604
|
+
containerFields[containerIndex].nonTableFields[label] = field;
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
else {
|
|
608
|
+
containerFields[containerIndex].nonTableFields[label] = field;
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
});
|
|
612
|
+
const tableData = [];
|
|
613
|
+
const nonTableData = [];
|
|
614
|
+
// Process table data with both iframe and shadow DOM support
|
|
615
|
+
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
616
|
+
const container = containers[containerIndex];
|
|
617
|
+
const { tableFields } = containerFields[containerIndex];
|
|
618
|
+
if (Object.keys(tableFields).length > 0) {
|
|
619
|
+
const firstField = Object.values(tableFields)[0];
|
|
620
|
+
const firstElement = queryElement(container, firstField.selector);
|
|
621
|
+
let tableContext = firstElement;
|
|
622
|
+
// Find table context including both iframe and shadow DOM
|
|
623
|
+
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
|
624
|
+
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
625
|
+
tableContext = tableContext.getRootNode().host;
|
|
626
|
+
continue;
|
|
627
|
+
}
|
|
628
|
+
if (tableContext.tagName === 'IFRAME') {
|
|
629
|
+
try {
|
|
630
|
+
tableContext = tableContext.contentDocument.body;
|
|
631
|
+
}
|
|
632
|
+
catch (e) {
|
|
633
|
+
break;
|
|
252
634
|
}
|
|
253
635
|
}
|
|
636
|
+
else {
|
|
637
|
+
tableContext = tableContext.parentElement;
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
if (tableContext) {
|
|
641
|
+
// Get rows from all contexts
|
|
642
|
+
const rows = [];
|
|
643
|
+
// Get rows from regular DOM
|
|
644
|
+
rows.push(...tableContext.getElementsByTagName('TR'));
|
|
645
|
+
// Get rows from shadow DOM
|
|
646
|
+
if (tableContext.shadowRoot) {
|
|
647
|
+
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
|
648
|
+
}
|
|
649
|
+
// Get rows from iframes
|
|
650
|
+
if (tableContext.tagName === 'IFRAME') {
|
|
651
|
+
try {
|
|
652
|
+
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
|
653
|
+
rows.push(...iframeDoc.getElementsByTagName('TR'));
|
|
654
|
+
}
|
|
655
|
+
catch (e) {
|
|
656
|
+
console.warn('Cannot access iframe rows:', e);
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
|
660
|
+
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
|
661
|
+
const record = {};
|
|
662
|
+
const currentRow = processedRows[rowIndex];
|
|
663
|
+
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
|
664
|
+
let element = null;
|
|
665
|
+
if (cellIndex >= 0) {
|
|
666
|
+
// Get TD element considering both contexts
|
|
667
|
+
let td = currentRow.children[cellIndex];
|
|
668
|
+
// Check shadow DOM for td
|
|
669
|
+
if (!td && currentRow.shadowRoot) {
|
|
670
|
+
const shadowCells = currentRow.shadowRoot.children;
|
|
671
|
+
if (shadowCells && shadowCells.length > cellIndex) {
|
|
672
|
+
td = shadowCells[cellIndex];
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
if (td) {
|
|
676
|
+
element = queryElement(td, selector);
|
|
677
|
+
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
|
|
678
|
+
element = td;
|
|
679
|
+
}
|
|
680
|
+
if (!element) {
|
|
681
|
+
const tagOnlySelector = selector.split('.')[0];
|
|
682
|
+
element = queryElement(td, tagOnlySelector);
|
|
683
|
+
}
|
|
684
|
+
if (!element) {
|
|
685
|
+
let currentElement = td;
|
|
686
|
+
while (currentElement && currentElement.children.length > 0) {
|
|
687
|
+
let foundContentChild = false;
|
|
688
|
+
for (const child of currentElement.children) {
|
|
689
|
+
if (extractValue(child, attribute)) {
|
|
690
|
+
currentElement = child;
|
|
691
|
+
foundContentChild = true;
|
|
692
|
+
break;
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
if (!foundContentChild)
|
|
696
|
+
break;
|
|
697
|
+
}
|
|
698
|
+
element = currentElement;
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
else {
|
|
703
|
+
element = queryElement(currentRow, selector);
|
|
704
|
+
}
|
|
705
|
+
if (element) {
|
|
706
|
+
record[label] = extractValue(element, attribute);
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
if (Object.keys(record).length > 0) {
|
|
710
|
+
tableData.push(record);
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
}
|
|
715
|
+
}
|
|
716
|
+
// Process non-table data with both contexts support
|
|
717
|
+
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
718
|
+
if (nonTableData.length >= limit)
|
|
719
|
+
break;
|
|
720
|
+
const container = containers[containerIndex];
|
|
721
|
+
const { nonTableFields } = containerFields[containerIndex];
|
|
722
|
+
if (Object.keys(nonTableFields).length > 0) {
|
|
723
|
+
const record = {};
|
|
724
|
+
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
|
725
|
+
// Get the last part of the selector after any context delimiter
|
|
726
|
+
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
727
|
+
const element = queryElement(container, relativeSelector);
|
|
728
|
+
if (element) {
|
|
729
|
+
record[label] = extractValue(element, attribute);
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
if (Object.keys(record).length > 0) {
|
|
733
|
+
nonTableData.push(record);
|
|
254
734
|
}
|
|
255
|
-
scrapedData.push(record);
|
|
256
735
|
}
|
|
257
736
|
}
|
|
737
|
+
// Merge and limit the results
|
|
738
|
+
const scrapedData = [...tableData, ...nonTableData];
|
|
258
739
|
return scrapedData;
|
|
259
740
|
});
|
|
260
741
|
};
|
package/build/interpret.d.ts
CHANGED
|
@@ -86,6 +86,8 @@ export default class Interpreter extends EventEmitter {
|
|
|
86
86
|
private carryOutSteps;
|
|
87
87
|
private handlePagination;
|
|
88
88
|
private getMatchingActionId;
|
|
89
|
+
private removeShadowSelectors;
|
|
90
|
+
private removeSpecialSelectors;
|
|
89
91
|
private runLoop;
|
|
90
92
|
private ensureScriptsLoaded;
|
|
91
93
|
/**
|
package/build/interpret.js
CHANGED
|
@@ -84,14 +84,24 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
84
84
|
applyAdBlocker(page) {
|
|
85
85
|
return __awaiter(this, void 0, void 0, function* () {
|
|
86
86
|
if (this.blocker) {
|
|
87
|
-
|
|
87
|
+
try {
|
|
88
|
+
yield this.blocker.enableBlockingInPage(page);
|
|
89
|
+
}
|
|
90
|
+
catch (err) {
|
|
91
|
+
this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
|
|
92
|
+
}
|
|
88
93
|
}
|
|
89
94
|
});
|
|
90
95
|
}
|
|
91
96
|
disableAdBlocker(page) {
|
|
92
97
|
return __awaiter(this, void 0, void 0, function* () {
|
|
93
98
|
if (this.blocker) {
|
|
94
|
-
|
|
99
|
+
try {
|
|
100
|
+
yield this.blocker.disableBlockingInPage(page);
|
|
101
|
+
}
|
|
102
|
+
catch (err) {
|
|
103
|
+
this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
|
|
104
|
+
}
|
|
95
105
|
}
|
|
96
106
|
});
|
|
97
107
|
}
|
|
@@ -156,8 +166,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
156
166
|
// const actionable = async (selector: string): Promise<boolean> => {
|
|
157
167
|
// try {
|
|
158
168
|
// const proms = [
|
|
159
|
-
// page.isEnabled(selector, { timeout:
|
|
160
|
-
// page.isVisible(selector, { timeout:
|
|
169
|
+
// page.isEnabled(selector, { timeout: 10000 }),
|
|
170
|
+
// page.isVisible(selector, { timeout: 10000 }),
|
|
161
171
|
// ];
|
|
162
172
|
// return await Promise.all(proms).then((bools) => bools.every((x) => x));
|
|
163
173
|
// } catch (e) {
|
|
@@ -176,6 +186,15 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
176
186
|
// return [];
|
|
177
187
|
// }),
|
|
178
188
|
// ).then((x) => x.flat());
|
|
189
|
+
const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
|
|
190
|
+
try {
|
|
191
|
+
yield page.waitForSelector(selector, { state: 'attached' });
|
|
192
|
+
return [selector];
|
|
193
|
+
}
|
|
194
|
+
catch (e) {
|
|
195
|
+
return [];
|
|
196
|
+
}
|
|
197
|
+
}))).then((x) => x.flat());
|
|
179
198
|
const action = workflowCopy[workflowCopy.length - 1];
|
|
180
199
|
// console.log("Next action:", action)
|
|
181
200
|
let url = page.url();
|
|
@@ -186,7 +205,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
186
205
|
url,
|
|
187
206
|
cookies: (yield page.context().cookies([page.url()]))
|
|
188
207
|
.reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
|
|
189
|
-
selectors,
|
|
208
|
+
selectors: presentSelectors,
|
|
190
209
|
};
|
|
191
210
|
});
|
|
192
211
|
}
|
|
@@ -420,7 +439,12 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
420
439
|
yield executeAction(invokee, methodName, step.args);
|
|
421
440
|
}
|
|
422
441
|
catch (error) {
|
|
423
|
-
|
|
442
|
+
try {
|
|
443
|
+
yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
|
|
444
|
+
}
|
|
445
|
+
catch (error) {
|
|
446
|
+
continue;
|
|
447
|
+
}
|
|
424
448
|
}
|
|
425
449
|
}
|
|
426
450
|
else {
|
|
@@ -544,12 +568,39 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
544
568
|
}
|
|
545
569
|
}
|
|
546
570
|
}
|
|
571
|
+
removeShadowSelectors(workflow) {
|
|
572
|
+
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
573
|
+
const step = workflow[actionId];
|
|
574
|
+
// Check if step has where and selectors
|
|
575
|
+
if (step.where && Array.isArray(step.where.selectors)) {
|
|
576
|
+
// Filter out selectors that contain ">>"
|
|
577
|
+
step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
return workflow;
|
|
581
|
+
}
|
|
582
|
+
removeSpecialSelectors(workflow) {
|
|
583
|
+
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
|
|
584
|
+
const step = workflow[actionId];
|
|
585
|
+
if (step.where && Array.isArray(step.where.selectors)) {
|
|
586
|
+
// Filter out if selector has EITHER ":>>" OR ">>"
|
|
587
|
+
step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
return workflow;
|
|
591
|
+
}
|
|
547
592
|
runLoop(p, workflow) {
|
|
548
593
|
var _a, _b;
|
|
549
594
|
return __awaiter(this, void 0, void 0, function* () {
|
|
550
|
-
|
|
595
|
+
let workflowCopy = JSON.parse(JSON.stringify(workflow));
|
|
596
|
+
workflowCopy = this.removeSpecialSelectors(workflowCopy);
|
|
551
597
|
// apply ad-blocker to the current page
|
|
552
|
-
|
|
598
|
+
try {
|
|
599
|
+
yield this.applyAdBlocker(p);
|
|
600
|
+
}
|
|
601
|
+
catch (error) {
|
|
602
|
+
this.log(`Failed to apply ad-blocker: ${error.message}`, logger_1.Level.ERROR);
|
|
603
|
+
}
|
|
553
604
|
const usedActions = [];
|
|
554
605
|
let selectors = [];
|
|
555
606
|
let lastAction = null;
|
|
@@ -660,6 +711,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
660
711
|
return __awaiter(this, void 0, void 0, function* () {
|
|
661
712
|
this.log('Starting the workflow.', logger_1.Level.LOG);
|
|
662
713
|
const context = page.context();
|
|
714
|
+
page.setDefaultNavigationTimeout(100000);
|
|
663
715
|
// Check proxy settings from context options
|
|
664
716
|
const contextOptions = context._options;
|
|
665
717
|
const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
|
|
@@ -3,43 +3,43 @@
|
|
|
3
3
|
*/
|
|
4
4
|
export default class Concurrency {
|
|
5
5
|
/**
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
* Maximum number of workers running in parallel. If set to `null`, there is no limit.
|
|
7
|
+
*/
|
|
8
8
|
maxConcurrency: number;
|
|
9
9
|
/**
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
* Number of currently active workers.
|
|
11
|
+
*/
|
|
12
12
|
activeWorkers: number;
|
|
13
13
|
/**
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
* Queue of jobs waiting to be completed.
|
|
15
|
+
*/
|
|
16
16
|
private jobQueue;
|
|
17
17
|
/**
|
|
18
|
-
|
|
19
|
-
|
|
18
|
+
* "Resolve" callbacks of the waitForCompletion() promises.
|
|
19
|
+
*/
|
|
20
20
|
private waiting;
|
|
21
21
|
/**
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
* Constructs a new instance of concurrency manager.
|
|
23
|
+
* @param {number} maxConcurrency Maximum number of workers running in parallel.
|
|
24
|
+
*/
|
|
25
25
|
constructor(maxConcurrency: number);
|
|
26
26
|
/**
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
* Takes a waiting job out of the queue and runs it.
|
|
28
|
+
*/
|
|
29
29
|
private runNextJob;
|
|
30
30
|
/**
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
31
|
+
* Pass a job (a time-demanding async function) to the concurrency manager. \
|
|
32
|
+
* The time of the job's execution depends on the concurrency manager itself
|
|
33
|
+
* (given a generous enough `maxConcurrency` value, it might be immediate,
|
|
34
|
+
* but this is not guaranteed).
|
|
35
|
+
* @param job Async function to be executed (job to be processed).
|
|
36
|
+
*/
|
|
37
37
|
addJob(job: () => Promise<any>): void;
|
|
38
38
|
/**
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
39
|
+
* Waits until there is no running or waiting job. \
|
|
40
|
+
* If the concurrency manager is idle at the time of calling this function,
|
|
41
|
+
* it waits until at least one job is completed (can be "presubscribed").
|
|
42
|
+
* @returns Promise, resolved after there is no running/waiting worker.
|
|
43
|
+
*/
|
|
44
44
|
waitForCompletion(): Promise<void>;
|
|
45
45
|
}
|
|
@@ -5,31 +5,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
5
5
|
*/
|
|
6
6
|
class Concurrency {
|
|
7
7
|
/**
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
* Constructs a new instance of concurrency manager.
|
|
9
|
+
* @param {number} maxConcurrency Maximum number of workers running in parallel.
|
|
10
|
+
*/
|
|
11
11
|
constructor(maxConcurrency) {
|
|
12
12
|
/**
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
* Maximum number of workers running in parallel. If set to `null`, there is no limit.
|
|
14
|
+
*/
|
|
15
15
|
this.maxConcurrency = 1;
|
|
16
16
|
/**
|
|
17
|
-
|
|
18
|
-
|
|
17
|
+
* Number of currently active workers.
|
|
18
|
+
*/
|
|
19
19
|
this.activeWorkers = 0;
|
|
20
20
|
/**
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
* Queue of jobs waiting to be completed.
|
|
22
|
+
*/
|
|
23
23
|
this.jobQueue = [];
|
|
24
24
|
/**
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
* "Resolve" callbacks of the waitForCompletion() promises.
|
|
26
|
+
*/
|
|
27
27
|
this.waiting = [];
|
|
28
28
|
this.maxConcurrency = maxConcurrency;
|
|
29
29
|
}
|
|
30
30
|
/**
|
|
31
|
-
|
|
32
|
-
|
|
31
|
+
* Takes a waiting job out of the queue and runs it.
|
|
32
|
+
*/
|
|
33
33
|
runNextJob() {
|
|
34
34
|
const job = this.jobQueue.pop();
|
|
35
35
|
if (job) {
|
|
@@ -49,12 +49,12 @@ class Concurrency {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
/**
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
52
|
+
* Pass a job (a time-demanding async function) to the concurrency manager. \
|
|
53
|
+
* The time of the job's execution depends on the concurrency manager itself
|
|
54
|
+
* (given a generous enough `maxConcurrency` value, it might be immediate,
|
|
55
|
+
* but this is not guaranteed).
|
|
56
|
+
* @param job Async function to be executed (job to be processed).
|
|
57
|
+
*/
|
|
58
58
|
addJob(job) {
|
|
59
59
|
// console.debug("Adding a worker!");
|
|
60
60
|
this.jobQueue.push(job);
|
|
@@ -67,11 +67,11 @@ class Concurrency {
|
|
|
67
67
|
}
|
|
68
68
|
}
|
|
69
69
|
/**
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
70
|
+
* Waits until there is no running or waiting job. \
|
|
71
|
+
* If the concurrency manager is idle at the time of calling this function,
|
|
72
|
+
* it waits until at least one job is completed (can be "presubscribed").
|
|
73
|
+
* @returns Promise, resolved after there is no running/waiting worker.
|
|
74
|
+
*/
|
|
75
75
|
waitForCompletion() {
|
|
76
76
|
return new Promise((res) => {
|
|
77
77
|
this.waiting.push(res);
|