maxun-core 0.0.11 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/browserSide/scraper.js +74 -44
- package/build/interpret.js +78 -39
- package/package.json +1 -1
|
@@ -174,7 +174,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
174
174
|
if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
|
|
175
175
|
return Array.from(document.querySelectorAll(config.selector));
|
|
176
176
|
}
|
|
177
|
-
// First handle iframe traversal if present
|
|
178
177
|
if (config.selector.includes(':>>')) {
|
|
179
178
|
const parts = config.selector.split(':>>').map(s => s.trim());
|
|
180
179
|
let currentElements = [document];
|
|
@@ -185,24 +184,42 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
185
184
|
const isLast = i === parts.length - 1;
|
|
186
185
|
for (const element of currentElements) {
|
|
187
186
|
try {
|
|
188
|
-
// For document or iframe document
|
|
189
187
|
const doc = element.contentDocument || element || ((_a = element.contentWindow) === null || _a === void 0 ? void 0 : _a.document);
|
|
190
188
|
if (!doc)
|
|
191
189
|
continue;
|
|
192
|
-
|
|
190
|
+
if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
|
|
191
|
+
const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
|
|
192
|
+
if (nameMatch && nameMatch[1]) {
|
|
193
|
+
const frameName = nameMatch[1];
|
|
194
|
+
let foundFrames = [];
|
|
195
|
+
if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
|
|
196
|
+
foundFrames = Array.from(doc.getElementsByName(frameName))
|
|
197
|
+
.filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
|
|
198
|
+
}
|
|
199
|
+
if (foundFrames.length === 0) {
|
|
200
|
+
const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
|
|
201
|
+
foundFrames = framesBySelector;
|
|
202
|
+
}
|
|
203
|
+
if (isLast) {
|
|
204
|
+
nextElements.push(...foundFrames);
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
nextElements.push(...foundFrames);
|
|
208
|
+
}
|
|
209
|
+
continue;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
193
212
|
const found = Array.from(doc.querySelectorAll(part));
|
|
194
213
|
if (isLast) {
|
|
195
|
-
// If it's the last part, keep all matching elements
|
|
196
214
|
nextElements.push(...found);
|
|
197
215
|
}
|
|
198
216
|
else {
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
nextElements.push(...iframes);
|
|
217
|
+
const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
|
|
218
|
+
nextElements.push(...frames);
|
|
202
219
|
}
|
|
203
220
|
}
|
|
204
221
|
catch (error) {
|
|
205
|
-
console.warn('Cannot access iframe content:', error, {
|
|
222
|
+
console.warn('Cannot access iframe/frame content:', error, {
|
|
206
223
|
part,
|
|
207
224
|
element,
|
|
208
225
|
index: i
|
|
@@ -242,13 +259,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
242
259
|
}
|
|
243
260
|
return [];
|
|
244
261
|
}
|
|
245
|
-
// Modified to handle iframe context for URL resolution
|
|
246
262
|
function getElementValue(element, attribute) {
|
|
247
|
-
var _a, _b, _c, _d, _e;
|
|
263
|
+
var _a, _b, _c, _d, _e, _f;
|
|
248
264
|
if (!element)
|
|
249
265
|
return null;
|
|
250
|
-
|
|
251
|
-
|
|
266
|
+
let baseURL;
|
|
267
|
+
try {
|
|
268
|
+
baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) ||
|
|
269
|
+
((_c = element.ownerDocument) === null || _c === void 0 ? void 0 : _c.baseURI) ||
|
|
270
|
+
window.location.origin;
|
|
271
|
+
}
|
|
272
|
+
catch (e) {
|
|
273
|
+
baseURL = window.location.origin;
|
|
274
|
+
}
|
|
252
275
|
switch (attribute) {
|
|
253
276
|
case 'href': {
|
|
254
277
|
const relativeHref = element.getAttribute('href');
|
|
@@ -259,11 +282,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
259
282
|
return relativeSrc ? new URL(relativeSrc, baseURL).href : null;
|
|
260
283
|
}
|
|
261
284
|
case 'innerText':
|
|
262
|
-
return (
|
|
285
|
+
return (_d = element.innerText) === null || _d === void 0 ? void 0 : _d.trim();
|
|
263
286
|
case 'textContent':
|
|
264
|
-
return (
|
|
287
|
+
return (_e = element.textContent) === null || _e === void 0 ? void 0 : _e.trim();
|
|
288
|
+
case 'innerHTML':
|
|
289
|
+
return element.innerHTML;
|
|
290
|
+
case 'outerHTML':
|
|
291
|
+
return element.outerHTML;
|
|
265
292
|
default:
|
|
266
|
-
return element.getAttribute(attribute) || ((
|
|
293
|
+
return element.getAttribute(attribute) || ((_f = element.innerText) === null || _f === void 0 ? void 0 : _f.trim());
|
|
267
294
|
}
|
|
268
295
|
}
|
|
269
296
|
// Rest of the functions remain largely the same
|
|
@@ -332,7 +359,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
332
359
|
*/
|
|
333
360
|
window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
|
|
334
361
|
return __awaiter(this, void 0, void 0, function* () {
|
|
335
|
-
// Enhanced query function to handle
|
|
362
|
+
// Enhanced query function to handle iframe, frame and shadow DOM
|
|
336
363
|
const queryElement = (rootElement, selector) => {
|
|
337
364
|
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
|
338
365
|
return rootElement.querySelector(selector);
|
|
@@ -342,15 +369,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
342
369
|
for (let i = 0; i < parts.length; i++) {
|
|
343
370
|
if (!currentElement)
|
|
344
371
|
return null;
|
|
345
|
-
// Handle iframe traversal
|
|
346
|
-
if (currentElement.tagName === 'IFRAME') {
|
|
372
|
+
// Handle iframe and frame traversal
|
|
373
|
+
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
|
347
374
|
try {
|
|
348
|
-
const
|
|
349
|
-
currentElement =
|
|
375
|
+
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
|
376
|
+
currentElement = frameDoc.querySelector(parts[i]);
|
|
350
377
|
continue;
|
|
351
378
|
}
|
|
352
379
|
catch (e) {
|
|
353
|
-
console.warn(
|
|
380
|
+
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
|
354
381
|
return null;
|
|
355
382
|
}
|
|
356
383
|
}
|
|
@@ -385,14 +412,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
385
412
|
for (const part of parts) {
|
|
386
413
|
const nextElements = [];
|
|
387
414
|
for (const element of currentElements) {
|
|
388
|
-
// Handle iframe traversal
|
|
389
|
-
if (element.tagName === 'IFRAME') {
|
|
415
|
+
// Handle iframe and frame traversal
|
|
416
|
+
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
|
390
417
|
try {
|
|
391
|
-
const
|
|
392
|
-
nextElements.push(...
|
|
418
|
+
const frameDoc = element.contentDocument || element.contentWindow.document;
|
|
419
|
+
nextElements.push(...frameDoc.querySelectorAll(part));
|
|
393
420
|
}
|
|
394
421
|
catch (e) {
|
|
395
|
-
console.warn(
|
|
422
|
+
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
|
396
423
|
continue;
|
|
397
424
|
}
|
|
398
425
|
}
|
|
@@ -461,8 +488,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
461
488
|
else if (currentElement.tagName === 'TR') {
|
|
462
489
|
return { type: 'TR', element: currentElement };
|
|
463
490
|
}
|
|
464
|
-
// Handle iframe crossing
|
|
465
|
-
if (currentElement.tagName === 'IFRAME') {
|
|
491
|
+
// Handle iframe and frame crossing
|
|
492
|
+
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
|
466
493
|
try {
|
|
467
494
|
currentElement = currentElement.contentDocument.body;
|
|
468
495
|
}
|
|
@@ -504,7 +531,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
504
531
|
}
|
|
505
532
|
if (current.tagName === 'TH')
|
|
506
533
|
return true;
|
|
507
|
-
if (current.tagName === 'IFRAME') {
|
|
534
|
+
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
|
508
535
|
try {
|
|
509
536
|
current = current.contentDocument.body;
|
|
510
537
|
}
|
|
@@ -556,15 +583,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
556
583
|
const shadowHost = baseElement.getRootNode().host;
|
|
557
584
|
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
|
558
585
|
}
|
|
559
|
-
// Get elements from iframes
|
|
560
|
-
const
|
|
561
|
-
|
|
586
|
+
// Get elements from iframes and frames
|
|
587
|
+
const frames = [
|
|
588
|
+
...Array.from(document.getElementsByTagName('iframe')),
|
|
589
|
+
...Array.from(document.getElementsByTagName('frame'))
|
|
590
|
+
];
|
|
591
|
+
for (const frame of frames) {
|
|
562
592
|
try {
|
|
563
|
-
const
|
|
564
|
-
allElements.push(...
|
|
593
|
+
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
|
594
|
+
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
|
565
595
|
}
|
|
566
596
|
catch (e) {
|
|
567
|
-
console.warn(
|
|
597
|
+
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
|
568
598
|
}
|
|
569
599
|
}
|
|
570
600
|
return allElements.filter(element => {
|
|
@@ -611,7 +641,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
611
641
|
});
|
|
612
642
|
const tableData = [];
|
|
613
643
|
const nonTableData = [];
|
|
614
|
-
// Process table data with
|
|
644
|
+
// Process table data with support for iframes, frames, and shadow DOM
|
|
615
645
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
616
646
|
const container = containers[containerIndex];
|
|
617
647
|
const { tableFields } = containerFields[containerIndex];
|
|
@@ -619,13 +649,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
619
649
|
const firstField = Object.values(tableFields)[0];
|
|
620
650
|
const firstElement = queryElement(container, firstField.selector);
|
|
621
651
|
let tableContext = firstElement;
|
|
622
|
-
// Find table context including
|
|
652
|
+
// Find table context including iframe, frame and shadow DOM
|
|
623
653
|
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
|
624
654
|
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
625
655
|
tableContext = tableContext.getRootNode().host;
|
|
626
656
|
continue;
|
|
627
657
|
}
|
|
628
|
-
if (tableContext.tagName === 'IFRAME') {
|
|
658
|
+
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
|
629
659
|
try {
|
|
630
660
|
tableContext = tableContext.contentDocument.body;
|
|
631
661
|
}
|
|
@@ -646,14 +676,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
646
676
|
if (tableContext.shadowRoot) {
|
|
647
677
|
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
|
648
678
|
}
|
|
649
|
-
// Get rows from iframes
|
|
650
|
-
if (tableContext.tagName === 'IFRAME') {
|
|
679
|
+
// Get rows from iframes and frames
|
|
680
|
+
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
|
651
681
|
try {
|
|
652
|
-
const
|
|
653
|
-
rows.push(...
|
|
682
|
+
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
|
683
|
+
rows.push(...frameDoc.getElementsByTagName('TR'));
|
|
654
684
|
}
|
|
655
685
|
catch (e) {
|
|
656
|
-
console.warn(
|
|
686
|
+
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
|
657
687
|
}
|
|
658
688
|
}
|
|
659
689
|
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
|
@@ -713,7 +743,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|
|
713
743
|
}
|
|
714
744
|
}
|
|
715
745
|
}
|
|
716
|
-
// Process non-table data with
|
|
746
|
+
// Process non-table data with all contexts support
|
|
717
747
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
|
718
748
|
if (nonTableData.length >= limit)
|
|
719
749
|
break;
|
package/build/interpret.js
CHANGED
|
@@ -234,6 +234,14 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
234
234
|
parsedSuperset[key] = Array.isArray(superset[key])
|
|
235
235
|
? (0, utils_1.arrayToObject)(superset[key])
|
|
236
236
|
: superset[key];
|
|
237
|
+
if ((key === 'url' || key === 'selectors') &&
|
|
238
|
+
Array.isArray(value) && Array.isArray(superset[key]) &&
|
|
239
|
+
value.length === 0 && superset[key].length === 0) {
|
|
240
|
+
return true;
|
|
241
|
+
}
|
|
242
|
+
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
|
|
243
|
+
return value.some(selector => superset[key].includes(selector));
|
|
244
|
+
}
|
|
237
245
|
// Every `subset` key must exist in the `superset` and
|
|
238
246
|
// have the same value (strict equality), or subset[key] <= superset[key]
|
|
239
247
|
return parsedSuperset[key]
|
|
@@ -492,29 +500,45 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
492
500
|
return false;
|
|
493
501
|
};
|
|
494
502
|
// Enhanced button finder with retry mechanism
|
|
495
|
-
const findWorkingButton = (selectors
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
503
|
+
const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
|
|
504
|
+
let updatedSelectors = [...selectors];
|
|
505
|
+
for (let i = 0; i < selectors.length; i++) {
|
|
506
|
+
const selector = selectors[i];
|
|
507
|
+
let retryCount = 0;
|
|
508
|
+
let selectorSuccess = false;
|
|
509
|
+
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
|
510
|
+
try {
|
|
511
|
+
const button = yield page.waitForSelector(selector, {
|
|
512
|
+
state: 'attached',
|
|
513
|
+
timeout: 10000
|
|
514
|
+
});
|
|
515
|
+
if (button) {
|
|
516
|
+
debugLog('Found working selector:', selector);
|
|
517
|
+
return {
|
|
518
|
+
button,
|
|
519
|
+
workingSelector: selector,
|
|
520
|
+
updatedSelectors
|
|
521
|
+
};
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
catch (error) {
|
|
525
|
+
retryCount++;
|
|
526
|
+
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
|
|
527
|
+
if (retryCount < MAX_RETRIES) {
|
|
528
|
+
yield page.waitForTimeout(RETRY_DELAY);
|
|
529
|
+
}
|
|
530
|
+
else {
|
|
531
|
+
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
|
532
|
+
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
|
533
|
+
}
|
|
505
534
|
}
|
|
506
535
|
}
|
|
507
|
-
catch (error) {
|
|
508
|
-
debugLog(`Selector failed: ${selector}`);
|
|
509
|
-
}
|
|
510
|
-
}
|
|
511
|
-
// Implement retry mechanism when no selectors work
|
|
512
|
-
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
|
|
513
|
-
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
|
514
|
-
yield page.waitForTimeout(RETRY_DELAY);
|
|
515
|
-
return findWorkingButton(selectors, retryCount + 1);
|
|
516
536
|
}
|
|
517
|
-
return {
|
|
537
|
+
return {
|
|
538
|
+
button: null,
|
|
539
|
+
workingSelector: null,
|
|
540
|
+
updatedSelectors
|
|
541
|
+
};
|
|
518
542
|
});
|
|
519
543
|
const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
|
|
520
544
|
try {
|
|
@@ -566,7 +590,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
566
590
|
yield scrapeCurrentPage();
|
|
567
591
|
if (checkLimit())
|
|
568
592
|
return allResults;
|
|
569
|
-
const { button, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
593
|
+
const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
594
|
+
availableSelectors = updatedSelectors;
|
|
570
595
|
if (!button || !workingSelector) {
|
|
571
596
|
// Final retry for navigation when no selectors work
|
|
572
597
|
const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -583,7 +608,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
583
608
|
return allResults;
|
|
584
609
|
break;
|
|
585
610
|
}
|
|
586
|
-
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
587
611
|
let retryCount = 0;
|
|
588
612
|
let navigationSuccess = false;
|
|
589
613
|
while (retryCount < MAX_RETRIES && !navigationSuccess) {
|
|
@@ -648,17 +672,21 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
648
672
|
break;
|
|
649
673
|
}
|
|
650
674
|
case 'clickLoadMore': {
|
|
675
|
+
yield scrapeCurrentPage();
|
|
676
|
+
if (checkLimit())
|
|
677
|
+
return allResults;
|
|
678
|
+
let loadMoreCounter = 0;
|
|
679
|
+
let previousResultCount = allResults.length;
|
|
680
|
+
let noNewItemsCounter = 0;
|
|
681
|
+
const MAX_NO_NEW_ITEMS = 2;
|
|
651
682
|
while (true) {
|
|
652
|
-
// Find working button with retry mechanism
|
|
653
|
-
const { button: loadMoreButton, workingSelector } = yield findWorkingButton(availableSelectors);
|
|
683
|
+
// Find working button with retry mechanism
|
|
684
|
+
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
|
|
685
|
+
availableSelectors = updatedSelectors;
|
|
654
686
|
if (!workingSelector || !loadMoreButton) {
|
|
655
687
|
debugLog('No working Load More selector found after retries');
|
|
656
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
657
|
-
allResults = allResults.concat(finalResults);
|
|
658
688
|
return allResults;
|
|
659
689
|
}
|
|
660
|
-
// Update available selectors to start from the working one
|
|
661
|
-
availableSelectors = availableSelectors.slice(availableSelectors.indexOf(workingSelector));
|
|
662
690
|
// Implement retry mechanism for clicking the button
|
|
663
691
|
let retryCount = 0;
|
|
664
692
|
let clickSuccess = false;
|
|
@@ -682,6 +710,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
682
710
|
}
|
|
683
711
|
if (clickSuccess) {
|
|
684
712
|
yield page.waitForTimeout(1000);
|
|
713
|
+
loadMoreCounter++;
|
|
714
|
+
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
|
|
685
715
|
}
|
|
686
716
|
}
|
|
687
717
|
catch (error) {
|
|
@@ -695,8 +725,6 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
695
725
|
}
|
|
696
726
|
if (!clickSuccess) {
|
|
697
727
|
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
|
698
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
699
|
-
allResults = allResults.concat(finalResults);
|
|
700
728
|
return allResults;
|
|
701
729
|
}
|
|
702
730
|
// Wait for content to load and check scroll height
|
|
@@ -704,19 +732,30 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
704
732
|
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
|
|
705
733
|
yield page.waitForTimeout(2000);
|
|
706
734
|
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
|
|
707
|
-
|
|
735
|
+
const heightChanged = currentHeight !== previousHeight;
|
|
736
|
+
previousHeight = currentHeight;
|
|
737
|
+
yield scrapeCurrentPage();
|
|
738
|
+
const currentResultCount = allResults.length;
|
|
739
|
+
const newItemsAdded = currentResultCount > previousResultCount;
|
|
740
|
+
if (!newItemsAdded) {
|
|
741
|
+
noNewItemsCounter++;
|
|
742
|
+
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
|
743
|
+
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
|
744
|
+
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
|
745
|
+
return allResults;
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
else {
|
|
749
|
+
noNewItemsCounter = 0;
|
|
750
|
+
previousResultCount = currentResultCount;
|
|
751
|
+
}
|
|
752
|
+
if (checkLimit())
|
|
753
|
+
return allResults;
|
|
754
|
+
if (!heightChanged) {
|
|
708
755
|
debugLog('No more items loaded after Load More');
|
|
709
|
-
const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
710
|
-
allResults = allResults.concat(finalResults);
|
|
711
756
|
return allResults;
|
|
712
757
|
}
|
|
713
|
-
previousHeight = currentHeight;
|
|
714
|
-
if (config.limit && allResults.length >= config.limit) {
|
|
715
|
-
allResults = allResults.slice(0, config.limit);
|
|
716
|
-
break;
|
|
717
|
-
}
|
|
718
758
|
}
|
|
719
|
-
break;
|
|
720
759
|
}
|
|
721
760
|
default: {
|
|
722
761
|
yield scrapeCurrentPage();
|