maxun-core 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -159,6 +159,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
159
159
  * @returns {Array.<Object.<string, string>>}
160
160
  */
161
161
  window.scrapeSchema = function (lists) {
162
+ // Utility functions remain the same
162
163
  function omap(object, f, kf = (x) => x) {
163
164
  return Object.fromEntries(Object.entries(object)
164
165
  .map(([k, v]) => [kf(k), f(v)]));
@@ -167,15 +168,121 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
167
168
  return Object.fromEntries(Object.entries(object)
168
169
  .filter(([k, v]) => f(k, v)));
169
170
  }
171
+ function findAllElements(config) {
172
+ var _a;
173
+ // Regular DOM query if no special delimiters
174
+ if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
175
+ return Array.from(document.querySelectorAll(config.selector));
176
+ }
177
+ // First handle iframe traversal if present
178
+ if (config.selector.includes(':>>')) {
179
+ const parts = config.selector.split(':>>').map(s => s.trim());
180
+ let currentElements = [document];
181
+ // Traverse through each part of the selector
182
+ for (let i = 0; i < parts.length; i++) {
183
+ const part = parts[i];
184
+ const nextElements = [];
185
+ const isLast = i === parts.length - 1;
186
+ for (const element of currentElements) {
187
+ try {
188
+ // For document or iframe document
189
+ const doc = element.contentDocument || element || ((_a = element.contentWindow) === null || _a === void 0 ? void 0 : _a.document);
190
+ if (!doc)
191
+ continue;
192
+ // Query elements in current context
193
+ const found = Array.from(doc.querySelectorAll(part));
194
+ if (isLast) {
195
+ // If it's the last part, keep all matching elements
196
+ nextElements.push(...found);
197
+ }
198
+ else {
199
+ // If not last, only keep iframes for next iteration
200
+ const iframes = found.filter(el => el.tagName === 'IFRAME');
201
+ nextElements.push(...iframes);
202
+ }
203
+ }
204
+ catch (error) {
205
+ console.warn('Cannot access iframe content:', error, {
206
+ part,
207
+ element,
208
+ index: i
209
+ });
210
+ }
211
+ }
212
+ if (nextElements.length === 0) {
213
+ console.warn('No elements found for part:', part, 'at depth:', i);
214
+ return [];
215
+ }
216
+ currentElements = nextElements;
217
+ }
218
+ return currentElements;
219
+ }
220
+ // Handle shadow DOM traversal
221
+ if (config.selector.includes('>>')) {
222
+ const parts = config.selector.split('>>').map(s => s.trim());
223
+ let currentElements = [document];
224
+ for (const part of parts) {
225
+ const nextElements = [];
226
+ for (const element of currentElements) {
227
+ // Try regular DOM first
228
+ const found = Array.from(element.querySelectorAll(part));
229
+ // Then check shadow roots
230
+ for (const foundEl of found) {
231
+ if (foundEl.shadowRoot) {
232
+ nextElements.push(foundEl.shadowRoot);
233
+ }
234
+ else {
235
+ nextElements.push(foundEl);
236
+ }
237
+ }
238
+ }
239
+ currentElements = nextElements;
240
+ }
241
+ return currentElements.filter(el => !(el instanceof ShadowRoot));
242
+ }
243
+ return [];
244
+ }
245
+ // Modified to handle iframe context for URL resolution
246
+ function getElementValue(element, attribute) {
247
+ var _a, _b, _c, _d, _e;
248
+ if (!element)
249
+ return null;
250
+ // Get the base URL for resolving relative URLs
251
+ const baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) || window.location.origin;
252
+ switch (attribute) {
253
+ case 'href': {
254
+ const relativeHref = element.getAttribute('href');
255
+ return relativeHref ? new URL(relativeHref, baseURL).href : null;
256
+ }
257
+ case 'src': {
258
+ const relativeSrc = element.getAttribute('src');
259
+ return relativeSrc ? new URL(relativeSrc, baseURL).href : null;
260
+ }
261
+ case 'innerText':
262
+ return (_c = element.innerText) === null || _c === void 0 ? void 0 : _c.trim();
263
+ case 'textContent':
264
+ return (_d = element.textContent) === null || _d === void 0 ? void 0 : _d.trim();
265
+ default:
266
+ return element.getAttribute(attribute) || ((_e = element.innerText) === null || _e === void 0 ? void 0 : _e.trim());
267
+ }
268
+ }
269
+ // Rest of the functions remain largely the same
170
270
  function getSeedKey(listObj) {
171
- const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
172
- return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
271
+ const maxLength = Math.max(...Object.values(omap(listObj, (x) => findAllElements(x).length)));
272
+ return Object.keys(ofilter(listObj, (_, v) => findAllElements(v).length === maxLength))[0];
173
273
  }
274
+ // Find minimal bounding elements
174
275
  function getMBEs(elements) {
175
276
  return elements.map((element) => {
176
277
  let candidate = element;
177
278
  const isUniqueChild = (e) => elements
178
- .filter((elem) => { var _a; return (_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem); })
279
+ .filter((elem) => {
280
+ var _a;
281
+ // Handle both iframe and shadow DOM boundaries
282
+ const sameContext = elem.getRootNode() === e.getRootNode() &&
283
+ elem.ownerDocument === e.ownerDocument;
284
+ return sameContext && ((_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem));
285
+ })
179
286
  .length === 1;
180
287
  while (candidate && isUniqueChild(candidate)) {
181
288
  candidate = candidate.parentNode;
@@ -184,28 +291,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
184
291
  });
185
292
  }
186
293
  const seedName = getSeedKey(lists);
187
- const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
294
+ const seedElements = findAllElements(lists[seedName]);
188
295
  const MBEs = getMBEs(seedElements);
189
- return MBEs.map((mbe) => omap(lists, ({ selector, attribute }, key) => {
190
- const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
191
- if (!elem)
192
- return undefined;
193
- switch (attribute) {
194
- case 'href':
195
- const relativeHref = elem.getAttribute('href');
196
- return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
197
- case 'src':
198
- const relativeSrc = elem.getAttribute('src');
199
- return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
200
- case 'innerText':
201
- return elem.innerText;
202
- case 'textContent':
203
- return elem.textContent;
204
- default:
205
- return elem.innerText;
206
- }
207
- }, (key) => key // Use the original key in the output
208
- )) || [];
296
+ const mbeResults = MBEs.map((mbe) => omap(lists, (config) => {
297
+ const elem = findAllElements(config)
298
+ .find((elem) => mbe.contains(elem));
299
+ return elem ? getElementValue(elem, config.attribute) : undefined;
300
+ }, (key) => key)) || [];
301
+ // If MBE approach didn't find all elements, try independent scraping
302
+ if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
303
+ // Fall back to independent scraping
304
+ const results = [];
305
+ const foundElements = new Map();
306
+ // Find all elements for each selector
307
+ Object.entries(lists).forEach(([key, config]) => {
308
+ const elements = findAllElements(config);
309
+ foundElements.set(key, elements);
310
+ });
311
+ // Create result objects for each found element
312
+ foundElements.forEach((elements, key) => {
313
+ elements.forEach((element, index) => {
314
+ if (!results[index]) {
315
+ results[index] = {};
316
+ }
317
+ results[index][key] = getElementValue(element, lists[key].attribute);
318
+ });
319
+ });
320
+ return results.filter(result => Object.keys(result).length > 0);
321
+ }
322
+ return mbeResults;
209
323
  };
210
324
  /**
211
325
  * Scrapes multiple lists of similar items based on a template item.
@@ -218,67 +332,410 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
218
332
  */
219
333
  window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
220
334
  return __awaiter(this, void 0, void 0, function* () {
221
- const scrapedData = [];
222
- while (scrapedData.length < limit) {
223
- let parentElements = Array.from(document.querySelectorAll(listSelector));
224
- // If we only got one element or none, try a more generic approach
225
- if (limit > 1 && parentElements.length <= 1) {
226
- const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
227
- const container = document.querySelector(containerSelector);
228
- if (container) {
229
- const allChildren = Array.from(container.children);
230
- const firstMatch = document.querySelector(listSelector);
231
- if (firstMatch) {
232
- // Get classes from the first matching element
233
- const firstMatchClasses = Array.from(firstMatch.classList);
234
- // Find similar elements by matching most of their classes
235
- parentElements = allChildren.filter(element => {
236
- const elementClasses = Array.from(element.classList);
237
- // Element should share at least 70% of classes with the first match
238
- const commonClasses = firstMatchClasses.filter(cls => elementClasses.includes(cls));
239
- return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
240
- });
335
+ // Enhanced query function to handle both iframe and shadow DOM
336
+ const queryElement = (rootElement, selector) => {
337
+ if (!selector.includes('>>') && !selector.includes(':>>')) {
338
+ return rootElement.querySelector(selector);
339
+ }
340
+ const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
341
+ let currentElement = rootElement;
342
+ for (let i = 0; i < parts.length; i++) {
343
+ if (!currentElement)
344
+ return null;
345
+ // Handle iframe traversal
346
+ if (currentElement.tagName === 'IFRAME') {
347
+ try {
348
+ const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
349
+ currentElement = iframeDoc.querySelector(parts[i]);
350
+ continue;
351
+ }
352
+ catch (e) {
353
+ console.warn('Cannot access iframe content:', e);
354
+ return null;
355
+ }
356
+ }
357
+ // Try regular DOM first
358
+ let nextElement = currentElement.querySelector(parts[i]);
359
+ // Try shadow DOM if not found
360
+ if (!nextElement && currentElement.shadowRoot) {
361
+ nextElement = currentElement.shadowRoot.querySelector(parts[i]);
362
+ }
363
+ // Check children's shadow roots if still not found
364
+ if (!nextElement) {
365
+ const children = Array.from(currentElement.children || []);
366
+ for (const child of children) {
367
+ if (child.shadowRoot) {
368
+ nextElement = child.shadowRoot.querySelector(parts[i]);
369
+ if (nextElement)
370
+ break;
371
+ }
241
372
  }
242
373
  }
374
+ currentElement = nextElement;
243
375
  }
244
- // Iterate through each parent element
245
- for (const parent of parentElements) {
246
- if (scrapedData.length >= limit)
247
- break;
248
- const record = {};
249
- // For each field, select the corresponding element within the parent
250
- for (const [label, { selector, attribute }] of Object.entries(fields)) {
251
- const fieldElement = parent.querySelector(selector);
252
- if (fieldElement) {
253
- if (attribute === 'innerText') {
254
- record[label] = fieldElement.innerText.trim();
376
+ return currentElement;
377
+ };
378
+ // Enhanced query all function for both contexts
379
+ const queryElementAll = (rootElement, selector) => {
380
+ if (!selector.includes('>>') && !selector.includes(':>>')) {
381
+ return rootElement.querySelectorAll(selector);
382
+ }
383
+ const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
384
+ let currentElements = [rootElement];
385
+ for (const part of parts) {
386
+ const nextElements = [];
387
+ for (const element of currentElements) {
388
+ // Handle iframe traversal
389
+ if (element.tagName === 'IFRAME') {
390
+ try {
391
+ const iframeDoc = element.contentDocument || element.contentWindow.document;
392
+ nextElements.push(...iframeDoc.querySelectorAll(part));
393
+ }
394
+ catch (e) {
395
+ console.warn('Cannot access iframe content:', e);
396
+ continue;
397
+ }
398
+ }
399
+ else {
400
+ // Regular DOM elements
401
+ if (element.querySelectorAll) {
402
+ nextElements.push(...element.querySelectorAll(part));
255
403
  }
256
- else if (attribute === 'innerHTML') {
257
- record[label] = fieldElement.innerHTML.trim();
404
+ // Shadow DOM elements
405
+ if (element.shadowRoot) {
406
+ nextElements.push(...element.shadowRoot.querySelectorAll(part));
258
407
  }
259
- else if (attribute === 'src') {
260
- // Handle relative 'src' URLs
261
- const src = fieldElement.getAttribute('src');
262
- record[label] = src ? new URL(src, window.location.origin).href : null;
408
+ // Check children's shadow roots
409
+ const children = Array.from(element.children || []);
410
+ for (const child of children) {
411
+ if (child.shadowRoot) {
412
+ nextElements.push(...child.shadowRoot.querySelectorAll(part));
413
+ }
414
+ }
415
+ }
416
+ }
417
+ currentElements = nextElements;
418
+ }
419
+ return currentElements;
420
+ };
421
+ // Enhanced value extraction with context awareness
422
+ function extractValue(element, attribute) {
423
+ var _a, _b;
424
+ if (!element)
425
+ return null;
426
+ // Get context-aware base URL
427
+ const baseURL = ((_b = (_a = element.ownerDocument) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) || window.location.origin;
428
+ // Check shadow root first
429
+ if (element.shadowRoot) {
430
+ const shadowContent = element.shadowRoot.textContent;
431
+ if (shadowContent === null || shadowContent === void 0 ? void 0 : shadowContent.trim()) {
432
+ return shadowContent.trim();
433
+ }
434
+ }
435
+ if (attribute === 'innerText') {
436
+ return element.innerText.trim();
437
+ }
438
+ else if (attribute === 'innerHTML') {
439
+ return element.innerHTML.trim();
440
+ }
441
+ else if (attribute === 'src' || attribute === 'href') {
442
+ const attrValue = element.getAttribute(attribute);
443
+ return attrValue ? new URL(attrValue, baseURL).href : null;
444
+ }
445
+ return element.getAttribute(attribute);
446
+ }
447
+ // Enhanced table ancestor finding with context support
448
+ function findTableAncestor(element) {
449
+ let currentElement = element;
450
+ const MAX_DEPTH = 5;
451
+ let depth = 0;
452
+ while (currentElement && depth < MAX_DEPTH) {
453
+ // Handle shadow DOM
454
+ if (currentElement.getRootNode() instanceof ShadowRoot) {
455
+ currentElement = currentElement.getRootNode().host;
456
+ continue;
457
+ }
458
+ if (currentElement.tagName === 'TD') {
459
+ return { type: 'TD', element: currentElement };
460
+ }
461
+ else if (currentElement.tagName === 'TR') {
462
+ return { type: 'TR', element: currentElement };
463
+ }
464
+ // Handle iframe crossing
465
+ if (currentElement.tagName === 'IFRAME') {
466
+ try {
467
+ currentElement = currentElement.contentDocument.body;
468
+ }
469
+ catch (e) {
470
+ return null;
471
+ }
472
+ }
473
+ else {
474
+ currentElement = currentElement.parentElement;
475
+ }
476
+ depth++;
477
+ }
478
+ return null;
479
+ }
480
+ // Helper function to get cell index
481
+ function getCellIndex(td) {
482
+ if (td.getRootNode() instanceof ShadowRoot) {
483
+ const shadowRoot = td.getRootNode();
484
+ const allCells = Array.from(shadowRoot.querySelectorAll('td'));
485
+ return allCells.indexOf(td);
486
+ }
487
+ let index = 0;
488
+ let sibling = td;
489
+ while (sibling = sibling.previousElementSibling) {
490
+ index++;
491
+ }
492
+ return index;
493
+ }
494
+ // Helper function to check for TH elements
495
+ function hasThElement(row, tableFields) {
496
+ for (const [_, { selector }] of Object.entries(tableFields)) {
497
+ const element = queryElement(row, selector);
498
+ if (element) {
499
+ let current = element;
500
+ while (current && current !== row) {
501
+ if (current.getRootNode() instanceof ShadowRoot) {
502
+ current = current.getRootNode().host;
503
+ continue;
263
504
  }
264
- else if (attribute === 'href') {
265
- // Handle relative 'href' URLs
266
- const href = fieldElement.getAttribute('href');
267
- record[label] = href ? new URL(href, window.location.origin).href : null;
505
+ if (current.tagName === 'TH')
506
+ return true;
507
+ if (current.tagName === 'IFRAME') {
508
+ try {
509
+ current = current.contentDocument.body;
510
+ }
511
+ catch (e) {
512
+ break;
513
+ }
268
514
  }
269
515
  else {
270
- record[label] = fieldElement.getAttribute(attribute);
516
+ current = current.parentElement;
271
517
  }
272
518
  }
273
519
  }
274
- scrapedData.push(record);
275
520
  }
276
- // If we've processed all available elements and still haven't reached the limit,
277
- // break to avoid infinite loop
278
- if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
521
+ return false;
522
+ }
523
+ // Helper function to filter rows
524
+ function filterRowsBasedOnTag(rows, tableFields) {
525
+ for (const row of rows) {
526
+ if (hasThElement(row, tableFields)) {
527
+ return rows;
528
+ }
529
+ }
530
+ // Include shadow DOM in TH search
531
+ return rows.filter(row => {
532
+ const directTH = row.getElementsByTagName('TH').length === 0;
533
+ const shadowTH = row.shadowRoot ?
534
+ row.shadowRoot.querySelector('th') === null : true;
535
+ return directTH && shadowTH;
536
+ });
537
+ }
538
+ // Class similarity comparison functions
539
+ function calculateClassSimilarity(classList1, classList2) {
540
+ const set1 = new Set(classList1);
541
+ const set2 = new Set(classList2);
542
+ const intersection = new Set([...set1].filter(x => set2.has(x)));
543
+ const union = new Set([...set1, ...set2]);
544
+ return intersection.size / union.size;
545
+ }
546
+ // Enhanced similar elements finding with context support
547
+ function findSimilarElements(baseElement, similarityThreshold = 0.7) {
548
+ const baseClasses = Array.from(baseElement.classList);
549
+ if (baseClasses.length === 0)
550
+ return [];
551
+ const allElements = [];
552
+ // Get elements from main document
553
+ allElements.push(...document.getElementsByTagName(baseElement.tagName));
554
+ // Get elements from shadow DOM
555
+ if (baseElement.getRootNode() instanceof ShadowRoot) {
556
+ const shadowHost = baseElement.getRootNode().host;
557
+ allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
558
+ }
559
+ // Get elements from iframes
560
+ const iframes = document.getElementsByTagName('iframe');
561
+ for (const iframe of iframes) {
562
+ try {
563
+ const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
564
+ allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
565
+ }
566
+ catch (e) {
567
+ console.warn('Cannot access iframe content:', e);
568
+ }
569
+ }
570
+ return allElements.filter(element => {
571
+ if (element === baseElement)
572
+ return false;
573
+ const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
574
+ return similarity >= similarityThreshold;
575
+ });
576
+ }
577
+ // Main scraping logic with context support
578
+ let containers = queryElementAll(document, listSelector);
579
+ containers = Array.from(containers);
580
+ if (containers.length === 0)
581
+ return [];
582
+ if (limit > 1 && containers.length === 1) {
583
+ const baseContainer = containers[0];
584
+ const similarContainers = findSimilarElements(baseContainer);
585
+ if (similarContainers.length > 0) {
586
+ const newContainers = similarContainers.filter(container => !container.matches(listSelector));
587
+ containers = [...containers, ...newContainers];
588
+ }
589
+ }
590
+ const containerFields = containers.map(() => ({
591
+ tableFields: {},
592
+ nonTableFields: {}
593
+ }));
594
+ // Classify fields
595
+ containers.forEach((container, containerIndex) => {
596
+ for (const [label, field] of Object.entries(fields)) {
597
+ const sampleElement = queryElement(container, field.selector);
598
+ if (sampleElement) {
599
+ const ancestor = findTableAncestor(sampleElement);
600
+ if (ancestor) {
601
+ containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 });
602
+ }
603
+ else {
604
+ containerFields[containerIndex].nonTableFields[label] = field;
605
+ }
606
+ }
607
+ else {
608
+ containerFields[containerIndex].nonTableFields[label] = field;
609
+ }
610
+ }
611
+ });
612
+ const tableData = [];
613
+ const nonTableData = [];
614
+ // Process table data with both iframe and shadow DOM support
615
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
616
+ const container = containers[containerIndex];
617
+ const { tableFields } = containerFields[containerIndex];
618
+ if (Object.keys(tableFields).length > 0) {
619
+ const firstField = Object.values(tableFields)[0];
620
+ const firstElement = queryElement(container, firstField.selector);
621
+ let tableContext = firstElement;
622
+ // Find table context including both iframe and shadow DOM
623
+ while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
624
+ if (tableContext.getRootNode() instanceof ShadowRoot) {
625
+ tableContext = tableContext.getRootNode().host;
626
+ continue;
627
+ }
628
+ if (tableContext.tagName === 'IFRAME') {
629
+ try {
630
+ tableContext = tableContext.contentDocument.body;
631
+ }
632
+ catch (e) {
633
+ break;
634
+ }
635
+ }
636
+ else {
637
+ tableContext = tableContext.parentElement;
638
+ }
639
+ }
640
+ if (tableContext) {
641
+ // Get rows from all contexts
642
+ const rows = [];
643
+ // Get rows from regular DOM
644
+ rows.push(...tableContext.getElementsByTagName('TR'));
645
+ // Get rows from shadow DOM
646
+ if (tableContext.shadowRoot) {
647
+ rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
648
+ }
649
+ // Get rows from iframes
650
+ if (tableContext.tagName === 'IFRAME') {
651
+ try {
652
+ const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
653
+ rows.push(...iframeDoc.getElementsByTagName('TR'));
654
+ }
655
+ catch (e) {
656
+ console.warn('Cannot access iframe rows:', e);
657
+ }
658
+ }
659
+ const processedRows = filterRowsBasedOnTag(rows, tableFields);
660
+ for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
661
+ const record = {};
662
+ const currentRow = processedRows[rowIndex];
663
+ for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
664
+ let element = null;
665
+ if (cellIndex >= 0) {
666
+ // Get TD element considering both contexts
667
+ let td = currentRow.children[cellIndex];
668
+ // Check shadow DOM for td
669
+ if (!td && currentRow.shadowRoot) {
670
+ const shadowCells = currentRow.shadowRoot.children;
671
+ if (shadowCells && shadowCells.length > cellIndex) {
672
+ td = shadowCells[cellIndex];
673
+ }
674
+ }
675
+ if (td) {
676
+ element = queryElement(td, selector);
677
+ if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
678
+ element = td;
679
+ }
680
+ if (!element) {
681
+ const tagOnlySelector = selector.split('.')[0];
682
+ element = queryElement(td, tagOnlySelector);
683
+ }
684
+ if (!element) {
685
+ let currentElement = td;
686
+ while (currentElement && currentElement.children.length > 0) {
687
+ let foundContentChild = false;
688
+ for (const child of currentElement.children) {
689
+ if (extractValue(child, attribute)) {
690
+ currentElement = child;
691
+ foundContentChild = true;
692
+ break;
693
+ }
694
+ }
695
+ if (!foundContentChild)
696
+ break;
697
+ }
698
+ element = currentElement;
699
+ }
700
+ }
701
+ }
702
+ else {
703
+ element = queryElement(currentRow, selector);
704
+ }
705
+ if (element) {
706
+ record[label] = extractValue(element, attribute);
707
+ }
708
+ }
709
+ if (Object.keys(record).length > 0) {
710
+ tableData.push(record);
711
+ }
712
+ }
713
+ }
714
+ }
715
+ }
716
+ // Process non-table data with both contexts support
717
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
718
+ if (nonTableData.length >= limit)
279
719
  break;
720
+ const container = containers[containerIndex];
721
+ const { nonTableFields } = containerFields[containerIndex];
722
+ if (Object.keys(nonTableFields).length > 0) {
723
+ const record = {};
724
+ for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
725
+ // Get the last part of the selector after any context delimiter
726
+ const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
727
+ const element = queryElement(container, relativeSelector);
728
+ if (element) {
729
+ record[label] = extractValue(element, attribute);
730
+ }
731
+ }
732
+ if (Object.keys(record).length > 0) {
733
+ nonTableData.push(record);
734
+ }
280
735
  }
281
736
  }
737
+ // Merge and limit the results
738
+ const scrapedData = [...tableData, ...nonTableData];
282
739
  return scrapedData;
283
740
  });
284
741
  };
@@ -86,6 +86,8 @@ export default class Interpreter extends EventEmitter {
86
86
  private carryOutSteps;
87
87
  private handlePagination;
88
88
  private getMatchingActionId;
89
+ private removeShadowSelectors;
90
+ private removeSpecialSelectors;
89
91
  private runLoop;
90
92
  private ensureScriptsLoaded;
91
93
  /**
@@ -461,6 +461,8 @@ class Interpreter extends events_1.EventEmitter {
461
461
  let previousHeight = 0;
462
462
  // track unique items per page to avoid re-scraping
463
463
  let scrapedItems = new Set();
464
+ let visitedUrls = [];
465
+ let availableSelectors = config.pagination.selector.split(',');
464
466
  while (true) {
465
467
  switch (config.pagination.type) {
466
468
  case 'scrollDown':
@@ -486,6 +488,7 @@ class Interpreter extends events_1.EventEmitter {
486
488
  previousHeight = currentTopHeight;
487
489
  break;
488
490
  case 'clickNext':
491
+ console.log("Page URL:", page.url());
489
492
  const pageResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
490
493
  // console.log("Page results:", pageResults);
491
494
  // Filter out already scraped items
@@ -497,30 +500,128 @@ class Interpreter extends events_1.EventEmitter {
497
500
  return true;
498
501
  });
499
502
  allResults = allResults.concat(newResults);
503
+ console.log("Results so far:", allResults.length);
500
504
  if (config.limit && allResults.length >= config.limit) {
501
505
  return allResults.slice(0, config.limit);
502
506
  }
503
- const nextButton = yield page.$(config.pagination.selector);
507
+ let checkButton = null;
508
+ let workingSelector = null;
509
+ for (let i = 0; i < availableSelectors.length; i++) {
510
+ const selector = availableSelectors[i];
511
+ try {
512
+ // Wait for selector with a short timeout
513
+ checkButton = yield page.waitForSelector(selector, { state: 'attached' });
514
+ if (checkButton) {
515
+ workingSelector = selector;
516
+ break;
517
+ }
518
+ }
519
+ catch (error) {
520
+ console.log(`Selector failed: ${selector}`);
521
+ }
522
+ }
523
+ if (!workingSelector) {
524
+ return allResults;
525
+ }
526
+ // const nextButton = await page.$(config.pagination.selector);
527
+ const nextButton = yield page.$(workingSelector);
504
528
  if (!nextButton) {
505
529
  return allResults; // No more pages to scrape
506
530
  }
507
- yield Promise.all([
508
- nextButton.dispatchEvent('click'),
509
- page.waitForNavigation({ waitUntil: 'networkidle' })
510
- ]);
531
+ const selectorIndex = availableSelectors.indexOf(workingSelector);
532
+ availableSelectors = availableSelectors.slice(selectorIndex);
533
+ // await Promise.all([
534
+ // nextButton.dispatchEvent('click'),
535
+ // page.waitForNavigation({ waitUntil: 'networkidle' })
536
+ // ]);
537
+ const previousUrl = page.url();
538
+ visitedUrls.push(previousUrl);
539
+ try {
540
+ // Try both click methods simultaneously
541
+ yield Promise.race([
542
+ Promise.all([
543
+ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
544
+ nextButton.click()
545
+ ]),
546
+ Promise.all([
547
+ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
548
+ nextButton.dispatchEvent('click')
549
+ ])
550
+ ]);
551
+ }
552
+ catch (error) {
553
+ // Verify if navigation actually succeeded
554
+ const currentUrl = page.url();
555
+ if (currentUrl === previousUrl) {
556
+ console.log("Previous URL same as current URL. Navigation failed.");
557
+ }
558
+ }
559
+ const currentUrl = page.url();
560
+ if (visitedUrls.includes(currentUrl)) {
561
+ console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);
562
+ // Extract the current page number from the URL
563
+ const match = currentUrl.match(/\d+/);
564
+ if (match) {
565
+ const currentNumber = match[0];
566
+ // Use visitedUrls.length + 1 as the next page number
567
+ const nextNumber = visitedUrls.length + 1;
568
+ // Create new URL by replacing the current number with the next number
569
+ const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());
570
+ console.log(`Navigating to constructed URL: ${nextUrl}`);
571
+ // Navigate to the next page
572
+ yield Promise.all([
573
+ page.waitForNavigation({ waitUntil: 'networkidle' }),
574
+ page.goto(nextUrl)
575
+ ]);
576
+ }
577
+ }
578
+ // Give the page a moment to stabilize after navigation
511
579
  yield page.waitForTimeout(1000);
512
580
  break;
513
581
  case 'clickLoadMore':
514
582
  while (true) {
515
- const loadMoreButton = yield page.$(config.pagination.selector);
583
+ let checkButton = null;
584
+ let workingSelector = null;
585
+ for (let i = 0; i < availableSelectors.length; i++) {
586
+ const selector = availableSelectors[i];
587
+ try {
588
+ // Wait for selector with a short timeout
589
+ checkButton = yield page.waitForSelector(selector, { state: 'attached' });
590
+ if (checkButton) {
591
+ workingSelector = selector;
592
+ break;
593
+ }
594
+ }
595
+ catch (error) {
596
+ console.log(`Selector failed: ${selector}`);
597
+ }
598
+ }
599
+ if (!workingSelector) {
600
+ // No more working selectors available, so scrape the remaining items
601
+ const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
602
+ allResults = allResults.concat(finalResults);
603
+ return allResults;
604
+ }
605
+ const loadMoreButton = yield page.$(workingSelector);
516
606
  if (!loadMoreButton) {
517
607
  // No more "Load More" button, so scrape the remaining items
518
608
  const finalResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
519
609
  allResults = allResults.concat(finalResults);
520
610
  return allResults;
521
611
  }
612
+ const selectorIndex = availableSelectors.indexOf(workingSelector);
613
+ availableSelectors = availableSelectors.slice(selectorIndex);
522
614
  // Click the 'Load More' button to load additional items
523
- yield loadMoreButton.dispatchEvent('click');
615
+ // await loadMoreButton.dispatchEvent('click');
616
+ try {
617
+ yield Promise.race([
618
+ loadMoreButton.click(),
619
+ loadMoreButton.dispatchEvent('click')
620
+ ]);
621
+ }
622
+ catch (error) {
623
+ console.log('Both click attempts failed');
624
+ }
524
625
  yield page.waitForTimeout(2000); // Wait for new items to load
525
626
  // After clicking 'Load More', scroll down to load more items
526
627
  yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
@@ -568,10 +669,32 @@ class Interpreter extends events_1.EventEmitter {
568
669
  }
569
670
  }
570
671
  }
672
+ removeShadowSelectors(workflow) {
673
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
674
+ const step = workflow[actionId];
675
+ // Check if step has where and selectors
676
+ if (step.where && Array.isArray(step.where.selectors)) {
677
+ // Filter out selectors that contain ">>"
678
+ step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
679
+ }
680
+ }
681
+ return workflow;
682
+ }
683
+ removeSpecialSelectors(workflow) {
684
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
685
+ const step = workflow[actionId];
686
+ if (step.where && Array.isArray(step.where.selectors)) {
687
+ // Filter out if selector has EITHER ":>>" OR ">>"
688
+ step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
689
+ }
690
+ }
691
+ return workflow;
692
+ }
571
693
  runLoop(p, workflow) {
572
694
  var _a, _b;
573
695
  return __awaiter(this, void 0, void 0, function* () {
574
- const workflowCopy = JSON.parse(JSON.stringify(workflow));
696
+ let workflowCopy = JSON.parse(JSON.stringify(workflow));
697
+ workflowCopy = this.removeSpecialSelectors(workflowCopy);
575
698
  // apply ad-blocker to the current page
576
699
  try {
577
700
  yield this.applyAdBlocker(p);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.7",
3
+ "version": "0.0.9",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",