maxun-core 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -159,6 +159,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
159
159
  * @returns {Array.<Object.<string, string>>}
160
160
  */
161
161
  window.scrapeSchema = function (lists) {
162
+ // Utility functions remain the same
162
163
  function omap(object, f, kf = (x) => x) {
163
164
  return Object.fromEntries(Object.entries(object)
164
165
  .map(([k, v]) => [kf(k), f(v)]));
@@ -167,15 +168,121 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
167
168
  return Object.fromEntries(Object.entries(object)
168
169
  .filter(([k, v]) => f(k, v)));
169
170
  }
171
/**
 * Resolves a selector config to the list of matching elements.
 *
 * Three selector forms are handled:
 *  - plain CSS: queried directly on `document`;
 *  - `a :>> b`: each `:>>` steps into the documents of the iframes matched so far;
 *  - `a >> b`: each `>>` steps into the shadow roots of the hosts matched so far.
 *
 * @param {{selector: string}} config - Object whose `selector` holds the selector string.
 * @returns {Array} Matching elements; empty when an iframe step matches nothing.
 */
function findAllElements(config) {
    const selector = config.selector;
    // ':>>' contains '>>', so a single check covers both delimiters.
    if (!selector.includes('>>')) {
        return Array.from(document.querySelectorAll(selector));
    }
    // Iframe traversal is tested first because every ':>>' selector also contains '>>'.
    if (selector.includes(':>>')) {
        const parts = selector.split(':>>').map((s) => s.trim());
        let currentElements = [document];
        for (let i = 0; i < parts.length; i++) {
            const part = parts[i];
            const isLast = i === parts.length - 1;
            const nextElements = [];
            for (const element of currentElements) {
                try {
                    // Iframes are queried through contentDocument; the initial `document`
                    // entry (or an iframe without an accessible document) is queried as-is.
                    const doc = element.contentDocument || element;
                    const found = Array.from(doc.querySelectorAll(part));
                    if (isLast) {
                        nextElements.push(...found);
                    }
                    else {
                        // Only iframes can be stepped into on the next iteration.
                        nextElements.push(...found.filter((el) => el.tagName === 'IFRAME'));
                    }
                }
                catch (error) {
                    console.warn('Cannot access iframe content:', error, {
                        part,
                        element,
                        index: i
                    });
                }
            }
            if (nextElements.length === 0) {
                console.warn('No elements found for part:', part, 'at depth:', i);
                return [];
            }
            currentElements = nextElements;
        }
        return currentElements;
    }
    // Shadow DOM traversal.
    const parts = selector.split('>>').map((s) => s.trim());
    let currentRoots = [document];
    for (let i = 0; i < parts.length; i++) {
        const isLast = i === parts.length - 1;
        const nextRoots = [];
        for (const root of currentRoots) {
            for (const match of Array.from(root.querySelectorAll(parts[i]))) {
                // Step into a shadow root only when another part still has to be resolved
                // inside it. A match of the last part is the result itself, even if it
                // hosts a shadow root of its own.
                nextRoots.push(!isLast && match.shadowRoot ? match.shadowRoot : match);
            }
        }
        currentRoots = nextRoots;
    }
    // Defensive: never hand a ShadowRoot back to callers expecting elements.
    return currentRoots.filter((el) => !(el instanceof ShadowRoot));
}
245
+ // Modified to handle iframe context for URL resolution
246
/**
 * Reads a value from an element according to the requested attribute.
 *
 * - `href` / `src`: the attribute resolved to an absolute URL, or null when absent.
 * - `innerText` / `textContent`: the trimmed property (undefined when the property is missing).
 * - anything else: the raw attribute, falling back to the trimmed `innerText`.
 *
 * @param {Element|null|undefined} element - Element to read from.
 * @param {string} attribute - Attribute or property name to extract.
 * @returns {string|null|undefined} Extracted value; null when `element` is falsy.
 */
function getElementValue(element, attribute) {
    if (!element) {
        return null;
    }
    const trimmed = (text) => (text == null ? undefined : text.trim());
    // Resolve against the URL of the document that owns the element (which may be an
    // iframe document). That URL can be unusable as a base (e.g. 'about:blank'), in
    // which case `new URL` throws; then try the top-level origin, and finally return
    // the raw attribute so one odd link does not abort the whole scrape.
    const toAbsoluteUrl = (rawUrl) => {
        if (!rawUrl) {
            return null;
        }
        const ownerDoc = element.ownerDocument;
        const docHref = ownerDoc && ownerDoc.location ? ownerDoc.location.href : undefined;
        if (docHref) {
            try {
                return new URL(rawUrl, docHref).href;
            }
            catch (error) {
                // Fall through to the top-level origin below.
            }
        }
        try {
            return new URL(rawUrl, window.location.origin).href;
        }
        catch (error) {
            return rawUrl;
        }
    };
    switch (attribute) {
        case 'href':
        case 'src':
            return toAbsoluteUrl(element.getAttribute(attribute));
        case 'innerText':
            return trimmed(element.innerText);
        case 'textContent':
            return trimmed(element.textContent);
        default:
            return element.getAttribute(attribute) || trimmed(element.innerText);
    }
}
269
+ // Rest of the functions remain largely the same
170
270
  function getSeedKey(listObj) {
171
- const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
172
- return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
271
+ const maxLength = Math.max(...Object.values(omap(listObj, (x) => findAllElements(x).length)));
272
+ return Object.keys(ofilter(listObj, (_, v) => findAllElements(v).length === maxLength))[0];
173
273
  }
274
+ // Find minimal bounding elements
174
275
  function getMBEs(elements) {
175
276
  return elements.map((element) => {
176
277
  let candidate = element;
177
278
  const isUniqueChild = (e) => elements
178
- .filter((elem) => { var _a; return (_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem); })
279
+ .filter((elem) => {
280
+ var _a;
281
+ // Handle both iframe and shadow DOM boundaries
282
+ const sameContext = elem.getRootNode() === e.getRootNode() &&
283
+ elem.ownerDocument === e.ownerDocument;
284
+ return sameContext && ((_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem));
285
+ })
179
286
  .length === 1;
180
287
  while (candidate && isUniqueChild(candidate)) {
181
288
  candidate = candidate.parentNode;
@@ -184,28 +291,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
184
291
  });
185
292
  }
186
293
  const seedName = getSeedKey(lists);
187
- const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
294
+ const seedElements = findAllElements(lists[seedName]);
188
295
  const MBEs = getMBEs(seedElements);
189
- return MBEs.map((mbe) => omap(lists, ({ selector, attribute }, key) => {
190
- const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
191
- if (!elem)
192
- return undefined;
193
- switch (attribute) {
194
- case 'href':
195
- const relativeHref = elem.getAttribute('href');
196
- return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
197
- case 'src':
198
- const relativeSrc = elem.getAttribute('src');
199
- return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
200
- case 'innerText':
201
- return elem.innerText;
202
- case 'textContent':
203
- return elem.textContent;
204
- default:
205
- return elem.innerText;
206
- }
207
- }, (key) => key // Use the original key in the output
208
- )) || [];
296
+ const mbeResults = MBEs.map((mbe) => omap(lists, (config) => {
297
+ const elem = findAllElements(config)
298
+ .find((elem) => mbe.contains(elem));
299
+ return elem ? getElementValue(elem, config.attribute) : undefined;
300
+ }, (key) => key)) || [];
301
+ // If MBE approach didn't find all elements, try independent scraping
302
+ if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
303
+ // Fall back to independent scraping
304
+ const results = [];
305
+ const foundElements = new Map();
306
+ // Find all elements for each selector
307
+ Object.entries(lists).forEach(([key, config]) => {
308
+ const elements = findAllElements(config);
309
+ foundElements.set(key, elements);
310
+ });
311
+ // Create result objects for each found element
312
+ foundElements.forEach((elements, key) => {
313
+ elements.forEach((element, index) => {
314
+ if (!results[index]) {
315
+ results[index] = {};
316
+ }
317
+ results[index][key] = getElementValue(element, lists[key].attribute);
318
+ });
319
+ });
320
+ return results.filter(result => Object.keys(result).length > 0);
321
+ }
322
+ return mbeResults;
209
323
  };
210
324
  /**
211
325
  * Scrapes multiple lists of similar items based on a template item.
@@ -218,67 +332,410 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
218
332
  */
219
333
  window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
220
334
  return __awaiter(this, void 0, void 0, function* () {
221
- const scrapedData = [];
222
- while (scrapedData.length < limit) {
223
- let parentElements = Array.from(document.querySelectorAll(listSelector));
224
- // If we only got one element or none, try a more generic approach
225
- if (limit > 1 && parentElements.length <= 1) {
226
- const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
227
- const container = document.querySelector(containerSelector);
228
- if (container) {
229
- const allChildren = Array.from(container.children);
230
- const firstMatch = document.querySelector(listSelector);
231
- if (firstMatch) {
232
- // Get classes from the first matching element
233
- const firstMatchClasses = Array.from(firstMatch.classList);
234
- // Find similar elements by matching most of their classes
235
- parentElements = allChildren.filter(element => {
236
- const elementClasses = Array.from(element.classList);
237
- // Element should share at least 70% of classes with the first match
238
- const commonClasses = firstMatchClasses.filter(cls => elementClasses.includes(cls));
239
- return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
240
- });
335
+ // Enhanced query function to handle both iframe and shadow DOM
336
+ const queryElement = (rootElement, selector) => {
337
+ if (!selector.includes('>>') && !selector.includes(':>>')) {
338
+ return rootElement.querySelector(selector);
339
+ }
340
+ const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
341
+ let currentElement = rootElement;
342
+ for (let i = 0; i < parts.length; i++) {
343
+ if (!currentElement)
344
+ return null;
345
+ // Handle iframe traversal
346
+ if (currentElement.tagName === 'IFRAME') {
347
+ try {
348
+ const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
349
+ currentElement = iframeDoc.querySelector(parts[i]);
350
+ continue;
351
+ }
352
+ catch (e) {
353
+ console.warn('Cannot access iframe content:', e);
354
+ return null;
355
+ }
356
+ }
357
+ // Try regular DOM first
358
+ let nextElement = currentElement.querySelector(parts[i]);
359
+ // Try shadow DOM if not found
360
+ if (!nextElement && currentElement.shadowRoot) {
361
+ nextElement = currentElement.shadowRoot.querySelector(parts[i]);
362
+ }
363
+ // Check children's shadow roots if still not found
364
+ if (!nextElement) {
365
+ const children = Array.from(currentElement.children || []);
366
+ for (const child of children) {
367
+ if (child.shadowRoot) {
368
+ nextElement = child.shadowRoot.querySelector(parts[i]);
369
+ if (nextElement)
370
+ break;
371
+ }
241
372
  }
242
373
  }
374
+ currentElement = nextElement;
243
375
  }
244
- // Iterate through each parent element
245
- for (const parent of parentElements) {
246
- if (scrapedData.length >= limit)
247
- break;
248
- const record = {};
249
- // For each field, select the corresponding element within the parent
250
- for (const [label, { selector, attribute }] of Object.entries(fields)) {
251
- const fieldElement = parent.querySelector(selector);
252
- if (fieldElement) {
253
- if (attribute === 'innerText') {
254
- record[label] = fieldElement.innerText.trim();
376
+ return currentElement;
377
+ };
378
+ // Enhanced query all function for both contexts
379
+ const queryElementAll = (rootElement, selector) => {
380
+ if (!selector.includes('>>') && !selector.includes(':>>')) {
381
+ return rootElement.querySelectorAll(selector);
382
+ }
383
+ const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
384
+ let currentElements = [rootElement];
385
+ for (const part of parts) {
386
+ const nextElements = [];
387
+ for (const element of currentElements) {
388
+ // Handle iframe traversal
389
+ if (element.tagName === 'IFRAME') {
390
+ try {
391
+ const iframeDoc = element.contentDocument || element.contentWindow.document;
392
+ nextElements.push(...iframeDoc.querySelectorAll(part));
393
+ }
394
+ catch (e) {
395
+ console.warn('Cannot access iframe content:', e);
396
+ continue;
397
+ }
398
+ }
399
+ else {
400
+ // Regular DOM elements
401
+ if (element.querySelectorAll) {
402
+ nextElements.push(...element.querySelectorAll(part));
255
403
  }
256
- else if (attribute === 'innerHTML') {
257
- record[label] = fieldElement.innerHTML.trim();
404
+ // Shadow DOM elements
405
+ if (element.shadowRoot) {
406
+ nextElements.push(...element.shadowRoot.querySelectorAll(part));
258
407
  }
259
- else if (attribute === 'src') {
260
- // Handle relative 'src' URLs
261
- const src = fieldElement.getAttribute('src');
262
- record[label] = src ? new URL(src, window.location.origin).href : null;
408
+ // Check children's shadow roots
409
+ const children = Array.from(element.children || []);
410
+ for (const child of children) {
411
+ if (child.shadowRoot) {
412
+ nextElements.push(...child.shadowRoot.querySelectorAll(part));
413
+ }
414
+ }
415
+ }
416
+ }
417
+ currentElements = nextElements;
418
+ }
419
+ return currentElements;
420
+ };
421
+ // Enhanced value extraction with context awareness
422
/**
 * Extracts a field value from an element.
 *
 * Non-blank text inside the element's own shadow root takes precedence and is
 * returned whatever `attribute` is. Otherwise:
 * - `innerText` / `innerHTML`: the trimmed property, or null when the element
 *   does not expose it as a string;
 * - `src` / `href`: the attribute resolved to an absolute URL, or null when absent;
 * - anything else: the raw attribute value.
 *
 * @param {Element|null|undefined} element - Element to read from.
 * @param {string} attribute - Attribute or property name to extract.
 * @returns {string|null} Extracted value; null when `element` is falsy.
 */
function extractValue(element, attribute) {
    if (!element) {
        return null;
    }
    if (element.shadowRoot) {
        const shadowContent = element.shadowRoot.textContent;
        const shadowText = shadowContent == null ? '' : shadowContent.trim();
        if (shadowText) {
            return shadowText;
        }
    }
    if (attribute === 'innerText' || attribute === 'innerHTML') {
        // Not every element exposes these as strings; return null instead of
        // throwing on `.trim()` of undefined.
        const text = element[attribute];
        return typeof text === 'string' ? text.trim() : null;
    }
    if (attribute === 'src' || attribute === 'href') {
        const attrValue = element.getAttribute(attribute);
        if (!attrValue) {
            return null;
        }
        // Prefer the owning document's URL (context-aware for iframes). It may be
        // unusable as a base (e.g. 'about:blank'), so fall back to the top-level
        // origin and finally to the raw attribute rather than throwing.
        const ownerDoc = element.ownerDocument;
        const docHref = ownerDoc && ownerDoc.location ? ownerDoc.location.href : undefined;
        if (docHref) {
            try {
                return new URL(attrValue, docHref).href;
            }
            catch (error) {
                // Fall through to the top-level origin below.
            }
        }
        try {
            return new URL(attrValue, window.location.origin).href;
        }
        catch (error) {
            return attrValue;
        }
    }
    return element.getAttribute(attribute);
}
447
+ // Enhanced table ancestor finding with context support
448
/**
 * Looks for a table cell or row at or above `element`.
 *
 * At most MAX_DEPTH nodes are inspected. A node whose root is a ShadowRoot is
 * replaced by that root's host before it is inspected, and that hop does not
 * count towards the depth.
 * NOTE(review): because the hop happens before the tag check, a TD/TR that
 * itself lives inside a shadow root is never reported — confirm this is intended.
 *
 * @param {Element|null|undefined} element - Starting element.
 * @returns {{type: string, element: Element}|null} `type` is 'TD' or 'TR'; null when none is found.
 */
function findTableAncestor(element) {
    const MAX_DEPTH = 5;
    let node = element;
    let steps = 0;
    while (node && steps < MAX_DEPTH) {
        const root = node.getRootNode();
        if (root instanceof ShadowRoot) {
            node = root.host;
            continue;
        }
        const tag = node.tagName;
        if (tag === 'TD' || tag === 'TR') {
            return { type: tag, element: node };
        }
        if (tag === 'IFRAME') {
            try {
                // Throws when contentDocument is not available.
                node = node.contentDocument.body;
            }
            catch (e) {
                return null;
            }
        }
        else {
            node = node.parentElement;
        }
        steps += 1;
    }
    return null;
}
480
+ // Helper function to get cell index
481
/**
 * Computes the index of a table cell.
 *
 * For a cell whose root is a ShadowRoot, the index is its position among all
 * `td` elements found in that shadow root. Otherwise it is the number of
 * element siblings that precede the cell.
 *
 * @param {Element} td - The cell to locate.
 * @returns {number} Zero-based index (-1 if a shadow-root cell is not among the root's `td`s).
 */
function getCellIndex(td) {
    const root = td.getRootNode();
    if (root instanceof ShadowRoot) {
        return Array.from(root.querySelectorAll('td')).indexOf(td);
    }
    let precedingCount = 0;
    for (let prev = td.previousElementSibling; prev; prev = prev.previousElementSibling) {
        precedingCount += 1;
    }
    return precedingCount;
}
494
+ // Helper function to check for TH elements
495
+ function hasThElement(row, tableFields) {
496
+ for (const [_, { selector }] of Object.entries(tableFields)) {
497
+ const element = queryElement(row, selector);
498
+ if (element) {
499
+ let current = element;
500
+ while (current && current !== row) {
501
+ if (current.getRootNode() instanceof ShadowRoot) {
502
+ current = current.getRootNode().host;
503
+ continue;
263
504
  }
264
- else if (attribute === 'href') {
265
- // Handle relative 'href' URLs
266
- const href = fieldElement.getAttribute('href');
267
- record[label] = href ? new URL(href, window.location.origin).href : null;
505
+ if (current.tagName === 'TH')
506
+ return true;
507
+ if (current.tagName === 'IFRAME') {
508
+ try {
509
+ current = current.contentDocument.body;
510
+ }
511
+ catch (e) {
512
+ break;
513
+ }
268
514
  }
269
515
  else {
270
- record[label] = fieldElement.getAttribute(attribute);
516
+ current = current.parentElement;
271
517
  }
272
518
  }
273
519
  }
274
- scrapedData.push(record);
275
520
  }
276
- // If we've processed all available elements and still haven't reached the limit,
277
- // break to avoid infinite loop
278
- if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
521
+ return false;
522
+ }
523
+ // Helper function to filter rows
524
/**
 * Decides which table rows should be scraped.
 *
 * If `hasThElement` reports true for any row, the rows are returned untouched.
 * Otherwise every row that contains a TH — among its descendants or inside its
 * own shadow root — is dropped.
 *
 * @param {Array<Element>} rows - Candidate rows.
 * @param {Object} tableFields - Field configs, forwarded to `hasThElement`.
 * @returns {Array<Element>} The same array, or a filtered copy without TH rows.
 */
function filterRowsBasedOnTag(rows, tableFields) {
    const someFieldIsInHeader = rows.some((row) => hasThElement(row, tableFields));
    if (someFieldIsInHeader) {
        return rows;
    }
    return rows.filter((row) => {
        const noLightTh = row.getElementsByTagName('TH').length === 0;
        let noShadowTh = true;
        if (row.shadowRoot) {
            noShadowTh = row.shadowRoot.querySelector('th') === null;
        }
        return noLightTh && noShadowTh;
    });
}
538
+ // Class similarity comparison functions
539
/**
 * Jaccard similarity of two class lists: |intersection| / |union| over their
 * distinct entries.
 *
 * @param {Iterable<string>} classList1 - First list of class names.
 * @param {Iterable<string>} classList2 - Second list of class names.
 * @returns {number} Value in [0, 1]; 0 when both lists are empty (avoids 0/0 = NaN).
 */
function calculateClassSimilarity(classList1, classList2) {
    const set1 = new Set(classList1);
    const set2 = new Set(classList2);
    const unionSize = new Set([...set1, ...set2]).size;
    if (unionSize === 0) {
        return 0;
    }
    let sharedCount = 0;
    for (const cls of set1) {
        if (set2.has(cls)) {
            sharedCount += 1;
        }
    }
    return sharedCount / unionSize;
}
546
+ // Enhanced similar elements finding with context support
547
/**
 * Finds elements with the same tag name as `baseElement` whose class list is
 * similar enough to it, as scored by `calculateClassSimilarity`.
 *
 * Candidates are collected from the main document, from the shadow tree that
 * contains `baseElement` (plus the light DOM of its host), and from every
 * iframe of the main document whose content is accessible.
 *
 * @param {Element} baseElement - Reference element; never included in the result.
 * @param {number} [similarityThreshold=0.7] - Minimum similarity score to qualify.
 * @returns {Array<Element>} Distinct similar elements; empty when `baseElement` has no classes.
 */
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
    const baseClasses = Array.from(baseElement.classList);
    if (baseClasses.length === 0) {
        return [];
    }
    const tagName = baseElement.tagName;
    // A Set keeps insertion order and drops the duplicates that arise when the
    // pools below overlap (e.g. the host's light DOM is also part of the document).
    const candidates = new Set(document.getElementsByTagName(tagName));
    const root = baseElement.getRootNode();
    if (root instanceof ShadowRoot) {
        for (const el of Array.from(root.host.getElementsByTagName(tagName))) {
            candidates.add(el);
        }
        // Siblings of baseElement live inside the shadow root itself, which the
        // host's getElementsByTagName does not reach. ShadowRoot has no
        // getElementsByTagName, so a type selector is used instead.
        for (const el of Array.from(root.querySelectorAll(tagName))) {
            candidates.add(el);
        }
    }
    for (const iframe of document.getElementsByTagName('iframe')) {
        try {
            const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
            for (const el of Array.from(iframeDoc.getElementsByTagName(tagName))) {
                candidates.add(el);
            }
        }
        catch (e) {
            console.warn('Cannot access iframe content:', e);
        }
    }
    return Array.from(candidates).filter((element) => {
        if (element === baseElement) {
            return false;
        }
        const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
        return similarity >= similarityThreshold;
    });
}
577
+ // Main scraping logic with context support
578
+ let containers = queryElementAll(document, listSelector);
579
+ containers = Array.from(containers);
580
+ if (containers.length === 0)
581
+ return [];
582
+ if (limit > 1 && containers.length === 1) {
583
+ const baseContainer = containers[0];
584
+ const similarContainers = findSimilarElements(baseContainer);
585
+ if (similarContainers.length > 0) {
586
+ const newContainers = similarContainers.filter(container => !container.matches(listSelector));
587
+ containers = [...containers, ...newContainers];
588
+ }
589
+ }
590
+ const containerFields = containers.map(() => ({
591
+ tableFields: {},
592
+ nonTableFields: {}
593
+ }));
594
+ // Classify fields
595
+ containers.forEach((container, containerIndex) => {
596
+ for (const [label, field] of Object.entries(fields)) {
597
+ const sampleElement = queryElement(container, field.selector);
598
+ if (sampleElement) {
599
+ const ancestor = findTableAncestor(sampleElement);
600
+ if (ancestor) {
601
+ containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 });
602
+ }
603
+ else {
604
+ containerFields[containerIndex].nonTableFields[label] = field;
605
+ }
606
+ }
607
+ else {
608
+ containerFields[containerIndex].nonTableFields[label] = field;
609
+ }
610
+ }
611
+ });
612
+ const tableData = [];
613
+ const nonTableData = [];
614
+ // Process table data with both iframe and shadow DOM support
615
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
616
+ const container = containers[containerIndex];
617
+ const { tableFields } = containerFields[containerIndex];
618
+ if (Object.keys(tableFields).length > 0) {
619
+ const firstField = Object.values(tableFields)[0];
620
+ const firstElement = queryElement(container, firstField.selector);
621
+ let tableContext = firstElement;
622
+ // Find table context including both iframe and shadow DOM
623
+ while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
624
+ if (tableContext.getRootNode() instanceof ShadowRoot) {
625
+ tableContext = tableContext.getRootNode().host;
626
+ continue;
627
+ }
628
+ if (tableContext.tagName === 'IFRAME') {
629
+ try {
630
+ tableContext = tableContext.contentDocument.body;
631
+ }
632
+ catch (e) {
633
+ break;
634
+ }
635
+ }
636
+ else {
637
+ tableContext = tableContext.parentElement;
638
+ }
639
+ }
640
+ if (tableContext) {
641
+ // Get rows from all contexts
642
+ const rows = [];
643
+ // Get rows from regular DOM
644
+ rows.push(...tableContext.getElementsByTagName('TR'));
645
+ // Get rows from shadow DOM
646
+ if (tableContext.shadowRoot) {
647
+ rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
648
+ }
649
+ // Get rows from iframes
650
+ if (tableContext.tagName === 'IFRAME') {
651
+ try {
652
+ const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
653
+ rows.push(...iframeDoc.getElementsByTagName('TR'));
654
+ }
655
+ catch (e) {
656
+ console.warn('Cannot access iframe rows:', e);
657
+ }
658
+ }
659
+ const processedRows = filterRowsBasedOnTag(rows, tableFields);
660
+ for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
661
+ const record = {};
662
+ const currentRow = processedRows[rowIndex];
663
+ for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
664
+ let element = null;
665
+ if (cellIndex >= 0) {
666
+ // Get TD element considering both contexts
667
+ let td = currentRow.children[cellIndex];
668
+ // Check shadow DOM for td
669
+ if (!td && currentRow.shadowRoot) {
670
+ const shadowCells = currentRow.shadowRoot.children;
671
+ if (shadowCells && shadowCells.length > cellIndex) {
672
+ td = shadowCells[cellIndex];
673
+ }
674
+ }
675
+ if (td) {
676
+ element = queryElement(td, selector);
677
+ if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
678
+ element = td;
679
+ }
680
+ if (!element) {
681
+ const tagOnlySelector = selector.split('.')[0];
682
+ element = queryElement(td, tagOnlySelector);
683
+ }
684
+ if (!element) {
685
+ let currentElement = td;
686
+ while (currentElement && currentElement.children.length > 0) {
687
+ let foundContentChild = false;
688
+ for (const child of currentElement.children) {
689
+ if (extractValue(child, attribute)) {
690
+ currentElement = child;
691
+ foundContentChild = true;
692
+ break;
693
+ }
694
+ }
695
+ if (!foundContentChild)
696
+ break;
697
+ }
698
+ element = currentElement;
699
+ }
700
+ }
701
+ }
702
+ else {
703
+ element = queryElement(currentRow, selector);
704
+ }
705
+ if (element) {
706
+ record[label] = extractValue(element, attribute);
707
+ }
708
+ }
709
+ if (Object.keys(record).length > 0) {
710
+ tableData.push(record);
711
+ }
712
+ }
713
+ }
714
+ }
715
+ }
716
+ // Process non-table data with both contexts support
717
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
718
+ if (nonTableData.length >= limit)
279
719
  break;
720
+ const container = containers[containerIndex];
721
+ const { nonTableFields } = containerFields[containerIndex];
722
+ if (Object.keys(nonTableFields).length > 0) {
723
+ const record = {};
724
+ for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
725
+ // Get the last part of the selector after any context delimiter
726
+ const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
727
+ const element = queryElement(container, relativeSelector);
728
+ if (element) {
729
+ record[label] = extractValue(element, attribute);
730
+ }
731
+ }
732
+ if (Object.keys(record).length > 0) {
733
+ nonTableData.push(record);
734
+ }
280
735
  }
281
736
  }
737
+ // Merge and limit the results
738
+ const scrapedData = [...tableData, ...nonTableData];
282
739
  return scrapedData;
283
740
  });
284
741
  };
@@ -86,6 +86,8 @@ export default class Interpreter extends EventEmitter {
86
86
  private carryOutSteps;
87
87
  private handlePagination;
88
88
  private getMatchingActionId;
89
+ private removeShadowSelectors;
90
+ private removeSpecialSelectors;
89
91
  private runLoop;
90
92
  private ensureScriptsLoaded;
91
93
  /**
@@ -568,10 +568,32 @@ class Interpreter extends events_1.EventEmitter {
568
568
  }
569
569
  }
570
570
  }
571
+ removeShadowSelectors(workflow) {
572
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
573
+ const step = workflow[actionId];
574
+ // Check if step has where and selectors
575
+ if (step.where && Array.isArray(step.where.selectors)) {
576
+ // Filter out selectors that contain ">>"
577
+ step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
578
+ }
579
+ }
580
+ return workflow;
581
+ }
582
+ removeSpecialSelectors(workflow) {
583
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
584
+ const step = workflow[actionId];
585
+ if (step.where && Array.isArray(step.where.selectors)) {
586
+ // Filter out if selector has EITHER ":>>" OR ">>"
587
+ step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
588
+ }
589
+ }
590
+ return workflow;
591
+ }
571
592
  runLoop(p, workflow) {
572
593
  var _a, _b;
573
594
  return __awaiter(this, void 0, void 0, function* () {
574
- const workflowCopy = JSON.parse(JSON.stringify(workflow));
595
+ let workflowCopy = JSON.parse(JSON.stringify(workflow));
596
+ workflowCopy = this.removeSpecialSelectors(workflowCopy);
575
597
  // apply ad-blocker to the current page
576
598
  try {
577
599
  yield this.applyAdBlocker(p);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.7",
3
+ "version": "0.0.8",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",