maxun-core 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -159,6 +159,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
159
159
  * @returns {Array.<Object.<string, string>>}
160
160
  */
161
161
  window.scrapeSchema = function (lists) {
162
+ // Utility functions remain the same
162
163
  function omap(object, f, kf = (x) => x) {
163
164
  return Object.fromEntries(Object.entries(object)
164
165
  .map(([k, v]) => [kf(k), f(v)]));
@@ -167,15 +168,121 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
167
168
  return Object.fromEntries(Object.entries(object)
168
169
  .filter(([k, v]) => f(k, v)));
169
170
  }
171
/**
 * Resolves a field configuration's selector to the matching elements.
 *
 * Supported selector forms:
 *  - plain CSS selector: queried on the top-level `document`;
 *  - `outer :>> inner` (iframe traversal): every part except the last is
 *    narrowed to `<iframe>` matches, and the following part is queried
 *    inside each of those frames' documents;
 *  - `outer >> inner` (shadow DOM traversal): the following part is queried
 *    inside a matched element's shadow root when it has one, otherwise
 *    among the element's own descendants.
 *
 * A selector containing `:>>` is always handled as an iframe selector.
 *
 * @param {{selector: string}} config Field configuration; only `selector` is read.
 * @returns {Array} Matched elements, or an empty array when nothing matches.
 */
function findAllElements(config) {
    const selector = config.selector;
    // ':>>' itself contains '>>', so a single check covers both delimiters.
    if (!selector.includes('>>')) {
        return Array.from(document.querySelectorAll(selector));
    }
    // Iframe traversal.
    if (selector.includes(':>>')) {
        const parts = selector.split(':>>').map((s) => s.trim());
        let contexts = [document];
        for (let depth = 0; depth < parts.length; depth++) {
            const part = parts[depth];
            const isLast = depth === parts.length - 1;
            const matches = [];
            for (const context of contexts) {
                try {
                    // Prefer the frame's document; reading contentWindow.document of a
                    // cross-origin frame throws and is reported by the catch below.
                    let scope = context.contentDocument;
                    if (!scope && context.contentWindow) {
                        scope = context.contentWindow.document;
                    }
                    if (!scope) {
                        // Not a frame (e.g. the top-level document): query it directly.
                        scope = context;
                    }
                    for (const found of scope.querySelectorAll(part)) {
                        // Intermediate parts only lead somewhere if they are iframes.
                        if (isLast || found.tagName === 'IFRAME') {
                            matches.push(found);
                        }
                    }
                }
                catch (error) {
                    console.warn('Cannot access iframe content:', error, {
                        part,
                        element: context,
                        index: depth
                    });
                }
            }
            if (matches.length === 0) {
                console.warn('No elements found for part:', part, 'at depth:', depth);
                return [];
            }
            contexts = matches;
        }
        return contexts;
    }
    // Shadow DOM traversal.
    const parts = selector.split('>>').map((s) => s.trim());
    let contexts = [document];
    parts.forEach((part, depth) => {
        const isLast = depth === parts.length - 1;
        const matches = [];
        for (const context of contexts) {
            for (const found of context.querySelectorAll(part)) {
                // Descend into the shadow root only while more parts remain; a final
                // match is returned as the element itself even if it hosts a shadow
                // root (it used to be replaced by its root and then discarded).
                matches.push(!isLast && found.shadowRoot ? found.shadowRoot : found);
            }
        }
        contexts = matches;
    });
    return contexts;
}
245
+ // Modified to handle iframe context for URL resolution
246
/**
 * Reads the requested value from an element.
 *
 * - `href` / `src`: the attribute resolved to an absolute URL against the
 *   location of the element's own document (so elements inside an iframe
 *   resolve against the frame), falling back to the top window's origin.
 *   If the value cannot be resolved, the raw attribute value is returned
 *   instead of throwing.
 * - `innerText` / `textContent`: the trimmed property value.
 * - anything else: the attribute of that name, or the trimmed `innerText`
 *   when the attribute is missing or empty.
 *
 * @param {Element} element Element to read from; a falsy value yields `null`.
 * @param {string} attribute Name of the value to extract.
 * @returns {string|null|undefined} Extracted value; `null` for a missing
 *   element or URL attribute, `undefined` when a text property is absent.
 */
function getElementValue(element, attribute) {
    if (!element) {
        return null;
    }
    const ownerLocation = element.ownerDocument && element.ownerDocument.location;
    const baseURL = (ownerLocation && ownerLocation.href) || window.location.origin;
    const trimmed = (text) => (text === null || text === undefined ? undefined : text.trim());
    switch (attribute) {
        case 'href':
        case 'src': {
            const raw = element.getAttribute(attribute);
            if (!raw) {
                return null;
            }
            try {
                return new URL(raw, baseURL).href;
            }
            catch (error) {
                // new URL() throws for malformed values and for bases that cannot
                // act as a base (e.g. "about:srcdoc"); one bad link must not abort
                // the whole scrape, so hand back the unresolved value.
                return raw;
            }
        }
        case 'innerText':
            return trimmed(element.innerText);
        case 'textContent':
            return trimmed(element.textContent);
        default:
            return element.getAttribute(attribute) || trimmed(element.innerText);
    }
}
269
+ // Rest of the functions remain largely the same
170
270
  function getSeedKey(listObj) {
171
- const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
172
- return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
271
+ const maxLength = Math.max(...Object.values(omap(listObj, (x) => findAllElements(x).length)));
272
+ return Object.keys(ofilter(listObj, (_, v) => findAllElements(v).length === maxLength))[0];
173
273
  }
274
+ // Find minimal bounding elements
174
275
  function getMBEs(elements) {
175
276
  return elements.map((element) => {
176
277
  let candidate = element;
177
278
  const isUniqueChild = (e) => elements
178
- .filter((elem) => { var _a; return (_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem); })
279
+ .filter((elem) => {
280
+ var _a;
281
+ // Handle both iframe and shadow DOM boundaries
282
+ const sameContext = elem.getRootNode() === e.getRootNode() &&
283
+ elem.ownerDocument === e.ownerDocument;
284
+ return sameContext && ((_a = e.parentNode) === null || _a === void 0 ? void 0 : _a.contains(elem));
285
+ })
179
286
  .length === 1;
180
287
  while (candidate && isUniqueChild(candidate)) {
181
288
  candidate = candidate.parentNode;
@@ -184,28 +291,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
184
291
  });
185
292
  }
186
293
  const seedName = getSeedKey(lists);
187
- const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
294
+ const seedElements = findAllElements(lists[seedName]);
188
295
  const MBEs = getMBEs(seedElements);
189
- return MBEs.map((mbe) => omap(lists, ({ selector, attribute }, key) => {
190
- const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
191
- if (!elem)
192
- return undefined;
193
- switch (attribute) {
194
- case 'href':
195
- const relativeHref = elem.getAttribute('href');
196
- return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
197
- case 'src':
198
- const relativeSrc = elem.getAttribute('src');
199
- return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
200
- case 'innerText':
201
- return elem.innerText;
202
- case 'textContent':
203
- return elem.textContent;
204
- default:
205
- return elem.innerText;
206
- }
207
- }, (key) => key // Use the original key in the output
208
- )) || [];
296
+ const mbeResults = MBEs.map((mbe) => omap(lists, (config) => {
297
+ const elem = findAllElements(config)
298
+ .find((elem) => mbe.contains(elem));
299
+ return elem ? getElementValue(elem, config.attribute) : undefined;
300
+ }, (key) => key)) || [];
301
+ // If MBE approach didn't find all elements, try independent scraping
302
+ if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
303
+ // Fall back to independent scraping
304
+ const results = [];
305
+ const foundElements = new Map();
306
+ // Find all elements for each selector
307
+ Object.entries(lists).forEach(([key, config]) => {
308
+ const elements = findAllElements(config);
309
+ foundElements.set(key, elements);
310
+ });
311
+ // Create result objects for each found element
312
+ foundElements.forEach((elements, key) => {
313
+ elements.forEach((element, index) => {
314
+ if (!results[index]) {
315
+ results[index] = {};
316
+ }
317
+ results[index][key] = getElementValue(element, lists[key].attribute);
318
+ });
319
+ });
320
+ return results.filter(result => Object.keys(result).length > 0);
321
+ }
322
+ return mbeResults;
209
323
  };
210
324
  /**
211
325
  * Scrapes multiple lists of similar items based on a template item.
@@ -218,43 +332,410 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
218
332
  */
219
333
  window.scrapeList = function ({ listSelector, fields, limit = 10 }) {
220
334
  return __awaiter(this, void 0, void 0, function* () {
221
- const scrapedData = [];
222
- while (scrapedData.length < limit) {
223
- // Get all parent elements matching the listSelector
224
- const parentElements = Array.from(document.querySelectorAll(listSelector));
225
- // Iterate through each parent element
226
- for (const parent of parentElements) {
227
- if (scrapedData.length >= limit)
228
- break;
229
- const record = {};
230
- // For each field, select the corresponding element within the parent
231
- for (const [label, { selector, attribute }] of Object.entries(fields)) {
232
- const fieldElement = parent.querySelector(selector);
233
- if (fieldElement) {
234
- if (attribute === 'innerText') {
235
- record[label] = fieldElement.innerText.trim();
335
+ // Enhanced query function to handle both iframe and shadow DOM
336
+ const queryElement = (rootElement, selector) => {
337
+ if (!selector.includes('>>') && !selector.includes(':>>')) {
338
+ return rootElement.querySelector(selector);
339
+ }
340
+ const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
341
+ let currentElement = rootElement;
342
+ for (let i = 0; i < parts.length; i++) {
343
+ if (!currentElement)
344
+ return null;
345
+ // Handle iframe traversal
346
+ if (currentElement.tagName === 'IFRAME') {
347
+ try {
348
+ const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
349
+ currentElement = iframeDoc.querySelector(parts[i]);
350
+ continue;
351
+ }
352
+ catch (e) {
353
+ console.warn('Cannot access iframe content:', e);
354
+ return null;
355
+ }
356
+ }
357
+ // Try regular DOM first
358
+ let nextElement = currentElement.querySelector(parts[i]);
359
+ // Try shadow DOM if not found
360
+ if (!nextElement && currentElement.shadowRoot) {
361
+ nextElement = currentElement.shadowRoot.querySelector(parts[i]);
362
+ }
363
+ // Check children's shadow roots if still not found
364
+ if (!nextElement) {
365
+ const children = Array.from(currentElement.children || []);
366
+ for (const child of children) {
367
+ if (child.shadowRoot) {
368
+ nextElement = child.shadowRoot.querySelector(parts[i]);
369
+ if (nextElement)
370
+ break;
236
371
  }
237
- else if (attribute === 'innerHTML') {
238
- record[label] = fieldElement.innerHTML.trim();
372
+ }
373
+ }
374
+ currentElement = nextElement;
375
+ }
376
+ return currentElement;
377
+ };
378
+ // Enhanced query all function for both contexts
379
+ const queryElementAll = (rootElement, selector) => {
380
+ if (!selector.includes('>>') && !selector.includes(':>>')) {
381
+ return rootElement.querySelectorAll(selector);
382
+ }
383
+ const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
384
+ let currentElements = [rootElement];
385
+ for (const part of parts) {
386
+ const nextElements = [];
387
+ for (const element of currentElements) {
388
+ // Handle iframe traversal
389
+ if (element.tagName === 'IFRAME') {
390
+ try {
391
+ const iframeDoc = element.contentDocument || element.contentWindow.document;
392
+ nextElements.push(...iframeDoc.querySelectorAll(part));
393
+ }
394
+ catch (e) {
395
+ console.warn('Cannot access iframe content:', e);
396
+ continue;
397
+ }
398
+ }
399
+ else {
400
+ // Regular DOM elements
401
+ if (element.querySelectorAll) {
402
+ nextElements.push(...element.querySelectorAll(part));
403
+ }
404
+ // Shadow DOM elements
405
+ if (element.shadowRoot) {
406
+ nextElements.push(...element.shadowRoot.querySelectorAll(part));
239
407
  }
240
- else if (attribute === 'src') {
241
- // Handle relative 'src' URLs
242
- const src = fieldElement.getAttribute('src');
243
- record[label] = src ? new URL(src, window.location.origin).href : null;
408
+ // Check children's shadow roots
409
+ const children = Array.from(element.children || []);
410
+ for (const child of children) {
411
+ if (child.shadowRoot) {
412
+ nextElements.push(...child.shadowRoot.querySelectorAll(part));
413
+ }
414
+ }
415
+ }
416
+ }
417
+ currentElements = nextElements;
418
+ }
419
+ return currentElements;
420
+ };
421
+ // Enhanced value extraction with context awareness
422
/**
 * Extracts a field value from an element.
 *
 * Order of evaluation:
 *  1. If the element hosts a shadow root with non-blank text, that trimmed
 *     text is returned regardless of `attribute`.
 *     NOTE(review): this also wins over `href`/`src` requests — confirm
 *     that is intended.
 *  2. `innerText`: trimmed `innerText`; elements that expose no `innerText`
 *     fall back to trimmed `textContent` (or `null`) instead of throwing.
 *  3. `textContent` / `innerHTML`: the trimmed property, `null` if absent.
 *  4. `src` / `href`: the attribute resolved against the location of the
 *     element's own document (top window origin as fallback); the raw
 *     attribute value is returned when it cannot be resolved.
 *  5. Anything else: `getAttribute(attribute)`.
 *
 * @param {Element} element Element to read from; a falsy value yields `null`.
 * @param {string} attribute Name of the value to extract.
 * @returns {string|null} Extracted value.
 */
function extractValue(element, attribute) {
    if (!element) {
        return null;
    }
    // Context-aware base URL: elements inside an iframe resolve against the frame.
    const ownerLocation = element.ownerDocument && element.ownerDocument.location;
    const baseURL = (ownerLocation && ownerLocation.href) || window.location.origin;
    const trimOrNull = (text) => (typeof text === 'string' ? text.trim() : null);
    if (element.shadowRoot) {
        const shadowText = trimOrNull(element.shadowRoot.textContent);
        if (shadowText) {
            return shadowText;
        }
    }
    switch (attribute) {
        case 'innerText': {
            // This function is also called on arbitrary child elements; not every
            // element exposes innerText, so do not assume it is a string.
            const visibleText = trimOrNull(element.innerText);
            return visibleText !== null ? visibleText : trimOrNull(element.textContent);
        }
        case 'textContent':
            return trimOrNull(element.textContent);
        case 'innerHTML':
            return trimOrNull(element.innerHTML);
        case 'src':
        case 'href': {
            const raw = element.getAttribute(attribute);
            if (!raw) {
                return null;
            }
            try {
                return new URL(raw, baseURL).href;
            }
            catch (error) {
                // Unparseable value or unusable base (e.g. "about:srcdoc").
                return raw;
            }
        }
        default:
            return element.getAttribute(attribute);
    }
}
447
+ // Enhanced table ancestor finding with context support
448
+ function findTableAncestor(element) {
449
+ let currentElement = element;
450
+ const MAX_DEPTH = 5;
451
+ let depth = 0;
452
+ while (currentElement && depth < MAX_DEPTH) {
453
+ // Handle shadow DOM
454
+ if (currentElement.getRootNode() instanceof ShadowRoot) {
455
+ currentElement = currentElement.getRootNode().host;
456
+ continue;
457
+ }
458
+ if (currentElement.tagName === 'TD') {
459
+ return { type: 'TD', element: currentElement };
460
+ }
461
+ else if (currentElement.tagName === 'TR') {
462
+ return { type: 'TR', element: currentElement };
463
+ }
464
+ // Handle iframe crossing
465
+ if (currentElement.tagName === 'IFRAME') {
466
+ try {
467
+ currentElement = currentElement.contentDocument.body;
468
+ }
469
+ catch (e) {
470
+ return null;
471
+ }
472
+ }
473
+ else {
474
+ currentElement = currentElement.parentElement;
475
+ }
476
+ depth++;
477
+ }
478
+ return null;
479
+ }
480
+ // Helper function to get cell index
481
+ function getCellIndex(td) {
482
+ if (td.getRootNode() instanceof ShadowRoot) {
483
+ const shadowRoot = td.getRootNode();
484
+ const allCells = Array.from(shadowRoot.querySelectorAll('td'));
485
+ return allCells.indexOf(td);
486
+ }
487
+ let index = 0;
488
+ let sibling = td;
489
+ while (sibling = sibling.previousElementSibling) {
490
+ index++;
491
+ }
492
+ return index;
493
+ }
494
+ // Helper function to check for TH elements
495
+ function hasThElement(row, tableFields) {
496
+ for (const [_, { selector }] of Object.entries(tableFields)) {
497
+ const element = queryElement(row, selector);
498
+ if (element) {
499
+ let current = element;
500
+ while (current && current !== row) {
501
+ if (current.getRootNode() instanceof ShadowRoot) {
502
+ current = current.getRootNode().host;
503
+ continue;
244
504
  }
245
- else if (attribute === 'href') {
246
- // Handle relative 'href' URLs
247
- const href = fieldElement.getAttribute('href');
248
- record[label] = href ? new URL(href, window.location.origin).href : null;
505
+ if (current.tagName === 'TH')
506
+ return true;
507
+ if (current.tagName === 'IFRAME') {
508
+ try {
509
+ current = current.contentDocument.body;
510
+ }
511
+ catch (e) {
512
+ break;
513
+ }
249
514
  }
250
515
  else {
251
- record[label] = fieldElement.getAttribute(attribute);
516
+ current = current.parentElement;
517
+ }
518
+ }
519
+ }
520
+ }
521
+ return false;
522
+ }
523
+ // Helper function to filter rows
524
+ function filterRowsBasedOnTag(rows, tableFields) {
525
+ for (const row of rows) {
526
+ if (hasThElement(row, tableFields)) {
527
+ return rows;
528
+ }
529
+ }
530
+ // Include shadow DOM in TH search
531
+ return rows.filter(row => {
532
+ const directTH = row.getElementsByTagName('TH').length === 0;
533
+ const shadowTH = row.shadowRoot ?
534
+ row.shadowRoot.querySelector('th') === null : true;
535
+ return directTH && shadowTH;
536
+ });
537
+ }
538
+ // Class similarity comparison functions
539
/**
 * Jaccard similarity of two class-name lists: the number of distinct names
 * present in both lists divided by the number of distinct names present in
 * either list.
 *
 * @param {Iterable<string>} classList1 First list of class names.
 * @param {Iterable<string>} classList2 Second list of class names.
 * @returns {number} Value in [0, 1]; 0 when both lists are empty (the plain
 *   ratio would be 0/0, i.e. NaN).
 */
function calculateClassSimilarity(classList1, classList2) {
    const first = new Set(classList1);
    const second = new Set(classList2);
    const unionSize = new Set([...first, ...second]).size;
    if (unionSize === 0) {
        return 0;
    }
    let sharedCount = 0;
    for (const className of first) {
        if (second.has(className)) {
            sharedCount++;
        }
    }
    return sharedCount / unionSize;
}
546
+ // Enhanced similar elements finding with context support
547
/**
 * Finds elements that look like `baseElement`: same tag name and a class
 * list whose similarity to the base element's classes (as computed by
 * `calculateClassSimilarity`) reaches `similarityThreshold`.
 *
 * Candidates are gathered from the main document, from the shadow tree that
 * contains `baseElement` (plus that shadow host's own descendants), and from
 * every iframe whose document can be accessed. Each element is considered
 * once even if it is reachable through more than one of these roots.
 *
 * @param {Element} baseElement Reference element.
 * @param {number} [similarityThreshold=0.7] Minimum similarity to qualify.
 * @returns {Array<Element>} Similar elements, never including `baseElement`;
 *   empty when `baseElement` has no classes.
 */
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
    const baseClasses = Array.from(baseElement.classList);
    if (baseClasses.length === 0) {
        return [];
    }
    const tagName = baseElement.tagName;
    // A Set keeps first-seen order while dropping elements found via two roots.
    const candidates = new Set();
    const collect = (nodes) => {
        for (const node of nodes) {
            candidates.add(node);
        }
    };
    // Main document.
    collect(document.getElementsByTagName(tagName));
    // Shadow tree containing the base element. A shadow root has no
    // getElementsByTagName, so it is searched with querySelectorAll.
    const root = baseElement.getRootNode();
    if (root instanceof ShadowRoot) {
        collect(root.querySelectorAll(tagName));
        collect(root.host.getElementsByTagName(tagName));
    }
    // Iframes.
    for (const iframe of document.getElementsByTagName('iframe')) {
        try {
            const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
            collect(iframeDoc.getElementsByTagName(tagName));
        }
        catch (e) {
            console.warn('Cannot access iframe content:', e);
        }
    }
    return [...candidates].filter((element) => {
        if (element === baseElement) {
            return false;
        }
        const similarity = calculateClassSimilarity(baseClasses, Array.from(element.classList));
        return similarity >= similarityThreshold;
    });
}
577
+ // Main scraping logic with context support
578
+ let containers = queryElementAll(document, listSelector);
579
+ containers = Array.from(containers);
580
+ if (containers.length === 0)
581
+ return [];
582
+ if (limit > 1 && containers.length === 1) {
583
+ const baseContainer = containers[0];
584
+ const similarContainers = findSimilarElements(baseContainer);
585
+ if (similarContainers.length > 0) {
586
+ const newContainers = similarContainers.filter(container => !container.matches(listSelector));
587
+ containers = [...containers, ...newContainers];
588
+ }
589
+ }
590
+ const containerFields = containers.map(() => ({
591
+ tableFields: {},
592
+ nonTableFields: {}
593
+ }));
594
+ // Classify fields
595
+ containers.forEach((container, containerIndex) => {
596
+ for (const [label, field] of Object.entries(fields)) {
597
+ const sampleElement = queryElement(container, field.selector);
598
+ if (sampleElement) {
599
+ const ancestor = findTableAncestor(sampleElement);
600
+ if (ancestor) {
601
+ containerFields[containerIndex].tableFields[label] = Object.assign(Object.assign({}, field), { tableContext: ancestor.type, cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 });
602
+ }
603
+ else {
604
+ containerFields[containerIndex].nonTableFields[label] = field;
605
+ }
606
+ }
607
+ else {
608
+ containerFields[containerIndex].nonTableFields[label] = field;
609
+ }
610
+ }
611
+ });
612
+ const tableData = [];
613
+ const nonTableData = [];
614
+ // Process table data with both iframe and shadow DOM support
615
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
616
+ const container = containers[containerIndex];
617
+ const { tableFields } = containerFields[containerIndex];
618
+ if (Object.keys(tableFields).length > 0) {
619
+ const firstField = Object.values(tableFields)[0];
620
+ const firstElement = queryElement(container, firstField.selector);
621
+ let tableContext = firstElement;
622
+ // Find table context including both iframe and shadow DOM
623
+ while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
624
+ if (tableContext.getRootNode() instanceof ShadowRoot) {
625
+ tableContext = tableContext.getRootNode().host;
626
+ continue;
627
+ }
628
+ if (tableContext.tagName === 'IFRAME') {
629
+ try {
630
+ tableContext = tableContext.contentDocument.body;
631
+ }
632
+ catch (e) {
633
+ break;
252
634
  }
253
635
  }
636
+ else {
637
+ tableContext = tableContext.parentElement;
638
+ }
639
+ }
640
+ if (tableContext) {
641
+ // Get rows from all contexts
642
+ const rows = [];
643
+ // Get rows from regular DOM
644
+ rows.push(...tableContext.getElementsByTagName('TR'));
645
+ // Get rows from shadow DOM
646
+ if (tableContext.shadowRoot) {
647
+ rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
648
+ }
649
+ // Get rows from iframes
650
+ if (tableContext.tagName === 'IFRAME') {
651
+ try {
652
+ const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
653
+ rows.push(...iframeDoc.getElementsByTagName('TR'));
654
+ }
655
+ catch (e) {
656
+ console.warn('Cannot access iframe rows:', e);
657
+ }
658
+ }
659
+ const processedRows = filterRowsBasedOnTag(rows, tableFields);
660
+ for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
661
+ const record = {};
662
+ const currentRow = processedRows[rowIndex];
663
+ for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
664
+ let element = null;
665
+ if (cellIndex >= 0) {
666
+ // Get TD element considering both contexts
667
+ let td = currentRow.children[cellIndex];
668
+ // Check shadow DOM for td
669
+ if (!td && currentRow.shadowRoot) {
670
+ const shadowCells = currentRow.shadowRoot.children;
671
+ if (shadowCells && shadowCells.length > cellIndex) {
672
+ td = shadowCells[cellIndex];
673
+ }
674
+ }
675
+ if (td) {
676
+ element = queryElement(td, selector);
677
+ if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
678
+ element = td;
679
+ }
680
+ if (!element) {
681
+ const tagOnlySelector = selector.split('.')[0];
682
+ element = queryElement(td, tagOnlySelector);
683
+ }
684
+ if (!element) {
685
+ let currentElement = td;
686
+ while (currentElement && currentElement.children.length > 0) {
687
+ let foundContentChild = false;
688
+ for (const child of currentElement.children) {
689
+ if (extractValue(child, attribute)) {
690
+ currentElement = child;
691
+ foundContentChild = true;
692
+ break;
693
+ }
694
+ }
695
+ if (!foundContentChild)
696
+ break;
697
+ }
698
+ element = currentElement;
699
+ }
700
+ }
701
+ }
702
+ else {
703
+ element = queryElement(currentRow, selector);
704
+ }
705
+ if (element) {
706
+ record[label] = extractValue(element, attribute);
707
+ }
708
+ }
709
+ if (Object.keys(record).length > 0) {
710
+ tableData.push(record);
711
+ }
712
+ }
713
+ }
714
+ }
715
+ }
716
+ // Process non-table data with both contexts support
717
+ for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
718
+ if (nonTableData.length >= limit)
719
+ break;
720
+ const container = containers[containerIndex];
721
+ const { nonTableFields } = containerFields[containerIndex];
722
+ if (Object.keys(nonTableFields).length > 0) {
723
+ const record = {};
724
+ for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
725
+ // Get the last part of the selector after any context delimiter
726
+ const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
727
+ const element = queryElement(container, relativeSelector);
728
+ if (element) {
729
+ record[label] = extractValue(element, attribute);
730
+ }
731
+ }
732
+ if (Object.keys(record).length > 0) {
733
+ nonTableData.push(record);
254
734
  }
255
- scrapedData.push(record);
256
735
  }
257
736
  }
737
+ // Merge and limit the results
738
+ const scrapedData = [...tableData, ...nonTableData];
258
739
  return scrapedData;
259
740
  });
260
741
  };
@@ -86,6 +86,8 @@ export default class Interpreter extends EventEmitter {
86
86
  private carryOutSteps;
87
87
  private handlePagination;
88
88
  private getMatchingActionId;
89
+ private removeShadowSelectors;
90
+ private removeSpecialSelectors;
89
91
  private runLoop;
90
92
  private ensureScriptsLoaded;
91
93
  /**
@@ -84,14 +84,24 @@ class Interpreter extends events_1.EventEmitter {
84
84
  applyAdBlocker(page) {
85
85
  return __awaiter(this, void 0, void 0, function* () {
86
86
  if (this.blocker) {
87
- yield this.blocker.enableBlockingInPage(page);
87
+ try {
88
+ yield this.blocker.enableBlockingInPage(page);
89
+ }
90
+ catch (err) {
91
+ this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
92
+ }
88
93
  }
89
94
  });
90
95
  }
91
96
  disableAdBlocker(page) {
92
97
  return __awaiter(this, void 0, void 0, function* () {
93
98
  if (this.blocker) {
94
- yield this.blocker.disableBlockingInPage(page);
99
+ try {
100
+ yield this.blocker.disableBlockingInPage(page);
101
+ }
102
+ catch (err) {
103
+ this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
104
+ }
95
105
  }
96
106
  });
97
107
  }
@@ -156,8 +166,8 @@ class Interpreter extends events_1.EventEmitter {
156
166
  // const actionable = async (selector: string): Promise<boolean> => {
157
167
  // try {
158
168
  // const proms = [
159
- // page.isEnabled(selector, { timeout: 5000 }),
160
- // page.isVisible(selector, { timeout: 5000 }),
169
+ // page.isEnabled(selector, { timeout: 10000 }),
170
+ // page.isVisible(selector, { timeout: 10000 }),
161
171
  // ];
162
172
  // return await Promise.all(proms).then((bools) => bools.every((x) => x));
163
173
  // } catch (e) {
@@ -176,6 +186,15 @@ class Interpreter extends events_1.EventEmitter {
176
186
  // return [];
177
187
  // }),
178
188
  // ).then((x) => x.flat());
189
+ const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
190
+ try {
191
+ yield page.waitForSelector(selector, { state: 'attached' });
192
+ return [selector];
193
+ }
194
+ catch (e) {
195
+ return [];
196
+ }
197
+ }))).then((x) => x.flat());
179
198
  const action = workflowCopy[workflowCopy.length - 1];
180
199
  // console.log("Next action:", action)
181
200
  let url = page.url();
@@ -186,7 +205,7 @@ class Interpreter extends events_1.EventEmitter {
186
205
  url,
187
206
  cookies: (yield page.context().cookies([page.url()]))
188
207
  .reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
189
- selectors,
208
+ selectors: presentSelectors,
190
209
  };
191
210
  });
192
211
  }
@@ -420,7 +439,12 @@ class Interpreter extends events_1.EventEmitter {
420
439
  yield executeAction(invokee, methodName, step.args);
421
440
  }
422
441
  catch (error) {
423
- yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
442
+ try {
443
+ yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
444
+ }
445
+ catch (error) {
446
+ continue;
447
+ }
424
448
  }
425
449
  }
426
450
  else {
@@ -544,12 +568,39 @@ class Interpreter extends events_1.EventEmitter {
544
568
  }
545
569
  }
546
570
  }
571
+ removeShadowSelectors(workflow) {
572
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
573
+ const step = workflow[actionId];
574
+ // Check if step has where and selectors
575
+ if (step.where && Array.isArray(step.where.selectors)) {
576
+ // Filter out selectors that contain ">>"
577
+ step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
578
+ }
579
+ }
580
+ return workflow;
581
+ }
582
+ removeSpecialSelectors(workflow) {
583
+ for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
584
+ const step = workflow[actionId];
585
+ if (step.where && Array.isArray(step.where.selectors)) {
586
+ // Filter out if selector has EITHER ":>>" OR ">>"
587
+ step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
588
+ }
589
+ }
590
+ return workflow;
591
+ }
547
592
  runLoop(p, workflow) {
548
593
  var _a, _b;
549
594
  return __awaiter(this, void 0, void 0, function* () {
550
- const workflowCopy = JSON.parse(JSON.stringify(workflow));
595
+ let workflowCopy = JSON.parse(JSON.stringify(workflow));
596
+ workflowCopy = this.removeSpecialSelectors(workflowCopy);
551
597
  // apply ad-blocker to the current page
552
- yield this.applyAdBlocker(p);
598
+ try {
599
+ yield this.applyAdBlocker(p);
600
+ }
601
+ catch (error) {
602
+ this.log(`Failed to apply ad-blocker: ${error.message}`, logger_1.Level.ERROR);
603
+ }
553
604
  const usedActions = [];
554
605
  let selectors = [];
555
606
  let lastAction = null;
@@ -660,6 +711,7 @@ class Interpreter extends events_1.EventEmitter {
660
711
  return __awaiter(this, void 0, void 0, function* () {
661
712
  this.log('Starting the workflow.', logger_1.Level.LOG);
662
713
  const context = page.context();
714
+ page.setDefaultNavigationTimeout(100000);
663
715
  // Check proxy settings from context options
664
716
  const contextOptions = context._options;
665
717
  const hasProxy = !!(contextOptions === null || contextOptions === void 0 ? void 0 : contextOptions.proxy);
@@ -3,43 +3,43 @@
3
3
  */
4
4
  export default class Concurrency {
5
5
  /**
6
- * Maximum number of workers running in parallel. If set to `null`, there is no limit.
7
- */
6
+ * Maximum number of workers running in parallel. If set to `null`, there is no limit.
7
+ */
8
8
  maxConcurrency: number;
9
9
  /**
10
- * Number of currently active workers.
11
- */
10
+ * Number of currently active workers.
11
+ */
12
12
  activeWorkers: number;
13
13
  /**
14
- * Queue of jobs waiting to be completed.
15
- */
14
+ * Queue of jobs waiting to be completed.
15
+ */
16
16
  private jobQueue;
17
17
  /**
18
- * "Resolve" callbacks of the waitForCompletion() promises.
19
- */
18
+ * "Resolve" callbacks of the waitForCompletion() promises.
19
+ */
20
20
  private waiting;
21
21
  /**
22
- * Constructs a new instance of concurrency manager.
23
- * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
- */
22
+ * Constructs a new instance of concurrency manager.
23
+ * @param {number} maxConcurrency Maximum number of workers running in parallel.
24
+ */
25
25
  constructor(maxConcurrency: number);
26
26
  /**
27
- * Takes a waiting job out of the queue and runs it.
28
- */
27
+ * Takes a waiting job out of the queue and runs it.
28
+ */
29
29
  private runNextJob;
30
30
  /**
31
- * Pass a job (a time-demanding async function) to the concurrency manager. \
32
- * The time of the job's execution depends on the concurrency manager itself
33
- * (given a generous enough `maxConcurrency` value, it might be immediate,
34
- * but this is not guaranteed).
35
- * @param worker Async function to be executed (job to be processed).
36
- */
31
+ * Pass a job (a time-demanding async function) to the concurrency manager. \
32
+ * The time of the job's execution depends on the concurrency manager itself
33
+ * (given a generous enough `maxConcurrency` value, it might be immediate,
34
+ * but this is not guaranteed).
35
+ * @param worker Async function to be executed (job to be processed).
36
+ */
37
37
  addJob(job: () => Promise<any>): void;
38
38
  /**
39
- * Waits until there is no running nor waiting job. \
40
- * If the concurrency manager is idle at the time of calling this function,
41
- * it waits until at least one job is completed (can be "presubscribed").
42
- * @returns Promise, resolved after there is no running/waiting worker.
43
- */
39
+ * Waits until there is no running nor waiting job. \
40
+ * If the concurrency manager is idle at the time of calling this function,
41
+ * it waits until at least one job is completed (can be "presubscribed").
42
+ * @returns Promise, resolved after there is no running/waiting worker.
43
+ */
44
44
  waitForCompletion(): Promise<void>;
45
45
  }
@@ -5,31 +5,31 @@ Object.defineProperty(exports, "__esModule", { value: true });
5
5
  */
6
6
  class Concurrency {
7
7
  /**
8
- * Constructs a new instance of concurrency manager.
9
- * @param {number} maxConcurrency Maximum number of workers running in parallel.
10
- */
8
+ * Constructs a new instance of concurrency manager.
9
+ * @param {number} maxConcurrency Maximum number of workers running in parallel.
10
+ */
11
11
  constructor(maxConcurrency) {
12
12
  /**
13
- * Maximum number of workers running in parallel. If set to `null`, there is no limit.
14
- */
13
+ * Maximum number of workers running in parallel. If set to `null`, there is no limit.
14
+ */
15
15
  this.maxConcurrency = 1;
16
16
  /**
17
- * Number of currently active workers.
18
- */
17
+ * Number of currently active workers.
18
+ */
19
19
  this.activeWorkers = 0;
20
20
  /**
21
- * Queue of jobs waiting to be completed.
22
- */
21
+ * Queue of jobs waiting to be completed.
22
+ */
23
23
  this.jobQueue = [];
24
24
  /**
25
- * "Resolve" callbacks of the waitForCompletion() promises.
26
- */
25
+ * "Resolve" callbacks of the waitForCompletion() promises.
26
+ */
27
27
  this.waiting = [];
28
28
  this.maxConcurrency = maxConcurrency;
29
29
  }
30
30
  /**
31
- * Takes a waiting job out of the queue and runs it.
32
- */
31
+ * Takes a waiting job out of the queue and runs it.
32
+ */
33
33
  runNextJob() {
34
34
  const job = this.jobQueue.pop();
35
35
  if (job) {
@@ -49,12 +49,12 @@ class Concurrency {
49
49
  }
50
50
  }
51
51
  /**
52
- * Pass a job (a time-demanding async function) to the concurrency manager. \
53
- * The time of the job's execution depends on the concurrency manager itself
54
- * (given a generous enough `maxConcurrency` value, it might be immediate,
55
- * but this is not guaranteed).
56
- * @param worker Async function to be executed (job to be processed).
57
- */
52
+ * Pass a job (a time-demanding async function) to the concurrency manager. \
53
+ * The time of the job's execution depends on the concurrency manager itself
54
+ * (given a generous enough `maxConcurrency` value, it might be immediate,
55
+ * but this is not guaranteed).
56
+ * @param worker Async function to be executed (job to be processed).
57
+ */
58
58
  addJob(job) {
59
59
  // console.debug("Adding a worker!");
60
60
  this.jobQueue.push(job);
@@ -67,11 +67,11 @@ class Concurrency {
67
67
  }
68
68
  }
69
69
  /**
70
- * Waits until there is no running nor waiting job. \
71
- * If the concurrency manager is idle at the time of calling this function,
72
- * it waits until at least one job is completed (can be "presubscribed").
73
- * @returns Promise, resolved after there is no running/waiting worker.
74
- */
70
+ * Waits until there is no running nor waiting job. \
71
+ * If the concurrency manager is idle at the time of calling this function,
72
+ * it waits until at least one job is completed (can be "presubscribed").
73
+ * @returns Promise, resolved after there is no running/waiting worker.
74
+ */
75
75
  waitForCompletion() {
76
76
  return new Promise((res) => {
77
77
  this.waiting.push(res);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "maxun-core",
3
- "version": "0.0.6",
3
+ "version": "0.0.8",
4
4
  "description": "Core package for Maxun, responsible for data extraction",
5
5
  "main": "build/index.js",
6
6
  "typings": "build/index.d.ts",