mx-cloud 0.0.27 → 0.0.30

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -54,6 +54,7 @@ const concurrency_1 = __importDefault(require("./utils/concurrency"));
  const preprocessor_1 = __importDefault(require("./preprocessor"));
  const logger_1 = __importStar(require("./utils/logger"));
  const selector_1 = require("./selector");
+ const markdown_1 = require("./utils/markdown");
  /**
  * Class for running the Smart Workflows.
  */
@@ -678,10 +679,8 @@ class Interpreter extends events_1.EventEmitter {
  }
  this.log('Starting crawl operation', logger_1.Level.LOG);
  try {
- // Get current page URL and log it
  const currentUrl = page.url();
  this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
- // If page is on about:blank or empty, we need to wait for navigation
  if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
  this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
  yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
@@ -690,13 +689,306 @@ class Interpreter extends events_1.EventEmitter {
  this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
  const parsedBase = new URL(baseUrl);
  const baseDomain = parsedBase.hostname;
- let discoveredUrls = [];
- // Step 1: Sitemap discovery using XMLHttpRequest to avoid polyfills
+ let robotRules = {
+ disallowedPaths: [],
+ allowedPaths: [],
+ crawlDelay: null
+ };
+ if (crawlConfig.respectRobots) {
+ this.log('Fetching robots.txt...', logger_1.Level.LOG);
+ try {
+ const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;
+ const robotsContent = yield page.evaluate((url) => {
+ return new Promise((resolve) => {
+ const xhr = new XMLHttpRequest();
+ xhr.open('GET', url, true);
+ xhr.onload = function () {
+ if (xhr.status === 200) {
+ resolve(xhr.responseText);
+ }
+ else {
+ resolve('');
+ }
+ };
+ xhr.onerror = function () {
+ resolve('');
+ };
+ xhr.send();
+ });
+ }, robotsUrl);
+ if (robotsContent) {
+ const lines = robotsContent.split('\n');
+ let isRelevantUserAgent = false;
+ let foundSpecificUserAgent = false;
+ for (const line of lines) {
+ const trimmedLine = line.trim().toLowerCase();
+ if (trimmedLine.startsWith('#') || trimmedLine === '') {
+ continue;
+ }
+ const colonIndex = line.indexOf(':');
+ if (colonIndex === -1)
+ continue;
+ const directive = line.substring(0, colonIndex).trim().toLowerCase();
+ const value = line.substring(colonIndex + 1).trim();
+ if (directive === 'user-agent') {
+ const agent = value.toLowerCase();
+ if (agent === '*' && !foundSpecificUserAgent) {
+ isRelevantUserAgent = true;
+ }
+ else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
+ isRelevantUserAgent = true;
+ foundSpecificUserAgent = true;
+ }
+ else {
+ if (!foundSpecificUserAgent) {
+ isRelevantUserAgent = false;
+ }
+ }
+ }
+ else if (isRelevantUserAgent) {
+ if (directive === 'disallow' && value) {
+ robotRules.disallowedPaths.push(value);
+ }
+ else if (directive === 'allow' && value) {
+ robotRules.allowedPaths.push(value);
+ }
+ else if (directive === 'crawl-delay' && value) {
+ const delay = parseFloat(value);
+ if (!isNaN(delay) && delay > 0) {
+ robotRules.crawlDelay = delay * 1000;
+ }
+ }
+ }
+ }
+ this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, logger_1.Level.LOG);
+ }
+ else {
+ this.log('No robots.txt found or not accessible, proceeding without restrictions', logger_1.Level.WARN);
+ }
+ }
+ catch (error) {
+ this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, logger_1.Level.WARN);
+ }
+ }
+ const isUrlAllowedByRobots = (url) => {
+ if (!crawlConfig.respectRobots)
+ return true;
+ try {
+ const urlObj = new URL(url);
+ const pathname = urlObj.pathname;
+ for (const allowedPath of robotRules.allowedPaths) {
+ if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
+ return true;
+ }
+ if (allowedPath.includes('*')) {
+ const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+ if (regex.test(pathname)) {
+ return true;
+ }
+ }
+ }
+ for (const disallowedPath of robotRules.disallowedPaths) {
+ if (disallowedPath === '/') {
+ return false;
+ }
+ if (pathname.startsWith(disallowedPath)) {
+ return false;
+ }
+ if (disallowedPath.includes('*')) {
+ const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+ if (regex.test(pathname)) {
+ return false;
+ }
+ }
+ if (disallowedPath.endsWith('$')) {
+ const pattern = disallowedPath.slice(0, -1);
+ if (pathname === pattern || pathname.endsWith(pattern)) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ catch (error) {
+ return true;
+ }
+ };
+ const isUrlAllowedByConfig = (url) => {
+ try {
+ const urlObj = new URL(url);
+ if (crawlConfig.mode === 'domain') {
+ if (urlObj.hostname !== baseDomain)
+ return false;
+ }
+ else if (crawlConfig.mode === 'subdomain') {
+ if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
+ return false;
+ }
+ else if (crawlConfig.mode === 'path') {
+ if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
+ return false;
+ }
+ if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
+ const matches = crawlConfig.includePaths.some(pattern => {
+ try {
+ const regex = new RegExp(pattern);
+ return regex.test(url);
+ }
+ catch (_a) {
+ return url.includes(pattern);
+ }
+ });
+ if (!matches)
+ return false;
+ }
+ if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
+ const matches = crawlConfig.excludePaths.some(pattern => {
+ try {
+ const regex = new RegExp(pattern);
+ return regex.test(url);
+ }
+ catch (_a) {
+ return url.includes(pattern);
+ }
+ });
+ if (matches)
+ return false;
+ }
+ return true;
+ }
+ catch (error) {
+ return false;
+ }
+ };
+ const normalizeUrl = (url) => {
+ return url.replace(/#.*$/, '').replace(/\/$/, '');
+ };
+ const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
+ try {
+ yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
+ yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
+ yield new Promise(resolve => setTimeout(resolve, 1000));
+ const pageLinks = yield page.evaluate(() => {
+ const links = [];
+ const allAnchors = document.querySelectorAll('a');
+ for (let i = 0; i < allAnchors.length; i++) {
+ const anchor = allAnchors[i];
+ const fullHref = anchor.href;
+ if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
+ links.push(fullHref);
+ }
+ }
+ return links;
+ });
+ return pageLinks;
+ }
+ catch (error) {
+ this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+ return [];
+ }
+ });
+ const scrapePageContent = (url) => __awaiter(this, void 0, void 0, function* () {
+ const pageData = yield page.evaluate(() => {
+ var _a, _b;
+ const getMeta = (name) => {
+ const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+ return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
+ };
+ const getAllMeta = () => {
+ const metadata = {};
+ const metaTags = document.querySelectorAll('meta');
+ metaTags.forEach(tag => {
+ const name = tag.getAttribute('name') || tag.getAttribute('property');
+ const content = tag.getAttribute('content');
+ if (name && content) {
+ metadata[name] = content;
+ }
+ });
+ return metadata;
+ };
+ const title = document.title || '';
+ const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
+ const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
+ elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
+ const html = document.documentElement.outerHTML;
+ const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
+ const allMetadata = getAllMeta();
+ return {
+ title,
+ description: getMeta('description'),
+ text: bodyText,
+ html: html,
+ links: links,
+ wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
+ metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
+ };
+ });
+ const result = {
+ metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
+ scrapedAt: new Date().toISOString()
+ };
+ const formats = crawlConfig.outputFormats || [];
+ if (formats.includes('text')) {
+ result.text = pageData.text;
+ result.wordCount = pageData.wordCount;
+ }
+ if (formats.includes('html')) {
+ result.html = pageData.html;
+ result.links = pageData.links;
+ }
+ if (formats.includes('markdown')) {
+ try {
+ const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, url);
+ result.markdown = markdown;
+ }
+ catch (err) {
+ this.log(`Markdown conversion failed for ${url}: ${err.message}`, logger_1.Level.WARN);
+ result.markdown = '';
+ }
+ }
+ if (formats.includes('screenshot-visible')) {
+ try {
+ const screenshotBuffer = yield page.screenshot({ fullPage: false });
+ const screenshotName = `Crawl - ${crawlResults.length} - Visible`;
+ yield this.options.binaryCallback({
+ name: screenshotName,
+ data: screenshotBuffer,
+ mimeType: 'image/png'
+ }, 'image/png');
+ result.screenshotVisible = screenshotName;
+ }
+ catch (err) {
+ this.log(`Screenshot-visible failed for ${url}: ${err.message}`, logger_1.Level.WARN);
+ }
+ }
+ if (formats.includes('screenshot-fullpage')) {
+ try {
+ const screenshotBuffer = yield page.screenshot({ fullPage: true });
+ const screenshotName = `Crawl - ${crawlResults.length} - Full Page`;
+ yield this.options.binaryCallback({
+ name: screenshotName,
+ data: screenshotBuffer,
+ mimeType: 'image/png'
+ }, 'image/png');
+ result.screenshotFullpage = screenshotName;
+ }
+ catch (err) {
+ this.log(`Screenshot-fullpage failed for ${url}: ${err.message}`, logger_1.Level.WARN);
+ }
+ }
+ return result;
+ });
+ const visitedUrls = new Set();
+ const crawlResults = [];
+ const crawlQueue = [];
+ const normalizedBaseUrl = normalizeUrl(baseUrl);
+ visitedUrls.add(normalizedBaseUrl);
+ crawlQueue.push({ url: baseUrl, depth: 0 });
+ this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, logger_1.Level.LOG);
  if (crawlConfig.useSitemap) {
  this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
  try {
  const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
- // Use XMLHttpRequest instead of fetch to avoid polyfills
  const sitemapUrls = yield page.evaluate((url) => {
  return new Promise((resolve) => {
  const xhr = new XMLHttpRequest();
@@ -721,7 +1013,13 @@ class Interpreter extends events_1.EventEmitter {
  if (sitemapUrls.length > 0) {
  const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
  const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
- discoveredUrls.push(...regularUrls);
+ for (const sitemapPageUrl of regularUrls) {
+ const normalized = normalizeUrl(sitemapPageUrl);
+ if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
+ visitedUrls.add(normalized);
+ crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
+ }
+ }
  this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
  for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
  try {
@@ -747,16 +1045,20 @@ class Interpreter extends events_1.EventEmitter {
  xhr.send();
  });
  }, nestedUrl);
- if (nestedUrls.length > 0) {
- discoveredUrls.push(...nestedUrls);
- this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
+ for (const nestedPageUrl of nestedUrls) {
+ const normalized = normalizeUrl(nestedPageUrl);
+ if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
+ visitedUrls.add(normalized);
+ crawlQueue.push({ url: nestedPageUrl, depth: 1 });
+ }
  }
+ this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
  }
  catch (error) {
  this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
  }
  }
- this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, logger_1.Level.LOG);
+ this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, logger_1.Level.LOG);
  }
  else {
  this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
@@ -766,167 +1068,76 @@ class Interpreter extends events_1.EventEmitter {
  this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
  }
  }
- if (crawlConfig.followLinks) {
- this.log('Extracting links from current page...', logger_1.Level.LOG);
- try {
- yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
- yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
- this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
- });
- yield new Promise(resolve => setTimeout(resolve, 5000));
- const anchorCount = yield page.evaluate(() => {
- return document.querySelectorAll('a').length;
- });
- this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
- const pageLinks = yield page.evaluate(() => {
- const links = [];
- const allAnchors = document.querySelectorAll('a');
- console.log('Total anchors found:', allAnchors.length);
- for (let i = 0; i < allAnchors.length; i++) {
- const anchor = allAnchors[i];
- const href = anchor.getAttribute('href');
- const fullHref = anchor.href;
- if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
- links.push(fullHref);
- }
- }
- console.log('Links extracted:', links.length);
- return links;
- });
- discoveredUrls.push(...pageLinks);
- this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
- }
- catch (error) {
- this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+ let processedCount = 0;
+ while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
+ if (this.isAborted) {
+ this.log('Workflow aborted during crawl', logger_1.Level.WARN);
+ break;
  }
- }
- const filteredUrls = discoveredUrls.filter(url => {
+ const { url, depth } = crawlQueue.shift();
+ processedCount++;
+ this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, logger_1.Level.LOG);
  try {
- const urlObj = new URL(url);
- if (crawlConfig.mode === 'domain') {
- if (urlObj.hostname !== baseDomain)
- return false;
+ if (robotRules.crawlDelay && crawlResults.length > 0) {
+ this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, logger_1.Level.LOG);
+ yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
  }
- else if (crawlConfig.mode === 'subdomain') {
- if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
- return false;
- }
- else if (crawlConfig.mode === 'path') {
- if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
- return false;
- }
- if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
- const matches = crawlConfig.includePaths.some(pattern => {
- const regex = new RegExp(pattern);
- return regex.test(url);
- });
- if (!matches)
- return false;
- }
- if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
- const matches = crawlConfig.excludePaths.some(pattern => {
- const regex = new RegExp(pattern);
- return regex.test(url);
- });
- if (matches)
- return false;
- }
- return true;
- }
- catch (error) {
- return false;
- }
- });
- const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
- return url.replace(/#.*$/, '').replace(/\/$/, '');
- })));
- const basePathname = parsedBase.pathname;
- const prioritizedUrls = uniqueUrls.sort((a, b) => {
- try {
- const aUrl = new URL(a);
- const bUrl = new URL(b);
- const aMatchesBase = aUrl.pathname.startsWith(basePathname);
- const bMatchesBase = bUrl.pathname.startsWith(basePathname);
- if (aMatchesBase && !bMatchesBase)
- return -1;
- if (!aMatchesBase && bMatchesBase)
- return 1;
- return 0;
- }
- catch (error) {
- return 0;
- }
- });
- const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
- this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
- this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
- const crawlResults = [];
- for (let i = 0; i < finalUrls.length; i++) {
- const url = finalUrls[i];
- try {
- this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
  yield page.goto(url, {
- waitUntil: 'domcontentloaded',
- timeout: 30000
- }).catch(() => {
- this.log(`Failed to navigate to ${url}, skipping...`, logger_1.Level.WARN);
- });
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
- const pageData = yield page.evaluate(() => {
- var _a, _b;
- const getMeta = (name) => {
- const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
- return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
- };
- const getAllMeta = () => {
- const metadata = {};
- const metaTags = document.querySelectorAll('meta');
- metaTags.forEach(tag => {
- const name = tag.getAttribute('name') || tag.getAttribute('property');
- const content = tag.getAttribute('content');
- if (name && content) {
- metadata[name] = content;
- }
- });
- return metadata;
- };
- const title = document.title || '';
- const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
- const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
- elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
- const html = document.documentElement.outerHTML;
- const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
- const allMetadata = getAllMeta();
- return {
- title,
- description: getMeta('description'),
- text: bodyText,
- html: html,
- links: links,
- wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
- metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
- };
+ waitUntil: 'load',
+ timeout: 60000
+ }).catch((err) => {
+ throw new Error(`Navigation failed: ${err.message}`);
  });
- crawlResults.push({
- metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
- html: pageData.html,
- text: pageData.text,
- links: pageData.links,
- wordCount: pageData.wordCount,
- scrapedAt: new Date().toISOString()
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
+ const pageResult = yield scrapePageContent(url);
+ pageResult.metadata.depth = depth;
+ crawlResults.push(pageResult);
+ const actionType = "crawl";
+ const actionName = "Crawl Results";
+ if (!this.serializableDataByType[actionType]) {
+ this.serializableDataByType[actionType] = {};
+ }
+ this.serializableDataByType[actionType][actionName] = [...crawlResults];
+ yield this.options.serializableCallback({
+ crawl: this.serializableDataByType.crawl
  });
- this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
+ if (this.isAborted) {
+ this.log(`Run aborted after scraping ${url}, stopping crawl`, logger_1.Level.WARN);
+ break;
+ }
+ this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
+ if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
+ const newLinks = yield extractLinksFromPage();
+ let addedCount = 0;
+ for (const link of newLinks) {
+ const normalized = normalizeUrl(link);
+ if (!visitedUrls.has(normalized) &&
+ isUrlAllowedByConfig(link) &&
+ isUrlAllowedByRobots(link)) {
+ visitedUrls.add(normalized);
+ crawlQueue.push({ url: link, depth: depth + 1 });
+ addedCount++;
+ }
+ }
+ if (addedCount > 0) {
+ this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, logger_1.Level.LOG);
+ }
+ }
  }
  catch (error) {
- this.log(`Failed to scrape ${url}: ${error.message}`, logger_1.Level.WARN);
+ this.log(`Failed to crawl ${url}: ${error.message}`, logger_1.Level.WARN);
  crawlResults.push({
- url: url,
+ metadata: {
+ url: url,
+ sourceURL: url,
+ depth: depth
+ },
  error: error.message,
  scrapedAt: new Date().toISOString()
  });
  }
  }
- this.log(`Successfully scraped ${crawlResults.length} pages`, logger_1.Level.LOG);
+ this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, logger_1.Level.LOG);
  const actionType = "crawl";
  const actionName = "Crawl Results";
  if (!this.serializableDataByType[actionType]) {
@@ -1157,6 +1368,7 @@ class Interpreter extends events_1.EventEmitter {
  filters: searchConfig.filters || {},
  resultsCount: searchResults.length,
  results: searchResults,
+ mode: searchConfig.mode,
  searchedAt: new Date().toISOString()
  };
  this.serializableDataByType[actionType][actionName] = searchData;
@@ -1172,16 +1384,25 @@ class Interpreter extends events_1.EventEmitter {
  this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
  const scrapedResults = [];
  for (let i = 0; i < searchResults.length; i++) {
+ if (this.isAborted) {
+ this.log(`Run aborted, stopping search scraping at result ${i + 1}/${searchResults.length}`, logger_1.Level.WARN);
+ break;
+ }
  const result = searchResults[i];
  try {
  this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
+ let navigationFailed = false;
  yield page.goto(result.url, {
- waitUntil: 'domcontentloaded',
- timeout: 30000
+ waitUntil: 'load',
+ timeout: 60000
  }).catch(() => {
  this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
+ navigationFailed = true;
  });
- yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
+ if (navigationFailed) {
+ continue;
+ }
+ yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
  const pageData = yield page.evaluate(() => {
  var _a, _b;
  const getMeta = (name) => {
@@ -1217,7 +1438,7 @@ class Interpreter extends events_1.EventEmitter {
  metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
  };
  });
- scrapedResults.push({
+ const scrapedResult = {
  searchResult: {
  query: searchConfig.query,
  position: result.position,
@@ -1225,12 +1446,79 @@ class Interpreter extends events_1.EventEmitter {
  searchDescription: result.description,
  },
  metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
- html: pageData.html,
- text: pageData.text,
- links: pageData.links,
- wordCount: pageData.wordCount,
  scrapedAt: new Date().toISOString()
+ };
+ const formats = searchConfig.outputFormats || [];
+ if (formats.includes('text')) {
+ scrapedResult.text = pageData.text;
+ scrapedResult.wordCount = pageData.wordCount;
+ }
+ if (formats.includes('html')) {
+ scrapedResult.html = pageData.html;
+ scrapedResult.links = pageData.links;
+ }
+ if (formats.includes('markdown')) {
+ try {
+ const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, result.url);
+ scrapedResult.markdown = markdown;
+ }
+ catch (err) {
+ this.log(`Markdown conversion failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
+ scrapedResult.markdown = '';
+ }
+ }
+ if (formats.includes('screenshot-visible')) {
+ try {
+ const screenshotBuffer = yield page.screenshot({ fullPage: false });
+ const screenshotName = `Search - ${i} - Visible`;
+ yield this.options.binaryCallback({
+ name: screenshotName,
+ data: screenshotBuffer,
+ mimeType: 'image/png'
+ }, 'image/png');
+ scrapedResult.screenshotVisible = screenshotName;
+ }
+ catch (err) {
+ this.log(`Screenshot-visible failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
+ }
+ }
+ if (formats.includes('screenshot-fullpage')) {
+ try {
+ const screenshotBuffer = yield page.screenshot({ fullPage: true });
+ const screenshotName = `Search - ${i} - Full Page`;
+ yield this.options.binaryCallback({
+ name: screenshotName,
+ data: screenshotBuffer,
+ mimeType: 'image/png'
+ }, 'image/png');
+ scrapedResult.screenshotFullpage = screenshotName;
+ }
+ catch (err) {
+ this.log(`Screenshot-fullpage failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
+ }
+ }
+ scrapedResults.push(scrapedResult);
+ const actionType = "search";
+ const actionName = "Search Results";
+ if (!this.serializableDataByType[actionType]) {
+ this.serializableDataByType[actionType] = {};
+ }
+ this.serializableDataByType[actionType][actionName] = {
+ query: searchConfig.query,
+ provider: searchConfig.provider,
+ filters: searchConfig.filters || {},
+ resultsCount: scrapedResults.length,
+ results: scrapedResults,
+ mode: searchConfig.mode,
+ searchedAt: new Date().toISOString()
+ };
+ yield this.options.serializableCallback({
+ search: this.serializableDataByType.search
  });
+ if (this.isAborted) {
+ this.log(`Run aborted after scraping ${result.url}, stopping search`, logger_1.Level.WARN);
+ break;
+ }
  this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
  }
  catch (error) {
@@ -0,0 +1 @@
+ export declare function parseMarkdown(html: string | null | undefined, baseUrl?: string | null): Promise<string>;
@@ -0,0 +1,153 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.parseMarkdown = parseMarkdown;
+ function parseMarkdown(html, baseUrl) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const TurndownService = require("turndown");
+ const { gfm } = require("joplin-turndown-plugin-gfm");
+ const cheerio = require("cheerio");
+ const { URL } = require("url");
+ if (!html)
+ return "";
+ const tidiedHtml = tidyHtml(html);
+ const t = new TurndownService({
+ headingStyle: "atx", // ensures #### instead of ------
+ codeBlockStyle: "fenced",
+ });
+ // ---------------------------------------------
+ // Proper ATX headings #### instead of underline-style
+ // ---------------------------------------------
+ t.addRule("forceAtxHeadings", {
+ filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
+ replacement: (content, node) => {
+ const level = Number(node.nodeName.charAt(1));
+ const clean = content.trim();
+ return `\n${"#".repeat(level)} ${clean}\n`;
+ },
+ });
+ // ---------------------------------------------
+ // Remove SVGs
+ // ---------------------------------------------
+ t.addRule("truncate-svg", {
+ filter: "svg",
+ replacement: () => "",
+ });
+ // ---------------------------------------------
+ // Improved paragraph cleanup
+ // ---------------------------------------------
+ t.addRule("improved-paragraph", {
+ filter: "p",
+ replacement: (innerText) => {
+ const trimmed = innerText.trim();
+ if (!trimmed)
+ return "";
+ return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
+ },
+ });
+ // ---------------------------------------------
+ // Inline link with fallback text
+ // ---------------------------------------------
+ t.addRule("inlineLink", {
+ filter: (node, opts) => node.nodeName === "A" && node.getAttribute("href"),
+ replacement: (content, node) => {
+ var _a, _b;
+ let text = content.trim();
+ // Fallback: aria-label → title → domain
+ if (!text) {
+ text =
+ ((_a = node.getAttribute("aria-label")) === null || _a === void 0 ? void 0 : _a.trim()) ||
+ ((_b = node.getAttribute("title")) === null || _b === void 0 ? void 0 : _b.trim()) ||
+ getDomainFromUrl(node.getAttribute("href")) ||
+ "link";
+ }
+ let href = node.getAttribute("href").trim();
+ // relative → absolute
+ if (baseUrl && isRelativeUrl(href)) {
+ try {
+ const u = new URL(href, baseUrl);
+ href = u.toString();
+ }
+ catch (_c) { }
+ }
+ href = cleanUrl(href);
+ return `[${text}](${href})`;
+ },
+ });
+ t.use(gfm);
+ // Convert HTML → Markdown
+ try {
+ let out = yield t.turndown(tidiedHtml);
+ out = fixBrokenLinks(out);
+ out = stripSkipLinks(out);
+ return out.trim();
+ }
+ catch (err) {
+ console.error("HTML→Markdown failed", { err });
+ return "";
+ }
+ });
+ }
+ // -----------------------------------------------------
+ // Helpers
+ // -----------------------------------------------------
+ function isRelativeUrl(url) {
+ return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
+ }
+ function getDomainFromUrl(url) {
+ try {
+ const u = new URL(url);
+ return u.hostname.replace("www.", "");
+ }
+ catch (_a) {
+ return null;
+ }
+ }
+ function cleanUrl(u) {
+ return u;
+ }
+ function cleanAttribute(attr) {
+ return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
+ }
+ function tidyHtml(html) {
+ const cheerio = require("cheerio");
+ const $ = cheerio.load(html);
+ const manuallyCleanedElements = [
+ "script",
+ "style",
+ "iframe",
+ "noscript",
+ "meta",
+ "link",
+ "object",
+ "embed",
+ "canvas",
+ "audio",
+ "video",
+ ];
+ manuallyCleanedElements.forEach((tag) => $(tag).remove());
+ return $("body").html();
+ }
+ function fixBrokenLinks(md) {
+ let depth = 0;
+ let result = "";
+ for (const ch of md) {
+ if (ch === "[")
+ depth++;
+ if (ch === "]")
+ depth = Math.max(0, depth - 1);
+ result += depth > 0 && ch === "\n" ? "\\\n" : ch;
+ }
+ return result;
+ }
+ function stripSkipLinks(md) {
+ return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
+ }
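
The new module boils down to one exported helper, parseMarkdown(html, baseUrl), whose signature is the one-line .d.ts hunk above. For orientation, a minimal usage sketch follows; the require path, sample HTML, and expected output are illustrative assumptions, while the signature, the ATX-heading rule, and the relative-link resolution come directly from the code above.

// Illustrative sketch only - mirrors the interpreter's own require("./utils/markdown").
const { parseMarkdown } = require("./utils/markdown");

(async () => {
    const html = '<h2>Docs</h2><p>See the <a href="/guide">guide</a>.</p>';
    // The second argument is used to absolutize relative hrefs via new URL(href, baseUrl).
    const md = await parseMarkdown(html, "https://example.com");
    console.log(md);
    // Expected shape per the rules above: an ATX heading ("## Docs") and an
    // absolute link such as [guide](https://example.com/guide).
})();
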
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "mx-cloud",
- "version": "0.0.27",
+ "version": "0.0.30",
  "description": "mx cloud",
  "main": "build/index.js",
  "typings": "build/index.d.ts",
@@ -17,9 +17,16 @@
  "license": "AGPL-3.0-or-later",
  "dependencies": {
  "@cliqz/adblocker-playwright": "^1.31.3",
+ "cheerio": "^1.1.2",
  "cross-fetch": "^4.0.0",
  "joi": "^17.6.0",
+ "joplin-turndown-plugin-gfm": "^1.0.12",
  "nodemailer": "^6.10.0",
- "playwright-core": "^1.57.0"
+ "playwright-core": "^1.57.0",
+ "rimraf": "^6.1.2",
+ "turndown": "^7.2.0"
+ },
+ "devDependencies": {
+ "@types/turndown": "^5.0.6"
  }
  }