mx-cloud 0.0.27 → 0.0.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/build/interpret.js +306 -155
  2. package/package.json +1 -1
@@ -678,10 +678,8 @@ class Interpreter extends events_1.EventEmitter {
  }
  this.log('Starting crawl operation', logger_1.Level.LOG);
  try {
- // Get current page URL and log it
  const currentUrl = page.url();
  this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
- // If page is on about:blank or empty, we need to wait for navigation
  if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
  this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
  yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
@@ -690,13 +688,260 @@ class Interpreter extends events_1.EventEmitter {
  this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
  const parsedBase = new URL(baseUrl);
  const baseDomain = parsedBase.hostname;
- let discoveredUrls = [];
- // Step 1: Sitemap discovery using XMLHttpRequest to avoid polyfills
+ let robotRules = {
+ disallowedPaths: [],
+ allowedPaths: [],
+ crawlDelay: null
+ };
+ if (crawlConfig.respectRobots) {
+ this.log('Fetching robots.txt...', logger_1.Level.LOG);
+ try {
+ const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;
+ const robotsContent = yield page.evaluate((url) => {
+ return new Promise((resolve) => {
+ const xhr = new XMLHttpRequest();
+ xhr.open('GET', url, true);
+ xhr.onload = function () {
+ if (xhr.status === 200) {
+ resolve(xhr.responseText);
+ }
+ else {
+ resolve('');
+ }
+ };
+ xhr.onerror = function () {
+ resolve('');
+ };
+ xhr.send();
+ });
+ }, robotsUrl);
+ if (robotsContent) {
+ const lines = robotsContent.split('\n');
+ let isRelevantUserAgent = false;
+ let foundSpecificUserAgent = false;
+ for (const line of lines) {
+ const trimmedLine = line.trim().toLowerCase();
+ if (trimmedLine.startsWith('#') || trimmedLine === '') {
+ continue;
+ }
+ const colonIndex = line.indexOf(':');
+ if (colonIndex === -1)
+ continue;
+ const directive = line.substring(0, colonIndex).trim().toLowerCase();
+ const value = line.substring(colonIndex + 1).trim();
+ if (directive === 'user-agent') {
+ const agent = value.toLowerCase();
+ if (agent === '*' && !foundSpecificUserAgent) {
+ isRelevantUserAgent = true;
+ }
+ else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
+ isRelevantUserAgent = true;
+ foundSpecificUserAgent = true;
+ }
+ else {
+ if (!foundSpecificUserAgent) {
+ isRelevantUserAgent = false;
+ }
+ }
+ }
+ else if (isRelevantUserAgent) {
+ if (directive === 'disallow' && value) {
+ robotRules.disallowedPaths.push(value);
+ }
+ else if (directive === 'allow' && value) {
+ robotRules.allowedPaths.push(value);
+ }
+ else if (directive === 'crawl-delay' && value) {
+ const delay = parseFloat(value);
+ if (!isNaN(delay) && delay > 0) {
+ robotRules.crawlDelay = delay * 1000;
+ }
+ }
+ }
+ }
+ this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, logger_1.Level.LOG);
+ }
+ else {
+ this.log('No robots.txt found or not accessible, proceeding without restrictions', logger_1.Level.WARN);
+ }
+ }
+ catch (error) {
+ this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, logger_1.Level.WARN);
+ }
+ }
+ const isUrlAllowedByRobots = (url) => {
+ if (!crawlConfig.respectRobots)
+ return true;
+ try {
+ const urlObj = new URL(url);
+ const pathname = urlObj.pathname;
+ for (const allowedPath of robotRules.allowedPaths) {
+ if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
+ return true;
+ }
+ if (allowedPath.includes('*')) {
+ const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+ if (regex.test(pathname)) {
+ return true;
+ }
+ }
+ }
+ for (const disallowedPath of robotRules.disallowedPaths) {
+ if (disallowedPath === '/') {
+ return false;
+ }
+ if (pathname.startsWith(disallowedPath)) {
+ return false;
+ }
+ if (disallowedPath.includes('*')) {
+ const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+ if (regex.test(pathname)) {
+ return false;
+ }
+ }
+ if (disallowedPath.endsWith('$')) {
+ const pattern = disallowedPath.slice(0, -1);
+ if (pathname === pattern || pathname.endsWith(pattern)) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ catch (error) {
+ return true;
+ }
+ };
+ const isUrlAllowedByConfig = (url) => {
+ try {
+ const urlObj = new URL(url);
+ if (crawlConfig.mode === 'domain') {
+ if (urlObj.hostname !== baseDomain)
+ return false;
+ }
+ else if (crawlConfig.mode === 'subdomain') {
+ if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
+ return false;
+ }
+ else if (crawlConfig.mode === 'path') {
+ if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
+ return false;
+ }
+ if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
+ const matches = crawlConfig.includePaths.some(pattern => {
+ try {
+ const regex = new RegExp(pattern);
+ return regex.test(url);
+ }
+ catch (_a) {
+ return url.includes(pattern);
+ }
+ });
+ if (!matches)
+ return false;
+ }
+ if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
+ const matches = crawlConfig.excludePaths.some(pattern => {
+ try {
+ const regex = new RegExp(pattern);
+ return regex.test(url);
+ }
+ catch (_a) {
+ return url.includes(pattern);
+ }
+ });
+ if (matches)
+ return false;
+ }
+ return true;
+ }
+ catch (error) {
+ return false;
+ }
+ };
+ const normalizeUrl = (url) => {
+ return url.replace(/#.*$/, '').replace(/\/$/, '');
+ };
+ const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
+ try {
+ yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
+ yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
+ yield new Promise(resolve => setTimeout(resolve, 1000));
+ const pageLinks = yield page.evaluate(() => {
+ const links = [];
+ const allAnchors = document.querySelectorAll('a');
+ for (let i = 0; i < allAnchors.length; i++) {
+ const anchor = allAnchors[i];
+ const fullHref = anchor.href;
+ if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
+ links.push(fullHref);
+ }
+ }
+ return links;
+ });
+ return pageLinks;
+ }
+ catch (error) {
+ this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+ return [];
+ }
+ });
+ const scrapePageContent = (url) => __awaiter(this, void 0, void 0, function* () {
+ const pageData = yield page.evaluate(() => {
+ var _a, _b;
+ const getMeta = (name) => {
+ const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+ return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
+ };
+ const getAllMeta = () => {
+ const metadata = {};
+ const metaTags = document.querySelectorAll('meta');
+ metaTags.forEach(tag => {
+ const name = tag.getAttribute('name') || tag.getAttribute('property');
+ const content = tag.getAttribute('content');
+ if (name && content) {
+ metadata[name] = content;
+ }
+ });
+ return metadata;
+ };
+ const title = document.title || '';
+ const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
+ const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
+ elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
+ const html = document.documentElement.outerHTML;
+ const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
+ const allMetadata = getAllMeta();
+ return {
+ title,
+ description: getMeta('description'),
+ text: bodyText,
+ html: html,
+ links: links,
+ wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
+ metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
+ };
+ });
+ return {
+ metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
+ html: pageData.html,
+ text: pageData.text,
+ links: pageData.links,
+ wordCount: pageData.wordCount,
+ scrapedAt: new Date().toISOString()
+ };
+ });
+ const visitedUrls = new Set();
+ const crawlResults = [];
+ const crawlQueue = [];
+ const normalizedBaseUrl = normalizeUrl(baseUrl);
+ visitedUrls.add(normalizedBaseUrl);
+ crawlQueue.push({ url: baseUrl, depth: 0 });
+ this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, logger_1.Level.LOG);
  if (crawlConfig.useSitemap) {
  this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
  try {
  const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
- // Use XMLHttpRequest instead of fetch to avoid polyfills
  const sitemapUrls = yield page.evaluate((url) => {
  return new Promise((resolve) => {
  const xhr = new XMLHttpRequest();
@@ -721,7 +966,13 @@ class Interpreter extends events_1.EventEmitter {
  if (sitemapUrls.length > 0) {
  const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
  const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
- discoveredUrls.push(...regularUrls);
+ for (const sitemapPageUrl of regularUrls) {
+ const normalized = normalizeUrl(sitemapPageUrl);
+ if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
+ visitedUrls.add(normalized);
+ crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
+ }
+ }
  this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
  for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
  try {
@@ -747,16 +998,20 @@ class Interpreter extends events_1.EventEmitter {
  xhr.send();
  });
  }, nestedUrl);
- if (nestedUrls.length > 0) {
- discoveredUrls.push(...nestedUrls);
- this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
+ for (const nestedPageUrl of nestedUrls) {
+ const normalized = normalizeUrl(nestedPageUrl);
+ if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
+ visitedUrls.add(normalized);
+ crawlQueue.push({ url: nestedPageUrl, depth: 1 });
+ }
  }
+ this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
  }
  catch (error) {
  this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
  }
  }
- this.log(`Total URLs from all sitemaps: ${discoveredUrls.length}`, logger_1.Level.LOG);
+ this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, logger_1.Level.LOG);
  }
  else {
  this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
@@ -766,167 +1021,63 @@ class Interpreter extends events_1.EventEmitter {
  this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
  }
  }
- if (crawlConfig.followLinks) {
- this.log('Extracting links from current page...', logger_1.Level.LOG);
- try {
- yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
- yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
- this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
- });
- yield new Promise(resolve => setTimeout(resolve, 5000));
- const anchorCount = yield page.evaluate(() => {
- return document.querySelectorAll('a').length;
- });
- this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
- const pageLinks = yield page.evaluate(() => {
- const links = [];
- const allAnchors = document.querySelectorAll('a');
- console.log('Total anchors found:', allAnchors.length);
- for (let i = 0; i < allAnchors.length; i++) {
- const anchor = allAnchors[i];
- const href = anchor.getAttribute('href');
- const fullHref = anchor.href;
- if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
- links.push(fullHref);
- }
- }
- console.log('Links extracted:', links.length);
- return links;
- });
- discoveredUrls.push(...pageLinks);
- this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
- }
- catch (error) {
- this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+ let processedCount = 0;
+ while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
+ if (this.isAborted) {
+ this.log('Workflow aborted during crawl', logger_1.Level.WARN);
+ break;
  }
- }
- const filteredUrls = discoveredUrls.filter(url => {
+ const { url, depth } = crawlQueue.shift();
+ processedCount++;
+ this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, logger_1.Level.LOG);
  try {
- const urlObj = new URL(url);
- if (crawlConfig.mode === 'domain') {
- if (urlObj.hostname !== baseDomain)
- return false;
- }
- else if (crawlConfig.mode === 'subdomain') {
- if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
- return false;
- }
- else if (crawlConfig.mode === 'path') {
- if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
- return false;
+ if (robotRules.crawlDelay && crawlResults.length > 0) {
+ this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, logger_1.Level.LOG);
+ yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
  }
- if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
- const matches = crawlConfig.includePaths.some(pattern => {
- const regex = new RegExp(pattern);
- return regex.test(url);
- });
- if (!matches)
- return false;
- }
- if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
- const matches = crawlConfig.excludePaths.some(pattern => {
- const regex = new RegExp(pattern);
- return regex.test(url);
- });
- if (matches)
- return false;
- }
- return true;
- }
- catch (error) {
- return false;
- }
- });
- const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
- return url.replace(/#.*$/, '').replace(/\/$/, '');
- })));
- const basePathname = parsedBase.pathname;
- const prioritizedUrls = uniqueUrls.sort((a, b) => {
- try {
- const aUrl = new URL(a);
- const bUrl = new URL(b);
- const aMatchesBase = aUrl.pathname.startsWith(basePathname);
- const bMatchesBase = bUrl.pathname.startsWith(basePathname);
- if (aMatchesBase && !bMatchesBase)
- return -1;
- if (!aMatchesBase && bMatchesBase)
- return 1;
- return 0;
- }
- catch (error) {
- return 0;
- }
- });
- const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
- this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
- this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
- const crawlResults = [];
- for (let i = 0; i < finalUrls.length; i++) {
- const url = finalUrls[i];
- try {
- this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
  yield page.goto(url, {
  waitUntil: 'domcontentloaded',
  timeout: 30000
- }).catch(() => {
- this.log(`Failed to navigate to ${url}, skipping...`, logger_1.Level.WARN);
+ }).catch((err) => {
+ throw new Error(`Navigation failed: ${err.message}`);
  });
  yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
- const pageData = yield page.evaluate(() => {
- var _a, _b;
- const getMeta = (name) => {
- const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
- return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
- };
- const getAllMeta = () => {
- const metadata = {};
- const metaTags = document.querySelectorAll('meta');
- metaTags.forEach(tag => {
- const name = tag.getAttribute('name') || tag.getAttribute('property');
- const content = tag.getAttribute('content');
- if (name && content) {
- metadata[name] = content;
- }
- });
- return metadata;
- };
- const title = document.title || '';
- const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
- const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
- elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
- const html = document.documentElement.outerHTML;
- const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
- const allMetadata = getAllMeta();
- return {
- title,
- description: getMeta('description'),
- text: bodyText,
- html: html,
- links: links,
- wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
- metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
- };
- });
- crawlResults.push({
- metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
- html: pageData.html,
- text: pageData.text,
- links: pageData.links,
- wordCount: pageData.wordCount,
- scrapedAt: new Date().toISOString()
- });
- this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
+ const pageResult = yield scrapePageContent(url);
+ pageResult.metadata.depth = depth;
+ crawlResults.push(pageResult);
+ this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
+ if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
+ const newLinks = yield extractLinksFromPage();
+ let addedCount = 0;
+ for (const link of newLinks) {
+ const normalized = normalizeUrl(link);
+ if (!visitedUrls.has(normalized) &&
+ isUrlAllowedByConfig(link) &&
+ isUrlAllowedByRobots(link)) {
+ visitedUrls.add(normalized);
+ crawlQueue.push({ url: link, depth: depth + 1 });
+ addedCount++;
+ }
+ }
+ if (addedCount > 0) {
+ this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, logger_1.Level.LOG);
+ }
+ }
  }
  catch (error) {
- this.log(`Failed to scrape ${url}: ${error.message}`, logger_1.Level.WARN);
+ this.log(`Failed to crawl ${url}: ${error.message}`, logger_1.Level.WARN);
  crawlResults.push({
- url: url,
+ metadata: {
+ url: url,
+ sourceURL: url,
+ depth: depth
+ },
  error: error.message,
  scrapedAt: new Date().toISOString()
  });
  }
  }
- this.log(`Successfully scraped ${crawlResults.length} pages`, logger_1.Level.LOG);
+ this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, logger_1.Level.LOG);
  const actionType = "crawl";
  const actionName = "Crawl Results";
  if (!this.serializableDataByType[actionType]) {
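
In plain terms, this hunk replaces the old discover-everything-then-scrape flow with a breadth-first crawl: a FIFO queue of { url, depth } entries, a Set of normalized URLs for deduplication, and a per-URL check against the crawl config and robots.txt before anything is enqueued. The sketch below distills that queue discipline outside the Interpreter/Playwright context; it is illustrative only, and the callback names (isAllowed, fetchLinks, scrape) are stand-ins, not APIs exported by mx-cloud.

```js
// Minimal sketch of the breadth-first queue discipline used in the new crawl loop.
// Assumed/illustrative: isAllowed, fetchLinks, and scrape are injected by the caller.
const normalizeUrl = (url) => url.replace(/#.*$/, '').replace(/\/$/, '');

async function crawlBreadthFirst(baseUrl, { maxDepth, limit, isAllowed, fetchLinks, scrape }) {
    const visited = new Set([normalizeUrl(baseUrl)]); // seed the dedup set with the start URL
    const queue = [{ url: baseUrl, depth: 0 }];
    const results = [];

    while (queue.length > 0 && results.length < limit) {
        const { url, depth } = queue.shift(); // FIFO => breadth-first order
        try {
            results.push(await scrape(url, depth));
            if (depth >= maxDepth) continue; // do not expand links past maxDepth
            for (const link of await fetchLinks(url)) {
                const normalized = normalizeUrl(link);
                if (!visited.has(normalized) && isAllowed(link)) {
                    visited.add(normalized); // mark before enqueueing so duplicates never queue twice
                    queue.push({ url: link, depth: depth + 1 });
                }
            }
        } catch (error) {
            results.push({ url, depth, error: error.message }); // keep failures in the result set
        }
    }
    return results;
}
```
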
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "mx-cloud",
- "version": "0.0.27",
+ "version": "0.0.28",
  "description": "mx cloud",
  "main": "build/index.js",
  "typings": "build/index.d.ts",