mx-cloud 0.0.26 → 0.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.d.ts +1 -1
- package/build/interpret.js +306 -155
- package/package.json +1 -1
package/build/interpret.d.ts
CHANGED
@@ -38,7 +38,7 @@ interface InterpreterOptions {
     serializableCallback: (output: any) => (void | Promise<void>);
     binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
     debug: boolean;
-    robotType?: 'extract' | 'scrape' | 'deep-extract';
+    robotType?: 'extract' | 'scrape' | 'crawl' | 'search' | 'deep-extract';
     debugChannel: Partial<{
         activeId: (id: number) => void;
         debugMessage: (msg: string) => void;
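The only declaration change is the widened robotType union, which registers the two operation modes added in this release range ('crawl' and 'search'). A hypothetical caller-side sketch, assuming options are passed as a plain object (only the union members themselves are taken from the package):

    // Sketch only: the union members come from interpret.d.ts; the
    // surrounding shape is assumed for illustration.
    type RobotType = 'extract' | 'scrape' | 'crawl' | 'search' | 'deep-extract';

    const options: { robotType?: RobotType; debug: boolean } = {
        robotType: 'crawl',   // newly accepted in this release range
        debug: false,
    };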
package/build/interpret.js
CHANGED
@@ -678,10 +678,8 @@ class Interpreter extends events_1.EventEmitter {
         }
         this.log('Starting crawl operation', logger_1.Level.LOG);
         try {
-            // Get current page URL and log it
             const currentUrl = page.url();
             this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
-            // If page is on about:blank or empty, we need to wait for navigation
             if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
                 this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
                 yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
@@ -690,13 +688,260 @@ class Interpreter extends events_1.EventEmitter {
             this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
             const parsedBase = new URL(baseUrl);
             const baseDomain = parsedBase.hostname;
-            let
-
+            let robotRules = {
+                disallowedPaths: [],
+                allowedPaths: [],
+                crawlDelay: null
+            };
+            if (crawlConfig.respectRobots) {
+                this.log('Fetching robots.txt...', logger_1.Level.LOG);
+                try {
+                    const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;
+                    const robotsContent = yield page.evaluate((url) => {
+                        return new Promise((resolve) => {
+                            const xhr = new XMLHttpRequest();
+                            xhr.open('GET', url, true);
+                            xhr.onload = function () {
+                                if (xhr.status === 200) {
+                                    resolve(xhr.responseText);
+                                }
+                                else {
+                                    resolve('');
+                                }
+                            };
+                            xhr.onerror = function () {
+                                resolve('');
+                            };
+                            xhr.send();
+                        });
+                    }, robotsUrl);
+                    if (robotsContent) {
+                        const lines = robotsContent.split('\n');
+                        let isRelevantUserAgent = false;
+                        let foundSpecificUserAgent = false;
+                        for (const line of lines) {
+                            const trimmedLine = line.trim().toLowerCase();
+                            if (trimmedLine.startsWith('#') || trimmedLine === '') {
+                                continue;
+                            }
+                            const colonIndex = line.indexOf(':');
+                            if (colonIndex === -1)
+                                continue;
+                            const directive = line.substring(0, colonIndex).trim().toLowerCase();
+                            const value = line.substring(colonIndex + 1).trim();
+                            if (directive === 'user-agent') {
+                                const agent = value.toLowerCase();
+                                if (agent === '*' && !foundSpecificUserAgent) {
+                                    isRelevantUserAgent = true;
+                                }
+                                else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
+                                    isRelevantUserAgent = true;
+                                    foundSpecificUserAgent = true;
+                                }
+                                else {
+                                    if (!foundSpecificUserAgent) {
+                                        isRelevantUserAgent = false;
+                                    }
+                                }
+                            }
+                            else if (isRelevantUserAgent) {
+                                if (directive === 'disallow' && value) {
+                                    robotRules.disallowedPaths.push(value);
+                                }
+                                else if (directive === 'allow' && value) {
+                                    robotRules.allowedPaths.push(value);
+                                }
+                                else if (directive === 'crawl-delay' && value) {
+                                    const delay = parseFloat(value);
+                                    if (!isNaN(delay) && delay > 0) {
+                                        robotRules.crawlDelay = delay * 1000;
+                                    }
+                                }
+                            }
+                        }
+                        this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, logger_1.Level.LOG);
+                    }
+                    else {
+                        this.log('No robots.txt found or not accessible, proceeding without restrictions', logger_1.Level.WARN);
+                    }
+                }
+                catch (error) {
+                    this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, logger_1.Level.WARN);
+                }
+            }
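To make the user-agent grouping concrete: the parser merges rules from the generic `*` group with rules from any group whose agent name contains 'bot', 'crawler', or 'spider', and converts crawl-delay from seconds to milliseconds. For a hypothetical robots.txt such as:

    User-agent: *
    Disallow: /tmp/

    User-agent: somebot
    Disallow: /private/
    Crawl-delay: 2

the loop above leaves robotRules as { disallowedPaths: ['/tmp/', '/private/'], allowedPaths: [], crawlDelay: 2000 }. The hunk continues with the URL filter helpers built on these rules.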
+            const isUrlAllowedByRobots = (url) => {
+                if (!crawlConfig.respectRobots)
+                    return true;
+                try {
+                    const urlObj = new URL(url);
+                    const pathname = urlObj.pathname;
+                    for (const allowedPath of robotRules.allowedPaths) {
+                        if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
+                            return true;
+                        }
+                        if (allowedPath.includes('*')) {
+                            const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+                            if (regex.test(pathname)) {
+                                return true;
+                            }
+                        }
+                    }
+                    for (const disallowedPath of robotRules.disallowedPaths) {
+                        if (disallowedPath === '/') {
+                            return false;
+                        }
+                        if (pathname.startsWith(disallowedPath)) {
+                            return false;
+                        }
+                        if (disallowedPath.includes('*')) {
+                            const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
+                            if (regex.test(pathname)) {
+                                return false;
+                            }
+                        }
+                        if (disallowedPath.endsWith('$')) {
+                            const pattern = disallowedPath.slice(0, -1);
+                            if (pathname === pattern || pathname.endsWith(pattern)) {
+                                return false;
+                            }
+                        }
+                    }
+                    return true;
+                }
+                catch (error) {
+                    return true;
+                }
+            };
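Wildcard rules are handled by converting the robots.txt glob into a regular expression, mapping `*` to `.*` and `?` to `.`; note that literal dots in a pattern are left unescaped, so they also match any character. A standalone sketch of that conversion, verifiable in isolation:

    // Standalone sketch of the wildcard conversion used by isUrlAllowedByRobots.
    const globToRegex = (pattern) =>
        new RegExp('^' + pattern.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');

    globToRegex('/search/*').test('/search/results'); // true
    globToRegex('/search/*').test('/about');          // false
    globToRegex('/*.pdf').test('/docs/manual.pdf');   // true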
+            const isUrlAllowedByConfig = (url) => {
+                try {
+                    const urlObj = new URL(url);
+                    if (crawlConfig.mode === 'domain') {
+                        if (urlObj.hostname !== baseDomain)
+                            return false;
+                    }
+                    else if (crawlConfig.mode === 'subdomain') {
+                        if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
+                            return false;
+                    }
+                    else if (crawlConfig.mode === 'path') {
+                        if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
+                            return false;
+                    }
+                    if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
+                        const matches = crawlConfig.includePaths.some(pattern => {
+                            try {
+                                const regex = new RegExp(pattern);
+                                return regex.test(url);
+                            }
+                            catch (_a) {
+                                return url.includes(pattern);
+                            }
+                        });
+                        if (!matches)
+                            return false;
+                    }
+                    if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
+                        const matches = crawlConfig.excludePaths.some(pattern => {
+                            try {
+                                const regex = new RegExp(pattern);
+                                return regex.test(url);
+                            }
+                            catch (_a) {
+                                return url.includes(pattern);
+                            }
+                        });
+                        if (matches)
+                            return false;
+                    }
+                    return true;
+                }
+                catch (error) {
+                    return false;
+                }
+            };
+            const normalizeUrl = (url) => {
+                return url.replace(/#.*$/, '').replace(/\/$/, '');
+            };
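normalizeUrl is what keeps the visited-set deduplication stable: it strips the URL fragment and a single trailing slash, so common variants of the same page collapse to one key:

    // Behaviour of the normalizeUrl helper above:
    normalizeUrl('https://example.com/docs/#intro'); // 'https://example.com/docs'
    normalizeUrl('https://example.com/docs/');       // 'https://example.com/docs'
    normalizeUrl('https://example.com/docs');        // 'https://example.com/docs'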
+            const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
+                try {
+                    yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
+                    yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
+                    yield new Promise(resolve => setTimeout(resolve, 1000));
+                    const pageLinks = yield page.evaluate(() => {
+                        const links = [];
+                        const allAnchors = document.querySelectorAll('a');
+                        for (let i = 0; i < allAnchors.length; i++) {
+                            const anchor = allAnchors[i];
+                            const fullHref = anchor.href;
+                            if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
+                                links.push(fullHref);
+                            }
+                        }
+                        return links;
+                    });
+                    return pageLinks;
+                }
+                catch (error) {
+                    this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+                    return [];
+                }
+            });
+            const scrapePageContent = (url) => __awaiter(this, void 0, void 0, function* () {
+                const pageData = yield page.evaluate(() => {
+                    var _a, _b;
+                    const getMeta = (name) => {
+                        const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+                        return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
+                    };
+                    const getAllMeta = () => {
+                        const metadata = {};
+                        const metaTags = document.querySelectorAll('meta');
+                        metaTags.forEach(tag => {
+                            const name = tag.getAttribute('name') || tag.getAttribute('property');
+                            const content = tag.getAttribute('content');
+                            if (name && content) {
+                                metadata[name] = content;
+                            }
+                        });
+                        return metadata;
+                    };
+                    const title = document.title || '';
+                    const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
+                    const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
+                    elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
+                    const html = document.documentElement.outerHTML;
+                    const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
+                    const allMetadata = getAllMeta();
+                    return {
+                        title,
+                        description: getMeta('description'),
+                        text: bodyText,
+                        html: html,
+                        links: links,
+                        wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
+                        metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
+                    };
+                });
+                return {
+                    metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
+                    html: pageData.html,
+                    text: pageData.text,
+                    links: pageData.links,
+                    wordCount: pageData.wordCount,
+                    scrapedAt: new Date().toISOString()
+                };
+            });
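Each crawled page therefore yields a record of roughly the following shape, reconstructed here from the return value above and the error branch later in this hunk (the package does not export such a type):

    // Inferred result shape; not part of the package's public API.
    interface CrawlPageResult {
        metadata: Record<string, any> & {
            url: string;
            sourceURL: string;
            depth?: number;      // stamped by the crawl loop
            statusCode?: number; // hard-coded to 200 in scrapePageContent
        };
        html?: string;
        text?: string;
        links?: string[];
        wordCount?: number;
        error?: string;          // present only on failed pages
        scrapedAt: string;       // ISO timestamp
    }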
+            const visitedUrls = new Set();
+            const crawlResults = [];
+            const crawlQueue = [];
+            const normalizedBaseUrl = normalizeUrl(baseUrl);
+            visitedUrls.add(normalizedBaseUrl);
+            crawlQueue.push({ url: baseUrl, depth: 0 });
+            this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, logger_1.Level.LOG);
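This is a textbook breadth-first frontier: a FIFO queue of { url, depth } pairs plus a Set of normalized URLs, marked visited at enqueue time so a page can never be queued twice. Distilled to a self-contained skeleton (a sketch of the pattern, not the package's code; fetchLinks stands in for the real navigation and link extraction):

    // Minimal BFS crawl skeleton mirroring the queue discipline above.
    async function bfsCrawl(start, maxDepth, limit, fetchLinks) {
        const normalize = (u) => u.replace(/#.*$/, '').replace(/\/$/, '');
        const visited = new Set([normalize(start)]);
        const queue = [{ url: start, depth: 0 }];
        const results = [];
        while (queue.length > 0 && results.length < limit) {
            const { url, depth } = queue.shift();
            results.push(url);                // stand-in for scraping the page
            if (depth >= maxDepth) continue;  // do not expand past maxDepth
            for (const link of await fetchLinks(url)) {
                const key = normalize(link);
                if (!visited.has(key)) {
                    visited.add(key);         // mark at enqueue time
                    queue.push({ url: link, depth: depth + 1 });
                }
            }
        }
        return results;
    }

The hunk resumes below with the sitemap seeding that pre-populates this queue at depth 1.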
             if (crawlConfig.useSitemap) {
                 this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
                 try {
                     const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
-                    // Use XMLHttpRequest instead of fetch to avoid polyfills
                     const sitemapUrls = yield page.evaluate((url) => {
                         return new Promise((resolve) => {
                             const xhr = new XMLHttpRequest();
@@ -721,7 +966,13 @@ class Interpreter extends events_1.EventEmitter {
                     if (sitemapUrls.length > 0) {
                         const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
                         const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
-
+                        for (const sitemapPageUrl of regularUrls) {
+                            const normalized = normalizeUrl(sitemapPageUrl);
+                            if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
+                                visitedUrls.add(normalized);
+                                crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
+                            }
+                        }
                         this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
                         for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
                             try {
@@ -747,16 +998,20 @@ class Interpreter extends events_1.EventEmitter {
                                         xhr.send();
                                     });
                                 }, nestedUrl);
-
-
-
+                                for (const nestedPageUrl of nestedUrls) {
+                                    const normalized = normalizeUrl(nestedPageUrl);
+                                    if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
+                                        visitedUrls.add(normalized);
+                                        crawlQueue.push({ url: nestedPageUrl, depth: 1 });
+                                    }
                                 }
+                                this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
                             }
                             catch (error) {
                                 this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
                             }
                         }
-                        this.log(`Total URLs from
+                        this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, logger_1.Level.LOG);
                     }
                     else {
                         this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
@@ -766,167 +1021,63 @@ class Interpreter extends events_1.EventEmitter {
                     this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
                 }
             }
-
-
-
-
-
-                this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
-            });
-            yield new Promise(resolve => setTimeout(resolve, 5000));
-            const anchorCount = yield page.evaluate(() => {
-                return document.querySelectorAll('a').length;
-            });
-            this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
-            const pageLinks = yield page.evaluate(() => {
-                const links = [];
-                const allAnchors = document.querySelectorAll('a');
-                console.log('Total anchors found:', allAnchors.length);
-                for (let i = 0; i < allAnchors.length; i++) {
-                    const anchor = allAnchors[i];
-                    const href = anchor.getAttribute('href');
-                    const fullHref = anchor.href;
-                    if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
-                        links.push(fullHref);
-                    }
-                }
-                console.log('Links extracted:', links.length);
-                return links;
-            });
-            discoveredUrls.push(...pageLinks);
-            this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
-            }
-            catch (error) {
-                this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
+            let processedCount = 0;
+            while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
+                if (this.isAborted) {
+                    this.log('Workflow aborted during crawl', logger_1.Level.WARN);
+                    break;
                 }
-
-
+                const { url, depth } = crawlQueue.shift();
+                processedCount++;
+                this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, logger_1.Level.LOG);
                 try {
-
-
-
-                        return false;
-                    }
-                    else if (crawlConfig.mode === 'subdomain') {
-                        if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
-                            return false;
-                    }
-                    else if (crawlConfig.mode === 'path') {
-                        if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
-                            return false;
+                    if (robotRules.crawlDelay && crawlResults.length > 0) {
+                        this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, logger_1.Level.LOG);
+                        yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
                     }
-                    if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
-                        const matches = crawlConfig.includePaths.some(pattern => {
-                            const regex = new RegExp(pattern);
-                            return regex.test(url);
-                        });
-                        if (!matches)
-                            return false;
-                    }
-                    if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
-                        const matches = crawlConfig.excludePaths.some(pattern => {
-                            const regex = new RegExp(pattern);
-                            return regex.test(url);
-                        });
-                        if (matches)
-                            return false;
-                    }
-                    return true;
-                }
-                catch (error) {
-                    return false;
-                }
-            });
-            const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
-                return url.replace(/#.*$/, '').replace(/\/$/, '');
-            })));
-            const basePathname = parsedBase.pathname;
-            const prioritizedUrls = uniqueUrls.sort((a, b) => {
-                try {
-                    const aUrl = new URL(a);
-                    const bUrl = new URL(b);
-                    const aMatchesBase = aUrl.pathname.startsWith(basePathname);
-                    const bMatchesBase = bUrl.pathname.startsWith(basePathname);
-                    if (aMatchesBase && !bMatchesBase)
-                        return -1;
-                    if (!aMatchesBase && bMatchesBase)
-                        return 1;
-                    return 0;
-                }
-                catch (error) {
-                    return 0;
-                }
-            });
-            const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
-            this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
-            this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
-            const crawlResults = [];
-            for (let i = 0; i < finalUrls.length; i++) {
-                const url = finalUrls[i];
-                try {
-                    this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
                     yield page.goto(url, {
                         waitUntil: 'domcontentloaded',
                         timeout: 30000
-                    }).catch(() => {
-
+                    }).catch((err) => {
+                        throw new Error(`Navigation failed: ${err.message}`);
                     });
                     yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
-                    const
-
-
-
-
-
-
-
-                    const
-
-
-
-
-
-
-                    }
-
-
-
-
-
-                    elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
-                    const html = document.documentElement.outerHTML;
-                    const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
-                    const allMetadata = getAllMeta();
-                    return {
-                        title,
-                        description: getMeta('description'),
-                        text: bodyText,
-                        html: html,
-                        links: links,
-                        wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
-                        metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
-                    };
-                });
-                crawlResults.push({
-                    metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
-                    html: pageData.html,
-                    text: pageData.text,
-                    links: pageData.links,
-                    wordCount: pageData.wordCount,
-                    scrapedAt: new Date().toISOString()
-                });
-                this.log(`✓ Scraped ${url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
+                    const pageResult = yield scrapePageContent(url);
+                    pageResult.metadata.depth = depth;
+                    crawlResults.push(pageResult);
+                    this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
+                    if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
+                        const newLinks = yield extractLinksFromPage();
+                        let addedCount = 0;
+                        for (const link of newLinks) {
+                            const normalized = normalizeUrl(link);
+                            if (!visitedUrls.has(normalized) &&
+                                isUrlAllowedByConfig(link) &&
+                                isUrlAllowedByRobots(link)) {
+                                visitedUrls.add(normalized);
+                                crawlQueue.push({ url: link, depth: depth + 1 });
+                                addedCount++;
+                            }
+                        }
+                        if (addedCount > 0) {
+                            this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, logger_1.Level.LOG);
+                        }
+                    }
                 }
                 catch (error) {
-                    this.log(`Failed to
+                    this.log(`Failed to crawl ${url}: ${error.message}`, logger_1.Level.WARN);
                     crawlResults.push({
-
+                        metadata: {
+                            url: url,
+                            sourceURL: url,
+                            depth: depth
+                        },
                         error: error.message,
                         scrapedAt: new Date().toISOString()
                     });
                 }
             }
-            this.log(`
+            this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, logger_1.Level.LOG);
             const actionType = "crawl";
             const actionName = "Crawl Results";
             if (!this.serializableDataByType[actionType]) {
|