mx-cloud 0.0.27 → 0.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/interpret.js +453 -165
- package/build/utils/markdown.d.ts +1 -0
- package/build/utils/markdown.js +153 -0
- package/package.json +9 -2
package/build/interpret.js
CHANGED
|
@@ -54,6 +54,7 @@ const concurrency_1 = __importDefault(require("./utils/concurrency"));
|
|
|
54
54
|
const preprocessor_1 = __importDefault(require("./preprocessor"));
|
|
55
55
|
const logger_1 = __importStar(require("./utils/logger"));
|
|
56
56
|
const selector_1 = require("./selector");
|
|
57
|
+
const markdown_1 = require("./utils/markdown");
|
|
57
58
|
/**
|
|
58
59
|
* Class for running the Smart Workflows.
|
|
59
60
|
*/
|
|
@@ -678,10 +679,8 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
678
679
|
}
|
|
679
680
|
this.log('Starting crawl operation', logger_1.Level.LOG);
|
|
680
681
|
try {
|
|
681
|
-
// Get current page URL and log it
|
|
682
682
|
const currentUrl = page.url();
|
|
683
683
|
this.log(`Current page URL: ${currentUrl}`, logger_1.Level.LOG);
|
|
684
|
-
// If page is on about:blank or empty, we need to wait for navigation
|
|
685
684
|
if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
|
|
686
685
|
this.log('Page not yet navigated, waiting for navigation...', logger_1.Level.WARN);
|
|
687
686
|
yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
|
|
@@ -690,13 +689,306 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
690
689
|
this.log(`Using base URL for crawl: ${baseUrl}`, logger_1.Level.LOG);
|
|
691
690
|
const parsedBase = new URL(baseUrl);
|
|
692
691
|
const baseDomain = parsedBase.hostname;
|
|
693
|
-
let
|
|
694
|
-
|
|
692
|
+
let robotRules = {
|
|
693
|
+
disallowedPaths: [],
|
|
694
|
+
allowedPaths: [],
|
|
695
|
+
crawlDelay: null
|
|
696
|
+
};
|
|
697
|
+
if (crawlConfig.respectRobots) {
|
|
698
|
+
this.log('Fetching robots.txt...', logger_1.Level.LOG);
|
|
699
|
+
try {
|
|
700
|
+
const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;
|
|
701
|
+
const robotsContent = yield page.evaluate((url) => {
|
|
702
|
+
return new Promise((resolve) => {
|
|
703
|
+
const xhr = new XMLHttpRequest();
|
|
704
|
+
xhr.open('GET', url, true);
|
|
705
|
+
xhr.onload = function () {
|
|
706
|
+
if (xhr.status === 200) {
|
|
707
|
+
resolve(xhr.responseText);
|
|
708
|
+
}
|
|
709
|
+
else {
|
|
710
|
+
resolve('');
|
|
711
|
+
}
|
|
712
|
+
};
|
|
713
|
+
xhr.onerror = function () {
|
|
714
|
+
resolve('');
|
|
715
|
+
};
|
|
716
|
+
xhr.send();
|
|
717
|
+
});
|
|
718
|
+
}, robotsUrl);
|
|
719
|
+
if (robotsContent) {
|
|
720
|
+
const lines = robotsContent.split('\n');
|
|
721
|
+
let isRelevantUserAgent = false;
|
|
722
|
+
let foundSpecificUserAgent = false;
|
|
723
|
+
for (const line of lines) {
|
|
724
|
+
const trimmedLine = line.trim().toLowerCase();
|
|
725
|
+
if (trimmedLine.startsWith('#') || trimmedLine === '') {
|
|
726
|
+
continue;
|
|
727
|
+
}
|
|
728
|
+
const colonIndex = line.indexOf(':');
|
|
729
|
+
if (colonIndex === -1)
|
|
730
|
+
continue;
|
|
731
|
+
const directive = line.substring(0, colonIndex).trim().toLowerCase();
|
|
732
|
+
const value = line.substring(colonIndex + 1).trim();
|
|
733
|
+
if (directive === 'user-agent') {
|
|
734
|
+
const agent = value.toLowerCase();
|
|
735
|
+
if (agent === '*' && !foundSpecificUserAgent) {
|
|
736
|
+
isRelevantUserAgent = true;
|
|
737
|
+
}
|
|
738
|
+
else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
|
|
739
|
+
isRelevantUserAgent = true;
|
|
740
|
+
foundSpecificUserAgent = true;
|
|
741
|
+
}
|
|
742
|
+
else {
|
|
743
|
+
if (!foundSpecificUserAgent) {
|
|
744
|
+
isRelevantUserAgent = false;
|
|
745
|
+
}
|
|
746
|
+
}
|
|
747
|
+
}
|
|
748
|
+
else if (isRelevantUserAgent) {
|
|
749
|
+
if (directive === 'disallow' && value) {
|
|
750
|
+
robotRules.disallowedPaths.push(value);
|
|
751
|
+
}
|
|
752
|
+
else if (directive === 'allow' && value) {
|
|
753
|
+
robotRules.allowedPaths.push(value);
|
|
754
|
+
}
|
|
755
|
+
else if (directive === 'crawl-delay' && value) {
|
|
756
|
+
const delay = parseFloat(value);
|
|
757
|
+
if (!isNaN(delay) && delay > 0) {
|
|
758
|
+
robotRules.crawlDelay = delay * 1000;
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
}
|
|
763
|
+
this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, logger_1.Level.LOG);
|
|
764
|
+
}
|
|
765
|
+
else {
|
|
766
|
+
this.log('No robots.txt found or not accessible, proceeding without restrictions', logger_1.Level.WARN);
|
|
767
|
+
}
|
|
768
|
+
}
|
|
769
|
+
catch (error) {
|
|
770
|
+
this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, logger_1.Level.WARN);
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
const isUrlAllowedByRobots = (url) => {
|
|
774
|
+
if (!crawlConfig.respectRobots)
|
|
775
|
+
return true;
|
|
776
|
+
try {
|
|
777
|
+
const urlObj = new URL(url);
|
|
778
|
+
const pathname = urlObj.pathname;
|
|
779
|
+
for (const allowedPath of robotRules.allowedPaths) {
|
|
780
|
+
if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
|
|
781
|
+
return true;
|
|
782
|
+
}
|
|
783
|
+
if (allowedPath.includes('*')) {
|
|
784
|
+
const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
|
|
785
|
+
if (regex.test(pathname)) {
|
|
786
|
+
return true;
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
for (const disallowedPath of robotRules.disallowedPaths) {
|
|
791
|
+
if (disallowedPath === '/') {
|
|
792
|
+
return false;
|
|
793
|
+
}
|
|
794
|
+
if (pathname.startsWith(disallowedPath)) {
|
|
795
|
+
return false;
|
|
796
|
+
}
|
|
797
|
+
if (disallowedPath.includes('*')) {
|
|
798
|
+
const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
|
|
799
|
+
if (regex.test(pathname)) {
|
|
800
|
+
return false;
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
if (disallowedPath.endsWith('$')) {
|
|
804
|
+
const pattern = disallowedPath.slice(0, -1);
|
|
805
|
+
if (pathname === pattern || pathname.endsWith(pattern)) {
|
|
806
|
+
return false;
|
|
807
|
+
}
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
return true;
|
|
811
|
+
}
|
|
812
|
+
catch (error) {
|
|
813
|
+
return true;
|
|
814
|
+
}
|
|
815
|
+
};
|
|
816
|
+
const isUrlAllowedByConfig = (url) => {
|
|
817
|
+
try {
|
|
818
|
+
const urlObj = new URL(url);
|
|
819
|
+
if (crawlConfig.mode === 'domain') {
|
|
820
|
+
if (urlObj.hostname !== baseDomain)
|
|
821
|
+
return false;
|
|
822
|
+
}
|
|
823
|
+
else if (crawlConfig.mode === 'subdomain') {
|
|
824
|
+
if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
|
|
825
|
+
return false;
|
|
826
|
+
}
|
|
827
|
+
else if (crawlConfig.mode === 'path') {
|
|
828
|
+
if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
|
|
829
|
+
return false;
|
|
830
|
+
}
|
|
831
|
+
if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
|
|
832
|
+
const matches = crawlConfig.includePaths.some(pattern => {
|
|
833
|
+
try {
|
|
834
|
+
const regex = new RegExp(pattern);
|
|
835
|
+
return regex.test(url);
|
|
836
|
+
}
|
|
837
|
+
catch (_a) {
|
|
838
|
+
return url.includes(pattern);
|
|
839
|
+
}
|
|
840
|
+
});
|
|
841
|
+
if (!matches)
|
|
842
|
+
return false;
|
|
843
|
+
}
|
|
844
|
+
if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
|
|
845
|
+
const matches = crawlConfig.excludePaths.some(pattern => {
|
|
846
|
+
try {
|
|
847
|
+
const regex = new RegExp(pattern);
|
|
848
|
+
return regex.test(url);
|
|
849
|
+
}
|
|
850
|
+
catch (_a) {
|
|
851
|
+
return url.includes(pattern);
|
|
852
|
+
}
|
|
853
|
+
});
|
|
854
|
+
if (matches)
|
|
855
|
+
return false;
|
|
856
|
+
}
|
|
857
|
+
return true;
|
|
858
|
+
}
|
|
859
|
+
catch (error) {
|
|
860
|
+
return false;
|
|
861
|
+
}
|
|
862
|
+
};
|
|
863
|
+
const normalizeUrl = (url) => {
|
|
864
|
+
return url.replace(/#.*$/, '').replace(/\/$/, '');
|
|
865
|
+
};
|
|
866
|
+
const extractLinksFromPage = () => __awaiter(this, void 0, void 0, function* () {
|
|
867
|
+
try {
|
|
868
|
+
yield page.waitForLoadState('load', { timeout: 15000 }).catch(() => { });
|
|
869
|
+
yield page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => { });
|
|
870
|
+
yield new Promise(resolve => setTimeout(resolve, 1000));
|
|
871
|
+
const pageLinks = yield page.evaluate(() => {
|
|
872
|
+
const links = [];
|
|
873
|
+
const allAnchors = document.querySelectorAll('a');
|
|
874
|
+
for (let i = 0; i < allAnchors.length; i++) {
|
|
875
|
+
const anchor = allAnchors[i];
|
|
876
|
+
const fullHref = anchor.href;
|
|
877
|
+
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
|
|
878
|
+
links.push(fullHref);
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
return links;
|
|
882
|
+
});
|
|
883
|
+
return pageLinks;
|
|
884
|
+
}
|
|
885
|
+
catch (error) {
|
|
886
|
+
this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
|
|
887
|
+
return [];
|
|
888
|
+
}
|
|
889
|
+
});
|
|
890
|
+
const scrapePageContent = (url) => __awaiter(this, void 0, void 0, function* () {
|
|
891
|
+
const pageData = yield page.evaluate(() => {
|
|
892
|
+
var _a, _b;
|
|
893
|
+
const getMeta = (name) => {
|
|
894
|
+
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
|
895
|
+
return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
|
|
896
|
+
};
|
|
897
|
+
const getAllMeta = () => {
|
|
898
|
+
const metadata = {};
|
|
899
|
+
const metaTags = document.querySelectorAll('meta');
|
|
900
|
+
metaTags.forEach(tag => {
|
|
901
|
+
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
|
902
|
+
const content = tag.getAttribute('content');
|
|
903
|
+
if (name && content) {
|
|
904
|
+
metadata[name] = content;
|
|
905
|
+
}
|
|
906
|
+
});
|
|
907
|
+
return metadata;
|
|
908
|
+
};
|
|
909
|
+
const title = document.title || '';
|
|
910
|
+
const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
|
|
911
|
+
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
|
|
912
|
+
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
|
913
|
+
const html = document.documentElement.outerHTML;
|
|
914
|
+
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
|
|
915
|
+
const allMetadata = getAllMeta();
|
|
916
|
+
return {
|
|
917
|
+
title,
|
|
918
|
+
description: getMeta('description'),
|
|
919
|
+
text: bodyText,
|
|
920
|
+
html: html,
|
|
921
|
+
links: links,
|
|
922
|
+
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
|
|
923
|
+
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
924
|
+
};
|
|
925
|
+
});
|
|
926
|
+
const result = {
|
|
927
|
+
metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
|
|
928
|
+
scrapedAt: new Date().toISOString()
|
|
929
|
+
};
|
|
930
|
+
const formats = crawlConfig.outputFormats || [];
|
|
931
|
+
if (formats.includes('text')) {
|
|
932
|
+
result.text = pageData.text;
|
|
933
|
+
result.wordCount = pageData.wordCount;
|
|
934
|
+
}
|
|
935
|
+
if (formats.includes('html')) {
|
|
936
|
+
result.html = pageData.html;
|
|
937
|
+
result.links = pageData.links;
|
|
938
|
+
}
|
|
939
|
+
if (formats.includes('markdown')) {
|
|
940
|
+
try {
|
|
941
|
+
const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, url);
|
|
942
|
+
result.markdown = markdown;
|
|
943
|
+
}
|
|
944
|
+
catch (err) {
|
|
945
|
+
this.log(`Markdown conversion failed for ${url}: ${err.message}`, logger_1.Level.WARN);
|
|
946
|
+
result.markdown = '';
|
|
947
|
+
}
|
|
948
|
+
}
|
|
949
|
+
if (formats.includes('screenshot-visible')) {
|
|
950
|
+
try {
|
|
951
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: false });
|
|
952
|
+
const screenshotName = `Crawl - ${crawlResults.length} - Visible`;
|
|
953
|
+
yield this.options.binaryCallback({
|
|
954
|
+
name: screenshotName,
|
|
955
|
+
data: screenshotBuffer,
|
|
956
|
+
mimeType: 'image/png'
|
|
957
|
+
}, 'image/png');
|
|
958
|
+
result.screenshotVisible = screenshotName;
|
|
959
|
+
}
|
|
960
|
+
catch (err) {
|
|
961
|
+
this.log(`Screenshot-visible failed for ${url}: ${err.message}`, logger_1.Level.WARN);
|
|
962
|
+
}
|
|
963
|
+
}
|
|
964
|
+
if (formats.includes('screenshot-fullpage')) {
|
|
965
|
+
try {
|
|
966
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: true });
|
|
967
|
+
const screenshotName = `Crawl - ${crawlResults.length} - Full Page`;
|
|
968
|
+
yield this.options.binaryCallback({
|
|
969
|
+
name: screenshotName,
|
|
970
|
+
data: screenshotBuffer,
|
|
971
|
+
mimeType: 'image/png'
|
|
972
|
+
}, 'image/png');
|
|
973
|
+
result.screenshotFullpage = screenshotName;
|
|
974
|
+
}
|
|
975
|
+
catch (err) {
|
|
976
|
+
this.log(`Screenshot-fullpage failed for ${url}: ${err.message}`, logger_1.Level.WARN);
|
|
977
|
+
}
|
|
978
|
+
}
|
|
979
|
+
return result;
|
|
980
|
+
});
|
|
981
|
+
const visitedUrls = new Set();
|
|
982
|
+
const crawlResults = [];
|
|
983
|
+
const crawlQueue = [];
|
|
984
|
+
const normalizedBaseUrl = normalizeUrl(baseUrl);
|
|
985
|
+
visitedUrls.add(normalizedBaseUrl);
|
|
986
|
+
crawlQueue.push({ url: baseUrl, depth: 0 });
|
|
987
|
+
this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, logger_1.Level.LOG);
|
|
695
988
|
if (crawlConfig.useSitemap) {
|
|
696
989
|
this.log('Fetching sitemap URLs...', logger_1.Level.LOG);
|
|
697
990
|
try {
|
|
698
991
|
const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;
|
|
699
|
-
// Use XMLHttpRequest instead of fetch to avoid polyfills
|
|
700
992
|
const sitemapUrls = yield page.evaluate((url) => {
|
|
701
993
|
return new Promise((resolve) => {
|
|
702
994
|
const xhr = new XMLHttpRequest();
|
|
@@ -721,7 +1013,13 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
721
1013
|
if (sitemapUrls.length > 0) {
|
|
722
1014
|
const nestedSitemaps = sitemapUrls.filter(url => url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/'));
|
|
723
1015
|
const regularUrls = sitemapUrls.filter(url => !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/'));
|
|
724
|
-
|
|
1016
|
+
for (const sitemapPageUrl of regularUrls) {
|
|
1017
|
+
const normalized = normalizeUrl(sitemapPageUrl);
|
|
1018
|
+
if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
|
|
1019
|
+
visitedUrls.add(normalized);
|
|
1020
|
+
crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
725
1023
|
this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, logger_1.Level.LOG);
|
|
726
1024
|
for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
|
|
727
1025
|
try {
|
|
@@ -747,16 +1045,20 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
747
1045
|
xhr.send();
|
|
748
1046
|
});
|
|
749
1047
|
}, nestedUrl);
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
1048
|
+
for (const nestedPageUrl of nestedUrls) {
|
|
1049
|
+
const normalized = normalizeUrl(nestedPageUrl);
|
|
1050
|
+
if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
|
|
1051
|
+
visitedUrls.add(normalized);
|
|
1052
|
+
crawlQueue.push({ url: nestedPageUrl, depth: 1 });
|
|
1053
|
+
}
|
|
753
1054
|
}
|
|
1055
|
+
this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, logger_1.Level.LOG);
|
|
754
1056
|
}
|
|
755
1057
|
catch (error) {
|
|
756
1058
|
this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, logger_1.Level.WARN);
|
|
757
1059
|
}
|
|
758
1060
|
}
|
|
759
|
-
this.log(`Total URLs from
|
|
1061
|
+
this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, logger_1.Level.LOG);
|
|
760
1062
|
}
|
|
761
1063
|
else {
|
|
762
1064
|
this.log('No URLs found in sitemap or sitemap not available', logger_1.Level.WARN);
|
|
@@ -766,167 +1068,76 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
766
1068
|
this.log(`Sitemap fetch failed: ${error.message}`, logger_1.Level.WARN);
|
|
767
1069
|
}
|
|
768
1070
|
}
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
this.log('Network did not become idle, continuing anyway', logger_1.Level.WARN);
|
|
775
|
-
});
|
|
776
|
-
yield new Promise(resolve => setTimeout(resolve, 5000));
|
|
777
|
-
const anchorCount = yield page.evaluate(() => {
|
|
778
|
-
return document.querySelectorAll('a').length;
|
|
779
|
-
});
|
|
780
|
-
this.log(`Page has ${anchorCount} total anchor tags`, logger_1.Level.LOG);
|
|
781
|
-
const pageLinks = yield page.evaluate(() => {
|
|
782
|
-
const links = [];
|
|
783
|
-
const allAnchors = document.querySelectorAll('a');
|
|
784
|
-
console.log('Total anchors found:', allAnchors.length);
|
|
785
|
-
for (let i = 0; i < allAnchors.length; i++) {
|
|
786
|
-
const anchor = allAnchors[i];
|
|
787
|
-
const href = anchor.getAttribute('href');
|
|
788
|
-
const fullHref = anchor.href;
|
|
789
|
-
if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
|
|
790
|
-
links.push(fullHref);
|
|
791
|
-
}
|
|
792
|
-
}
|
|
793
|
-
console.log('Links extracted:', links.length);
|
|
794
|
-
return links;
|
|
795
|
-
});
|
|
796
|
-
discoveredUrls.push(...pageLinks);
|
|
797
|
-
this.log(`Found ${pageLinks.length} links from page`, logger_1.Level.LOG);
|
|
798
|
-
}
|
|
799
|
-
catch (error) {
|
|
800
|
-
this.log(`Link extraction failed: ${error.message}`, logger_1.Level.WARN);
|
|
1071
|
+
let processedCount = 0;
|
|
1072
|
+
while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
|
|
1073
|
+
if (this.isAborted) {
|
|
1074
|
+
this.log('Workflow aborted during crawl', logger_1.Level.WARN);
|
|
1075
|
+
break;
|
|
801
1076
|
}
|
|
802
|
-
|
|
803
|
-
|
|
1077
|
+
const { url, depth } = crawlQueue.shift();
|
|
1078
|
+
processedCount++;
|
|
1079
|
+
this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, logger_1.Level.LOG);
|
|
804
1080
|
try {
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
return false;
|
|
1081
|
+
if (robotRules.crawlDelay && crawlResults.length > 0) {
|
|
1082
|
+
this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, logger_1.Level.LOG);
|
|
1083
|
+
yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
|
|
809
1084
|
}
|
|
810
|
-
else if (crawlConfig.mode === 'subdomain') {
|
|
811
|
-
if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain)
|
|
812
|
-
return false;
|
|
813
|
-
}
|
|
814
|
-
else if (crawlConfig.mode === 'path') {
|
|
815
|
-
if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname))
|
|
816
|
-
return false;
|
|
817
|
-
}
|
|
818
|
-
if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
|
|
819
|
-
const matches = crawlConfig.includePaths.some(pattern => {
|
|
820
|
-
const regex = new RegExp(pattern);
|
|
821
|
-
return regex.test(url);
|
|
822
|
-
});
|
|
823
|
-
if (!matches)
|
|
824
|
-
return false;
|
|
825
|
-
}
|
|
826
|
-
if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
|
|
827
|
-
const matches = crawlConfig.excludePaths.some(pattern => {
|
|
828
|
-
const regex = new RegExp(pattern);
|
|
829
|
-
return regex.test(url);
|
|
830
|
-
});
|
|
831
|
-
if (matches)
|
|
832
|
-
return false;
|
|
833
|
-
}
|
|
834
|
-
return true;
|
|
835
|
-
}
|
|
836
|
-
catch (error) {
|
|
837
|
-
return false;
|
|
838
|
-
}
|
|
839
|
-
});
|
|
840
|
-
const uniqueUrls = Array.from(new Set(filteredUrls.map(url => {
|
|
841
|
-
return url.replace(/#.*$/, '').replace(/\/$/, '');
|
|
842
|
-
})));
|
|
843
|
-
const basePathname = parsedBase.pathname;
|
|
844
|
-
const prioritizedUrls = uniqueUrls.sort((a, b) => {
|
|
845
|
-
try {
|
|
846
|
-
const aUrl = new URL(a);
|
|
847
|
-
const bUrl = new URL(b);
|
|
848
|
-
const aMatchesBase = aUrl.pathname.startsWith(basePathname);
|
|
849
|
-
const bMatchesBase = bUrl.pathname.startsWith(basePathname);
|
|
850
|
-
if (aMatchesBase && !bMatchesBase)
|
|
851
|
-
return -1;
|
|
852
|
-
if (!aMatchesBase && bMatchesBase)
|
|
853
|
-
return 1;
|
|
854
|
-
return 0;
|
|
855
|
-
}
|
|
856
|
-
catch (error) {
|
|
857
|
-
return 0;
|
|
858
|
-
}
|
|
859
|
-
});
|
|
860
|
-
const finalUrls = prioritizedUrls.slice(0, crawlConfig.limit);
|
|
861
|
-
this.log(`Crawl discovered ${finalUrls.length} URLs (from ${discoveredUrls.length} total)`, logger_1.Level.LOG);
|
|
862
|
-
this.log(`Starting to scrape content from ${finalUrls.length} discovered URLs...`, logger_1.Level.LOG);
|
|
863
|
-
const crawlResults = [];
|
|
864
|
-
for (let i = 0; i < finalUrls.length; i++) {
|
|
865
|
-
const url = finalUrls[i];
|
|
866
|
-
try {
|
|
867
|
-
this.log(`[${i + 1}/${finalUrls.length}] Scraping: ${url}`, logger_1.Level.LOG);
|
|
868
1085
|
yield page.goto(url, {
|
|
869
|
-
waitUntil: '
|
|
870
|
-
timeout:
|
|
871
|
-
}).catch(() => {
|
|
872
|
-
|
|
873
|
-
});
|
|
874
|
-
yield page.waitForLoadState('load', { timeout: 10000 }).catch(() => { });
|
|
875
|
-
const pageData = yield page.evaluate(() => {
|
|
876
|
-
var _a, _b;
|
|
877
|
-
const getMeta = (name) => {
|
|
878
|
-
const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
|
879
|
-
return (meta === null || meta === void 0 ? void 0 : meta.getAttribute('content')) || '';
|
|
880
|
-
};
|
|
881
|
-
const getAllMeta = () => {
|
|
882
|
-
const metadata = {};
|
|
883
|
-
const metaTags = document.querySelectorAll('meta');
|
|
884
|
-
metaTags.forEach(tag => {
|
|
885
|
-
const name = tag.getAttribute('name') || tag.getAttribute('property');
|
|
886
|
-
const content = tag.getAttribute('content');
|
|
887
|
-
if (name && content) {
|
|
888
|
-
metadata[name] = content;
|
|
889
|
-
}
|
|
890
|
-
});
|
|
891
|
-
return metadata;
|
|
892
|
-
};
|
|
893
|
-
const title = document.title || '';
|
|
894
|
-
const bodyText = ((_a = document.body) === null || _a === void 0 ? void 0 : _a.innerText) || '';
|
|
895
|
-
const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
|
|
896
|
-
elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));
|
|
897
|
-
const html = document.documentElement.outerHTML;
|
|
898
|
-
const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
|
|
899
|
-
const allMetadata = getAllMeta();
|
|
900
|
-
return {
|
|
901
|
-
title,
|
|
902
|
-
description: getMeta('description'),
|
|
903
|
-
text: bodyText,
|
|
904
|
-
html: html,
|
|
905
|
-
links: links,
|
|
906
|
-
wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
|
|
907
|
-
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
908
|
-
};
|
|
1086
|
+
waitUntil: 'load',
|
|
1087
|
+
timeout: 60000
|
|
1088
|
+
}).catch((err) => {
|
|
1089
|
+
throw new Error(`Navigation failed: ${err.message}`);
|
|
909
1090
|
});
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
1091
|
+
yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
|
|
1092
|
+
const pageResult = yield scrapePageContent(url);
|
|
1093
|
+
pageResult.metadata.depth = depth;
|
|
1094
|
+
crawlResults.push(pageResult);
|
|
1095
|
+
const actionType = "crawl";
|
|
1096
|
+
const actionName = "Crawl Results";
|
|
1097
|
+
if (!this.serializableDataByType[actionType]) {
|
|
1098
|
+
this.serializableDataByType[actionType] = {};
|
|
1099
|
+
}
|
|
1100
|
+
this.serializableDataByType[actionType][actionName] = [...crawlResults];
|
|
1101
|
+
yield this.options.serializableCallback({
|
|
1102
|
+
crawl: this.serializableDataByType.crawl
|
|
917
1103
|
});
|
|
918
|
-
|
|
1104
|
+
if (this.isAborted) {
|
|
1105
|
+
this.log(`Run aborted after scraping ${url}, stopping crawl`, logger_1.Level.WARN);
|
|
1106
|
+
break;
|
|
1107
|
+
}
|
|
1108
|
+
this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
|
|
1109
|
+
if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
|
|
1110
|
+
const newLinks = yield extractLinksFromPage();
|
|
1111
|
+
let addedCount = 0;
|
|
1112
|
+
for (const link of newLinks) {
|
|
1113
|
+
const normalized = normalizeUrl(link);
|
|
1114
|
+
if (!visitedUrls.has(normalized) &&
|
|
1115
|
+
isUrlAllowedByConfig(link) &&
|
|
1116
|
+
isUrlAllowedByRobots(link)) {
|
|
1117
|
+
visitedUrls.add(normalized);
|
|
1118
|
+
crawlQueue.push({ url: link, depth: depth + 1 });
|
|
1119
|
+
addedCount++;
|
|
1120
|
+
}
|
|
1121
|
+
}
|
|
1122
|
+
if (addedCount > 0) {
|
|
1123
|
+
this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, logger_1.Level.LOG);
|
|
1124
|
+
}
|
|
1125
|
+
}
|
|
919
1126
|
}
|
|
920
1127
|
catch (error) {
|
|
921
|
-
this.log(`Failed to
|
|
1128
|
+
this.log(`Failed to crawl ${url}: ${error.message}`, logger_1.Level.WARN);
|
|
922
1129
|
crawlResults.push({
|
|
923
|
-
|
|
1130
|
+
metadata: {
|
|
1131
|
+
url: url,
|
|
1132
|
+
sourceURL: url,
|
|
1133
|
+
depth: depth
|
|
1134
|
+
},
|
|
924
1135
|
error: error.message,
|
|
925
1136
|
scrapedAt: new Date().toISOString()
|
|
926
1137
|
});
|
|
927
1138
|
}
|
|
928
1139
|
}
|
|
929
|
-
this.log(`
|
|
1140
|
+
this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, logger_1.Level.LOG);
|
|
930
1141
|
const actionType = "crawl";
|
|
931
1142
|
const actionName = "Crawl Results";
|
|
932
1143
|
if (!this.serializableDataByType[actionType]) {
|
|
@@ -1157,6 +1368,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1157
1368
|
filters: searchConfig.filters || {},
|
|
1158
1369
|
resultsCount: searchResults.length,
|
|
1159
1370
|
results: searchResults,
|
|
1371
|
+
mode: searchConfig.mode,
|
|
1160
1372
|
searchedAt: new Date().toISOString()
|
|
1161
1373
|
};
|
|
1162
1374
|
this.serializableDataByType[actionType][actionName] = searchData;
|
|
@@ -1172,16 +1384,25 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1172
1384
|
this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
|
|
1173
1385
|
const scrapedResults = [];
|
|
1174
1386
|
for (let i = 0; i < searchResults.length; i++) {
|
|
1387
|
+
if (this.isAborted) {
|
|
1388
|
+
this.log(`Run aborted, stopping search scraping at result ${i + 1}/${searchResults.length}`, logger_1.Level.WARN);
|
|
1389
|
+
break;
|
|
1390
|
+
}
|
|
1175
1391
|
const result = searchResults[i];
|
|
1176
1392
|
try {
|
|
1177
1393
|
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
|
|
1394
|
+
let navigationFailed = false;
|
|
1178
1395
|
yield page.goto(result.url, {
|
|
1179
|
-
waitUntil: '
|
|
1180
|
-
timeout:
|
|
1396
|
+
waitUntil: 'load',
|
|
1397
|
+
timeout: 60000
|
|
1181
1398
|
}).catch(() => {
|
|
1182
1399
|
this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
|
|
1400
|
+
navigationFailed = true;
|
|
1183
1401
|
});
|
|
1184
|
-
|
|
1402
|
+
if (navigationFailed) {
|
|
1403
|
+
continue;
|
|
1404
|
+
}
|
|
1405
|
+
yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
|
|
1185
1406
|
const pageData = yield page.evaluate(() => {
|
|
1186
1407
|
var _a, _b;
|
|
1187
1408
|
const getMeta = (name) => {
|
|
@@ -1217,7 +1438,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1217
1438
|
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
1218
1439
|
};
|
|
1219
1440
|
});
|
|
1220
|
-
|
|
1441
|
+
const scrapedResult = {
|
|
1221
1442
|
searchResult: {
|
|
1222
1443
|
query: searchConfig.query,
|
|
1223
1444
|
position: result.position,
|
|
@@ -1225,12 +1446,79 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1225
1446
|
searchDescription: result.description,
|
|
1226
1447
|
},
|
|
1227
1448
|
metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
|
|
1228
|
-
html: pageData.html,
|
|
1229
|
-
text: pageData.text,
|
|
1230
|
-
links: pageData.links,
|
|
1231
|
-
wordCount: pageData.wordCount,
|
|
1232
1449
|
scrapedAt: new Date().toISOString()
|
|
1450
|
+
};
|
|
1451
|
+
const formats = searchConfig.outputFormats || [];
|
|
1452
|
+
if (formats.includes('text')) {
|
|
1453
|
+
scrapedResult.text = pageData.text;
|
|
1454
|
+
scrapedResult.wordCount = pageData.wordCount;
|
|
1455
|
+
}
|
|
1456
|
+
if (formats.includes('html')) {
|
|
1457
|
+
scrapedResult.html = pageData.html;
|
|
1458
|
+
scrapedResult.links = pageData.links;
|
|
1459
|
+
}
|
|
1460
|
+
if (formats.includes('markdown')) {
|
|
1461
|
+
try {
|
|
1462
|
+
const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, result.url);
|
|
1463
|
+
scrapedResult.markdown = markdown;
|
|
1464
|
+
}
|
|
1465
|
+
catch (err) {
|
|
1466
|
+
this.log(`Markdown conversion failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
|
|
1467
|
+
scrapedResult.markdown = '';
|
|
1468
|
+
}
|
|
1469
|
+
}
|
|
1470
|
+
if (formats.includes('screenshot-visible')) {
|
|
1471
|
+
try {
|
|
1472
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: false });
|
|
1473
|
+
const screenshotName = `Search - ${i} - Visible`;
|
|
1474
|
+
yield this.options.binaryCallback({
|
|
1475
|
+
name: screenshotName,
|
|
1476
|
+
data: screenshotBuffer,
|
|
1477
|
+
mimeType: 'image/png'
|
|
1478
|
+
}, 'image/png');
|
|
1479
|
+
scrapedResult.screenshotVisible = screenshotName;
|
|
1480
|
+
}
|
|
1481
|
+
catch (err) {
|
|
1482
|
+
this.log(`Screenshot-visible failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
|
|
1483
|
+
}
|
|
1484
|
+
}
|
|
1485
|
+
if (formats.includes('screenshot-fullpage')) {
|
|
1486
|
+
try {
|
|
1487
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: true });
|
|
1488
|
+
const screenshotName = `Search - ${i} - Full Page`;
|
|
1489
|
+
yield this.options.binaryCallback({
|
|
1490
|
+
name: screenshotName,
|
|
1491
|
+
data: screenshotBuffer,
|
|
1492
|
+
mimeType: 'image/png'
|
|
1493
|
+
}, 'image/png');
|
|
1494
|
+
scrapedResult.screenshotFullpage = screenshotName;
|
|
1495
|
+
}
|
|
1496
|
+
catch (err) {
|
|
1497
|
+
this.log(`Screenshot-fullpage failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
|
|
1498
|
+
}
|
|
1499
|
+
}
|
|
1500
|
+
scrapedResults.push(scrapedResult);
|
|
1501
|
+
const actionType = "search";
|
|
1502
|
+
const actionName = "Search Results";
|
|
1503
|
+
if (!this.serializableDataByType[actionType]) {
|
|
1504
|
+
this.serializableDataByType[actionType] = {};
|
|
1505
|
+
}
|
|
1506
|
+
this.serializableDataByType[actionType][actionName] = {
|
|
1507
|
+
query: searchConfig.query,
|
|
1508
|
+
provider: searchConfig.provider,
|
|
1509
|
+
filters: searchConfig.filters || {},
|
|
1510
|
+
resultsCount: scrapedResults.length,
|
|
1511
|
+
results: scrapedResults,
|
|
1512
|
+
mode: searchConfig.mode,
|
|
1513
|
+
searchedAt: new Date().toISOString()
|
|
1514
|
+
};
|
|
1515
|
+
yield this.options.serializableCallback({
|
|
1516
|
+
search: this.serializableDataByType.search
|
|
1233
1517
|
});
|
|
1518
|
+
if (this.isAborted) {
|
|
1519
|
+
this.log(`Run aborted after scraping ${result.url}, stopping search`, logger_1.Level.WARN);
|
|
1520
|
+
break;
|
|
1521
|
+
}
|
|
1234
1522
|
this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
|
|
1235
1523
|
}
|
|
1236
1524
|
catch (error) {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Converts an HTML document to Markdown.
 *
 * @param html    HTML source. `null`, `undefined` and the empty string resolve to `""`.
 * @param baseUrl Optional base URL; when given, relative link targets are resolved against it.
 * @returns A promise for the trimmed Markdown text. Errors thrown by the
 *          HTML-to-Markdown conversion step are logged and resolve to `""`.
 */
export declare function parseMarkdown(html: string | null | undefined, baseUrl?: string | null): Promise<string>;
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Compiler-emitted helper that drives a generator function as an async function.
// Reuses an `__awaiter` already present on `this` when there is one.
//
// thisArg / _arguments: receiver and arguments the generator is applied with.
// P: promise constructor to use; falls back to the global Promise.
// generator: generator function whose yielded values are awaited.
// Returns a P that settles with the generator's return value or thrown error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values so that `.then` can always be called on the result.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the fulfilled value; a synchronous throw rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Throw the rejection reason into the generator so its own try/catch can handle it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Finished: resolve with the return value. Otherwise wait on the yielded value and continue.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.parseMarkdown = parseMarkdown;
|
|
13
|
+
/**
 * Convert an HTML document to GitHub-flavoured Markdown.
 *
 * The HTML is first stripped of non-content elements (see tidyHtml), then run
 * through Turndown with custom heading, SVG, paragraph and link rules plus the
 * GFM plugin, and finally post-processed to repair multi-line links and drop
 * "skip to content" anchors.
 *
 * @param {string|null|undefined} html - HTML source; falsy input resolves to "".
 * @param {string|null} [baseUrl] - Base URL used to turn relative hrefs into absolute ones.
 * @returns {Promise<string>} Trimmed Markdown; "" when the Turndown conversion throws.
 */
function parseMarkdown(html, baseUrl) {
    return __awaiter(this, void 0, void 0, function* () {
        // Answer trivially empty input before loading the conversion libraries.
        if (!html)
            return "";
        // Loaded lazily so the dependencies are only pulled in when a conversion is requested.
        const TurndownService = require("turndown");
        const { gfm } = require("joplin-turndown-plugin-gfm");
        const { URL } = require("url");
        const tidiedHtml = tidyHtml(html);
        const t = new TurndownService({
            headingStyle: "atx", // ensures #### instead of ------
            codeBlockStyle: "fenced",
        });
        // Returns the trimmed attribute value, or undefined when the attribute is absent.
        const trimmedAttribute = (node, name) => {
            const value = node.getAttribute(name);
            return value === null || value === undefined ? undefined : value.trim();
        };
        // ---------------------------------------------
        // Proper ATX headings #### instead of underline-style
        // ---------------------------------------------
        t.addRule("forceAtxHeadings", {
            filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
            replacement: (content, node) => {
                const level = Number(node.nodeName.charAt(1));
                const clean = content.trim();
                return `\n${"#".repeat(level)} ${clean}\n`;
            },
        });
        // ---------------------------------------------
        // Remove SVGs
        // ---------------------------------------------
        t.addRule("truncate-svg", {
            filter: "svg",
            replacement: () => "",
        });
        // ---------------------------------------------
        // Improved paragraph cleanup
        // ---------------------------------------------
        t.addRule("improved-paragraph", {
            filter: "p",
            replacement: (innerText) => {
                const trimmed = innerText.trim();
                if (!trimmed)
                    return "";
                // Leading blank line matters: Turndown joins sibling output using the
                // larger of the adjacent newline runs, so without it inline text placed
                // directly before a <p> would be glued onto the paragraph.
                return `\n\n${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
            },
        });
        // ---------------------------------------------
        // Inline link with fallback text
        // ---------------------------------------------
        t.addRule("inlineLink", {
            filter: (node) => node.nodeName === "A" && node.getAttribute("href"),
            replacement: (content, node) => {
                let text = content.trim();
                // Fallback: aria-label → title → domain
                if (!text) {
                    text =
                        trimmedAttribute(node, "aria-label") ||
                            trimmedAttribute(node, "title") ||
                            getDomainFromUrl(node.getAttribute("href")) ||
                            "link";
                }
                let href = node.getAttribute("href").trim();
                // relative → absolute
                if (baseUrl && isRelativeUrl(href)) {
                    try {
                        href = new URL(href, baseUrl).toString();
                    }
                    catch (_a) {
                        // Best effort: an unparsable href or base URL leaves the link as authored.
                    }
                }
                href = cleanUrl(href);
                return `[${text}](${href})`;
            },
        });
        t.use(gfm);
        // Convert HTML → Markdown
        try {
            let out = yield t.turndown(tidiedHtml);
            out = fixBrokenLinks(out);
            out = stripSkipLinks(out);
            return out.trim();
        }
        catch (err) {
            console.error("HTML→Markdown failed", { err });
            return "";
        }
    });
}
|
|
99
|
+
// -----------------------------------------------------
|
|
100
|
+
// Helpers
|
|
101
|
+
// -----------------------------------------------------
|
|
102
|
+
/**
 * Tell whether `url` is a relative reference, i.e. has no URI scheme.
 *
 * Only a leading `scheme:` makes a URL absolute. An absolute URL embedded in
 * the query or fragment (`/redirect?to=https://…`) does not, and the scheme
 * match is case-insensitive (`MAILTO:` counts).
 *
 * @param {string} url - href value to classify.
 * @returns {boolean} true when the value has no scheme; false for absolute
 *                    URLs and for non-string input.
 */
function isRelativeUrl(url) {
    if (typeof url !== "string") {
        return false;
    }
    // RFC 3986 scheme: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) followed by ":".
    return !/^[a-z][a-z0-9+.-]*:/i.test(url.trim());
}
|
|
105
|
+
/**
 * Extract the hostname of an absolute URL, without a leading "www." label.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} Hostname, or null when `url` cannot be parsed as an
 *                        absolute URL (relative hrefs included).
 */
function getDomainFromUrl(url) {
    try {
        const { hostname } = new URL(url);
        // Anchored so only a leading "www." is dropped; "awww.example.com" stays intact.
        return hostname.replace(/^www\./, "");
    }
    catch (_a) {
        return null;
    }
}
|
|
114
|
+
/**
 * Make a URL safe to use as a Markdown link destination.
 *
 * A bare Markdown link destination may not contain whitespace. ASCII tab and
 * newline characters are removed and spaces are percent-encoded, which is the
 * same normalisation the WHATWG URL parser applies.
 *
 * @param {string} u - URL to clean.
 * @returns {string} Cleaned URL; non-string input is returned unchanged.
 */
function cleanUrl(u) {
    if (typeof u !== "string") {
        return u;
    }
    return u.replace(/[\t\n\r]/g, "").replace(/ /g, "%20");
}
|
|
117
|
+
/**
 * Collapse each run of newlines (and the whitespace trailing them) in an
 * attribute value into a single newline.
 *
 * @param {string|null|undefined} attr - Attribute value.
 * @returns {string} Collapsed value, or "" when `attr` is falsy.
 */
function cleanAttribute(attr) {
    if (!attr) {
        return "";
    }
    return attr.replace(/(\n+\s*)+/g, "\n");
}
|
|
120
|
+
/**
 * Remove elements that carry no readable content (scripts, styles, embeds,
 * media, metadata) from an HTML document.
 *
 * @param {string} html - HTML source.
 * @returns {string} Inner HTML of the document's <body> after the removal.
 */
function tidyHtml(html) {
    const cheerio = require("cheerio");
    const $ = cheerio.load(html);
    const tagsToDrop = [
        "script",
        "style",
        "iframe",
        "noscript",
        "meta",
        "link",
        "object",
        "embed",
        "canvas",
        "audio",
        "video",
    ];
    for (const tagName of tagsToDrop) {
        $(tagName).remove();
    }
    const body = $("body");
    return body.html();
}
|
|
139
|
+
/**
 * Keep Markdown links that span several lines intact by turning every newline
 * inside square brackets into a backslash line break.
 *
 * Backslash escapes are honoured: an escaped bracket (`\[`, `\]`) is literal
 * text and does not open or close a link, so a stray bracket in the page text
 * cannot make every following newline look like part of a link. A newline that
 * is already preceded by a backslash is not escaped a second time.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Markdown with newlines inside `[...]` escaped.
 */
function fixBrokenLinks(md) {
    let depth = 0;
    let escaped = false;
    let result = "";
    for (const ch of md) {
        if (escaped) {
            // The character after a backslash is literal; copy it through untouched.
            escaped = false;
            result += ch;
            continue;
        }
        if (ch === "\\") {
            escaped = true;
            result += ch;
            continue;
        }
        if (ch === "[") {
            depth++;
        }
        else if (ch === "]") {
            // Clamp at zero so an unmatched "]" cannot push the depth negative.
            depth = Math.max(0, depth - 1);
        }
        result += depth > 0 && ch === "\n" ? "\\\n" : ch;
    }
    return result;
}
|
|
151
|
+
/**
 * Remove accessibility "skip" links from Markdown.
 *
 * Matches `[Skip to content](#…)` and the equally common
 * `[Skip to main content](#…)`, case-insensitively. Only links that point at an
 * in-page fragment are removed.
 *
 * @param {string} md - Markdown text.
 * @returns {string} Markdown without skip links.
 */
function stripSkipLinks(md) {
    return md.replace(/\[Skip to (?:main )?content\]\(#[^)]*\)/gi, "");
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mx-cloud",
|
|
3
|
-
"version": "0.0.27",
|
|
3
|
+
"version": "0.0.30",
|
|
4
4
|
"description": "mx cloud",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
@@ -17,9 +17,16 @@
|
|
|
17
17
|
"license": "AGPL-3.0-or-later",
|
|
18
18
|
"dependencies": {
|
|
19
19
|
"@cliqz/adblocker-playwright": "^1.31.3",
|
|
20
|
+
"cheerio": "^1.1.2",
|
|
20
21
|
"cross-fetch": "^4.0.0",
|
|
21
22
|
"joi": "^17.6.0",
|
|
23
|
+
"joplin-turndown-plugin-gfm": "^1.0.12",
|
|
22
24
|
"nodemailer": "^6.10.0",
|
|
23
|
-
"playwright-core": "^1.57.0"
|
|
25
|
+
"playwright-core": "^1.57.0",
|
|
26
|
+
"rimraf": "^6.1.2",
|
|
27
|
+
"turndown": "^7.2.0"
|
|
28
|
+
},
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@types/turndown": "^5.0.6"
|
|
24
31
|
}
|
|
25
32
|
}
|