mx-cloud 0.0.28 → 0.0.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.d.ts +1 -1
- package/build/interpret.js +329 -25
- package/build/utils/markdown.d.ts +1 -0
- package/build/utils/markdown.js +153 -0
- package/package.json +9 -2
package/build/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import Interpreter from './interpret';
|
|
2
2
|
export default Interpreter;
|
|
3
3
|
export { default as Preprocessor } from './preprocessor';
|
|
4
|
-
export type { WorkflowFile, WhereWhatPair, Where, What, } from './types/workflow';
|
|
4
|
+
export type { WorkflowFile, WhereWhatPair, Where, What, CustomFunctions, } from './types/workflow';
|
|
5
5
|
export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';
|
package/build/interpret.js
CHANGED
|
@@ -54,6 +54,7 @@ const concurrency_1 = __importDefault(require("./utils/concurrency"));
|
|
|
54
54
|
const preprocessor_1 = __importDefault(require("./preprocessor"));
|
|
55
55
|
const logger_1 = __importStar(require("./utils/logger"));
|
|
56
56
|
const selector_1 = require("./selector");
|
|
57
|
+
const markdown_1 = require("./utils/markdown");
|
|
57
58
|
/**
|
|
58
59
|
* Class for running the Smart Workflows.
|
|
59
60
|
*/
|
|
@@ -251,15 +252,19 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
251
252
|
}
|
|
252
253
|
yield page.close();
|
|
253
254
|
}),
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
255
|
+
// DEPRECATED: Old scrape action - commented out in favor of new workflow-based scrape action
|
|
256
|
+
// scrape: async (selector?: string) => {
|
|
257
|
+
// if (this.options.debugChannel?.setActionType) {
|
|
258
|
+
// this.options.debugChannel.setActionType('scrape');
|
|
259
|
+
// }
|
|
260
|
+
// await this.ensureScriptsLoaded(page);
|
|
261
|
+
// const scrapeResults: Record<string, string>[] = await page.evaluate((s) => window.scrape(s ?? null), selector);
|
|
262
|
+
// await this.callWithTimeout(
|
|
263
|
+
// () => this.options.serializableCallback(scrapeResults),
|
|
264
|
+
// 30000,
|
|
265
|
+
// 'serializableCallback (scrape)'
|
|
266
|
+
// );
|
|
267
|
+
// },
|
|
263
268
|
scrapeSchema: (schema_1, ...args_1) => __awaiter(this, [schema_1, ...args_1], void 0, function* (schema, actionName = "") {
|
|
264
269
|
var _a;
|
|
265
270
|
if (this.isAborted) {
|
|
@@ -922,14 +927,60 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
922
927
|
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
923
928
|
};
|
|
924
929
|
});
|
|
925
|
-
|
|
930
|
+
const result = {
|
|
926
931
|
metadata: Object.assign(Object.assign({}, pageData.metadata), { url: url, sourceURL: url }),
|
|
927
|
-
html: pageData.html,
|
|
928
|
-
text: pageData.text,
|
|
929
|
-
links: pageData.links,
|
|
930
|
-
wordCount: pageData.wordCount,
|
|
931
932
|
scrapedAt: new Date().toISOString()
|
|
932
933
|
};
|
|
934
|
+
const formats = crawlConfig.outputFormats || [];
|
|
935
|
+
if (formats.includes('text')) {
|
|
936
|
+
result.text = pageData.text;
|
|
937
|
+
result.wordCount = pageData.wordCount;
|
|
938
|
+
}
|
|
939
|
+
if (formats.includes('html')) {
|
|
940
|
+
result.html = pageData.html;
|
|
941
|
+
result.links = pageData.links;
|
|
942
|
+
}
|
|
943
|
+
if (formats.includes('markdown')) {
|
|
944
|
+
try {
|
|
945
|
+
const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, url);
|
|
946
|
+
result.markdown = markdown;
|
|
947
|
+
}
|
|
948
|
+
catch (err) {
|
|
949
|
+
this.log(`Markdown conversion failed for ${url}: ${err.message}`, logger_1.Level.WARN);
|
|
950
|
+
result.markdown = '';
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
if (formats.includes('screenshot-visible')) {
|
|
954
|
+
try {
|
|
955
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: false });
|
|
956
|
+
const screenshotName = `Crawl - ${crawlResults.length} - Visible`;
|
|
957
|
+
yield this.options.binaryCallback({
|
|
958
|
+
name: screenshotName,
|
|
959
|
+
data: screenshotBuffer,
|
|
960
|
+
mimeType: 'image/png'
|
|
961
|
+
}, 'image/png');
|
|
962
|
+
result.screenshotVisible = screenshotName;
|
|
963
|
+
}
|
|
964
|
+
catch (err) {
|
|
965
|
+
this.log(`Screenshot-visible failed for ${url}: ${err.message}`, logger_1.Level.WARN);
|
|
966
|
+
}
|
|
967
|
+
}
|
|
968
|
+
if (formats.includes('screenshot-fullpage')) {
|
|
969
|
+
try {
|
|
970
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: true });
|
|
971
|
+
const screenshotName = `Crawl - ${crawlResults.length} - Full Page`;
|
|
972
|
+
yield this.options.binaryCallback({
|
|
973
|
+
name: screenshotName,
|
|
974
|
+
data: screenshotBuffer,
|
|
975
|
+
mimeType: 'image/png'
|
|
976
|
+
}, 'image/png');
|
|
977
|
+
result.screenshotFullpage = screenshotName;
|
|
978
|
+
}
|
|
979
|
+
catch (err) {
|
|
980
|
+
this.log(`Screenshot-fullpage failed for ${url}: ${err.message}`, logger_1.Level.WARN);
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
return result;
|
|
933
984
|
});
|
|
934
985
|
const visitedUrls = new Set();
|
|
935
986
|
const crawlResults = [];
|
|
@@ -1036,15 +1087,28 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1036
1087
|
yield new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay));
|
|
1037
1088
|
}
|
|
1038
1089
|
yield page.goto(url, {
|
|
1039
|
-
waitUntil: '
|
|
1040
|
-
timeout:
|
|
1090
|
+
waitUntil: 'load',
|
|
1091
|
+
timeout: 60000
|
|
1041
1092
|
}).catch((err) => {
|
|
1042
1093
|
throw new Error(`Navigation failed: ${err.message}`);
|
|
1043
1094
|
});
|
|
1044
|
-
yield page.waitForLoadState('
|
|
1095
|
+
yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
|
|
1045
1096
|
const pageResult = yield scrapePageContent(url);
|
|
1046
1097
|
pageResult.metadata.depth = depth;
|
|
1047
1098
|
crawlResults.push(pageResult);
|
|
1099
|
+
const actionType = "crawl";
|
|
1100
|
+
const actionName = "Crawl Results";
|
|
1101
|
+
if (!this.serializableDataByType[actionType]) {
|
|
1102
|
+
this.serializableDataByType[actionType] = {};
|
|
1103
|
+
}
|
|
1104
|
+
this.serializableDataByType[actionType][actionName] = [...crawlResults];
|
|
1105
|
+
yield this.options.serializableCallback({
|
|
1106
|
+
crawl: this.serializableDataByType.crawl
|
|
1107
|
+
});
|
|
1108
|
+
if (this.isAborted) {
|
|
1109
|
+
this.log(`Run aborted after scraping ${url}, stopping crawl`, logger_1.Level.WARN);
|
|
1110
|
+
break;
|
|
1111
|
+
}
|
|
1048
1112
|
this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, logger_1.Level.LOG);
|
|
1049
1113
|
if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
|
|
1050
1114
|
const newLinks = yield extractLinksFromPage();
|
|
@@ -1308,6 +1372,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1308
1372
|
filters: searchConfig.filters || {},
|
|
1309
1373
|
resultsCount: searchResults.length,
|
|
1310
1374
|
results: searchResults,
|
|
1375
|
+
mode: searchConfig.mode,
|
|
1311
1376
|
searchedAt: new Date().toISOString()
|
|
1312
1377
|
};
|
|
1313
1378
|
this.serializableDataByType[actionType][actionName] = searchData;
|
|
@@ -1323,16 +1388,25 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1323
1388
|
this.log(`Starting to scrape content from ${searchResults.length} search results...`, logger_1.Level.LOG);
|
|
1324
1389
|
const scrapedResults = [];
|
|
1325
1390
|
for (let i = 0; i < searchResults.length; i++) {
|
|
1391
|
+
if (this.isAborted) {
|
|
1392
|
+
this.log(`Run aborted, stopping search scraping at result ${i + 1}/${searchResults.length}`, logger_1.Level.WARN);
|
|
1393
|
+
break;
|
|
1394
|
+
}
|
|
1326
1395
|
const result = searchResults[i];
|
|
1327
1396
|
try {
|
|
1328
1397
|
this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, logger_1.Level.LOG);
|
|
1398
|
+
let navigationFailed = false;
|
|
1329
1399
|
yield page.goto(result.url, {
|
|
1330
|
-
waitUntil: '
|
|
1331
|
-
timeout:
|
|
1400
|
+
waitUntil: 'load',
|
|
1401
|
+
timeout: 60000
|
|
1332
1402
|
}).catch(() => {
|
|
1333
1403
|
this.log(`Failed to navigate to ${result.url}, skipping...`, logger_1.Level.WARN);
|
|
1404
|
+
navigationFailed = true;
|
|
1334
1405
|
});
|
|
1335
|
-
|
|
1406
|
+
if (navigationFailed) {
|
|
1407
|
+
continue;
|
|
1408
|
+
}
|
|
1409
|
+
yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
|
|
1336
1410
|
const pageData = yield page.evaluate(() => {
|
|
1337
1411
|
var _a, _b;
|
|
1338
1412
|
const getMeta = (name) => {
|
|
@@ -1368,7 +1442,7 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1368
1442
|
metadata: Object.assign(Object.assign({}, allMetadata), { title, language: document.documentElement.lang || '', favicon: ((_b = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]')) === null || _b === void 0 ? void 0 : _b.href) || '', statusCode: 200 })
|
|
1369
1443
|
};
|
|
1370
1444
|
});
|
|
1371
|
-
|
|
1445
|
+
const scrapedResult = {
|
|
1372
1446
|
searchResult: {
|
|
1373
1447
|
query: searchConfig.query,
|
|
1374
1448
|
position: result.position,
|
|
@@ -1376,12 +1450,79 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1376
1450
|
searchDescription: result.description,
|
|
1377
1451
|
},
|
|
1378
1452
|
metadata: Object.assign(Object.assign({}, pageData.metadata), { url: result.url, sourceURL: result.url }),
|
|
1379
|
-
html: pageData.html,
|
|
1380
|
-
text: pageData.text,
|
|
1381
|
-
links: pageData.links,
|
|
1382
|
-
wordCount: pageData.wordCount,
|
|
1383
1453
|
scrapedAt: new Date().toISOString()
|
|
1454
|
+
};
|
|
1455
|
+
const formats = searchConfig.outputFormats || [];
|
|
1456
|
+
if (formats.includes('text')) {
|
|
1457
|
+
scrapedResult.text = pageData.text;
|
|
1458
|
+
scrapedResult.wordCount = pageData.wordCount;
|
|
1459
|
+
}
|
|
1460
|
+
if (formats.includes('html')) {
|
|
1461
|
+
scrapedResult.html = pageData.html;
|
|
1462
|
+
scrapedResult.links = pageData.links;
|
|
1463
|
+
}
|
|
1464
|
+
if (formats.includes('markdown')) {
|
|
1465
|
+
try {
|
|
1466
|
+
const markdown = yield (0, markdown_1.parseMarkdown)(pageData.html, result.url);
|
|
1467
|
+
scrapedResult.markdown = markdown;
|
|
1468
|
+
}
|
|
1469
|
+
catch (err) {
|
|
1470
|
+
this.log(`Markdown conversion failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
|
|
1471
|
+
scrapedResult.markdown = '';
|
|
1472
|
+
}
|
|
1473
|
+
}
|
|
1474
|
+
if (formats.includes('screenshot-visible')) {
|
|
1475
|
+
try {
|
|
1476
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: false });
|
|
1477
|
+
const screenshotName = `Search - ${i} - Visible`;
|
|
1478
|
+
yield this.options.binaryCallback({
|
|
1479
|
+
name: screenshotName,
|
|
1480
|
+
data: screenshotBuffer,
|
|
1481
|
+
mimeType: 'image/png'
|
|
1482
|
+
}, 'image/png');
|
|
1483
|
+
scrapedResult.screenshotVisible = screenshotName;
|
|
1484
|
+
}
|
|
1485
|
+
catch (err) {
|
|
1486
|
+
this.log(`Screenshot-visible failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
|
|
1487
|
+
}
|
|
1488
|
+
}
|
|
1489
|
+
if (formats.includes('screenshot-fullpage')) {
|
|
1490
|
+
try {
|
|
1491
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: true });
|
|
1492
|
+
const screenshotName = `Search - ${i} - Full Page`;
|
|
1493
|
+
yield this.options.binaryCallback({
|
|
1494
|
+
name: screenshotName,
|
|
1495
|
+
data: screenshotBuffer,
|
|
1496
|
+
mimeType: 'image/png'
|
|
1497
|
+
}, 'image/png');
|
|
1498
|
+
scrapedResult.screenshotFullpage = screenshotName;
|
|
1499
|
+
}
|
|
1500
|
+
catch (err) {
|
|
1501
|
+
this.log(`Screenshot-fullpage failed for ${result.url}: ${err.message}`, logger_1.Level.WARN);
|
|
1502
|
+
}
|
|
1503
|
+
}
|
|
1504
|
+
scrapedResults.push(scrapedResult);
|
|
1505
|
+
const actionType = "search";
|
|
1506
|
+
const actionName = "Search Results";
|
|
1507
|
+
if (!this.serializableDataByType[actionType]) {
|
|
1508
|
+
this.serializableDataByType[actionType] = {};
|
|
1509
|
+
}
|
|
1510
|
+
this.serializableDataByType[actionType][actionName] = {
|
|
1511
|
+
query: searchConfig.query,
|
|
1512
|
+
provider: searchConfig.provider,
|
|
1513
|
+
filters: searchConfig.filters || {},
|
|
1514
|
+
resultsCount: scrapedResults.length,
|
|
1515
|
+
results: scrapedResults,
|
|
1516
|
+
mode: searchConfig.mode,
|
|
1517
|
+
searchedAt: new Date().toISOString()
|
|
1518
|
+
};
|
|
1519
|
+
yield this.options.serializableCallback({
|
|
1520
|
+
search: this.serializableDataByType.search
|
|
1384
1521
|
});
|
|
1522
|
+
if (this.isAborted) {
|
|
1523
|
+
this.log(`Run aborted after scraping ${result.url}, stopping search`, logger_1.Level.WARN);
|
|
1524
|
+
break;
|
|
1525
|
+
}
|
|
1385
1526
|
this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, logger_1.Level.LOG);
|
|
1386
1527
|
}
|
|
1387
1528
|
catch (error) {
|
|
@@ -1430,6 +1571,169 @@ class Interpreter extends events_1.EventEmitter {
|
|
|
1430
1571
|
throw new Error(`Search execution error: ${error.message}`);
|
|
1431
1572
|
}
|
|
1432
1573
|
}),
|
|
1574
|
+
/**
|
|
1575
|
+
* scrape action: Converts a webpage to text, markdown, HTML, and/or screenshots.
|
|
1576
|
+
* This is the workflow action for scrape robots.
|
|
1577
|
+
*/
|
|
1578
|
+
scrape: (scrapeConfig) => __awaiter(this, void 0, void 0, function* () {
|
|
1579
|
+
var _a;
|
|
1580
|
+
if (this.isAborted) {
|
|
1581
|
+
this.log('Workflow aborted, stopping scrape', logger_1.Level.WARN);
|
|
1582
|
+
return;
|
|
1583
|
+
}
|
|
1584
|
+
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
|
|
1585
|
+
this.options.debugChannel.setActionType('scrape');
|
|
1586
|
+
}
|
|
1587
|
+
this.log(`Starting scrape for URL: ${scrapeConfig.url}`, logger_1.Level.LOG);
|
|
1588
|
+
try {
|
|
1589
|
+
const formats = scrapeConfig.formats || ['markdown', 'html', 'text'];
|
|
1590
|
+
const url = scrapeConfig.url;
|
|
1591
|
+
if (!url) {
|
|
1592
|
+
throw new Error('No URL specified for scrape action');
|
|
1593
|
+
}
|
|
1594
|
+
const currentUrl = page.url();
|
|
1595
|
+
if (currentUrl === 'about:blank' || currentUrl === '' || !currentUrl.includes(new URL(url).hostname)) {
|
|
1596
|
+
this.log(`Navigating to ${url}`, logger_1.Level.LOG);
|
|
1597
|
+
yield page.goto(url, { waitUntil: 'load', timeout: 60000 });
|
|
1598
|
+
yield page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => { });
|
|
1599
|
+
}
|
|
1600
|
+
const serializableOutput = {};
|
|
1601
|
+
const SCRAPE_TIMEOUT = 120000;
|
|
1602
|
+
if (formats.includes('text')) {
|
|
1603
|
+
try {
|
|
1604
|
+
const textPromise = page.evaluate(() => {
|
|
1605
|
+
const body = document.body;
|
|
1606
|
+
if (!body)
|
|
1607
|
+
return '';
|
|
1608
|
+
return body.innerText || body.textContent || '';
|
|
1609
|
+
});
|
|
1610
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1611
|
+
setTimeout(() => reject(new Error(`Text extraction timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
|
1612
|
+
});
|
|
1613
|
+
const text = yield Promise.race([textPromise, timeoutPromise]);
|
|
1614
|
+
if (text && text.trim().length > 0) {
|
|
1615
|
+
serializableOutput.text = [{ content: text.trim() }];
|
|
1616
|
+
this.log('Text extraction completed', logger_1.Level.LOG);
|
|
1617
|
+
}
|
|
1618
|
+
}
|
|
1619
|
+
catch (error) {
|
|
1620
|
+
this.log(`Text extraction failed: ${error.message}`, logger_1.Level.WARN);
|
|
1621
|
+
}
|
|
1622
|
+
}
|
|
1623
|
+
if (formats.includes('markdown')) {
|
|
1624
|
+
try {
|
|
1625
|
+
const html = yield page.evaluate(() => {
|
|
1626
|
+
const selectors = [
|
|
1627
|
+
"script", "style", "link[rel='stylesheet']", "noscript", "meta",
|
|
1628
|
+
"svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
|
|
1629
|
+
];
|
|
1630
|
+
selectors.forEach(sel => {
|
|
1631
|
+
document.querySelectorAll(sel).forEach(e => e.remove());
|
|
1632
|
+
});
|
|
1633
|
+
const all = document.querySelectorAll("*");
|
|
1634
|
+
all.forEach(el => {
|
|
1635
|
+
[...el.attributes].forEach(attr => {
|
|
1636
|
+
if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
|
|
1637
|
+
el.removeAttribute(attr.name);
|
|
1638
|
+
}
|
|
1639
|
+
});
|
|
1640
|
+
});
|
|
1641
|
+
return document.documentElement.outerHTML;
|
|
1642
|
+
});
|
|
1643
|
+
const markdownPromise = (0, markdown_1.parseMarkdown)(html, url);
|
|
1644
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1645
|
+
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
|
1646
|
+
});
|
|
1647
|
+
const markdown = yield Promise.race([markdownPromise, timeoutPromise]);
|
|
1648
|
+
if (markdown && markdown.trim().length > 0) {
|
|
1649
|
+
serializableOutput.markdown = [{ content: markdown }];
|
|
1650
|
+
this.log('Markdown conversion completed', logger_1.Level.LOG);
|
|
1651
|
+
}
|
|
1652
|
+
}
|
|
1653
|
+
catch (error) {
|
|
1654
|
+
this.log(`Markdown conversion failed: ${error.message}`, logger_1.Level.WARN);
|
|
1655
|
+
}
|
|
1656
|
+
}
|
|
1657
|
+
if (formats.includes('html')) {
|
|
1658
|
+
try {
|
|
1659
|
+
const htmlPromise = page.evaluate(() => {
|
|
1660
|
+
const selectors = [
|
|
1661
|
+
"script", "style", "link[rel='stylesheet']", "noscript", "meta",
|
|
1662
|
+
"svg", "img", "picture", "source", "video", "audio", "iframe", "object", "embed"
|
|
1663
|
+
];
|
|
1664
|
+
selectors.forEach(sel => {
|
|
1665
|
+
document.querySelectorAll(sel).forEach(e => e.remove());
|
|
1666
|
+
});
|
|
1667
|
+
const all = document.querySelectorAll("*");
|
|
1668
|
+
all.forEach(el => {
|
|
1669
|
+
[...el.attributes].forEach(attr => {
|
|
1670
|
+
if (attr.name.startsWith("on") || attr.name === "data-mx-id") {
|
|
1671
|
+
el.removeAttribute(attr.name);
|
|
1672
|
+
}
|
|
1673
|
+
});
|
|
1674
|
+
});
|
|
1675
|
+
return document.documentElement.outerHTML;
|
|
1676
|
+
});
|
|
1677
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1678
|
+
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT / 1000}s`)), SCRAPE_TIMEOUT);
|
|
1679
|
+
});
|
|
1680
|
+
const html = yield Promise.race([htmlPromise, timeoutPromise]);
|
|
1681
|
+
if (html && html.trim().length > 0) {
|
|
1682
|
+
serializableOutput.html = [{ content: html }];
|
|
1683
|
+
this.log('HTML conversion completed', logger_1.Level.LOG);
|
|
1684
|
+
}
|
|
1685
|
+
}
|
|
1686
|
+
catch (error) {
|
|
1687
|
+
this.log(`HTML conversion failed: ${error.message}`, logger_1.Level.WARN);
|
|
1688
|
+
}
|
|
1689
|
+
}
|
|
1690
|
+
if (formats.includes('screenshot-visible')) {
|
|
1691
|
+
try {
|
|
1692
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: false, type: 'png' });
|
|
1693
|
+
if (screenshotBuffer && screenshotBuffer.length > 0) {
|
|
1694
|
+
yield this.options.binaryCallback({
|
|
1695
|
+
name: 'screenshot-visible',
|
|
1696
|
+
data: screenshotBuffer,
|
|
1697
|
+
mimeType: 'image/png'
|
|
1698
|
+
}, 'image/png');
|
|
1699
|
+
this.log('Visible screenshot captured', logger_1.Level.LOG);
|
|
1700
|
+
}
|
|
1701
|
+
}
|
|
1702
|
+
catch (error) {
|
|
1703
|
+
this.log(`Screenshot-visible failed: ${error.message}`, logger_1.Level.WARN);
|
|
1704
|
+
}
|
|
1705
|
+
}
|
|
1706
|
+
if (formats.includes('screenshot-fullpage')) {
|
|
1707
|
+
try {
|
|
1708
|
+
const screenshotBuffer = yield page.screenshot({ fullPage: true, type: 'png' });
|
|
1709
|
+
if (screenshotBuffer && screenshotBuffer.length > 0) {
|
|
1710
|
+
yield this.options.binaryCallback({
|
|
1711
|
+
name: 'screenshot-fullpage',
|
|
1712
|
+
data: screenshotBuffer,
|
|
1713
|
+
mimeType: 'image/png'
|
|
1714
|
+
}, 'image/png');
|
|
1715
|
+
this.log('Full page screenshot captured', logger_1.Level.LOG);
|
|
1716
|
+
}
|
|
1717
|
+
}
|
|
1718
|
+
catch (error) {
|
|
1719
|
+
this.log(`Screenshot-fullpage failed: ${error.message}`, logger_1.Level.WARN);
|
|
1720
|
+
}
|
|
1721
|
+
}
|
|
1722
|
+
const hasSerializableOutput = Object.keys(serializableOutput).length > 0 &&
|
|
1723
|
+
Object.values(serializableOutput).some((arr) => Array.isArray(arr) && arr.length > 0);
|
|
1724
|
+
if (hasSerializableOutput) {
|
|
1725
|
+
yield this.options.serializableCallback({ scrape: serializableOutput });
|
|
1726
|
+
this.log(`scrape completed successfully for ${url}`, logger_1.Level.LOG);
|
|
1727
|
+
}
|
|
1728
|
+
else {
|
|
1729
|
+
this.log(`scrape completed but no content could be extracted from ${url}`, logger_1.Level.WARN);
|
|
1730
|
+
}
|
|
1731
|
+
}
|
|
1732
|
+
catch (error) {
|
|
1733
|
+
this.log(`scrape action failed: ${error.message}`, logger_1.Level.ERROR);
|
|
1734
|
+
throw new Error(`scrape execution error: ${error.message}`);
|
|
1735
|
+
}
|
|
1736
|
+
}),
|
|
1433
1737
|
};
|
|
1434
1738
|
const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
|
|
1435
1739
|
console.log("Executing action:", methodName, args);
|
package/build/utils/markdown.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function parseMarkdown(html: string | null | undefined, baseUrl?: string | null): Promise<string>;
|
package/build/utils/markdown.js
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.parseMarkdown = parseMarkdown;
|
|
13
|
+
function parseMarkdown(html, baseUrl) {
|
|
14
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
15
|
+
const TurndownService = require("turndown");
|
|
16
|
+
const { gfm } = require("joplin-turndown-plugin-gfm");
|
|
17
|
+
const cheerio = require("cheerio");
|
|
18
|
+
const { URL } = require("url");
|
|
19
|
+
if (!html)
|
|
20
|
+
return "";
|
|
21
|
+
const tidiedHtml = tidyHtml(html);
|
|
22
|
+
const t = new TurndownService({
|
|
23
|
+
headingStyle: "atx", // ensures #### instead of ------
|
|
24
|
+
codeBlockStyle: "fenced",
|
|
25
|
+
});
|
|
26
|
+
// ---------------------------------------------
|
|
27
|
+
// Proper ATX headings #### instead of underline-style
|
|
28
|
+
// ---------------------------------------------
|
|
29
|
+
t.addRule("forceAtxHeadings", {
|
|
30
|
+
filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
|
|
31
|
+
replacement: (content, node) => {
|
|
32
|
+
const level = Number(node.nodeName.charAt(1));
|
|
33
|
+
const clean = content.trim();
|
|
34
|
+
return `\n${"#".repeat(level)} ${clean}\n`;
|
|
35
|
+
},
|
|
36
|
+
});
|
|
37
|
+
// ---------------------------------------------
|
|
38
|
+
// Remove SVGs
|
|
39
|
+
// ---------------------------------------------
|
|
40
|
+
t.addRule("truncate-svg", {
|
|
41
|
+
filter: "svg",
|
|
42
|
+
replacement: () => "",
|
|
43
|
+
});
|
|
44
|
+
// ---------------------------------------------
|
|
45
|
+
// Improved paragraph cleanup
|
|
46
|
+
// ---------------------------------------------
|
|
47
|
+
t.addRule("improved-paragraph", {
|
|
48
|
+
filter: "p",
|
|
49
|
+
replacement: (innerText) => {
|
|
50
|
+
const trimmed = innerText.trim();
|
|
51
|
+
if (!trimmed)
|
|
52
|
+
return "";
|
|
53
|
+
return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`;
|
|
54
|
+
},
|
|
55
|
+
});
|
|
56
|
+
// ---------------------------------------------
|
|
57
|
+
// Inline link with fallback text
|
|
58
|
+
// ---------------------------------------------
|
|
59
|
+
t.addRule("inlineLink", {
|
|
60
|
+
filter: (node, opts) => node.nodeName === "A" && node.getAttribute("href"),
|
|
61
|
+
replacement: (content, node) => {
|
|
62
|
+
var _a, _b;
|
|
63
|
+
let text = content.trim();
|
|
64
|
+
// Fallback: aria-label → title → domain
|
|
65
|
+
if (!text) {
|
|
66
|
+
text =
|
|
67
|
+
((_a = node.getAttribute("aria-label")) === null || _a === void 0 ? void 0 : _a.trim()) ||
|
|
68
|
+
((_b = node.getAttribute("title")) === null || _b === void 0 ? void 0 : _b.trim()) ||
|
|
69
|
+
getDomainFromUrl(node.getAttribute("href")) ||
|
|
70
|
+
"link";
|
|
71
|
+
}
|
|
72
|
+
let href = node.getAttribute("href").trim();
|
|
73
|
+
// relative → absolute
|
|
74
|
+
if (baseUrl && isRelativeUrl(href)) {
|
|
75
|
+
try {
|
|
76
|
+
const u = new URL(href, baseUrl);
|
|
77
|
+
href = u.toString();
|
|
78
|
+
}
|
|
79
|
+
catch (_c) { }
|
|
80
|
+
}
|
|
81
|
+
href = cleanUrl(href);
|
|
82
|
+
return `[${text}](${href})`;
|
|
83
|
+
},
|
|
84
|
+
});
|
|
85
|
+
t.use(gfm);
|
|
86
|
+
// Convert HTML → Markdown
|
|
87
|
+
try {
|
|
88
|
+
let out = yield t.turndown(tidiedHtml);
|
|
89
|
+
out = fixBrokenLinks(out);
|
|
90
|
+
out = stripSkipLinks(out);
|
|
91
|
+
return out.trim();
|
|
92
|
+
}
|
|
93
|
+
catch (err) {
|
|
94
|
+
console.error("HTML→Markdown failed", { err });
|
|
95
|
+
return "";
|
|
96
|
+
}
|
|
97
|
+
});
|
|
98
|
+
}
|
|
99
|
+
// -----------------------------------------------------
|
|
100
|
+
// Helpers
|
|
101
|
+
// -----------------------------------------------------
|
|
102
|
+
function isRelativeUrl(url) {
|
|
103
|
+
return !url.includes("://") && !url.startsWith("mailto:") && !url.startsWith("tel:");
|
|
104
|
+
}
|
|
105
|
+
function getDomainFromUrl(url) {
|
|
106
|
+
try {
|
|
107
|
+
const u = new URL(url);
|
|
108
|
+
return u.hostname.replace("www.", "");
|
|
109
|
+
}
|
|
110
|
+
catch (_a) {
|
|
111
|
+
return null;
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
function cleanUrl(u) {
|
|
115
|
+
return u;
|
|
116
|
+
}
|
|
117
|
+
function cleanAttribute(attr) {
|
|
118
|
+
return attr ? attr.replace(/(\n+\s*)+/g, "\n") : "";
|
|
119
|
+
}
|
|
120
|
+
function tidyHtml(html) {
|
|
121
|
+
const cheerio = require("cheerio");
|
|
122
|
+
const $ = cheerio.load(html);
|
|
123
|
+
const manuallyCleanedElements = [
|
|
124
|
+
"script",
|
|
125
|
+
"style",
|
|
126
|
+
"iframe",
|
|
127
|
+
"noscript",
|
|
128
|
+
"meta",
|
|
129
|
+
"link",
|
|
130
|
+
"object",
|
|
131
|
+
"embed",
|
|
132
|
+
"canvas",
|
|
133
|
+
"audio",
|
|
134
|
+
"video",
|
|
135
|
+
];
|
|
136
|
+
manuallyCleanedElements.forEach((tag) => $(tag).remove());
|
|
137
|
+
return $("body").html();
|
|
138
|
+
}
|
|
139
|
+
function fixBrokenLinks(md) {
|
|
140
|
+
let depth = 0;
|
|
141
|
+
let result = "";
|
|
142
|
+
for (const ch of md) {
|
|
143
|
+
if (ch === "[")
|
|
144
|
+
depth++;
|
|
145
|
+
if (ch === "]")
|
|
146
|
+
depth = Math.max(0, depth - 1);
|
|
147
|
+
result += depth > 0 && ch === "\n" ? "\\\n" : ch;
|
|
148
|
+
}
|
|
149
|
+
return result;
|
|
150
|
+
}
|
|
151
|
+
function stripSkipLinks(md) {
|
|
152
|
+
return md.replace(/\[Skip to Content\]\(#[^\)]*\)/gi, "");
|
|
153
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mx-cloud",
|
|
3
|
-
"version": "0.0.28",
|
|
3
|
+
"version": "0.0.31",
|
|
4
4
|
"description": "mx cloud",
|
|
5
5
|
"main": "build/index.js",
|
|
6
6
|
"typings": "build/index.d.ts",
|
|
@@ -17,9 +17,16 @@
|
|
|
17
17
|
"license": "AGPL-3.0-or-later",
|
|
18
18
|
"dependencies": {
|
|
19
19
|
"@cliqz/adblocker-playwright": "^1.31.3",
|
|
20
|
+
"cheerio": "^1.1.2",
|
|
20
21
|
"cross-fetch": "^4.0.0",
|
|
21
22
|
"joi": "^17.6.0",
|
|
23
|
+
"joplin-turndown-plugin-gfm": "^1.0.12",
|
|
22
24
|
"nodemailer": "^6.10.0",
|
|
23
|
-
"playwright-core": "^1.57.0"
|
|
25
|
+
"playwright-core": "^1.57.0",
|
|
26
|
+
"rimraf": "^6.1.2",
|
|
27
|
+
"turndown": "^7.2.0"
|
|
28
|
+
},
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@types/turndown": "^5.0.6"
|
|
24
31
|
}
|
|
25
32
|
}
|