gologin-web-access 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/LICENSE +21 -0
- package/README.md +344 -0
- package/dist/cli.js +173 -0
- package/dist/commands/back.js +13 -0
- package/dist/commands/batch.js +81 -0
- package/dist/commands/batchChangeTrack.js +99 -0
- package/dist/commands/batchExtract.js +97 -0
- package/dist/commands/batchScrape.js +140 -0
- package/dist/commands/changeTrack.js +65 -0
- package/dist/commands/check.js +14 -0
- package/dist/commands/click.js +14 -0
- package/dist/commands/close.js +19 -0
- package/dist/commands/configInit.js +77 -0
- package/dist/commands/configShow.js +23 -0
- package/dist/commands/cookies.js +22 -0
- package/dist/commands/cookiesClear.js +13 -0
- package/dist/commands/cookiesImport.js +14 -0
- package/dist/commands/crawl.js +71 -0
- package/dist/commands/crawlErrors.js +20 -0
- package/dist/commands/crawlResult.js +27 -0
- package/dist/commands/crawlStart.js +56 -0
- package/dist/commands/crawlStatus.js +25 -0
- package/dist/commands/current.js +14 -0
- package/dist/commands/dblclick.js +14 -0
- package/dist/commands/eval.js +20 -0
- package/dist/commands/extract.js +44 -0
- package/dist/commands/fill.js +15 -0
- package/dist/commands/find.js +16 -0
- package/dist/commands/focus.js +14 -0
- package/dist/commands/forward.js +13 -0
- package/dist/commands/get.js +15 -0
- package/dist/commands/hover.js +14 -0
- package/dist/commands/jobs.js +47 -0
- package/dist/commands/map.js +61 -0
- package/dist/commands/open.js +22 -0
- package/dist/commands/parseDocument.js +34 -0
- package/dist/commands/pdf.js +14 -0
- package/dist/commands/press.js +15 -0
- package/dist/commands/read.js +51 -0
- package/dist/commands/reload.js +13 -0
- package/dist/commands/run.js +76 -0
- package/dist/commands/scrape.js +19 -0
- package/dist/commands/scrapeJson.js +24 -0
- package/dist/commands/scrapeMarkdown.js +37 -0
- package/dist/commands/scrapeScreenshot.js +65 -0
- package/dist/commands/scrapeText.js +37 -0
- package/dist/commands/screenshot.js +23 -0
- package/dist/commands/scroll.js +23 -0
- package/dist/commands/scrollIntoView.js +14 -0
- package/dist/commands/search.js +39 -0
- package/dist/commands/searchBrowser.js +28 -0
- package/dist/commands/select.js +15 -0
- package/dist/commands/sessions.js +14 -0
- package/dist/commands/shared.js +102 -0
- package/dist/commands/snapshot.js +18 -0
- package/dist/commands/storageClear.js +18 -0
- package/dist/commands/storageExport.js +26 -0
- package/dist/commands/storageImport.js +23 -0
- package/dist/commands/tabClose.js +18 -0
- package/dist/commands/tabFocus.js +15 -0
- package/dist/commands/tabOpen.js +19 -0
- package/dist/commands/tabs.js +13 -0
- package/dist/commands/type.js +15 -0
- package/dist/commands/uncheck.js +14 -0
- package/dist/commands/upload.js +15 -0
- package/dist/commands/wait.js +27 -0
- package/dist/config.js +260 -0
- package/dist/doctor.js +86 -0
- package/dist/internal-agent/cli.js +336 -0
- package/dist/internal-agent/commands/back.js +12 -0
- package/dist/internal-agent/commands/check.js +17 -0
- package/dist/internal-agent/commands/click.js +17 -0
- package/dist/internal-agent/commands/close.js +12 -0
- package/dist/internal-agent/commands/cookies.js +23 -0
- package/dist/internal-agent/commands/cookiesClear.js +12 -0
- package/dist/internal-agent/commands/cookiesImport.js +18 -0
- package/dist/internal-agent/commands/current.js +9 -0
- package/dist/internal-agent/commands/dblclick.js +17 -0
- package/dist/internal-agent/commands/doctor.js +53 -0
- package/dist/internal-agent/commands/eval.js +30 -0
- package/dist/internal-agent/commands/fill.js +18 -0
- package/dist/internal-agent/commands/find.js +86 -0
- package/dist/internal-agent/commands/focus.js +17 -0
- package/dist/internal-agent/commands/forward.js +12 -0
- package/dist/internal-agent/commands/get.js +19 -0
- package/dist/internal-agent/commands/hover.js +17 -0
- package/dist/internal-agent/commands/open.js +67 -0
- package/dist/internal-agent/commands/pdf.js +18 -0
- package/dist/internal-agent/commands/press.js +19 -0
- package/dist/internal-agent/commands/reload.js +12 -0
- package/dist/internal-agent/commands/screenshot.js +22 -0
- package/dist/internal-agent/commands/scroll.js +25 -0
- package/dist/internal-agent/commands/scrollIntoView.js +17 -0
- package/dist/internal-agent/commands/select.js +18 -0
- package/dist/internal-agent/commands/sessions.js +15 -0
- package/dist/internal-agent/commands/shared.js +51 -0
- package/dist/internal-agent/commands/snapshot.js +16 -0
- package/dist/internal-agent/commands/storageClear.js +13 -0
- package/dist/internal-agent/commands/storageExport.js +24 -0
- package/dist/internal-agent/commands/storageImport.js +20 -0
- package/dist/internal-agent/commands/tabClose.js +21 -0
- package/dist/internal-agent/commands/tabFocus.js +21 -0
- package/dist/internal-agent/commands/tabOpen.js +13 -0
- package/dist/internal-agent/commands/tabs.js +17 -0
- package/dist/internal-agent/commands/type.js +18 -0
- package/dist/internal-agent/commands/uncheck.js +17 -0
- package/dist/internal-agent/commands/upload.js +18 -0
- package/dist/internal-agent/commands/wait.js +41 -0
- package/dist/internal-agent/daemon/browser.js +818 -0
- package/dist/internal-agent/daemon/refStore.js +26 -0
- package/dist/internal-agent/daemon/server.js +330 -0
- package/dist/internal-agent/daemon/sessionManager.js +684 -0
- package/dist/internal-agent/daemon/snapshot.js +285 -0
- package/dist/internal-agent/lib/config.js +59 -0
- package/dist/internal-agent/lib/daemon.js +300 -0
- package/dist/internal-agent/lib/errors.js +63 -0
- package/dist/internal-agent/lib/types.js +2 -0
- package/dist/internal-agent/lib/utils.js +165 -0
- package/dist/jobRunner.js +56 -0
- package/dist/lib/agentCli.js +158 -0
- package/dist/lib/browserRead.js +125 -0
- package/dist/lib/browserStructured.js +77 -0
- package/dist/lib/changeTracking.js +117 -0
- package/dist/lib/cloudApi.js +41 -0
- package/dist/lib/concurrency.js +15 -0
- package/dist/lib/crawl.js +313 -0
- package/dist/lib/document.js +170 -0
- package/dist/lib/errors.js +55 -0
- package/dist/lib/extract.js +65 -0
- package/dist/lib/extractRunner.js +22 -0
- package/dist/lib/jobRegistry.js +164 -0
- package/dist/lib/output.js +122 -0
- package/dist/lib/readSource.js +297 -0
- package/dist/lib/runbooks.js +193 -0
- package/dist/lib/search.js +727 -0
- package/dist/lib/selfCli.js +136 -0
- package/dist/lib/structuredScrape.js +83 -0
- package/dist/lib/types.js +2 -0
- package/dist/lib/unlocker.js +383 -0
- package/package.json +67 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.buildTrackingKey = buildTrackingKey;
|
|
7
|
+
exports.compareAndPersistSnapshot = compareAndPersistSnapshot;
|
|
8
|
+
exports.scrapeForTracking = scrapeForTracking;
|
|
9
|
+
exports.normalizeTrackingFormat = normalizeTrackingFormat;
|
|
10
|
+
const crypto_1 = require("crypto");
|
|
11
|
+
const fs_1 = require("fs");
|
|
12
|
+
const path_1 = __importDefault(require("path"));
|
|
13
|
+
const diff_1 = require("diff");
|
|
14
|
+
const unlocker_1 = require("./unlocker");
|
|
15
|
+
/**
 * Derives the key under which a tracked snapshot is stored.
 *
 * The explicit key is used when it is neither null nor undefined; otherwise
 * the URL is used. The chosen value is lower-cased, every run of characters
 * outside [a-z0-9] becomes a single "-", leading/trailing dashes are removed,
 * and the result is cut to at most 120 characters.
 *
 * @param {string} url - URL being tracked.
 * @param {string} [explicitKey] - Caller-supplied key that overrides the URL.
 * @returns {string} The slug-style key (may be empty).
 */
function buildTrackingKey(url, explicitKey) {
    const seed = explicitKey ?? url;
    const dashed = seed.toLowerCase().replace(/[^a-z0-9]+/g, "-");
    const trimmed = dashed.replace(/^-+|-+$/g, "");
    // The cut happens after trimming, so a key may still end in "-" at the limit.
    return trimmed.slice(0, 120);
}
|
|
22
|
+
/**
 * Stores a new snapshot and reports how it relates to the one stored before.
 *
 * The tracking directory is created if needed. The previously stored snapshot
 * (if any) is read first, then the new snapshot - extended with a SHA-256
 * `hash` of its content and an ISO `updatedAt` timestamp - is written as
 * pretty-printed JSON.
 *
 * @param {{trackingDir: string}} config - Configuration holding the tracking directory.
 * @param {{key: string, url: string, format: string, content: string}} snapshot - Freshly scraped snapshot.
 * @returns {Promise<object>} A report with `status` "new", "same" or "changed";
 *   "same"/"changed" also carry `previousHash`, and "changed" carries a `diff` patch.
 */
async function compareAndPersistSnapshot(config, snapshot) {
    await fs_1.promises.mkdir(config.trackingDir, { recursive: true });
    const next = {
        ...snapshot,
        hash: sha256(snapshot.content),
        updatedAt: new Date().toISOString()
    };
    // Read the old snapshot before overwriting its file.
    const previous = await readTrackedSnapshot(config, snapshot.key);
    const serialized = `${JSON.stringify(next, null, 2)}\n`;
    await fs_1.promises.writeFile(trackingPath(config, snapshot.key), serialized, "utf8");

    const identity = { key: next.key, url: next.url, format: next.format };
    const current = { currentHash: next.hash, updatedAt: next.updatedAt };
    if (!previous) {
        return { ...identity, status: "new", ...current };
    }
    const unchanged = previous.hash === next.hash;
    const report = {
        ...identity,
        status: unchanged ? "same" : "changed",
        previousHash: previous.hash,
        ...current
    };
    if (!unchanged) {
        report.diff = (0, diff_1.createPatch)(next.key, previous.content, next.content, previous.updatedAt, next.updatedAt);
    }
    return report;
}
|
|
63
|
+
/**
 * Scrapes a URL in the requested format and returns the content to track.
 *
 * "html", "text" and "json" use the matching unlocker scraper; "markdown" and
 * any other value use the markdown scraper. For "json" the structured data is
 * serialized with two-space indentation and its `title` (if not nullish) is
 * exposed alongside the content.
 *
 * @param {string} url - Page to scrape.
 * @param {string} apiKey - Key forwarded to the unlocker scraper.
 * @param {string} format - One of "html", "text", "json", "markdown".
 * @param {object} [options={}] - Options forwarded unchanged to the scraper.
 * @returns {Promise<{content: string, request: *, title?: string}>}
 */
async function scrapeForTracking(url, apiKey, format, options = {}) {
    if (format === "html") {
        const rendered = await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, options);
        return { content: rendered.content, request: rendered.request };
    }
    if (format === "text") {
        const plain = await (0, unlocker_1.scrapeText)(url, apiKey, options);
        return { content: plain.text, request: plain.request };
    }
    if (format === "json") {
        const structured = await (0, unlocker_1.scrapeJson)(url, apiKey, options);
        return {
            title: structured.data.title ?? undefined,
            content: JSON.stringify(structured.data, null, 2),
            request: structured.request,
        };
    }
    // "markdown" and every unrecognized format end up here.
    const rendered = await (0, unlocker_1.scrapeMarkdown)(url, apiKey, options);
    return { content: rendered.markdown, request: rendered.request };
}
|
|
97
|
+
/**
 * Validates a change-tracking format name.
 *
 * @param {*} value - Candidate format.
 * @returns {string} The value itself when it is "html", "markdown", "text" or "json".
 * @throws {Error} When the value is anything else.
 */
function normalizeTrackingFormat(value) {
    const supported = ["html", "markdown", "text", "json"];
    if (!supported.includes(value)) {
        throw new Error(`Unsupported change-track format: ${value}`);
    }
    return value;
}
|
|
103
|
+
/**
 * Builds the path of the JSON file that stores the snapshot for a key.
 *
 * @param {{trackingDir: string}} config - Configuration holding the tracking directory.
 * @param {string} key - Tracking key; used verbatim as the file's base name.
 * @returns {string} `<trackingDir>/<key>.json`, joined with the platform separator.
 */
function trackingPath(config, key) {
    const fileName = `${key}.json`;
    return path_1.default.join(config.trackingDir, fileName);
}
|
|
106
|
+
/**
 * Loads the stored snapshot for a key.
 *
 * @param {{trackingDir: string}} config - Configuration holding the tracking directory.
 * @param {string} key - Tracking key.
 * @returns {Promise<object|undefined>} The parsed snapshot, or undefined when
 *   the file cannot be read or parsed (any error is treated as "no snapshot").
 */
async function readTrackedSnapshot(config, key) {
    let snapshot;
    try {
        const serialized = await fs_1.promises.readFile(trackingPath(config, key), "utf8");
        snapshot = JSON.parse(serialized);
    }
    catch {
        // Best effort: a missing or corrupt file simply means nothing is tracked yet.
        return undefined;
    }
    return snapshot;
}
|
|
115
|
+
/**
 * Computes the SHA-256 digest of a value.
 *
 * @param {string|Buffer} value - Data to hash.
 * @returns {string} The digest as a lower-case hexadecimal string.
 */
function sha256(value) {
    const hasher = (0, crypto_1.createHash)("sha256");
    hasher.update(value);
    return hasher.digest("hex");
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.validateCloudToken = validateCloudToken;
|
|
4
|
+
exports.getProfile = getProfile;
|
|
5
|
+
const errors_1 = require("./errors");
|
|
6
|
+
const GOLOGIN_API_BASE_URL = "https://api.gologin.com";
|
|
7
|
+
/**
 * Checks a cloud token by issuing an authenticated request to the
 * `/browser/v2` endpoint of the Gologin API.
 *
 * @param {string} token - Bearer token to validate.
 * @returns {Promise<{ok: true}|{ok: false, status: number, detail: string}>}
 *   On a non-OK response, `detail` is the first 300 characters of the body,
 *   or `status <code>` when the body is empty.
 */
async function validateCloudToken(token) {
    const endpoint = `${GOLOGIN_API_BASE_URL}/browser/v2`;
    const response = await fetch(endpoint, {
        headers: { Authorization: `Bearer ${token}` },
    });
    if (!response.ok) {
        const text = await response.text();
        return {
            ok: false,
            status: response.status,
            detail: text.slice(0, 300) || `status ${response.status}`,
        };
    }
    return { ok: true };
}
|
|
23
|
+
/**
 * Looks up a browser profile through the Gologin API.
 *
 * @param {string} profileId - Profile identifier; URL-encoded into the path.
 * @param {string} token - Bearer token.
 * @returns {Promise<{id: string, name: *}|null>} The profile's id (falling back
 *   to `profileId` when the response has none) and name, or null on HTTP 404.
 * @throws {HttpError} For any other non-OK response; carries the status and
 *   the first 300 characters of the body.
 */
async function getProfile(profileId, token) {
    const endpoint = `${GOLOGIN_API_BASE_URL}/browser/${encodeURIComponent(profileId)}`;
    const response = await fetch(endpoint, {
        headers: { Authorization: `Bearer ${token}` },
    });
    if (response.status === 404) {
        return null;
    }
    if (response.ok) {
        const profile = (await response.json());
        return { id: profile.id ?? profileId, name: profile.name };
    }
    const text = await response.text();
    throw new errors_1.HttpError(`Gologin profile lookup failed with status ${response.status}.`, response.status, text.slice(0, 300));
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.mapWithConcurrency = mapWithConcurrency;
|
|
4
|
+
/**
 * Maps `items` through an async `mapper`, running at most `concurrency`
 * mapper calls at a time, and returns the results in input order.
 *
 * The requested concurrency is clamped to at least one worker, so a value of
 * 0, a negative number, NaN or undefined still processes every item (one at a
 * time) instead of returning an array of holes.
 *
 * @param {Array} items - Values to map.
 * @param {number} concurrency - Maximum number of mapper calls in flight.
 * @param {(item: *, index: number) => Promise<*>} mapper - Async transform.
 * @returns {Promise<Array>} Results, where `results[i]` belongs to `items[i]`.
 *   Rejects with the first mapper error.
 */
async function mapWithConcurrency(items, concurrency, mapper) {
    const results = new Array(items.length);
    const requested = Math.floor(Number(concurrency));
    // NaN (including undefined) and values below 1 fall back to a single worker.
    const workerCount = Math.min(Number.isNaN(requested) ? 1 : Math.max(1, requested), items.length);
    let nextIndex = 0;
    await Promise.all(Array.from({ length: workerCount }, async () => {
        // Claiming an index is synchronous, so workers never process the same item.
        while (nextIndex < items.length) {
            const currentIndex = nextIndex;
            nextIndex += 1;
            results[currentIndex] = await mapper(items[currentIndex], currentIndex);
        }
    }));
    return results;
}
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.mapSite = mapSite;
|
|
4
|
+
exports.crawlSite = crawlSite;
|
|
5
|
+
exports.resolveTraversalStatus = resolveTraversalStatus;
|
|
6
|
+
const readSource_1 = require("./readSource");
|
|
7
|
+
const unlocker_1 = require("./unlocker");
|
|
8
|
+
/**
 * Traverses a site and returns a link map without page content.
 *
 * @param {string} rootUrl - URL where the traversal starts.
 * @param {string} apiKey - Key forwarded to the scraper.
 * @param {object} options - Traversal options; `limit`, `maxDepth` and
 *   `includeSubdomains` are echoed in the result.
 * @returns {Promise<object>} Overall `status`, counters, and one entry per
 *   visited page: metadata plus links on success, `error` plus links on failure.
 */
async function mapSite(rootUrl, apiKey, options) {
    const traversal = await traverseSite(rootUrl, apiKey, options);
    const visited = traversal.pages.length;
    const failed = traversal.pages.filter((page) => !page.ok).length;
    const describePage = (page) => {
        if (!page.ok) {
            return {
                url: page.url,
                depth: page.depth,
                ok: false,
                links: page.links,
                error: page.error,
            };
        }
        return {
            url: page.url,
            depth: page.depth,
            ok: true,
            title: page.data.title ?? null,
            description: page.data.description ?? null,
            canonical: page.data.canonical ?? null,
            links: page.links,
        };
    };
    return {
        status: resolveTraversalStatus(visited, failed),
        rootUrl: traversal.rootUrl,
        visited,
        failed,
        limit: options.limit,
        maxDepth: options.maxDepth,
        includeSubdomains: options.includeSubdomains,
        pages: traversal.pages.map((page) => describePage(page)),
    };
}
|
|
38
|
+
/**
 * Traverses a site and returns each page's content in the requested format.
 *
 * @param {string} rootUrl - URL where the traversal starts.
 * @param {string} apiKey - Key forwarded to the scraper.
 * @param {string} format - Key into each page's `outputByFormat`.
 * @param {object} options - Traversal options; `limit`, `maxDepth` and
 *   `includeSubdomains` are echoed in the result.
 * @returns {Promise<object>} Overall `status`, counters, and one entry per
 *   visited page: metadata, links and `output` on success, `error` on failure.
 */
async function crawlSite(rootUrl, apiKey, format, options) {
    const traversal = await traverseSite(rootUrl, apiKey, options);
    const visited = traversal.pages.length;
    const failed = traversal.pages.filter((page) => !page.ok).length;
    const describePage = (page) => {
        if (!page.ok) {
            return {
                url: page.url,
                depth: page.depth,
                ok: false,
                links: page.links,
                error: page.error,
            };
        }
        return {
            url: page.url,
            depth: page.depth,
            ok: true,
            title: page.data.title ?? null,
            description: page.data.description ?? null,
            canonical: page.data.canonical ?? null,
            links: page.links,
            format,
            output: page.outputByFormat[format],
        };
    };
    return {
        status: resolveTraversalStatus(visited, failed),
        rootUrl: traversal.rootUrl,
        format,
        visited,
        failed,
        limit: options.limit,
        maxDepth: options.maxDepth,
        includeSubdomains: options.includeSubdomains,
        pages: traversal.pages.map((page) => describePage(page)),
    };
}
|
|
71
|
+
/**
 * Summarizes a traversal from its page counters.
 *
 * @param {number} visited - Number of pages attempted.
 * @param {number} failed - Number of those that failed.
 * @returns {"failed"|"partial"|"ok"} "failed" when nothing was visited or
 *   nothing succeeded, "partial" when some pages failed, otherwise "ok".
 */
function resolveTraversalStatus(visited, failed) {
    const nothingSucceeded = visited === 0 || failed >= visited;
    if (nothingSucceeded) {
        return "failed";
    }
    return failed > 0 ? "partial" : "ok";
}
|
|
80
|
+
/**
 * Breadth-first traversal of a site, one depth level at a time.
 *
 * The first level is the initial frontier (root and/or sitemap URLs). Each
 * level is scraped with bounded concurrency; a page that throws is recorded
 * as a failed entry with the error message and no links. Links of successful
 * pages that have not been seen yet form the next level.
 *
 * @param {string} rootUrl - URL where the traversal starts.
 * @param {string} apiKey - Key forwarded to the scraper.
 * @param {object} options - Uses `maxDepth`, `limit`, `concurrency`,
 *   `includeSubdomains` and whatever the helpers it calls read.
 * @returns {Promise<{rootUrl: string, pages: object[]}>} The normalized root
 *   URL and the page results in visit order.
 */
async function traverseSite(rootUrl, apiKey, options) {
    const normalizedRootUrl = normalizeVisitUrl(rootUrl);
    const scope = createScope(normalizedRootUrl, options.includeSubdomains);
    let frontier = await buildInitialFrontier(normalizedRootUrl, apiKey, scope, options);
    // Every URL ever queued, so no page is scheduled twice.
    const seen = new Set(frontier);
    const pages = [];
    // Scrapes one URL and turns success or failure into a page record.
    const visitAt = (depth) => async (url) => {
        try {
            const scraped = await scrapePage(url, apiKey, scope, options);
            return {
                url,
                depth,
                ok: true,
                data: scraped.data,
                links: scraped.links,
                outputByFormat: scraped.outputByFormat,
            };
        }
        catch (error) {
            return {
                url,
                depth,
                ok: false,
                error: error instanceof Error ? error.message : "Unknown error",
                links: [],
            };
        }
    };
    let depth = 0;
    while (depth <= options.maxDepth && frontier.length > 0 && pages.length < options.limit) {
        // Never scrape more pages than the overall limit still allows.
        const batch = frontier.slice(0, options.limit - pages.length);
        const outcomes = await mapWithConcurrency(batch, options.concurrency, visitAt(depth));
        pages.push(...outcomes);
        if (depth === options.maxDepth || pages.length >= options.limit) {
            break;
        }
        const discovered = [];
        for (const outcome of outcomes) {
            if (!outcome.ok) {
                continue;
            }
            for (const link of outcome.links) {
                if (seen.has(link)) {
                    continue;
                }
                // The limit also caps how many URLs are ever queued.
                if (seen.size >= options.limit) {
                    break;
                }
                seen.add(link);
                discovered.push(link);
            }
        }
        frontier = discovered;
        depth += 1;
    }
    return {
        rootUrl: normalizedRootUrl,
        pages,
    };
}
|
|
139
|
+
/**
 * Scrapes one page and prepares its output in every supported format.
 *
 * When `options.onlyMainContent` is set and a readable segment is extracted,
 * the html, markdown and text outputs come from that segment; otherwise they
 * come from the full rendered HTML. The json output is always the structured
 * data of the full page.
 *
 * @param {string} url - Page to scrape.
 * @param {string} apiKey - Key forwarded to the scraper.
 * @param {object} scope - Crawl scope used to filter the page's links.
 * @param {object} options - Crawl options (`onlyMainContent`, link filters).
 * @returns {Promise<{url: string, data: object, links: string[], outputByFormat: object}>}
 */
async function scrapePage(url, apiKey, scope, options) {
    const scraped = await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey);
    const fullHtml = scraped.content;
    const data = (0, unlocker_1.htmlToStructuredData)(fullHtml);
    const links = extractScopedLinks(url, data.links, scope, options);
    const readable = options.onlyMainContent ? (0, readSource_1.extractReadableSegmentFromHtml)(fullHtml) : null;
    const html = readable ? readable.html : fullHtml;
    const markdown = (0, unlocker_1.htmlToMarkdown)(html);
    const text = readable ? readable.text : (0, unlocker_1.htmlToText)(fullHtml);
    return {
        url,
        data,
        links,
        outputByFormat: { html, markdown, text, json: data },
    };
}
|
|
157
|
+
/**
 * Chooses the URLs the traversal starts from.
 *
 * Sitemap URLs are fetched unless `options.sitemapMode` is "skip"; a failing
 * sitemap fetch counts as an empty sitemap. In "only" mode the sitemap URLs
 * are used alone, falling back to the root URL when there are none. In every
 * other mode the root URL comes first, followed by the sitemap URLs, with
 * duplicates removed. The root URL is included only if it passes the
 * include/exclude rules, and the list never exceeds `options.limit`.
 *
 * @param {string} rootUrl - Normalized root URL.
 * @param {string} apiKey - Key forwarded to the scraper.
 * @param {object} scope - Crawl scope.
 * @param {object} options - Crawl options.
 * @returns {Promise<string[]>} The first level of URLs to visit.
 */
async function buildInitialFrontier(rootUrl, apiKey, scope, options) {
    const rootSeed = matchesScopeRules(rootUrl, options) ? [rootUrl] : [];
    let sitemapUrls = [];
    if (options.sitemapMode !== "skip") {
        try {
            sitemapUrls = await fetchSitemapUrls(rootUrl, apiKey, scope, options);
        }
        catch {
            // A missing or broken sitemap must not stop the crawl.
            sitemapUrls = [];
        }
    }
    const permitted = sitemapUrls.filter((url) => matchesScopeRules(url, options));
    if (options.sitemapMode === "only") {
        if (permitted.length > 0) {
            return permitted.slice(0, options.limit);
        }
        return rootSeed;
    }
    const unique = new Set([...rootSeed, ...permitted]);
    return Array.from(unique).slice(0, options.limit);
}
|
|
170
|
+
/**
 * Describes which URLs belong to the crawl.
 *
 * @param {string} rootUrl - Absolute URL of the crawl root.
 * @param {boolean} includeSubdomains - Whether subdomains of the root host are in scope.
 * @returns {{rootUrl: string, rootHost: string, rootOrigin: string, includeSubdomains: boolean}}
 *   The root URL as given, its lower-cased host name, its origin, and the flag.
 * @throws {TypeError} When `rootUrl` is not a valid absolute URL.
 */
function createScope(rootUrl, includeSubdomains) {
    const { hostname, origin } = new URL(rootUrl);
    return {
        rootUrl,
        rootHost: hostname.toLowerCase(),
        rootOrigin: origin,
        includeSubdomains,
    };
}
|
|
179
|
+
/**
 * Turns a page's raw links into the unique, normalized URLs worth following.
 *
 * A link is kept when it normalizes to an http(s) URL, lies inside the crawl
 * scope and passes the include/exclude rules.
 *
 * @param {string} pageUrl - URL of the page; base for relative links.
 * @param {Array<{href: string}>} links - Links found on the page.
 * @param {object} scope - Crawl scope.
 * @param {object} options - Crawl options (`ignoreQueryParameters`, patterns).
 * @returns {string[]} Accepted URLs in first-seen order, without duplicates.
 */
function extractScopedLinks(pageUrl, links, scope, options) {
    const accepted = new Set();
    for (const link of links) {
        const candidate = normalizeDiscoveredUrl(link.href, pageUrl, options.ignoreQueryParameters);
        const keep = Boolean(candidate) && isInScope(candidate, scope) && matchesScopeRules(candidate, options);
        if (keep) {
            accepted.add(candidate);
        }
    }
    return [...accepted];
}
|
|
196
|
+
/**
 * Normalizes an absolute URL for de-duplication: the fragment is removed, the
 * query string is removed on request, and an empty path becomes "/".
 *
 * @param {string} value - Absolute URL.
 * @param {boolean} [ignoreQueryParameters=false] - Drop the query string too.
 * @returns {string} The serialized, normalized URL.
 * @throws {TypeError} When `value` is not a valid absolute URL.
 */
function normalizeVisitUrl(value, ignoreQueryParameters = false) {
    const target = new URL(value);
    target.hash = "";
    if (ignoreQueryParameters) {
        target.search = "";
    }
    target.pathname = target.pathname === "" ? "/" : target.pathname;
    return target.toString();
}
|
|
207
|
+
/**
 * Resolves a link found on a page into a normalized absolute URL.
 *
 * Blank links, same-page anchors, and `javascript:`, `mailto:`, `tel:` and
 * `data:` links are rejected, as is anything that does not resolve to an
 * http(s) URL. Accepted URLs lose their fragment and, on request, their
 * query string.
 *
 * @param {string} href - Raw link target.
 * @param {string} pageUrl - URL of the page; base for relative links.
 * @param {boolean} ignoreQueryParameters - Drop the query string too.
 * @returns {string|null} The normalized URL, or null when the link is rejected.
 */
function normalizeDiscoveredUrl(href, pageUrl, ignoreQueryParameters) {
    const candidate = href.trim();
    const rejectedPrefixes = ["#", "javascript:", "mailto:", "tel:", "data:"];
    if (candidate === "" || rejectedPrefixes.some((prefix) => candidate.startsWith(prefix))) {
        return null;
    }
    let target;
    try {
        target = new URL(candidate, pageUrl);
    }
    catch {
        // Unparseable link: nothing to follow.
        return null;
    }
    const isWebUrl = target.protocol === "http:" || target.protocol === "https:";
    if (!isWebUrl) {
        return null;
    }
    target.hash = "";
    if (ignoreQueryParameters) {
        target.search = "";
    }
    if (target.pathname === "") {
        target.pathname = "/";
    }
    return target.toString();
}
|
|
237
|
+
/**
 * Tells whether a URL belongs to the crawl scope.
 *
 * Without subdomains the URL must share the root's origin (scheme, host and
 * port). With subdomains only the host name is compared: it must equal the
 * root host or end with "." followed by the root host.
 *
 * @param {string} url - Absolute URL to test.
 * @param {{rootHost: string, rootOrigin: string, includeSubdomains: boolean}} scope
 * @returns {boolean}
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
function isInScope(url, scope) {
    const { hostname, origin } = new URL(url);
    if (!scope.includeSubdomains) {
        return origin === scope.rootOrigin;
    }
    const host = hostname.toLowerCase();
    return host === scope.rootHost || host.endsWith(`.${scope.rootHost}`);
}
|
|
245
|
+
/**
 * Collects page URLs from the site's `/sitemap.xml`.
 *
 * A plain sitemap yields its in-scope `<loc>` URLs that pass the
 * include/exclude rules. A sitemap index additionally has up to ten of its
 * nested `.xml` sitemaps fetched (two at a time); a nested sitemap that fails
 * to load contributes nothing.
 *
 * Two deliberate safeguards:
 * - a `<loc>` value that is not a valid absolute URL is skipped rather than
 *   allowed to throw and discard the whole sitemap;
 * - the include/exclude rules are applied to page URLs only, not to the
 *   nested sitemap files themselves, so page-oriented patterns cannot hide
 *   every nested sitemap before it is read.
 *
 * @param {string} rootUrl - Normalized root URL; only its origin is used.
 * @param {string} apiKey - Key forwarded to the scraper.
 * @param {object} scope - Crawl scope.
 * @param {object} options - Crawl options (`ignoreQueryParameters`, patterns).
 * @returns {Promise<string[]>} Unique page URLs in discovery order.
 * @throws Whatever the scraper throws while loading the top-level sitemap.
 */
async function fetchSitemapUrls(rootUrl, apiKey, scope, options) {
    // Normalizes one <loc> value; null marks an entry that is not a usable URL.
    const toVisitUrl = (value) => {
        try {
            return normalizeVisitUrl(value, options.ignoreQueryParameters);
        }
        catch {
            return null;
        }
    };
    // All valid, in-scope <loc> URLs of one sitemap document.
    const collectInScope = (xml) => extractXmlLocValues(xml)
        .map(toVisitUrl)
        .filter((url) => url !== null && isInScope(url, scope));
    const isAllowedPage = (url) => matchesScopeRules(url, options);
    const isSitemapFile = (url) => url.endsWith(".xml");

    const root = new URL(rootUrl);
    const sitemapUrl = new URL("/sitemap.xml", root.origin).toString();
    const scraped = await (0, unlocker_1.scrapeRenderedHtml)(sitemapUrl, apiKey);
    const entries = collectInScope(scraped.content);
    if (!scraped.content.includes("<sitemapindex")) {
        return Array.from(new Set(entries.filter(isAllowedPage)));
    }
    const nested = entries.filter(isSitemapFile);
    const pageUrls = entries.filter((url) => !isSitemapFile(url) && isAllowedPage(url));
    const nestedPages = await mapWithConcurrency(nested.slice(0, 10), 2, async (url) => {
        try {
            const nestedScrape = await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey);
            return collectInScope(nestedScrape.content).filter(isAllowedPage);
        }
        catch {
            // Best effort: one unreachable nested sitemap must not fail the others.
            return [];
        }
    });
    return Array.from(new Set([...pageUrls, ...nestedPages.flat()]));
}
|
|
270
|
+
/**
 * Extracts the text of every `<loc>` element from an XML document.
 *
 * Each value is trimmed and passed through `decodeXmlEntities`; values that
 * end up empty are dropped. Tag matching is case-insensitive.
 *
 * @param {string} xml - Sitemap XML.
 * @returns {string[]} The `<loc>` values in document order.
 */
function extractXmlLocValues(xml) {
    const values = [];
    for (const match of xml.matchAll(/<loc>([\s\S]*?)<\/loc>/gi)) {
        const decoded = decodeXmlEntities(match[1].trim());
        if (decoded) {
            values.push(decoded);
        }
    }
    return values;
}
|
|
275
|
+
/**
 * Decodes XML character escapes in a string.
 *
 * Handles the five predefined entities (amp, lt, gt, quot, apos) and decimal
 * or hexadecimal character references. Everything is decoded in a single
 * pass, so text produced by one replacement is never decoded a second time
 * (an escaped "&" followed by "lt;" stays the literal text of an entity).
 * Unknown entities and out-of-range character references are left untouched.
 *
 * @param {string} value - Text taken from an XML document.
 * @returns {string} The text with its escapes replaced by the characters they stand for.
 */
function decodeXmlEntities(value) {
    const predefined = { amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'" };
    return value.replace(/&(amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);/g, (match, name) => {
        if (name[0] !== "#") {
            return predefined[name];
        }
        const isHex = name[1] === "x";
        const codePoint = Number.parseInt(name.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws above U+10FFFF; keep such references as written.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    });
}
|
|
283
|
+
/**
 * Applies the include/exclude patterns to a URL.
 *
 * @param {string} url - URL to test.
 * @param {{includePatterns: string[], excludePatterns: string[]}} options
 * @returns {boolean} False when include patterns exist and none matches, or
 *   when any exclude pattern matches; true otherwise.
 */
function matchesScopeRules(url, options) {
    const included = !(options.includePatterns.length > 0) || matchesAnyPattern(url, options.includePatterns);
    if (!included) {
        return false;
    }
    const excluded = options.excludePatterns.length > 0 && matchesAnyPattern(url, options.excludePatterns);
    return !excluded;
}
|
|
292
|
+
/**
 * Tells whether a URL matches at least one wildcard pattern.
 *
 * @param {string} url - URL to test.
 * @param {string[]} patterns - Patterns understood by `buildPatternRegex`.
 * @returns {boolean} True on the first matching pattern; false for an empty list.
 */
function matchesAnyPattern(url, patterns) {
    for (const pattern of patterns) {
        if (buildPatternRegex(pattern).test(url)) {
            return true;
        }
    }
    return false;
}
|
|
295
|
+
/**
 * Compiles a wildcard pattern into a case-insensitive regular expression.
 *
 * The pattern is trimmed, `*` stands for any run of characters, and every
 * other regex metacharacter is matched literally. The expression is not
 * anchored, so it matches anywhere inside the tested string.
 *
 * @param {string} pattern - Wildcard pattern such as "/blog/*".
 * @returns {RegExp}
 */
function buildPatternRegex(pattern) {
    const escapeLiteral = (text) => text.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
    // Escape the literal pieces between wildcards, then stitch them with ".*".
    const source = pattern
        .trim()
        .split("*")
        .map(escapeLiteral)
        .join(".*");
    return new RegExp(source, "i");
}
|
|
302
|
+
/**
 * Maps `items` through an async `mapper` with bounded parallelism and returns
 * the results in input order.
 *
 * At least one and at most `items.length` workers are started; each worker
 * repeatedly claims the next unprocessed index until none is left.
 *
 * @param {Array} items - Values to map.
 * @param {number} concurrency - Requested number of parallel workers.
 * @param {(item: *, index: number) => Promise<*>} mapper - Async transform.
 * @returns {Promise<Array>} Results, where `output[i]` belongs to `items[i]`.
 *   Rejects with the first mapper error.
 */
async function mapWithConcurrency(items, concurrency, mapper) {
    const output = new Array(items.length);
    const poolSize = Math.min(Math.max(1, concurrency), items.length);
    let cursor = 0;
    const drain = async () => {
        while (cursor < items.length) {
            // Claim the index synchronously, before the first await.
            const index = cursor;
            cursor += 1;
            output[index] = await mapper(items[index], index);
        }
    };
    const workers = Array.from({ length: poolSize }, () => drain());
    await Promise.all(workers);
    return output;
}
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
+
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.parseDocumentSource = parseDocumentSource;
|
|
40
|
+
const fs_1 = require("fs");
|
|
41
|
+
const path_1 = __importDefault(require("path"));
|
|
42
|
+
const mammoth_1 = __importDefault(require("mammoth"));
|
|
43
|
+
const pdf_parse_1 = require("pdf-parse");
|
|
44
|
+
const XLSX = __importStar(require("xlsx"));
|
|
45
|
+
const unlocker_1 = require("./unlocker");
|
|
46
|
+
/**
 * Loads a document from a URL or file path and extracts its text.
 *
 * The document kind is detected from the source name and, for URLs, the
 * response content type:
 * - "pdf": text and page count via pdf-parse, plus the document info when it
 *   can be read; the parser is always destroyed afterwards.
 * - "docx": raw text via mammoth, with its messages as metadata.
 * - "xlsx": one "# <sheet name>" section per sheet with tab-separated cells.
 * - "html": text produced by `htmlToText`.
 * - anything else: the bytes decoded as UTF-8 and trimmed.
 *
 * @param {string} source - http(s) URL or local file path.
 * @returns {Promise<{source: string, kind: string, text: string, metadata: object}>}
 */
async function parseDocumentSource(source) {
    const loaded = await loadSource(source);
    const kind = detectDocumentKind(source, loaded.contentType);
    const describe = (text, metadata) => ({ source, kind, text, metadata });

    if (kind === "pdf") {
        const parser = new pdf_parse_1.PDFParse({ data: loaded.buffer });
        try {
            const parsed = await parser.getText();
            // Document info is optional; failing to read it must not fail the parse.
            const info = await parser.getInfo().catch(() => undefined);
            return describe(parsed.text.trim(), {
                pages: parsed.total,
                info: info?.info ?? {}
            });
        }
        finally {
            await parser.destroy().catch(() => undefined);
        }
    }
    if (kind === "docx") {
        const parsed = await mammoth_1.default.extractRawText({ buffer: loaded.buffer });
        return describe(parsed.value.trim(), { messages: parsed.messages });
    }
    if (kind === "xlsx") {
        const workbook = XLSX.read(loaded.buffer, { type: "buffer" });
        const sections = [];
        for (const sheetName of workbook.SheetNames) {
            const rows = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], {
                header: 1,
                raw: false
            });
            const lines = rows.map((row) => row.map((cell) => (cell ?? "").toString()).join("\t"));
            const section = `# ${sheetName}\n${lines.join("\n").trim()}`.trim();
            if (section) {
                sections.push(section);
            }
        }
        return describe(sections.join("\n\n").trim(), { sheets: workbook.SheetNames });
    }
    if (kind === "html") {
        const raw = loaded.buffer.toString("utf8");
        return describe((0, unlocker_1.htmlToText)(raw), {});
    }
    return describe(loaded.buffer.toString("utf8").trim(), {});
}
|
|
119
|
+
/**
 * Reads the raw bytes of a document.
 *
 * http(s) URLs are fetched; anything else is treated as a file path and
 * resolved against the current working directory.
 *
 * @param {string} source - http(s) URL or local file path.
 * @returns {Promise<{buffer: Buffer, contentType?: string|null}>} The bytes,
 *   plus the response's content-type header when the source is a URL.
 * @throws {Error} When the HTTP response is not OK, or the file cannot be read.
 */
async function loadSource(source) {
    if (!looksLikeUrl(source)) {
        const absolutePath = path_1.default.resolve(source);
        return {
            buffer: await fs_1.promises.readFile(absolutePath)
        };
    }
    const response = await fetch(source);
    if (!response.ok) {
        throw new Error(`Failed to fetch ${source}: ${response.status}`);
    }
    const bytes = await response.arrayBuffer();
    return {
        buffer: Buffer.from(bytes),
        contentType: response.headers.get("content-type")
    };
}
|
|
135
|
+
/**
 * Tells whether a string is an absolute http or https URL.
 *
 * @param {string} value - Candidate source.
 * @returns {boolean} False for other schemes and for anything that does not
 *   parse as an absolute URL.
 */
function looksLikeUrl(value) {
    let protocol;
    try {
        ({ protocol } = new URL(value));
    }
    catch {
        return false;
    }
    return ["http:", "https:"].includes(protocol);
}
|
|
144
|
+
/**
 * Classifies a document from its source name and optional content type.
 *
 * Checks run in this order: pdf, docx, xlsx, html (content type or
 * extension), then markdown (".md") and json (".json") by extension only;
 * everything else is "text". Matching is case-insensitive.
 *
 * For http(s) URLs the extension is looked for both on the whole string and
 * on the URL's path, so a trailing query string or fragment
 * ("report.pdf?download=1") does not hide it.
 *
 * @param {string} source - http(s) URL or file path.
 * @param {string|null} [contentType] - Content-type header, when known.
 * @returns {"pdf"|"docx"|"xlsx"|"html"|"markdown"|"json"|"text"}
 */
function detectDocumentKind(source, contentType) {
    const lowerContentType = contentType?.toLowerCase() ?? "";
    const lowerSource = source.toLowerCase();
    // Only http(s) sources get URL treatment; "C:\..." would otherwise parse as a URL.
    let lowerPath = lowerSource;
    try {
        const parsed = new URL(source);
        if (parsed.protocol === "http:" || parsed.protocol === "https:") {
            lowerPath = parsed.pathname.toLowerCase();
        }
    }
    catch {
        // Not a URL: plain file path, the whole string is the path.
    }
    const hasExtension = (...extensions) => extensions.some((extension) => lowerSource.endsWith(extension) || lowerPath.endsWith(extension));
    const hasContentType = (fragment) => lowerContentType.includes(fragment);

    if (hasContentType("pdf") || hasExtension(".pdf")) {
        return "pdf";
    }
    if (hasContentType("wordprocessingml.document") || hasExtension(".docx")) {
        return "docx";
    }
    if (hasContentType("spreadsheetml.sheet") || hasExtension(".xlsx")) {
        return "xlsx";
    }
    if (hasContentType("html") || hasExtension(".html", ".htm")) {
        return "html";
    }
    if (hasExtension(".md")) {
        return "markdown";
    }
    if (hasExtension(".json")) {
        return "json";
    }
    return "text";
}
|