n8n-nodes-crawl4ai-plus 2.0.9 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -23
- package/README.md +129 -41
- package/dist/credentials/Crawl4aiApi.credentials.js +2 -34
- package/dist/credentials/Crawl4aiApi.credentials.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js +1230 -30
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js +715 -9
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js +495 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js +9 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.d.ts +4 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js +94 -60
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.d.ts +8 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js +49 -12
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/interfaces.d.ts +38 -5
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.d.ts +13 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js +270 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js +445 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js +108 -8
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js +49 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js +134 -17
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js +27 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js +206 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js +376 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.d.ts +4 -2
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js +53 -16
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/index.js +11 -11
- package/package.json +1 -1
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.description = void 0;
|
|
4
|
+
exports.execute = execute;
|
|
5
|
+
const n8n_workflow_1 = require("n8n-workflow");
|
|
6
|
+
const utils_1 = require("../helpers/utils");
|
|
7
|
+
exports.description = [
|
|
8
|
+
{
|
|
9
|
+
displayName: 'URL',
|
|
10
|
+
name: 'url',
|
|
11
|
+
type: 'string',
|
|
12
|
+
required: true,
|
|
13
|
+
default: '',
|
|
14
|
+
placeholder: 'https://example.com',
|
|
15
|
+
description: 'The URL to extract SEO metadata from',
|
|
16
|
+
displayOptions: {
|
|
17
|
+
show: {
|
|
18
|
+
operation: ['seoExtractor'],
|
|
19
|
+
},
|
|
20
|
+
},
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
displayName: 'Metadata Types',
|
|
24
|
+
name: 'metadataTypes',
|
|
25
|
+
type: 'multiOptions',
|
|
26
|
+
options: [
|
|
27
|
+
{
|
|
28
|
+
name: 'Basic Meta Tags',
|
|
29
|
+
value: 'basic',
|
|
30
|
+
description: 'Title, description, keywords, canonical URL',
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
name: 'JSON-LD Structured Data',
|
|
34
|
+
value: 'jsonLd',
|
|
35
|
+
description: 'Schema.org structured data in JSON-LD format',
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
name: 'Language & Locale',
|
|
39
|
+
value: 'language',
|
|
40
|
+
description: 'HTML lang, hreflang tags, locale settings',
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
name: 'Open Graph (OG) Tags',
|
|
44
|
+
value: 'openGraph',
|
|
45
|
+
description: 'OG title, description, image, type, URL',
|
|
46
|
+
},
|
|
47
|
+
{
|
|
48
|
+
name: 'Robots & Indexing',
|
|
49
|
+
value: 'robots',
|
|
50
|
+
description: 'Robots meta, noindex, nofollow directives',
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
name: 'Twitter Cards',
|
|
54
|
+
value: 'twitter',
|
|
55
|
+
description: 'Twitter card metadata',
|
|
56
|
+
},
|
|
57
|
+
],
|
|
58
|
+
default: ['basic', 'openGraph', 'jsonLd'],
|
|
59
|
+
description: 'Select which types of SEO metadata to extract',
|
|
60
|
+
displayOptions: {
|
|
61
|
+
show: {
|
|
62
|
+
operation: ['seoExtractor'],
|
|
63
|
+
},
|
|
64
|
+
},
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
displayName: 'Browser Options',
|
|
68
|
+
name: 'browserOptions',
|
|
69
|
+
type: 'collection',
|
|
70
|
+
placeholder: 'Add Option',
|
|
71
|
+
default: {},
|
|
72
|
+
displayOptions: {
|
|
73
|
+
show: {
|
|
74
|
+
operation: ['seoExtractor'],
|
|
75
|
+
},
|
|
76
|
+
},
|
|
77
|
+
options: [
|
|
78
|
+
{
|
|
79
|
+
displayName: 'Browser Type',
|
|
80
|
+
name: 'browserType',
|
|
81
|
+
type: 'options',
|
|
82
|
+
options: [
|
|
83
|
+
{
|
|
84
|
+
name: 'Chromium',
|
|
85
|
+
value: 'chromium',
|
|
86
|
+
description: 'Use Chromium browser (default, most compatible)',
|
|
87
|
+
},
|
|
88
|
+
{
|
|
89
|
+
name: 'Firefox',
|
|
90
|
+
value: 'firefox',
|
|
91
|
+
description: 'Use Firefox browser',
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
name: 'Webkit',
|
|
95
|
+
value: 'webkit',
|
|
96
|
+
description: 'Use Webkit browser (Safari engine)',
|
|
97
|
+
},
|
|
98
|
+
],
|
|
99
|
+
default: 'chromium',
|
|
100
|
+
description: 'Which browser engine to use for crawling',
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
displayName: 'Enable JavaScript',
|
|
104
|
+
name: 'java_script_enabled',
|
|
105
|
+
type: 'boolean',
|
|
106
|
+
default: true,
|
|
107
|
+
description: 'Whether to enable JavaScript execution (recommended for dynamic SEO tags)',
|
|
108
|
+
},
|
|
109
|
+
{
|
|
110
|
+
displayName: 'Headless Mode',
|
|
111
|
+
name: 'headless',
|
|
112
|
+
type: 'boolean',
|
|
113
|
+
default: true,
|
|
114
|
+
description: 'Whether to run browser in headless mode',
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
displayName: 'Timeout (MS)',
|
|
118
|
+
name: 'timeout',
|
|
119
|
+
type: 'number',
|
|
120
|
+
default: 30000,
|
|
121
|
+
description: 'Maximum time to wait for the browser to load the page',
|
|
122
|
+
},
|
|
123
|
+
{
|
|
124
|
+
displayName: 'Wait For',
|
|
125
|
+
name: 'waitFor',
|
|
126
|
+
type: 'string',
|
|
127
|
+
default: '',
|
|
128
|
+
placeholder: 'head',
|
|
129
|
+
description: 'CSS selector to wait for before extracting (useful for dynamically injected meta tags)',
|
|
130
|
+
},
|
|
131
|
+
],
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
displayName: 'Options',
|
|
135
|
+
name: 'options',
|
|
136
|
+
type: 'collection',
|
|
137
|
+
placeholder: 'Add Option',
|
|
138
|
+
default: {},
|
|
139
|
+
displayOptions: {
|
|
140
|
+
show: {
|
|
141
|
+
operation: ['seoExtractor'],
|
|
142
|
+
},
|
|
143
|
+
},
|
|
144
|
+
options: [
|
|
145
|
+
{
|
|
146
|
+
displayName: 'Cache Mode',
|
|
147
|
+
name: 'cacheMode',
|
|
148
|
+
type: 'options',
|
|
149
|
+
options: [
|
|
150
|
+
{
|
|
151
|
+
name: 'Bypass (Skip Cache)',
|
|
152
|
+
value: 'BYPASS',
|
|
153
|
+
description: 'Skip cache for this operation, fetch fresh content',
|
|
154
|
+
},
|
|
155
|
+
{
|
|
156
|
+
name: 'Disabled (No Cache)',
|
|
157
|
+
value: 'DISABLED',
|
|
158
|
+
description: 'No caching at all',
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
name: 'Enabled (Read/Write)',
|
|
162
|
+
value: 'ENABLED',
|
|
163
|
+
description: 'Use cache if available, save new results to cache',
|
|
164
|
+
},
|
|
165
|
+
{
|
|
166
|
+
name: 'Read Only',
|
|
167
|
+
value: 'READ_ONLY',
|
|
168
|
+
description: 'Only read from cache, do not write new results',
|
|
169
|
+
},
|
|
170
|
+
{
|
|
171
|
+
name: 'Write Only',
|
|
172
|
+
value: 'WRITE_ONLY',
|
|
173
|
+
description: 'Only write to cache, do not read existing cache',
|
|
174
|
+
},
|
|
175
|
+
],
|
|
176
|
+
default: 'ENABLED',
|
|
177
|
+
description: 'How to use the cache when crawling',
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
displayName: 'Include Raw HTML',
|
|
181
|
+
name: 'includeRawHtml',
|
|
182
|
+
type: 'boolean',
|
|
183
|
+
default: false,
|
|
184
|
+
description: 'Whether to include the raw HTML head section in output',
|
|
185
|
+
},
|
|
186
|
+
],
|
|
187
|
+
},
|
|
188
|
+
];
|
|
189
|
+
/**
 * Builds a field descriptor that reads a single attribute of the element
 * matched by `selector`. Most SEO tags keep their value in `content`, so
 * that is the default attribute.
 */
function seoAttributeField(name, selector, attribute = 'content') {
    return { name, selector, type: 'attribute', attribute };
}

/**
 * CSS-selector field descriptors, grouped by metadata type.
 *
 * The group keys match the values of the `metadataTypes` parameter
 * ('jsonLd' has no entry here; it is extracted separately from the HTML).
 *
 * The table is a prototype-less, frozen dictionary: it is indexed with
 * user-supplied keys, so a lookup such as `SEO_FIELDS['constructor']` must
 * yield `undefined` instead of an inherited Object.prototype member.
 */
const SEO_FIELDS = Object.freeze(Object.assign(Object.create(null), {
    basic: [
        { name: 'title', selector: 'title', type: 'text' },
        seoAttributeField('metaDescription', 'meta[name="description"]'),
        seoAttributeField('metaKeywords', 'meta[name="keywords"]'),
        seoAttributeField('canonicalUrl', 'link[rel="canonical"]', 'href'),
        seoAttributeField('author', 'meta[name="author"]'),
        seoAttributeField('viewport', 'meta[name="viewport"]'),
    ],
    openGraph: [
        seoAttributeField('ogTitle', 'meta[property="og:title"]'),
        seoAttributeField('ogDescription', 'meta[property="og:description"]'),
        seoAttributeField('ogImage', 'meta[property="og:image"]'),
        seoAttributeField('ogType', 'meta[property="og:type"]'),
        seoAttributeField('ogUrl', 'meta[property="og:url"]'),
        seoAttributeField('ogSiteName', 'meta[property="og:site_name"]'),
        seoAttributeField('ogLocale', 'meta[property="og:locale"]'),
    ],
    twitter: [
        seoAttributeField('twitterCard', 'meta[name="twitter:card"]'),
        seoAttributeField('twitterTitle', 'meta[name="twitter:title"]'),
        seoAttributeField('twitterDescription', 'meta[name="twitter:description"]'),
        seoAttributeField('twitterImage', 'meta[name="twitter:image"]'),
        seoAttributeField('twitterSite', 'meta[name="twitter:site"]'),
        seoAttributeField('twitterCreator', 'meta[name="twitter:creator"]'),
    ],
    robots: [
        seoAttributeField('robots', 'meta[name="robots"]'),
        seoAttributeField('googlebot', 'meta[name="googlebot"]'),
        seoAttributeField('bingbot', 'meta[name="bingbot"]'),
    ],
    language: [
        seoAttributeField('htmlLang', 'html', 'lang'),
        seoAttributeField('contentLanguage', 'meta[http-equiv="content-language"]'),
    ],
}));
|
|
225
|
+
/**
 * Runs the SEO extractor operation once per input item.
 *
 * For each item it reads the `url`, `metadataTypes`, `browserOptions` and
 * `options` parameters, crawls the URL with a CSS extraction strategy built
 * from the selected metadata groups, and emits
 * `{ url, success: true, seo, rawHtml? }`. JSON-LD blocks and hreflang links
 * are pulled from the returned HTML when their groups are selected.
 *
 * Must be called with the n8n execution context as `this`.
 *
 * @param {Array} items - Input items; one result is produced per item.
 * @param {object} nodeOptions - Accepted for signature compatibility; not read here.
 * @returns {Promise<Array>} Result items, each paired to its input item index.
 * @throws {NodeOperationError} On an empty/invalid URL, an empty metadata
 *   selection, or a failed crawl, unless `continueOnFail()` is on, in which
 *   case the error is attached to the output item instead.
 */
async function execute(items, nodeOptions) {
    const allResults = [];
    for (let i = 0; i < items.length; i++) {
        try {
            const url = this.getNodeParameter('url', i, '');
            const requestedTypes = this.getNodeParameter('metadataTypes', i, ['basic', 'openGraph', 'jsonLd']);
            const browserOptions = this.getNodeParameter('browserOptions', i, {});
            const options = this.getNodeParameter('options', i, {});
            // An expression may resolve to a single value; treat it as a one-element
            // selection rather than iterating over the characters of a string.
            const metadataTypes = Array.isArray(requestedTypes)
                ? requestedTypes
                : (requestedTypes ? [requestedTypes] : []);
            if (!url) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URL cannot be empty.', { itemIndex: i });
            }
            if (!(0, utils_1.isValidUrl)(url)) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URL: ${url}`, { itemIndex: i });
            }
            if (metadataTypes.length === 0) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'At least one metadata type must be selected.', { itemIndex: i });
            }
            const fields = [];
            for (const metaType of metadataTypes) {
                // 'jsonLd' has no CSS fields; it is extracted from the HTML further down.
                if (metaType === 'jsonLd') {
                    continue;
                }
                // Own-property check: an unexpected key such as 'constructor' must not
                // resolve to an inherited member (spreading it would throw).
                const group = Object.prototype.hasOwnProperty.call(SEO_FIELDS, metaType)
                    ? SEO_FIELDS[metaType]
                    : undefined;
                if (Array.isArray(group)) {
                    fields.push(...group);
                }
            }
            const extractionStrategy = buildSeoExtractionStrategy(fields);
            const crawlerOptions = {
                ...browserOptions,
                cacheMode: options.cacheMode || 'ENABLED',
                waitFor: browserOptions.waitFor,
            };
            const crawlerConfig = (0, utils_1.createCrawlerRunConfig)(crawlerOptions);
            if (extractionStrategy) {
                crawlerConfig.extractionStrategy = extractionStrategy;
            }
            const crawler = await (0, utils_1.getCrawl4aiClient)(this);
            const result = await crawler.arun(url, crawlerConfig);
            if (!result.success) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to crawl URL: ${result.error_message || 'Unknown error'}`, { itemIndex: i });
            }
            const seoData = parseSeoExtractedContent(result.extracted_content);
            // Prefer the full HTML; fall back to the cleaned variant when it is absent.
            const pageHtml = result.html || result.cleaned_html || '';
            if (metadataTypes.includes('jsonLd')) {
                const jsonLdData = extractJsonLd(pageHtml);
                if (jsonLdData.length > 0) {
                    seoData.jsonLd = jsonLdData;
                }
            }
            if (metadataTypes.includes('language')) {
                const hreflangTags = extractHreflang(pageHtml);
                if (hreflangTags.length > 0) {
                    seoData.hreflang = hreflangTags;
                }
            }
            const formattedResult = {
                url,
                success: true,
                seo: seoData,
                ...(options.includeRawHtml ? { rawHtml: extractHead(result.html || '') } : {}),
            };
            allResults.push({
                json: formattedResult,
                pairedItem: { item: i },
            });
        }
        catch (error) {
            if (this.continueOnFail()) {
                const node = this.getNode();
                // Keep the item index carried by the error when it has one.
                const errorItemIndex = error.itemIndex != null ? error.itemIndex : i;
                allResults.push({
                    json: items[i].json,
                    error: new n8n_workflow_1.NodeOperationError(node, error.message, { itemIndex: errorItemIndex }),
                    pairedItem: { item: i },
                });
                continue;
            }
            throw error;
        }
    }
    return allResults;
}

/**
 * Builds the `JsonCssExtractionStrategy` payload for the given field
 * descriptors, or returns null when there is nothing to extract via CSS.
 */
function buildSeoExtractionStrategy(fields) {
    if (fields.length === 0) {
        return null;
    }
    return {
        type: 'JsonCssExtractionStrategy',
        params: {
            schema: {
                type: 'dict',
                value: {
                    name: 'SEO_Metadata',
                    baseSelector: 'html',
                    fields: fields.map((field) => ({
                        name: field.name,
                        selector: field.selector,
                        type: field.type,
                        // `attribute` is only present on attribute-type fields.
                        ...(field.attribute ? { attribute: field.attribute } : {}),
                    })),
                },
            },
        },
    };
}

/**
 * Parses the crawler's `extracted_content` JSON string into a flat object.
 * An array payload contributes its first element; an object payload is
 * copied as-is. Missing or unparsable content yields an empty object so the
 * HTML-based extraction can still run (deliberate best effort).
 */
function parseSeoExtractedContent(extractedContent) {
    if (!extractedContent) {
        return {};
    }
    try {
        const parsed = JSON.parse(extractedContent);
        if (Array.isArray(parsed) && parsed.length > 0) {
            return { ...parsed[0] };
        }
        if (typeof parsed === 'object') {
            return { ...parsed };
        }
    }
    catch (e) {
        // Intentionally ignored: malformed extraction output is treated as "no data".
    }
    return {};
}
|
|
334
|
+
/**
 * Collects every JSON-LD payload embedded in an HTML document.
 *
 * Matches `<script type="application/ld+json">` blocks whose `type` value is
 * double-quoted, single-quoted or unquoted (minified pages often drop the
 * quotes), and parses each body as JSON.
 *
 * @param {string} html - HTML source to scan.
 * @returns {Array} Parsed payloads in document order. Empty or unparsable
 *   blocks are skipped; non-string or empty input yields an empty array.
 */
function extractJsonLd(html) {
    const jsonLdData = [];
    if (typeof html !== 'string' || html === '') {
        return jsonLdData;
    }
    // Group 1: optional opening quote of the type value (back-referenced so the
    // closing quote must match). Group 2: the script body.
    const scriptPattern = /<script\b[^>]*\btype\s*=\s*(["']?)application\/ld\+json\1(?=[\s>\/])[^>]*>([\s\S]*?)<\/script\s*>/gi;
    let match;
    while ((match = scriptPattern.exec(html)) !== null) {
        const payload = match[2].trim();
        if (payload === '') {
            continue;
        }
        try {
            jsonLdData.push(JSON.parse(payload));
        }
        catch (e) {
            // Best effort: one malformed block must not hide the valid ones.
        }
    }
    return jsonLdData;
}
|
|
348
|
+
/**
 * Collects the alternate-language links (`<link rel="alternate" hreflang=...
 * href=...>`) declared in an HTML document.
 *
 * Each `<link>` tag is located first and its attributes are then parsed
 * individually, so the result does not depend on the order in which `rel`,
 * `hreflang` and `href` appear. Attribute names and `rel` tokens are compared
 * case-insensitively; quoted and unquoted values are accepted; `rel` may
 * carry several space-separated tokens.
 *
 * @param {string} html - HTML source to scan.
 * @returns {Array<{lang: string, href: string}>} Unique `lang`/`href` pairs
 *   in document order. Non-string or empty input yields an empty array.
 */
function extractHreflang(html) {
    const hreflangTags = [];
    if (typeof html !== 'string' || html === '') {
        return hreflangTags;
    }
    // Quoted runs are consumed whole so a '>' inside an attribute value does not end the tag.
    const linkPattern = /<link\b((?:"[^"]*"|'[^']*'|[^'">])*)>/gi;
    const seen = new Set();
    let linkMatch;
    while ((linkMatch = linkPattern.exec(html)) !== null) {
        const attributes = parseLinkAttributes(linkMatch[1]);
        const relTokens = (attributes.rel || '').toLowerCase().split(/\s+/);
        if (!relTokens.includes('alternate')) {
            continue;
        }
        const lang = attributes.hreflang;
        const href = attributes.href;
        if (!lang || !href) {
            continue;
        }
        const key = `${lang}:${href}`;
        if (seen.has(key)) {
            continue;
        }
        seen.add(key);
        hreflangTags.push({ lang, href });
    }
    return hreflangTags;
}

/**
 * Parses the attribute section of a single tag into a name -> value map.
 * Names are lower-cased; when a name is repeated the first occurrence wins.
 */
function parseLinkAttributes(attributeText) {
    const attributes = Object.create(null);
    // Groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
    const attributePattern = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
    let attributeMatch;
    while ((attributeMatch = attributePattern.exec(attributeText)) !== null) {
        const name = attributeMatch[1].toLowerCase();
        if (name in attributes) {
            continue;
        }
        let value = attributeMatch[2];
        if (value === undefined) {
            value = attributeMatch[3];
        }
        if (value === undefined) {
            value = attributeMatch[4];
        }
        attributes[name] = value;
    }
    return attributes;
}
|
|
372
|
+
/**
 * Returns the inner HTML of the document's first `<head>` element.
 *
 * The tag name is matched with a word boundary so that `<header ...>` is not
 * mistaken for `<head>`.
 *
 * @param {string} html - HTML source to scan.
 * @returns {string} Content between `<head ...>` and `</head>`, or an empty
 *   string when there is no head element or the input is not a string.
 */
function extractHead(html) {
    if (typeof html !== 'string') {
        return '';
    }
    const match = /<head\b[^>]*>([\s\S]*?)<\/head\s*>/i.exec(html);
    return match ? match[1] : '';
}
|
|
376
|
+
//# sourceMappingURL=seoExtractor.operation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"seoExtractor.operation.js","sourceRoot":"","sources":["../../../../nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.ts"],"names":[],"mappings":";;;AAuPA,0BAoJC;AArYD,+CAAkD;AAIlD,4CAI0B;AAGb,QAAA,WAAW,GAAsB;IAC5C;QACE,WAAW,EAAE,KAAK;QAClB,IAAI,EAAE,KAAK;QACX,IAAI,EAAE,QAAQ;QACd,QAAQ,EAAE,IAAI;QACd,OAAO,EAAE,EAAE;QACX,WAAW,EAAE,qBAAqB;QAClC,WAAW,EAAE,sCAAsC;QACnD,cAAc,EAAE;YACd,IAAI,EAAE;gBACJ,SAAS,EAAE,CAAC,cAAc,CAAC;aAC5B;SACF;KACF;IACD;QACE,WAAW,EAAE,gBAAgB;QAC7B,IAAI,EAAE,eAAe;QACrB,IAAI,EAAE,cAAc;QACpB,OAAO,EAAE;YACP;gBACE,IAAI,EAAE,iBAAiB;gBACvB,KAAK,EAAE,OAAO;gBACd,WAAW,EAAE,6CAA6C;aAC3D;YACD;gBACE,IAAI,EAAE,yBAAyB;gBAC/B,KAAK,EAAE,QAAQ;gBACf,WAAW,EAAE,8CAA8C;aAC5D;YACD;gBACE,IAAI,EAAE,mBAAmB;gBACzB,KAAK,EAAE,UAAU;gBACjB,WAAW,EAAE,2CAA2C;aACzD;YACJ;gBACK,IAAI,EAAE,sBAAsB;gBAC5B,KAAK,EAAE,WAAW;gBAClB,WAAW,EAAE,yCAAyC;aACvD;YACD;gBACE,IAAI,EAAE,mBAAmB;gBACzB,KAAK,EAAE,QAAQ;gBACf,WAAW,EAAE,2CAA2C;aACzD;YACJ;gBACK,IAAI,EAAE,eAAe;gBACrB,KAAK,EAAE,SAAS;gBAChB,WAAW,EAAE,uBAAuB;aACrC;SACF;QACD,OAAO,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,QAAQ,CAAC;QACzC,WAAW,EAAE,+CAA+C;QAC5D,cAAc,EAAE;YACd,IAAI,EAAE;gBACJ,SAAS,EAAE,CAAC,cAAc,CAAC;aAC5B;SACF;KACF;IACD;QACE,WAAW,EAAE,iBAAiB;QAC9B,IAAI,EAAE,gBAAgB;QACtB,IAAI,EAAE,YAAY;QAClB,WAAW,EAAE,YAAY;QACzB,OAAO,EAAE,EAAE;QACX,cAAc,EAAE;YACd,IAAI,EAAE;gBACJ,SAAS,EAAE,CAAC,cAAc,CAAC;aAC5B;SACF;QACD,OAAO,EAAE;YACP;gBACE,WAAW,EAAE,cAAc;gBAC3B,IAAI,EAAE,aAAa;gBACnB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE;oBACP;wBACE,IAAI,EAAE,UAAU;wBAChB,KAAK,EAAE,UAAU;wBACjB,WAAW,EAAE,iDAAiD;qBAC/D;oBACD;wBACE,IAAI,EAAE,SAAS;wBACf,KAAK,EAAE,SAAS;wBAChB,WAAW,EAAE,qBAAqB;qBACnC;oBACD;wBACE,IAAI,EAAE,QAAQ;wBACd,KAAK,EAAE,QAAQ;wBACf,WAAW,EAAE,oCAAoC;qBAClD;iBACF;gBACD,OAAO,EAAE,UAAU;gBACnB,WAAW,EAAE,0CAA0C;aACxD;YACD;gBACE,WAAW,EAAE,mBAAmB;gBAChC,IAAI,EAAE,qBAAqB;gBAC3B,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,IAAI;gBACb,WAAW,EAAE,2EAA2E;aACzF;YACD;gBACE,WAAW,EAAE,eAAe;gBAC5B,IAAI,EAAE,UAAU;gBAChB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,IAAI;gBACb,WAAW,EAA
E,yCAAyC;aACvD;YACD;gBACE,WAAW,EAAE,cAAc;gBAC3B,IAAI,EAAE,SAAS;gBACf,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,KAAK;gBACd,WAAW,EAAE,uDAAuD;aACrE;YACD;gBACE,WAAW,EAAE,UAAU;gBACvB,IAAI,EAAE,SAAS;gBACf,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,MAAM;gBACnB,WAAW,EAAE,wFAAwF;aACtG;SACF;KACF;IACD;QACE,WAAW,EAAE,SAAS;QACtB,IAAI,EAAE,SAAS;QACf,IAAI,EAAE,YAAY;QAClB,WAAW,EAAE,YAAY;QACzB,OAAO,EAAE,EAAE;QACX,cAAc,EAAE;YACd,IAAI,EAAE;gBACJ,SAAS,EAAE,CAAC,cAAc,CAAC;aAC5B;SACF;QACD,OAAO,EAAE;YACP;gBACE,WAAW,EAAE,YAAY;gBACzB,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE;oBACP;wBACE,IAAI,EAAE,qBAAqB;wBAC3B,KAAK,EAAE,QAAQ;wBACf,WAAW,EAAE,oDAAoD;qBAClE;oBACD;wBACE,IAAI,EAAE,qBAAqB;wBAC3B,KAAK,EAAE,UAAU;wBACjB,WAAW,EAAE,mBAAmB;qBACjC;oBACD;wBACE,IAAI,EAAE,sBAAsB;wBAC5B,KAAK,EAAE,SAAS;wBAChB,WAAW,EAAE,mDAAmD;qBACjE;oBACD;wBACE,IAAI,EAAE,WAAW;wBACjB,KAAK,EAAE,WAAW;wBAClB,WAAW,EAAE,gDAAgD;qBAC9D;oBACD;wBACE,IAAI,EAAE,YAAY;wBAClB,KAAK,EAAE,YAAY;wBACnB,WAAW,EAAE,iDAAiD;qBAC/D;iBACF;gBACD,OAAO,EAAE,SAAS;gBAClB,WAAW,EAAE,oCAAoC;aAClD;YACD;gBACE,WAAW,EAAE,kBAAkB;gBAC/B,IAAI,EAAE,gBAAgB;gBACtB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,KAAK;gBACd,WAAW,EAAE,wDAAwD;aACtE;SACF;KACF;CACF,CAAC;AAWF,MAAM,UAAU,GAA+B;IAC7C,KAAK,EAAE;QACL,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE;QAClD,EAAE,IAAI,EAAE,iBAAiB,EAAE,QAAQ,EAAE,0BAA0B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAC1G,EAAE,IAAI,EAAE,cAAc,EAAE,QAAQ,EAAE,uBAAuB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACpG,EAAE,IAAI,EAAE,cAAc,EAAE,QAAQ,EAAE,uBAAuB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,EAAE;QACjG,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAC5F,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,uBAAuB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;KACjG;IACD,SAAS,EAAE;QACT,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,2BAA2B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACnG,EAAE,IAAI,EAAE,eAAe,EAAE,QAAQ,EAAE,iCAAiC,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAC/G,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,2BAA2B,
EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACnG,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,0BAA0B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACjG,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,yBAAyB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAC/F,EAAE,IAAI,EAAE,YAAY,EAAE,QAAQ,EAAE,+BAA+B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAC1G,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,4BAA4B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;KACtG;IACD,OAAO,EAAE;QACP,EAAE,IAAI,EAAE,aAAa,EAAE,QAAQ,EAAE,2BAA2B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACvG,EAAE,IAAI,EAAE,cAAc,EAAE,QAAQ,EAAE,4BAA4B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACzG,EAAE,IAAI,EAAE,oBAAoB,EAAE,QAAQ,EAAE,kCAAkC,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACrH,EAAE,IAAI,EAAE,cAAc,EAAE,QAAQ,EAAE,4BAA4B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACzG,EAAE,IAAI,EAAE,aAAa,EAAE,QAAQ,EAAE,2BAA2B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QACvG,EAAE,IAAI,EAAE,gBAAgB,EAAE,QAAQ,EAAE,8BAA8B,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;KAC9G;IACD,MAAM,EAAE;QACN,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAC5F,EAAE,IAAI,EAAE,WAAW,EAAE,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;QAClG,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;KAC/F;IACD,QAAQ,EAAE;QACR,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,EAAE;QAC5E,EAAE,IAAI,EAAE,iBAAiB,EAAE,QAAQ,EAAE,qCAAqC,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE;KACtH;CACF,CAAC;AAGK,KAAK,UAAU,OAAO,CAE3B,KAA2B,EAC3B,WAAgC;;IAEhC,MAAM,UAAU,GAAyB,EAAE,CAAC;IAE5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,IAAI,CAAC;YAEH,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAW,CAAC;YAC1D,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,eAAe,EAAE,CAAC,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,QAAQ,CAAC,CAAa,CAAC;YAC9G,MAAM,cAAc,GAAG,IAAI,CAAC,gBAAgB,CAAC,gBAAgB,EAAE,CAAC,EAAE,EAAE,CAAgB,CAAC;YACrF,MAAM,OAAO,GAAG,IAAI,CAAC,gBAA
gB,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAgB,CAAC;YAEvE,IAAI,CAAC,GAAG,EAAE,CAAC;gBACT,MAAM,IAAI,iCAAkB,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,sBAAsB,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;YACzF,CAAC;YAED,IAAI,CAAC,IAAA,kBAAU,EAAC,GAAG,CAAC,EAAE,CAAC;gBACrB,MAAM,IAAI,iCAAkB,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,gBAAgB,GAAG,EAAE,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;YACxF,CAAC;YAED,IAAI,CAAC,aAAa,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACjD,MAAM,IAAI,iCAAkB,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,8CAA8C,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;YACjH,CAAC;YAGD,MAAM,MAAM,GAAe,EAAE,CAAC;YAC9B,KAAK,MAAM,QAAQ,IAAI,aAAa,EAAE,CAAC;gBACrC,IAAI,QAAQ,KAAK,QAAQ,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAClD,MAAM,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC;gBACvC,CAAC;YACH,CAAC;YAGD,MAAM,kBAAkB,GAAQ,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;gBAClD,IAAI,EAAE,2BAA2B;gBACjC,MAAM,EAAE;oBACN,MAAM,EAAE;wBACN,IAAI,EAAE,MAAM;wBACZ,KAAK,EAAE;4BACL,IAAI,EAAE,cAAc;4BACpB,YAAY,EAAE,MAAM;4BACpB,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;gCAC3B,IAAI,EAAE,KAAK,CAAC,IAAI;gCAChB,QAAQ,EAAE,KAAK,CAAC,QAAQ;gCACxB,IAAI,EAAE,KAAK,CAAC,IAAI;gCAChB,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,KAAK,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;6BAC3D,CAAC,CAAC;yBACJ;qBACF;iBACF;aACF,CAAC,CAAC,CAAC,IAAI,CAAC;YAGT,MAAM,cAAc,GAAQ;gBAC1B,GAAG,cAAc;gBACjB,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,SAAS;gBACzC,OAAO,EAAE,cAAc,CAAC,OAAO;aAChC,CAAC;YAEF,MAAM,aAAa,GAAG,IAAA,8BAAsB,EAAC,cAAc,CAAC,CAAC;YAG7D,IAAI,kBAAkB,EAAE,CAAC;gBACvB,aAAa,CAAC,kBAAkB,GAAG,kBAAkB,CAAC;YACxD,CAAC;YAGD,MAAM,OAAO,GAAG,MAAM,IAAA,yBAAiB,EAAC,IAAI,CAAC,CAAC;YAG9C,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;YAEtD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;gBACpB,MAAM,IAAI,iCAAkB,CAC1B,IAAI,CAAC,OAAO,EAAE,EACd,wBAAwB,MAAM,CAAC,aAAa,IAAI,eAAe,EAAE,EACjE,EAAE,SAAS,EAAE,CAAC,EAAE,CACjB,CAAC;YACJ,CAAC;YAGD,IAAI,OAAO,GAAgB,EAAE,CAAC;YAE9B,IAAI,MAAM,CAAC,iBAAiB,EAAE,CAAC;gBAC7B,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC;oB
AEpD,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC/C,OAAO,GAAG,EAAE,GAAG,OAAO,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;oBACzC,CAAC;yBAAM,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;wBACtC,OAAO,GAAG,EAAE,GAAG,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC;oBACtC,CAAC;gBACH,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;gBAEb,CAAC;YACH,CAAC;YAGD,IAAI,aAAa,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACrC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,IAAI,MAAM,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC;gBAC3E,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC1B,OAAO,CAAC,MAAM,GAAG,UAAU,CAAC;gBAC9B,CAAC;YACH,CAAC;YAGD,IAAI,aAAa,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;gBACvC,MAAM,YAAY,GAAG,eAAe,CAAC,MAAM,CAAC,IAAI,IAAI,MAAM,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC;gBAC/E,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC5B,OAAO,CAAC,QAAQ,GAAG,YAAY,CAAC;gBAClC,CAAC;YACH,CAAC;YAGD,MAAM,eAAe,GAAgB;gBACnC,GAAG;gBACH,OAAO,EAAE,IAAI;gBACb,GAAG,EAAE,OAAO;gBACZ,GAAG,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,WAAW,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC/E,CAAC;YAGF,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,eAAe;gBACrB,UAAU,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE;aACxB,CAAC,CAAC;QAEL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAEf,IAAI,IAAI,CAAC,cAAc,EAAE,EAAE,CAAC;gBAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC;gBAC5B,MAAM,cAAc,GAAG,MAAC,KAAa,CAAC,SAAS,mCAAI,CAAC,CAAC;gBACrD,UAAU,CAAC,IAAI,CAAC;oBACd,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI;oBACnB,KAAK,EAAE,IAAI,iCAAkB,CAAC,IAAI,EAAG,KAAe,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,cAAc,EAAE,CAAC;oBAC5F,UAAU,EAAE,EAAE,IAAI,EAAE,CAAC,EAAE;iBACxB,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAOD,SAAS,aAAa,CAAC,IAAY;IACjC,MAAM,UAAU,GAAU,EAAE,CAAC;IAC7B,MAAM,KAAK,GAAG,4EAA4E,CAAC;IAC3F,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC3C,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YACzC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxB,CAAC;QAAC,OAAO,CAAC,EAAE,
CAAC;QAEb,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAKD,SAAS,eAAe,CAAC,IAAY;IACnC,MAAM,YAAY,GAA0C,EAAE,CAAC;IAC/D,MAAM,KAAK,GAAG,oGAAoG,CAAC;IACnH,MAAM,MAAM,GAAG,oGAAoG,CAAC;IACpH,MAAM,MAAM,GAAG,oGAAoG,CAAC;IAEpH,IAAI,KAAK,CAAC;IAEV,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC3C,YAAY,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,OAAO,CAAC,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5C,YAAY,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,OAAO,CAAC,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5C,YAAY,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACxD,CAAC;IAGD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,OAAO,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;QAC/B,MAAM,GAAG,GAAG,GAAG,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;QAChC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACd,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;AACL,CAAC;AAKD,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC3D,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;AAC/B,CAAC"}
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
export { getCrawl4aiClient, createBrowserConfig, createCrawlerRunConfig, safeJsonParse, cleanText, isValidUrl } from '../../Crawl4aiPlusBasicCrawler/helpers/utils';
|
|
1
|
+
export { getCrawl4aiClient, createBrowserConfig, createCrawlerRunConfig, safeJsonParse, cleanText, isValidUrl, buildLlmConfig, validateLlmCredentials, } from '../../Crawl4aiPlusBasicCrawler/helpers/utils';
|
|
2
|
+
export type { LlmConfigResult } from '../../Crawl4aiPlusBasicCrawler/helpers/utils';
|
|
2
3
|
import { IDataObject } from 'n8n-workflow';
|
|
3
4
|
import { CssSelectorSchema, LlmSchema } from './interfaces';
|
|
4
5
|
export declare function createCssSelectorExtractionStrategy(schema: CssSelectorSchema): any;
|
|
5
|
-
export declare function createLlmExtractionStrategy(schema: LlmSchema, instruction: string, provider: string, apiKey?: string, baseUrl?: string): any;
|
|
6
|
+
export declare function createLlmExtractionStrategy(schema: LlmSchema, instruction: string, provider: string, apiKey?: string, baseUrl?: string, inputFormat?: 'markdown' | 'html' | 'fit_markdown'): any;
|
|
7
|
+
export declare function createCosineExtractionStrategy(semanticFilter: string, options?: IDataObject): any;
|
|
6
8
|
export declare function cleanExtractedData(data: IDataObject): IDataObject;
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.isValidUrl = exports.cleanText = exports.safeJsonParse = exports.createCrawlerRunConfig = exports.createBrowserConfig = exports.getCrawl4aiClient = void 0;
|
|
3
|
+
exports.validateLlmCredentials = exports.buildLlmConfig = exports.isValidUrl = exports.cleanText = exports.safeJsonParse = exports.createCrawlerRunConfig = exports.createBrowserConfig = exports.getCrawl4aiClient = void 0;
|
|
4
4
|
exports.createCssSelectorExtractionStrategy = createCssSelectorExtractionStrategy;
|
|
5
5
|
exports.createLlmExtractionStrategy = createLlmExtractionStrategy;
|
|
6
|
+
exports.createCosineExtractionStrategy = createCosineExtractionStrategy;
|
|
6
7
|
exports.cleanExtractedData = cleanExtractedData;
|
|
7
8
|
var utils_1 = require("../../Crawl4aiPlusBasicCrawler/helpers/utils");
|
|
8
9
|
Object.defineProperty(exports, "getCrawl4aiClient", { enumerable: true, get: function () { return utils_1.getCrawl4aiClient; } });
|
|
@@ -11,6 +12,8 @@ Object.defineProperty(exports, "createCrawlerRunConfig", { enumerable: true, get
|
|
|
11
12
|
Object.defineProperty(exports, "safeJsonParse", { enumerable: true, get: function () { return utils_1.safeJsonParse; } });
|
|
12
13
|
Object.defineProperty(exports, "cleanText", { enumerable: true, get: function () { return utils_1.cleanText; } });
|
|
13
14
|
Object.defineProperty(exports, "isValidUrl", { enumerable: true, get: function () { return utils_1.isValidUrl; } });
|
|
15
|
+
Object.defineProperty(exports, "buildLlmConfig", { enumerable: true, get: function () { return utils_1.buildLlmConfig; } });
|
|
16
|
+
Object.defineProperty(exports, "validateLlmCredentials", { enumerable: true, get: function () { return utils_1.validateLlmCredentials; } });
|
|
14
17
|
const utils_2 = require("../../Crawl4aiPlusBasicCrawler/helpers/utils");
|
|
15
18
|
function createCssSelectorExtractionStrategy(schema) {
|
|
16
19
|
return {
|
|
@@ -32,7 +35,7 @@ function createCssSelectorExtractionStrategy(schema) {
|
|
|
32
35
|
},
|
|
33
36
|
};
|
|
34
37
|
}
|
|
35
|
-
function createLlmExtractionStrategy(schema, instruction, provider, apiKey, baseUrl) {
|
|
38
|
+
function createLlmExtractionStrategy(schema, instruction, provider, apiKey, baseUrl, inputFormat) {
|
|
36
39
|
const llmConfigParams = {
|
|
37
40
|
provider: provider || 'openai/gpt-4o',
|
|
38
41
|
api_token: apiKey,
|
|
@@ -40,22 +43,56 @@ function createLlmExtractionStrategy(schema, instruction, provider, apiKey, base
|
|
|
40
43
|
if (baseUrl && baseUrl.trim() !== '') {
|
|
41
44
|
llmConfigParams.api_base = baseUrl;
|
|
42
45
|
}
|
|
46
|
+
const strategyParams = {
|
|
47
|
+
llm_config: {
|
|
48
|
+
type: 'LLMConfig',
|
|
49
|
+
params: llmConfigParams,
|
|
50
|
+
},
|
|
51
|
+
instruction,
|
|
52
|
+
schema: {
|
|
53
|
+
type: 'dict',
|
|
54
|
+
value: schema,
|
|
55
|
+
},
|
|
56
|
+
extraction_type: 'schema',
|
|
57
|
+
apply_chunking: false,
|
|
58
|
+
force_json_response: true,
|
|
59
|
+
};
|
|
60
|
+
if (inputFormat && inputFormat !== 'markdown') {
|
|
61
|
+
strategyParams.input_format = inputFormat;
|
|
62
|
+
}
|
|
43
63
|
return {
|
|
44
64
|
type: 'LLMExtractionStrategy',
|
|
45
|
-
params:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
65
|
+
params: strategyParams,
|
|
66
|
+
};
|
|
67
|
+
}
|
|
68
|
+
function createCosineExtractionStrategy(semanticFilter, options = {}) {
|
|
69
|
+
const strategyParams = {
|
|
70
|
+
semantic_filter: semanticFilter,
|
|
71
|
+
};
|
|
72
|
+
if (options.wordCountThreshold !== undefined) {
|
|
73
|
+
strategyParams.word_count_threshold = Number(options.wordCountThreshold);
|
|
74
|
+
}
|
|
75
|
+
if (options.simThreshold !== undefined) {
|
|
76
|
+
strategyParams.sim_threshold = Number(options.simThreshold);
|
|
77
|
+
}
|
|
78
|
+
if (options.maxDist !== undefined) {
|
|
79
|
+
strategyParams.max_dist = Number(options.maxDist);
|
|
80
|
+
}
|
|
81
|
+
if (options.linkageMethod !== undefined && options.linkageMethod !== '') {
|
|
82
|
+
strategyParams.linkage_method = String(options.linkageMethod);
|
|
83
|
+
}
|
|
84
|
+
if (options.topK !== undefined) {
|
|
85
|
+
strategyParams.top_k = Number(options.topK);
|
|
86
|
+
}
|
|
87
|
+
if (options.modelName !== undefined && options.modelName !== '') {
|
|
88
|
+
strategyParams.model_name = String(options.modelName);
|
|
89
|
+
}
|
|
90
|
+
if (options.verbose === true) {
|
|
91
|
+
strategyParams.verbose = true;
|
|
92
|
+
}
|
|
93
|
+
return {
|
|
94
|
+
type: 'CosineStrategy',
|
|
95
|
+
params: strategyParams,
|
|
59
96
|
};
|
|
60
97
|
}
|
|
61
98
|
function cleanExtractedData(data) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"utils.js","sourceRoot":"","sources":["../../../../nodes/Crawl4aiPlusContentExtractor/helpers/utils.ts"],"names":[],"mappings":";;;
|
|
1
|
+
{"version":3,"file":"utils.js","sourceRoot":"","sources":["../../../../nodes/Crawl4aiPlusContentExtractor/helpers/utils.ts"],"names":[],"mappings":";;;AAwBA,kFAmBC;AAYD,kEA0CC;AAQD,wEAyCC;AAKD,gDAyBC;AA/KD,sEASsD;AARpD,0GAAA,iBAAiB,OAAA;AACjB,4GAAA,mBAAmB,OAAA;AACnB,+GAAA,sBAAsB,OAAA;AACtB,sGAAA,aAAa,OAAA;AACb,kGAAA,SAAS,OAAA;AACT,mGAAA,UAAU,OAAA;AACV,uGAAA,cAAc,OAAA;AACd,+GAAA,sBAAsB,OAAA;AAOxB,wEAAyE;AAQzE,SAAgB,mCAAmC,CAAC,MAAyB;IAC3E,OAAO;QACL,IAAI,EAAE,2BAA2B;QACjC,MAAM,EAAE;YACN,MAAM,EAAE;gBACN,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE;oBACL,IAAI,EAAE,MAAM,CAAC,IAAI;oBACjB,YAAY,EAAE,MAAM,CAAC,YAAY;oBACjC,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;wBAClC,IAAI,EAAE,KAAK,CAAC,IAAI;wBAChB,QAAQ,EAAE,KAAK,CAAC,QAAQ;wBACxB,IAAI,EAAE,KAAK,CAAC,IAAI;wBAChB,SAAS,EAAE,KAAK,CAAC,SAAS;qBAC3B,CAAC,CAAC;iBACJ;aACF;SACF;KACF,CAAC;AACJ,CAAC;AAYD,SAAgB,2BAA2B,CACzC,MAAiB,EACjB,WAAmB,EACnB,QAAgB,EAChB,MAAe,EACf,OAAgB,EAChB,WAAkD;IAElD,MAAM,eAAe,GAAQ;QAC3B,QAAQ,EAAE,QAAQ,IAAI,eAAe;QACrC,SAAS,EAAE,MAAM;KAClB,CAAC;IAGF,IAAI,OAAO,IAAI,OAAO,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;QACrC,eAAe,CAAC,QAAQ,GAAG,OAAO,CAAC;IACrC,CAAC;IAED,MAAM,cAAc,GAAQ;QAC1B,UAAU,EAAE;YACV,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,eAAe;SACxB;QACD,WAAW;QACX,MAAM,EAAE;YACN,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE,MAAM;SACd;QACD,eAAe,EAAE,QAAQ;QACzB,cAAc,EAAE,KAAK;QACrB,mBAAmB,EAAE,IAAI;KAC1B,CAAC;IAGF,IAAI,WAAW,IAAI,WAAW,KAAK,UAAU,EAAE,CAAC;QAC9C,cAAc,CAAC,YAAY,GAAG,WAAW,CAAC;IAC5C,CAAC;IAED,OAAO;QACL,IAAI,EAAE,uBAAuB;QAC7B,MAAM,EAAE,cAAc;KACvB,CAAC;AACJ,CAAC;AAQD,SAAgB,8BAA8B,CAC5C,cAAsB,EACtB,UAAuB,EAAE;IAEzB,MAAM,cAAc,GAAQ;QAC1B,eAAe,EAAE,cAAc;KAChC,CAAC;IAGF,IAAI,OAAO,CAAC,kBAAkB,KAAK,SAAS,EAAE,CAAC;QAC7C,cAAc,CAAC,oBAAoB,GAAG,MAAM,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;IAC3E,CAAC;IAED,IAAI,OAAO,CAAC,YAAY,KAAK,SAAS,EAAE,CAAC;QACvC,cAAc,CAAC,aAAa,GAAG,MAAM,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IAC9D,CAAC;IAED,IAAI,OAAO,CAAC,OAAO,KAAK,SAAS,EAAE,CAAC;QAClC,cAAc,CAAC,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IACpD,CAAC;IAED,IAAI,OAAO,CA
AC,aAAa,KAAK,SAAS,IAAI,OAAO,CAAC,aAAa,KAAK,EAAE,EAAE,CAAC;QACxE,cAAc,CAAC,cAAc,GAAG,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IAChE,CAAC;IAED,IAAI,OAAO,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;QAC/B,cAAc,CAAC,KAAK,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAC9C,CAAC;IAED,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,IAAI,OAAO,CAAC,SAAS,KAAK,EAAE,EAAE,CAAC;QAChE,cAAc,CAAC,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IACxD,CAAC;IAED,IAAI,OAAO,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;QAC7B,cAAc,CAAC,OAAO,GAAG,IAAI,CAAC;IAChC,CAAC;IAED,OAAO;QACL,IAAI,EAAE,gBAAgB;QACtB,MAAM,EAAE,cAAc;KACvB,CAAC;AACJ,CAAC;AAKD,SAAgB,kBAAkB,CAAC,IAAiB;IAClD,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IAErB,MAAM,WAAW,GAAgB,EAAE,CAAC;IAEpC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE;QAC5C,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,WAAW,CAAC,GAAG,CAAC,GAAG,IAAA,iBAAS,EAAC,KAAK,CAAC,CAAC;QACtC,CAAC;aAAM,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;YAChC,WAAW,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE;gBAClC,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;oBAC7B,OAAO,IAAA,iBAAS,EAAC,IAAI,CAAC,CAAC;gBACzB,CAAC;qBAAM,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;oBACrD,OAAO,kBAAkB,CAAC,IAAmB,CAAC,CAAC;gBACjD,CAAC;gBACD,OAAO,IAAI,CAAC;YACd,CAAC,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACvD,WAAW,CAAC,GAAG,CAAC,GAAG,kBAAkB,CAAC,KAAoB,CAAC,CAAC;QAC9D,CAAC;aAAM,CAAC;YACN,WAAW,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QAC3B,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,WAAW,CAAC;AACrB,CAAC"}
|