@kreuzberg/kreuzcrawl 0.1.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +18 -0
- package/build.rs +5 -0
- package/index.d.ts +404 -0
- package/index.js +592 -0
- package/npm/darwin-arm64/README.md +3 -0
- package/npm/darwin-arm64/kreuzcrawl-node.darwin-arm64.node +0 -0
- package/npm/darwin-arm64/package.json +19 -0
- package/npm/kreuzberg-kreuzcrawl-darwin-arm64-0.1.0-rc.10.tgz +0 -0
- package/npm/kreuzberg-kreuzcrawl-linux-arm64-gnu-0.1.0-rc.10.tgz +0 -0
- package/npm/kreuzberg-kreuzcrawl-linux-x64-gnu-0.1.0-rc.10.tgz +0 -0
- package/npm/kreuzberg-kreuzcrawl-win32-x64-msvc-0.1.0-rc.10.tgz +0 -0
- package/npm/linux-arm64-gnu/README.md +3 -0
- package/npm/linux-arm64-gnu/kreuzcrawl-node.linux-arm64-gnu.node +0 -0
- package/npm/linux-arm64-gnu/package.json +22 -0
- package/npm/linux-x64-gnu/README.md +3 -0
- package/npm/linux-x64-gnu/kreuzcrawl-node.linux-x64-gnu.node +0 -0
- package/npm/linux-x64-gnu/package.json +22 -0
- package/npm/win32-x64-msvc/README.md +3 -0
- package/npm/win32-x64-msvc/kreuzcrawl-node.win32-x64-msvc.node +0 -0
- package/npm/win32-x64-msvc/package.json +19 -0
- package/package.json +43 -0
- package/src/lib.rs +1928 -0
package/Cargo.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Crate manifest for the Node.js binding crate.
[package]
name = "kreuzcrawl-node"
version = "0.1.0-rc.10"
edition = "2024"
license = "Elastic-2.0"

# Only a `cdylib` artifact is requested for this crate.
[lib]
crate-type = ["cdylib"]

[dependencies]
napi-derive = "3"
serde_json = "1"

# Sibling crate referenced by relative path (not a registry version).
[dependencies.kreuzcrawl]
path = "../kreuzcrawl"

[dependencies.napi]
version = "3"
features = ["async"]

[dependencies.serde]
version = "1"
features = ["derive"]

[build-dependencies]
napi-build = "2"
|
package/build.rs
ADDED
package/index.d.ts
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
/* auto-generated by NAPI-RS */
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
/**
 * Engine handle type.
 *
 * The declaration exposes no members, so from the TypeScript side a value of
 * this type can only be held and passed to functions that accept it.
 */
export declare class JsCrawlEngineHandle {}
|
|
6
|
+
|
|
7
|
+
/**
 * Crawls a list of URLs using the supplied engine handle.
 *
 * @param engine - Engine handle to run against.
 * @param urls - URLs to process.
 * @returns A promise for an array of {@link JsBatchCrawlResult} entries.
 *   NOTE(review): presumably one entry per input URL — confirm against the binding.
 */
export declare function batchCrawl(
  engine: JsCrawlEngineHandle,
  urls: string[],
): Promise<JsBatchCrawlResult[]>;
|
|
8
|
+
|
|
9
|
+
/**
 * Scrapes a list of URLs using the supplied engine handle.
 *
 * @param engine - Engine handle to run against.
 * @param urls - URLs to process.
 * @returns A promise for an array of {@link JsBatchScrapeResult} entries.
 *   NOTE(review): presumably one entry per input URL — confirm against the binding.
 */
export declare function batchScrape(
  engine: JsCrawlEngineHandle,
  urls: string[],
): Promise<JsBatchScrapeResult[]>;
|
|
10
|
+
|
|
11
|
+
/**
 * Crawls starting from a single URL using the supplied engine handle.
 *
 * @param engine - Engine handle to run against.
 * @param url - URL passed to the crawl.
 * @returns A promise for the {@link JsCrawlResult}.
 */
export declare function crawl(
  engine: JsCrawlEngineHandle,
  url: string,
): Promise<JsCrawlResult>;
|
|
12
|
+
|
|
13
|
+
/**
 * Creates an engine handle; unlike the other exported functions this one
 * returns its result directly rather than through a promise.
 *
 * @param config - Optional configuration; may be omitted, `undefined`, or `null`.
 * @returns The new {@link JsCrawlEngineHandle}.
 */
export declare function createEngine(
  config?: JsCrawlConfig | null,
): JsCrawlEngineHandle;
|
|
14
|
+
|
|
15
|
+
/**
 * Result record for one action; every field is optional.
 *
 * NOTE(review): field meanings are inferred from their names only — confirm
 * against the Rust binding.
 */
export interface JsActionResult {
  actionIndex?: number;
  actionType?: string;
  success?: boolean;
  data?: string;
  error?: string;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Article-level metadata; every field is optional.
 *
 * Timestamps are typed as plain strings.
 * NOTE(review): their format is not visible here — confirm.
 */
export interface JsArticleMetadata {
  publishedTime?: string;
  modifiedTime?: string;
  author?: string;
  section?: string;
  tags?: string[];
}
|
|
30
|
+
|
|
31
|
+
/**
 * String-valued asset categories. Each member's value is the lowercase form
 * of its name.
 */
export declare const enum JsAssetCategory {
  Document = "document",
  Image = "image",
  Audio = "audio",
  Video = "video",
  Font = "font",
  Stylesheet = "stylesheet",
  Script = "script",
  Archive = "archive",
  Data = "data",
  Other = "other",
}
|
|
43
|
+
|
|
44
|
+
/**
 * Authentication settings. `type` is the only required field.
 *
 * NOTE(review): presumably `type` selects which of the optional fields
 * (username/password, token, name/value) apply — the accepted values are not
 * visible here; confirm against the Rust binding.
 */
export interface JsAuthConfig {
  type: string;
  username?: string;
  password?: string;
  token?: string;
  name?: string;
  value?: string;
}
|
|
52
|
+
|
|
53
|
+
/**
 * One entry of a batch crawl: an optional URL, an optional
 * {@link JsCrawlResult}, and an optional error string.
 *
 * NOTE(review): presumably `result` and `error` are mutually exclusive —
 * the type does not enforce it; confirm.
 */
export interface JsBatchCrawlResult {
  url?: string;
  result?: JsCrawlResult;
  error?: string;
}
|
|
58
|
+
|
|
59
|
+
/**
 * One entry of a batch scrape: an optional URL, an optional
 * {@link JsScrapeResult}, and an optional error string.
 *
 * NOTE(review): presumably `result` and `error` are mutually exclusive —
 * the type does not enforce it; confirm.
 */
export interface JsBatchScrapeResult {
  url?: string;
  result?: JsScrapeResult;
  error?: string;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Browser-related settings; every field is optional.
 *
 * NOTE(review): units of `timeout` and `extraWait` are not visible here —
 * confirm before relying on them.
 */
export interface JsBrowserConfig {
  mode?: JsBrowserMode;
  endpoint?: string;
  timeout?: number;
  wait?: JsBrowserWait;
  waitSelector?: string;
  extraWait?: number;
}
|
|
73
|
+
|
|
74
|
+
/** String-valued browser modes: `auto`, `always`, `never`. */
export declare const enum JsBrowserMode {
  Auto = "auto",
  Always = "always",
  Never = "never",
}
|
|
79
|
+
|
|
80
|
+
/**
 * String-valued wait strategies. Values are snake_case
 * (`network_idle`, `selector`, `fixed`).
 */
export declare const enum JsBrowserWait {
  NetworkIdle = "network_idle",
  Selector = "selector",
  Fixed = "fixed",
}
|
|
85
|
+
|
|
86
|
+
/**
 * Cached page record; every field is optional.
 *
 * NOTE(review): `cachedAt` is a number, presumably a timestamp — its epoch and
 * unit are not visible here; confirm.
 */
export interface JsCachedPage {
  url?: string;
  statusCode?: number;
  contentType?: string;
  body?: string;
  etag?: string;
  lastModified?: string;
  cachedAt?: number;
}
|
|
95
|
+
|
|
96
|
+
/** A single citation reference: optional numeric index, URL, and text. */
export interface JsCitationReference {
  index?: number;
  url?: string;
  text?: string;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Citation output: optional content string plus an optional list of
 * {@link JsCitationReference} entries.
 */
export interface JsCitationResult {
  content?: string;
  references?: JsCitationReference[];
}
|
|
106
|
+
|
|
107
|
+
/** Cookie record with optional name, value, domain, and path strings. */
export interface JsCookieInfo {
  name?: string;
  value?: string;
  domain?: string;
  path?: string;
}
|
|
113
|
+
|
|
114
|
+
/**
 * Engine configuration accepted by `createEngine`. Every field is optional.
 *
 * NOTE(review): defaults, units (e.g. for `requestTimeout`, `maxBodySize`,
 * `maxAssetSize`, `documentMaxSize`) and the pattern syntax of
 * `includePaths`/`excludePaths` are not visible in this declaration — confirm
 * against the Rust binding.
 */
export interface JsCrawlConfig {
  // Limits
  maxDepth?: number;
  maxPages?: number;
  maxConcurrent?: number;

  // Scope and politeness
  respectRobotsTxt?: boolean;
  userAgent?: string;
  stayOnDomain?: boolean;
  allowSubdomains?: boolean;
  includePaths?: string[];
  excludePaths?: string[];

  // Request behaviour
  customHeaders?: Record<string, string>;
  requestTimeout?: number;
  maxRedirects?: number;
  retryCount?: number;
  retryCodes?: number[];
  cookiesEnabled?: boolean;
  auth?: JsAuthConfig;
  maxBodySize?: number;

  // Content handling
  mainContentOnly?: boolean;
  removeTags?: string[];
  mapLimit?: number;
  mapSearch?: string;

  // Assets
  downloadAssets?: boolean;
  assetTypes?: JsAssetCategory[];
  maxAssetSize?: number;

  // Browser, proxy, user agents
  browser?: JsBrowserConfig;
  proxy?: JsProxyConfig;
  userAgents?: string[];
  captureScreenshot?: boolean;

  // Documents and output
  downloadDocuments?: boolean;
  documentMaxSize?: number;
  documentMimeTypes?: string[];
  warcOutput?: string;
  browserProfile?: string;
  saveBrowserProfile?: boolean;
}
|
|
150
|
+
|
|
151
|
+
/**
 * String-valued crawl event kinds. Unlike the other enums in this file, the
 * values keep the PascalCase spelling of the member names.
 */
export declare const enum JsCrawlEvent {
  Page = "Page",
  Error = "Error",
  Complete = "Complete",
}
|
|
156
|
+
|
|
157
|
+
/**
 * Per-page record inside a {@link JsCrawlResult}. Every field is optional.
 *
 * NOTE(review): `extractedData` is typed as a string — presumably serialized
 * data; its format is not visible here, confirm before parsing.
 */
export interface JsCrawlPageResult {
  // Identity and response
  url?: string;
  normalizedUrl?: string;
  statusCode?: number;
  contentType?: string;
  html?: string;
  bodySize?: number;

  // Extracted structures
  metadata?: JsPageMetadata;
  links?: JsLinkInfo[];
  images?: JsImageInfo[];
  feeds?: JsFeedInfo[];
  jsonLd?: JsJsonLdEntry[];

  // Crawl state flags
  depth?: number;
  stayedOnDomain?: boolean;
  wasSkipped?: boolean;
  isPdf?: boolean;
  detectedCharset?: string;

  // Derived content
  markdown?: JsMarkdownResult;
  extractedData?: string;
  extractionMeta?: JsExtractionMeta;
  downloadedDocument?: JsDownloadedDocument;
}
|
|
179
|
+
|
|
180
|
+
/**
 * Result type of `crawl`: a list of {@link JsCrawlPageResult} pages plus
 * crawl-level fields. Every field is optional.
 */
export interface JsCrawlResult {
  pages?: JsCrawlPageResult[];
  finalUrl?: string;
  redirectCount?: number;
  wasSkipped?: boolean;
  error?: string;
  cookies?: JsCookieInfo[];
  normalizedUrls?: string[];
}
|
|
189
|
+
|
|
190
|
+
/**
 * Descriptor of a downloaded asset; every field is optional. The declaration
 * carries descriptive fields only — no content/body field is present.
 *
 * NOTE(review): the unit of `size` and the hash algorithm behind
 * `contentHash` are not visible here — confirm.
 */
export interface JsDownloadedAsset {
  url?: string;
  contentHash?: string;
  mimeType?: string;
  size?: number;
  assetCategory?: JsAssetCategory;
  htmlTag?: string;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Downloaded document record; every field is optional.
 *
 * NOTE(review): `content` is typed as a plain array of numbers, presumably the
 * raw bytes (one number per byte) — confirm against the Rust binding.
 */
export interface JsDownloadedDocument {
  url?: string;
  mimeType?: string;
  content?: number[];
  size?: number;
  filename?: string;
  contentHash?: string;
  headers?: Record<string, string>;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Bookkeeping attached to an extraction; every field is optional.
 *
 * NOTE(review): the currency/unit of `cost` is not visible here — confirm.
 */
export interface JsExtractionMeta {
  cost?: number;
  promptTokens?: number;
  completionTokens?: number;
  model?: string;
  chunksProcessed?: number;
}
|
|
216
|
+
|
|
217
|
+
/**
 * Favicon record with optional URL, `rel`, `sizes`, and MIME type. All four
 * fields are strings, including `sizes`.
 */
export interface JsFaviconInfo {
  url?: string;
  rel?: string;
  sizes?: string;
  mimeType?: string;
}
|
|
223
|
+
|
|
224
|
+
/** Feed record: optional URL, title, and {@link JsFeedType}. */
export interface JsFeedInfo {
  url?: string;
  title?: string;
  feedType?: JsFeedType;
}
|
|
229
|
+
|
|
230
|
+
/** String-valued feed kinds: `rss`, `atom`, `json_feed`. */
export declare const enum JsFeedType {
  Rss = "rss",
  Atom = "atom",
  JsonFeed = "json_feed",
}
|
|
235
|
+
|
|
236
|
+
/**
 * Heading record: optional numeric level and text.
 *
 * NOTE(review): the range of `level` is not visible here — confirm.
 */
export interface JsHeadingInfo {
  level?: number;
  text?: string;
}
|
|
240
|
+
|
|
241
|
+
/** Hreflang record pairing an optional language string with an optional URL. */
export interface JsHreflangEntry {
  lang?: string;
  url?: string;
}
|
|
245
|
+
|
|
246
|
+
/**
 * Image record: optional URL, alt text, numeric dimensions, and the
 * {@link JsImageSource} it was found in.
 */
export interface JsImageInfo {
  url?: string;
  alt?: string;
  width?: number;
  height?: number;
  source?: JsImageSource;
}
|
|
253
|
+
|
|
254
|
+
/**
 * String-valued image origins. Values are snake_case
 * (`img`, `picture_source`, `og_image`, `twitter_image`).
 */
export declare const enum JsImageSource {
  Img = "img",
  PictureSource = "picture_source",
  OgImage = "og_image",
  TwitterImage = "twitter_image",
}
|
|
260
|
+
|
|
261
|
+
/**
 * Result of an interaction: a list of {@link JsActionResult} entries plus
 * final HTML, final URL, and screenshot. Every field is optional.
 *
 * NOTE(review): `screenshot` is typed as a plain array of numbers, presumably
 * raw image bytes — the image format is not visible here; confirm.
 */
export interface JsInteractionResult {
  actionResults?: JsActionResult[];
  finalHtml?: string;
  finalUrl?: string;
  screenshot?: number[];
}
|
|
267
|
+
|
|
268
|
+
/**
 * JSON-LD record: optional schema type, name, and raw string.
 *
 * NOTE(review): `raw` is presumably the unparsed JSON-LD text — confirm.
 */
export interface JsJsonLdEntry {
  schemaType?: string;
  name?: string;
  raw?: string;
}
|
|
273
|
+
|
|
274
|
+
/**
 * Link record: optional URL, text, {@link JsLinkType}, `rel` string, and
 * `nofollow` flag.
 */
export interface JsLinkInfo {
  url?: string;
  text?: string;
  linkType?: JsLinkType;
  rel?: string;
  nofollow?: boolean;
}
|
|
281
|
+
|
|
282
|
+
/** String-valued link kinds: `internal`, `external`, `anchor`, `document`. */
export declare const enum JsLinkType {
  Internal = "internal",
  External = "external",
  Anchor = "anchor",
  Document = "document",
}
|
|
288
|
+
|
|
289
|
+
/** Result type of `mapUrls`: an optional list of {@link JsSitemapUrl} entries. */
export interface JsMapResult {
  urls?: JsSitemapUrl[];
}
|
|
292
|
+
|
|
293
|
+
/**
 * Markdown conversion output; every field is optional.
 *
 * NOTE(review): `documentStructure` and the `tables` entries are typed as
 * strings — presumably serialized; their format is not visible here, confirm.
 */
export interface JsMarkdownResult {
  content?: string;
  documentStructure?: string;
  tables?: string[];
  warnings?: string[];
  citations?: JsCitationResult;
  fitContent?: string;
}
|
|
301
|
+
|
|
302
|
+
/**
 * Page-level metadata. Every field is optional, and all scalar fields except
 * `wordCount` are strings (including `keywords`, which is a single string
 * rather than a list).
 *
 * NOTE(review): the `og*`, `twitter*` and `dc*` prefixes presumably correspond
 * to Open Graph, Twitter Card and Dublin Core tags — confirm against the
 * extractor.
 */
export interface JsPageMetadata {
  // General document fields
  title?: string;
  description?: string;
  canonicalUrl?: string;
  keywords?: string;
  author?: string;
  viewport?: string;
  themeColor?: string;
  generator?: string;
  robots?: string;
  htmlLang?: string;
  htmlDir?: string;

  // og-prefixed fields
  ogTitle?: string;
  ogType?: string;
  ogImage?: string;
  ogDescription?: string;
  ogUrl?: string;
  ogSiteName?: string;
  ogLocale?: string;
  ogVideo?: string;
  ogAudio?: string;
  ogLocaleAlternates?: string[];

  // twitter-prefixed fields
  twitterCard?: string;
  twitterTitle?: string;
  twitterDescription?: string;
  twitterImage?: string;
  twitterSite?: string;
  twitterCreator?: string;

  // dc-prefixed fields
  dcTitle?: string;
  dcCreator?: string;
  dcSubject?: string;
  dcDescription?: string;
  dcPublisher?: string;
  dcDate?: string;
  dcType?: string;
  dcFormat?: string;
  dcIdentifier?: string;
  dcLanguage?: string;
  dcRights?: string;

  // Nested structures and counters
  article?: JsArticleMetadata;
  hreflangs?: JsHreflangEntry[];
  favicons?: JsFaviconInfo[];
  headings?: JsHeadingInfo[];
  wordCount?: number;
}
|
|
347
|
+
|
|
348
|
+
/**
 * Proxy settings: optional URL plus optional credentials. Note that `url`
 * itself is optional in this declaration.
 */
export interface JsProxyConfig {
  url?: string;
  username?: string;
  password?: string;
}
|
|
353
|
+
|
|
354
|
+
/**
 * Response metadata; every field is an optional string.
 *
 * NOTE(review): the field names mirror HTTP response header names
 * (ETag, Last-Modified, Cache-Control, ...) — presumably raw header values;
 * confirm.
 */
export interface JsResponseMeta {
  etag?: string;
  lastModified?: string;
  cacheControl?: string;
  server?: string;
  xPoweredBy?: string;
  contentLanguage?: string;
  contentEncoding?: string;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Result type of `scrape`. Every field is optional.
 *
 * NOTE(review): `screenshot` is a plain array of numbers (presumably raw image
 * bytes) and `extractedData` is a string (presumably serialized data) — their
 * formats are not visible here; confirm. The unit of `crawlDelay` is likewise
 * not visible.
 */
export interface JsScrapeResult {
  // Response
  statusCode?: number;
  contentType?: string;
  html?: string;
  bodySize?: number;

  // Extracted structures
  metadata?: JsPageMetadata;
  links?: JsLinkInfo[];
  images?: JsImageInfo[];
  feeds?: JsFeedInfo[];
  jsonLd?: JsJsonLdEntry[];

  // Robots-related fields
  isAllowed?: boolean;
  crawlDelay?: number;
  noindexDetected?: boolean;
  nofollowDetected?: boolean;
  xRobotsTag?: string;

  // Processing flags
  isPdf?: boolean;
  wasSkipped?: boolean;
  detectedCharset?: string;
  mainContentOnly?: boolean;
  authHeaderSent?: boolean;
  responseMeta?: JsResponseMeta;
  assets?: JsDownloadedAsset[];
  jsRenderHint?: boolean;
  browserUsed?: boolean;

  // Derived content
  markdown?: JsMarkdownResult;
  extractedData?: string;
  extractionMeta?: JsExtractionMeta;
  screenshot?: number[];
  downloadedDocument?: JsDownloadedDocument;
}
|
|
394
|
+
|
|
395
|
+
/**
 * URL entry inside a {@link JsMapResult}. Every field is an optional string —
 * including `priority`, which is not typed as a number.
 */
export interface JsSitemapUrl {
  url?: string;
  lastmod?: string;
  changefreq?: string;
  priority?: string;
}
|
|
401
|
+
|
|
402
|
+
/**
 * Produces a URL map for the given URL using the supplied engine handle.
 *
 * @param engine - Engine handle to run against.
 * @param url - URL passed to the mapping operation.
 * @returns A promise for the {@link JsMapResult}.
 */
export declare function mapUrls(
  engine: JsCrawlEngineHandle,
  url: string,
): Promise<JsMapResult>;
|
|
403
|
+
|
|
404
|
+
/**
 * Scrapes a single URL using the supplied engine handle.
 *
 * @param engine - Engine handle to run against.
 * @param url - URL to scrape.
 * @returns A promise for the {@link JsScrapeResult}.
 */
export declare function scrape(
  engine: JsCrawlEngineHandle,
  url: string,
): Promise<JsScrapeResult>;
|