@kreuzberg/kreuzcrawl 0.1.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml ADDED
@@ -0,0 +1,18 @@
1
+ [package] # Manifest of the Rust crate shipped inside this npm package
2
+ name = "kreuzcrawl-node"
3
+ version = "0.1.0-rc.10"
4
+ edition = "2024" # the edition applies to build.rs as well as the library
5
+ license = "Elastic-2.0"
6
+
7
+ [lib]
8
+ crate-type = ["cdylib"] # build a C-ABI dynamic library rather than an rlib
9
+
10
+ [dependencies]
11
+ kreuzcrawl = { path = "../kreuzcrawl" } # path dependency: resolved relative to this manifest, so the sibling directory must be present to build
12
+ napi = { version = "3", features = ["async"] }
13
+ napi-derive = "3"
14
+ serde = { version = "1", features = ["derive"] }
15
+ serde_json = "1"
16
+
17
+ [build-dependencies]
18
+ napi-build = "2" # provides napi_build::setup(), which build.rs calls
package/build.rs ADDED
@@ -0,0 +1,5 @@
1
+ //! Build script: delegates all build configuration to `napi_build::setup`. No `extern crate` is needed in edition 2018+ (this package is edition 2024).
2
+
3
+ fn main() {
4
+     napi_build::setup();
5
+ }
package/index.d.ts ADDED
@@ -0,0 +1,404 @@
1
+ /* auto-generated by NAPI-RS */
2
+ /* eslint-disable */
3
+ export declare class JsCrawlEngineHandle { // Opaque handle: no public members are declared. Returned by createEngine() and taken as the first argument by the crawl/scrape/map functions.
4
+
5
+ }
6
+
7
+ export declare function batchCrawl(engine: JsCrawlEngineHandle, urls: Array<string>): Promise<Array<JsBatchCrawlResult>> // Async; resolves to an array of JsBatchCrawlResult (presumably one entry per input URL — confirm in the Rust binding)
8
+
9
+ export declare function batchScrape(engine: JsCrawlEngineHandle, urls: Array<string>): Promise<Array<JsBatchScrapeResult>> // Async; resolves to an array of JsBatchScrapeResult (presumably one entry per input URL — confirm in the Rust binding)
10
+
11
+ export declare function crawl(engine: JsCrawlEngineHandle, url: string): Promise<JsCrawlResult> // Async; takes a single URL and resolves to a JsCrawlResult
12
+
13
+ export declare function createEngine(config?: JsCrawlConfig | undefined | null): JsCrawlEngineHandle // Synchronous (no Promise); config may be omitted, undefined or null
14
+
15
+ export interface JsActionResult { // Element type of JsInteractionResult.actionResults; every field is optional
16
+ actionIndex?: number
17
+ actionType?: string
18
+ success?: boolean
19
+ data?: string
20
+ error?: string // presumably set only when success is false — confirm
21
+ }
22
+
23
+ export interface JsArticleMetadata { // Type of JsPageMetadata.article; every field is optional
24
+ publishedTime?: string // plain string; the date format is not visible in this declaration
25
+ modifiedTime?: string // plain string; the date format is not visible in this declaration
26
+ author?: string
27
+ section?: string
28
+ tags?: Array<string>
29
+ }
30
+
31
+ export declare const enum JsAssetCategory { // Used by JsCrawlConfig.assetTypes and JsDownloadedAsset.assetCategory. NOTE(review): ambient const enum — may be unusable for consumers compiling with isolatedModules; confirm
32
+ Document = 'document',
33
+ Image = 'image',
34
+ Audio = 'audio',
35
+ Video = 'video',
36
+ Font = 'font',
37
+ Stylesheet = 'stylesheet',
38
+ Script = 'script',
39
+ Archive = 'archive',
40
+ Data = 'data',
41
+ Other = 'other'
42
+ }
43
+
44
+ export interface JsAuthConfig { // Type of JsCrawlConfig.auth
45
+ type: string // the only required field; the accepted values are not visible in this declaration
46
+ username?: string // NOTE(review): which of the optional fields below apply presumably depends on `type` — confirm in the Rust source
47
+ password?: string
48
+ token?: string
49
+ name?: string
50
+ value?: string
51
+ }
52
+
53
+ export interface JsBatchCrawlResult { // Element type of the array batchCrawl() resolves to
54
+ url?: string
55
+ result?: JsCrawlResult
56
+ error?: string // presumably result and error are mutually exclusive — the types do not enforce it; confirm
57
+ }
58
+
59
+ export interface JsBatchScrapeResult { // Element type of the array batchScrape() resolves to
60
+ url?: string
61
+ result?: JsScrapeResult
62
+ error?: string // presumably result and error are mutually exclusive — the types do not enforce it; confirm
63
+ }
64
+
65
+ export interface JsBrowserConfig { // Type of JsCrawlConfig.browser; every field is optional
66
+ mode?: JsBrowserMode
67
+ endpoint?: string
68
+ timeout?: number // unit not visible in this declaration — TODO confirm
69
+ wait?: JsBrowserWait
70
+ waitSelector?: string // presumably used with wait = 'selector' — confirm
71
+ extraWait?: number // unit not visible in this declaration — TODO confirm
72
+ }
73
+
74
+ export declare const enum JsBrowserMode { // Type of JsBrowserConfig.mode; string-valued ambient const enum
75
+ Auto = 'auto',
76
+ Always = 'always',
77
+ Never = 'never'
78
+ }
79
+
80
+ export declare const enum JsBrowserWait { // Type of JsBrowserConfig.wait; string-valued ambient const enum with snake_case values
81
+ NetworkIdle = 'network_idle',
82
+ Selector = 'selector',
83
+ Fixed = 'fixed'
84
+ }
85
+
86
+ export interface JsCachedPage { // Not referenced by any other declaration visible in this file; every field is optional
87
+ url?: string
88
+ statusCode?: number
89
+ contentType?: string
90
+ body?: string
91
+ etag?: string
92
+ lastModified?: string
93
+ cachedAt?: number // numeric timestamp; epoch and unit are not visible here — TODO confirm
94
+ }
95
+
96
+ export interface JsCitationReference { // Element type of JsCitationResult.references; every field is optional
97
+ index?: number
98
+ url?: string
99
+ text?: string
100
+ }
101
+
102
+ export interface JsCitationResult { // Type of JsMarkdownResult.citations; every field is optional
103
+ content?: string
104
+ references?: Array<JsCitationReference>
105
+ }
106
+
107
+ export interface JsCookieInfo { // Element type of JsCrawlResult.cookies; every field is optional
108
+ name?: string
109
+ value?: string
110
+ domain?: string
111
+ path?: string
112
+ }
113
+
114
+ export interface JsCrawlConfig { // Optional argument of createEngine(); every field is optional, and the defaults are not visible in this declaration
115
+ maxDepth?: number
116
+ maxPages?: number
117
+ maxConcurrent?: number
118
+ respectRobotsTxt?: boolean
119
+ userAgent?: string // see also userAgents below; how the two interact is not visible here
120
+ stayOnDomain?: boolean
121
+ allowSubdomains?: boolean
122
+ includePaths?: Array<string> // pattern syntax (glob, regex or prefix) is not visible here — TODO confirm
123
+ excludePaths?: Array<string> // pattern syntax (glob, regex or prefix) is not visible here — TODO confirm
124
+ customHeaders?: Record<string, string>
125
+ requestTimeout?: number // unit not visible in this declaration — TODO confirm
126
+ maxRedirects?: number
127
+ retryCount?: number
128
+ retryCodes?: Array<number> // presumably HTTP status codes that trigger a retry — confirm
129
+ cookiesEnabled?: boolean
130
+ auth?: JsAuthConfig
131
+ maxBodySize?: number // presumably bytes — TODO confirm
132
+ mainContentOnly?: boolean
133
+ removeTags?: Array<string>
134
+ mapLimit?: number
135
+ mapSearch?: string
136
+ downloadAssets?: boolean
137
+ assetTypes?: Array<JsAssetCategory>
138
+ maxAssetSize?: number // presumably bytes — TODO confirm
139
+ browser?: JsBrowserConfig
140
+ proxy?: JsProxyConfig
141
+ userAgents?: Array<string>
142
+ captureScreenshot?: boolean
143
+ downloadDocuments?: boolean
144
+ documentMaxSize?: number // presumably bytes — TODO confirm
145
+ documentMimeTypes?: Array<string>
146
+ warcOutput?: string // presumably an output file path — confirm
147
+ browserProfile?: string
148
+ saveBrowserProfile?: boolean
149
+ }
150
+
151
+ export declare const enum JsCrawlEvent { // Not referenced by any other declaration visible in this file. Values are capitalised, unlike the lowercase values of the other enums here.
152
+ Page = 'Page',
153
+ Error = 'Error',
154
+ Complete = 'Complete'
155
+ }
156
+
157
+ export interface JsCrawlPageResult { // Element type of JsCrawlResult.pages; every field is optional
158
+ url?: string
159
+ normalizedUrl?: string
160
+ statusCode?: number
161
+ contentType?: string
162
+ html?: string
163
+ bodySize?: number // presumably bytes — TODO confirm
164
+ metadata?: JsPageMetadata
165
+ links?: Array<JsLinkInfo>
166
+ images?: Array<JsImageInfo>
167
+ feeds?: Array<JsFeedInfo>
168
+ jsonLd?: Array<JsJsonLdEntry>
169
+ depth?: number
170
+ stayedOnDomain?: boolean
171
+ wasSkipped?: boolean
172
+ isPdf?: boolean
173
+ detectedCharset?: string
174
+ markdown?: JsMarkdownResult
175
+ extractedData?: string // typed as a string; presumably serialized JSON — confirm
176
+ extractionMeta?: JsExtractionMeta
177
+ downloadedDocument?: JsDownloadedDocument
178
+ }
179
+
180
+ export interface JsCrawlResult { // Resolved value of crawl() and type of JsBatchCrawlResult.result; every field is optional
181
+ pages?: Array<JsCrawlPageResult>
182
+ finalUrl?: string
183
+ redirectCount?: number
184
+ wasSkipped?: boolean
185
+ error?: string
186
+ cookies?: Array<JsCookieInfo>
187
+ normalizedUrls?: Array<string>
188
+ }
189
+
190
+ export interface JsDownloadedAsset { // Element type of JsScrapeResult.assets; every field is optional
191
+ url?: string
192
+ contentHash?: string // hash algorithm is not visible in this declaration
193
+ mimeType?: string
194
+ size?: number // presumably bytes — TODO confirm
195
+ assetCategory?: JsAssetCategory
196
+ htmlTag?: string
197
+ }
198
+
199
+ export interface JsDownloadedDocument { // Type of the downloadedDocument field on JsCrawlPageResult and JsScrapeResult; every field is optional
200
+ url?: string
201
+ mimeType?: string
202
+ content?: Array<number> // a plain number array, not Buffer/Uint8Array; presumably one number per byte — confirm
203
+ size?: number // presumably bytes — TODO confirm
204
+ filename?: string
205
+ contentHash?: string // hash algorithm is not visible in this declaration
206
+ headers?: Record<string, string>
207
+ }
208
+
209
+ export interface JsExtractionMeta { // Type of the extractionMeta field on JsCrawlPageResult and JsScrapeResult; every field is optional
210
+ cost?: number // currency and unit are not visible in this declaration — TODO confirm
211
+ promptTokens?: number
212
+ completionTokens?: number
213
+ model?: string
214
+ chunksProcessed?: number
215
+ }
216
+
217
+ export interface JsFaviconInfo { // Element type of JsPageMetadata.favicons; every field is optional
218
+ url?: string
219
+ rel?: string
220
+ sizes?: string
221
+ mimeType?: string
222
+ }
223
+
224
+ export interface JsFeedInfo { // Element type of the feeds arrays on JsCrawlPageResult and JsScrapeResult; every field is optional
225
+ url?: string
226
+ title?: string
227
+ feedType?: JsFeedType
228
+ }
229
+
230
+ export declare const enum JsFeedType { // Type of JsFeedInfo.feedType; string-valued ambient const enum with snake_case values
231
+ Rss = 'rss',
232
+ Atom = 'atom',
233
+ JsonFeed = 'json_feed'
234
+ }
235
+
236
+ export interface JsHeadingInfo { // Element type of JsPageMetadata.headings; every field is optional
237
+ level?: number // presumably the heading level of an h1–h6 element — confirm
238
+ text?: string
239
+ }
240
+
241
+ export interface JsHreflangEntry { // Element type of JsPageMetadata.hreflangs; every field is optional
242
+ lang?: string
243
+ url?: string
244
+ }
245
+
246
+ export interface JsImageInfo { // Element type of the images arrays on JsCrawlPageResult and JsScrapeResult; every field is optional
247
+ url?: string
248
+ alt?: string
249
+ width?: number
250
+ height?: number
251
+ source?: JsImageSource
252
+ }
253
+
254
+ export declare const enum JsImageSource { // Type of JsImageInfo.source; string-valued ambient const enum with snake_case values
255
+ Img = 'img',
256
+ PictureSource = 'picture_source',
257
+ OgImage = 'og_image',
258
+ TwitterImage = 'twitter_image'
259
+ }
260
+
261
+ export interface JsInteractionResult { // Not referenced by any other declaration visible in this file; every field is optional
262
+ actionResults?: Array<JsActionResult>
263
+ finalHtml?: string
264
+ finalUrl?: string
265
+ screenshot?: Array<number> // a plain number array; presumably raw image bytes in an unspecified format — confirm
266
+ }
267
+
268
+ export interface JsJsonLdEntry { // Element type of the jsonLd arrays on JsCrawlPageResult and JsScrapeResult; every field is optional
269
+ schemaType?: string
270
+ name?: string
271
+ raw?: string // typed as a string; presumably the unparsed JSON-LD text — confirm
272
+ }
273
+
274
+ export interface JsLinkInfo { // Element type of the links arrays on JsCrawlPageResult and JsScrapeResult; every field is optional
275
+ url?: string
276
+ text?: string
277
+ linkType?: JsLinkType
278
+ rel?: string
279
+ nofollow?: boolean
280
+ }
281
+
282
+ export declare const enum JsLinkType { // Type of JsLinkInfo.linkType; string-valued ambient const enum
283
+ Internal = 'internal',
284
+ External = 'external',
285
+ Anchor = 'anchor',
286
+ Document = 'document'
287
+ }
288
+
289
+ export interface JsMapResult { // Resolved value of mapUrls()
290
+ urls?: Array<JsSitemapUrl>
291
+ }
292
+
293
+ export interface JsMarkdownResult { // Type of the markdown field on JsCrawlPageResult and JsScrapeResult; every field is optional
294
+ content?: string
295
+ documentStructure?: string // typed as a string; the serialization format is not visible here — TODO confirm
296
+ tables?: Array<string>
297
+ warnings?: Array<string>
298
+ citations?: JsCitationResult
299
+ fitContent?: string
300
+ }
301
+
302
+ export interface JsPageMetadata { // Type of the metadata field on JsCrawlPageResult and JsScrapeResult; every field is optional
303
+ title?: string
304
+ description?: string
305
+ canonicalUrl?: string
306
+ keywords?: string // a single string, not an array
307
+ author?: string
308
+ viewport?: string
309
+ themeColor?: string
310
+ generator?: string
311
+ robots?: string
312
+ htmlLang?: string
313
+ htmlDir?: string
314
+ ogTitle?: string // og* fields: presumably Open Graph meta tags — confirm
315
+ ogType?: string
316
+ ogImage?: string
317
+ ogDescription?: string
318
+ ogUrl?: string
319
+ ogSiteName?: string
320
+ ogLocale?: string
321
+ ogVideo?: string
322
+ ogAudio?: string
323
+ ogLocaleAlternates?: Array<string>
324
+ twitterCard?: string // twitter* fields: presumably Twitter Card meta tags — confirm
325
+ twitterTitle?: string
326
+ twitterDescription?: string
327
+ twitterImage?: string
328
+ twitterSite?: string
329
+ twitterCreator?: string
330
+ dcTitle?: string // dc* fields: presumably Dublin Core meta tags — confirm
331
+ dcCreator?: string
332
+ dcSubject?: string
333
+ dcDescription?: string
334
+ dcPublisher?: string
335
+ dcDate?: string
336
+ dcType?: string
337
+ dcFormat?: string
338
+ dcIdentifier?: string
339
+ dcLanguage?: string
340
+ dcRights?: string
341
+ article?: JsArticleMetadata
342
+ hreflangs?: Array<JsHreflangEntry>
343
+ favicons?: Array<JsFaviconInfo>
344
+ headings?: Array<JsHeadingInfo>
345
+ wordCount?: number
346
+ }
347
+
348
+ export interface JsProxyConfig { // Type of JsCrawlConfig.proxy; every field, including url, is optional
349
+ url?: string
350
+ username?: string
351
+ password?: string
352
+ }
353
+
354
+ export interface JsResponseMeta { // Type of JsScrapeResult.responseMeta; every field is optional. Field names mirror HTTP response header names — presumably copied from the response; confirm.
355
+ etag?: string
356
+ lastModified?: string
357
+ cacheControl?: string
358
+ server?: string
359
+ xPoweredBy?: string
360
+ contentLanguage?: string
361
+ contentEncoding?: string
362
+ }
363
+
364
+ export interface JsScrapeResult { // Resolved value of scrape() and type of JsBatchScrapeResult.result; every field is optional
365
+ statusCode?: number
366
+ contentType?: string
367
+ html?: string
368
+ bodySize?: number // presumably bytes — TODO confirm
369
+ metadata?: JsPageMetadata
370
+ links?: Array<JsLinkInfo>
371
+ images?: Array<JsImageInfo>
372
+ feeds?: Array<JsFeedInfo>
373
+ jsonLd?: Array<JsJsonLdEntry>
374
+ isAllowed?: boolean // presumably the robots.txt verdict for this URL — confirm
375
+ crawlDelay?: number // unit not visible in this declaration — TODO confirm
376
+ noindexDetected?: boolean
377
+ nofollowDetected?: boolean
378
+ xRobotsTag?: string
379
+ isPdf?: boolean
380
+ wasSkipped?: boolean
381
+ detectedCharset?: string
382
+ mainContentOnly?: boolean
383
+ authHeaderSent?: boolean
384
+ responseMeta?: JsResponseMeta
385
+ assets?: Array<JsDownloadedAsset>
386
+ jsRenderHint?: boolean
387
+ browserUsed?: boolean
388
+ markdown?: JsMarkdownResult
389
+ extractedData?: string // typed as a string; presumably serialized JSON — confirm
390
+ extractionMeta?: JsExtractionMeta
391
+ screenshot?: Array<number> // a plain number array; presumably raw image bytes in an unspecified format — confirm
392
+ downloadedDocument?: JsDownloadedDocument
393
+ }
394
+
395
+ export interface JsSitemapUrl { // Element type of JsMapResult.urls; every field is optional
396
+ url?: string
397
+ lastmod?: string
398
+ changefreq?: string
399
+ priority?: string // typed as a string, not a number
400
+ }
401
+
402
+ export declare function mapUrls(engine: JsCrawlEngineHandle, url: string): Promise<JsMapResult> // Async; takes a single URL and resolves to a JsMapResult (a list of JsSitemapUrl)
403
+
404
+ export declare function scrape(engine: JsCrawlEngineHandle, url: string): Promise<JsScrapeResult> // Async; takes a single URL and resolves to a JsScrapeResult