@snap-agent/rag-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,727 @@
1
+ import { RAGPlugin, RAGContext, RAGDocument, IngestOptions, IngestResult, BulkOperation, BulkResult } from '@snap-agent/core';
2
+
3
+ /**
4
+ * Content document with minimal required fields and flexible metadata
5
+ *
6
+ * Only three things are required:
7
+ * - id: Unique identifier
8
+ * - content: Text to embed and search
9
+ * - metadata.type: Content classification (e.g., 'blog', 'page', 'project', 'team')
10
+ *
11
+ * Everything else in metadata is pass-through - store any fields you need.
12
+ */
13
+ interface WebDocument {
14
+ id: string; // Required: unique identifier.
15
+ content: string; // Required: text to embed and search.
16
+ metadata: {
17
+ type: string; // Required: content classification, e.g. 'blog', 'page', 'project', 'team'.
18
+ title?: string;
19
+ url?: string;
20
+ [key: string]: any; // Pass-through: any additional metadata fields may be stored here.
21
+ };
22
+ }
23
+ /**
24
+ * Stored document with embedding and system fields
25
+ */
26
+ interface StoredWebDocument extends WebDocument {
27
+ tenantId: string;
28
+ agentId?: string;
29
+ embedding: number[];
30
+ createdAt: Date;
31
+ updatedAt?: Date;
32
+ }
33
+ /**
34
+ * Plugin configuration
35
+ */
36
+ interface WebRAGConfig {
37
+ mongoUri: string;
38
+ dbName: string;
39
+ collection?: string;
40
+ openaiApiKey: string;
41
+ embeddingModel?: string;
42
+ tenantId: string;
43
+ vectorIndexName?: string;
44
+ numCandidates?: number;
45
+ limit?: number;
46
+ minScore?: number;
47
+ filterableFields?: string[];
48
+ typeBoosts?: Record<string, number>;
49
+ recencyBoost?: {
50
+ enabled: boolean;
51
+ field: string;
52
+ decayDays: number;
53
+ maxBoost?: number;
54
+ };
55
+ cache?: {
56
+ embeddings?: {
57
+ enabled: boolean;
58
+ ttl?: number;
59
+ maxSize?: number;
60
+ };
61
+ };
62
+ priority?: number;
63
+ /**
64
+ * Persistent crawl ledger (MongoDB) — skip re-crawl within TTL, audit per URL.
65
+ * Per-request overrides: pass `crawlLedger` on SitemapConfig / WebsiteCrawlConfig / etc.
66
+ */
67
+ crawlLedger?: CrawlLedgerPluginConfig;
68
+ }
69
+ /**
70
+ * Global defaults for crawl ledger (MongoDB collection separate from vector content).
71
+ */
72
+ interface CrawlLedgerPluginConfig {
73
+ /** Default off so existing installs behave the same */
74
+ enabled?: boolean;
75
+ /** Collection name (default: web_crawl_ledger) */
76
+ collection?: string;
77
+ /** Skip re-crawl if last status was indexed and younger than this (default: 7 days) */
78
+ ttlMsIndexed?: number;
79
+ /** Skip re-crawl if last status was a failure and younger than this (default: 1 hour) */
80
+ ttlMsFailure?: number;
81
+ /**
82
+ * Skip re-crawl if last status was `error` (e.g. Playwright timeout) and younger than this.
83
+ * Shorter than ttlMsFailure so transient render errors retry sooner (default: 5 minutes).
84
+ */
85
+ ttlMsRenderError?: number;
86
+ }
87
+ /**
88
+ * Per-ingest crawl ledger options (merged over plugin.crawlLedger).
89
+ * To bypass skip-TTL for one run: pass `forceRecrawl: true` on IngestOptions (SDK).
90
+ */
91
+ interface CrawlLedgerOptions {
92
+ enabled?: boolean; // Per-ingest value merged over CrawlLedgerPluginConfig.enabled.
93
+ ttlMsIndexed?: number; // Per-ingest override of CrawlLedgerPluginConfig.ttlMsIndexed (skip window after an indexed crawl).
94
+ ttlMsFailure?: number; // Per-ingest override of CrawlLedgerPluginConfig.ttlMsFailure (skip window after a failure).
95
+ ttlMsRenderError?: number; // Per-ingest override of CrawlLedgerPluginConfig.ttlMsRenderError (skip window after an `error` status).
96
+ /** Max rows in result.metadata.pageStatuses (default: 500) */
97
+ maxPageStatuses?: number;
98
+ }
99
+ type CrawlLedgerStatus = 'indexed' | 'skipped_ledger' | 'too_small' | 'non_html' | 'blocked_suspected' | 'error';
100
+ interface CrawlPageStatusEntry {
101
+ url: string;
102
+ urlNormalized?: string;
103
+ status: CrawlLedgerStatus;
104
+ modeUsed?: string;
105
+ contentLength?: number;
106
+ /** Approximate raw body text length, measured before a content selector is applied (debug aid) */
107
+ bodyTextLengthHint?: number;
108
+ title?: string;
109
+ docId?: string;
110
+ httpStatus?: number;
111
+ error?: string;
112
+ skippedReason?: string;
113
+ }
114
+ interface CrawlLedgerDocument {
115
+ tenantId: string;
116
+ agentId: string;
117
+ urlNormalized: string;
118
+ url: string;
119
+ domain: string;
120
+ lastStatus: CrawlLedgerStatus;
121
+ lastCrawledAt: Date;
122
+ modeUsed?: string;
123
+ contentLength?: number;
124
+ title?: string;
125
+ docId?: string;
126
+ httpStatus?: number;
127
+ errorMessage?: string;
128
+ updatedAt: Date;
129
+ }
130
+ /**
131
+ * URL source for ingesting content from external APIs
132
+ */
133
+ interface URLSource {
134
+ url: string;
135
+ type: 'json' | 'csv' | 'xml' | 'api';
136
+ auth?: URLSourceAuth;
137
+ transform?: DataTransform;
138
+ headers?: Record<string, string>;
139
+ timeout?: number;
140
+ metadata?: Record<string, any>;
141
+ }
142
+ interface URLSourceAuth {
143
+ type: 'bearer' | 'basic' | 'api-key' | 'custom'; // Selects the auth scheme; which fields below apply to each scheme is not visible in this declaration.
144
+ token?: string; // NOTE(review): presumably the bearer token for type 'bearer' — confirm against buildAuthHeaders.
145
+ username?: string; // NOTE(review): presumably used with type 'basic' — confirm.
146
+ password?: string; // NOTE(review): presumably used with type 'basic' — confirm.
147
+ header?: string; // NOTE(review): presumably the header name for type 'api-key' — confirm.
148
+ key?: string; // NOTE(review): presumably the API key value for type 'api-key' — confirm.
149
+ headers?: Record<string, string>; // NOTE(review): presumably raw headers for type 'custom' — confirm.
150
+ }
151
+ interface DataTransform {
152
+ documentPath?: string;
153
+ fieldMapping?: {
154
+ id?: string;
155
+ content?: string;
156
+ type?: string | (() => string);
157
+ [key: string]: string | (() => string) | undefined;
158
+ };
159
+ }
160
+ /**
161
+ * Drupal JSON:API specific configuration
162
+ */
163
+ interface DrupalConfig {
164
+ baseUrl: string;
165
+ contentTypes: string[];
166
+ auth?: URLSourceAuth;
167
+ mappings?: Record<string, {
168
+ content: string;
169
+ fields?: Record<string, string>;
170
+ }>;
171
+ }
172
+ /**
173
+ * WordPress REST API specific configuration
174
+ */
175
+ interface WordPressConfig {
176
+ baseUrl: string;
177
+ postTypes?: string[];
178
+ auth?: URLSourceAuth;
179
+ perPage?: number;
180
+ maxPages?: number;
181
+ mappings?: Record<string, {
182
+ content?: string;
183
+ fields?: Record<string, string>;
184
+ }>;
185
+ }
186
+ /**
187
+ * Sanity.io specific configuration
188
+ */
189
+ interface SanityConfig {
190
+ projectId: string;
191
+ dataset: string;
192
+ apiVersion?: string;
193
+ token?: string;
194
+ useCdn?: boolean;
195
+ queries: Record<string, {
196
+ query: string;
197
+ content: string;
198
+ fields?: Record<string, string>;
199
+ }>;
200
+ }
201
+ /**
202
+ * Strapi specific configuration
203
+ */
204
+ interface StrapiConfig {
205
+ baseUrl: string;
206
+ apiToken?: string;
207
+ contentTypes: string[];
208
+ pageSize?: number;
209
+ maxPages?: number;
210
+ mappings?: Record<string, {
211
+ content?: string;
212
+ fields?: Record<string, string>;
213
+ useAttributes?: boolean;
214
+ }>;
215
+ }
216
+ /**
217
+ * Sitemap crawling configuration
218
+ * For non-technical clients - just provide the sitemap URL
219
+ */
220
+ interface SitemapConfig {
221
+ sitemapUrl?: string;
222
+ baseUrl?: string;
223
+ maxPages?: number;
224
+ concurrency?: number;
225
+ delayMs?: number;
226
+ timeout?: number;
227
+ contentSelector?: string;
228
+ titleSelector?: string;
229
+ removeSelectors?: string[];
230
+ /** Minimum cleaned text length to accept a page (default: 50) */
231
+ minExtractedContentLength?: number;
232
+ includePatterns?: string[];
233
+ excludePatterns?: string[];
234
+ /** Strip query string for crawl ledger key (default: false) */
235
+ stripQueryParams?: boolean;
236
+ typeFromUrl?: Record<string, string>;
237
+ defaultType?: string;
238
+ metadata?: Record<string, any>;
239
+ /**
240
+ * Rendering mode for JS-heavy sites
241
+ * - false: only static HTML fetch
242
+ * - true: always render with a headless browser
243
+ * - "auto": try static first, render as fallback when content is too small / looks dynamic
244
+ */
245
+ render?: boolean | 'auto';
246
+ /**
247
+ * Render options (used when render is true/auto)
248
+ */
249
+ renderOptions?: RenderOptions;
250
+ /**
251
+ * Debug/observability options
252
+ */
253
+ debug?: DebugOptions;
254
+ crawlLedger?: CrawlLedgerOptions;
255
+ }
256
+ /**
257
+ * Direct URL list crawling configuration
258
+ */
259
+ interface UrlListConfig {
260
+ contentSelector?: string; // Selector for the page's main content (e.g. '.page-content' in the ingestFromUrls example).
261
+ titleSelector?: string;
262
+ removeSelectors?: string[];
263
+ concurrency?: number;
264
+ delayMs?: number;
265
+ timeout?: number;
266
+ type?: string; // Content type assigned to ingested pages (e.g. 'page' in the ingestFromUrls example).
267
+ typeFromUrl?: Record<string, string>; // NOTE(review): presumably URL fragment -> type, as in the ingestFromSitemap example — confirm.
268
+ metadata?: Record<string, any>;
269
+ render?: boolean | 'auto'; // Same type as SitemapConfig.render; presumably the same false / true / "auto" modes — TODO confirm.
270
+ renderOptions?: RenderOptions;
271
+ debug?: DebugOptions;
272
+ stripQueryParams?: boolean; // NOTE(review): SitemapConfig documents this as stripping the query string for the crawl ledger key; default here is not documented.
273
+ crawlLedger?: CrawlLedgerOptions; // Per-ingest crawl ledger options (merged over the plugin-level crawlLedger config).
274
+ }
275
+ /**
276
+ * Single page ingestion (no discovery)
277
+ */
278
+ interface SinglePageConfig {
279
+ url: string; // The single page to ingest (no discovery is performed).
280
+ contentSelector?: string;
281
+ titleSelector?: string;
282
+ removeSelectors?: string[];
283
+ timeout?: number;
284
+ type?: string;
285
+ typeFromUrl?: Record<string, string>; // NOTE(review): presumably URL fragment -> type, as in the ingestFromSitemap example — confirm.
286
+ metadata?: Record<string, any>;
287
+ render?: boolean | 'auto'; // Same type as SitemapConfig.render; presumably the same false / true / "auto" modes — TODO confirm.
288
+ renderOptions?: RenderOptions;
289
+ debug?: DebugOptions;
290
+ /** Strip the query string when normalizing the URL into the crawl ledger key (default: true). NOTE(review): SitemapConfig documents default false for the same option — confirm which is intended. */
291
+ stripQueryParams?: boolean;
292
+ crawlLedger?: CrawlLedgerOptions; // Per-ingest crawl ledger options (merged over the plugin-level crawlLedger config).
293
+ }
294
+ /**
295
+ * Website crawling configuration (no sitemap)
296
+ * Discovers internal links starting from a base URL and then crawls them.
297
+ */
298
+ interface WebsiteCrawlConfig {
299
+ baseUrl: string; // Starting URL from which internal links are discovered.
300
+ maxPages?: number;
301
+ maxDepth?: number; // NOTE(review): presumably the maximum link depth from baseUrl during discovery — confirm.
302
+ concurrency?: number;
303
+ delayMs?: number;
304
+ timeout?: number;
305
+ includePatterns?: string[];
306
+ excludePatterns?: string[];
307
+ stripQueryParams?: boolean; // NOTE(review): SitemapConfig documents this as stripping the query string for the crawl ledger key; default here is not documented.
308
+ contentSelector?: string;
309
+ titleSelector?: string;
310
+ removeSelectors?: string[];
311
+ typeFromUrl?: Record<string, string>; // NOTE(review): presumably URL fragment -> type, as in the ingestFromSitemap example — confirm.
312
+ defaultType?: string;
313
+ metadata?: Record<string, any>;
314
+ render?: boolean | 'auto'; // Same type as SitemapConfig.render; presumably the same false / true / "auto" modes — TODO confirm.
315
+ renderOptions?: RenderOptions;
316
+ debug?: DebugOptions;
317
+ crawlLedger?: CrawlLedgerOptions; // Per-ingest crawl ledger options (merged over the plugin-level crawlLedger config).
318
+ }
319
+ interface RenderOptions {
320
+ /**
321
+ * Minimum extracted content length to accept from static crawl before falling back to render.
322
+ * Used only when render === "auto".
323
+ */
324
+ minContentLength?: number;
325
+ /**
326
+ * Navigation wait strategy for the headless browser.
327
+ */
328
+ waitUntil?: 'domcontentloaded' | 'load' | 'networkidle';
329
+ /**
330
+ * Optional selector that indicates the page's main content is ready.
331
+ */
332
+ waitForSelector?: string;
333
+ /**
334
+ * Scroll settings for infinite scroll pages.
335
+ */
336
+ scroll?: {
337
+ enabled?: boolean;
338
+ maxScrolls?: number;
339
+ scrollDelayMs?: number;
340
+ stableIterations?: number;
341
+ };
342
+ /**
343
+ * Wait after navigation (and optional waitForSelector) before reading HTML.
344
+ * Helps WordPress/Elementor and other late-hydrated layouts.
345
+ */
346
+ postRenderDelayMs?: number;
347
+ }
348
+ interface DebugOptions {
349
+ enabled?: boolean;
350
+ level?: 'summary' | 'verbose';
351
+ saveDir?: string;
352
+ maxPerUrlLogs?: number;
353
+ }
354
+ /**
355
+ * RSS/Atom feed configuration
356
+ */
357
+ interface RSSConfig {
358
+ feedUrl: string; // URL of the RSS or Atom feed to ingest.
359
+ useFullContent?: boolean; // NOTE(review): distinct from fetchFullContent; exact meaning is not visible in this declaration — confirm.
360
+ fetchFullContent?: boolean; // Fetch the full page content for each feed item (per the ingestFromRSS example).
361
+ contentSelector?: string; // Used together with fetchFullContent in the ingestFromRSS example (e.g. 'article').
362
+ type?: string;
363
+ metadata?: Record<string, any>;
364
+ }
365
+ /**
366
+ * Crawl result for sitemap/URL crawling
367
+ */
368
+ interface CrawlResult extends WebIngestResult {
369
+ urlsCrawled: number;
370
+ urlsSkipped: number;
371
+ urlsFailed: number;
372
+ crawledAt: Date;
373
+ }
374
+ /**
375
+ * Ingest result
376
+ */
377
+ interface WebIngestResult {
378
+ success: boolean;
379
+ indexed: number;
380
+ failed: number;
381
+ errors?: Array<{
382
+ id: string;
383
+ error: string;
384
+ }>;
385
+ metadata?: Record<string, any>;
386
+ }
387
+ /**
388
+ * URL ingest result
389
+ */
390
+ interface WebURLIngestResult extends WebIngestResult {
391
+ sourceUrl: string;
392
+ fetchedAt: Date;
393
+ documentsFetched: number;
394
+ }
395
+
396
+ /**
397
+ * Web RAG Plugin
398
+ *
399
+ * Schema-agnostic RAG plugin for web content.
400
+ * Works with Drupal, WordPress, Contentful, or any content source.
401
+ *
402
+ * Key features:
403
+ * - Flexible metadata: Only id, content, and type are required
404
+ * - Pass-through fields: Store any metadata, get it back in results
405
+ * - URL ingestion: Fetch from JSON, CSV, XML APIs
406
+ * - Drupal helpers: JSON:API parsing and field mapping
407
+ * - Type/recency boosts: Prioritize certain content types or fresh content
408
+ */
409
+
410
+ declare class WebRAGPlugin implements RAGPlugin {
411
+ name: string;
412
+ type: "rag";
413
+ priority: number;
414
+ private config;
415
+ private client;
416
+ private db;
417
+ private openai;
418
+ private embeddingCache;
419
+ private cacheStats;
420
+ constructor(config: WebRAGConfig);
421
+ private getCollection;
422
+ private getLedgerCollection;
423
+ /**
424
+ * List recent crawl ledger rows (e.g. for dashboards or paginated views in the front end).
425
+ */
426
+ listCrawlLedger(options?: {
427
+ agentId?: string;
428
+ domain?: string;
429
+ status?: CrawlLedgerStatus;
430
+ limit?: number;
431
+ skip?: number;
432
+ }): Promise<CrawlLedgerDocument[]>;
433
+ private resolveCrawlLedgerOptions;
434
+ private normalizeLedgerUrl;
435
+ private shouldSkipLedger;
436
+ private findLedgerEntry;
437
+ private toLedgerStatus;
438
+ private upsertLedgerRecord;
439
+ private pushPageStatus;
440
+ disconnect(): Promise<void>;
441
+ /**
442
+ * Retrieve contextual content for a message
443
+ */
444
+ retrieveContext(message: string, options?: {
445
+ agentId?: string;
446
+ threadId?: string;
447
+ filters?: Record<string, any>;
448
+ metadata?: Record<string, any>;
449
+ }): Promise<RAGContext>;
450
+ /**
451
+ * Format retrieved content for LLM context
452
+ */
453
+ private formatResultsToContext;
454
+ private formatFieldName;
455
+ private formatFieldValue;
456
+ private vectorSearch;
457
+ private generateEmbedding;
458
+ private generateEmbeddingsBatch;
459
+ /**
460
+ * Ingest documents into the Web RAG system
461
+ */
462
+ ingest(documents: RAGDocument[], options?: IngestOptions): Promise<IngestResult>;
463
+ /**
464
+ * Update a single document
465
+ */
466
+ update(id: string, document: Partial<RAGDocument>, options?: IngestOptions): Promise<void>;
467
+ /**
468
+ * Delete document(s) by ID
469
+ */
470
+ delete(ids: string | string[], options?: IngestOptions): Promise<number>;
471
+ /**
472
+ * Bulk operations
473
+ */
474
+ bulk(operations: BulkOperation[], options?: IngestOptions): Promise<BulkResult>;
475
+ /**
476
+ * Ingest content from a URL (JSON, CSV, XML, or API)
477
+ */
478
+ ingestFromUrl(source: URLSource, options?: IngestOptions): Promise<WebURLIngestResult>;
479
+ private buildAuthHeaders;
480
+ private transformJsonToDocuments;
481
+ private transformCsvToDocuments;
482
+ private parseCsvLine;
483
+ private transformXmlToDocuments;
484
+ private extractByPath;
485
+ private extractField;
486
+ /**
487
+ * Ingest content from a Drupal site using JSON:API
488
+ */
489
+ ingestFromDrupal(config: DrupalConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
490
+ /**
491
+ * Parse Drupal JSON:API node type (e.g., 'node--project' → 'project')
492
+ */
493
+ static parseDrupalType(type: string): string;
494
+ /**
495
+ * Ingest content from a WordPress site using REST API
496
+ *
497
+ * @example
498
+ * ```typescript
499
+ * await plugin.ingestFromWordPress({
500
+ * baseUrl: 'https://myblog.com',
501
+ * postTypes: ['posts', 'pages'],
502
+ * perPage: 100,
503
+ * });
504
+ * ```
505
+ */
506
+ ingestFromWordPress(config: WordPressConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
507
+ /**
508
+ * Normalize WordPress post type to a cleaner name
509
+ */
510
+ private normalizeWordPressType;
511
+ /**
512
+ * Ingest content from a Sanity.io project using GROQ queries
513
+ *
514
+ * @example
515
+ * ```typescript
516
+ * await plugin.ingestFromSanity({
517
+ * projectId: 'abc123',
518
+ * dataset: 'production',
519
+ * queries: {
520
+ * post: {
521
+ * query: '*[_type == "post" && !(_id in path("drafts.**"))]',
522
+ * content: 'body',
523
+ * fields: {
524
+ * author: 'author->name',
525
+ * categories: 'categories[]->title',
526
+ * },
527
+ * },
528
+ * },
529
+ * });
530
+ * ```
531
+ */
532
+ ingestFromSanity(config: SanityConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
533
+ /**
534
+ * Convert Sanity Portable Text blocks to plain text
535
+ * Useful for extracting content from rich text fields
536
+ */
537
+ static sanityBlocksToText(blocks: any[]): string;
538
+ /**
539
+ * Ingest content from a Strapi CMS (v4 by default)
540
+ *
541
+ * @example
542
+ * ```typescript
543
+ * await plugin.ingestFromStrapi({
544
+ * baseUrl: 'https://my-strapi.com',
545
+ * apiToken: process.env.STRAPI_TOKEN,
546
+ * contentTypes: ['articles', 'pages'],
547
+ * mappings: {
548
+ * articles: {
549
+ * content: 'attributes.content',
550
+ * fields: {
551
+ * author: 'attributes.author.data.attributes.name',
552
+ * category: 'attributes.category.data.attributes.name',
553
+ * },
554
+ * },
555
+ * },
556
+ * });
557
+ * ```
558
+ */
559
+ ingestFromStrapi(config: StrapiConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
560
+ /**
561
+ * Normalize Strapi collection type to singular form
562
+ */
563
+ private normalizeStrapiType;
564
+ /**
565
+ * Ingest content by crawling a website's sitemap
566
+ * Perfect for non-technical clients - just provide the sitemap URL
567
+ *
568
+ * @example
569
+ * ```typescript
570
+ * // Simple usage - just provide the sitemap
571
+ * await plugin.ingestFromSitemap({
572
+ * sitemapUrl: 'https://my-site/sitemap.xml',
573
+ * });
574
+ *
575
+ * // Or auto-discover sitemap from base URL
576
+ * await plugin.ingestFromSitemap({
577
+ * baseUrl: 'https://my-site',
578
+ * });
579
+ *
580
+ * // With content selectors and type inference
581
+ * await plugin.ingestFromSitemap({
582
+ * sitemapUrl: 'https://my-site/sitemap.xml',
583
+ * contentSelector: 'article, .main-content',
584
+ * excludePatterns: ['/cart', '/checkout', '/admin'],
585
+ * typeFromUrl: {
586
+ * '/projects/': 'project',
587
+ * '/perspectives/': 'blog',
588
+ * '/people/': 'team',
589
+ * },
590
+ * });
591
+ * ```
592
+ */
593
+ ingestFromSitemap(config: SitemapConfig, options?: IngestOptions): Promise<CrawlResult>;
594
+ /**
595
+ * Ingest content from a website that has no sitemap (or sitemap is incomplete).
596
+ * Discovers internal links from `baseUrl` (BFS) and then crawls the discovered URLs.
597
+ *
598
+ * This uses the same extraction pipeline as `ingestFromSitemap()` (via `crawlPage()`).
599
+ */
600
+ ingestFromWebsite(config: WebsiteCrawlConfig, options?: IngestOptions): Promise<CrawlResult>;
601
+ /**
602
+ * Parse sitemap XML and extract URLs
603
+ */
604
+ private parseSitemap;
605
+ /**
606
+ * Extract URLs from sitemap XML
607
+ */
608
+ private extractUrlsFromXml;
609
+ private discoverInternalUrls;
610
+ private normalizeWebsiteUrl;
611
+ private fetchHtml;
612
+ private extractInternalLinks;
613
+ /**
614
+ * Ingest content from a list of URLs
615
+ *
616
+ * @example
617
+ * ```typescript
618
+ * await plugin.ingestFromUrls([
619
+ * 'https://example.com/about',
620
+ * 'https://example.com/services',
621
+ * 'https://example.com/contact',
622
+ * ], {
623
+ * contentSelector: '.page-content',
624
+ * type: 'page',
625
+ * });
626
+ * ```
627
+ */
628
+ ingestFromUrls(urls: string[], config?: UrlListConfig, options?: IngestOptions): Promise<CrawlResult>;
629
+ /**
630
+ * Ingest a single page from a URL (no sitemap discovery, no link lookup).
631
+ * Uses the same crawl pipeline (static/render/auto) as other web ingestion methods.
632
+ */
633
+ ingestSinglePageFromUrl(config: SinglePageConfig, options?: IngestOptions): Promise<CrawlResult>;
634
+ /**
635
+ * Crawl a list of URLs and ingest their content
636
+ */
637
+ private crawlUrls;
638
+ /**
639
+ * Crawl a single page and extract content
640
+ */
641
+ private crawlPage;
642
+ /**
643
+ * Default chain works for many WordPress / Elementor / block themes where `.first()`
644
+ * would otherwise hit an empty wrapper.
645
+ */
646
+ private static readonly DEFAULT_CONTENT_SELECTOR;
647
+ private stripNoiseFromDom;
648
+ /** Longest cleaned text among selector matches and full body (after noise strip). */
649
+ private extractBestContentText;
650
+ private bodyTextLengthHint;
651
+ private extractDocumentFromHtml;
652
+ private looksLikeDynamicShell;
653
+ private diagFromRenderedAttempt;
654
+ private crawlPageSmart;
655
+ private crawlPageRendered;
656
+ private discoverSitemaps;
657
+ private createDebugCollector;
658
+ /**
659
+ * Clean extracted text content
660
+ */
661
+ private cleanContent;
662
+ /**
663
+ * Convert URL to a stable document ID
664
+ */
665
+ private urlToId;
666
+ /**
667
+ * Delay helper
668
+ */
669
+ private delay;
670
+ /**
671
+ * Ingest content from an RSS or Atom feed
672
+ *
673
+ * @example
674
+ * ```typescript
675
+ * // Simple RSS ingestion
676
+ * await plugin.ingestFromRSS({
677
+ * feedUrl: 'https://myblog.com/feed/',
678
+ * });
679
+ *
680
+ * // Fetch full page content for each item
681
+ * await plugin.ingestFromRSS({
682
+ * feedUrl: 'https://myblog.com/feed/',
683
+ * fetchFullContent: true,
684
+ * contentSelector: 'article',
685
+ * });
686
+ * ```
687
+ */
688
+ ingestFromRSS(config: RSSConfig, options?: IngestOptions): Promise<CrawlResult>;
689
+ /**
690
+ * Parse RSS/Atom feed XML
691
+ */
692
+ private parseRSSFeed;
693
+ /**
694
+ * Extract a single value from XML
695
+ */
696
+ private extractXmlValue;
697
+ /**
698
+ * Extract multiple values from XML
699
+ */
700
+ private extractXmlValues;
701
+ /**
702
+ * Extract link from Atom entry
703
+ */
704
+ private extractAtomLink;
705
+ /**
706
+ * Strip HTML tags from content
707
+ */
708
+ private stripHtml;
709
+ /**
710
+ * Get cache statistics
711
+ */
712
+ getCacheStats(): {
713
+ hits: number;
714
+ misses: number;
715
+ hitRate: string;
716
+ };
717
+ /**
718
+ * Clear embedding cache
719
+ */
720
+ clearCache(): void;
721
+ /**
722
+ * Get plugin configuration (for persistence)
723
+ */
724
+ getConfig(): Record<string, any>;
725
+ }
726
+
727
+ export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };