@snap-agent/rag-web 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +83 -18
- package/dist/index.d.ts +83 -18
- package/dist/index.js +527 -144
- package/dist/index.mjs +519 -143
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
|
|
|
118
118
|
interface CrawlLedgerDocument {
|
|
119
119
|
tenantId: string;
|
|
120
120
|
agentId: string;
|
|
121
|
+
/** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
|
|
122
|
+
ingestionId?: string;
|
|
121
123
|
urlNormalized: string;
|
|
122
124
|
url: string;
|
|
123
125
|
domain: string;
|
|
@@ -369,10 +371,46 @@ interface RSSConfig {
|
|
|
369
371
|
/**
|
|
370
372
|
* Crawl result for sitemap/URL crawling
|
|
371
373
|
*/
|
|
374
|
+
type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
|
|
375
|
+
/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
|
|
376
|
+
interface CrawlProgressUpdate {
|
|
377
|
+
phase: CrawlProgressPhase;
|
|
378
|
+
/** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
|
|
379
|
+
urlsDiscovered?: number;
|
|
380
|
+
/** URLs that will be crawled in this run (≤ maxPages). */
|
|
381
|
+
urlsScheduled?: number;
|
|
382
|
+
/** During crawl: batches done. During indexing: documents fully embedded so far. */
|
|
383
|
+
pagesProcessed?: number;
|
|
384
|
+
/** During indexing: total text chunks to embed (drives web_content writes). */
|
|
385
|
+
chunksTotal?: number;
|
|
386
|
+
/** During indexing: chunks embedded so far. */
|
|
387
|
+
chunksProcessed?: number;
|
|
388
|
+
}
|
|
389
|
+
type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
|
|
390
|
+
type BulkProgressPhase = 'processing' | 'indexing';
|
|
391
|
+
/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
|
|
392
|
+
interface BulkProgressUpdate {
|
|
393
|
+
phase: BulkProgressPhase;
|
|
394
|
+
opsTotal: number;
|
|
395
|
+
opsDone: number;
|
|
396
|
+
currentOpType?: 'insert' | 'update' | 'delete';
|
|
397
|
+
currentUrl?: string;
|
|
398
|
+
}
|
|
399
|
+
type BulkProgressCallback = (update: BulkProgressUpdate) => void;
|
|
400
|
+
/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
|
|
401
|
+
type CrawlPageEvent = {
|
|
402
|
+
url: string;
|
|
403
|
+
event: 'start' | 'done';
|
|
404
|
+
status?: string;
|
|
405
|
+
error?: string;
|
|
406
|
+
};
|
|
407
|
+
type CrawlPageCallback = (event: CrawlPageEvent) => void;
|
|
372
408
|
interface CrawlResult extends WebIngestResult {
|
|
373
409
|
urlsCrawled: number;
|
|
374
410
|
urlsSkipped: number;
|
|
375
411
|
urlsFailed: number;
|
|
412
|
+
/** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
|
|
413
|
+
urlsScheduled?: number;
|
|
376
414
|
crawledAt: Date;
|
|
377
415
|
}
|
|
378
416
|
/**
|
|
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
423
461
|
private cacheStats;
|
|
424
462
|
constructor(config: WebRAGConfig);
|
|
425
463
|
private getCollection;
|
|
464
|
+
private ledgerIndexesEnsured;
|
|
426
465
|
private getLedgerCollection;
|
|
427
466
|
/**
|
|
428
467
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
430
469
|
listCrawlLedger(options?: {
|
|
431
470
|
agentId?: string;
|
|
432
471
|
domain?: string;
|
|
472
|
+
ingestionId?: string;
|
|
433
473
|
status?: CrawlLedgerStatus;
|
|
434
474
|
limit?: number;
|
|
435
475
|
skip?: number;
|
|
@@ -650,34 +690,20 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
650
690
|
* Crawl a single page and extract content
|
|
651
691
|
*/
|
|
652
692
|
private crawlPage;
|
|
653
|
-
/**
|
|
654
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
655
|
-
* would otherwise hit an empty wrapper.
|
|
656
|
-
*/
|
|
657
|
-
private static readonly DEFAULT_CONTENT_SELECTOR;
|
|
658
|
-
private stripNoiseFromDom;
|
|
659
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
660
|
-
private extractBestContentText;
|
|
661
693
|
private bodyTextLengthHint;
|
|
662
694
|
private extractDocumentFromHtml;
|
|
663
|
-
/**
|
|
664
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
665
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
666
|
-
*/
|
|
667
|
-
private extractHeroImage;
|
|
668
695
|
private looksLikeDynamicShell;
|
|
669
696
|
private diagFromRenderedAttempt;
|
|
670
697
|
private crawlPageSmart;
|
|
671
698
|
private crawlPageRendered;
|
|
672
699
|
private discoverSitemaps;
|
|
700
|
+
private emitBulkProgress;
|
|
701
|
+
private emitCrawlProgress;
|
|
702
|
+
private emitCrawlPage;
|
|
673
703
|
private createDebugCollector;
|
|
674
704
|
/**
|
|
675
705
|
* Clean extracted text content
|
|
676
706
|
*/
|
|
677
|
-
private cleanContent;
|
|
678
|
-
/**
|
|
679
|
-
* Convert URL to a stable document ID
|
|
680
|
-
*/
|
|
681
707
|
private urlToId;
|
|
682
708
|
/**
|
|
683
709
|
* Delay helper
|
|
@@ -740,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
740
766
|
getConfig(): Record<string, any>;
|
|
741
767
|
}
|
|
742
768
|
|
|
743
|
-
|
|
769
|
+
interface HtmlPageExtractOptions {
|
|
770
|
+
titleSelector?: string;
|
|
771
|
+
contentSelector?: string;
|
|
772
|
+
removeSelectors?: string[];
|
|
773
|
+
defaultType?: string;
|
|
774
|
+
typeFromUrl?: Record<string, string>;
|
|
775
|
+
minExtractedContentLength?: number;
|
|
776
|
+
metadata?: Record<string, unknown>;
|
|
777
|
+
}
|
|
778
|
+
interface HtmlPageExtractResult {
|
|
779
|
+
id: string;
|
|
780
|
+
metadata: Record<string, unknown>;
|
|
781
|
+
content: string;
|
|
782
|
+
/** True when content meets minExtractedContentLength (default 50). */
|
|
783
|
+
indexable: boolean;
|
|
784
|
+
contentPreview: string;
|
|
785
|
+
}
|
|
786
|
+
declare function urlToDocumentId(url: string): string;
|
|
787
|
+
declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
|
|
788
|
+
/**
|
|
789
|
+
* Extract full page metadata + main content the same way web-rag does on HTML ingest.
|
|
790
|
+
* Unlike ingest, always returns metadata even when content is too short to index.
|
|
791
|
+
*/
|
|
792
|
+
declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
|
|
793
|
+
|
|
794
|
+
interface ProductMetadata {
|
|
795
|
+
price?: number;
|
|
796
|
+
currency?: string;
|
|
797
|
+
availability?: string;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
|
|
801
|
+
* Per-field priority: JSON-LD → Open Graph → microdata.
|
|
802
|
+
*/
|
|
803
|
+
declare function extractProductMetadata(html: string): ProductMetadata;
|
|
804
|
+
declare function parsePrice(value: unknown): number | undefined;
|
|
805
|
+
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
|
+
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
|
+
|
|
808
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
|
package/dist/index.d.ts
CHANGED
|
@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
|
|
|
118
118
|
interface CrawlLedgerDocument {
|
|
119
119
|
tenantId: string;
|
|
120
120
|
agentId: string;
|
|
121
|
+
/** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
|
|
122
|
+
ingestionId?: string;
|
|
121
123
|
urlNormalized: string;
|
|
122
124
|
url: string;
|
|
123
125
|
domain: string;
|
|
@@ -369,10 +371,46 @@ interface RSSConfig {
|
|
|
369
371
|
/**
|
|
370
372
|
* Crawl result for sitemap/URL crawling
|
|
371
373
|
*/
|
|
374
|
+
type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
|
|
375
|
+
/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
|
|
376
|
+
interface CrawlProgressUpdate {
|
|
377
|
+
phase: CrawlProgressPhase;
|
|
378
|
+
/** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
|
|
379
|
+
urlsDiscovered?: number;
|
|
380
|
+
/** URLs that will be crawled in this run (≤ maxPages). */
|
|
381
|
+
urlsScheduled?: number;
|
|
382
|
+
/** During crawl: batches done. During indexing: documents fully embedded so far. */
|
|
383
|
+
pagesProcessed?: number;
|
|
384
|
+
/** During indexing: total text chunks to embed (drives web_content writes). */
|
|
385
|
+
chunksTotal?: number;
|
|
386
|
+
/** During indexing: chunks embedded so far. */
|
|
387
|
+
chunksProcessed?: number;
|
|
388
|
+
}
|
|
389
|
+
type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
|
|
390
|
+
type BulkProgressPhase = 'processing' | 'indexing';
|
|
391
|
+
/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
|
|
392
|
+
interface BulkProgressUpdate {
|
|
393
|
+
phase: BulkProgressPhase;
|
|
394
|
+
opsTotal: number;
|
|
395
|
+
opsDone: number;
|
|
396
|
+
currentOpType?: 'insert' | 'update' | 'delete';
|
|
397
|
+
currentUrl?: string;
|
|
398
|
+
}
|
|
399
|
+
type BulkProgressCallback = (update: BulkProgressUpdate) => void;
|
|
400
|
+
/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
|
|
401
|
+
type CrawlPageEvent = {
|
|
402
|
+
url: string;
|
|
403
|
+
event: 'start' | 'done';
|
|
404
|
+
status?: string;
|
|
405
|
+
error?: string;
|
|
406
|
+
};
|
|
407
|
+
type CrawlPageCallback = (event: CrawlPageEvent) => void;
|
|
372
408
|
interface CrawlResult extends WebIngestResult {
|
|
373
409
|
urlsCrawled: number;
|
|
374
410
|
urlsSkipped: number;
|
|
375
411
|
urlsFailed: number;
|
|
412
|
+
/** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
|
|
413
|
+
urlsScheduled?: number;
|
|
376
414
|
crawledAt: Date;
|
|
377
415
|
}
|
|
378
416
|
/**
|
|
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
423
461
|
private cacheStats;
|
|
424
462
|
constructor(config: WebRAGConfig);
|
|
425
463
|
private getCollection;
|
|
464
|
+
private ledgerIndexesEnsured;
|
|
426
465
|
private getLedgerCollection;
|
|
427
466
|
/**
|
|
428
467
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
430
469
|
listCrawlLedger(options?: {
|
|
431
470
|
agentId?: string;
|
|
432
471
|
domain?: string;
|
|
472
|
+
ingestionId?: string;
|
|
433
473
|
status?: CrawlLedgerStatus;
|
|
434
474
|
limit?: number;
|
|
435
475
|
skip?: number;
|
|
@@ -650,34 +690,20 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
650
690
|
* Crawl a single page and extract content
|
|
651
691
|
*/
|
|
652
692
|
private crawlPage;
|
|
653
|
-
/**
|
|
654
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
655
|
-
* would otherwise hit an empty wrapper.
|
|
656
|
-
*/
|
|
657
|
-
private static readonly DEFAULT_CONTENT_SELECTOR;
|
|
658
|
-
private stripNoiseFromDom;
|
|
659
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
660
|
-
private extractBestContentText;
|
|
661
693
|
private bodyTextLengthHint;
|
|
662
694
|
private extractDocumentFromHtml;
|
|
663
|
-
/**
|
|
664
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
665
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
666
|
-
*/
|
|
667
|
-
private extractHeroImage;
|
|
668
695
|
private looksLikeDynamicShell;
|
|
669
696
|
private diagFromRenderedAttempt;
|
|
670
697
|
private crawlPageSmart;
|
|
671
698
|
private crawlPageRendered;
|
|
672
699
|
private discoverSitemaps;
|
|
700
|
+
private emitBulkProgress;
|
|
701
|
+
private emitCrawlProgress;
|
|
702
|
+
private emitCrawlPage;
|
|
673
703
|
private createDebugCollector;
|
|
674
704
|
/**
|
|
675
705
|
* Clean extracted text content
|
|
676
706
|
*/
|
|
677
|
-
private cleanContent;
|
|
678
|
-
/**
|
|
679
|
-
* Convert URL to a stable document ID
|
|
680
|
-
*/
|
|
681
707
|
private urlToId;
|
|
682
708
|
/**
|
|
683
709
|
* Delay helper
|
|
@@ -740,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
740
766
|
getConfig(): Record<string, any>;
|
|
741
767
|
}
|
|
742
768
|
|
|
743
|
-
|
|
769
|
+
interface HtmlPageExtractOptions {
|
|
770
|
+
titleSelector?: string;
|
|
771
|
+
contentSelector?: string;
|
|
772
|
+
removeSelectors?: string[];
|
|
773
|
+
defaultType?: string;
|
|
774
|
+
typeFromUrl?: Record<string, string>;
|
|
775
|
+
minExtractedContentLength?: number;
|
|
776
|
+
metadata?: Record<string, unknown>;
|
|
777
|
+
}
|
|
778
|
+
interface HtmlPageExtractResult {
|
|
779
|
+
id: string;
|
|
780
|
+
metadata: Record<string, unknown>;
|
|
781
|
+
content: string;
|
|
782
|
+
/** True when content meets minExtractedContentLength (default 50). */
|
|
783
|
+
indexable: boolean;
|
|
784
|
+
contentPreview: string;
|
|
785
|
+
}
|
|
786
|
+
declare function urlToDocumentId(url: string): string;
|
|
787
|
+
declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
|
|
788
|
+
/**
|
|
789
|
+
* Extract full page metadata + main content the same way web-rag does on HTML ingest.
|
|
790
|
+
* Unlike ingest, always returns metadata even when content is too short to index.
|
|
791
|
+
*/
|
|
792
|
+
declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
|
|
793
|
+
|
|
794
|
+
interface ProductMetadata {
|
|
795
|
+
price?: number;
|
|
796
|
+
currency?: string;
|
|
797
|
+
availability?: string;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
|
|
801
|
+
* Per-field priority: JSON-LD → Open Graph → microdata.
|
|
802
|
+
*/
|
|
803
|
+
declare function extractProductMetadata(html: string): ProductMetadata;
|
|
804
|
+
declare function parsePrice(value: unknown): number | undefined;
|
|
805
|
+
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
|
+
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
|
+
|
|
808
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };
|