npm - @snap-agent/rag-web - Versions diffs - 0.1.4 → 0.1.6 - Mend

@snap-agent/rag-web 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.d.mts CHANGED

@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
 interface CrawlLedgerDocument {
     tenantId: string;
     agentId: string;
+    /** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
+    ingestionId?: string;
     urlNormalized: string;
     url: string;
     domain: string;
@@ -369,10 +371,46 @@ interface RSSConfig {
 /**
  * Crawl result for sitemap/URL crawling
  */
+type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
+/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
+interface CrawlProgressUpdate {
+    phase: CrawlProgressPhase;
+    /** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
+    urlsDiscovered?: number;
+    /** URLs that will be crawled in this run (≤ maxPages). */
+    urlsScheduled?: number;
+    /** During crawl: batches done. During indexing: documents fully embedded so far. */
+    pagesProcessed?: number;
+    /** During indexing: total text chunks to embed (drives web_content writes). */
+    chunksTotal?: number;
+    /** During indexing: chunks embedded so far. */
+    chunksProcessed?: number;
+}
+type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
+type BulkProgressPhase = 'processing' | 'indexing';
+/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
+interface BulkProgressUpdate {
+    phase: BulkProgressPhase;
+    opsTotal: number;
+    opsDone: number;
+    currentOpType?: 'insert' | 'update' | 'delete';
+    currentUrl?: string;
+}
+type BulkProgressCallback = (update: BulkProgressUpdate) => void;
+/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
+type CrawlPageEvent = {
+    url: string;
+    event: 'start' | 'done';
+    status?: string;
+    error?: string;
+};
+type CrawlPageCallback = (event: CrawlPageEvent) => void;
 interface CrawlResult extends WebIngestResult {
     urlsCrawled: number;
     urlsSkipped: number;
     urlsFailed: number;
+    /** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
+    urlsScheduled?: number;
     crawledAt: Date;
 }
 /**
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
     private cacheStats;
     constructor(config: WebRAGConfig);
     private getCollection;
+    private ledgerIndexesEnsured;
     private getLedgerCollection;
     /**
      * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
     listCrawlLedger(options?: {
         agentId?: string;
         domain?: string;
+        ingestionId?: string;
         status?: CrawlLedgerStatus;
         limit?: number;
         skip?: number;
@@ -650,34 +690,20 @@ declare class WebRAGPlugin implements RAGPlugin {
      * Crawl a single page and extract content
      */
     private crawlPage;
-    /**
-     * Default chain works for many WordPress / Elementor / block themes where `.first()`
-     * would otherwise hit an empty wrapper.
-     */
-    private static readonly DEFAULT_CONTENT_SELECTOR;
-    private stripNoiseFromDom;
-    /** Longest cleaned text among selector matches and full body (after noise strip). */
-    private extractBestContentText;
     private bodyTextLengthHint;
     private extractDocumentFromHtml;
-    /**
-     * Fallback image extraction: finds the first meaningful image in the content area.
-     * Skips icons, avatars, and tiny assets by filtering on common patterns.
-     */
-    private extractHeroImage;
     private looksLikeDynamicShell;
     private diagFromRenderedAttempt;
     private crawlPageSmart;
     private crawlPageRendered;
     private discoverSitemaps;
+    private emitBulkProgress;
+    private emitCrawlProgress;
+    private emitCrawlPage;
     private createDebugCollector;
     /**
      * Clean extracted text content
      */
-    private cleanContent;
-    /**
-     * Convert URL to a stable document ID
-     */
     private urlToId;
     /**
      * Delay helper
@@ -740,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
     getConfig(): Record<string, any>;
 }
-export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
+interface HtmlPageExtractOptions {
+    titleSelector?: string;
+    contentSelector?: string;
+    removeSelectors?: string[];
+    defaultType?: string;
+    typeFromUrl?: Record<string, string>;
+    minExtractedContentLength?: number;
+    metadata?: Record<string, unknown>;
+}
+interface HtmlPageExtractResult {
+    id: string;
+    metadata: Record<string, unknown>;
+    content: string;
+    /** True when content meets minExtractedContentLength (default 50). */
+    indexable: boolean;
+    contentPreview: string;
+}
+declare function urlToDocumentId(url: string): string;
+declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
+/**
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
+ * Unlike ingest, always returns metadata even when content is too short to index.
+ */
+declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
+interface ProductMetadata {
+    price?: number;
+    currency?: string;
+    availability?: string;
+}
+/**
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
+ * Per-field priority: JSON-LD → Open Graph → microdata.
+ */
+declare function extractProductMetadata(html: string): ProductMetadata;
+declare function parsePrice(value: unknown): number | undefined;
+declare function normalizeCurrency(value: unknown): string | undefined;
+declare function normalizeAvailability(value: unknown): string | undefined;
+export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };

package/dist/index.d.ts CHANGED

@@ -118,6 +118,8 @@ interface CrawlPageStatusEntry {
 interface CrawlLedgerDocument {
     tenantId: string;
     agentId: string;
+    /** Correlates ledger rows with a single ingest run (from crawl metadata.ingestionId). */
+    ingestionId?: string;
     urlNormalized: string;
     url: string;
     domain: string;
@@ -369,10 +371,46 @@ interface RSSConfig {
 /**
  * Crawl result for sitemap/URL crawling
  */
+type CrawlProgressPhase = 'discovering' | 'crawling' | 'indexing';
+/** Live crawl progress (via `metadata.onCrawlProgress` on WebsiteCrawlConfig). */
+interface CrawlProgressUpdate {
+    phase: CrawlProgressPhase;
+    /** URLs found in sitemap/BFS (may exceed urlsScheduled when capped by maxPages). */
+    urlsDiscovered?: number;
+    /** URLs that will be crawled in this run (≤ maxPages). */
+    urlsScheduled?: number;
+    /** During crawl: batches done. During indexing: documents fully embedded so far. */
+    pagesProcessed?: number;
+    /** During indexing: total text chunks to embed (drives web_content writes). */
+    chunksTotal?: number;
+    /** During indexing: chunks embedded so far. */
+    chunksProcessed?: number;
+}
+type CrawlProgressCallback = (update: CrawlProgressUpdate) => void;
+type BulkProgressPhase = 'processing' | 'indexing';
+/** Live bulk progress (via `metadata.onBulkProgress` on IngestOptions). */
+interface BulkProgressUpdate {
+    phase: BulkProgressPhase;
+    opsTotal: number;
+    opsDone: number;
+    currentOpType?: 'insert' | 'update' | 'delete';
+    currentUrl?: string;
+}
+type BulkProgressCallback = (update: BulkProgressUpdate) => void;
+/** Per-URL crawl lifecycle (via `metadata.onCrawlPage` on WebsiteCrawlConfig). */
+type CrawlPageEvent = {
+    url: string;
+    event: 'start' | 'done';
+    status?: string;
+    error?: string;
+};
+type CrawlPageCallback = (event: CrawlPageEvent) => void;
 interface CrawlResult extends WebIngestResult {
     urlsCrawled: number;
     urlsSkipped: number;
     urlsFailed: number;
+    /** URLs selected for this crawl batch (≤ maxPages); for progress UI. */
+    urlsScheduled?: number;
     crawledAt: Date;
 }
 /**
@@ -423,6 +461,7 @@ declare class WebRAGPlugin implements RAGPlugin {
     private cacheStats;
     constructor(config: WebRAGConfig);
     private getCollection;
+    private ledgerIndexesEnsured;
     private getLedgerCollection;
     /**
      * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -430,6 +469,7 @@ declare class WebRAGPlugin implements RAGPlugin {
     listCrawlLedger(options?: {
         agentId?: string;
         domain?: string;
+        ingestionId?: string;
         status?: CrawlLedgerStatus;
         limit?: number;
         skip?: number;
@@ -650,34 +690,20 @@ declare class WebRAGPlugin implements RAGPlugin {
      * Crawl a single page and extract content
      */
     private crawlPage;
-    /**
-     * Default chain works for many WordPress / Elementor / block themes where `.first()`
-     * would otherwise hit an empty wrapper.
-     */
-    private static readonly DEFAULT_CONTENT_SELECTOR;
-    private stripNoiseFromDom;
-    /** Longest cleaned text among selector matches and full body (after noise strip). */
-    private extractBestContentText;
     private bodyTextLengthHint;
     private extractDocumentFromHtml;
-    /**
-     * Fallback image extraction: finds the first meaningful image in the content area.
-     * Skips icons, avatars, and tiny assets by filtering on common patterns.
-     */
-    private extractHeroImage;
     private looksLikeDynamicShell;
     private diagFromRenderedAttempt;
     private crawlPageSmart;
     private crawlPageRendered;
     private discoverSitemaps;
+    private emitBulkProgress;
+    private emitCrawlProgress;
+    private emitCrawlPage;
     private createDebugCollector;
     /**
      * Clean extracted text content
      */
-    private cleanContent;
-    /**
-     * Convert URL to a stable document ID
-     */
     private urlToId;
     /**
      * Delay helper
@@ -740,4 +766,43 @@ declare class WebRAGPlugin implements RAGPlugin {
     getConfig(): Record<string, any>;
 }
-export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
+interface HtmlPageExtractOptions {
+    titleSelector?: string;
+    contentSelector?: string;
+    removeSelectors?: string[];
+    defaultType?: string;
+    typeFromUrl?: Record<string, string>;
+    minExtractedContentLength?: number;
+    metadata?: Record<string, unknown>;
+}
+interface HtmlPageExtractResult {
+    id: string;
+    metadata: Record<string, unknown>;
+    content: string;
+    /** True when content meets minExtractedContentLength (default 50). */
+    indexable: boolean;
+    contentPreview: string;
+}
+declare function urlToDocumentId(url: string): string;
+declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
+/**
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
+ * Unlike ingest, always returns metadata even when content is too short to index.
+ */
+declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
+interface ProductMetadata {
+    price?: number;
+    currency?: string;
+    availability?: string;
+}
+/**
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
+ * Per-field priority: JSON-LD → Open Graph → microdata.
+ */
+declare function extractProductMetadata(html: string): ProductMetadata;
+declare function parsePrice(value: unknown): number | undefined;
+declare function normalizeCurrency(value: unknown): string | undefined;
+declare function normalizeAvailability(value: unknown): string | undefined;
+export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, normalizeAvailability, normalizeCurrency, parsePrice, urlToDocumentId };