npm - @snap-agent/rag-web - Version diff - 0.1.5 → 0.1.7 - Mend

@snap-agent/rag-web 0.1.5 → 0.1.7

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.

Files changed (5)

package/dist/index.d.mts CHANGED Viewed

@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
      * Crawl a single page and extract content
      */
     private crawlPage;
-    /**
-     * Default chain works for many WordPress / Elementor / block themes where `.first()`
-     * would otherwise hit an empty wrapper.
-     */
-    private static readonly DEFAULT_CONTENT_SELECTOR;
-    private stripNoiseFromDom;
-    /** Longest cleaned text among selector matches and full body (after noise strip). */
-    private extractBestContentText;
     private bodyTextLengthHint;
     private extractDocumentFromHtml;
-    /**
-     * Fallback image extraction: finds the first meaningful image in the content area.
-     * Skips icons, avatars, and tiny assets by filtering on common patterns.
-     */
-    private extractHeroImage;
     private looksLikeDynamicShell;
     private diagFromRenderedAttempt;
     private crawlPageSmart;
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
     /**
      * Clean extracted text content
      */
-    private cleanContent;
-    /**
-     * Convert URL to a stable document ID
-     */
     private urlToId;
     /**
      * Delay helper
@@ -783,4 +766,70 @@ declare class WebRAGPlugin implements RAGPlugin {
     getConfig(): Record<string, any>;
 }
-export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
+interface HtmlPageExtractOptions {
+    titleSelector?: string;
+    contentSelector?: string;
+    removeSelectors?: string[];
+    defaultType?: string;
+    typeFromUrl?: Record<string, string>;
+    minExtractedContentLength?: number;
+    metadata?: Record<string, unknown>;
+}
+interface HtmlPageExtractResult {
+    id: string;
+    metadata: Record<string, unknown>;
+    content: string;
+    /** True when content meets minExtractedContentLength (default 50). */
+    indexable: boolean;
+    contentPreview: string;
+}
+declare function urlToDocumentId(url: string): string;
+declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
+/**
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
+ * Unlike ingest, always returns metadata even when content is too short to index.
+ */
+declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
+interface ProductMetadata {
+    price?: number;
+    currency?: string;
+    availability?: string;
+}
+/**
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
+ * Per-field priority: JSON-LD → Open Graph → microdata.
+ */
+declare function extractProductMetadata(html: string): ProductMetadata;
+declare function parsePrice(value: unknown): number | undefined;
+declare function normalizeCurrency(value: unknown): string | undefined;
+declare function normalizeAvailability(value: unknown): string | undefined;
+/** Abstract page roles — vertical-agnostic. */
+type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
+interface PageCardMetadataInput {
+    url: string;
+    title?: string;
+    /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
+    headingTitle?: string;
+    description?: string;
+    imageUrl?: string;
+    html?: string;
+    /** Type already resolved from typeFromUrl / defaultType. */
+    type?: string;
+    hasProductPrice?: boolean;
+}
+interface PageCardMetadataResult {
+    type: PageCardType | string;
+    cardEligible: boolean;
+    cardPriority: number;
+    displayTitle?: string;
+    displayDescription?: string;
+    displayImageUrl?: string;
+}
+declare function normalizeDisplayTitle(title?: string): string | undefined;
+declare function hardExcludePage(url: string, title?: string): boolean;
+declare function inferTypeFromUrl(url: string): PageCardType | undefined;
+declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
+export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };

package/dist/index.d.ts CHANGED Viewed

@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
      * Crawl a single page and extract content
      */
     private crawlPage;
-    /**
-     * Default chain works for many WordPress / Elementor / block themes where `.first()`
-     * would otherwise hit an empty wrapper.
-     */
-    private static readonly DEFAULT_CONTENT_SELECTOR;
-    private stripNoiseFromDom;
-    /** Longest cleaned text among selector matches and full body (after noise strip). */
-    private extractBestContentText;
     private bodyTextLengthHint;
     private extractDocumentFromHtml;
-    /**
-     * Fallback image extraction: finds the first meaningful image in the content area.
-     * Skips icons, avatars, and tiny assets by filtering on common patterns.
-     */
-    private extractHeroImage;
     private looksLikeDynamicShell;
     private diagFromRenderedAttempt;
     private crawlPageSmart;
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
     /**
      * Clean extracted text content
      */
-    private cleanContent;
-    /**
-     * Convert URL to a stable document ID
-     */
     private urlToId;
     /**
      * Delay helper
@@ -783,4 +766,70 @@ declare class WebRAGPlugin implements RAGPlugin {
     getConfig(): Record<string, any>;
 }
-export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
+interface HtmlPageExtractOptions {
+    titleSelector?: string;
+    contentSelector?: string;
+    removeSelectors?: string[];
+    defaultType?: string;
+    typeFromUrl?: Record<string, string>;
+    minExtractedContentLength?: number;
+    metadata?: Record<string, unknown>;
+}
+interface HtmlPageExtractResult {
+    id: string;
+    metadata: Record<string, unknown>;
+    content: string;
+    /** True when content meets minExtractedContentLength (default 50). */
+    indexable: boolean;
+    contentPreview: string;
+}
+declare function urlToDocumentId(url: string): string;
+declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
+/**
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
+ * Unlike ingest, always returns metadata even when content is too short to index.
+ */
+declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
+interface ProductMetadata {
+    price?: number;
+    currency?: string;
+    availability?: string;
+}
+/**
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
+ * Per-field priority: JSON-LD → Open Graph → microdata.
+ */
+declare function extractProductMetadata(html: string): ProductMetadata;
+declare function parsePrice(value: unknown): number | undefined;
+declare function normalizeCurrency(value: unknown): string | undefined;
+declare function normalizeAvailability(value: unknown): string | undefined;
+/** Abstract page roles — vertical-agnostic. */
+type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
+interface PageCardMetadataInput {
+    url: string;
+    title?: string;
+    /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
+    headingTitle?: string;
+    description?: string;
+    imageUrl?: string;
+    html?: string;
+    /** Type already resolved from typeFromUrl / defaultType. */
+    type?: string;
+    hasProductPrice?: boolean;
+}
+interface PageCardMetadataResult {
+    type: PageCardType | string;
+    cardEligible: boolean;
+    cardPriority: number;
+    displayTitle?: string;
+    displayDescription?: string;
+    displayImageUrl?: string;
+}
+declare function normalizeDisplayTitle(title?: string): string | undefined;
+declare function hardExcludePage(url: string, title?: string): boolean;
+declare function inferTypeFromUrl(url: string): PageCardType | undefined;
+declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
+export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };