@snap-agent/rag-web 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
690
690
  * Crawl a single page and extract content
691
691
  */
692
692
  private crawlPage;
693
- /**
694
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
695
- * would otherwise hit an empty wrapper.
696
- */
697
- private static readonly DEFAULT_CONTENT_SELECTOR;
698
- private stripNoiseFromDom;
699
- /** Longest cleaned text among selector matches and full body (after noise strip). */
700
- private extractBestContentText;
701
693
  private bodyTextLengthHint;
702
694
  private extractDocumentFromHtml;
703
- /**
704
- * Fallback image extraction: finds the first meaningful image in the content area.
705
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
706
- */
707
- private extractHeroImage;
708
695
  private looksLikeDynamicShell;
709
696
  private diagFromRenderedAttempt;
710
697
  private crawlPageSmart;
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
717
704
  /**
718
705
  * Clean extracted text content
719
706
  */
720
- private cleanContent;
721
- /**
722
- * Convert URL to a stable document ID
723
- */
724
707
  private urlToId;
725
708
  /**
726
709
  * Delay helper
@@ -783,4 +766,70 @@ declare class WebRAGPlugin implements RAGPlugin {
783
766
  getConfig(): Record<string, any>;
784
767
  }
785
768
 
786
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
769
+ interface HtmlPageExtractOptions {
770
+ titleSelector?: string;
771
+ contentSelector?: string;
772
+ removeSelectors?: string[];
773
+ defaultType?: string;
774
+ typeFromUrl?: Record<string, string>;
775
+ minExtractedContentLength?: number;
776
+ metadata?: Record<string, unknown>;
777
+ }
778
+ interface HtmlPageExtractResult {
779
+ id: string;
780
+ metadata: Record<string, unknown>;
781
+ content: string;
782
+ /** True when content meets minExtractedContentLength (default 50). */
783
+ indexable: boolean;
784
+ contentPreview: string;
785
+ }
786
+ declare function urlToDocumentId(url: string): string;
787
+ declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
788
+ /**
789
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
790
+ * Unlike ingest, always returns metadata even when content is too short to index.
791
+ */
792
+ declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
793
+
794
+ interface ProductMetadata {
795
+ price?: number;
796
+ currency?: string;
797
+ availability?: string;
798
+ }
799
+ /**
800
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
801
+ * Per-field priority: JSON-LD → Open Graph → microdata.
802
+ */
803
+ declare function extractProductMetadata(html: string): ProductMetadata;
804
+ declare function parsePrice(value: unknown): number | undefined;
805
+ declare function normalizeCurrency(value: unknown): string | undefined;
806
+ declare function normalizeAvailability(value: unknown): string | undefined;
807
+
808
+ /** Abstract page roles — vertical-agnostic. */
809
+ type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
810
+ interface PageCardMetadataInput {
811
+ url: string;
812
+ title?: string;
813
+ /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
814
+ headingTitle?: string;
815
+ description?: string;
816
+ imageUrl?: string;
817
+ html?: string;
818
+ /** Type already resolved from typeFromUrl / defaultType. */
819
+ type?: string;
820
+ hasProductPrice?: boolean;
821
+ }
822
+ interface PageCardMetadataResult {
823
+ type: PageCardType | string;
824
+ cardEligible: boolean;
825
+ cardPriority: number;
826
+ displayTitle?: string;
827
+ displayDescription?: string;
828
+ displayImageUrl?: string;
829
+ }
830
+ declare function normalizeDisplayTitle(title?: string): string | undefined;
831
+ declare function hardExcludePage(url: string, title?: string): boolean;
832
+ declare function inferTypeFromUrl(url: string): PageCardType | undefined;
833
+ declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
834
+
835
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
package/dist/index.d.ts CHANGED
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
690
690
  * Crawl a single page and extract content
691
691
  */
692
692
  private crawlPage;
693
- /**
694
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
695
- * would otherwise hit an empty wrapper.
696
- */
697
- private static readonly DEFAULT_CONTENT_SELECTOR;
698
- private stripNoiseFromDom;
699
- /** Longest cleaned text among selector matches and full body (after noise strip). */
700
- private extractBestContentText;
701
693
  private bodyTextLengthHint;
702
694
  private extractDocumentFromHtml;
703
- /**
704
- * Fallback image extraction: finds the first meaningful image in the content area.
705
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
706
- */
707
- private extractHeroImage;
708
695
  private looksLikeDynamicShell;
709
696
  private diagFromRenderedAttempt;
710
697
  private crawlPageSmart;
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
717
704
  /**
718
705
  * Clean extracted text content
719
706
  */
720
- private cleanContent;
721
- /**
722
- * Convert URL to a stable document ID
723
- */
724
707
  private urlToId;
725
708
  /**
726
709
  * Delay helper
@@ -783,4 +766,70 @@ declare class WebRAGPlugin implements RAGPlugin {
783
766
  getConfig(): Record<string, any>;
784
767
  }
785
768
 
786
- export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig };
769
+ interface HtmlPageExtractOptions {
770
+ titleSelector?: string;
771
+ contentSelector?: string;
772
+ removeSelectors?: string[];
773
+ defaultType?: string;
774
+ typeFromUrl?: Record<string, string>;
775
+ minExtractedContentLength?: number;
776
+ metadata?: Record<string, unknown>;
777
+ }
778
+ interface HtmlPageExtractResult {
779
+ id: string;
780
+ metadata: Record<string, unknown>;
781
+ content: string;
782
+ /** True when content meets minExtractedContentLength (default 50). */
783
+ indexable: boolean;
784
+ contentPreview: string;
785
+ }
786
+ declare function urlToDocumentId(url: string): string;
787
+ declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
788
+ /**
789
+ * Extract full page metadata + main content the same way web-rag does on HTML ingest.
790
+ * Unlike ingest, always returns metadata even when content is too short to index.
791
+ */
792
+ declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
793
+
794
+ interface ProductMetadata {
795
+ price?: number;
796
+ currency?: string;
797
+ availability?: string;
798
+ }
799
+ /**
800
+ * Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
801
+ * Per-field priority: JSON-LD → Open Graph → microdata.
802
+ */
803
+ declare function extractProductMetadata(html: string): ProductMetadata;
804
+ declare function parsePrice(value: unknown): number | undefined;
805
+ declare function normalizeCurrency(value: unknown): string | undefined;
806
+ declare function normalizeAvailability(value: unknown): string | undefined;
807
+
808
+ /** Abstract page roles — vertical-agnostic. */
809
+ type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
810
+ interface PageCardMetadataInput {
811
+ url: string;
812
+ title?: string;
813
+ /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
814
+ headingTitle?: string;
815
+ description?: string;
816
+ imageUrl?: string;
817
+ html?: string;
818
+ /** Type already resolved from typeFromUrl / defaultType. */
819
+ type?: string;
820
+ hasProductPrice?: boolean;
821
+ }
822
+ interface PageCardMetadataResult {
823
+ type: PageCardType | string;
824
+ cardEligible: boolean;
825
+ cardPriority: number;
826
+ displayTitle?: string;
827
+ displayDescription?: string;
828
+ displayImageUrl?: string;
829
+ }
830
+ declare function normalizeDisplayTitle(title?: string): string | undefined;
831
+ declare function hardExcludePage(url: string, title?: string): boolean;
832
+ declare function inferTypeFromUrl(url: string): PageCardType | undefined;
833
+ declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
834
+
835
+ export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };