@snap-agent/rag-web 0.1.5 → 0.1.7
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/dist/index.d.mts +67 -18
- package/dist/index.d.ts +67 -18
- package/dist/index.js +590 -134
- package/dist/index.mjs +578 -133
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
690
690
|
* Crawl a single page and extract content
|
|
691
691
|
*/
|
|
692
692
|
private crawlPage;
|
|
693
|
-
/**
|
|
694
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
695
|
-
* would otherwise hit an empty wrapper.
|
|
696
|
-
*/
|
|
697
|
-
private static readonly DEFAULT_CONTENT_SELECTOR;
|
|
698
|
-
private stripNoiseFromDom;
|
|
699
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
700
|
-
private extractBestContentText;
|
|
701
693
|
private bodyTextLengthHint;
|
|
702
694
|
private extractDocumentFromHtml;
|
|
703
|
-
/**
|
|
704
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
705
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
706
|
-
*/
|
|
707
|
-
private extractHeroImage;
|
|
708
695
|
private looksLikeDynamicShell;
|
|
709
696
|
private diagFromRenderedAttempt;
|
|
710
697
|
private crawlPageSmart;
|
|
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
717
704
|
/**
|
|
718
705
|
* Clean extracted text content
|
|
719
706
|
*/
|
|
720
|
-
private cleanContent;
|
|
721
|
-
/**
|
|
722
|
-
* Convert URL to a stable document ID
|
|
723
|
-
*/
|
|
724
707
|
private urlToId;
|
|
725
708
|
/**
|
|
726
709
|
* Delay helper
|
|
@@ -783,4 +766,70 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
783
766
|
getConfig(): Record<string, any>;
|
|
784
767
|
}
|
|
785
768
|
|
|
786
|
-
|
|
769
|
+
interface HtmlPageExtractOptions {
|
|
770
|
+
titleSelector?: string;
|
|
771
|
+
contentSelector?: string;
|
|
772
|
+
removeSelectors?: string[];
|
|
773
|
+
defaultType?: string;
|
|
774
|
+
typeFromUrl?: Record<string, string>;
|
|
775
|
+
minExtractedContentLength?: number;
|
|
776
|
+
metadata?: Record<string, unknown>;
|
|
777
|
+
}
|
|
778
|
+
interface HtmlPageExtractResult {
|
|
779
|
+
id: string;
|
|
780
|
+
metadata: Record<string, unknown>;
|
|
781
|
+
content: string;
|
|
782
|
+
/** True when content meets minExtractedContentLength (default 50). */
|
|
783
|
+
indexable: boolean;
|
|
784
|
+
contentPreview: string;
|
|
785
|
+
}
|
|
786
|
+
declare function urlToDocumentId(url: string): string;
|
|
787
|
+
declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
|
|
788
|
+
/**
|
|
789
|
+
* Extract full page metadata + main content the same way web-rag does on HTML ingest.
|
|
790
|
+
* Unlike ingest, always returns metadata even when content is too short to index.
|
|
791
|
+
*/
|
|
792
|
+
declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
|
|
793
|
+
|
|
794
|
+
interface ProductMetadata {
|
|
795
|
+
price?: number;
|
|
796
|
+
currency?: string;
|
|
797
|
+
availability?: string;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
|
|
801
|
+
* Per-field priority: JSON-LD → Open Graph → microdata.
|
|
802
|
+
*/
|
|
803
|
+
declare function extractProductMetadata(html: string): ProductMetadata;
|
|
804
|
+
declare function parsePrice(value: unknown): number | undefined;
|
|
805
|
+
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
|
+
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
|
+
|
|
808
|
+
/** Abstract page roles — vertical-agnostic. */
|
|
809
|
+
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
|
|
810
|
+
interface PageCardMetadataInput {
|
|
811
|
+
url: string;
|
|
812
|
+
title?: string;
|
|
813
|
+
/** Primary heading (h1) — preferred for displayTitle over the document title tag. */
|
|
814
|
+
headingTitle?: string;
|
|
815
|
+
description?: string;
|
|
816
|
+
imageUrl?: string;
|
|
817
|
+
html?: string;
|
|
818
|
+
/** Type already resolved from typeFromUrl / defaultType. */
|
|
819
|
+
type?: string;
|
|
820
|
+
hasProductPrice?: boolean;
|
|
821
|
+
}
|
|
822
|
+
interface PageCardMetadataResult {
|
|
823
|
+
type: PageCardType | string;
|
|
824
|
+
cardEligible: boolean;
|
|
825
|
+
cardPriority: number;
|
|
826
|
+
displayTitle?: string;
|
|
827
|
+
displayDescription?: string;
|
|
828
|
+
displayImageUrl?: string;
|
|
829
|
+
}
|
|
830
|
+
declare function normalizeDisplayTitle(title?: string): string | undefined;
|
|
831
|
+
declare function hardExcludePage(url: string, title?: string): boolean;
|
|
832
|
+
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
|
|
833
|
+
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
|
|
834
|
+
|
|
835
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
|
package/dist/index.d.ts
CHANGED
|
@@ -690,21 +690,8 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
690
690
|
* Crawl a single page and extract content
|
|
691
691
|
*/
|
|
692
692
|
private crawlPage;
|
|
693
|
-
/**
|
|
694
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
695
|
-
* would otherwise hit an empty wrapper.
|
|
696
|
-
*/
|
|
697
|
-
private static readonly DEFAULT_CONTENT_SELECTOR;
|
|
698
|
-
private stripNoiseFromDom;
|
|
699
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
700
|
-
private extractBestContentText;
|
|
701
693
|
private bodyTextLengthHint;
|
|
702
694
|
private extractDocumentFromHtml;
|
|
703
|
-
/**
|
|
704
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
705
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
706
|
-
*/
|
|
707
|
-
private extractHeroImage;
|
|
708
695
|
private looksLikeDynamicShell;
|
|
709
696
|
private diagFromRenderedAttempt;
|
|
710
697
|
private crawlPageSmart;
|
|
@@ -717,10 +704,6 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
717
704
|
/**
|
|
718
705
|
* Clean extracted text content
|
|
719
706
|
*/
|
|
720
|
-
private cleanContent;
|
|
721
|
-
/**
|
|
722
|
-
* Convert URL to a stable document ID
|
|
723
|
-
*/
|
|
724
707
|
private urlToId;
|
|
725
708
|
/**
|
|
726
709
|
* Delay helper
|
|
@@ -783,4 +766,70 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
783
766
|
getConfig(): Record<string, any>;
|
|
784
767
|
}
|
|
785
768
|
|
|
786
|
-
|
|
769
|
+
interface HtmlPageExtractOptions {
|
|
770
|
+
titleSelector?: string;
|
|
771
|
+
contentSelector?: string;
|
|
772
|
+
removeSelectors?: string[];
|
|
773
|
+
defaultType?: string;
|
|
774
|
+
typeFromUrl?: Record<string, string>;
|
|
775
|
+
minExtractedContentLength?: number;
|
|
776
|
+
metadata?: Record<string, unknown>;
|
|
777
|
+
}
|
|
778
|
+
interface HtmlPageExtractResult {
|
|
779
|
+
id: string;
|
|
780
|
+
metadata: Record<string, unknown>;
|
|
781
|
+
content: string;
|
|
782
|
+
/** True when content meets minExtractedContentLength (default 50). */
|
|
783
|
+
indexable: boolean;
|
|
784
|
+
contentPreview: string;
|
|
785
|
+
}
|
|
786
|
+
declare function urlToDocumentId(url: string): string;
|
|
787
|
+
declare function bodyTextLengthHint(html: string, options?: HtmlPageExtractOptions): number;
|
|
788
|
+
/**
|
|
789
|
+
* Extract full page metadata + main content the same way web-rag does on HTML ingest.
|
|
790
|
+
* Unlike ingest, always returns metadata even when content is too short to index.
|
|
791
|
+
*/
|
|
792
|
+
declare function extractPageFromHtml(url: string, html: string, options?: HtmlPageExtractOptions): HtmlPageExtractResult;
|
|
793
|
+
|
|
794
|
+
interface ProductMetadata {
|
|
795
|
+
price?: number;
|
|
796
|
+
currency?: string;
|
|
797
|
+
availability?: string;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Extract structured product fields from HTML (JSON-LD, Open Graph, microdata).
|
|
801
|
+
* Per-field priority: JSON-LD → Open Graph → microdata.
|
|
802
|
+
*/
|
|
803
|
+
declare function extractProductMetadata(html: string): ProductMetadata;
|
|
804
|
+
declare function parsePrice(value: unknown): number | undefined;
|
|
805
|
+
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
|
+
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
|
+
|
|
808
|
+
/** Abstract page roles — vertical-agnostic. */
|
|
809
|
+
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
|
|
810
|
+
interface PageCardMetadataInput {
|
|
811
|
+
url: string;
|
|
812
|
+
title?: string;
|
|
813
|
+
/** Primary heading (h1) — preferred for displayTitle over the document title tag. */
|
|
814
|
+
headingTitle?: string;
|
|
815
|
+
description?: string;
|
|
816
|
+
imageUrl?: string;
|
|
817
|
+
html?: string;
|
|
818
|
+
/** Type already resolved from typeFromUrl / defaultType. */
|
|
819
|
+
type?: string;
|
|
820
|
+
hasProductPrice?: boolean;
|
|
821
|
+
}
|
|
822
|
+
interface PageCardMetadataResult {
|
|
823
|
+
type: PageCardType | string;
|
|
824
|
+
cardEligible: boolean;
|
|
825
|
+
cardPriority: number;
|
|
826
|
+
displayTitle?: string;
|
|
827
|
+
displayDescription?: string;
|
|
828
|
+
displayImageUrl?: string;
|
|
829
|
+
}
|
|
830
|
+
declare function normalizeDisplayTitle(title?: string): string | undefined;
|
|
831
|
+
declare function hardExcludePage(url: string, title?: string): boolean;
|
|
832
|
+
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
|
|
833
|
+
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
|
|
834
|
+
|
|
835
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
|