npm - @snap-agent/rag-web - Versions diffs - 0.1.0 - Mend

@snap-agent/rag-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.mts ADDED Viewed

@@ -0,0 +1,727 @@
+import { RAGPlugin, RAGContext, RAGDocument, IngestOptions, IngestResult, BulkOperation, BulkResult } from '@snap-agent/core';
+/**
+ * Content document with minimal required fields and flexible metadata
+ *
+ * Only three things are required:
+ * - id: Unique identifier
+ * - content: Text to embed and search
+ * - metadata.type: Content classification (e.g., 'blog', 'page', 'project', 'team')
+ *
+ * Everything else in metadata is pass-through - store any fields you need.
+ * `title` and `url` are the only optional metadata keys typed explicitly; any
+ * other key is accepted through the `any`-typed index signature.
+ *
+ * StoredWebDocument extends this shape with the embedding and system fields.
+ */
+interface WebDocument {
+    id: string;
+    content: string;
+    metadata: {
+        type: string;
+        title?: string;
+        url?: string;
+        [key: string]: any;
+    };
+}
+/**
+ * Stored document with embedding and system fields
+ *
+ * Adds `tenantId`, `embedding` and `createdAt` (all required) plus the optional
+ * `agentId` and `updatedAt` on top of WebDocument.
+ */
+interface StoredWebDocument extends WebDocument {
+    tenantId: string;
+    agentId?: string;
+    embedding: number[];
+    createdAt: Date;
+    updatedAt?: Date;
+}
+/**
+ * Plugin configuration
+ *
+ * Accepted by the WebRAGPlugin constructor. Only `mongoUri`, `dbName`,
+ * `openaiApiKey` and `tenantId` are required; every other field is optional.
+ * Default values for the optional fields are not stated in this declaration file.
+ */
+interface WebRAGConfig {
+    mongoUri: string;
+    dbName: string;
+    collection?: string;
+    openaiApiKey: string;
+    embeddingModel?: string;
+    tenantId: string;
+    vectorIndexName?: string;
+    numCandidates?: number;
+    limit?: number;
+    minScore?: number;
+    filterableFields?: string[];
+    typeBoosts?: Record<string, number>;
+    recencyBoost?: {
+        enabled: boolean;
+        field: string;
+        decayDays: number;
+        maxBoost?: number;
+    };
+    cache?: {
+        embeddings?: {
+            enabled: boolean;
+            ttl?: number;
+            maxSize?: number;
+        };
+    };
+    priority?: number;
+    /**
+     * Persistent crawl ledger (MongoDB) — skip re-crawl within TTL, audit per URL.
+     * Per-request overrides: pass `crawlLedger` on SitemapConfig / WebsiteCrawlConfig / etc.
+     */
+    crawlLedger?: CrawlLedgerPluginConfig;
+}
+/**
+ * Global defaults for crawl ledger (MongoDB collection separate from vector content).
+ *
+ * Set through WebRAGConfig.crawlLedger. Per-ingest values are supplied with
+ * CrawlLedgerOptions, which is documented as being merged over these defaults.
+ */
+interface CrawlLedgerPluginConfig {
+    /** Default off so existing installs behave the same */
+    enabled?: boolean;
+    /** Collection name (default: web_crawl_ledger) */
+    collection?: string;
+    /** Skip re-crawl if last status was indexed and younger than this (default: 7 days) */
+    ttlMsIndexed?: number;
+    /**
+     * Skip re-crawl if last status was a failure and younger than this (default: 1 hour)
+     * NOTE(review): which CrawlLedgerStatus values count as a "failure" is not
+     * spelled out here; `error` is covered by ttlMsRenderError below — confirm the rest.
+     */
+    ttlMsFailure?: number;
+    /**
+     * Skip re-crawl if last status was `error` (e.g. Playwright timeout) and younger than this.
+     * Shorter than ttlMsFailure so transient render errors retry sooner (default: 5 minutes).
+     */
+    ttlMsRenderError?: number;
+}
+/**
+ * Per-ingest crawl ledger options (merged over plugin.crawlLedger).
+ * To bypass skip-TTL for one run: pass `forceRecrawl: true` on IngestOptions (SDK).
+ *
+ * Carries the same `enabled` and TTL fields as CrawlLedgerPluginConfig, has no
+ * `collection` field, and adds `maxPageStatuses`. Accepted as `crawlLedger` on
+ * SitemapConfig, UrlListConfig, SinglePageConfig and WebsiteCrawlConfig.
+ */
+interface CrawlLedgerOptions {
+    enabled?: boolean;
+    ttlMsIndexed?: number;
+    ttlMsFailure?: number;
+    ttlMsRenderError?: number;
+    /** Max rows in result.metadata.pageStatuses (default: 500) */
+    maxPageStatuses?: number;
+}
+/** Crawl status values shared by CrawlPageStatusEntry.status, CrawlLedgerDocument.lastStatus and the listCrawlLedger() filter. */ type CrawlLedgerStatus = 'indexed' | 'skipped_ledger' | 'too_small' | 'non_html' | 'blocked_suspected' | 'error';
+/** Status record for one URL; only `url` and `status` are required. */ interface CrawlPageStatusEntry {
+    url: string;
+    urlNormalized?: string;
+    status: CrawlLedgerStatus;
+    modeUsed?: string;
+    contentLength?: number;
+    /** Raw-ish body text length before selector pick (debug) */
+    bodyTextLengthHint?: number;
+    title?: string;
+    docId?: string;
+    httpStatus?: number;
+    error?: string;
+    skippedReason?: string;
+}
+/** Crawl ledger row, as returned by WebRAGPlugin.listCrawlLedger(). */ interface CrawlLedgerDocument {
+    tenantId: string;
+    agentId: string;
+    urlNormalized: string;
+    url: string;
+    domain: string;
+    lastStatus: CrawlLedgerStatus;
+    lastCrawledAt: Date;
+    modeUsed?: string;
+    contentLength?: number;
+    title?: string;
+    docId?: string;
+    httpStatus?: number;
+    errorMessage?: string;
+    updatedAt: Date;
+}
+/**
+ * URL source for ingesting content from external APIs
+ *
+ * Passed to WebRAGPlugin.ingestFromUrl(). `url` and `type` are required;
+ * auth, transform, headers, timeout and metadata are optional.
+ */
+interface URLSource {
+    url: string;
+    type: 'json' | 'csv' | 'xml' | 'api';
+    auth?: URLSourceAuth;
+    transform?: DataTransform;
+    headers?: Record<string, string>;
+    timeout?: number;
+    metadata?: Record<string, any>;
+}
+/** Auth settings referenced by the `auth` field of URLSource, DrupalConfig and WordPressConfig; only `type` is required. */ interface URLSourceAuth {
+    type: 'bearer' | 'basic' | 'api-key' | 'custom';
+    token?: string;
+    username?: string;
+    password?: string;
+    header?: string;
+    key?: string;
+    headers?: Record<string, string>;
+}
+/** Transform settings referenced by URLSource.transform; mapping values are a string or a zero-argument function returning a string. */ interface DataTransform {
+    documentPath?: string;
+    fieldMapping?: {
+        id?: string;
+        content?: string;
+        type?: string | (() => string);
+        [key: string]: string | (() => string) | undefined;
+    };
+}
+/**
+ * Drupal JSON:API specific configuration
+ *
+ * Passed to WebRAGPlugin.ingestFromDrupal(). `baseUrl` and `contentTypes` are
+ * required. Each entry in the optional `mappings` record requires `content`;
+ * `fields` is optional.
+ */
+interface DrupalConfig {
+    baseUrl: string;
+    contentTypes: string[];
+    auth?: URLSourceAuth;
+    mappings?: Record<string, {
+        content: string;
+        fields?: Record<string, string>;
+    }>;
+}
+/**
+ * WordPress REST API specific configuration
+ *
+ * Passed to WebRAGPlugin.ingestFromWordPress(). Only `baseUrl` is required.
+ * Unlike DrupalConfig, `content` inside a `mappings` entry is optional here.
+ */
+interface WordPressConfig {
+    baseUrl: string;
+    postTypes?: string[];
+    auth?: URLSourceAuth;
+    perPage?: number;
+    maxPages?: number;
+    mappings?: Record<string, {
+        content?: string;
+        fields?: Record<string, string>;
+    }>;
+}
+/**
+ * Sanity.io specific configuration
+ *
+ * Passed to WebRAGPlugin.ingestFromSanity(). `projectId`, `dataset` and
+ * `queries` are required. `queries` is a record keyed by a caller-chosen name
+ * (`post` in that method's example); each entry requires a `query` string and
+ * a `content` string, with `fields` optional.
+ */
+interface SanityConfig {
+    projectId: string;
+    dataset: string;
+    apiVersion?: string;
+    token?: string;
+    useCdn?: boolean;
+    queries: Record<string, {
+        query: string;
+        content: string;
+        fields?: Record<string, string>;
+    }>;
+}
+/**
+ * Strapi specific configuration
+ *
+ * Passed to WebRAGPlugin.ingestFromStrapi(). `baseUrl` and `contentTypes` are
+ * required. Every field of a `mappings` entry (`content`, `fields`,
+ * `useAttributes`) is optional.
+ */
+interface StrapiConfig {
+    baseUrl: string;
+    apiToken?: string;
+    contentTypes: string[];
+    pageSize?: number;
+    maxPages?: number;
+    mappings?: Record<string, {
+        content?: string;
+        fields?: Record<string, string>;
+        useAttributes?: boolean;
+    }>;
+}
+/**
+ * Sitemap crawling configuration
+ * For non-technical clients - just provide the sitemap URL
+ *
+ * Passed to WebRAGPlugin.ingestFromSitemap(). Every field is optional at the
+ * type level, including `sitemapUrl` and `baseUrl`; the examples on that method
+ * pass one or the other (`baseUrl` to auto-discover the sitemap).
+ */
+interface SitemapConfig {
+    sitemapUrl?: string;
+    baseUrl?: string;
+    maxPages?: number;
+    concurrency?: number;
+    delayMs?: number;
+    timeout?: number;
+    contentSelector?: string;
+    titleSelector?: string;
+    removeSelectors?: string[];
+    /** Minimum cleaned text length to accept a page (default: 50) */
+    minExtractedContentLength?: number;
+    includePatterns?: string[];
+    excludePatterns?: string[];
+    /** Strip query string for crawl ledger key (default: false) */
+    stripQueryParams?: boolean;
+    typeFromUrl?: Record<string, string>;
+    defaultType?: string;
+    metadata?: Record<string, any>;
+    /**
+     * Rendering mode for JS-heavy sites
+     * - false: only static HTML fetch
+     * - true: always render with a headless browser
+     * - "auto": try static first, render as fallback when content is too small / looks dynamic
+     */
+    render?: boolean | 'auto';
+    /**
+     * Render options (used when render is true/auto)
+     */
+    renderOptions?: RenderOptions;
+    /**
+     * Debug/observability options
+     */
+    debug?: DebugOptions;
+    crawlLedger?: CrawlLedgerOptions;
+}
+/**
+ * Direct URL list crawling configuration
+ *
+ * Optional second argument of WebRAGPlugin.ingestFromUrls(); the URLs
+ * themselves are passed separately as that method's first argument.
+ */
+interface UrlListConfig {
+    contentSelector?: string;
+    titleSelector?: string;
+    removeSelectors?: string[];
+    concurrency?: number;
+    delayMs?: number;
+    timeout?: number;
+    type?: string;
+    typeFromUrl?: Record<string, string>;
+    metadata?: Record<string, any>;
+    render?: boolean | 'auto';
+    renderOptions?: RenderOptions;
+    debug?: DebugOptions;
+    /** NOTE(review): default not documented here; SitemapConfig states false and SinglePageConfig states true — confirm. */ stripQueryParams?: boolean;
+    crawlLedger?: CrawlLedgerOptions;
+}
+/**
+ * Single page ingestion (no discovery)
+ *
+ * Passed to WebRAGPlugin.ingestSinglePageFromUrl(). Only `url` is required.
+ */
+interface SinglePageConfig {
+    url: string;
+    contentSelector?: string;
+    titleSelector?: string;
+    removeSelectors?: string[];
+    timeout?: number;
+    type?: string;
+    typeFromUrl?: Record<string, string>;
+    metadata?: Record<string, any>;
+    render?: boolean | 'auto';
+    renderOptions?: RenderOptions;
+    debug?: DebugOptions;
+    /**
+     * Ledger key normalization (default: true)
+     * NOTE(review): SitemapConfig documents the same option with default false —
+     * confirm that the two defaults are really meant to differ.
+     */
+    stripQueryParams?: boolean;
+    crawlLedger?: CrawlLedgerOptions;
+}
+/**
+ * Website crawling configuration (no sitemap)
+ * Discovers internal links starting from a base URL and then crawls them.
+ *
+ * Passed to WebRAGPlugin.ingestFromWebsite(). Only `baseUrl` is required.
+ * Compared with SitemapConfig it adds `maxDepth` and has no `sitemapUrl` or
+ * `minExtractedContentLength` field.
+ */
+interface WebsiteCrawlConfig {
+    baseUrl: string;
+    maxPages?: number;
+    maxDepth?: number;
+    concurrency?: number;
+    delayMs?: number;
+    timeout?: number;
+    includePatterns?: string[];
+    excludePatterns?: string[];
+    stripQueryParams?: boolean;
+    contentSelector?: string;
+    titleSelector?: string;
+    removeSelectors?: string[];
+    typeFromUrl?: Record<string, string>;
+    defaultType?: string;
+    metadata?: Record<string, any>;
+    render?: boolean | 'auto';
+    renderOptions?: RenderOptions;
+    debug?: DebugOptions;
+    crawlLedger?: CrawlLedgerOptions;
+}
+/** Headless-browser render settings, supplied as `renderOptions` on the crawl configs; every field is optional. */ interface RenderOptions {
+    /**
+     * Minimum extracted content length to accept from static crawl before falling back to render.
+     * Used only when render === "auto".
+     */
+    minContentLength?: number;
+    /**
+     * Navigation wait strategy for the headless browser.
+     */
+    waitUntil?: 'domcontentloaded' | 'load' | 'networkidle';
+    /**
+     * Optional selector that indicates the page's main content is ready.
+     */
+    waitForSelector?: string;
+    /**
+     * Scroll settings for infinite scroll pages.
+     */
+    scroll?: {
+        enabled?: boolean;
+        maxScrolls?: number;
+        scrollDelayMs?: number;
+        stableIterations?: number;
+    };
+    /**
+     * Wait after navigation (and optional waitForSelector) before reading HTML.
+     * Helps WordPress/Elementor and other late-hydrated layouts.
+     */
+    postRenderDelayMs?: number;
+}
+/** Debug/observability settings, supplied as `debug` on the crawl configs; every field is optional. */ interface DebugOptions {
+    enabled?: boolean;
+    level?: 'summary' | 'verbose';
+    saveDir?: string;
+    maxPerUrlLogs?: number;
+}
+/**
+ * RSS/Atom feed configuration
+ *
+ * Passed to WebRAGPlugin.ingestFromRSS(). Only `feedUrl` is required.
+ * The example on that method uses `fetchFullContent: true` (with an optional
+ * `contentSelector`) to fetch the full page content for each item.
+ * NOTE(review): how `useFullContent` differs from `fetchFullContent` is not
+ * documented in this file — confirm against the implementation.
+ */
+interface RSSConfig {
+    feedUrl: string;
+    useFullContent?: boolean;
+    fetchFullContent?: boolean;
+    contentSelector?: string;
+    type?: string;
+    metadata?: Record<string, any>;
+}
+/**
+ * Crawl result for sitemap/URL crawling
+ *
+ * Extends WebIngestResult with three URL counters and the crawl timestamp.
+ * Returned by ingestFromSitemap(), ingestFromWebsite(), ingestFromUrls(),
+ * ingestSinglePageFromUrl() and ingestFromRSS().
+ */
+interface CrawlResult extends WebIngestResult {
+    urlsCrawled: number;
+    urlsSkipped: number;
+    urlsFailed: number;
+    crawledAt: Date;
+}
+/**
+ * Ingest result
+ *
+ * Base result shape extended by CrawlResult and WebURLIngestResult.
+ * `success`, `indexed` and `failed` are always present; `errors` (one
+ * `{ id, error }` pair per entry) and the free-form `metadata` are optional.
+ */
+interface WebIngestResult {
+    success: boolean;
+    indexed: number;
+    failed: number;
+    errors?: Array<{
+        id: string;
+        error: string;
+    }>;
+    metadata?: Record<string, any>;
+}
+/**
+ * URL ingest result
+ *
+ * Extends WebIngestResult with the source URL, fetch timestamp and fetched
+ * document count. Returned by ingestFromUrl(); ingestFromDrupal(),
+ * ingestFromWordPress(), ingestFromSanity() and ingestFromStrapi() each
+ * return an array of these.
+ */
+interface WebURLIngestResult extends WebIngestResult {
+    sourceUrl: string;
+    fetchedAt: Date;
+    documentsFetched: number;
+}
+/**
+ * Web RAG Plugin
+ *
+ * Schema-agnostic RAG plugin for web content.
+ * Works with Drupal, WordPress, Contentful, or any content source.
+ *
+ * Key features:
+ * - Flexible metadata: Only id, content, and type are required
+ * - Pass-through fields: Store any metadata, get it back in results
+ * - URL ingestion: Fetch from JSON, CSV, XML APIs
+ * - Drupal helpers: JSON:API parsing and field mapping
+ * - Type/recency boosts: Prioritize certain content types or fresh content
+ *
+ * Dedicated ingestion helpers declared on this class: ingestFromUrl,
+ * ingestFromDrupal, ingestFromWordPress, ingestFromSanity, ingestFromStrapi,
+ * ingestFromSitemap, ingestFromWebsite, ingestFromUrls,
+ * ingestSinglePageFromUrl and ingestFromRSS. No Contentful-specific helper is
+ * declared here.
+ */
+declare class WebRAGPlugin implements RAGPlugin {
+    name: string;
+    type: "rag";
+    priority: number;
+    private config;
+    private client;
+    private db;
+    private openai;
+    private embeddingCache;
+    private cacheStats;
+    constructor(config: WebRAGConfig);
+    private getCollection;
+    private getLedgerCollection;
+    /**
+     * List recent crawl ledger rows (for dashboards / pagination in the front).
+     */
+    listCrawlLedger(options?: {
+        agentId?: string;
+        domain?: string;
+        status?: CrawlLedgerStatus;
+        limit?: number;
+        skip?: number;
+    }): Promise<CrawlLedgerDocument[]>;
+    private resolveCrawlLedgerOptions;
+    private normalizeLedgerUrl;
+    private shouldSkipLedger;
+    private findLedgerEntry;
+    private toLedgerStatus;
+    private upsertLedgerRecord;
+    private pushPageStatus;
+    disconnect(): Promise<void>;
+    /**
+     * Retrieve contextual content for a message
+     */
+    retrieveContext(message: string, options?: {
+        agentId?: string;
+        threadId?: string;
+        filters?: Record<string, any>;
+        metadata?: Record<string, any>;
+    }): Promise<RAGContext>;
+    /**
+     * Format retrieved content for LLM context
+     */
+    private formatResultsToContext;
+    private formatFieldName;
+    private formatFieldValue;
+    private vectorSearch;
+    private generateEmbedding;
+    private generateEmbeddingsBatch;
+    /**
+     * Ingest documents into the web RAG system
+     */
+    ingest(documents: RAGDocument[], options?: IngestOptions): Promise<IngestResult>;
+    /**
+     * Update a single document
+     */
+    update(id: string, document: Partial<RAGDocument>, options?: IngestOptions): Promise<void>;
+    /**
+     * Delete document(s) by ID
+     */
+    delete(ids: string | string[], options?: IngestOptions): Promise<number>;
+    /**
+     * Bulk operations
+     */
+    bulk(operations: BulkOperation[], options?: IngestOptions): Promise<BulkResult>;
+    /**
+     * Ingest content from a URL (JSON, CSV, XML, or API)
+     */
+    ingestFromUrl(source: URLSource, options?: IngestOptions): Promise<WebURLIngestResult>;
+    private buildAuthHeaders;
+    private transformJsonToDocuments;
+    private transformCsvToDocuments;
+    private parseCsvLine;
+    private transformXmlToDocuments;
+    private extractByPath;
+    private extractField;
+    /**
+     * Ingest content from a Drupal site using JSON:API
+     */
+    ingestFromDrupal(config: DrupalConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
+    /**
+     * Parse Drupal JSON:API node type (e.g., 'node--project' → 'project')
+     */
+    static parseDrupalType(type: string): string;
+    /**
+     * Ingest content from a WordPress site using REST API
+     *
+     * @example
+     * ```typescript
+     * await plugin.ingestFromWordPress({
+     *   baseUrl: 'https://myblog.com',
+     *   postTypes: ['posts', 'pages'],
+     *   perPage: 100,
+     * });
+     * ```
+     */
+    ingestFromWordPress(config: WordPressConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
+    /**
+     * Normalize WordPress post type to a cleaner name
+     */
+    private normalizeWordPressType;
+    /**
+     * Ingest content from a Sanity.io project using GROQ queries
+     *
+     * @example
+     * ```typescript
+     * await plugin.ingestFromSanity({
+     *   projectId: 'abc123',
+     *   dataset: 'production',
+     *   queries: {
+     *     post: {
+     *       query: '*[_type == "post" && !(_id in path("drafts.**"))]',
+     *       content: 'body',
+     *       fields: {
+     *         author: 'author->name',
+     *         categories: 'categories[]->title',
+     *       },
+     *     },
+     *   },
+     * });
+     * ```
+     */
+    ingestFromSanity(config: SanityConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
+    /**
+     * Convert Sanity Portable Text blocks to plain text
+     * Useful for extracting content from rich text fields
+     */
+    static sanityBlocksToText(blocks: any[]): string;
+    /**
+     * Ingest content from a Strapi CMS (v4 by default)
+     *
+     * @example
+     * ```typescript
+     * await plugin.ingestFromStrapi({
+     *   baseUrl: 'https://my-strapi.com',
+     *   apiToken: process.env.STRAPI_TOKEN,
+     *   contentTypes: ['articles', 'pages'],
+     *   mappings: {
+     *     articles: {
+     *       content: 'attributes.content',
+     *       fields: {
+     *         author: 'attributes.author.data.attributes.name',
+     *         category: 'attributes.category.data.attributes.name',
+     *       },
+     *     },
+     *   },
+     * });
+     * ```
+     */
+    ingestFromStrapi(config: StrapiConfig, options?: IngestOptions): Promise<WebURLIngestResult[]>;
+    /**
+     * Normalize Strapi collection type to singular form
+     */
+    private normalizeStrapiType;
+    /**
+     * Ingest content by crawling a website's sitemap
+     * Perfect for non-technical clients - just provide the sitemap URL
+     *
+     * @example
+     * ```typescript
+     * // Simple usage - just provide the sitemap
+     * await plugin.ingestFromSitemap({
+     *   sitemapUrl: 'https://my-site/sitemap.xml',
+     * });
+     *
+     * // Or auto-discover sitemap from base URL
+     * await plugin.ingestFromSitemap({
+     *   baseUrl: 'https://my-site',
+     * });
+     *
+     * // With content selectors and type inference
+     * await plugin.ingestFromSitemap({
+     *   sitemapUrl: 'https://my-site/sitemap.xml',
+     *   contentSelector: 'article, .main-content',
+     *   excludePatterns: ['/cart', '/checkout', '/admin'],
+     *   typeFromUrl: {
+     *     '/projects/': 'project',
+     *     '/perspectives/': 'blog',
+     *     '/people/': 'team',
+     *   },
+     * });
+     * ```
+     */
+    ingestFromSitemap(config: SitemapConfig, options?: IngestOptions): Promise<CrawlResult>;
+    /**
+     * Ingest content from a website that has no sitemap (or sitemap is incomplete).
+     * Discovers internal links from `baseUrl` (BFS) and then crawls the discovered URLs.
+     *
+     * This uses the same extraction pipeline as `ingestFromSitemap()` (via `crawlPage()`).
+     */
+    ingestFromWebsite(config: WebsiteCrawlConfig, options?: IngestOptions): Promise<CrawlResult>;
+    /**
+     * Parse sitemap XML and extract URLs
+     */
+    private parseSitemap;
+    /**
+     * Extract URLs from sitemap XML
+     */
+    private extractUrlsFromXml;
+    private discoverInternalUrls;
+    private normalizeWebsiteUrl;
+    private fetchHtml;
+    private extractInternalLinks;
+    /**
+     * Ingest content from a list of URLs
+     *
+     * @example
+     * ```typescript
+     * await plugin.ingestFromUrls([
+     *   'https://example.com/about',
+     *   'https://example.com/services',
+     *   'https://example.com/contact',
+     * ], {
+     *   contentSelector: '.page-content',
+     *   type: 'page',
+     * });
+     * ```
+     */
+    ingestFromUrls(urls: string[], config?: UrlListConfig, options?: IngestOptions): Promise<CrawlResult>;
+    /**
+     * Ingest a single page from a URL (no sitemap discovery, no link lookup).
+     * Uses the same crawl pipeline (static/render/auto) as other web ingestion methods.
+     */
+    ingestSinglePageFromUrl(config: SinglePageConfig, options?: IngestOptions): Promise<CrawlResult>;
+    /**
+     * Crawl a list of URLs and ingest their content
+     */
+    private crawlUrls;
+    /**
+     * Crawl a single page and extract content
+     */
+    private crawlPage;
+    /**
+     * Default chain works for many WordPress / Elementor / block themes where `.first()`
+     * would otherwise hit an empty wrapper.
+     */
+    private static readonly DEFAULT_CONTENT_SELECTOR;
+    private stripNoiseFromDom;
+    /** Longest cleaned text among selector matches and full body (after noise strip). */
+    private extractBestContentText;
+    private bodyTextLengthHint;
+    private extractDocumentFromHtml;
+    private looksLikeDynamicShell;
+    private diagFromRenderedAttempt;
+    private crawlPageSmart;
+    private crawlPageRendered;
+    private discoverSitemaps;
+    private createDebugCollector;
+    /**
+     * Clean extracted text content
+     */
+    private cleanContent;
+    /**
+     * Convert URL to a stable document ID
+     */
+    private urlToId;
+    /**
+     * Delay helper
+     */
+    private delay;
+    /**
+     * Ingest content from an RSS or Atom feed
+     *
+     * @example
+     * ```typescript
+     * // Simple RSS ingestion
+     * await plugin.ingestFromRSS({
+     *   feedUrl: 'https://myblog.com/feed/',
+     * });
+     *
+     * // Fetch full page content for each item
+     * await plugin.ingestFromRSS({
+     *   feedUrl: 'https://myblog.com/feed/',
+     *   fetchFullContent: true,
+     *   contentSelector: 'article',
+     * });
+     * ```
+     */
+    ingestFromRSS(config: RSSConfig, options?: IngestOptions): Promise<CrawlResult>;
+    /**
+     * Parse RSS/Atom feed XML
+     */
+    private parseRSSFeed;
+    /**
+     * Extract a single value from XML
+     */
+    private extractXmlValue;
+    /**
+     * Extract multiple values from XML
+     */
+    private extractXmlValues;
+    /**
+     * Extract link from Atom entry
+     */
+    private extractAtomLink;
+    /**
+     * Strip HTML tags from content
+     */
+    private stripHtml;
+    /**
+     * Get cache statistics
+     */
+    getCacheStats(): {
+        hits: number;
+        misses: number;
+        hitRate: string;
+    };
+    /**
+     * Clear embedding cache
+     */
+    clearCache(): void;
+    /**
+     * Get plugin configuration (for persistence)
+     */
+    getConfig(): Record<string, any>;
+}
+export { type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageStatusEntry, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig }; // WebRAGPlugin is the only value export; every other name is type-only.