npm - @eidentic/rag - Versions diffs - 0.1.0 - Mend

@eidentic/rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.cts ADDED Viewed

@@ -0,0 +1,225 @@
+import { MemoryEvent, Scope } from '@eidentic/types';
+/**
+ * Text chunking for RAG document ingestion.
+ *
+ * Three strategies:
+ *   "fixed"     — slide a fixed-size window, breaking on word boundaries near the size limit.
+ *   "paragraph" — split on blank-line paragraph breaks first, then fall back to word boundary
+ *                 splits for paragraphs that are too long.
+ *   "sentence"  — split on sentence-ending punctuation (., !, ?) first, then fall back to word
+ *                 boundary splits for sentences that are too long.
+ *
+ * All strategies respect `overlap`: the last `overlap` characters of the previous chunk are
+ * prepended to the next chunk so retrieval can straddle boundaries.
+ */
+/** One text chunk produced by {@link chunkText}. */
+interface Chunk {
+    /** The chunk text; may start with an overlap prefix taken from the previous chunk. */
+    text: string;
+    /** Zero-based chunk index. */
+    index: number;
+    /** Start offset in the *original* text (without overlap prefix). NOTE(review): documented as a byte offset, but `ChunkOptions.size`/`overlap` are in characters — confirm the unit. */
+    start: number;
+    /** End offset in the *original* text (exclusive); same unit as `start`. */
+    end: number;
+}
+/** Options for {@link chunkText}. */
+interface ChunkOptions {
+    /**
+     * Target chunk size in characters. The actual chunk may be slightly smaller or larger when
+     * a clean boundary is hard to find near the limit. Default: 1000.
+     */
+    size?: number;
+    /**
+     * Overlap in characters — how many characters from the end of the previous chunk to prepend
+     * to the next one. Must be less than `size`. Default: 150.
+     */
+    overlap?: number;
+    /**
+     * Chunking strategy:
+     *   - "fixed"     — word-boundary window sliding over the whole text (default).
+     *   - "paragraph" — prefer blank-line boundaries; large paragraphs are further split.
+     *   - "sentence"  — prefer sentence-ending boundaries; long runs are further split.
+     */
+    strategy?: "fixed" | "paragraph" | "sentence";
+}
+/**
+ * Split `text` into overlapping chunks suitable for embedding and retrieval.
+ * @param opts - Optional size, overlap, and strategy settings; see {@link ChunkOptions} for defaults.
+ * @returns The resulting chunks; an empty array for empty/whitespace-only input.
+ */
+declare function chunkText(text: string, opts?: ChunkOptions): Chunk[];
+/** The extracted text and metadata produced by a document loader ({@link loadMarkdown}, {@link loadHtml}, {@link loadPdf}). */
+interface LoadedDocument {
+    /** Plain text extracted from the document. */
+    text: string;
+    /** Metadata attached to each ingested chunk; the loaders document a `source` entry (plus `pages` for PDFs). */
+    metadata: Record<string, unknown>;
+}
+/**
+ * Options for {@link loadMarkdown}.
+ */
+interface MarkdownLoaderOptions {
+    /** Stable source identifier placed into `metadata.source` of the loaded document. Defaults to `"markdown"`. */
+    source?: string;
+}
+/**
+ * Strip Markdown syntax and return the plain readable text as a {@link LoadedDocument}.
+ *
+ * Handles: headings, bold/italic/code spans, links/images, fenced/indented code blocks,
+ * blockquotes, horizontal rules, and HTML tags embedded in Markdown.
+ * Does NOT require any external dependency — pure regex.
+ */
+declare function loadMarkdown(content: string, opts?: MarkdownLoaderOptions): LoadedDocument;
+/**
+ * Options for {@link loadHtml}.
+ */
+interface HtmlLoaderOptions {
+    /** Stable source identifier placed into `metadata.source` of the loaded document. Defaults to `"html"`. */
+    source?: string;
+}
+/**
+ * Extract readable text from an HTML string and return it as a {@link LoadedDocument}.
+ *
+ * Removes `<script>`, `<style>`, `<head>`, and `<noscript>` elements, then
+ * walks the DOM collecting text nodes. Collapses runs of whitespace and
+ * preserves newlines at block-level boundaries.
+ *
+ * Uses `node-html-parser` — a lightweight HTML parser with no headless
+ * browser requirement.
+ */
+declare function loadHtml(html: string, opts?: HtmlLoaderOptions): LoadedDocument;
+/**
+ * Options for {@link loadPdf}.
+ */
+interface PdfLoaderOptions {
+    /** Stable source identifier placed into `metadata.source` of the loaded document. Defaults to `"pdf"`. */
+    source?: string;
+    /**
+     * Injectable parser function intended for tests — when provided, `pdf-parse` is NOT
+     * dynamically imported. Must accept a `Buffer` and return a promise of
+     * `{ text: string; numpages: number }`.
+     * @internal
+     */
+    _parser?: (buf: Buffer) => Promise<{
+        text: string;
+        numpages: number;
+    }>;
+}
+/**
+ * Extract text from a PDF `Buffer`.
+ *
+ * `pdf-parse` is an **optional peer dependency** — install it separately:
+ * ```sh
+ * npm install pdf-parse
+ * # or
+ * pnpm add pdf-parse
+ * ```
+ *
+ * @param buf  - PDF file contents as a `Buffer`.
+ * @param opts - Optional configuration; see {@link PdfLoaderOptions}.
+ * @returns A promise resolving to the extracted text and metadata (`source`, `pages`).
+ * @throws Error if `pdf-parse` is not installed and no `_parser` is provided.
+ */
+declare function loadPdf(buf: Buffer, opts?: PdfLoaderOptions): Promise<LoadedDocument>;
+/** A structural memory interface — any object with a promise-returning `ingest(events)` method works here. */
+interface IngestableMemory {
+    ingest(events: MemoryEvent[]): Promise<void>;
+}
+/** A URL-based document source; pass it to {@link ingestDocument} to have the document fetched from `url`. */
+interface UrlSource {
+    url: string;
+}
+/**
+ * A typed document content source.
+ *
+ * Use this to pass pre-loaded document bytes/strings to `ingestDocument` so the
+ * correct loader (Markdown stripper, HTML extractor, or PDF parser) is applied
+ * before chunking. The `type` field discriminates the union and fixes the type of `data`.
+ *
+ * @example
+ * ```ts
+ * await ingestDocument(
+ *   { type: "html", data: "<html>…</html>", source: "https://example.com/page" },
+ *   { memory, scope },
+ * );
+ * ```
+ */
+type TypedContentSource = {
+    type: "markdown";
+    data: string;
+    source?: string;
+} | {
+    type: "html";
+    data: string;
+    source?: string;
+} | {
+    type: "pdf";
+    data: Buffer;
+    source?: string;
+    _parser?: PdfLoaderOptions["_parser"];
+};
+/** Options for {@link ingestDocument}. */
+interface IngestDocumentOptions {
+    /** The memory to ingest into. Accepts any object with `ingest(events)` — not tied to `@eidentic/memory`. */
+    memory: IngestableMemory;
+    /** The memory scope to attach events to. */
+    scope: Scope;
+    /**
+     * Stable document identifier used to build chunk ids (`${docId}:chunk:${i}`).
+     * Defaults to a slug derived from the source URL or a truncated hash of the text.
+     */
+    docId?: string;
+    /** Chunking options forwarded to {@link chunkText}. */
+    chunk?: ChunkOptions;
+    /**
+     * Fetch implementation override (useful in tests). Defaults to `resilientFetch` (not exported from this module).
+     *
+     * **SSRF contract:** the provided implementation MUST respect the `redirect: "manual"`
+     * option and return a 3xx response instead of silently following redirects.  If the
+     * implementation auto-follows redirects (e.g. the default `globalThis.fetch` without
+     * the `manual` option), the SSRF guard will detect this at runtime and throw, because
+     * redirect chains must be validated hop-by-hop.  If you supply a custom fetch, ensure
+     * it honours `{ redirect: "manual" }`.
+     */
+    fetchImpl?: typeof fetch;
+    /**
+     * Optional egress allowlist of hostnames for URL-based ingestion (§5.6 / §10.3).
+     *
+     * - **Omitted (`undefined`):** no domain restriction — any public http(s) host is allowed
+     *   (private/loopback/metadata hosts are still always blocked by the SSRF guard).
+     * - **Empty array (`[]`):** denies ALL URL fetches (explicit lockdown).
+     * - **Non-empty:** restricts URL fetches to the listed hosts and their subdomains.
+     *
+     * Has no effect when `source` is a plain string (no fetch occurs).
+     */
+    allowlist?: string[];
+}
+/**
+ * Ingest a document into a memory store via chunking.
+ *
+ * @param source - The document source. Three accepted forms (a single union-typed parameter):
+ *   - **`string`** — raw text, chunked directly. **No network fetch occurs.**
+ *     If the string starts with `http://` or `https://` you probably meant
+ *     `{ url: "..." }` instead — a warning will be emitted to `console.warn`.
+ *   - **`{ url: string }`** — fetch the URL (public http(s) only; private/loopback
+ *     /metadata addresses are always rejected by the SSRF guard) and treat the
+ *     response body as plain text. All redirect hops are re-validated against the
+ *     same SSRF guard before following; a maximum of `MAX_REDIRECT_HOPS` hops
+ *     is enforced to prevent redirect loops.
+ *   - **`TypedContentSource`** — a pre-loaded document with an explicit type:
+ *       - `{ type: "markdown", data: string }` — strip MD syntax then chunk.
+ *       - `{ type: "html", data: string }` — extract readable text then chunk.
+ *       - `{ type: "pdf", data: Buffer }` — parse PDF via `pdf-parse` then chunk.
+ *         `pdf-parse` must be installed separately (`npm install pdf-parse`).
+ * @param opts - Target memory, scope, and optional ingestion settings; see {@link IngestDocumentOptions}.
+ * @returns A promise resolving to `{ chunks: number }` — the number of chunks ingested.
+ */
+declare function ingestDocument(source: string | UrlSource | TypedContentSource, opts: IngestDocumentOptions): Promise<{
+    chunks: number;
+}>;
+export { type Chunk, type ChunkOptions, type HtmlLoaderOptions, type IngestDocumentOptions, type IngestableMemory, type LoadedDocument, type MarkdownLoaderOptions, type PdfLoaderOptions, type TypedContentSource, type UrlSource, chunkText, ingestDocument, loadHtml, loadMarkdown, loadPdf };

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,225 @@
+import { MemoryEvent, Scope } from '@eidentic/types';
+/**
+ * Text chunking for RAG document ingestion.
+ *
+ * Three strategies:
+ *   "fixed"     — slide a fixed-size window, breaking on word boundaries near the size limit.
+ *   "paragraph" — split on blank-line paragraph breaks first, then fall back to word boundary
+ *                 splits for paragraphs that are too long.
+ *   "sentence"  — split on sentence-ending punctuation (., !, ?) first, then fall back to word
+ *                 boundary splits for sentences that are too long.
+ *
+ * All strategies respect `overlap`: the last `overlap` characters of the previous chunk are
+ * prepended to the next chunk so retrieval can straddle boundaries.
+ */
+/** One text chunk produced by {@link chunkText}. */
+interface Chunk {
+    /** The chunk text; may start with an overlap prefix taken from the previous chunk. */
+    text: string;
+    /** Zero-based chunk index. */
+    index: number;
+    /** Start offset in the *original* text (without overlap prefix). NOTE(review): documented as a byte offset, but `ChunkOptions.size`/`overlap` are in characters — confirm the unit. */
+    start: number;
+    /** End offset in the *original* text (exclusive); same unit as `start`. */
+    end: number;
+}
+/** Options for {@link chunkText}. */
+interface ChunkOptions {
+    /**
+     * Target chunk size in characters. The actual chunk may be slightly smaller or larger when
+     * a clean boundary is hard to find near the limit. Default: 1000.
+     */
+    size?: number;
+    /**
+     * Overlap in characters — how many characters from the end of the previous chunk to prepend
+     * to the next one. Must be less than `size`. Default: 150.
+     */
+    overlap?: number;
+    /**
+     * Chunking strategy:
+     *   - "fixed"     — word-boundary window sliding over the whole text (default).
+     *   - "paragraph" — prefer blank-line boundaries; large paragraphs are further split.
+     *   - "sentence"  — prefer sentence-ending boundaries; long runs are further split.
+     */
+    strategy?: "fixed" | "paragraph" | "sentence";
+}
+/**
+ * Split `text` into overlapping chunks suitable for embedding and retrieval.
+ * @param opts - Optional size, overlap, and strategy settings; see {@link ChunkOptions} for defaults.
+ * @returns The resulting chunks; an empty array for empty/whitespace-only input.
+ */
+declare function chunkText(text: string, opts?: ChunkOptions): Chunk[];
+/** The extracted text and metadata produced by a document loader ({@link loadMarkdown}, {@link loadHtml}, {@link loadPdf}). */
+interface LoadedDocument {
+    /** Plain text extracted from the document. */
+    text: string;
+    /** Metadata attached to each ingested chunk; the loaders document a `source` entry (plus `pages` for PDFs). */
+    metadata: Record<string, unknown>;
+}
+/**
+ * Options for {@link loadMarkdown}.
+ */
+interface MarkdownLoaderOptions {
+    /** Stable source identifier placed into `metadata.source` of the loaded document. Defaults to `"markdown"`. */
+    source?: string;
+}
+/**
+ * Strip Markdown syntax and return the plain readable text as a {@link LoadedDocument}.
+ *
+ * Handles: headings, bold/italic/code spans, links/images, fenced/indented code blocks,
+ * blockquotes, horizontal rules, and HTML tags embedded in Markdown.
+ * Does NOT require any external dependency — pure regex.
+ */
+declare function loadMarkdown(content: string, opts?: MarkdownLoaderOptions): LoadedDocument;
+/**
+ * Options for {@link loadHtml}.
+ */
+interface HtmlLoaderOptions {
+    /** Stable source identifier placed into `metadata.source` of the loaded document. Defaults to `"html"`. */
+    source?: string;
+}
+/**
+ * Extract readable text from an HTML string and return it as a {@link LoadedDocument}.
+ *
+ * Removes `<script>`, `<style>`, `<head>`, and `<noscript>` elements, then
+ * walks the DOM collecting text nodes. Collapses runs of whitespace and
+ * preserves newlines at block-level boundaries.
+ *
+ * Uses `node-html-parser` — a lightweight HTML parser with no headless
+ * browser requirement.
+ */
+declare function loadHtml(html: string, opts?: HtmlLoaderOptions): LoadedDocument;
+/**
+ * Options for {@link loadPdf}.
+ */
+interface PdfLoaderOptions {
+    /** Stable source identifier placed into `metadata.source` of the loaded document. Defaults to `"pdf"`. */
+    source?: string;
+    /**
+     * Injectable parser function intended for tests — when provided, `pdf-parse` is NOT
+     * dynamically imported. Must accept a `Buffer` and return a promise of
+     * `{ text: string; numpages: number }`.
+     * @internal
+     */
+    _parser?: (buf: Buffer) => Promise<{
+        text: string;
+        numpages: number;
+    }>;
+}
+/**
+ * Extract text from a PDF `Buffer`.
+ *
+ * `pdf-parse` is an **optional peer dependency** — install it separately:
+ * ```sh
+ * npm install pdf-parse
+ * # or
+ * pnpm add pdf-parse
+ * ```
+ *
+ * @param buf  - PDF file contents as a `Buffer`.
+ * @param opts - Optional configuration; see {@link PdfLoaderOptions}.
+ * @returns A promise resolving to the extracted text and metadata (`source`, `pages`).
+ * @throws Error if `pdf-parse` is not installed and no `_parser` is provided.
+ */
+declare function loadPdf(buf: Buffer, opts?: PdfLoaderOptions): Promise<LoadedDocument>;
+/** A structural memory interface — any object with a promise-returning `ingest(events)` method works here. */
+interface IngestableMemory {
+    ingest(events: MemoryEvent[]): Promise<void>;
+}
+/** A URL-based document source; pass it to {@link ingestDocument} to have the document fetched from `url`. */
+interface UrlSource {
+    url: string;
+}
+/**
+ * A typed document content source.
+ *
+ * Use this to pass pre-loaded document bytes/strings to `ingestDocument` so the
+ * correct loader (Markdown stripper, HTML extractor, or PDF parser) is applied
+ * before chunking. The `type` field discriminates the union and fixes the type of `data`.
+ *
+ * @example
+ * ```ts
+ * await ingestDocument(
+ *   { type: "html", data: "<html>…</html>", source: "https://example.com/page" },
+ *   { memory, scope },
+ * );
+ * ```
+ */
+type TypedContentSource = {
+    type: "markdown";
+    data: string;
+    source?: string;
+} | {
+    type: "html";
+    data: string;
+    source?: string;
+} | {
+    type: "pdf";
+    data: Buffer;
+    source?: string;
+    _parser?: PdfLoaderOptions["_parser"];
+};
+/** Options for {@link ingestDocument}. */
+interface IngestDocumentOptions {
+    /** The memory to ingest into. Accepts any object with `ingest(events)` — not tied to `@eidentic/memory`. */
+    memory: IngestableMemory;
+    /** The memory scope to attach events to. */
+    scope: Scope;
+    /**
+     * Stable document identifier used to build chunk ids (`${docId}:chunk:${i}`).
+     * Defaults to a slug derived from the source URL or a truncated hash of the text.
+     */
+    docId?: string;
+    /** Chunking options forwarded to {@link chunkText}. */
+    chunk?: ChunkOptions;
+    /**
+     * Fetch implementation override (useful in tests). Defaults to `resilientFetch` (not exported from this module).
+     *
+     * **SSRF contract:** the provided implementation MUST respect the `redirect: "manual"`
+     * option and return a 3xx response instead of silently following redirects.  If the
+     * implementation auto-follows redirects (e.g. the default `globalThis.fetch` without
+     * the `manual` option), the SSRF guard will detect this at runtime and throw, because
+     * redirect chains must be validated hop-by-hop.  If you supply a custom fetch, ensure
+     * it honours `{ redirect: "manual" }`.
+     */
+    fetchImpl?: typeof fetch;
+    /**
+     * Optional egress allowlist of hostnames for URL-based ingestion (§5.6 / §10.3).
+     *
+     * - **Omitted (`undefined`):** no domain restriction — any public http(s) host is allowed
+     *   (private/loopback/metadata hosts are still always blocked by the SSRF guard).
+     * - **Empty array (`[]`):** denies ALL URL fetches (explicit lockdown).
+     * - **Non-empty:** restricts URL fetches to the listed hosts and their subdomains.
+     *
+     * Has no effect when `source` is a plain string (no fetch occurs).
+     */
+    allowlist?: string[];
+}
+/**
+ * Ingest a document into a memory store via chunking.
+ *
+ * @param source - The document source. Three accepted forms (a single union-typed parameter):
+ *   - **`string`** — raw text, chunked directly. **No network fetch occurs.**
+ *     If the string starts with `http://` or `https://` you probably meant
+ *     `{ url: "..." }` instead — a warning will be emitted to `console.warn`.
+ *   - **`{ url: string }`** — fetch the URL (public http(s) only; private/loopback
+ *     /metadata addresses are always rejected by the SSRF guard) and treat the
+ *     response body as plain text. All redirect hops are re-validated against the
+ *     same SSRF guard before following; a maximum of `MAX_REDIRECT_HOPS` hops
+ *     is enforced to prevent redirect loops.
+ *   - **`TypedContentSource`** — a pre-loaded document with an explicit type:
+ *       - `{ type: "markdown", data: string }` — strip MD syntax then chunk.
+ *       - `{ type: "html", data: string }` — extract readable text then chunk.
+ *       - `{ type: "pdf", data: Buffer }` — parse PDF via `pdf-parse` then chunk.
+ *         `pdf-parse` must be installed separately (`npm install pdf-parse`).
+ * @param opts - Target memory, scope, and optional ingestion settings; see {@link IngestDocumentOptions}.
+ * @returns A promise resolving to `{ chunks: number }` — the number of chunks ingested.
+ */
+declare function ingestDocument(source: string | UrlSource | TypedContentSource, opts: IngestDocumentOptions): Promise<{
+    chunks: number;
+}>;
+export { type Chunk, type ChunkOptions, type HtmlLoaderOptions, type IngestDocumentOptions, type IngestableMemory, type LoadedDocument, type MarkdownLoaderOptions, type PdfLoaderOptions, type TypedContentSource, type UrlSource, chunkText, ingestDocument, loadHtml, loadMarkdown, loadPdf };