@eidentic/rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,225 @@
1
+ import { MemoryEvent, Scope } from '@eidentic/types';
2
+
3
+ /**
4
+ * Text chunking for RAG document ingestion.
5
+ *
6
+ * Three strategies:
7
+ * "fixed" — slide a fixed-size window, breaking on word boundaries near the size limit.
8
+ * "paragraph" — split on blank-line paragraph breaks first, then fall back to word boundary
9
+ * splits for paragraphs that are too long.
10
+ * "sentence" — split on sentence-ending punctuation (., !, ?) first, then fall back to word
11
+ * boundary splits for sentences that are too long.
12
+ *
13
+ * All strategies respect `overlap`: the last `overlap` characters of the previous chunk are
14
+ * prepended to the next chunk so retrieval can straddle boundaries.
15
+ */
16
+ /** One text chunk produced by {@link chunkText}. */
17
+ interface Chunk {
18
+ /** The chunk text (may include an overlap prefix copied from the previous chunk). */
19
+ text: string;
20
+ /** Zero-based chunk index. */
21
+ index: number;
22
+ /** Start byte-offset in the *original* text (without overlap prefix). NOTE(review): `ChunkOptions.size` and `overlap` are documented in characters — confirm whether this offset is really in bytes. */
23
+ start: number;
24
+ /** End byte-offset in the *original* text (exclusive). NOTE(review): same unit as `start` — confirm bytes vs. characters. */
25
+ end: number;
26
+ }
27
+ /** Options for {@link chunkText}. Every field is optional; defaults are listed per field. */
28
+ interface ChunkOptions {
29
+ /**
30
+ * Target chunk size in characters. The actual chunk may be slightly smaller or larger if it
31
+ * is hard to find a clean boundary near the limit. Default: 1000.
32
+ */
33
+ size?: number;
34
+ /**
35
+ * Overlap in characters — how many characters from the end of the previous chunk to prepend
36
+ * to the next one. Must be less than `size`. Default: 150.
37
+ */
38
+ overlap?: number;
39
+ /**
40
+ * Chunking strategy:
41
+ * - "fixed" — word-boundary window sliding over the whole text (default).
42
+ * - "paragraph" — prefer blank-line boundaries; large paragraphs are further split.
43
+ * - "sentence" — prefer sentence-ending boundaries; long runs are further split.
44
+ */
45
+ strategy?: "fixed" | "paragraph" | "sentence";
46
+ }
47
+ /**
48
+ * Split `text` into overlapping chunks suitable for embedding and retrieval. `opts` is optional; see {@link ChunkOptions} for the documented defaults.
49
+ *
50
+ * @returns The chunks as {@link Chunk} objects; an empty array for empty/whitespace-only input.
51
+ */
52
+ declare function chunkText(text: string, opts?: ChunkOptions): Chunk[];
53
+
54
+ /** The extracted text and metadata from a document loader ({@link loadMarkdown}, {@link loadHtml}, or {@link loadPdf}). */
55
+ interface LoadedDocument {
56
+ /** Plain text extracted from the document. */
57
+ text: string;
58
+ /** Metadata attached to each ingested chunk. The loader docs mention a `source` entry, and {@link loadPdf} documents `pages` as well. */
59
+ metadata: Record<string, unknown>;
60
+ }
61
+ /**
62
+ * Options for {@link loadMarkdown}.
63
+ */
64
+ interface MarkdownLoaderOptions {
65
+ /** Stable source identifier placed into `metadata.source` of the returned {@link LoadedDocument}. Defaults to `"markdown"`. */
66
+ source?: string;
67
+ }
68
+ /**
69
+ * Strip Markdown syntax from `content` and return the plain readable text as a {@link LoadedDocument}.
70
+ *
71
+ * Handles: headings, bold/italic/code spans, links/images, fenced/indented code blocks,
72
+ * blockquotes, horizontal rules, and HTML tags embedded in the Markdown.
73
+ * Does NOT require any external dependency — pure regex.
74
+ */
75
+ declare function loadMarkdown(content: string, opts?: MarkdownLoaderOptions): LoadedDocument;
76
+ /**
77
+ * Options for {@link loadHtml}.
78
+ */
79
+ interface HtmlLoaderOptions {
80
+ /** Stable source identifier placed into `metadata.source` of the returned {@link LoadedDocument}. Defaults to `"html"`. */
81
+ source?: string;
82
+ }
83
+ /**
84
+ * Extract readable text from the `html` string and return it as a {@link LoadedDocument}.
85
+ *
86
+ * Removes `<script>`, `<style>`, `<head>`, and `<noscript>` elements, then
87
+ * walks the DOM collecting text nodes. Collapses runs of whitespace and
88
+ * preserves newlines at block-level boundaries.
89
+ *
90
+ * Uses `node-html-parser` — a lightweight HTML parser with no headless
91
+ * browser requirement.
92
+ */
93
+ declare function loadHtml(html: string, opts?: HtmlLoaderOptions): LoadedDocument;
94
+ /**
95
+ * Options for {@link loadPdf}.
96
+ */
97
+ interface PdfLoaderOptions {
98
+ /** Stable source identifier placed into `metadata.source` of the returned {@link LoadedDocument}. Defaults to `"pdf"`. */
99
+ source?: string;
100
+ /**
101
+ * Injectable parser function for testing — when provided, `pdf-parse` is NOT
102
+ * dynamically imported. Must accept a `Buffer` and return a promise of
103
+ * `{ text: string; numpages: number }`. The `pdf` variant of {@link TypedContentSource} declares a `_parser` field of this same type.
104
+ * @internal
105
+ */
106
+ _parser?: (buf: Buffer) => Promise<{
107
+ text: string;
108
+ numpages: number;
109
+ }>;
110
+ }
111
+ /**
112
+ * Extract text from a PDF `Buffer`.
113
+ *
114
+ * `pdf-parse` is an **optional peer dependency** — install it separately:
115
+ * ```sh
116
+ * npm install pdf-parse
117
+ * # or
118
+ * pnpm add pdf-parse
119
+ * ```
120
+ *
121
+ * @param buf - PDF file contents as a `Buffer`.
122
+ * @param opts - Optional configuration; see {@link PdfLoaderOptions}.
123
+ * @returns A promise of the extracted text and metadata (`source`, `pages`).
124
+ * @throws Error if `pdf-parse` is not installed and no `_parser` is provided.
125
+ */
126
+ declare function loadPdf(buf: Buffer, opts?: PdfLoaderOptions): Promise<LoadedDocument>;
127
+
128
+ /** A structural memory interface — any object with an `ingest(events)` method returning a promise works here. Used as the type of `IngestDocumentOptions.memory`. */
129
+ interface IngestableMemory {
130
+ ingest(events: MemoryEvent[]): Promise<void>;
131
+ }
132
+ /** A URL-based document source. Accepted by {@link ingestDocument}, whose documentation describes the fetch and SSRF rules applied to `url`. */
133
+ interface UrlSource {
134
+ url: string;
135
+ }
136
+ /**
137
+ * A typed document content source, discriminated by its `type` field.
138
+ *
139
+ * Use this to pass pre-loaded document bytes/strings to `ingestDocument` so the
140
+ * correct loader (Markdown stripper, HTML extractor, or PDF parser) is applied
141
+ * before chunking. The `pdf` variant additionally accepts `_parser`, typed as `PdfLoaderOptions["_parser"]`.
142
+ *
143
+ * Example:
144
+ * ```ts
145
+ * await ingestDocument(
146
+ * { type: "html", data: "<html>…</html>", source: "https://example.com/page" },
147
+ * { memory, scope },
148
+ * );
149
+ * ```
150
+ */
151
+ type TypedContentSource = {
152
+ type: "markdown";
153
+ data: string;
154
+ source?: string;
155
+ } | {
156
+ type: "html";
157
+ data: string;
158
+ source?: string;
159
+ } | {
160
+ type: "pdf";
161
+ data: Buffer;
162
+ source?: string;
163
+ _parser?: PdfLoaderOptions["_parser"];
164
+ };
165
+ /** Options for {@link ingestDocument}. `memory` and `scope` are required; everything else is optional. */
166
+ interface IngestDocumentOptions {
167
+ /** The memory to ingest into. Accepts any object with `ingest(events)` — not tied to `@eidentic/memory`. */
168
+ memory: IngestableMemory;
169
+ /** The memory scope to attach events to. */
170
+ scope: Scope;
171
+ /**
172
+ * Stable document identifier used to build chunk ids (`${docId}:chunk:${i}`).
173
+ * Defaults to a slug derived from the source URL or a truncated hash of the text.
174
+ */
175
+ docId?: string;
176
+ /** Chunking options forwarded to {@link chunkText}. */
177
+ chunk?: ChunkOptions;
178
+ /**
179
+ * Fetch implementation override (useful in tests). Defaults to `resilientFetch` (named rather than linked: it is not among this module's exports).
180
+ *
181
+ * **SSRF contract:** the provided implementation MUST respect the `redirect: "manual"`
182
+ * option and return a 3xx response instead of silently following redirects. If the
183
+ * implementation auto-follows redirects (e.g. the default `globalThis.fetch` without
184
+ * the `manual` option), the SSRF guard will detect this at runtime and throw, because
185
+ * redirect chains must be validated hop-by-hop. If you supply a custom fetch, ensure
186
+ * it honours `{ redirect: "manual" }`.
187
+ */
188
+ fetchImpl?: typeof fetch;
189
+ /**
190
+ * Optional egress allowlist of hostnames for URL-based ingestion (§5.6 / §10.3).
191
+ *
192
+ * - **Omitted (`undefined`):** no domain restriction — any public http(s) host is allowed
193
+ * (private/loopback/metadata hosts are still always blocked by the SSRF guard).
194
+ * - **Empty array (`[]`):** denies ALL URL fetches (explicit lockdown).
195
+ * - **Non-empty:** restricts URL fetches to the listed hosts and their subdomains.
196
+ *
197
+ * Has no effect when `source` is a plain string (no fetch occurs). NOTE(review): presumably also irrelevant for pre-loaded `TypedContentSource` inputs — confirm.
198
+ */
199
+ allowlist?: string[];
200
+ }
201
+ /**
202
+ * Ingest a document into a memory store via chunking.
203
+ *
204
+ * @param source - The document source. Three accepted forms (one union-typed parameter, not separate overloads):
205
+ * - **`string`** — raw text, chunked directly. **No network fetch occurs.**
206
+ * If the string starts with `http://` or `https://` you probably meant
207
+ * `{ url: "..." }` instead — a warning will be emitted to `console.warn`.
208
+ * - **`{ url: string }`** — fetch the URL (public http(s) only; private/loopback
209
+ * /metadata addresses are always rejected by the SSRF guard) and treat the
210
+ * response body as plain text. All redirect hops are re-validated against the
211
+ * same SSRF guard before following; a maximum of `MAX_REDIRECT_HOPS` hops (a constant not exported from this module)
212
+ * is enforced to prevent redirect loops.
213
+ * - **`TypedContentSource`** — a pre-loaded document with an explicit type:
214
+ * - `{ type: "markdown", data: string }` — strip MD syntax then chunk.
215
+ * - `{ type: "html", data: string }` — extract readable text then chunk.
216
+ * - `{ type: "pdf", data: Buffer }` — parse PDF via `pdf-parse` then chunk.
217
+ * `pdf-parse` must be installed separately (`npm install pdf-parse`).
218
+ * @param opts - Target memory, scope, and optional chunking/fetch settings; see {@link IngestDocumentOptions}.
219
+ * @returns A promise of `{ chunks: number }` — the number of chunks ingested.
220
+ */
221
+ declare function ingestDocument(source: string | UrlSource | TypedContentSource, opts: IngestDocumentOptions): Promise<{
222
+ chunks: number;
223
+ }>;
224
+
225
+ export { type Chunk, type ChunkOptions, type HtmlLoaderOptions, type IngestDocumentOptions, type IngestableMemory, type LoadedDocument, type MarkdownLoaderOptions, type PdfLoaderOptions, type TypedContentSource, type UrlSource, chunkText, ingestDocument, loadHtml, loadMarkdown, loadPdf };
@@ -0,0 +1,225 @@
1
+ import { MemoryEvent, Scope } from '@eidentic/types';
2
+
3
+ /**
4
+ * Text chunking for RAG document ingestion.
5
+ *
6
+ * Three strategies:
7
+ * "fixed" — slide a fixed-size window, breaking on word boundaries near the size limit.
8
+ * "paragraph" — split on blank-line paragraph breaks first, then fall back to word boundary
9
+ * splits for paragraphs that are too long.
10
+ * "sentence" — split on sentence-ending punctuation (., !, ?) first, then fall back to word
11
+ * boundary splits for sentences that are too long.
12
+ *
13
+ * All strategies respect `overlap`: the last `overlap` characters of the previous chunk are
14
+ * prepended to the next chunk so retrieval can straddle boundaries.
15
+ */
16
+ /** One text chunk produced by {@link chunkText}. */
17
+ interface Chunk {
18
+ /** The chunk text (may include an overlap prefix copied from the previous chunk). */
19
+ text: string;
20
+ /** Zero-based chunk index. */
21
+ index: number;
22
+ /** Start byte-offset in the *original* text (without overlap prefix). NOTE(review): `ChunkOptions.size` and `overlap` are documented in characters — confirm whether this offset is really in bytes. */
23
+ start: number;
24
+ /** End byte-offset in the *original* text (exclusive). NOTE(review): same unit as `start` — confirm bytes vs. characters. */
25
+ end: number;
26
+ }
27
+ /** Options for {@link chunkText}. Every field is optional; defaults are listed per field. */
28
+ interface ChunkOptions {
29
+ /**
30
+ * Target chunk size in characters. The actual chunk may be slightly smaller or larger if it
31
+ * is hard to find a clean boundary near the limit. Default: 1000.
32
+ */
33
+ size?: number;
34
+ /**
35
+ * Overlap in characters — how many characters from the end of the previous chunk to prepend
36
+ * to the next one. Must be less than `size`. Default: 150.
37
+ */
38
+ overlap?: number;
39
+ /**
40
+ * Chunking strategy:
41
+ * - "fixed" — word-boundary window sliding over the whole text (default).
42
+ * - "paragraph" — prefer blank-line boundaries; large paragraphs are further split.
43
+ * - "sentence" — prefer sentence-ending boundaries; long runs are further split.
44
+ */
45
+ strategy?: "fixed" | "paragraph" | "sentence";
46
+ }
47
+ /**
48
+ * Split `text` into overlapping chunks suitable for embedding and retrieval. `opts` is optional; see {@link ChunkOptions} for the documented defaults.
49
+ *
50
+ * @returns The chunks as {@link Chunk} objects; an empty array for empty/whitespace-only input.
51
+ */
52
+ declare function chunkText(text: string, opts?: ChunkOptions): Chunk[];
53
+
54
+ /** The extracted text and metadata from a document loader ({@link loadMarkdown}, {@link loadHtml}, or {@link loadPdf}). */
55
+ interface LoadedDocument {
56
+ /** Plain text extracted from the document. */
57
+ text: string;
58
+ /** Metadata attached to each ingested chunk. The loader docs mention a `source` entry, and {@link loadPdf} documents `pages` as well. */
59
+ metadata: Record<string, unknown>;
60
+ }
61
+ /**
62
+ * Options for {@link loadMarkdown}.
63
+ */
64
+ interface MarkdownLoaderOptions {
65
+ /** Stable source identifier placed into `metadata.source` of the returned {@link LoadedDocument}. Defaults to `"markdown"`. */
66
+ source?: string;
67
+ }
68
+ /**
69
+ * Strip Markdown syntax from `content` and return the plain readable text as a {@link LoadedDocument}.
70
+ *
71
+ * Handles: headings, bold/italic/code spans, links/images, fenced/indented code blocks,
72
+ * blockquotes, horizontal rules, and HTML tags embedded in the Markdown.
73
+ * Does NOT require any external dependency — pure regex.
74
+ */
75
+ declare function loadMarkdown(content: string, opts?: MarkdownLoaderOptions): LoadedDocument;
76
+ /**
77
+ * Options for {@link loadHtml}.
78
+ */
79
+ interface HtmlLoaderOptions {
80
+ /** Stable source identifier placed into `metadata.source` of the returned {@link LoadedDocument}. Defaults to `"html"`. */
81
+ source?: string;
82
+ }
83
+ /**
84
+ * Extract readable text from the `html` string and return it as a {@link LoadedDocument}.
85
+ *
86
+ * Removes `<script>`, `<style>`, `<head>`, and `<noscript>` elements, then
87
+ * walks the DOM collecting text nodes. Collapses runs of whitespace and
88
+ * preserves newlines at block-level boundaries.
89
+ *
90
+ * Uses `node-html-parser` — a lightweight HTML parser with no headless
91
+ * browser requirement.
92
+ */
93
+ declare function loadHtml(html: string, opts?: HtmlLoaderOptions): LoadedDocument;
94
+ /**
95
+ * Options for {@link loadPdf}.
96
+ */
97
+ interface PdfLoaderOptions {
98
+ /** Stable source identifier placed into `metadata.source` of the returned {@link LoadedDocument}. Defaults to `"pdf"`. */
99
+ source?: string;
100
+ /**
101
+ * Injectable parser function for testing — when provided, `pdf-parse` is NOT
102
+ * dynamically imported. Must accept a `Buffer` and return a promise of
103
+ * `{ text: string; numpages: number }`. The `pdf` variant of {@link TypedContentSource} declares a `_parser` field of this same type.
104
+ * @internal
105
+ */
106
+ _parser?: (buf: Buffer) => Promise<{
107
+ text: string;
108
+ numpages: number;
109
+ }>;
110
+ }
111
+ /**
112
+ * Extract text from a PDF `Buffer`.
113
+ *
114
+ * `pdf-parse` is an **optional peer dependency** — install it separately:
115
+ * ```sh
116
+ * npm install pdf-parse
117
+ * # or
118
+ * pnpm add pdf-parse
119
+ * ```
120
+ *
121
+ * @param buf - PDF file contents as a `Buffer`.
122
+ * @param opts - Optional configuration; see {@link PdfLoaderOptions}.
123
+ * @returns A promise of the extracted text and metadata (`source`, `pages`).
124
+ * @throws Error if `pdf-parse` is not installed and no `_parser` is provided.
125
+ */
126
+ declare function loadPdf(buf: Buffer, opts?: PdfLoaderOptions): Promise<LoadedDocument>;
127
+
128
+ /** A structural memory interface — any object with an `ingest(events)` method returning a promise works here. Used as the type of `IngestDocumentOptions.memory`. */
129
+ interface IngestableMemory {
130
+ ingest(events: MemoryEvent[]): Promise<void>;
131
+ }
132
+ /** A URL-based document source. Accepted by {@link ingestDocument}, whose documentation describes the fetch and SSRF rules applied to `url`. */
133
+ interface UrlSource {
134
+ url: string;
135
+ }
136
+ /**
137
+ * A typed document content source, discriminated by its `type` field.
138
+ *
139
+ * Use this to pass pre-loaded document bytes/strings to `ingestDocument` so the
140
+ * correct loader (Markdown stripper, HTML extractor, or PDF parser) is applied
141
+ * before chunking. The `pdf` variant additionally accepts `_parser`, typed as `PdfLoaderOptions["_parser"]`.
142
+ *
143
+ * Example:
144
+ * ```ts
145
+ * await ingestDocument(
146
+ * { type: "html", data: "<html>…</html>", source: "https://example.com/page" },
147
+ * { memory, scope },
148
+ * );
149
+ * ```
150
+ */
151
+ type TypedContentSource = {
152
+ type: "markdown";
153
+ data: string;
154
+ source?: string;
155
+ } | {
156
+ type: "html";
157
+ data: string;
158
+ source?: string;
159
+ } | {
160
+ type: "pdf";
161
+ data: Buffer;
162
+ source?: string;
163
+ _parser?: PdfLoaderOptions["_parser"];
164
+ };
165
+ /** Options for {@link ingestDocument}. `memory` and `scope` are required; everything else is optional. */
166
+ interface IngestDocumentOptions {
167
+ /** The memory to ingest into. Accepts any object with `ingest(events)` — not tied to `@eidentic/memory`. */
168
+ memory: IngestableMemory;
169
+ /** The memory scope to attach events to. */
170
+ scope: Scope;
171
+ /**
172
+ * Stable document identifier used to build chunk ids (`${docId}:chunk:${i}`).
173
+ * Defaults to a slug derived from the source URL or a truncated hash of the text.
174
+ */
175
+ docId?: string;
176
+ /** Chunking options forwarded to {@link chunkText}. */
177
+ chunk?: ChunkOptions;
178
+ /**
179
+ * Fetch implementation override (useful in tests). Defaults to `resilientFetch` (named rather than linked: it is not among this module's exports).
180
+ *
181
+ * **SSRF contract:** the provided implementation MUST respect the `redirect: "manual"`
182
+ * option and return a 3xx response instead of silently following redirects. If the
183
+ * implementation auto-follows redirects (e.g. the default `globalThis.fetch` without
184
+ * the `manual` option), the SSRF guard will detect this at runtime and throw, because
185
+ * redirect chains must be validated hop-by-hop. If you supply a custom fetch, ensure
186
+ * it honours `{ redirect: "manual" }`.
187
+ */
188
+ fetchImpl?: typeof fetch;
189
+ /**
190
+ * Optional egress allowlist of hostnames for URL-based ingestion (§5.6 / §10.3).
191
+ *
192
+ * - **Omitted (`undefined`):** no domain restriction — any public http(s) host is allowed
193
+ * (private/loopback/metadata hosts are still always blocked by the SSRF guard).
194
+ * - **Empty array (`[]`):** denies ALL URL fetches (explicit lockdown).
195
+ * - **Non-empty:** restricts URL fetches to the listed hosts and their subdomains.
196
+ *
197
+ * Has no effect when `source` is a plain string (no fetch occurs). NOTE(review): presumably also irrelevant for pre-loaded `TypedContentSource` inputs — confirm.
198
+ */
199
+ allowlist?: string[];
200
+ }
201
+ /**
202
+ * Ingest a document into a memory store via chunking.
203
+ *
204
+ * @param source - The document source. Three accepted forms (one union-typed parameter, not separate overloads):
205
+ * - **`string`** — raw text, chunked directly. **No network fetch occurs.**
206
+ * If the string starts with `http://` or `https://` you probably meant
207
+ * `{ url: "..." }` instead — a warning will be emitted to `console.warn`.
208
+ * - **`{ url: string }`** — fetch the URL (public http(s) only; private/loopback
209
+ * /metadata addresses are always rejected by the SSRF guard) and treat the
210
+ * response body as plain text. All redirect hops are re-validated against the
211
+ * same SSRF guard before following; a maximum of `MAX_REDIRECT_HOPS` hops (a constant not exported from this module)
212
+ * is enforced to prevent redirect loops.
213
+ * - **`TypedContentSource`** — a pre-loaded document with an explicit type:
214
+ * - `{ type: "markdown", data: string }` — strip MD syntax then chunk.
215
+ * - `{ type: "html", data: string }` — extract readable text then chunk.
216
+ * - `{ type: "pdf", data: Buffer }` — parse PDF via `pdf-parse` then chunk.
217
+ * `pdf-parse` must be installed separately (`npm install pdf-parse`).
218
+ * @param opts - Target memory, scope, and optional chunking/fetch settings; see {@link IngestDocumentOptions}.
219
+ * @returns A promise of `{ chunks: number }` — the number of chunks ingested.
220
+ */
221
+ declare function ingestDocument(source: string | UrlSource | TypedContentSource, opts: IngestDocumentOptions): Promise<{
222
+ chunks: number;
223
+ }>;
224
+
225
+ export { type Chunk, type ChunkOptions, type HtmlLoaderOptions, type IngestDocumentOptions, type IngestableMemory, type LoadedDocument, type MarkdownLoaderOptions, type PdfLoaderOptions, type TypedContentSource, type UrlSource, chunkText, ingestDocument, loadHtml, loadMarkdown, loadPdf };