@eidentic/rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,392 @@
1
// Bundler-emitted `require` shim.
// - If a `require` binding exists when this statement runs, `__require` is that function.
// - Otherwise, if `Proxy` exists, `__require` is a Proxy around the fallback function below;
//   property reads are forwarded to `require` should it be defined by the time they happen.
// - The fallback function delegates to `require` when it is defined at call time and
//   otherwise throws 'Dynamic require of "<name>" is not supported'.
// NOTE(review): in every branch `typeof __require` evaluates to "function".
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
2
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
3
+ }) : x)(function(x) {
4
+ if (typeof require !== "undefined") return require.apply(this, arguments);
5
+ throw Error('Dynamic require of "' + x + '" is not supported');
6
+ });
7
+
8
+ // src/chunk.ts
9
/**
 * Split `text` into segments of at most `size` characters, cutting at the
 * last whitespace inside the window when there is one and hard-cutting the
 * word otherwise.
 *
 * @param {string} text - Text to split.
 * @param {number} size - Maximum segment length; fractional values are floored.
 * @returns {string[]} Non-empty segments with trailing whitespace removed.
 * @throws {RangeError} If `text` is non-empty and `size` is not a number >= 1
 *   (a non-positive size can never make progress and previously looped forever).
 */
function splitByWordBoundary(text, size) {
  if (text.length === 0) return [];
  if (!(size >= 1)) {
    throw new RangeError(`splitByWordBoundary: size must be a number >= 1 (received ${size})`);
  }
  // String indices must be integers; a fractional window would index `undefined`.
  const limit = Math.floor(size);
  if (text.length <= limit) return [text];

  const whitespace = /\s/;
  const segments = [];
  let pos = 0;
  while (pos < text.length) {
    if (pos + limit >= text.length) {
      // The remainder fits in one window.
      segments.push(text.slice(pos));
      break;
    }
    // Walk back from the window edge to the nearest whitespace character.
    let cut = pos + limit;
    while (cut > pos && !whitespace.test(text[cut])) cut--;
    if (cut === pos) {
      // No whitespace in the window: split the word at the size limit.
      cut = pos + limit;
    } else {
      // Consume the whitespace run so the next segment starts on a word.
      while (cut < text.length && whitespace.test(text[cut])) cut++;
    }
    segments.push(text.slice(pos, cut).trimEnd());
    pos = cut;
  }
  return segments.filter((segment) => segment.length > 0);
}
30
/**
 * Break `text` into paragraphs at blank lines (a newline, optional
 * whitespace, then another newline).
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed paragraphs; empty ones are dropped.
 */
function splitIntoParagraphs(text) {
  const paragraphs = [];
  for (const block of text.split(/\n\s*\n/)) {
    const trimmed = block.trim();
    if (trimmed.length > 0) {
      paragraphs.push(trimmed);
    }
  }
  return paragraphs;
}
33
/**
 * Break `text` into sentences: a split happens at whitespace that directly
 * follows `.`, `!` or `?`, so the terminator stays with its sentence.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed sentences; empty ones are dropped.
 */
function splitIntoSentences(text) {
  const sentences = [];
  for (const piece of text.split(/(?<=[.!?])\s+/)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) {
      sentences.push(trimmed);
    }
  }
  return sentences;
}
37
/**
 * Greedily pack natural segments (paragraphs or sentences) into chunks of at
 * most `size` characters and record where each chunk sits in `source`.
 *
 * @param {string[]} segments - Segments in document order.
 * @param {string} source - Text the segments were taken from; used to find offsets.
 * @param {number} size - Maximum packed length of a chunk, before any overlap prefix.
 * @param {number} overlap - Number of trailing characters of the previous chunk to
 *   prepend to each later chunk. Anything that is not a positive number disables
 *   the overlap.
 * @param {string} [separator="\n\n"] - Text placed between segments packed together.
 * @returns {{ text: string, start: number, end: number }[]} Chunks; `start`/`end`
 *   describe the chunk's own content and do not include the overlap prefix.
 */
function packSegments(segments, source, size, overlap, separator = "\n\n") {
  // Locate every segment in `source`, scanning forward so repeated text maps
  // to successive occurrences.
  const located = [];
  let searchFrom = 0;
  for (const text of segments) {
    const found = source.indexOf(text, searchFrom);
    // Not found: assume the segment directly follows the previous one.
    const start = found === -1 ? located.at(-1)?.end ?? 0 : found;
    const end = start + text.length;
    located.push({ start, end, text });
    searchFrom = end;
  }

  const chunks = [];
  let pending = [];
  let pendingLen = 0;
  const flush = () => {
    if (pending.length === 0) return;
    chunks.push({
      text: pending.map((segment) => segment.text).join(separator),
      start: pending[0].start,
      end: pending[pending.length - 1].end,
    });
    pending = [];
    pendingLen = 0;
  };

  for (const segment of located) {
    // Close the current chunk when this segment would push it past `size`.
    if (pendingLen > 0 && pendingLen + separator.length + segment.text.length > size) {
      flush();
    }
    if (segment.text.length > size) {
      // A single segment larger than `size` is split on word boundaries and
      // each piece becomes its own chunk.
      flush();
      let pieceOffset = segment.start;
      for (const piece of splitByWordBoundary(segment.text, size)) {
        const found = source.indexOf(piece, pieceOffset);
        const start = found === -1 ? pieceOffset : found;
        chunks.push({ text: piece, start, end: start + piece.length });
        pieceOffset = start + piece.length;
      }
      continue;
    }
    pending.push(segment);
    pendingLen += (pendingLen > 0 ? separator.length : 0) + segment.text.length;
  }
  flush();

  // `slice(-overlap)` only yields a tail for positive values: a negative
  // overlap would slice from the front and NaN would copy the whole previous
  // chunk, so treat everything that is not > 0 as "no overlap".
  if (!(overlap > 0)) return chunks;
  return chunks.map((chunk, i) => {
    if (i === 0) return chunk;
    const prefix = chunks[i - 1].text.slice(-overlap);
    return { text: prefix + chunk.text, start: chunk.start, end: chunk.end };
  });
}
91
/**
 * Split text into chunks for ingestion.
 *
 * Strategies:
 * - "fixed" (default): windows of up to `size` characters, cut at the last
 *   whitespace inside the window when there is one.
 * - "paragraph" / "sentence": natural segments packed together by `packSegments`.
 *
 * @param {string} text - Text to chunk; it is trimmed first.
 * @param {{ size?: number, overlap?: number, strategy?: string }} [opts]
 *   `size` defaults to 1000 and is floored to an integer. `overlap` defaults to
 *   150, is capped at `size - 1`, and is treated as 0 unless it is a positive number.
 * @returns {{ text: string, index: number, start: number, end: number }[]}
 *   Chunks in order. `start`/`end` are offsets into the trimmed text and do not
 *   cover the overlap prefix copied from the previous chunk.
 * @throws {RangeError} If the text is non-empty and `size` is not a number >= 1;
 *   such sizes cannot make progress (negative sizes previously never terminated).
 */
function chunkText(text, opts) {
  const normalized = text.trim();
  if (normalized.length === 0) return [];

  const requestedSize = opts?.size ?? 1e3;
  if (!(requestedSize >= 1)) {
    throw new RangeError(`chunkText: size must be a number >= 1 (received ${requestedSize})`);
  }
  // String offsets are integers; a fractional window would index `undefined`.
  const size = Math.floor(requestedSize);
  const requestedOverlap = opts?.overlap ?? 150;
  const overlap = requestedOverlap > 0 ? Math.min(Math.floor(requestedOverlap), size - 1) : 0;
  const strategy = opts?.strategy ?? "fixed";

  if (strategy === "paragraph" || strategy === "sentence") {
    const naturalSegments = strategy === "paragraph" ? splitIntoParagraphs(normalized) : splitIntoSentences(normalized);
    const separator = strategy === "sentence" ? " " : "\n\n";
    const packed = packSegments(naturalSegments, normalized, size, overlap, separator);
    return packed.map((c, i) => ({ text: c.text, index: i, start: c.start, end: c.end }));
  }

  const whitespace = /\s/;
  const chunks = [];
  let pos = 0;
  let index = 0;
  while (pos < normalized.length) {
    let end = pos + size;
    if (end >= normalized.length) {
      end = normalized.length;
    } else {
      // Back up to the nearest whitespace; keep the hard cut if the window has none.
      let cut = end;
      while (cut > pos && !whitespace.test(normalized[cut])) cut--;
      if (cut !== pos) {
        end = cut;
      }
    }
    const body = normalized.slice(pos, end).trim();
    if (body.length > 0) {
      let fullText = body;
      if (index > 0 && overlap > 0) {
        // Prefix with the tail of the previous chunk's text.
        fullText = chunks[chunks.length - 1].text.slice(-overlap) + body;
      }
      chunks.push({ text: fullText, index, start: pos, end });
      index++;
    }
    // Start the next window on the first non-whitespace character.
    let next = end;
    while (next < normalized.length && whitespace.test(normalized[next])) next++;
    if (next === pos) {
      break; // defensive: never spin without making progress
    }
    pos = next;
  }
  return chunks;
}
138
+
139
+ // src/ingest.ts
140
+ import { resilientFetch, assertFetchableUrl } from "@eidentic/tools";
141
+
142
+ // src/loaders.ts
143
+ import { createRequire } from "node:module";
144
+ import { parse as parseHtml } from "node-html-parser";
145
/**
 * Reduce Markdown to plain text with a fixed sequence of regex rewrites.
 * Code blocks (fenced and indented) are dropped, markup is stripped, and the
 * visible text of links, images and emphasis is kept.
 *
 * @param {string} content - Markdown source.
 * @param {{ source?: string }} [opts] - `source` is the label stored in the
 *   returned metadata (default "markdown").
 * @returns {{ text: string, metadata: { source: string } }}
 */
function loadMarkdown(content, opts) {
  const source = opts?.source ?? "markdown";
  let text = content;
  // Fenced code blocks.
  text = text.replace(/```[\s\S]*?```/g, "");
  // Indented code blocks: four spaces or a tab. A single leading space must
  // not qualify, otherwise nested list items and wrapped lines are deleted.
  text = text.replace(/^(?: {4}|\t).+$/gm, "");
  // Inline HTML tags.
  text = text.replace(/<[^>]+>/g, "");
  // ATX headings keep their text; setext underlines and horizontal rules go.
  text = text.replace(/^#{1,6}\s+(.+)$/gm, "$1");
  text = text.replace(/^[=-]{2,}\s*$/gm, "");
  text = text.replace(/^(?:[*_-][\s]*){3,}$/gm, "");
  // Images and links collapse to their alt / link text; reference
  // definitions are removed.
  text = text.replace(/!\[([^\]]*)\]\([^)]*\)/g, "$1");
  text = text.replace(/\[([^\]]+)\]\([^)]*\)/g, "$1");
  text = text.replace(/\[([^\]]+)\]\[[^\]]*\]/g, "$1");
  text = text.replace(/^\[[^\]]+\]:\s+\S+(?:\s+"[^"]*")?$/gm, "");
  // Emphasis. Underscores only count when they are not inside a word, so
  // identifiers such as user_id_field keep their underscores.
  text = text.replace(/\*{1,2}([^*\n]+)\*{1,2}/g, "$1");
  text = text.replace(/(?<![A-Za-z0-9])_{1,2}([^_\n]+)_{1,2}(?![A-Za-z0-9])/g, "$1");
  text = text.replace(/`([^`]+)`/g, "$1");
  text = text.replace(/~~([^~]+)~~/g, "$1");
  // Blockquote and list markers. Only horizontal whitespace is consumed so a
  // marker never swallows the blank line that separates paragraphs.
  text = text.replace(/^>[^\S\n]?/gm, "");
  text = text.replace(/^[^\S\n]*[-*+][^\S\n]+/gm, "");
  text = text.replace(/^[^\S\n]*\d+\.[^\S\n]+/gm, "");
  text = text.replace(/\n{3,}/g, "\n\n");
  text = text.trim();
  return { text, metadata: { source } };
}
169
// Tag names treated as block-level by loadHtml: the text extracted from such
// an element is wrapped in newlines.
const BLOCK_TAGS = new Set(
  [
    "p div section article aside main header footer nav",
    "h1 h2 h3 h4 h5 h6",
    "li dt dd ul ol",
    "blockquote pre figure figcaption",
    "br hr",
    "tr th td table",
  ].flatMap((group) => group.split(" "))
);
201
/**
 * Extract readable text from an HTML document.
 *
 * `script`, `style`, `noscript` and `head` elements are removed, block-level
 * elements (see BLOCK_TAGS) are separated by newlines, and whitespace is
 * normalised.
 *
 * @param {string} html - HTML source.
 * @param {{ source?: string }} [opts] - `source` is the label stored in the
 *   returned metadata (default "html").
 * @returns {{ text: string, metadata: { source: string } }}
 */
function loadHtml(html, opts) {
  const source = opts?.source ?? "html";
  const root = parseHtml(html, {
    lowerCaseTagName: true,
    comment: false,
    blockTextElements: {
      script: false,
      style: false,
      noscript: false,
      pre: true,
    },
  });
  for (const tag of ["script", "style", "noscript", "head"]) {
    for (const el of root.querySelectorAll(tag)) {
      el.remove();
    }
  }

  // Depth-first text extraction; nodeType 3 is a text node.
  const extractText = (node) => {
    if (node.nodeType === 3) {
      // Prefer the entity-decoded `text` over `rawText`: the raw form keeps
      // escapes such as `&amp;` / `&nbsp;` verbatim, which then end up in
      // the extracted text.
      return node.text ?? node.rawText ?? "";
    }
    const tag = node.tagName?.toLowerCase() ?? "";
    const childText = node.childNodes.map((child) => extractText(child)).join("");
    return BLOCK_TAGS.has(tag) ? `\n${childText}\n` : childText;
  };

  let text = extractText(root);
  text = text.replace(/[^\S\n]+/g, " "); // collapse runs of non-newline whitespace
  text = text.replace(/\n{3,}/g, "\n\n"); // at most one blank line in a row
  text = text.trim();
  return { text, metadata: { source } };
}
235
/**
 * Resolve the optional `pdf-parse` peer dependency and return its parse function.
 *
 * The module is loaded through `createRequire(import.meta.url)`. The bundler's
 * `__require` shim must not be used here: in an ES module without a global
 * `require` it is still a callable function, but calling it only throws
 * 'Dynamic require of "pdf-parse" is not supported', so the package could
 * never be loaded.
 *
 * @returns {Function} The function exported by `pdf-parse` (the module itself
 *   or its `default` export).
 * @throws {Error} With installation instructions when `pdf-parse` cannot be
 *   found or does not export a function; any other load error is rethrown as is.
 */
function loadPdfParseModule() {
  const installHint = "[eidentic/rag] loadPdf requires the `pdf-parse` package.\nInstall it in your project:\n npm install pdf-parse\n # or\n pnpm add pdf-parse";
  const req = createRequire(import.meta.url);
  let mod;
  try {
    mod = req("pdf-parse");
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    if (err?.code === "MODULE_NOT_FOUND" || msg.includes("Cannot find module")) {
      // Keep the resolver's error as `cause` for debugging.
      throw new Error(installHint, { cause: err });
    }
    throw err;
  }
  const fn = typeof mod === "function" ? mod : mod?.default;
  if (typeof fn !== "function") {
    // Something named `pdf-parse` resolved but it is not the expected package.
    throw new Error(installHint);
  }
  return fn;
}
254
/**
 * Extract text from a PDF buffer.
 *
 * @param {*} buf - PDF data handed to the parser unchanged.
 * @param {{ source?: string, _parser?: Function }} [opts] - `source` is the
 *   metadata label (default "pdf"); `_parser` replaces the `pdf-parse`
 *   function resolved by loadPdfParseModule.
 * @returns {Promise<{ text: string, metadata: { source: string, pages: * } }>}
 *   Whitespace-normalised text plus the parser's `numpages` value as `pages`.
 */
async function loadPdf(buf, opts) {
  const source = opts?.source ?? "pdf";
  const parsePdf = opts?._parser ?? loadPdfParseModule();
  const parsed = await parsePdf(buf);
  const text = parsed.text
    .replace(/[^\S\n]+/g, " ") // collapse runs of non-newline whitespace
    .replace(/\n{3,}/g, "\n\n") // at most one blank line in a row
    .trim();
  const metadata = { source, pages: parsed.numpages };
  return { text, metadata };
}
267
+
268
+ // src/ingest.ts
269
/**
 * Derive a document id from a URL: host name plus path, lower-cased, with
 * every run of characters outside `a-z0-9` replaced by a dash, edge dashes
 * removed, and the result cut to 64 characters.
 *
 * @param {string} url - URL to slugify.
 * @returns {string} The slug, or "doc" when the URL cannot be parsed or the
 *   slug comes out empty.
 */
function slugFromUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return "doc";
  }
  const hostAndPath = `${parsed.hostname}${parsed.pathname}`.toLowerCase();
  const dashed = hostAndPath.replace(/[^a-z0-9]+/g, "-");
  const slug = dashed.replace(/(^-|-$)/g, "").slice(0, 64);
  return slug === "" ? "doc" : slug;
}
278
/**
 * Build a document id of the form `doc-<hex>` from a 32-bit multiply-by-33
 * rolling hash (seed 5381) over at most the first 2048 UTF-16 code units of
 * `text`.
 *
 * @param {string} text - Text to hash.
 * @returns {string} `doc-` followed by the unsigned hash in hexadecimal.
 */
function hashSlug(text) {
  const limit = Math.min(text.length, 2048);
  let hash = 5381;
  for (let i = 0; i < limit; i += 1) {
    // hash * 33 + code unit, wrapped to an unsigned 32-bit integer.
    hash = (Math.imul(hash, 33) + text.charCodeAt(i)) >>> 0;
  }
  return `doc-${hash.toString(16)}`;
}
285
// Maximum number of redirect hops ingestDocument follows for a URL source; one more hop raises "too many redirects".
+ var MAX_REDIRECT_HOPS = 5;
286
/**
 * Canonicalise a URL for comparison: protocol, host, path without trailing
 * slashes, and query string. The fragment is left out.
 *
 * @param {string} url - URL to normalise.
 * @returns {string} The normalised form, or `url` unchanged when it cannot be parsed.
 */
function normalizeUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  const { protocol, host, pathname, search } = parsed;
  const path = pathname.replace(/\/+$/, "");
  return `${protocol}//${host}${path}${search}`;
}
294
/**
 * Chunk a document and hand the chunks to `opts.memory.ingest`.
 *
 * `source` is one of:
 * - a string: ingested as raw text (a URL-looking string only triggers a warning);
 * - an object with `type` ("markdown", "html", anything else is treated as
 *   PDF), `data`, and optional `source` / `_parser`: run through the matching loader;
 * - an object with `url`: fetched with redirects followed manually so that
 *   every hop is checked by `assertFetchableUrl` before it is requested.
 *
 * @param {string|object} source - See above.
 * @param {object} opts - Reads `memory` (its `ingest(events)` is awaited),
 *   `scope`, and optionally `docId`, `chunk` (passed to chunkText),
 *   `allowlist` and `fetchImpl`.
 * @returns {Promise<{ chunks: number }>} Number of chunks ingested.
 * @throws {Error} When a fetch implementation auto-follows redirects, a
 *   redirect is invalid or the hop limit is exceeded, or the final response
 *   is not OK.
 */
async function ingestDocument(source, opts) {
  const isRawString = typeof source === "string";
  const isTyped = !isRawString && "type" in source;

  let rawText;
  let docId;
  let extraMeta = {};

  if (isRawString) {
    const looksLikeUrl = source.startsWith("http://") || source.startsWith("https://");
    if (looksLikeUrl) {
      console.warn(
        `[ingestDocument] source looks like a URL but was passed as a plain string \u2014 it will be ingested as raw text, NOT fetched. If you meant to fetch this URL, use { url: "${source}" } instead.`
      );
    }
    rawText = source;
    docId = opts.docId ?? hashSlug(rawText);
  } else if (isTyped) {
    let loaded;
    switch (source.type) {
      case "markdown":
        loaded = loadMarkdown(source.data, { source: source.source });
        break;
      case "html":
        loaded = loadHtml(source.data, { source: source.source });
        break;
      default:
        loaded = await loadPdf(source.data, {
          source: source.source,
          _parser: source._parser,
        });
    }
    rawText = loaded.text;
    extraMeta = loaded.metadata;
    docId = opts.docId ?? hashSlug(rawText);
  } else {
    // Validate the first URL before any request is made.
    assertFetchableUrl(source.url, { allowlist: opts.allowlist });
    const doFetch = opts.fetchImpl ?? globalThis.fetch;

    // A non-redirect response whose URL differs from the one requested means
    // the fetch implementation followed a redirect on its own.
    const landedElsewhere = (res, requested) =>
      res.status < 300 &&
      typeof res.url === "string" &&
      res.url !== "" &&
      normalizeUrl(res.url) !== normalizeUrl(requested);

    let currentUrl = source.url;
    let hops = 0;
    let finalRes;
    for (;;) {
      const res = await resilientFetch(currentUrl, { redirect: "manual" }, { fetchImpl: doFetch });
      if (res.redirected === true || landedElsewhere(res, currentUrl)) {
        throw new Error(
          'ingestDocument: the supplied fetchImpl auto-followed a redirect instead of honouring redirect:"manual". Replace it with a fetch implementation that returns 3xx responses rather than following them, so every redirect hop can be re-validated by the SSRF guard.'
        );
      }

      const isRedirect = res.status >= 300 && res.status < 400;
      if (!isRedirect) {
        finalRes = res;
        break;
      }

      hops += 1;
      if (hops > MAX_REDIRECT_HOPS) {
        throw new Error(
          `ingestDocument: too many redirects (> ${MAX_REDIRECT_HOPS} hops) starting from ${source.url}`
        );
      }
      const location = res.headers.get("location");
      if (location === null) {
        throw new Error(`ingestDocument: redirect has no Location header (hop ${hops})`);
      }
      let nextUrl;
      try {
        // Relative targets resolve against the URL that issued the redirect.
        nextUrl = new URL(location, currentUrl);
      } catch {
        throw new Error(`ingestDocument: invalid redirect target at hop ${hops} (URL omitted for security)`);
      }
      // Re-validate the target before it is fetched.
      assertFetchableUrl(nextUrl.toString(), { allowlist: opts.allowlist });
      currentUrl = nextUrl.toString();
    }

    if (!finalRes.ok) {
      throw new Error(`ingestDocument: failed to fetch ${source.url} \u2014 HTTP ${finalRes.status}`);
    }
    rawText = await finalRes.text();
    docId = opts.docId ?? slugFromUrl(source.url);
  }

  const chunks = chunkText(rawText, opts.chunk);
  if (chunks.length === 0) {
    return { chunks: 0 };
  }

  // Citation label: doc id for raw text, loader label for typed sources,
  // the originally requested URL for fetched documents.
  let citationSource;
  if (isRawString) {
    citationSource = docId;
  } else if (isTyped) {
    citationSource = extraMeta.source ?? docId;
  } else {
    citationSource = source.url;
  }

  const events = chunks.map((chunk, i) => ({
    id: `${docId}:chunk:${i}`,
    scope: opts.scope,
    text: chunk.text,
    // Loader metadata (e.g. pages) is kept; `source` is set to the citation label.
    metadata: { ...extraMeta, source: citationSource },
  }));
  await opts.memory.ingest(events);
  return { chunks: events.length };
}
386
+ export {
387
+ chunkText,
388
+ ingestDocument,
389
+ loadHtml,
390
+ loadMarkdown,
391
+ loadPdf
392
+ };
package/package.json ADDED
@@ -0,0 +1,65 @@
1
+ {
2
+ "name": "@eidentic/rag",
3
+ "version": "0.1.0",
4
+ "type": "module",
5
+ "license": "Apache-2.0",
6
+ "publishConfig": {
7
+ "access": "public"
8
+ },
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "git+https://github.com/eidentic/eidentic.git",
12
+ "directory": "packages/rag"
13
+ },
14
+ "main": "./dist/index.cjs",
15
+ "module": "./dist/index.js",
16
+ "types": "./dist/index.d.ts",
17
+ "exports": {
18
+ ".": {
19
+ "types": "./dist/index.d.ts",
20
+ "import": "./dist/index.js",
21
+ "require": "./dist/index.cjs"
22
+ }
23
+ },
24
+ "sideEffects": false,
25
+ "files": [
26
+ "dist",
27
+ "LICENSE",
28
+ "README.md"
29
+ ],
30
+ "dependencies": {
31
+ "node-html-parser": "^7.1.0",
32
+ "@eidentic/tools": "0.1.0",
33
+ "@eidentic/types": "0.1.0"
34
+ },
35
+ "peerDependencies": {
36
+ "pdf-parse": "^1.1.1"
37
+ },
38
+ "peerDependenciesMeta": {
39
+ "pdf-parse": {
40
+ "optional": true
41
+ }
42
+ },
43
+ "description": "RAG pipeline utilities for Eidentic — chunk text, ingest documents (PDF / HTML / Markdown), and load content into agent memory.",
44
+ "keywords": [
45
+ "ai",
46
+ "agents",
47
+ "typescript",
48
+ "eidentic",
49
+ "rag",
50
+ "document-loading",
51
+ "chunking",
52
+ "memory"
53
+ ],
54
+ "homepage": "https://github.com/eidentic/eidentic#readme",
55
+ "bugs": {
56
+ "url": "https://github.com/eidentic/eidentic/issues"
57
+ },
58
+ "engines": {
59
+ "node": ">=22"
60
+ },
61
+ "scripts": {
62
+ "build": "tsup src/index.ts --format esm,cjs --dts --clean",
63
+ "typecheck": "tsc --noEmit"
64
+ }
65
+ }