unrag 0.2.1 → 0.2.3

Files changed (31)
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +251 -42
  3. package/package.json +2 -1
  4. package/registry/config/unrag.config.ts +140 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/registry/connectors/notion/sync.ts

@@ -1,8 +1,12 @@
- import type { ContextEngine } from "../../core";
- import type { IngestResult } from "../../core/types";
+ import type { IngestResult } from "../../core";
  import { createNotionClient, type NotionClient } from "./client";
  import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
- import { renderNotionBlocksToText, type NotionBlock, type NotionBlockNode } from "./render";
+ import {
+   extractNotionAssets,
+   renderNotionBlocksToText,
+   type NotionBlock,
+   type NotionBlockNode,
+ } from "./render";
  import type {
    BuildNotionPageIngestInputArgs,
    NotionPageDocument,

@@ -29,6 +33,7 @@ export function buildNotionPageIngestInput(
    sourceId,
    content: args.content,
    metadata: args.metadata ?? {},
+   assets: args.assets ?? [],
  };
}

@@ -108,6 +113,7 @@ export async function loadNotionPageDocument(args: {
  const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
  const body = renderNotionBlocksToText(tree);
  const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
+ const assets = extractNotionAssets(tree);

  const metadata = {
    connector: "notion",

@@ -121,6 +127,7 @@ export async function loadNotionPageDocument(args: {
  const ingest = buildNotionPageIngestInput({
    pageId,
    content,
+   assets,
    metadata: metadata as any,
    sourceIdPrefix: args.sourceIdPrefix,
  });

@@ -129,6 +136,7 @@ export async function loadNotionPageDocument(args: {
    sourceId: ingest.sourceId,
    content: ingest.content,
    metadata: ingest.metadata ?? {},
+   assets: ingest.assets ?? [],
  };
}

@@ -178,6 +186,7 @@ export async function syncNotionPages(
  const result: IngestResult = await input.engine.ingest({
    sourceId: doc.sourceId,
    content: doc.content,
+   assets: doc.assets,
    metadata: doc.metadata as any,
  });
 
package/registry/connectors/notion/types.ts

@@ -1,5 +1,5 @@
  import type { ContextEngine } from "../../core";
- import type { IngestInput } from "../../core/types";
+ import type { AssetInput, IngestInput } from "../../core";

  export type NotionSyncProgressEvent =
    | { type: "page:start"; pageId: string; sourceId: string }

@@ -42,11 +42,13 @@ export type NotionPageDocument = {
    sourceId: string;
    content: string;
    metadata: Record<string, unknown>;
+   assets: AssetInput[];
  };

  export type BuildNotionPageIngestInputArgs = {
    pageId: string; // normalized 32-hex (no dashes)
    content: string;
+   assets?: AssetInput[];
    metadata?: Record<string, unknown>;
    sourceIdPrefix?: string;
  };
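
The new `assets` fields flow end to end: `loadNotionPageDocument` now returns extracted assets next to the page text, and `syncNotionPages` forwards them to `engine.ingest`. A minimal sketch of the same flow done by hand, assuming `engine` is a configured ContextEngine; the import paths, the `createNotionClient` options, and the `pageId` argument name are not fully visible in this diff and are assumptions:

    import { createNotionClient } from "./registry/connectors/notion/client";
    import { loadNotionPageDocument } from "./registry/connectors/notion/sync";

    // `engine`: a configured ContextEngine instance.
    const notion = createNotionClient({ auth: process.env.NOTION_TOKEN! }); // "auth" option name assumed

    const doc = await loadNotionPageDocument({
      notion,
      pageId: "0123456789abcdef0123456789abcdef", // "pageId" argument name assumed
      maxDepth: 4,
      sourceIdPrefix: "notion",
    });

    // New in this release range: the document carries extracted assets alongside its text.
    await engine.ingest({
      sourceId: doc.sourceId,
      content: doc.content,
      assets: doc.assets,
      metadata: doc.metadata,
    });
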
package/registry/core/assets.ts (new file)

@@ -0,0 +1,54 @@
+ import type { AssetKind, Chunk } from "./types";
+
+ export type ChunkAssetRef = {
+   assetId: string;
+   assetKind: AssetKind;
+   assetUri?: string;
+   assetMediaType?: string;
+   extractor?: string;
+ };
+
+ const assetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]);
+
+ /**
+  * Convenience helper to extract an asset reference from a retrieved chunk.
+  *
+  * Asset chunks are represented as standard text chunks whose `metadata` contains:
+  * - `assetKind`: "image" | "pdf" | "audio" | "video" | "file"
+  * - `assetId`: stable identifier emitted by the connector/ingester
+  * - optional `assetUri`, `assetMediaType`, and `extractor`
+  */
+ export function getChunkAssetRef(
+   chunk: Pick<Chunk, "metadata">
+ ): ChunkAssetRef | null {
+   const meta = chunk.metadata as any;
+   const kind = meta?.assetKind;
+   const id = meta?.assetId;
+
+   if (typeof kind !== "string" || !assetKinds.has(kind as AssetKind)) {
+     return null;
+   }
+   if (typeof id !== "string" || !id) {
+     return null;
+   }
+
+   const assetUri = typeof meta?.assetUri === "string" ? meta.assetUri : undefined;
+   const assetMediaType =
+     typeof meta?.assetMediaType === "string" ? meta.assetMediaType : undefined;
+   const extractor =
+     typeof meta?.extractor === "string" ? meta.extractor : undefined;
+
+   return {
+     assetId: id,
+     assetKind: kind as AssetKind,
+     ...(assetUri ? { assetUri } : {}),
+     ...(assetMediaType ? { assetMediaType } : {}),
+     ...(extractor ? { extractor } : {}),
+   };
+ }
+
+ export function isAssetChunk(chunk: Pick<Chunk, "metadata">): boolean {
+   return getChunkAssetRef(chunk) !== null;
+ }
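
Downstream, a retrieval consumer can use these helpers to tell asset-backed chunks apart from plain text chunks. A hedged sketch: `getChunkAssetRef` and `isAssetChunk` come from this new module, but the `RetrieveInput` fields (`query`, `topK`), the `chunks` array on the result, and the `content` field on a chunk are assumptions not shown in this diff:

    import { getChunkAssetRef } from "./registry/core";

    const result = await engine.retrieve({ query: "quarterly revenue chart", topK: 8 });

    for (const chunk of result.chunks) { // "chunks" field name assumed
      const ref = getChunkAssetRef(chunk);
      if (ref) {
        // Asset-backed chunk: link or render the original asset instead of raw text.
        console.log(`[${ref.assetKind}] ${ref.assetId}`, ref.assetUri ?? "(no uri)");
      } else {
        console.log("text chunk:", chunk.content); // "content" field name assumed
      }
    }
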
package/registry/core/config.ts

@@ -2,6 +2,9 @@ import type {
    Chunker,
    ContextEngineConfig,
    ResolvedContextEngineConfig,
+   AssetProcessingConfig,
+   DeepPartial,
+   ContentStorageConfig,
  } from "./types";
  import { defaultChunker, resolveChunkingOptions } from "./chunking";

@@ -10,6 +13,150 @@ export const defineConfig = (config: ContextEngineConfig): ContextEngineConfig =>

  const defaultIdGenerator = () => crypto.randomUUID();

+ const DEFAULT_PDF_LLM_MODEL = "google/gemini-2.0-flash";
+ const DEFAULT_IMAGE_OCR_MODEL = "google/gemini-2.0-flash";
+ const DEFAULT_IMAGE_CAPTION_MODEL = "google/gemini-2.0-flash";
+ const DEFAULT_AUDIO_TRANSCRIBE_MODEL = "openai/whisper-1";
+ const DEFAULT_VIDEO_TRANSCRIBE_MODEL = "openai/whisper-1";
+
+ export const defaultAssetProcessingConfig: AssetProcessingConfig = {
+   onUnsupportedAsset: "skip",
+   onError: "skip",
+   concurrency: 4,
+   hooks: {
+     onEvent: undefined,
+   },
+   fetch: {
+     enabled: true,
+     allowedHosts: undefined,
+     maxBytes: 15 * 1024 * 1024, // 15MB
+     timeoutMs: 20_000,
+     headers: undefined,
+   },
+   pdf: {
+     textLayer: {
+       enabled: false,
+       maxBytes: 15 * 1024 * 1024, // 15MB
+       maxOutputChars: 200_000,
+       minChars: 200,
+       maxPages: undefined,
+     },
+     llmExtraction: {
+       enabled: false, // library default (cost-safe)
+       model: DEFAULT_PDF_LLM_MODEL,
+       prompt:
+         "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
+       timeoutMs: 60_000,
+       maxBytes: 15 * 1024 * 1024, // 15MB
+       maxOutputChars: 200_000,
+     },
+     ocr: {
+       enabled: false,
+       maxBytes: 15 * 1024 * 1024, // 15MB
+       maxOutputChars: 200_000,
+       minChars: 200,
+       maxPages: undefined,
+       pdftoppmPath: undefined,
+       tesseractPath: undefined,
+       dpi: 200,
+       lang: "eng",
+     },
+   },
+   image: {
+     ocr: {
+       enabled: false,
+       model: DEFAULT_IMAGE_OCR_MODEL,
+       prompt:
+         "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
+       timeoutMs: 60_000,
+       maxBytes: 10 * 1024 * 1024, // 10MB
+       maxOutputChars: 50_000,
+     },
+     captionLlm: {
+       enabled: false,
+       model: DEFAULT_IMAGE_CAPTION_MODEL,
+       prompt:
+         "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
+       timeoutMs: 60_000,
+       maxBytes: 10 * 1024 * 1024, // 10MB
+       maxOutputChars: 10_000,
+     },
+   },
+   audio: {
+     transcription: {
+       enabled: false,
+       model: DEFAULT_AUDIO_TRANSCRIBE_MODEL,
+       timeoutMs: 120_000,
+       maxBytes: 25 * 1024 * 1024, // 25MB
+     },
+   },
+   video: {
+     transcription: {
+       enabled: false,
+       model: DEFAULT_VIDEO_TRANSCRIBE_MODEL,
+       timeoutMs: 120_000,
+       maxBytes: 50 * 1024 * 1024, // 50MB
+     },
+     frames: {
+       enabled: false,
+       sampleFps: 0.2,
+       maxFrames: 50,
+       ffmpegPath: undefined,
+       maxBytes: 50 * 1024 * 1024, // 50MB
+       model: "google/gemini-2.0-flash",
+       prompt:
+         "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
+       timeoutMs: 60_000,
+       maxOutputChars: 50_000,
+     },
+   },
+   file: {
+     text: { enabled: false, maxBytes: 5 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+     docx: { enabled: false, maxBytes: 15 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+     pptx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+     xlsx: { enabled: false, maxBytes: 30 * 1024 * 1024, maxOutputChars: 200_000, minChars: 50 },
+   },
+ };
+
+ export const defaultContentStorageConfig: ContentStorageConfig = {
+   storeChunkContent: true,
+   storeDocumentContent: true,
+ };
+
+ const mergeDeep = <T extends Record<string, any>>(
+   base: T,
+   overrides: DeepPartial<T> | undefined
+ ): T => {
+   if (!overrides) return base;
+   const out: any = Array.isArray(base) ? [...base] : { ...base };
+   for (const key of Object.keys(overrides) as Array<keyof T>) {
+     const nextVal = overrides[key];
+     if (nextVal === undefined) continue;
+     const baseVal = base[key];
+     if (
+       baseVal &&
+       typeof baseVal === "object" &&
+       !Array.isArray(baseVal) &&
+       nextVal &&
+       typeof nextVal === "object" &&
+       !Array.isArray(nextVal)
+     ) {
+       out[key] = mergeDeep(baseVal, nextVal as any);
+     } else {
+       out[key] = nextVal as any;
+     }
+   }
+   return out as T;
+ };
+
+ export const resolveAssetProcessingConfig = (
+   overrides?: DeepPartial<AssetProcessingConfig>
+ ): AssetProcessingConfig => mergeDeep(defaultAssetProcessingConfig, overrides);
+
+ export const resolveContentStorageConfig = (
+   overrides?: DeepPartial<ContentStorageConfig>
+ ): ContentStorageConfig => mergeDeep(defaultContentStorageConfig, overrides);
+
  export const resolveConfig = (
    config: ContextEngineConfig
  ): ResolvedContextEngineConfig => {

@@ -21,6 +168,9 @@ export const resolveConfig = (
    defaults: resolveChunkingOptions(config.defaults),
    chunker,
    idGenerator: config.idGenerator ?? defaultIdGenerator,
+   extractors: config.extractors ?? [],
+   storage: resolveContentStorageConfig(config.storage),
+   assetProcessing: resolveAssetProcessingConfig(config.assetProcessing),
  };
};
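
Every capability is disabled by default (cost-safe); callers opt in per capability, and `resolveAssetProcessingConfig` deep-merges a partial override into `defaultAssetProcessingConfig`. A sketch of such an override, assuming the import path shown and that `allowedHosts` is a host allowlist (its type is not visible here):

    import { resolveAssetProcessingConfig } from "./registry/core";

    const assetProcessing = resolveAssetProcessingConfig({
      concurrency: 2,
      pdf: {
        textLayer: { enabled: true },       // local text-layer extraction
        llmExtraction: { enabled: true },   // keeps the default model and prompt
      },
      image: { captionLlm: { enabled: true } },
      fetch: {
        maxBytes: 30 * 1024 * 1024,         // raise the 15MB download cap
        allowedHosts: ["files.example.com"], // assumed: host allowlist
      },
    });

The same partial shape can also be placed on the engine config directly, since `resolveConfig` now runs it through `resolveAssetProcessingConfig(config.assetProcessing)`.
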
 
package/registry/core/context-engine.ts

@@ -1,15 +1,21 @@
  import { deleteDocuments } from "./delete";
- import { ingest } from "./ingest";
+ import { ingest, planIngest } from "./ingest";
  import { retrieve } from "./retrieve";
  import { defineConfig, resolveConfig } from "./config";
+ import { createAiEmbeddingProvider } from "../embedding/ai";
  import type {
+   AssetExtractor,
    ContextEngineConfig,
    DeleteInput,
+   DefineUnragConfigInput,
+   EmbeddingProvider,
    IngestInput,
    IngestResult,
+   IngestPlanResult,
    ResolvedContextEngineConfig,
    RetrieveInput,
    RetrieveResult,
+   UnragCreateEngineRuntime,
  } from "./types";

  export class ContextEngine {

@@ -23,6 +29,16 @@ export class ContextEngine {
    return ingest(this.config, input);
  }

+ /**
+  * Dry-run for ingestion. Returns which assets would be processed and by which extractors,
+  * without calling external services.
+  *
+  * Note: chunk counts/embeddings are not produced in dry-run.
+  */
+ async planIngest(input: IngestInput): Promise<IngestPlanResult> {
+   return planIngest(this.config, input);
+ }
+
  async retrieve(input: RetrieveInput): Promise<RetrieveResult> {
    return retrieve(this.config, input);
  }
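
A hedged sketch of using the dry-run before paying for extraction. The shape of `IngestPlanResult` is not shown in this diff, so the inspected output is only logged; `doc` stands for any connector document (for example the Notion document above):

    const input = {
      sourceId: doc.sourceId,
      content: doc.content,
      assets: doc.assets,
      metadata: doc.metadata,
    };

    // No external calls: reports which assets would be processed and by which extractors.
    const plan = await engine.planIngest(input);
    console.log(plan); // fields of IngestPlanResult are not visible in this diff

    // Only ingest for real once the plan looks right.
    await engine.ingest(input);
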
@@ -37,4 +53,56 @@ export const createContextEngine = (config: ContextEngineConfig) =>

  export { defineConfig };

+ /**
+  * Ergonomic, higher-level config wrapper.
+  *
+  * This helps keep `unrag.config.ts` as a single source of truth while still
+  * allowing runtime wiring (DB client/store, optional extractors).
+  */
+ export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) => {
+   let embeddingProvider: EmbeddingProvider | undefined;
+
+   const getEmbeddingProvider = () => {
+     if (embeddingProvider) return embeddingProvider;
+
+     if (config.embedding.provider === "ai") {
+       embeddingProvider = createAiEmbeddingProvider(config.embedding.config);
+       return embeddingProvider;
+     }
+
+     embeddingProvider = config.embedding.create();
+     return embeddingProvider;
+   };
+
+   const defaults = {
+     chunking: config.defaults?.chunking ?? {},
+     retrieval: {
+       topK: config.defaults?.retrieval?.topK ?? 8,
+     },
+   } as const;
+
+   const createEngineConfig = (runtime: UnragCreateEngineRuntime): ContextEngineConfig => {
+     const baseExtractors = (config.engine?.extractors ?? []) as AssetExtractor[];
+     const extractors =
+       typeof runtime.extractors === "function"
+         ? runtime.extractors(baseExtractors)
+         : runtime.extractors ?? baseExtractors;
+
+     return defineConfig({
+       ...(config.engine ?? {}),
+       defaults: defaults.chunking,
+       embedding: getEmbeddingProvider(),
+       store: runtime.store,
+       extractors,
+     });
+   };
+
+   return {
+     defaults,
+     createEngineConfig,
+     createEngine: (runtime: UnragCreateEngineRuntime) =>
+       new ContextEngine(createEngineConfig(runtime)),
+   };
+ };
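
A hedged `unrag.config.ts` sketch of the wrapper. Only the overall `embedding` / `defaults` / `engine` shape and the `createEngine({ store, extractors })` runtime call are taken from this diff; the fields inside the "ai" embedding config, the chunking option names, and the store factory are assumptions:

    import { defineUnragConfig } from "./registry/core";

    export const unrag = defineUnragConfig({
      embedding: {
        provider: "ai",
        config: { model: "text-embedding-3-small" }, // field name assumed
      },
      defaults: {
        chunking: {},                 // chunking option names are not shown in this diff
        retrieval: { topK: 8 },
      },
      engine: {
        // assetProcessing, storage, extractors, ... can live here as well
      },
    });

    // Later, at the runtime boundary where a DB client exists:
    // const engine = unrag.createEngine({
    //   store: createPgVectorStore(db),      // hypothetical store factory
    //   extractors: (base) => [...base],     // or a plain AssetExtractor[]
    // });
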
 
package/registry/core/index.ts

@@ -1,8 +1,21 @@
- export { ContextEngine, createContextEngine, defineConfig } from "./context-engine";
+ export {
+   ContextEngine,
+   createContextEngine,
+   defineConfig,
+   defineUnragConfig,
+ } from "./context-engine";
  export { deleteDocuments } from "./delete";
- export { ingest } from "./ingest";
+ export { ingest, planIngest } from "./ingest";
  export { retrieve } from "./retrieve";
  export { defaultChunker, resolveChunkingOptions } from "./chunking";
+ export {
+   defaultAssetProcessingConfig,
+   defaultContentStorageConfig,
+   resolveAssetProcessingConfig,
+   resolveContentStorageConfig,
+ } from "./config";
+ export { getChunkAssetRef, isAssetChunk } from "./assets";
+ export type { ChunkAssetRef } from "./assets";
  export * from "./types";