unrag 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/cli/index.js +611 -174
  2. package/package.json +12 -6
  3. package/registry/config/unrag.config.ts +9 -8
  4. package/registry/connectors/google-drive/_api-types.ts +60 -0
  5. package/registry/connectors/google-drive/client.ts +99 -38
  6. package/registry/connectors/google-drive/sync.ts +97 -69
  7. package/registry/connectors/google-drive/types.ts +76 -37
  8. package/registry/connectors/notion/client.ts +12 -3
  9. package/registry/connectors/notion/render.ts +62 -23
  10. package/registry/connectors/notion/sync.ts +30 -23
  11. package/registry/core/assets.ts +11 -10
  12. package/registry/core/config.ts +10 -25
  13. package/registry/core/context-engine.ts +71 -2
  14. package/registry/core/deep-merge.ts +45 -0
  15. package/registry/core/ingest.ts +117 -44
  16. package/registry/core/types.ts +96 -2
  17. package/registry/docs/unrag.md +6 -1
  18. package/registry/embedding/_shared.ts +25 -0
  19. package/registry/embedding/ai.ts +8 -68
  20. package/registry/embedding/azure.ts +88 -0
  21. package/registry/embedding/bedrock.ts +88 -0
  22. package/registry/embedding/cohere.ts +88 -0
  23. package/registry/embedding/google.ts +102 -0
  24. package/registry/embedding/mistral.ts +71 -0
  25. package/registry/embedding/ollama.ts +90 -0
  26. package/registry/embedding/openai.ts +88 -0
  27. package/registry/embedding/openrouter.ts +127 -0
  28. package/registry/embedding/together.ts +77 -0
  29. package/registry/embedding/vertex.ts +111 -0
  30. package/registry/embedding/voyage.ts +169 -0
  31. package/registry/extractors/audio-transcribe/index.ts +39 -23
  32. package/registry/extractors/file-docx/index.ts +8 -1
  33. package/registry/extractors/file-pptx/index.ts +22 -1
  34. package/registry/extractors/file-xlsx/index.ts +24 -1
  35. package/registry/extractors/image-caption-llm/index.ts +8 -3
  36. package/registry/extractors/image-ocr/index.ts +9 -4
  37. package/registry/extractors/pdf-llm/index.ts +9 -4
  38. package/registry/extractors/pdf-text-layer/index.ts +23 -2
  39. package/registry/extractors/video-frames/index.ts +8 -3
  40. package/registry/extractors/video-transcribe/index.ts +40 -24
  41. package/registry/manifest.json +346 -0
  42. package/registry/store/drizzle-postgres-pgvector/store.ts +26 -6
package/package.json CHANGED
@@ -6,18 +6,24 @@
6
6
  "bin": {
7
7
  "unrag": "./dist/cli/index.js"
8
8
  },
9
- "version": "0.2.5",
9
+ "version": "0.2.7",
10
10
  "private": false,
11
11
  "license": "Apache-2.0",
12
12
  "devDependencies": {
13
+ "@ai-sdk/amazon-bedrock": "^3.0.72",
14
+ "@ai-sdk/cohere": "^3.0.1",
15
+ "@ai-sdk/google": "^3.0.1",
16
+ "@ai-sdk/openai": "^3.0.1",
17
+ "@openrouter/sdk": "^0.3.10",
18
+ "@prisma/client": "^6.0.0",
13
19
  "@types/bun": "latest",
14
20
  "@types/pg": "^8.16.0",
15
- "@prisma/client": "^6.0.0",
16
- "prisma": "^6.0.0",
17
- "drizzle-orm": "^0.45.1",
21
+ "ai": "^6.0.3",
18
22
  "drizzle-kit": "^0.31.8",
19
- "ai": "^5.0.113",
20
- "pg": "^8.16.3"
23
+ "drizzle-orm": "^0.45.1",
24
+ "pg": "^8.16.3",
25
+ "prisma": "^6.0.0",
26
+ "voyage-ai-provider": "^3.0.0"
21
27
  },
22
28
  "dependencies": {
23
29
  "@clack/prompts": "^0.11.0",
@@ -18,19 +18,18 @@
18
18
  export const unrag = defineUnragConfig({
19
19
  defaults: {
20
20
  chunking: {
21
- chunkSize: 200,
22
- chunkOverlap: 40,
21
+ chunkSize: 200, // __UNRAG_DEFAULT_chunkSize__
22
+ chunkOverlap: 40, // __UNRAG_DEFAULT_chunkOverlap__
23
23
  },
24
24
  retrieval: {
25
- topK: 8,
25
+ topK: 8, // __UNRAG_DEFAULT_topK__
26
26
  },
27
27
  },
28
28
  embedding: {
29
29
  provider: "ai",
30
30
  config: {
31
- type: "text", // __UNRAG_EMBEDDING_TYPE__
32
31
  model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
33
- timeoutMs: 15_000,
32
+ timeoutMs: 15_000, // __UNRAG_EMBEDDING_TIMEOUT__
34
33
  },
35
34
  },
36
35
  engine: {
@@ -41,8 +40,8 @@ export const unrag = defineUnragConfig({
41
40
  * - storeDocumentContent: whether the full original document text is stored in `documents.content`.
42
41
  */
43
42
  storage: {
44
- storeChunkContent: true,
45
- storeDocumentContent: true,
43
+ storeChunkContent: true, // __UNRAG_STORAGE_storeChunkContent__
44
+ storeDocumentContent: true, // __UNRAG_STORAGE_storeDocumentContent__
46
45
  },
47
46
  /**
48
47
  * Optional extractor modules that can process non-text assets into text outputs.
@@ -62,9 +61,10 @@ export const unrag = defineUnragConfig({
62
61
  *
63
62
  * Notes:
64
63
  * - This generated config is cost-safe by default (all extraction is off).
65
- * - `unrag init` can enable rich media + multimodal embeddings for you.
64
+ * - `unrag init --rich-media` can enable rich media ingestion for you (extractors + assetProcessing flags).
66
65
  * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
67
66
  */
67
+ // __UNRAG_ASSET_PROCESSING_BLOCK_START__
68
68
  assetProcessing: {
69
69
  onUnsupportedAsset: "skip",
70
70
  onError: "skip",
@@ -181,6 +181,7 @@ export const unrag = defineUnragConfig({
181
181
  },
182
182
  },
183
183
  },
184
+ // __UNRAG_ASSET_PROCESSING_BLOCK_END__
184
185
  },
185
186
  } as const);
186
187
 
@@ -0,0 +1,60 @@
1
+ /**
2
+ * Structural types for the Google Drive API.
3
+ *
4
+ * These are minimal interfaces that match the googleapis API structure,
5
+ * allowing the connector to work without depending on googleapis types at compile time.
6
+ */
7
+
8
+ export interface DriveFile {
9
+ id?: string | null;
10
+ name?: string | null;
11
+ mimeType?: string | null;
12
+ size?: string | null;
13
+ webViewLink?: string | null;
14
+ modifiedTime?: string | null;
15
+ parents?: string[] | null;
16
+ shortcutDetails?: {
17
+ targetId?: string | null;
18
+ targetMimeType?: string | null;
19
+ } | null;
20
+ }
21
+
22
+ export interface DriveFileList {
23
+ files?: DriveFile[];
24
+ nextPageToken?: string | null;
25
+ }
26
+
27
+ export interface DriveFilesResource {
28
+ get(params: {
29
+ fileId: string;
30
+ fields?: string;
31
+ alt?: string;
32
+ supportsAllDrives?: boolean;
33
+ }): Promise<{ data: DriveFile | ArrayBuffer | string }>;
34
+
35
+ list(params: {
36
+ q?: string;
37
+ fields?: string;
38
+ pageToken?: string;
39
+ pageSize?: number;
40
+ supportsAllDrives?: boolean;
41
+ includeItemsFromAllDrives?: boolean;
42
+ }): Promise<{ data: DriveFileList }>;
43
+
44
+ export(params: {
45
+ fileId: string;
46
+ mimeType: string;
47
+ }): Promise<{ data: ArrayBuffer | string }>;
48
+ }
49
+
50
+ export interface DriveClient {
51
+ files: DriveFilesResource;
52
+ }
53
+
54
+ /**
55
+ * Auth client interface - minimal subset used by the connector.
56
+ */
57
+ export interface AuthClient {
58
+ getAccessToken?(): Promise<{ token?: string | null }>;
59
+ }
60
+
@@ -1,4 +1,11 @@
1
- import type { GoogleDriveAuth } from "./types";
1
+ import type { DriveClient, AuthClient } from "./_api-types";
2
+ import type {
3
+ GoogleDriveAuth,
4
+ GoogleDriveOAuthAuth,
5
+ GoogleDriveServiceAccountAuth,
6
+ GoogleDriveGoogleAuthAuth,
7
+ ServiceAccountCredentials,
8
+ } from "./types";
2
9
 
3
10
  export const DEFAULT_DRIVE_SCOPES = [
4
11
  "https://www.googleapis.com/auth/drive.readonly",
@@ -17,36 +24,51 @@ type NormalizedAuth =
17
24
  }
18
25
  | {
19
26
  kind: "service_account";
20
- credentials: Record<string, any>;
27
+ credentials: ServiceAccountCredentials;
21
28
  subject?: string;
22
29
  }
23
30
  | { kind: "google_auth"; auth: unknown };
24
31
 
32
+ /**
33
+ * Type guard for service account auth.
34
+ */
35
+ function isServiceAccountAuth(auth: GoogleDriveAuth): auth is GoogleDriveServiceAccountAuth {
36
+ return auth.kind === "service_account";
37
+ }
38
+
39
+ /**
40
+ * Type guard for google auth.
41
+ */
42
+ function isGoogleAuth(auth: GoogleDriveAuth): auth is GoogleDriveGoogleAuthAuth {
43
+ return auth.kind === "google_auth";
44
+ }
45
+
46
+ /**
47
+ * Type guard for oauth.
48
+ */
49
+ function isOAuthAuth(auth: GoogleDriveAuth): auth is GoogleDriveOAuthAuth {
50
+ return auth.kind === "oauth";
51
+ }
52
+
25
53
  export function normalizeGoogleDriveAuth(auth: GoogleDriveAuth): NormalizedAuth {
26
54
  if (!auth || typeof auth !== "object") {
27
55
  throw new Error("Google Drive auth is required");
28
56
  }
29
57
 
30
- const kind = (auth as any).kind;
31
- if (kind !== "oauth" && kind !== "service_account" && kind !== "google_auth") {
32
- throw new Error(`Unknown Google Drive auth kind: ${String(kind)}`);
33
- }
34
-
35
- if (kind === "google_auth") {
36
- const a = (auth as any).auth;
37
- if (!a) throw new Error('Google Drive auth.kind="google_auth" requires auth');
38
- return { kind: "google_auth", auth: a };
58
+ if (isGoogleAuth(auth)) {
59
+ if (!auth.auth) throw new Error('Google Drive auth.kind="google_auth" requires auth');
60
+ return { kind: "google_auth", auth: auth.auth };
39
61
  }
40
62
 
41
- if (kind === "service_account") {
42
- const raw = (auth as any).credentialsJson;
63
+ if (isServiceAccountAuth(auth)) {
64
+ const raw = auth.credentialsJson;
43
65
  if (!raw) {
44
66
  throw new Error(
45
67
  'Google Drive auth.kind="service_account" requires credentialsJson'
46
68
  );
47
69
  }
48
- const credentials =
49
- typeof raw === "string" ? (JSON.parse(raw) as Record<string, any>) : (raw as any);
70
+ const credentials: ServiceAccountCredentials =
71
+ typeof raw === "string" ? (JSON.parse(raw) as ServiceAccountCredentials) : raw;
50
72
  if (!credentials?.client_email || !credentials?.private_key) {
51
73
  throw new Error(
52
74
  'Google Drive service account credentials must include "client_email" and "private_key".'
@@ -55,29 +77,33 @@ export function normalizeGoogleDriveAuth(auth: GoogleDriveAuth): NormalizedAuth
55
77
  return {
56
78
  kind: "service_account",
57
79
  credentials,
58
- subject: (auth as any).subject ? String((auth as any).subject) : undefined,
80
+ subject: auth.subject ? String(auth.subject) : undefined,
59
81
  };
60
82
  }
61
83
 
62
- // oauth
63
- if ((auth as any).oauthClient) {
64
- return { kind: "oauth_client", oauthClient: (auth as any).oauthClient };
65
- }
84
+ if (isOAuthAuth(auth)) {
85
+ // oauth
86
+ if (auth.oauthClient) {
87
+ return { kind: "oauth_client", oauthClient: auth.oauthClient };
88
+ }
66
89
 
67
- const { clientId, clientSecret, redirectUri, refreshToken, accessToken } = auth as any;
68
- if (!clientId || !clientSecret || !redirectUri || !refreshToken) {
69
- throw new Error(
70
- 'Google Drive auth.kind="oauth" requires either oauthClient or { clientId, clientSecret, redirectUri, refreshToken }'
71
- );
90
+ const { clientId, clientSecret, redirectUri, refreshToken, accessToken } = auth;
91
+ if (!clientId || !clientSecret || !redirectUri || !refreshToken) {
92
+ throw new Error(
93
+ 'Google Drive auth.kind="oauth" requires either oauthClient or { clientId, clientSecret, redirectUri, refreshToken }'
94
+ );
95
+ }
96
+ return {
97
+ kind: "oauth_config",
98
+ clientId: String(clientId),
99
+ clientSecret: String(clientSecret),
100
+ redirectUri: String(redirectUri),
101
+ refreshToken: String(refreshToken),
102
+ ...(accessToken ? { accessToken: String(accessToken) } : {}),
103
+ };
72
104
  }
73
- return {
74
- kind: "oauth_config",
75
- clientId: String(clientId),
76
- clientSecret: String(clientSecret),
77
- redirectUri: String(redirectUri),
78
- refreshToken: String(refreshToken),
79
- ...(accessToken ? { accessToken: String(accessToken) } : {}),
80
- };
105
+
106
+ throw new Error(`Unknown Google Drive auth kind: ${String((auth as Record<string, unknown>).kind)}`);
81
107
  }
82
108
 
83
109
  const asMessage = (err: unknown) => {
@@ -89,6 +115,41 @@ const asMessage = (err: unknown) => {
89
115
  }
90
116
  };
91
117
 
118
+ /**
119
+ * Google Auth Library module shape for dynamic import.
120
+ */
121
+ interface GoogleAuthLibraryModule {
122
+ OAuth2Client?: new (
123
+ clientId: string,
124
+ clientSecret: string,
125
+ redirectUri: string
126
+ ) => {
127
+ setCredentials(credentials: Record<string, string>): void;
128
+ };
129
+ OAuth2?: new (
130
+ clientId: string,
131
+ clientSecret: string,
132
+ redirectUri: string
133
+ ) => {
134
+ setCredentials(credentials: Record<string, string>): void;
135
+ };
136
+ JWT?: new (options: {
137
+ email: string;
138
+ key: string;
139
+ scopes: string[];
140
+ subject?: string;
141
+ }) => unknown;
142
+ }
143
+
144
+ /**
145
+ * Googleapis module shape for dynamic import.
146
+ */
147
+ interface GoogleApisModule {
148
+ google: {
149
+ drive(options: { version: string; auth: unknown }): DriveClient;
150
+ };
151
+ }
152
+
92
153
  /**
93
154
  * Creates a Google Drive API client from a plug-and-play auth input.
94
155
  *
@@ -98,11 +159,11 @@ const asMessage = (err: unknown) => {
98
159
  export async function createGoogleDriveClient(args: {
99
160
  auth: GoogleDriveAuth;
100
161
  scopes?: string[];
101
- }): Promise<{ drive: any; authClient: any }> {
162
+ }): Promise<{ drive: DriveClient; authClient: AuthClient }> {
102
163
  const normalized = normalizeGoogleDriveAuth(args.auth);
103
164
  const scopes = (args.scopes?.length ? args.scopes : DEFAULT_DRIVE_SCOPES) as string[];
104
165
 
105
- let authClient: any;
166
+ let authClient: unknown;
106
167
 
107
168
  try {
108
169
  if (normalized.kind === "oauth_client") {
@@ -111,7 +172,7 @@ export async function createGoogleDriveClient(args: {
111
172
  authClient = normalized.auth;
112
173
  } else {
113
174
  // google-auth-library (dynamic)
114
- const gal: any = await import("google-auth-library");
175
+ const gal = (await import("google-auth-library")) as GoogleAuthLibraryModule;
115
176
 
116
177
  if (normalized.kind === "oauth_config") {
117
178
  const OAuth2Client = gal.OAuth2Client ?? gal.OAuth2;
@@ -143,7 +204,7 @@ export async function createGoogleDriveClient(args: {
143
204
  }
144
205
  }
145
206
 
146
- const { google }: any = await import("googleapis");
207
+ const { google } = (await import("googleapis")) as GoogleApisModule;
147
208
  if (!google?.drive) {
148
209
  throw new Error("googleapis.google.drive not found");
149
210
  }
@@ -153,7 +214,7 @@ export async function createGoogleDriveClient(args: {
153
214
  auth: authClient,
154
215
  });
155
216
 
156
- return { drive, authClient };
217
+ return { drive, authClient: authClient as AuthClient };
157
218
  } catch (err) {
158
219
  const msg = asMessage(err);
159
220
  if (