@realtimex/folio 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/.env.example +20 -0
  2. package/README.md +63 -0
  3. package/api/server.ts +130 -0
  4. package/api/src/config/index.ts +96 -0
  5. package/api/src/middleware/auth.ts +128 -0
  6. package/api/src/middleware/errorHandler.ts +88 -0
  7. package/api/src/middleware/index.ts +4 -0
  8. package/api/src/middleware/rateLimit.ts +71 -0
  9. package/api/src/middleware/validation.ts +58 -0
  10. package/api/src/routes/accounts.ts +142 -0
  11. package/api/src/routes/baseline-config.ts +124 -0
  12. package/api/src/routes/chat.ts +154 -0
  13. package/api/src/routes/health.ts +61 -0
  14. package/api/src/routes/index.ts +35 -0
  15. package/api/src/routes/ingestions.ts +275 -0
  16. package/api/src/routes/migrate.ts +112 -0
  17. package/api/src/routes/policies.ts +121 -0
  18. package/api/src/routes/processing.ts +90 -0
  19. package/api/src/routes/rules.ts +11 -0
  20. package/api/src/routes/sdk.ts +100 -0
  21. package/api/src/routes/settings.ts +80 -0
  22. package/api/src/routes/setup.ts +389 -0
  23. package/api/src/routes/stats.ts +81 -0
  24. package/api/src/routes/tts.ts +190 -0
  25. package/api/src/services/BaselineConfigService.ts +208 -0
  26. package/api/src/services/ChatService.ts +204 -0
  27. package/api/src/services/GoogleDriveService.ts +331 -0
  28. package/api/src/services/GoogleSheetsService.ts +1107 -0
  29. package/api/src/services/IngestionService.ts +1187 -0
  30. package/api/src/services/ModelCapabilityService.ts +248 -0
  31. package/api/src/services/PolicyEngine.ts +1625 -0
  32. package/api/src/services/PolicyLearningService.ts +527 -0
  33. package/api/src/services/PolicyLoader.ts +249 -0
  34. package/api/src/services/RAGService.ts +391 -0
  35. package/api/src/services/SDKService.ts +249 -0
  36. package/api/src/services/supabase.ts +113 -0
  37. package/api/src/utils/Actuator.ts +284 -0
  38. package/api/src/utils/actions/ActionHandler.ts +34 -0
  39. package/api/src/utils/actions/AppendToGSheetAction.ts +260 -0
  40. package/api/src/utils/actions/AutoRenameAction.ts +58 -0
  41. package/api/src/utils/actions/CopyAction.ts +120 -0
  42. package/api/src/utils/actions/CopyToGDriveAction.ts +64 -0
  43. package/api/src/utils/actions/LogCsvAction.ts +48 -0
  44. package/api/src/utils/actions/NotifyAction.ts +39 -0
  45. package/api/src/utils/actions/RenameAction.ts +57 -0
  46. package/api/src/utils/actions/WebhookAction.ts +58 -0
  47. package/api/src/utils/actions/utils.ts +293 -0
  48. package/api/src/utils/llmResponse.ts +61 -0
  49. package/api/src/utils/logger.ts +67 -0
  50. package/bin/folio-deploy.js +12 -0
  51. package/bin/folio-setup.js +45 -0
  52. package/bin/folio.js +65 -0
  53. package/dist/api/server.js +106 -0
  54. package/dist/api/src/config/index.js +81 -0
  55. package/dist/api/src/middleware/auth.js +93 -0
  56. package/dist/api/src/middleware/errorHandler.js +73 -0
  57. package/dist/api/src/middleware/index.js +4 -0
  58. package/dist/api/src/middleware/rateLimit.js +43 -0
  59. package/dist/api/src/middleware/validation.js +54 -0
  60. package/dist/api/src/routes/accounts.js +110 -0
  61. package/dist/api/src/routes/baseline-config.js +91 -0
  62. package/dist/api/src/routes/chat.js +114 -0
  63. package/dist/api/src/routes/health.js +52 -0
  64. package/dist/api/src/routes/index.js +31 -0
  65. package/dist/api/src/routes/ingestions.js +207 -0
  66. package/dist/api/src/routes/migrate.js +91 -0
  67. package/dist/api/src/routes/policies.js +86 -0
  68. package/dist/api/src/routes/processing.js +75 -0
  69. package/dist/api/src/routes/rules.js +8 -0
  70. package/dist/api/src/routes/sdk.js +80 -0
  71. package/dist/api/src/routes/settings.js +68 -0
  72. package/dist/api/src/routes/setup.js +315 -0
  73. package/dist/api/src/routes/stats.js +62 -0
  74. package/dist/api/src/routes/tts.js +178 -0
  75. package/dist/api/src/services/BaselineConfigService.js +168 -0
  76. package/dist/api/src/services/ChatService.js +166 -0
  77. package/dist/api/src/services/GoogleDriveService.js +280 -0
  78. package/dist/api/src/services/GoogleSheetsService.js +795 -0
  79. package/dist/api/src/services/IngestionService.js +990 -0
  80. package/dist/api/src/services/ModelCapabilityService.js +179 -0
  81. package/dist/api/src/services/PolicyEngine.js +1353 -0
  82. package/dist/api/src/services/PolicyLearningService.js +397 -0
  83. package/dist/api/src/services/PolicyLoader.js +159 -0
  84. package/dist/api/src/services/RAGService.js +295 -0
  85. package/dist/api/src/services/SDKService.js +212 -0
  86. package/dist/api/src/services/supabase.js +72 -0
  87. package/dist/api/src/utils/Actuator.js +225 -0
  88. package/dist/api/src/utils/actions/ActionHandler.js +1 -0
  89. package/dist/api/src/utils/actions/AppendToGSheetAction.js +191 -0
  90. package/dist/api/src/utils/actions/AutoRenameAction.js +49 -0
  91. package/dist/api/src/utils/actions/CopyAction.js +112 -0
  92. package/dist/api/src/utils/actions/CopyToGDriveAction.js +55 -0
  93. package/dist/api/src/utils/actions/LogCsvAction.js +42 -0
  94. package/dist/api/src/utils/actions/NotifyAction.js +32 -0
  95. package/dist/api/src/utils/actions/RenameAction.js +51 -0
  96. package/dist/api/src/utils/actions/WebhookAction.js +51 -0
  97. package/dist/api/src/utils/actions/utils.js +237 -0
  98. package/dist/api/src/utils/llmResponse.js +63 -0
  99. package/dist/api/src/utils/logger.js +51 -0
  100. package/dist/assets/index-DzN8-j-e.css +1 -0
  101. package/dist/assets/index-Uy-ai3Dh.js +113 -0
  102. package/dist/favicon.svg +31 -0
  103. package/dist/folio-logo.svg +46 -0
  104. package/dist/index.html +14 -0
  105. package/docs-dev/FPE-spec.md +196 -0
  106. package/docs-dev/folio-prd.md +47 -0
  107. package/docs-dev/foundation-checklist.md +30 -0
  108. package/docs-dev/hybrid-routing-architecture.md +205 -0
  109. package/docs-dev/ingestion-engine.md +69 -0
  110. package/docs-dev/port-from-email-automator.md +32 -0
  111. package/docs-dev/tech-spec.md +98 -0
  112. package/index.html +13 -0
  113. package/package.json +101 -0
  114. package/public/favicon.svg +31 -0
  115. package/public/folio-logo.svg +46 -0
  116. package/scripts/dev-task.mjs +51 -0
  117. package/scripts/get-latest-migration-timestamp.mjs +34 -0
  118. package/scripts/migrate.sh +91 -0
  119. package/supabase/.temp/cli-latest +1 -0
  120. package/supabase/.temp/gotrue-version +1 -0
  121. package/supabase/.temp/pooler-url +1 -0
  122. package/supabase/.temp/postgres-version +1 -0
  123. package/supabase/.temp/project-ref +1 -0
  124. package/supabase/.temp/rest-version +1 -0
  125. package/supabase/.temp/storage-migration +1 -0
  126. package/supabase/.temp/storage-version +1 -0
  127. package/supabase/config.toml +64 -0
  128. package/supabase/functions/_shared/auth.ts +35 -0
  129. package/supabase/functions/_shared/cors.ts +12 -0
  130. package/supabase/functions/_shared/supabaseAdmin.ts +17 -0
  131. package/supabase/functions/api-v1-settings/index.ts +66 -0
  132. package/supabase/functions/setup/index.ts +91 -0
  133. package/supabase/migrations/20260223000000_initial_foundation.sql +136 -0
  134. package/supabase/migrations/20260223000001_add_migration_rpc.sql +10 -0
  135. package/supabase/migrations/20260224000002_add_init_state_view.sql +20 -0
  136. package/supabase/migrations/20260224000003_port_user_creation_parity.sql +139 -0
  137. package/supabase/migrations/20260224000004_add_avatars_storage.sql +26 -0
  138. package/supabase/migrations/20260224000005_add_tts_and_embed_settings.sql +24 -0
  139. package/supabase/migrations/20260224000006_add_policies_table.sql +48 -0
  140. package/supabase/migrations/20260224000007_fix_migration_rpc.sql +9 -0
  141. package/supabase/migrations/20260224000008_add_ingestions_table.sql +42 -0
  142. package/supabase/migrations/20260225000000_setup_compatible_mode.sql +119 -0
  143. package/supabase/migrations/20260225000001_restore_ingestions.sql +49 -0
  144. package/supabase/migrations/20260225000002_add_ingestion_trace.sql +2 -0
  145. package/supabase/migrations/20260225000003_add_baseline_configs.sql +35 -0
  146. package/supabase/migrations/20260226000000_add_processing_events.sql +26 -0
  147. package/supabase/migrations/20260226000001_add_ingestion_file_hash.sql +10 -0
  148. package/supabase/migrations/20260226000002_add_dynamic_rag.sql +150 -0
  149. package/supabase/migrations/20260226000003_add_ingestion_summary.sql +4 -0
  150. package/supabase/migrations/20260226000004_add_ingestion_tags.sql +7 -0
  151. package/supabase/migrations/20260226000005_add_chat_tables.sql +60 -0
  152. package/supabase/migrations/20260227000000_harden_chat_messages_rls.sql +25 -0
  153. package/supabase/migrations/20260228000000_add_vision_model_capabilities.sql +8 -0
  154. package/supabase/migrations/20260228000001_add_policy_match_feedback.sql +51 -0
  155. package/supabase/migrations/29991231235959_test_migration.sql +0 -0
  156. package/supabase/templates/confirmation.html +76 -0
  157. package/supabase/templates/email-change.html +76 -0
  158. package/supabase/templates/invite.html +72 -0
  159. package/supabase/templates/magic-link.html +68 -0
  160. package/supabase/templates/recovery.html +82 -0
  161. package/tsconfig.api.json +16 -0
  162. package/tsconfig.json +25 -0
  163. package/vite.config.ts +146 -0
@@ -0,0 +1,1187 @@
1
+ import type { SupabaseClient } from "@supabase/supabase-js";
2
+ import fs from "fs/promises";
3
+ import { PDFParse } from "pdf-parse";
4
+ import { createLogger } from "../utils/logger.js";
5
+ import { PolicyLoader } from "./PolicyLoader.js";
6
+ import type { FolioPolicy } from "./PolicyLoader.js";
7
+ import { PolicyEngine } from "./PolicyEngine.js";
8
+ import { PolicyLearningService } from "./PolicyLearningService.js";
9
+ import { BaselineConfigService } from "./BaselineConfigService.js";
10
+ import { Actuator } from "../utils/Actuator.js";
11
+ import { extractLlmResponse, previewLlmText } from "../utils/llmResponse.js";
12
+ import { RAGService } from "./RAGService.js";
13
+ import { SDKService } from "./SDKService.js";
14
+ import { ModelCapabilityService } from "./ModelCapabilityService.js";
15
+
16
+ const logger = createLogger("IngestionService");
17
+
/**
 * Multi-signal classifier that decides whether pdf-parse extracted enough
 * real text to skip GPU OCR and go straight to the local LLM (Fast Path).
 *
 * Four independent signals must all pass:
 *
 *  1. Minimum content – whitespace is collapsed before counting so sparse or
 *     heavily formatted PDFs (forms, invoices) don't fail on raw length alone.
 *  2. Word count – Unicode-aware (\p{L}). Runs of Han / Hiragana / Katakana
 *     are weighted by length (about two characters per word) because those
 *     scripts are written without spaces; otherwise a full Japanese or Chinese
 *     sentence would count as a single "word" and text-rich documents would be
 *     rejected. Pure symbol/number documents are still caught.
 *  3. Garbage ratio – control chars + U+FFFD are the signature of image bytes
 *     that were mis-decoded as text. More than 2 % means an encoding failure.
 *  4. Page coverage – only for documents with more than two pages: if fewer
 *     than 40 % of pages yield non-trivial text the document is mostly scanned.
 *
 * @param pdfData Result of the PDF text extraction: full text, per-page text
 *                and total page count.
 * @returns `true` when the extracted text is good enough for the Fast Path.
 */
function isPdfTextExtractable(pdfData: {
  text: string;
  pages: Array<{ num: number; text: string }>;
  total: number;
}): boolean {
  const MIN_WORDS = 20;
  const raw = pdfData.text ?? '';

  // Signal 1: at least 100 characters after whitespace normalisation
  if (raw.replace(/\s+/g, ' ').trim().length < 100) return false;

  // Signal 2: at least 20 word-like tokens (≥2 Unicode letters).
  // A token made of unsegmented CJK script contributes one word per two
  // characters; every other token contributes exactly one word, so results
  // for space-delimited languages are unchanged.
  const unsegmentedChar = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/gu;
  let wordCount = 0;
  for (const token of raw.match(/\p{L}{2,}/gu) ?? []) {
    const cjkChars = token.match(unsegmentedChar)?.length ?? 0;
    wordCount += Math.max(1, Math.ceil(cjkChars / 2));
    if (wordCount >= MIN_WORDS) break; // enough evidence, stop scanning
  }
  if (wordCount < MIN_WORDS) return false;

  // Signal 3: garbage character ratio must not exceed 2 %
  // eslint-disable-next-line no-control-regex
  const garbageCount = (raw.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uFFFD]/g) ?? []).length;
  if (raw.length > 0 && garbageCount / raw.length > 0.02) return false;

  // Signal 4: page coverage. For docs with >2 pages, at least 40 % of the
  // reported page total must contain >30 non-whitespace characters.
  if (pdfData.total > 2 && pdfData.pages.length > 0) {
    const pagesWithText = pdfData.pages.filter(
      (p) => (p.text ?? '').replace(/\s/g, '').length > 30
    ).length;
    if (pagesWithText / pdfData.total < 0.4) return false;
  }

  return true;
}
64
+
/** Lifecycle state stored in the `status` column of an ingestion row. */
export type IngestionStatus =
  | "pending"
  | "processing"
  | "matched"
  | "no_match"
  | "error"
  | "duplicate";

/** Channel through which a document entered the system. */
export type IngestionSource =
  | "upload"
  | "dropzone"
  | "email"
  | "url";
67
+
/**
 * Shape of a row in the `ingestions` table as this service reads and writes it.
 */
export interface Ingestion {
  /** Primary key of the ingestion row. */
  id: string;
  /** Owner of the ingestion. */
  user_id: string;
  /** Channel the document arrived through. */
  source: IngestionSource;
  /** File name; its extension drives triage routing. */
  filename: string;
  mime_type?: string;
  file_size?: number;
  /** Content hash used for duplicate detection. */
  file_hash?: string;
  /** Current lifecycle state. */
  status: IngestionStatus;
  /** Identifier of the matched policy, if any. */
  policy_id?: string;
  /** Display name of the matched policy, if any. */
  policy_name?: string;
  /** Extracted field values (baseline entities overlaid with policy fields). */
  extracted?: Record<string, unknown>;
  /** Actions reported as executed by the policy engine. */
  actions_taken?: string[];
  /** Failure description when `status` is "error". */
  error_message?: string;
  /** Path of the stored file that is read back for processing. */
  storage_path?: string;
  /** Ordered processing trace entries. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  trace?: { timestamp: string; step: string; details?: any }[];
  /** Tags attached to the document. */
  tags?: string[];
  summary?: string | null;
  created_at: string;
  updated_at: string;
}
90
+
91
+ export class IngestionService {
/**
 * Policy action types treated as non-idempotent by
 * `listNonIdempotentPolicyActions`.
 */
private static readonly NON_IDEMPOTENT_ACTION_TYPES = new Set<string>([
  "append_to_google_sheet",
  "webhook",
  "copy_to_gdrive",
  "copy",
  "log_csv",
  "notify",
]);
100
+
/**
 * Lists the distinct non-idempotent action types declared by a policy, in
 * order of first appearance.
 *
 * @param policy Policy whose `spec.actions` are inspected.
 * @returns De-duplicated action type names; empty when the policy declares
 *          no actions array or none of its actions is non-idempotent.
 */
private static listNonIdempotentPolicyActions(policy: FolioPolicy): string[] {
  const declaredActions = policy.spec.actions;
  if (!Array.isArray(declaredActions)) {
    return [];
  }

  const matches = new Set<string>();
  for (const action of declaredActions) {
    // Missing or blank types are ignored.
    const normalizedType = String(action?.type ?? "").trim();
    if (normalizedType && this.NON_IDEMPOTENT_ACTION_TYPES.has(normalizedType)) {
      matches.add(normalizedType);
    }
  }
  return Array.from(matches);
}
107
+
/**
 * Renders an arbitrary extracted value as plain text.
 *
 * - `null` / `undefined` become the empty string.
 * - Arrays are rendered element by element (recursively); empty results are
 *   dropped and the rest joined with ", ".
 * - Other objects are serialised with `JSON.stringify`, falling back to
 *   `String(value)` when serialisation throws.
 * - Everything else goes through `String(value)`.
 */
private static valueToSemanticText(value: unknown): string {
  if (value === null || value === undefined) {
    return "";
  }

  if (Array.isArray(value)) {
    const rendered: string[] = [];
    for (const element of value) {
      const text = this.valueToSemanticText(element);
      if (text) rendered.push(text);
    }
    return rendered.join(", ");
  }

  if (typeof value !== "object") {
    return String(value);
  }

  try {
    return JSON.stringify(value);
  } catch {
    // Serialisation failed; fall back to the default string form.
    return String(value);
  }
}
125
+
/**
 * Builds a newline-separated text description of a VLM-processed document
 * from its metadata and extracted fields.
 *
 * The `_enrichment` entry is excluded from the field list and only its key
 * names are mentioned; at most 80 non-empty fields are listed.
 *
 * @param opts.filename    Original document file name.
 * @param opts.finalStatus Final processing status to record in the text.
 * @param opts.policyName  Name of the matched policy, when there is one.
 * @param opts.extracted   Extracted field map.
 * @param opts.tags        Tags attached to the document.
 * @returns The synthetic text.
 */
private static buildVlmSemanticText(opts: {
  filename: string;
  finalStatus: string;
  policyName?: string;
  extracted: Record<string, unknown>;
  tags: string[];
}): string {
  const { filename, finalStatus, policyName, extracted, tags } = opts;

  const output: string[] = [];
  output.push(`Document filename: ${filename}`);
  output.push("Document source: VLM image extraction");
  output.push(`Processing status: ${finalStatus}`);

  if (policyName) output.push(`Matched policy: ${policyName}`);
  if (tags.length > 0) output.push(`Tags: ${tags.join(", ")}`);

  // Render every non-empty field first, then cap the list at 80 entries.
  const renderedFields: string[] = [];
  for (const [key, value] of Object.entries(extracted)) {
    if (key === "_enrichment") continue;
    const text = this.valueToSemanticText(value).trim();
    if (text.length > 0) {
      renderedFields.push(`- ${key}: ${text}`);
    }
  }
  const fieldLines = renderedFields.slice(0, 80);

  if (fieldLines.length === 0) {
    output.push("Extracted fields: none");
  } else {
    output.push("Extracted fields:");
    output.push(...fieldLines);
  }

  const enrichment = extracted["_enrichment"];
  const isPlainEnrichment = Boolean(enrichment) && typeof enrichment === "object" && !Array.isArray(enrichment);
  if (isPlainEnrichment) {
    const enrichmentKeys = Object.keys(enrichment as Record<string, unknown>);
    if (enrichmentKeys.length > 0) {
      output.push(`Enrichment fields: ${enrichmentKeys.join(", ")}`);
    }
  }

  output.push("Synthetic semantic text generated from VLM output for retrieval.");
  return output.join("\n");
}
172
+
/**
 * Counts the extracted fields (excluding `_enrichment`) whose text rendering
 * is non-empty after trimming.
 */
private static countExtractedSemanticFields(extracted: Record<string, unknown>): number {
  let nonEmpty = 0;
  for (const [key, value] of Object.entries(extracted)) {
    if (key === "_enrichment") continue;
    if (this.valueToSemanticText(value).trim().length > 0) {
      nonEmpty += 1;
    }
  }
  return nonEmpty;
}
180
+
/**
 * Builds synthetic text for a VLM-processed document and starts embedding it
 * without waiting for the result.
 *
 * A "queued" event is logged immediately; a "completed" or "failed" event is
 * logged once the embedding promise settles. The method itself returns
 * synchronously.
 *
 * @returns Size metrics of the queued text: character count, number of
 *          non-empty extracted fields and number of tags.
 */
private static queueVlmSemanticEmbedding(opts: {
  ingestionId: string;
  userId: string;
  filename: string;
  finalStatus: string;
  policyName?: string;
  extracted: Record<string, unknown>;
  tags: string[];
  supabase: SupabaseClient;
  embedSettings: { embedding_provider?: string; embedding_model?: string };
}): { synthetic_chars: number; extracted_fields: number; tags_count: number } {
  const { ingestionId, userId, filename, finalStatus, policyName, extracted, tags, supabase, embedSettings } = opts;

  const syntheticText = this.buildVlmSemanticText({ filename, finalStatus, policyName, extracted, tags });
  const details = {
    synthetic_chars: syntheticText.length,
    extracted_fields: this.countExtractedSemanticFields(extracted),
    tags_count: tags.length,
  };

  Actuator.logEvent(ingestionId, userId, "analysis", "RAG Embedding", {
    action: "Queued synthetic VLM embedding",
    ...details,
  }, supabase);

  const onEmbedded = (): void => {
    Actuator.logEvent(ingestionId, userId, "analysis", "RAG Embedding", {
      action: "Completed synthetic VLM embedding",
      ...details,
    }, supabase);
  };

  const onEmbedFailed = (err: unknown): void => {
    logger.error(`RAG embedding failed for synthetic VLM text ${ingestionId}`, err);
    const errorMessage = err instanceof Error ? err.message : String(err);
    Actuator.logEvent(ingestionId, userId, "error", "RAG Embedding", {
      action: "Synthetic VLM embedding failed",
      error: errorMessage,
      ...details,
    }, supabase);
  };

  // Intentionally not awaited: the caller only needs the metrics.
  RAGService.chunkAndEmbed(ingestionId, userId, syntheticText, supabase, embedSettings)
    .then(onEmbedded)
    .catch(onEmbedFailed);

  return details;
}
233
+
/**
 * Ingest a document using Hybrid Routing Architecture.
 *
 * Flow:
 *  1. Duplicate detection – when `fileHash` is given and a "matched" ingestion
 *     with the same hash exists for the user, a "duplicate" row is recorded
 *     and returned without further processing.
 *  2. A "processing" ingestion row is inserted.
 *  3. Triage by file extension:
 *     - txt / md / csv / json → Fast Path using `content`.
 *     - png / jpg / jpeg / webp → Fast Path with the image inlined as a
 *       `[VLM_IMAGE_DATA:…]` marker, when vision support should be attempted.
 *     - pdf → Fast Path when `isPdfTextExtractable` accepts the parsed text.
 *     - anything else, or any triage failure → Heavy Path.
 *  4. Fast Path: baseline extraction, policy matching, row update. A failure
 *     on the VLM path falls back to the Heavy Path; any other failure marks
 *     the ingestion as "error".
 *  5. Heavy Path: an `rtx_activities` row is inserted and the ingestion is
 *     set to "pending".
 *
 * @param opts.supabase Supabase client used for all reads and writes.
 * @param opts.userId   Owner of the ingestion.
 * @param opts.filename Original file name (its extension drives triage).
 * @param opts.mimeType Optional MIME type; derived from the extension for images when absent.
 * @param opts.fileSize Optional size, stored on the row.
 * @param opts.source   Ingestion channel, defaults to "upload".
 * @param opts.filePath Path of the stored file (read for images and PDFs).
 * @param opts.content  Text content used for plain-text formats.
 * @param opts.fileHash Optional content hash enabling duplicate detection.
 * @returns The ingestion row in its latest state.
 * @throws Error when the ingestion (or duplicate) row cannot be created.
 */
static async ingest(opts: {
  supabase: SupabaseClient;
  userId: string;
  filename: string;
  mimeType?: string;
  fileSize?: number;
  source?: IngestionSource;
  filePath: string;
  content: string;
  fileHash?: string;
}): Promise<Ingestion> {
  const { supabase, userId, filename, mimeType, fileSize, source = "upload", filePath, content, fileHash } = opts;

  // Duplicate detection — check if this exact file content was already ingested
  if (fileHash) {
    const { data: existing } = await supabase
      .from("ingestions")
      .select("id, filename, created_at")
      .eq("user_id", userId)
      .eq("file_hash", fileHash)
      .eq("status", "matched")
      .order("created_at", { ascending: true })
      .limit(1)
      .maybeSingle();

    if (existing) {
      logger.info(`Duplicate file detected: '${filename}' matches ingestion ${existing.id} ('${existing.filename}')`);
      const { data: dupIngestion, error: dupErr } = await supabase
        .from("ingestions")
        .insert({
          user_id: userId,
          source,
          filename,
          mime_type: mimeType,
          file_size: fileSize,
          storage_path: filePath,
          file_hash: fileHash,
          status: "duplicate",
          extracted: { duplicate_of: existing.id, original_filename: existing.filename },
        })
        .select()
        .single();
      // Fail loudly (like the primary insert below) instead of returning a
      // null row typed as Ingestion.
      if (dupErr || !dupIngestion) {
        throw new Error(`Failed to create duplicate ingestion record: ${dupErr?.message ?? "no row returned"}`);
      }
      return dupIngestion as Ingestion;
    }
  }

  // 1. Insert into ingestions
  const { data: ingestion, error: insertErr } = await supabase
    .from("ingestions")
    .insert({
      user_id: userId,
      source,
      filename,
      mime_type: mimeType,
      file_size: fileSize,
      storage_path: filePath,
      file_hash: fileHash ?? null,
      status: "processing"
    })
    .select()
    .single();

  if (insertErr || !ingestion) throw new Error(`Failed to create ingestion record: ${insertErr?.message}`);

  logger.info(`Processing ingestion ${ingestion.id}: ${filename}`);
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Ingestion started", source, filename, fileSize }, supabase);

  // 2. Document Triage
  let isFastPath = false;
  let isVlmFastPath = false;
  let extractionContent = content;
  const ext = filename.toLowerCase().split('.').pop() || '';
  const fastExts = ['txt', 'md', 'csv', 'json'];
  const imageExts = ['png', 'jpg', 'jpeg', 'webp'];

  // Pre-fetch settings to decide whether we should attempt VLM.
  const { data: triageSettingsRow } = await supabase
    .from("user_settings")
    .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
    .eq("user_id", userId)
    .maybeSingle();
  const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
  const llmModel = visionResolution.model;
  const llmProvider = visionResolution.provider;

  if (fastExts.includes(ext)) {
    isFastPath = true;
  } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
    try {
      const buffer = await fs.readFile(filePath);
      const base64 = buffer.toString('base64');
      const mimeTypeActual = mimeType || `image/${ext === 'jpg' ? 'jpeg' : ext}`;
      // Special marker for PolicyEngine
      extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
      isFastPath = true;
      isVlmFastPath = true;
      logger.info(`Smart Triage: Image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
      Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
    } catch (err) {
      // Unreadable image: leave both flags false so the Heavy Path handles it.
      logger.warn(`Failed to read VLM image ${filename}. Routing to Heavy Path.`, { err });
    }
  } else if (imageExts.includes(ext)) {
    logger.info(`Smart Triage: Image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
    Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
      action: "VLM skipped (model marked unsupported)",
      type: ext,
      model: llmModel,
      provider: llmProvider
    }, supabase);
  } else if (ext === 'pdf') {
    try {
      const buffer = await fs.readFile(filePath);
      const parser = new PDFParse({ data: buffer });
      // Always release the parser, whether or not text extraction succeeds.
      // A failed cleanup is logged but must not change the triage outcome.
      const pdfData = await parser.getText().finally(async () => {
        try {
          await parser.destroy();
        } catch (destroyErr) {
          logger.warn(`Failed to release PDF parser for ${filename}.`, { err: destroyErr });
        }
      });
      if (isPdfTextExtractable(pdfData)) {
        isFastPath = true;
        extractionContent = pdfData.text;
        // Same per-page measure as the classifier's page-coverage signal.
        const pagesWithText = pdfData.pages.filter((p) => (p.text ?? '').replace(/\s/g, '').length > 30).length;
        logger.info(`Smart Triage: PDF ${filename} passed text quality check (${pagesWithText}/${pdfData.total} pages with text). Routing to Fast Path.`);
        Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage passed", type: "pdf", fast_path: true }, supabase);
      } else {
        logger.info(`Smart Triage: PDF ${filename} failed text quality check. Routing to Heavy Path.`);
        Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage failed", type: "pdf", fast_path: false }, supabase);
      }
    } catch (err) {
      logger.warn(`Failed to parse PDF ${filename}. Routing to Heavy Path.`, { err });
      Actuator.logEvent(ingestion.id, userId, "error", "Triage", { action: "PDF parse failed", error: String(err) }, supabase);
    }
  }

  if (isFastPath) {
    try {
      // 3. Fast Path — fetch all dependencies in parallel
      const [userPolicies, processingSettingsRow, baselineConfig] = await Promise.all([
        PolicyLoader.load(false, supabase),
        supabase.from("user_settings").select("llm_provider, llm_model, embedding_provider, embedding_model").eq("user_id", userId).maybeSingle(),
        BaselineConfigService.getActive(supabase, userId),
      ]);
      const llmSettings = {
        llm_provider: processingSettingsRow.data?.llm_provider ?? undefined,
        llm_model: processingSettingsRow.data?.llm_model ?? undefined,
      };
      const embedSettings = {
        embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
        embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
      };
      const doc = { filePath: filePath, text: extractionContent, ingestionId: ingestion.id, userId, supabase };
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];

      // Fire and forget Semantic Embedding Storage
      // NOTE(review): on the VLM path doc.text is the base64 image marker —
      // confirm RAGService.chunkAndEmbed ignores it, otherwise raw image
      // data is embedded alongside the synthetic text queued further down.
      RAGService.chunkAndEmbed(ingestion.id, userId, doc.text, supabase, embedSettings).catch(err => {
        logger.error(`RAG embedding failed for ${ingestion.id}`, err);
      });

      // 4. Stage 1: Baseline extraction (always runs, LLM call 1 of max 2)
      baselineTrace.push({
        timestamp: new Date().toISOString(),
        step: "LLM request (baseline extraction)",
        details: {
          provider: llmSettings.llm_provider ?? llmProvider,
          model: llmSettings.llm_model ?? llmModel,
          mode: isVlmFastPath ? "vision" : "text",
        }
      });

      const baselineResult = await PolicyEngine.extractBaseline(
        doc,
        { context: baselineConfig?.context, fields: baselineConfig?.fields },
        llmSettings
      );
      const baselineEntities = baselineResult.entities;
      const autoTags = baselineResult.tags;
      baselineTrace.push({
        timestamp: new Date().toISOString(),
        step: "LLM response (baseline extraction)",
        details: {
          entities_count: Object.keys(baselineEntities).length,
          uncertain_count: baselineResult.uncertain_fields.length,
          tags_count: autoTags.length,
        }
      });

      // Enrich the document with extracted entities so policy keyword/semantic
      // conditions can match against semantic field values (e.g. document_type:
      // "invoice") even when those exact words don't appear in the raw text.
      const entityLines = Object.entries(baselineEntities)
        .filter(([, v]) => v != null)
        .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
      const enrichedDoc = entityLines.length > 0
        ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
        : doc;

      // 5. Stage 2: Policy matching + policy-specific field extraction
      let result;
      if (userPolicies.length > 0) {
        result = await PolicyEngine.processWithPolicies(enrichedDoc, userPolicies, llmSettings, baselineEntities);
      } else {
        result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
      }

      const policyName = userPolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name;
      // The policy engine's "fallback" outcome is stored as "no_match".
      const finalStatus = result.status === "fallback" ? "no_match" : result.status;

      // Merge: baseline entities are the foundation; policy-specific fields
      // are overlaid on top so more precise extractions take precedence.
      const mergedExtracted = { ...baselineEntities, ...result.extractedData };
      let finalTrace = [...baselineTrace, ...(result.trace || [])];

      const { data: updatedIngestion } = await supabase
        .from("ingestions")
        .update({
          status: finalStatus,
          policy_id: result.matchedPolicy,
          policy_name: policyName,
          extracted: mergedExtracted,
          actions_taken: result.actionsExecuted,
          trace: finalTrace,
          tags: autoTags,
          baseline_config_id: baselineConfig?.id ?? null,
        })
        .eq("id", ingestion.id)
        .select()
        .single();

      if (isVlmFastPath) {
        // Embed text synthesised from the extracted fields, then persist
        // the extended trace.
        const embeddingMeta = this.queueVlmSemanticEmbedding({
          ingestionId: ingestion.id,
          userId,
          filename,
          finalStatus,
          policyName,
          extracted: mergedExtracted,
          tags: autoTags,
          supabase,
          embedSettings,
        });
        finalTrace = [
          ...finalTrace,
          {
            timestamp: new Date().toISOString(),
            step: "Queued synthetic VLM embedding",
            details: embeddingMeta,
          }
        ];
        await supabase
          .from("ingestions")
          .update({ trace: finalTrace })
          .eq("id", ingestion.id);
      }

      if (isVlmFastPath) {
        await ModelCapabilityService.learnVisionSuccess({
          supabase,
          userId,
          provider: llmSettings.llm_provider ?? llmProvider,
          model: llmSettings.llm_model ?? llmModel,
        });
      }

      return updatedIngestion as Ingestion;

    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);

      if (isVlmFastPath) {
        // NOTE(review): every failure inside the Fast Path block (not only
        // the vision call itself) is reported to learnVisionFailure here.
        const learnedState = await ModelCapabilityService.learnVisionFailure({
          supabase,
          userId,
          provider: llmProvider,
          model: llmModel,
          error: err,
        });
        logger.warn(`VLM extraction failed for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
        Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
          action: "VLM Failed, Fallback to Heavy",
          error: msg,
          learned_state: learnedState,
        }, supabase);
        // Fall back to Heavy Path
        isFastPath = false;
      } else {
        Actuator.logEvent(ingestion.id, userId, "error", "Processing", { error: msg }, supabase);
        const { data: updatedIngestion } = await supabase
          .from("ingestions")
          .update({ status: "error", error_message: msg })
          .eq("id", ingestion.id)
          .select()
          .single();
        return updatedIngestion as Ingestion;
      }
    }
  }

  // 4. Heavy Path (Delegate to RealTimeX)
  const { error: rtxErr } = await supabase
    .from("rtx_activities")
    .insert({
      user_id: userId,
      status: "pending", // Waiting for RealTimeX
      raw_data: {
        source,
        filename,
        mime_type: mimeType,
        file_size: fileSize,
        file_path: filePath,
        ingestion_id: ingestion.id
      }
    });

  if (rtxErr) {
    // Delegation failure is only logged; the ingestion is still set to pending below.
    logger.error(`Failed to delegate to rtx_activities`, { rtxErr });
  }

  const { data: pendingIngestion } = await supabase
    .from("ingestions")
    .update({ status: "pending" }) // UI shows pending
    .eq("id", ingestion.id)
    .select()
    .single();

  return pendingIngestion as Ingestion;
}
559
+
560
/**
 * Re-run an existing ingestion.
 *
 * Loads the ingestion row (scoped to `userId`), resets its processing
 * fields, then triages the stored file by extension:
 *  - txt/md/csv/json  -> Fast Path with the file text,
 *  - png/jpg/jpeg/webp -> Fast Path through the vision model when
 *    `ModelCapabilityService.resolveVisionSupport` says to attempt it,
 *  - pdf -> Fast Path only when `isPdfTextExtractable` accepts the parsed text,
 *  - anything else (or a failed read/parse) -> Heavy Path.
 *
 * The Fast Path runs baseline extraction and policy processing in-process and
 * writes the outcome back to the row. The Heavy Path inserts a pending
 * `rtx_activities` row and marks the ingestion "pending".
 *
 * @param ingestionId - id of the ingestion row to re-run
 * @param supabase - client used for every query in this method
 * @param userId - owner of the ingestion; the lookup is filtered by it
 * @param opts.forcedPolicyId - when set, only this policy is evaluated and
 *        learned fallback is disabled
 * @returns `true` when the Fast Path ends in status "matched" or when the
 *          work was delegated to the Heavy Path; `false` otherwise
 * @throws Error when the ingestion is missing, has no storage path, the
 *         forced policy is not loaded, or non-vision Fast Path processing fails
 */
static async rerun(
  ingestionId: string,
  supabase: SupabaseClient,
  userId: string,
  opts: { forcedPolicyId?: string } = {}
): Promise<boolean> {
  const { data: ingestion, error } = await supabase
    .from("ingestions")
    .select("*")
    .eq("id", ingestionId)
    .eq("user_id", userId)
    .single();

  if (error || !ingestion) throw new Error("Ingestion not found");

  const filename = ingestion.filename;
  const filePath = ingestion.storage_path;
  // Validate before mutating the row: throwing after the reset below would
  // leave the ingestion in "processing" with nothing left to finish it.
  if (!filePath) throw new Error("No storage path found for this ingestion");

  await supabase
    .from("ingestions")
    .update({ status: "processing", error_message: null, policy_id: null, policy_name: null, extracted: {}, actions_taken: [], summary: null })
    .eq("id", ingestionId);

  Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "Re-run Initiated" }, supabase);

  let isFastPath = false;
  let isVlmFastPath = false;
  let extractionContent = "";
  const ext = filename.toLowerCase().split('.').pop() || '';
  const fastExts = ['txt', 'md', 'csv', 'json'];
  const imageExts = ['png', 'jpg', 'jpeg', 'webp'];

  const { data: triageSettingsRow } = await supabase
    .from("user_settings")
    .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
    .eq("user_id", userId)
    .maybeSingle();
  const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
  const llmModel = visionResolution.model;
  const llmProvider = visionResolution.provider;

  if (fastExts.includes(ext)) {
    // NOTE(review): a read failure here propagates while the row is already
    // "processing" — confirm whether the caller resets the status.
    isFastPath = true;
    extractionContent = await fs.readFile(filePath, "utf-8");
  } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
    try {
      const buffer = await fs.readFile(filePath);
      const base64 = buffer.toString('base64');
      const mimeTypeActual = `image/${ext === 'jpg' ? 'jpeg' : ext}`;
      extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
      isFastPath = true;
      isVlmFastPath = true;
      logger.info(`Smart Triage: Re-run image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
      Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
    } catch (err) {
      logger.warn(`Failed to read VLM image ${filename} during rerun. Routing to Heavy Path.`, { err });
    }
  } else if (imageExts.includes(ext)) {
    logger.info(`Smart Triage: Re-run image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
    Actuator.logEvent(ingestionId, userId, "info", "Triage", {
      action: "VLM skipped (model marked unsupported)",
      type: ext,
      model: llmModel,
      provider: llmProvider
    }, supabase);
  } else if (ext === 'pdf') {
    try {
      const buffer = await fs.readFile(filePath);
      const parser = new PDFParse({ data: buffer });
      const pdfData = await parser.getText();
      if (isPdfTextExtractable(pdfData)) {
        isFastPath = true;
        extractionContent = pdfData.text;
      }
    } catch (err) {
      // Still best-effort (the Heavy Path handles the file), but no longer silent.
      logger.warn(`Failed to extract PDF text from ${filename} during rerun. Routing to Heavy Path.`, { err });
    }
  }

  if (isFastPath) {
    const [userPolicies, processingSettingsRow, baselineConfig] = await Promise.all([
      PolicyLoader.load(false, supabase),
      supabase.from("user_settings").select("llm_provider, llm_model, embedding_provider, embedding_model").eq("user_id", userId).maybeSingle(),
      BaselineConfigService.getActive(supabase, userId),
    ]);

    // Resolve the forced policy outside the try block below: a missing policy
    // is a caller error, not a vision failure, so it must not feed
    // learnVisionFailure or trigger the Heavy Path fallback.
    const forcedPolicyId = opts.forcedPolicyId?.trim();
    const activePolicies = forcedPolicyId
      ? userPolicies.filter((policy) => policy.metadata.id === forcedPolicyId)
      : userPolicies;
    if (forcedPolicyId && activePolicies.length === 0) {
      throw new Error(`Policy "${forcedPolicyId}" was not found or is disabled.`);
    }

    const llmSettings = {
      llm_provider: processingSettingsRow.data?.llm_provider ?? undefined,
      llm_model: processingSettingsRow.data?.llm_model ?? undefined,
    };
    const embedSettings = {
      embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
      embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
    };
    const doc = { filePath, text: extractionContent, ingestionId, userId, supabase };
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];

    // Fire and forget Semantic Embedding Storage for re-runs.
    // NOTE(review): on the VLM path doc.text is the base64 image marker —
    // confirm chunkAndEmbed is meant to receive it.
    RAGService.chunkAndEmbed(ingestionId, userId, doc.text, supabase, embedSettings).catch(err => {
      logger.error(`RAG embedding failed during rerun for ${ingestionId}`, err);
    });

    baselineTrace.push({
      timestamp: new Date().toISOString(),
      step: "LLM request (baseline extraction)",
      details: {
        provider: llmSettings.llm_provider ?? llmProvider,
        model: llmSettings.llm_model ?? llmModel,
        mode: isVlmFastPath ? "vision" : "text",
      }
    });

    const baselineResult = await PolicyEngine.extractBaseline(
      doc,
      { context: baselineConfig?.context, fields: baselineConfig?.fields },
      llmSettings
    );
    const baselineEntities = baselineResult.entities;
    const autoTags = baselineResult.tags;
    baselineTrace.push({
      timestamp: new Date().toISOString(),
      step: "LLM response (baseline extraction)",
      details: {
        entities_count: Object.keys(baselineEntities).length,
        uncertain_count: baselineResult.uncertain_fields.length,
        tags_count: autoTags.length,
      }
    });

    // Render nested objects as JSON instead of "[object Object]".
    const formatScalar = (item: unknown): string =>
      typeof item === "object" && item !== null ? JSON.stringify(item) : String(item);
    const formatEntityValue = (value: unknown): string =>
      Array.isArray(value)
        ? value.map((item) => (item == null ? "" : formatScalar(item))).join(", ")
        : formatScalar(value);

    const entityLines = Object.entries(baselineEntities)
      .filter(([, v]) => v != null)
      .map(([k, v]) => `${k}: ${formatEntityValue(v)}`);
    const enrichedDoc = entityLines.length > 0
      ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
      : doc;

    try {
      const result: import("./PolicyEngine.js").ProcessingResult = activePolicies.length > 0
        ? await PolicyEngine.processWithPolicies(
          enrichedDoc,
          activePolicies,
          llmSettings,
          baselineEntities,
          {
            ...(forcedPolicyId ? { forcedPolicyId } : {}),
            allowLearnedFallback: !forcedPolicyId,
          }
        )
        : await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);

      const policyName = result.matchedPolicy ? activePolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name : undefined;
      // The engine's "fallback" status is stored as "no_match".
      const finalStatus: string = result.status === "fallback" ? "no_match" : result.status;
      const mergedExtracted = { ...baselineEntities, ...result.extractedData };

      // Preserve any human-added tags; merge with freshly generated auto-tags.
      const existingTags: string[] = Array.isArray(ingestion.tags) ? ingestion.tags : [];
      const mergedTags = [...new Set([...autoTags, ...existingTags])];
      let rerunTrace = [
        // Guard against a non-array trace value, which would make the spread throw.
        ...(Array.isArray(ingestion.trace) ? ingestion.trace : []),
        { timestamp: new Date().toISOString(), step: "--- Re-run Initiated ---" },
        ...baselineTrace,
        ...(result.trace || [])
      ];

      await supabase
        .from("ingestions")
        .update({
          status: finalStatus,
          policy_id: result.matchedPolicy,
          policy_name: policyName,
          extracted: mergedExtracted,
          actions_taken: result.actionsExecuted,
          trace: rerunTrace,
          tags: mergedTags,
          baseline_config_id: baselineConfig?.id ?? null,
        })
        .eq("id", ingestionId);

      if (isVlmFastPath) {
        const embeddingMeta = this.queueVlmSemanticEmbedding({
          ingestionId,
          userId,
          filename,
          finalStatus,
          policyName,
          extracted: mergedExtracted,
          tags: mergedTags,
          supabase,
          embedSettings,
        });
        rerunTrace = [
          ...rerunTrace,
          {
            timestamp: new Date().toISOString(),
            step: "Queued synthetic VLM embedding",
            details: embeddingMeta,
          }
        ];
        await supabase
          .from("ingestions")
          .update({ trace: rerunTrace })
          .eq("id", ingestionId);

        await ModelCapabilityService.learnVisionSuccess({
          supabase,
          userId,
          provider: llmSettings.llm_provider ?? llmProvider,
          model: llmSettings.llm_model ?? llmModel,
        });
      }

      return finalStatus === "matched";
    } catch (err: unknown) {
      // NOTE(review): the non-vision rethrow leaves the row in "processing";
      // confirm whether the caller is expected to record the failure.
      if (!isVlmFastPath) {
        throw err; // Re-throw to caller
      }

      const msg = err instanceof Error ? err.message : String(err);
      const learnedState = await ModelCapabilityService.learnVisionFailure({
        supabase,
        userId,
        provider: llmProvider,
        model: llmModel,
        error: err,
      });
      logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
      Actuator.logEvent(ingestionId, userId, "error", "Processing", {
        action: "VLM Failed, Fallback to Heavy",
        error: msg,
        learned_state: learnedState,
      }, supabase);
      isFastPath = false; // Trigger heavy path fallthrough
    }
  }

  // Re-delegate to rtx_activities
  const { error: rtxErr } = await supabase
    .from("rtx_activities")
    .insert({
      user_id: userId,
      status: "pending",
      raw_data: {
        source: ingestion.source,
        filename,
        mime_type: ingestion.mime_type,
        file_size: ingestion.file_size,
        file_path: filePath,
        ingestion_id: ingestion.id
      }
    });

  // Mirror the initial ingest path: surface the delegation failure in the logs.
  if (rtxErr) {
    logger.error(`Failed to delegate to rtx_activities`, { rtxErr });
  }

  await supabase
    .from("ingestions")
    .update({ status: "pending" })
    .eq("id", ingestionId);

  return true;
}
839
+
840
/**
 * Bind an ingestion to a policy chosen by the user.
 *
 * With `rerun` (default on) the ingestion is reprocessed with the policy
 * forced and the refreshed row is used; otherwise the row is updated in place
 * to status "matched" with an extra trace entry. With `learn` (default on)
 * the match is handed to `PolicyLearningService.recordManualMatch`.
 *
 * @param ingestionId - id of the ingestion to reassign
 * @param policyId - id of the target policy; surrounding whitespace is trimmed
 * @param supabase - client used for every query in this method
 * @param userId - owner of the ingestion; lookups are filtered by it
 * @param opts.learn - set to `false` to skip recording learning feedback
 * @param opts.rerun - set to `false` to skip reprocessing
 * @param opts.allowSideEffects - must be `true` to rerun a policy that has
 *        non-idempotent actions
 * @returns the ingestion row after the rerun or the in-place update
 * @throws Error for an empty policy id, a missing ingestion, an ingestion in
 *         "processing"/"pending", an unknown policy, unconfirmed side
 *         effects, or a failed update
 */
static async matchToPolicy(
  ingestionId: string,
  policyId: string,
  supabase: SupabaseClient,
  userId: string,
  opts: { learn?: boolean; rerun?: boolean; allowSideEffects?: boolean } = {}
): Promise<Ingestion> {
  // learn and rerun are opt-out; side effects are opt-in.
  const shouldLearn = opts.learn !== false;
  const shouldRerun = opts.rerun !== false;
  const sideEffectsAllowed = opts.allowSideEffects === true;

  const targetPolicyId = policyId.trim();
  if (targetPolicyId.length === 0) {
    throw new Error("policy_id is required");
  }

  const lookup = await supabase
    .from("ingestions")
    .select("*")
    .eq("id", ingestionId)
    .eq("user_id", userId)
    .single();
  const ingestion = lookup.data;
  if (lookup.error || !ingestion) {
    throw new Error("Ingestion not found");
  }

  const inFlightStatuses = ["processing", "pending"];
  if (inFlightStatuses.includes(ingestion.status)) {
    throw new Error("Cannot manually match while ingestion is still processing");
  }

  const availablePolicies = await PolicyLoader.load(false, supabase);
  const policy = availablePolicies.find((candidate) => candidate.metadata.id === targetPolicyId);
  if (!policy) {
    throw new Error(`Policy "${targetPolicyId}" was not found or is disabled.`);
  }
  const { id: matchedPolicyId, name: matchedPolicyName } = policy.metadata;

  const riskyActions = this.listNonIdempotentPolicyActions(policy);
  const needsConfirmation = shouldRerun && riskyActions.length > 0 && !sideEffectsAllowed;
  if (needsConfirmation) {
    throw new Error(
      `Re-running this policy may trigger side-effect actions (${riskyActions.join(", ")}). ` +
      "Confirm allow_side_effects=true to continue."
    );
  }

  let effectiveIngestion: Ingestion;
  if (!shouldRerun) {
    // Override only: stamp the policy onto the row and extend its trace.
    const priorTrace = Array.isArray(ingestion.trace) ? ingestion.trace : [];
    const overrideEntry = {
      timestamp: new Date().toISOString(),
      step: "Manual policy match override",
      details: {
        policyId: matchedPolicyId,
        policyName: matchedPolicyName,
        learn: shouldLearn,
        rerun: shouldRerun,
      }
    };

    const updateOutcome = await supabase
      .from("ingestions")
      .update({
        status: "matched",
        policy_id: matchedPolicyId,
        policy_name: matchedPolicyName,
        error_message: null,
        trace: [...priorTrace, overrideEntry],
      })
      .eq("id", ingestionId)
      .eq("user_id", userId)
      .select("*")
      .single();

    if (updateOutcome.error || !updateOutcome.data) {
      throw new Error(`Failed to update ingestion policy match: ${updateOutcome.error?.message ?? "unknown error"}`);
    }
    effectiveIngestion = updateOutcome.data as Ingestion;
  } else {
    Actuator.logEvent(ingestionId, userId, "info", "Policy Matching", {
      action: "Manual match requested with rerun",
      policyId: matchedPolicyId,
      policyName: matchedPolicyName,
      learn: shouldLearn,
      risky_actions: riskyActions,
    }, supabase);

    await this.rerun(ingestionId, supabase, userId, { forcedPolicyId: matchedPolicyId });
    // Re-read the row so the caller sees what the rerun wrote.
    const reloaded = await this.get(ingestionId, supabase, userId);
    if (!reloaded) {
      throw new Error("Ingestion not found after rerun.");
    }
    effectiveIngestion = reloaded;
  }

  const overrideAction = shouldRerun
    ? "Manual policy match override + rerun"
    : "Manual policy match override";
  Actuator.logEvent(ingestionId, userId, "info", "Policy Matching", {
    action: overrideAction,
    policyId: matchedPolicyId,
    policyName: matchedPolicyName,
    learn: shouldLearn,
    rerun: shouldRerun,
  }, supabase);

  if (shouldLearn) {
    await PolicyLearningService.recordManualMatch({
      supabase,
      userId,
      ingestion: effectiveIngestion,
      policyId: matchedPolicyId,
      policyName: matchedPolicyName,
    });
  }

  return effectiveIngestion;
}
958
+
959
/**
 * Produce a refinement draft for an existing policy, using one ingestion as
 * evidence. The draft is returned for review; this method writes nothing.
 *
 * @param ingestionId - ingestion whose fields and trace serve as evidence
 * @param policyId - id of the policy to refine; surrounding whitespace is trimmed
 * @param supabase - client used for the lookup and passed on to the engine
 * @param userId - owner of the ingestion; the lookup is filtered by it
 * @param opts.provider - optional provider forwarded to the engine
 * @param opts.model - optional model forwarded to the engine
 * @returns the suggested policy and the engine's rationale
 * @throws Error for an empty policy id, a missing ingestion, an unknown
 *         policy, or when the engine returns no policy
 */
static async suggestPolicyRefinement(
  ingestionId: string,
  policyId: string,
  supabase: SupabaseClient,
  userId: string,
  opts: { provider?: string; model?: string } = {}
): Promise<{ policy: FolioPolicy; rationale: string[] }> {
  const wantedPolicyId = policyId.trim();
  if (wantedPolicyId.length === 0) {
    throw new Error("policy_id is required");
  }

  const lookup = await supabase
    .from("ingestions")
    .select("id,filename,mime_type,status,tags,summary,extracted,trace")
    .eq("id", ingestionId)
    .eq("user_id", userId)
    .single();
  const ingestion = lookup.data;

  if (lookup.error || !ingestion) {
    throw new Error("Ingestion not found");
  }

  const loadedPolicies = await PolicyLoader.load(false, supabase);
  const targetPolicy = loadedPolicies.find((candidate) => candidate.metadata.id === wantedPolicyId);
  if (!targetPolicy) {
    throw new Error(`Policy "${wantedPolicyId}" was not found or is disabled.`);
  }

  // Normalise the loosely typed row into the evidence shape the engine takes.
  const evidence = {
    ingestionId,
    filename: ingestion.filename as string,
    mimeType: (ingestion.mime_type as string | null | undefined) ?? null,
    status: String(ingestion.status ?? ""),
    summary: (ingestion.summary as string | null | undefined) ?? null,
    tags: Array.isArray(ingestion.tags) ? ingestion.tags.map((entry) => String(entry)) : [],
    extracted: (ingestion.extracted as Record<string, unknown> | null | undefined) ?? {},
    trace: Array.isArray(ingestion.trace) ? ingestion.trace as Array<{ timestamp: string; step: string; details?: unknown }> : [],
  };
  const engineOptions = {
    provider: opts.provider,
    model: opts.model,
    userId,
    supabase,
  };

  const suggestion = await PolicyEngine.suggestPolicyRefinement(targetPolicy, evidence, engineOptions);

  if (!suggestion.policy) {
    throw new Error(suggestion.error || "Unable to generate policy refinement suggestion.");
  }

  const { policy, rationale } = suggestion;
  return { policy, rationale };
}
1021
+
1022
/**
 * List ingestions for a user, newest first.
 * Supports server-side pagination and ILIKE search across native text columns
 * (filename, policy_name, summary). Tags are handled client-side via the
 * filter bar; extracted JSONB search requires a tsvector migration (deferred).
 *
 * @param supabase - client used for the query
 * @param userId - only rows with this `user_id` are returned
 * @param opts.page - 1-based page number (default 1); values below 1 or
 *        non-finite values fall back to the first page
 * @param opts.pageSize - rows per page (default 20); values below 1 are
 *        raised to 1 and non-finite values fall back to 20
 * @param opts.query - optional search term; blank terms are ignored
 * @returns the requested page plus the exact total count of matching rows
 * @throws Error when the query fails
 */
static async list(
  supabase: SupabaseClient,
  userId: string,
  opts: { page?: number; pageSize?: number; query?: string } = {}
): Promise<{ ingestions: Ingestion[]; total: number }> {
  const { page = 1, pageSize = 20, query } = opts;
  // Clamp paging inputs so .range() never receives a negative or fractional bound.
  const safePage = Number.isFinite(page) ? Math.max(1, Math.floor(page)) : 1;
  const safePageSize = Number.isFinite(pageSize) ? Math.max(1, Math.floor(pageSize)) : 20;
  const from = (safePage - 1) * safePageSize;
  const to = from + safePageSize - 1;

  let q = supabase
    .from("ingestions")
    .select("*", { count: "exact" })
    .eq("user_id", userId)
    .order("created_at", { ascending: false });

  const needle = query?.trim();
  if (needle) {
    // The term is user input placed inside a PostgREST filter string. Commas,
    // dots, colons and parentheses are reserved there, so the value is wrapped
    // in double quotes, with backslashes and double quotes escaped.
    const escaped = needle.replace(/[\\"]/g, "\\$&");
    const term = `"%${escaped}%"`;
    // PostgREST .or() only supports native column types — no ::cast expressions.
    // Searching filename, policy_name, and summary covers the most practical cases.
    q = q.or(
      `filename.ilike.${term},` +
      `policy_name.ilike.${term},` +
      `summary.ilike.${term}`
    );
  }

  q = q.range(from, to);

  const { data, error, count } = await q;
  if (error) throw new Error(`Failed to list ingestions: ${error.message}`);
  return { ingestions: (data ?? []) as Ingestion[], total: count ?? 0 };
}
1060
+
1061
/**
 * Get a single ingestion by ID, scoped to its owner.
 *
 * @param id - ingestion id
 * @param supabase - client used for the query
 * @param userId - the row must have this `user_id`
 * @returns the ingestion row, or `null` when no row matches or the query
 *          fails (a failure is logged rather than thrown)
 */
static async get(id: string, supabase: SupabaseClient, userId: string): Promise<Ingestion | null> {
  // maybeSingle() yields data=null without an error when no row matches, so
  // any error reported here is a genuine query failure worth logging.
  const { data, error } = await supabase
    .from("ingestions")
    .select("*")
    .eq("id", id)
    .eq("user_id", userId)
    .maybeSingle();

  if (error) {
    logger.error(`Failed to fetch ingestion ${id}`, { error });
    return null;
  }
  return (data as Ingestion | null) ?? null;
}
1073
+
1074
/**
 * Remove an ingestion row owned by the given user.
 *
 * @param id - ingestion id
 * @param supabase - client used for the delete
 * @param userId - the row must have this `user_id`
 * @returns `true` when at least one row was deleted, `false` otherwise
 * @throws Error when the delete query fails
 */
static async delete(id: string, supabase: SupabaseClient, userId: string): Promise<boolean> {
  const outcome = await supabase
    .from("ingestions")
    .delete({ count: "exact" })
    .eq("id", id)
    .eq("user_id", userId);

  if (outcome.error) {
    throw new Error(`Failed to delete ingestion: ${outcome.error.message}`);
  }

  // A missing count is treated as zero rows removed.
  const removedRows = outcome.count ?? 0;
  return removedRows > 0;
}
1087
+
1088
/**
 * Generate (or return cached) a 2-3 sentence prose summary for an ingestion.
 * Builds the prompt from already-extracted entities — no file I/O needed.
 * The result is saved back to ingestion.summary so subsequent calls are instant.
 *
 * @param id - ingestion id
 * @param supabase - client used for the lookup and the cache write
 * @param userId - the row must have this `user_id`
 * @param llmSettings - optional provider/model preference passed to
 *        `SDKService.resolveChatProvider`
 * @returns the summary text, or `null` when the ingestion is still
 *          pending/processing, the SDK is unavailable, there are no non-empty
 *          extracted fields, the model returns blank text, or the LLM call fails
 * @throws Error when the ingestion row cannot be loaded
 */
static async summarize(
  id: string,
  supabase: SupabaseClient,
  userId: string,
  llmSettings: { llm_provider?: string; llm_model?: string } = {}
): Promise<string | null> {
  const { data: ing } = await supabase
    .from("ingestions")
    .select("id, filename, extracted, summary, status")
    .eq("id", id)
    .eq("user_id", userId)
    .single();

  if (!ing) throw new Error("Ingestion not found");

  // Return cached summary if available
  if (ing.summary) return ing.summary as string;

  // Cannot summarise documents that haven't been processed yet
  if (ing.status === "pending" || ing.status === "processing") return null;

  const sdk = SDKService.getSDK();
  if (!sdk) {
    logger.warn("SDK unavailable — skipping summary generation");
    return null;
  }

  // Nested objects are rendered as JSON; String() would put "[object Object]"
  // into the prompt.
  const formatScalar = (item: unknown): string =>
    typeof item === "object" && item !== null ? JSON.stringify(item) : String(item);
  const formatValue = (value: unknown): string =>
    Array.isArray(value)
      ? value.map((item) => (item == null ? "" : formatScalar(item))).join(", ")
      : formatScalar(value);

  const extracted: Record<string, unknown> = ing.extracted ?? {};
  const entityLines = Object.entries(extracted)
    .filter(([, v]) => v != null)
    .map(([k, v]) => [k, formatValue(v)] as const)
    .filter(([, text]) => text.trim() !== "")
    .map(([k, text]) => `${k}: ${text}`);

  if (entityLines.length === 0) return null;

  const { provider, model } = await SDKService.resolveChatProvider(llmSettings);

  const userPrompt =
    `Summarize this document:\nFilename: ${ing.filename}\n` +
    entityLines.join("\n");

  try {
    Actuator.logEvent(id, userId, "analysis", "Summary Generation", {
      action: "LLM request (summary generation)",
      provider,
      model,
      extracted_fields_count: entityLines.length,
      filename: ing.filename,
    }, supabase);
    const result = await sdk.llm.chat(
      [
        {
          role: "system",
          content:
            "You are a document assistant. Write a concise 2-3 sentence prose summary of a document " +
            "based on its extracted metadata. Be specific — name the issuer, amount, date, and purpose " +
            "where available. Plain prose only, no bullet points or markdown formatting."
        },
        { role: "user", content: userPrompt }
      ],
      { provider, model }
    );

    const summary: string = extractLlmResponse(result);
    Actuator.logEvent(id, userId, "analysis", "Summary Generation", {
      action: "LLM response (summary generation)",
      provider,
      model,
      raw_length: summary.length,
      raw_preview: previewLlmText(summary),
    }, supabase);

    if (!summary.trim()) return null;

    // Cache the result. A failed write only costs a regeneration on the next
    // call, so it is logged and the summary is still returned.
    const { error: cacheError } = await supabase
      .from("ingestions")
      .update({ summary })
      .eq("id", id)
      .eq("user_id", userId);

    if (cacheError) {
      logger.warn(`Summary generated for ingestion ${id} but could not be cached`, { cacheError });
    } else {
      logger.info(`Summary generated and cached for ingestion ${id}`);
    }
    return summary;
  } catch (err) {
    logger.error("Summary generation failed", { err });
    const msg = err instanceof Error ? err.message : String(err);
    Actuator.logEvent(id, userId, "error", "Summary Generation", {
      action: "LLM summary generation failed",
      provider,
      model,
      error: msg,
    }, supabase);
    return null;
  }
}
1187
+ }