@realtimex/folio 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
1
1
  import fs from "fs/promises";
2
+ import { execFile } from "child_process";
3
+ import os from "os";
4
+ import path from "path";
2
5
  import { PDFParse } from "pdf-parse";
6
+ import { promisify } from "util";
3
7
  import { createLogger } from "../utils/logger.js";
4
8
  import { PolicyLoader } from "./PolicyLoader.js";
5
9
  import { PolicyEngine } from "./PolicyEngine.js";
@@ -11,6 +15,7 @@ import { RAGService } from "./RAGService.js";
11
15
  import { SDKService } from "./SDKService.js";
12
16
  import { ModelCapabilityService } from "./ModelCapabilityService.js";
13
17
  const logger = createLogger("IngestionService");
18
+ const execFileAsync = promisify(execFile);
14
19
  /**
15
20
  * Multi-signal classifier that decides whether pdf-parse extracted enough
16
21
  * real text to skip GPU OCR and go straight to the local LLM (Fast Path).
@@ -51,6 +56,17 @@ function isPdfTextExtractable(pdfData) {
51
56
  return true;
52
57
  }
53
58
  export class IngestionService {
59
+ static FAST_EXTS = ["txt", "md", "csv", "json"];
60
+ static IMAGE_EXTS = ["png", "jpg", "jpeg", "webp"];
61
+ static IMAGE_REENCODE_TIMEOUT_MS = 15000;
62
+ static IMAGE_REENCODE_RETRY_ENABLED = (process.env.FOLIO_VLM_IMAGE_REENCODE_RETRY_ENABLED ?? "true").toLowerCase() !== "false";
63
+ static IMAGE_REENCODE_RETRY_METRICS = {
64
+ attempted: 0,
65
+ succeeded: 0,
66
+ failed: 0,
67
+ skipped_disabled: 0,
68
+ skipped_unavailable: 0,
69
+ };
54
70
  static NON_IDEMPOTENT_ACTION_TYPES = new Set([
55
71
  "append_to_google_sheet",
56
72
  "webhook",
@@ -85,10 +101,10 @@ export class IngestionService {
85
101
  return String(value);
86
102
  }
87
103
  static buildVlmSemanticText(opts) {
88
- const { filename, finalStatus, policyName, extracted, tags } = opts;
104
+ const { filename, finalStatus, policyName, extracted, tags, modality } = opts;
89
105
  const lines = [
90
106
  `Document filename: ${filename}`,
91
- "Document source: VLM image extraction",
107
+ `Document source: VLM ${modality} extraction`,
92
108
  `Processing status: ${finalStatus}`,
93
109
  ];
94
110
  if (policyName) {
@@ -134,6 +150,7 @@ export class IngestionService {
134
150
  policyName: opts.policyName,
135
151
  extracted: opts.extracted,
136
152
  tags: opts.tags,
153
+ modality: opts.modality,
137
154
  });
138
155
  const details = {
139
156
  synthetic_chars: syntheticText.length,
@@ -160,6 +177,76 @@ export class IngestionService {
160
177
  });
161
178
  return details;
162
179
  }
180
+ static buildVlmPayloadMarker(modality, dataUrl) {
181
+ const prefix = modality === "pdf" ? "VLM_PDF_DATA" : "VLM_IMAGE_DATA";
182
+ return `[${prefix}:${dataUrl}]`;
183
+ }
184
+ static async fileToDataUrl(filePath, mimeType) {
185
+ const buffer = await fs.readFile(filePath);
186
+ const base64 = buffer.toString("base64");
187
+ return `data:${mimeType};base64,${base64}`;
188
+ }
189
+ static errorToMessage(error) {
190
+ if (error instanceof Error)
191
+ return error.message;
192
+ if (typeof error === "string")
193
+ return error;
194
+ if (error && typeof error === "object") {
195
+ const candidate = error;
196
+ if (typeof candidate.message === "string")
197
+ return candidate.message;
198
+ }
199
+ return String(error ?? "");
200
+ }
201
+ static isInvalidModelError(error) {
202
+ const message = this.errorToMessage(error).toLowerCase();
203
+ return message.includes("invalid model");
204
+ }
205
+ static async reencodeImageToPngDataUrl(filePath) {
206
+ const tempOutputPath = path.join(os.tmpdir(), `folio-vlm-reencode-${Date.now()}-${Math.random().toString(16).slice(2)}.png`);
207
+ try {
208
+ await execFileAsync("sips", ["-s", "format", "png", filePath, "--out", tempOutputPath], {
209
+ timeout: this.IMAGE_REENCODE_TIMEOUT_MS,
210
+ maxBuffer: 1024 * 1024,
211
+ });
212
+ const pngBuffer = await fs.readFile(tempOutputPath);
213
+ return `data:image/png;base64,${pngBuffer.toString("base64")}`;
214
+ }
215
+ catch {
216
+ return null;
217
+ }
218
+ finally {
219
+ await fs.unlink(tempOutputPath).catch(() => undefined);
220
+ }
221
+ }
222
+ static async maybeBuildImageRetryMarker(opts) {
223
+ if (!this.isInvalidModelError(opts.error))
224
+ return null;
225
+ if (!this.IMAGE_REENCODE_RETRY_ENABLED) {
226
+ this.bumpImageReencodeRetryMetric("skipped_disabled", opts);
227
+ logger.info(`VLM ${opts.phase} retry skipped for ${opts.filename}: re-encode retry disabled (${opts.provider}/${opts.model}).`);
228
+ return null;
229
+ }
230
+ const retryDataUrl = await this.reencodeImageToPngDataUrl(opts.filePath);
231
+ if (!retryDataUrl) {
232
+ this.bumpImageReencodeRetryMetric("skipped_unavailable", opts);
233
+ logger.warn(`VLM ${opts.phase} retry skipped for ${opts.filename}: image re-encode unavailable (${opts.provider}/${opts.model}).`);
234
+ return null;
235
+ }
236
+ logger.warn(`VLM ${opts.phase} failed for ${opts.filename} with invalid model. Retrying once with re-encoded image payload (${opts.provider}/${opts.model}).`);
237
+ return this.buildVlmPayloadMarker("image", retryDataUrl);
238
+ }
239
+ static bumpImageReencodeRetryMetric(outcome, meta) {
240
+ this.IMAGE_REENCODE_RETRY_METRICS[outcome] += 1;
241
+ logger.info("VLM image re-encode retry metric", {
242
+ outcome,
243
+ phase: meta.phase,
244
+ provider: meta.provider,
245
+ model: meta.model,
246
+ filename: meta.filename,
247
+ counters: { ...this.IMAGE_REENCODE_RETRY_METRICS },
248
+ });
249
+ }
163
250
  /**
164
251
  * Ingest a document using Hybrid Routing Architecture.
165
252
  */
@@ -217,32 +304,31 @@ export class IngestionService {
217
304
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Ingestion started", source, filename, fileSize, is_high_intent: true }, supabase);
218
305
  // 2. Document Triage
219
306
  let isFastPath = false;
220
- let isVlmFastPath = false;
307
+ let isMultimodalFastPath = false;
308
+ let multimodalModality = null;
221
309
  let extractionContent = content;
222
310
  const ext = filename.toLowerCase().split('.').pop() || '';
223
- const fastExts = ['txt', 'md', 'csv', 'json'];
224
- const imageExts = ['png', 'jpg', 'jpeg', 'webp'];
225
311
  // Pre-fetch settings to decide whether we should attempt VLM.
226
312
  const { data: triageSettingsRow } = await supabase
227
313
  .from("user_settings")
228
314
  .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
229
315
  .eq("user_id", userId)
230
316
  .maybeSingle();
231
- const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
232
- const llmModel = visionResolution.model;
233
- const llmProvider = visionResolution.provider;
234
- if (fastExts.includes(ext)) {
317
+ const imageResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "image");
318
+ const pdfResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "pdf");
319
+ const llmModel = imageResolution.model;
320
+ const llmProvider = imageResolution.provider;
321
+ if (this.FAST_EXTS.includes(ext)) {
235
322
  isFastPath = true;
236
323
  }
237
- else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
324
+ else if (this.IMAGE_EXTS.includes(ext) && imageResolution.shouldAttempt) {
238
325
  try {
239
- const buffer = await fs.readFile(filePath);
240
- const base64 = buffer.toString('base64');
241
326
  const mimeTypeActual = mimeType || `image/${ext === 'jpg' ? 'jpeg' : ext}`;
242
- // Special marker for PolicyEngine
243
- extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
327
+ const dataUrl = await this.fileToDataUrl(filePath, mimeTypeActual);
328
+ extractionContent = this.buildVlmPayloadMarker("image", dataUrl);
244
329
  isFastPath = true;
245
- isVlmFastPath = true;
330
+ isMultimodalFastPath = true;
331
+ multimodalModality = "image";
246
332
  logger.info(`Smart Triage: Image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
247
333
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
248
334
  }
@@ -250,7 +336,7 @@ export class IngestionService {
250
336
  logger.warn(`Failed to read VLM image ${filename}. Routing to Heavy Path.`, { err });
251
337
  }
252
338
  }
253
- else if (imageExts.includes(ext)) {
339
+ else if (this.IMAGE_EXTS.includes(ext)) {
254
340
  logger.info(`Smart Triage: Image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
255
341
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
256
342
  action: "VLM skipped (model marked unsupported)",
@@ -270,9 +356,30 @@ export class IngestionService {
270
356
  logger.info(`Smart Triage: PDF ${filename} passed text quality check (${pdfData.pages.filter(p => p.text.trim().length > 30).length}/${pdfData.total} pages with text). Routing to Fast Path.`);
271
357
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage passed", type: "pdf", fast_path: true }, supabase);
272
358
  }
359
+ else if (pdfResolution.shouldAttempt) {
360
+ // Reuse the already-loaded parse buffer; avoid a second readFile in fileToDataUrl.
361
+ const dataUrl = `data:application/pdf;base64,${buffer.toString("base64")}`;
362
+ extractionContent = this.buildVlmPayloadMarker("pdf", dataUrl);
363
+ isFastPath = true;
364
+ isMultimodalFastPath = true;
365
+ multimodalModality = "pdf";
366
+ logger.info(`Smart Triage: PDF ${filename} routed to multimodal Fast Path using native VLM (${llmModel}).`);
367
+ Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
368
+ action: "VLM Fast Path selected",
369
+ type: "pdf",
370
+ modality: "pdf",
371
+ model: llmModel,
372
+ }, supabase);
373
+ }
273
374
  else {
274
- logger.info(`Smart Triage: PDF ${filename} failed text quality check. Routing to Heavy Path.`);
275
- Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage failed", type: "pdf", fast_path: false }, supabase);
375
+ logger.info(`Smart Triage: PDF ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked PDF-unsupported.`);
376
+ Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
377
+ action: "VLM skipped (model marked unsupported)",
378
+ type: "pdf",
379
+ modality: "pdf",
380
+ model: llmModel,
381
+ provider: llmProvider,
382
+ }, supabase);
276
383
  }
277
384
  }
278
385
  catch (err) {
@@ -296,117 +403,184 @@ export class IngestionService {
296
403
  embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
297
404
  embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
298
405
  };
299
- const doc = { filePath: filePath, text: extractionContent, ingestionId: ingestion.id, userId, supabase };
300
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
301
- const baselineTrace = [];
302
- // Fire and forget Semantic Embedding Storage
303
- RAGService.chunkAndEmbed(ingestion.id, userId, doc.text, supabase, embedSettings).catch(err => {
304
- logger.error(`RAG embedding failed for ${ingestion.id}`, err);
305
- });
306
- // 4. Stage 1: Baseline extraction (always runs, LLM call 1 of max 2)
307
- baselineTrace.push({
308
- timestamp: new Date().toISOString(),
309
- step: "LLM request (baseline extraction)",
310
- details: {
311
- provider: llmSettings.llm_provider ?? llmProvider,
312
- model: llmSettings.llm_model ?? llmModel,
313
- mode: isVlmFastPath ? "vision" : "text",
406
+ const resolvedProvider = llmSettings.llm_provider ?? llmProvider;
407
+ const resolvedModel = llmSettings.llm_model ?? llmModel;
408
+ const runFastPathAttempt = async (attemptContent, attemptType) => {
409
+ const doc = { filePath: filePath, text: attemptContent, ingestionId: ingestion.id, userId, supabase };
410
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
411
+ const baselineTrace = [];
412
+ // Fire and forget Semantic Embedding Storage
413
+ RAGService.chunkAndEmbed(ingestion.id, userId, doc.text, supabase, embedSettings).catch(err => {
414
+ logger.error(`RAG embedding failed for ${ingestion.id}`, err);
415
+ });
416
+ // 4. Stage 1: Baseline extraction (always runs, LLM call 1 of max 2)
417
+ baselineTrace.push({
418
+ timestamp: new Date().toISOString(),
419
+ step: "LLM request (baseline extraction)",
420
+ details: {
421
+ provider: resolvedProvider,
422
+ model: resolvedModel,
423
+ mode: isMultimodalFastPath
424
+ ? `vision:${multimodalModality ?? "image"}${attemptType === "reencoded_image_retry" ? ":reencoded" : ""}`
425
+ : "text",
426
+ }
427
+ });
428
+ const baselineResult = await PolicyEngine.extractBaseline(doc, { context: baselineConfig?.context, fields: baselineConfig?.fields }, llmSettings);
429
+ const baselineEntities = baselineResult.entities;
430
+ const autoTags = baselineResult.tags;
431
+ baselineTrace.push({
432
+ timestamp: new Date().toISOString(),
433
+ step: "LLM response (baseline extraction)",
434
+ details: {
435
+ entities_count: Object.keys(baselineEntities).length,
436
+ uncertain_count: baselineResult.uncertain_fields.length,
437
+ tags_count: autoTags.length,
438
+ }
439
+ });
440
+ // Enrich the document with extracted entities so policy keyword/semantic
441
+ // conditions can match against semantic field values (e.g. document_type:
442
+ // "invoice") even when those exact words don't appear in the raw text.
443
+ const entityLines = Object.entries(baselineEntities)
444
+ .filter(([, v]) => v != null)
445
+ .map(([k, v]) => `${k}: ${Array.isArray(v) ? v.join(", ") : String(v)}`);
446
+ const enrichedDoc = entityLines.length > 0
447
+ ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
448
+ : doc;
449
+ // 5. Stage 2: Policy matching + policy-specific field extraction
450
+ let result;
451
+ if (userPolicies.length > 0) {
452
+ result = await PolicyEngine.processWithPolicies(enrichedDoc, userPolicies, llmSettings, baselineEntities);
314
453
  }
315
- });
316
- const baselineResult = await PolicyEngine.extractBaseline(doc, { context: baselineConfig?.context, fields: baselineConfig?.fields }, llmSettings);
317
- const baselineEntities = baselineResult.entities;
318
- const autoTags = baselineResult.tags;
319
- baselineTrace.push({
320
- timestamp: new Date().toISOString(),
321
- step: "LLM response (baseline extraction)",
322
- details: {
323
- entities_count: Object.keys(baselineEntities).length,
324
- uncertain_count: baselineResult.uncertain_fields.length,
325
- tags_count: autoTags.length,
454
+ else {
455
+ result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
326
456
  }
327
- });
328
- // Enrich the document with extracted entities so policy keyword/semantic
329
- // conditions can match against semantic field values (e.g. document_type:
330
- // "invoice") even when those exact words don't appear in the raw text.
331
- const entityLines = Object.entries(baselineEntities)
332
- .filter(([, v]) => v != null)
333
- .map(([k, v]) => `${k}: ${Array.isArray(v) ? v.join(", ") : String(v)}`);
334
- const enrichedDoc = entityLines.length > 0
335
- ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
336
- : doc;
337
- // 5. Stage 2: Policy matching + policy-specific field extraction
338
- let result;
339
- if (userPolicies.length > 0) {
340
- result = await PolicyEngine.processWithPolicies(enrichedDoc, userPolicies, llmSettings, baselineEntities);
457
+ const policyName = userPolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name;
458
+ const finalStatus = result.status === "fallback" ? "no_match" : result.status;
459
+ // Merge: baseline entities are the foundation; policy-specific fields
460
+ // are overlaid on top so more precise extractions take precedence.
461
+ const mergedExtracted = { ...baselineEntities, ...result.extractedData };
462
+ let finalTrace = [...baselineTrace, ...(result.trace || [])];
463
+ const { data: updatedIngestion } = await supabase
464
+ .from("ingestions")
465
+ .update({
466
+ status: finalStatus,
467
+ policy_id: result.matchedPolicy,
468
+ policy_name: policyName,
469
+ extracted: mergedExtracted,
470
+ actions_taken: result.actionsExecuted,
471
+ trace: finalTrace,
472
+ tags: autoTags,
473
+ baseline_config_id: baselineConfig?.id ?? null,
474
+ })
475
+ .eq("id", ingestion.id)
476
+ .select()
477
+ .single();
478
+ if (isMultimodalFastPath && multimodalModality) {
479
+ const embeddingMeta = this.queueVlmSemanticEmbedding({
480
+ ingestionId: ingestion.id,
481
+ userId,
482
+ filename,
483
+ finalStatus,
484
+ policyName,
485
+ extracted: mergedExtracted,
486
+ tags: autoTags,
487
+ modality: multimodalModality,
488
+ supabase,
489
+ embedSettings,
490
+ });
491
+ finalTrace = [
492
+ ...finalTrace,
493
+ {
494
+ timestamp: new Date().toISOString(),
495
+ step: "Queued synthetic VLM embedding",
496
+ details: embeddingMeta,
497
+ }
498
+ ];
499
+ await supabase
500
+ .from("ingestions")
501
+ .update({ trace: finalTrace })
502
+ .eq("id", ingestion.id);
503
+ }
504
+ if (isMultimodalFastPath && multimodalModality) {
505
+ await ModelCapabilityService.learnVisionSuccess({
506
+ supabase,
507
+ userId,
508
+ provider: resolvedProvider,
509
+ model: resolvedModel,
510
+ modality: multimodalModality,
511
+ });
512
+ }
513
+ return updatedIngestion;
514
+ };
515
+ let terminalError = null;
516
+ try {
517
+ return await runFastPathAttempt(extractionContent, "primary");
341
518
  }
342
- else {
343
- result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
519
+ catch (primaryErr) {
520
+ terminalError = primaryErr;
344
521
  }
345
- const policyName = userPolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name;
346
- const finalStatus = result.status === "fallback" ? "no_match" : result.status;
347
- // Merge: baseline entities are the foundation; policy-specific fields
348
- // are overlaid on top so more precise extractions take precedence.
349
- const mergedExtracted = { ...baselineEntities, ...result.extractedData };
350
- let finalTrace = [...baselineTrace, ...(result.trace || [])];
351
- const { data: updatedIngestion } = await supabase
352
- .from("ingestions")
353
- .update({
354
- status: finalStatus,
355
- policy_id: result.matchedPolicy,
356
- policy_name: policyName,
357
- extracted: mergedExtracted,
358
- actions_taken: result.actionsExecuted,
359
- trace: finalTrace,
360
- tags: autoTags,
361
- baseline_config_id: baselineConfig?.id ?? null,
362
- })
363
- .eq("id", ingestion.id)
364
- .select()
365
- .single();
366
- if (isVlmFastPath) {
367
- const embeddingMeta = this.queueVlmSemanticEmbedding({
368
- ingestionId: ingestion.id,
369
- userId,
522
+ if (isMultimodalFastPath && multimodalModality === "image") {
523
+ const retryMarker = await this.maybeBuildImageRetryMarker({
524
+ error: terminalError,
525
+ filePath,
370
526
  filename,
371
- finalStatus,
372
- policyName,
373
- extracted: mergedExtracted,
374
- tags: autoTags,
375
- supabase,
376
- embedSettings,
527
+ provider: resolvedProvider,
528
+ model: resolvedModel,
529
+ phase: "ingest",
377
530
  });
378
- finalTrace = [
379
- ...finalTrace,
380
- {
381
- timestamp: new Date().toISOString(),
382
- step: "Queued synthetic VLM embedding",
383
- details: embeddingMeta,
531
+ if (retryMarker) {
532
+ this.bumpImageReencodeRetryMetric("attempted", {
533
+ phase: "ingest",
534
+ provider: resolvedProvider,
535
+ model: resolvedModel,
536
+ filename,
537
+ });
538
+ Actuator.logEvent(ingestion.id, userId, "info", "Processing", {
539
+ action: "Retrying VLM with re-encoded image payload",
540
+ provider: resolvedProvider,
541
+ model: resolvedModel,
542
+ }, supabase);
543
+ try {
544
+ const retryResult = await runFastPathAttempt(retryMarker, "reencoded_image_retry");
545
+ this.bumpImageReencodeRetryMetric("succeeded", {
546
+ phase: "ingest",
547
+ provider: resolvedProvider,
548
+ model: resolvedModel,
549
+ filename,
550
+ });
551
+ Actuator.logEvent(ingestion.id, userId, "analysis", "Processing", {
552
+ action: "VLM re-encoded image retry succeeded",
553
+ provider: resolvedProvider,
554
+ model: resolvedModel,
555
+ }, supabase);
556
+ return retryResult;
384
557
  }
385
- ];
386
- await supabase
387
- .from("ingestions")
388
- .update({ trace: finalTrace })
389
- .eq("id", ingestion.id);
390
- }
391
- if (isVlmFastPath) {
392
- await ModelCapabilityService.learnVisionSuccess({
393
- supabase,
394
- userId,
395
- provider: llmSettings.llm_provider ?? llmProvider,
396
- model: llmSettings.llm_model ?? llmModel,
397
- });
558
+ catch (retryErr) {
559
+ this.bumpImageReencodeRetryMetric("failed", {
560
+ phase: "ingest",
561
+ provider: resolvedProvider,
562
+ model: resolvedModel,
563
+ filename,
564
+ });
565
+ Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
566
+ action: "VLM re-encoded image retry failed",
567
+ provider: resolvedProvider,
568
+ model: resolvedModel,
569
+ error: this.errorToMessage(retryErr),
570
+ }, supabase);
571
+ terminalError = retryErr;
572
+ }
573
+ }
398
574
  }
399
- return updatedIngestion;
400
- }
401
- catch (err) {
402
- const msg = err instanceof Error ? err.message : String(err);
403
- if (isVlmFastPath) {
575
+ const msg = this.errorToMessage(terminalError);
576
+ if (isMultimodalFastPath && multimodalModality) {
404
577
  const learnedState = await ModelCapabilityService.learnVisionFailure({
405
578
  supabase,
406
579
  userId,
407
- provider: llmProvider,
408
- model: llmModel,
409
- error: err,
580
+ provider: resolvedProvider,
581
+ model: resolvedModel,
582
+ error: terminalError,
583
+ modality: multimodalModality,
410
584
  });
411
585
  logger.warn(`VLM extraction failed for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
412
586
  Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
@@ -428,6 +602,17 @@ export class IngestionService {
428
602
  return updatedIngestion;
429
603
  }
430
604
  }
605
+ catch (err) {
606
+ const msg = this.errorToMessage(err);
607
+ Actuator.logEvent(ingestion.id, userId, "error", "Processing", { error: msg }, supabase);
608
+ const { data: updatedIngestion } = await supabase
609
+ .from("ingestions")
610
+ .update({ status: "error", error_message: msg })
611
+ .eq("id", ingestion.id)
612
+ .select()
613
+ .single();
614
+ return updatedIngestion;
615
+ }
431
616
  }
432
617
  // 4. Heavy Path (Delegate to RealTimeX)
433
618
  const { error: rtxErr } = await supabase
@@ -477,31 +662,31 @@ export class IngestionService {
477
662
  if (!filePath)
478
663
  throw new Error("No storage path found for this ingestion");
479
664
  let isFastPath = false;
480
- let isVlmFastPath = false;
665
+ let isMultimodalFastPath = false;
666
+ let multimodalModality = null;
481
667
  let extractionContent = "";
482
668
  const ext = filename.toLowerCase().split('.').pop() || '';
483
- const fastExts = ['txt', 'md', 'csv', 'json'];
484
- const imageExts = ['png', 'jpg', 'jpeg', 'webp'];
485
669
  const { data: triageSettingsRow } = await supabase
486
670
  .from("user_settings")
487
671
  .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
488
672
  .eq("user_id", userId)
489
673
  .maybeSingle();
490
- const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
491
- const llmModel = visionResolution.model;
492
- const llmProvider = visionResolution.provider;
493
- if (fastExts.includes(ext)) {
674
+ const imageResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "image");
675
+ const pdfResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "pdf");
676
+ const llmModel = imageResolution.model;
677
+ const llmProvider = imageResolution.provider;
678
+ if (this.FAST_EXTS.includes(ext)) {
494
679
  isFastPath = true;
495
680
  extractionContent = await fs.readFile(filePath, "utf-8");
496
681
  }
497
- else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
682
+ else if (this.IMAGE_EXTS.includes(ext) && imageResolution.shouldAttempt) {
498
683
  try {
499
- const buffer = await fs.readFile(filePath);
500
- const base64 = buffer.toString('base64');
501
684
  const mimeTypeActual = `image/${ext === 'jpg' ? 'jpeg' : ext}`;
502
- extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
685
+ const dataUrl = await this.fileToDataUrl(filePath, mimeTypeActual);
686
+ extractionContent = this.buildVlmPayloadMarker("image", dataUrl);
503
687
  isFastPath = true;
504
- isVlmFastPath = true;
688
+ isMultimodalFastPath = true;
689
+ multimodalModality = "image";
505
690
  logger.info(`Smart Triage: Re-run image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
506
691
  Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
507
692
  }
@@ -509,7 +694,7 @@ export class IngestionService {
509
694
  logger.warn(`Failed to read VLM image ${filename} during rerun. Routing to Heavy Path.`, { err });
510
695
  }
511
696
  }
512
- else if (imageExts.includes(ext)) {
697
+ else if (this.IMAGE_EXTS.includes(ext)) {
513
698
  logger.info(`Smart Triage: Re-run image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
514
699
  Actuator.logEvent(ingestionId, userId, "info", "Triage", {
515
700
  action: "VLM skipped (model marked unsupported)",
@@ -527,10 +712,34 @@ export class IngestionService {
527
712
  isFastPath = true;
528
713
  extractionContent = pdfData.text;
529
714
  }
530
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
715
+ else if (pdfResolution.shouldAttempt) {
716
+ // Reuse the already-loaded parse buffer; avoid a second readFile in fileToDataUrl.
717
+ const dataUrl = `data:application/pdf;base64,${buffer.toString("base64")}`;
718
+ extractionContent = this.buildVlmPayloadMarker("pdf", dataUrl);
719
+ isFastPath = true;
720
+ isMultimodalFastPath = true;
721
+ multimodalModality = "pdf";
722
+ logger.info(`Smart Triage: Re-run PDF ${filename} routed to multimodal Fast Path using native VLM (${llmModel}).`);
723
+ Actuator.logEvent(ingestionId, userId, "info", "Triage", {
724
+ action: "VLM Fast Path selected",
725
+ type: "pdf",
726
+ modality: "pdf",
727
+ model: llmModel,
728
+ }, supabase);
729
+ }
730
+ else {
731
+ logger.info(`Smart Triage: Re-run PDF ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked PDF-unsupported.`);
732
+ Actuator.logEvent(ingestionId, userId, "info", "Triage", {
733
+ action: "VLM skipped (model marked unsupported)",
734
+ type: "pdf",
735
+ modality: "pdf",
736
+ model: llmModel,
737
+ provider: llmProvider
738
+ }, supabase);
739
+ }
531
740
  }
532
741
  catch (err) {
533
- // ignore
742
+ logger.warn(`Failed to parse PDF ${filename} during rerun. Routing to Heavy Path.`, { err });
534
743
  }
535
744
  }
536
745
  if (isFastPath) {
@@ -547,44 +756,47 @@ export class IngestionService {
547
756
  embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
548
757
  embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
549
758
  };
550
- const doc = { filePath, text: extractionContent, ingestionId, userId, supabase };
551
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
552
- const baselineTrace = [];
553
- // Fire and forget Semantic Embedding Storage for re-runs
554
- RAGService.chunkAndEmbed(ingestionId, userId, doc.text, supabase, embedSettings).catch(err => {
555
- logger.error(`RAG embedding failed during rerun for ${ingestionId}`, err);
556
- });
557
- baselineTrace.push({
558
- timestamp: new Date().toISOString(),
559
- step: "LLM request (baseline extraction)",
560
- details: {
561
- provider: llmSettings.llm_provider ?? llmProvider,
562
- model: llmSettings.llm_model ?? llmModel,
563
- mode: isVlmFastPath ? "vision" : "text",
564
- }
565
- });
566
- const baselineResult = await PolicyEngine.extractBaseline(doc, { context: baselineConfig?.context, fields: baselineConfig?.fields }, llmSettings);
567
- const baselineEntities = baselineResult.entities;
568
- const autoTags = baselineResult.tags;
569
- baselineTrace.push({
570
- timestamp: new Date().toISOString(),
571
- step: "LLM response (baseline extraction)",
572
- details: {
573
- entities_count: Object.keys(baselineEntities).length,
574
- uncertain_count: baselineResult.uncertain_fields.length,
575
- tags_count: autoTags.length,
576
- }
577
- });
578
- const entityLines = Object.entries(baselineEntities)
579
- .filter(([, v]) => v != null)
580
- .map(([k, v]) => `${k}: ${Array.isArray(v) ? v.join(", ") : String(v)}`);
581
- const enrichedDoc = entityLines.length > 0
582
- ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
583
- : doc;
584
- let finalStatus = "no_match";
585
- let result;
586
- let policyName;
587
- try {
759
+ const resolvedProvider = llmSettings.llm_provider ?? llmProvider;
760
+ const resolvedModel = llmSettings.llm_model ?? llmModel;
761
+ const runFastPathAttempt = async (attemptContent, attemptType) => {
762
+ const doc = { filePath, text: attemptContent, ingestionId, userId, supabase };
763
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
764
+ const baselineTrace = [];
765
+ // Fire and forget Semantic Embedding Storage for re-runs
766
+ RAGService.chunkAndEmbed(ingestionId, userId, doc.text, supabase, embedSettings).catch(err => {
767
+ logger.error(`RAG embedding failed during rerun for ${ingestionId}`, err);
768
+ });
769
+ baselineTrace.push({
770
+ timestamp: new Date().toISOString(),
771
+ step: "LLM request (baseline extraction)",
772
+ details: {
773
+ provider: resolvedProvider,
774
+ model: resolvedModel,
775
+ mode: isMultimodalFastPath
776
+ ? `vision:${multimodalModality ?? "image"}${attemptType === "reencoded_image_retry" ? ":reencoded" : ""}`
777
+ : "text",
778
+ }
779
+ });
780
+ const baselineResult = await PolicyEngine.extractBaseline(doc, { context: baselineConfig?.context, fields: baselineConfig?.fields }, llmSettings);
781
+ const baselineEntities = baselineResult.entities;
782
+ const autoTags = baselineResult.tags;
783
+ baselineTrace.push({
784
+ timestamp: new Date().toISOString(),
785
+ step: "LLM response (baseline extraction)",
786
+ details: {
787
+ entities_count: Object.keys(baselineEntities).length,
788
+ uncertain_count: baselineResult.uncertain_fields.length,
789
+ tags_count: autoTags.length,
790
+ }
791
+ });
792
+ const entityLines = Object.entries(baselineEntities)
793
+ .filter(([, v]) => v != null)
794
+ .map(([k, v]) => `${k}: ${Array.isArray(v) ? v.join(", ") : String(v)}`);
795
+ const enrichedDoc = entityLines.length > 0
796
+ ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
797
+ : doc;
798
+ let finalStatus = "no_match";
799
+ let result;
588
800
  const forcedPolicyId = opts.forcedPolicyId?.trim();
589
801
  const activePolicies = forcedPolicyId
590
802
  ? userPolicies.filter((policy) => policy.metadata.id === forcedPolicyId)
@@ -601,7 +813,7 @@ export class IngestionService {
601
813
  else {
602
814
  result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
603
815
  }
604
- policyName = result.matchedPolicy ? activePolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name : undefined;
816
+ const policyName = result.matchedPolicy ? activePolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name : undefined;
605
817
  finalStatus = result.status === "fallback" ? "no_match" : result.status;
606
818
  const mergedExtracted = { ...baselineEntities, ...result.extractedData };
607
819
  // Preserve any human-added tags; merge with freshly generated auto-tags.
@@ -626,7 +838,7 @@ export class IngestionService {
626
838
  baseline_config_id: baselineConfig?.id ?? null,
627
839
  })
628
840
  .eq("id", ingestionId);
629
- if (isVlmFastPath) {
841
+ if (isMultimodalFastPath && multimodalModality) {
630
842
  const embeddingMeta = this.queueVlmSemanticEmbedding({
631
843
  ingestionId,
632
844
  userId,
@@ -635,6 +847,7 @@ export class IngestionService {
635
847
  policyName,
636
848
  extracted: mergedExtracted,
637
849
  tags: mergedTags,
850
+ modality: multimodalModality,
638
851
  supabase,
639
852
  embedSettings,
640
853
  });
@@ -651,38 +864,98 @@ export class IngestionService {
651
864
  .update({ trace: rerunTrace })
652
865
  .eq("id", ingestionId);
653
866
  }
654
- if (isVlmFastPath) {
867
+ if (isMultimodalFastPath && multimodalModality) {
655
868
  await ModelCapabilityService.learnVisionSuccess({
656
869
  supabase,
657
870
  userId,
658
- provider: llmSettings.llm_provider ?? llmProvider,
659
- model: llmSettings.llm_model ?? llmModel,
871
+ provider: resolvedProvider,
872
+ model: resolvedModel,
873
+ modality: multimodalModality,
660
874
  });
661
875
  }
662
876
  return finalStatus === "matched";
877
+ };
878
+ let terminalError = null;
879
+ try {
880
+ return await runFastPathAttempt(extractionContent, "primary");
663
881
  }
664
- catch (err) {
665
- const msg = err instanceof Error ? err.message : String(err);
666
- if (isVlmFastPath) {
667
- const learnedState = await ModelCapabilityService.learnVisionFailure({
668
- supabase,
669
- userId,
670
- provider: llmProvider,
671
- model: llmModel,
672
- error: err,
882
+ catch (primaryErr) {
883
+ terminalError = primaryErr;
884
+ }
885
+ if (isMultimodalFastPath && multimodalModality === "image") {
886
+ const retryMarker = await this.maybeBuildImageRetryMarker({
887
+ error: terminalError,
888
+ filePath,
889
+ filename,
890
+ provider: resolvedProvider,
891
+ model: resolvedModel,
892
+ phase: "rerun",
893
+ });
894
+ if (retryMarker) {
895
+ this.bumpImageReencodeRetryMetric("attempted", {
896
+ phase: "rerun",
897
+ provider: resolvedProvider,
898
+ model: resolvedModel,
899
+ filename,
673
900
  });
674
- logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
675
- Actuator.logEvent(ingestionId, userId, "error", "Processing", {
676
- action: "VLM Failed, Fallback to Heavy",
677
- error: msg,
678
- learned_state: learnedState,
901
+ Actuator.logEvent(ingestionId, userId, "info", "Processing", {
902
+ action: "Retrying VLM with re-encoded image payload",
903
+ provider: resolvedProvider,
904
+ model: resolvedModel,
679
905
  }, supabase);
680
- isFastPath = false; // Trigger heavy path fallthrough
681
- }
682
- else {
683
- throw err; // Re-throw to caller
906
+ try {
907
+ const retryResult = await runFastPathAttempt(retryMarker, "reencoded_image_retry");
908
+ this.bumpImageReencodeRetryMetric("succeeded", {
909
+ phase: "rerun",
910
+ provider: resolvedProvider,
911
+ model: resolvedModel,
912
+ filename,
913
+ });
914
+ Actuator.logEvent(ingestionId, userId, "analysis", "Processing", {
915
+ action: "VLM re-encoded image retry succeeded",
916
+ provider: resolvedProvider,
917
+ model: resolvedModel,
918
+ }, supabase);
919
+ return retryResult;
920
+ }
921
+ catch (retryErr) {
922
+ this.bumpImageReencodeRetryMetric("failed", {
923
+ phase: "rerun",
924
+ provider: resolvedProvider,
925
+ model: resolvedModel,
926
+ filename,
927
+ });
928
+ Actuator.logEvent(ingestionId, userId, "error", "Processing", {
929
+ action: "VLM re-encoded image retry failed",
930
+ provider: resolvedProvider,
931
+ model: resolvedModel,
932
+ error: this.errorToMessage(retryErr),
933
+ }, supabase);
934
+ terminalError = retryErr;
935
+ }
684
936
  }
685
937
  }
938
+ const msg = this.errorToMessage(terminalError);
939
+ if (isMultimodalFastPath && multimodalModality) {
940
+ const learnedState = await ModelCapabilityService.learnVisionFailure({
941
+ supabase,
942
+ userId,
943
+ provider: resolvedProvider,
944
+ model: resolvedModel,
945
+ error: terminalError,
946
+ modality: multimodalModality,
947
+ });
948
+ logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
949
+ Actuator.logEvent(ingestionId, userId, "error", "Processing", {
950
+ action: "VLM Failed, Fallback to Heavy",
951
+ error: msg,
952
+ learned_state: learnedState,
953
+ }, supabase);
954
+ isFastPath = false; // Trigger heavy path fallthrough
955
+ }
956
+ else {
957
+ throw terminalError instanceof Error ? terminalError : new Error(msg); // Re-throw to caller
958
+ }
686
959
  }
687
960
  // Re-delegate to rtx_activities
688
961
  await supabase