@realtimex/folio 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,10 @@
1
1
  import type { SupabaseClient } from "@supabase/supabase-js";
2
2
  import fs from "fs/promises";
3
+ import { execFile } from "child_process";
4
+ import os from "os";
5
+ import path from "path";
3
6
  import { PDFParse } from "pdf-parse";
7
+ import { promisify } from "util";
4
8
  import { createLogger } from "../utils/logger.js";
5
9
  import { PolicyLoader } from "./PolicyLoader.js";
6
10
  import type { FolioPolicy } from "./PolicyLoader.js";
@@ -11,9 +15,10 @@ import { Actuator } from "../utils/Actuator.js";
11
15
  import { extractLlmResponse, previewLlmText } from "../utils/llmResponse.js";
12
16
  import { RAGService } from "./RAGService.js";
13
17
  import { SDKService } from "./SDKService.js";
14
- import { ModelCapabilityService } from "./ModelCapabilityService.js";
18
+ import { ModelCapabilityService, type VisionCapabilityModality } from "./ModelCapabilityService.js";
15
19
 
16
20
  const logger = createLogger("IngestionService");
21
+ const execFileAsync = promisify(execFile);
17
22
 
18
23
  /**
19
24
  * Multi-signal classifier that decides whether pdf-parse extracted enough
@@ -89,6 +94,18 @@ export interface Ingestion {
89
94
  }
90
95
 
91
96
  export class IngestionService {
97
/** File extensions whose content is routed straight to the Fast Path. */
private static readonly FAST_EXTS = ["txt", "md", "csv", "json"] as const;

/** Image extensions eligible for the native VLM Fast Path. */
private static readonly IMAGE_EXTS = ["png", "jpg", "jpeg", "webp"] as const;

/** Upper bound, in milliseconds, for a single image re-encode subprocess. */
private static readonly IMAGE_REENCODE_TIMEOUT_MS = 15_000;

/**
 * Feature flag for the "re-encode and retry" recovery step. It is on unless
 * FOLIO_VLM_IMAGE_REENCODE_RETRY_ENABLED is set to "false" (case-insensitive);
 * an unset or empty variable leaves it enabled.
 */
private static readonly IMAGE_REENCODE_RETRY_ENABLED =
  process.env.FOLIO_VLM_IMAGE_REENCODE_RETRY_ENABLED?.toLowerCase() !== "false";

/** In-process counters for re-encode retry outcomes; mutated by bumpImageReencodeRetryMetric. */
private static readonly IMAGE_REENCODE_RETRY_METRICS = {
  attempted: 0,
  succeeded: 0,
  failed: 0,
  skipped_disabled: 0,
  skipped_unavailable: 0,
};
108
+
92
109
  private static readonly NON_IDEMPOTENT_ACTION_TYPES = new Set([
93
110
  "append_to_google_sheet",
94
111
  "webhook",
@@ -129,11 +146,12 @@ export class IngestionService {
129
146
  policyName?: string;
130
147
  extracted: Record<string, unknown>;
131
148
  tags: string[];
149
+ modality: VisionCapabilityModality;
132
150
  }): string {
133
- const { filename, finalStatus, policyName, extracted, tags } = opts;
151
+ const { filename, finalStatus, policyName, extracted, tags, modality } = opts;
134
152
  const lines: string[] = [
135
153
  `Document filename: ${filename}`,
136
- "Document source: VLM image extraction",
154
+ `Document source: VLM ${modality} extraction`,
137
155
  `Processing status: ${finalStatus}`,
138
156
  ];
139
157
 
@@ -186,6 +204,7 @@ export class IngestionService {
186
204
  policyName?: string;
187
205
  extracted: Record<string, unknown>;
188
206
  tags: string[];
207
+ modality: VisionCapabilityModality;
189
208
  supabase: SupabaseClient;
190
209
  embedSettings: { embedding_provider?: string; embedding_model?: string };
191
210
  }): { synthetic_chars: number; extracted_fields: number; tags_count: number } {
@@ -195,6 +214,7 @@ export class IngestionService {
195
214
  policyName: opts.policyName,
196
215
  extracted: opts.extracted,
197
216
  tags: opts.tags,
217
+ modality: opts.modality,
198
218
  });
199
219
  const details = {
200
220
  synthetic_chars: syntheticText.length,
@@ -231,6 +251,101 @@ export class IngestionService {
231
251
  return details;
232
252
  }
233
253
 
254
/**
 * Wraps a data URL in the bracketed marker string used as extraction content
 * for multimodal documents.
 *
 * @param modality "pdf" selects the VLM_PDF_DATA tag; any other modality uses VLM_IMAGE_DATA.
 * @param dataUrl  The data URL to embed in the marker.
 * @returns A string of the form `[<TAG>:<dataUrl>]`.
 */
private static buildVlmPayloadMarker(modality: VisionCapabilityModality, dataUrl: string): string {
  let tag = "VLM_IMAGE_DATA";
  if (modality === "pdf") {
    tag = "VLM_PDF_DATA";
  }
  return `[${tag}:${dataUrl}]`;
}
258
+
259
/**
 * Reads a file from disk and returns its contents as a base64 data URL.
 *
 * @param filePath Path of the file to read.
 * @param mimeType MIME type placed in the data URL header.
 * @returns `data:<mimeType>;base64,<payload>`; rejects if the file cannot be read.
 */
private static async fileToDataUrl(filePath: string, mimeType: string): Promise<string> {
  const encoded = (await fs.readFile(filePath)).toString("base64");
  return `data:${mimeType};base64,${encoded}`;
}
264
+
265
/**
 * Normalises an arbitrary thrown value into a human-readable message.
 *
 * Resolution order:
 *  1. `Error` instances yield their `message`.
 *  2. Strings are returned unchanged.
 *  3. `null` / `undefined` yield the empty string.
 *  4. Objects with a string `message` property yield that property.
 *  5. Other values use `String(value)`; when that produces the uninformative
 *     "[object Object]", the value is serialised with `JSON.stringify` instead
 *     so structured error payloads stay readable in logs and stored messages.
 *
 * @param error The caught value (may be anything).
 * @returns A message string; never throws for serialisation failures.
 */
private static errorToMessage(error: unknown): string {
  if (error instanceof Error) return error.message;
  if (typeof error === "string") return error;
  if (error == null) return "";

  if (typeof error === "object") {
    const candidate = error as Record<string, unknown>;
    if (typeof candidate.message === "string") return candidate.message;
  }

  const text = String(error);
  if (text !== "[object Object]") return text;

  try {
    // JSON.stringify returns undefined for values it cannot represent and
    // throws on circular structures; fall back to the plain text in both cases.
    const serialized: string | undefined = JSON.stringify(error);
    return typeof serialized === "string" ? serialized : text;
  } catch {
    return text;
  }
}
274
+
275
/**
 * Reports whether an error's message mentions "invalid model"
 * (case-insensitive substring match on the normalised message).
 *
 * @param error The caught value to inspect.
 */
private static isInvalidModelError(error: unknown): boolean {
  const normalized = this.errorToMessage(error).toLowerCase();
  return normalized.indexOf("invalid model") !== -1;
}
279
+
280
/**
 * Re-encodes an image to PNG with the `sips` command-line tool and returns
 * the result as a base64 data URL.
 *
 * The converted file is written into a freshly created private temporary
 * directory (via `fs.mkdtemp`) rather than a guessable file name directly in
 * the shared tmp dir, and the directory is always removed afterwards.
 *
 * NOTE(review): `sips` presumably exists only on macOS hosts; elsewhere the
 * spawn fails and this method returns null — confirm the intended platforms.
 *
 * @param filePath Path of the source image.
 * @returns `data:image/png;base64,...` on success, or null when the
 *          conversion cannot be performed (tool missing, timeout, I/O error).
 */
private static async reencodeImageToPngDataUrl(filePath: string): Promise<string | null> {
  let tempDir: string | null = null;
  try {
    tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "folio-vlm-reencode-"));
    const tempOutputPath = path.join(tempDir, "reencoded.png");

    await execFileAsync("sips", ["-s", "format", "png", filePath, "--out", tempOutputPath], {
      timeout: this.IMAGE_REENCODE_TIMEOUT_MS,
      maxBuffer: 1024 * 1024,
    });

    const pngBuffer = await fs.readFile(tempOutputPath);
    return `data:image/png;base64,${pngBuffer.toString("base64")}`;
  } catch (err) {
    // Best-effort step: callers treat null as "re-encode unavailable", but
    // record the reason so the failure is diagnosable instead of silent.
    logger.info(`Image re-encode via sips failed for ${filePath}: ${this.errorToMessage(err)}`);
    return null;
  } finally {
    if (tempDir !== null) {
      // Cleanup must never mask the primary outcome.
      await fs.rm(tempDir, { recursive: true, force: true }).catch(() => undefined);
    }
  }
}
298
+
299
/**
 * Decides whether a failed VLM image attempt should be retried with a
 * re-encoded PNG payload and, if so, builds the retry payload marker.
 *
 * Returns null when the error is not an "invalid model" error, when the
 * retry feature flag is off (counted as `skipped_disabled`), or when the
 * image could not be re-encoded (counted as `skipped_unavailable`).
 *
 * @param opts.error    The error thrown by the primary attempt.
 * @param opts.filePath Path of the image on disk.
 * @param opts.filename Name used in log messages.
 * @param opts.provider LLM provider, for logging/metrics.
 * @param opts.model    LLM model, for logging/metrics.
 * @param opts.phase    Whether this is an initial ingest or a rerun.
 * @returns An image payload marker for the retry, or null to skip retrying.
 */
private static async maybeBuildImageRetryMarker(opts: {
  error: unknown;
  filePath: string;
  filename: string;
  provider: string;
  model: string;
  phase: "ingest" | "rerun";
}): Promise<string | null> {
  if (this.isInvalidModelError(opts.error)) {
    if (this.IMAGE_REENCODE_RETRY_ENABLED) {
      const reencodedDataUrl = await this.reencodeImageToPngDataUrl(opts.filePath);
      if (reencodedDataUrl) {
        logger.warn(
          `VLM ${opts.phase} failed for ${opts.filename} with invalid model. Retrying once with re-encoded image payload (${opts.provider}/${opts.model}).`
        );
        return this.buildVlmPayloadMarker("image", reencodedDataUrl);
      }

      // Re-encoding produced nothing usable: record and give up on the retry.
      this.bumpImageReencodeRetryMetric("skipped_unavailable", opts);
      logger.warn(
        `VLM ${opts.phase} retry skipped for ${opts.filename}: image re-encode unavailable (${opts.provider}/${opts.model}).`
      );
    } else {
      // Feature flag is off: record the skip without touching the file.
      this.bumpImageReencodeRetryMetric("skipped_disabled", opts);
      logger.info(
        `VLM ${opts.phase} retry skipped for ${opts.filename}: re-encode retry disabled (${opts.provider}/${opts.model}).`
      );
    }
  }
  return null;
}
328
+
329
/**
 * Increments one re-encode retry counter and logs the event together with a
 * snapshot of all counters.
 *
 * @param outcome Which counter of IMAGE_REENCODE_RETRY_METRICS to increment.
 * @param meta    Context (phase, provider, model, filename) included in the log entry.
 */
private static bumpImageReencodeRetryMetric(
  outcome: keyof typeof IngestionService.IMAGE_REENCODE_RETRY_METRICS,
  meta: {
    phase: "ingest" | "rerun";
    provider: string;
    model: string;
    filename: string;
  }
): void {
  const counters = this.IMAGE_REENCODE_RETRY_METRICS;
  counters[outcome] = counters[outcome] + 1;

  const { phase, provider, model, filename } = meta;
  logger.info("VLM image re-encode retry metric", {
    outcome,
    phase,
    provider,
    model,
    filename,
    // Copy so the logged snapshot is not affected by later increments.
    counters: { ...counters },
  });
}
348
+
234
349
  /**
235
350
  * Ingest a document using Hybrid Routing Architecture.
236
351
  */
@@ -303,11 +418,10 @@ export class IngestionService {
303
418
 
304
419
  // 2. Document Triage
305
420
  let isFastPath = false;
306
- let isVlmFastPath = false;
421
+ let isMultimodalFastPath = false;
422
+ let multimodalModality: VisionCapabilityModality | null = null;
307
423
  let extractionContent = content;
308
424
  const ext = filename.toLowerCase().split('.').pop() || '';
309
- const fastExts = ['txt', 'md', 'csv', 'json'];
310
- const imageExts = ['png', 'jpg', 'jpeg', 'webp'];
311
425
 
312
426
  // Pre-fetch settings to decide whether we should attempt VLM.
313
427
  const { data: triageSettingsRow } = await supabase
@@ -315,27 +429,27 @@ export class IngestionService {
315
429
  .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
316
430
  .eq("user_id", userId)
317
431
  .maybeSingle();
318
- const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
319
- const llmModel = visionResolution.model;
320
- const llmProvider = visionResolution.provider;
432
+ const imageResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "image");
433
+ const pdfResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "pdf");
434
+ const llmModel = imageResolution.model;
435
+ const llmProvider = imageResolution.provider;
321
436
 
322
- if (fastExts.includes(ext)) {
437
+ if (this.FAST_EXTS.includes(ext as typeof this.FAST_EXTS[number])) {
323
438
  isFastPath = true;
324
- } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
439
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number]) && imageResolution.shouldAttempt) {
325
440
  try {
326
- const buffer = await fs.readFile(filePath);
327
- const base64 = buffer.toString('base64');
328
441
  const mimeTypeActual = mimeType || `image/${ext === 'jpg' ? 'jpeg' : ext}`;
329
- // Special marker for PolicyEngine
330
- extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
442
+ const dataUrl = await this.fileToDataUrl(filePath, mimeTypeActual);
443
+ extractionContent = this.buildVlmPayloadMarker("image", dataUrl);
331
444
  isFastPath = true;
332
- isVlmFastPath = true;
445
+ isMultimodalFastPath = true;
446
+ multimodalModality = "image";
333
447
  logger.info(`Smart Triage: Image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
334
448
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
335
449
  } catch (err) {
336
450
  logger.warn(`Failed to read VLM image ${filename}. Routing to Heavy Path.`, { err });
337
451
  }
338
- } else if (imageExts.includes(ext)) {
452
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number])) {
339
453
  logger.info(`Smart Triage: Image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
340
454
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
341
455
  action: "VLM skipped (model marked unsupported)",
@@ -353,9 +467,29 @@ export class IngestionService {
353
467
  extractionContent = pdfData.text;
354
468
  logger.info(`Smart Triage: PDF ${filename} passed text quality check (${pdfData.pages.filter(p => p.text.trim().length > 30).length}/${pdfData.total} pages with text). Routing to Fast Path.`);
355
469
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage passed", type: "pdf", fast_path: true }, supabase);
470
+ } else if (pdfResolution.shouldAttempt) {
471
+ // Reuse the already-loaded parse buffer; avoid a second readFile in fileToDataUrl.
472
+ const dataUrl = `data:application/pdf;base64,${buffer.toString("base64")}`;
473
+ extractionContent = this.buildVlmPayloadMarker("pdf", dataUrl);
474
+ isFastPath = true;
475
+ isMultimodalFastPath = true;
476
+ multimodalModality = "pdf";
477
+ logger.info(`Smart Triage: PDF ${filename} routed to multimodal Fast Path using native VLM (${llmModel}).`);
478
+ Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
479
+ action: "VLM Fast Path selected",
480
+ type: "pdf",
481
+ modality: "pdf",
482
+ model: llmModel,
483
+ }, supabase);
356
484
  } else {
357
- logger.info(`Smart Triage: PDF ${filename} failed text quality check. Routing to Heavy Path.`);
358
- Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage failed", type: "pdf", fast_path: false }, supabase);
485
+ logger.info(`Smart Triage: PDF ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked PDF-unsupported.`);
486
+ Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
487
+ action: "VLM skipped (model marked unsupported)",
488
+ type: "pdf",
489
+ modality: "pdf",
490
+ model: llmModel,
491
+ provider: llmProvider,
492
+ }, supabase);
359
493
  }
360
494
  } catch (err) {
361
495
  logger.warn(`Failed to parse PDF ${filename}. Routing to Heavy Path.`, { err });
@@ -379,132 +513,203 @@ export class IngestionService {
379
513
  embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
380
514
  embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
381
515
  };
382
- const doc = { filePath: filePath, text: extractionContent, ingestionId: ingestion.id, userId, supabase };
383
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
384
- const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];
516
+ const resolvedProvider = llmSettings.llm_provider ?? llmProvider;
517
+ const resolvedModel = llmSettings.llm_model ?? llmModel;
518
+
519
+ const runFastPathAttempt = async (
520
+ attemptContent: string,
521
+ attemptType: "primary" | "reencoded_image_retry"
522
+ ): Promise<Ingestion> => {
523
+ const doc = { filePath: filePath, text: attemptContent, ingestionId: ingestion.id, userId, supabase };
524
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
525
+ const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];
526
+
527
+ // Fire and forget Semantic Embedding Storage
528
+ RAGService.chunkAndEmbed(ingestion.id, userId, doc.text, supabase, embedSettings).catch(err => {
529
+ logger.error(`RAG embedding failed for ${ingestion.id}`, err);
530
+ });
385
531
 
386
- // Fire and forget Semantic Embedding Storage
387
- RAGService.chunkAndEmbed(ingestion.id, userId, doc.text, supabase, embedSettings).catch(err => {
388
- logger.error(`RAG embedding failed for ${ingestion.id}`, err);
389
- });
532
+ // 4. Stage 1: Baseline extraction (always runs, LLM call 1 of max 2)
533
+ baselineTrace.push({
534
+ timestamp: new Date().toISOString(),
535
+ step: "LLM request (baseline extraction)",
536
+ details: {
537
+ provider: resolvedProvider,
538
+ model: resolvedModel,
539
+ mode: isMultimodalFastPath
540
+ ? `vision:${multimodalModality ?? "image"}${attemptType === "reencoded_image_retry" ? ":reencoded" : ""}`
541
+ : "text",
542
+ }
543
+ });
390
544
 
391
- // 4. Stage 1: Baseline extraction (always runs, LLM call 1 of max 2)
392
- baselineTrace.push({
393
- timestamp: new Date().toISOString(),
394
- step: "LLM request (baseline extraction)",
395
- details: {
396
- provider: llmSettings.llm_provider ?? llmProvider,
397
- model: llmSettings.llm_model ?? llmModel,
398
- mode: isVlmFastPath ? "vision" : "text",
399
- }
400
- });
545
+ const baselineResult = await PolicyEngine.extractBaseline(
546
+ doc,
547
+ { context: baselineConfig?.context, fields: baselineConfig?.fields },
548
+ llmSettings
549
+ );
550
+ const baselineEntities = baselineResult.entities;
551
+ const autoTags = baselineResult.tags;
552
+ baselineTrace.push({
553
+ timestamp: new Date().toISOString(),
554
+ step: "LLM response (baseline extraction)",
555
+ details: {
556
+ entities_count: Object.keys(baselineEntities).length,
557
+ uncertain_count: baselineResult.uncertain_fields.length,
558
+ tags_count: autoTags.length,
559
+ }
560
+ });
401
561
 
402
- const baselineResult = await PolicyEngine.extractBaseline(
403
- doc,
404
- { context: baselineConfig?.context, fields: baselineConfig?.fields },
405
- llmSettings
406
- );
407
- const baselineEntities = baselineResult.entities;
408
- const autoTags = baselineResult.tags;
409
- baselineTrace.push({
410
- timestamp: new Date().toISOString(),
411
- step: "LLM response (baseline extraction)",
412
- details: {
413
- entities_count: Object.keys(baselineEntities).length,
414
- uncertain_count: baselineResult.uncertain_fields.length,
415
- tags_count: autoTags.length,
562
+ // Enrich the document with extracted entities so policy keyword/semantic
563
+ // conditions can match against semantic field values (e.g. document_type:
564
+ // "invoice") even when those exact words don't appear in the raw text.
565
+ const entityLines = Object.entries(baselineEntities)
566
+ .filter(([, v]) => v != null)
567
+ .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
568
+ const enrichedDoc = entityLines.length > 0
569
+ ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
570
+ : doc;
571
+
572
+ // 5. Stage 2: Policy matching + policy-specific field extraction
573
+ let result;
574
+ if (userPolicies.length > 0) {
575
+ result = await PolicyEngine.processWithPolicies(enrichedDoc, userPolicies, llmSettings, baselineEntities);
576
+ } else {
577
+ result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
416
578
  }
417
- });
418
579
 
419
- // Enrich the document with extracted entities so policy keyword/semantic
420
- // conditions can match against semantic field values (e.g. document_type:
421
- // "invoice") even when those exact words don't appear in the raw text.
422
- const entityLines = Object.entries(baselineEntities)
423
- .filter(([, v]) => v != null)
424
- .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
425
- const enrichedDoc = entityLines.length > 0
426
- ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
427
- : doc;
580
+ const policyName = userPolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name;
581
+ const finalStatus = result.status === "fallback" ? "no_match" : result.status;
428
582
 
429
- // 5. Stage 2: Policy matching + policy-specific field extraction
430
- let result;
431
- if (userPolicies.length > 0) {
432
- result = await PolicyEngine.processWithPolicies(enrichedDoc, userPolicies, llmSettings, baselineEntities);
433
- } else {
434
- result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
435
- }
583
+ // Merge: baseline entities are the foundation; policy-specific fields
584
+ // are overlaid on top so more precise extractions take precedence.
585
+ const mergedExtracted = { ...baselineEntities, ...result.extractedData };
586
+ let finalTrace = [...baselineTrace, ...(result.trace || [])];
436
587
 
437
- const policyName = userPolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name;
438
- const finalStatus = result.status === "fallback" ? "no_match" : result.status;
588
+ const { data: updatedIngestion } = await supabase
589
+ .from("ingestions")
590
+ .update({
591
+ status: finalStatus,
592
+ policy_id: result.matchedPolicy,
593
+ policy_name: policyName,
594
+ extracted: mergedExtracted,
595
+ actions_taken: result.actionsExecuted,
596
+ trace: finalTrace,
597
+ tags: autoTags,
598
+ baseline_config_id: baselineConfig?.id ?? null,
599
+ })
600
+ .eq("id", ingestion.id)
601
+ .select()
602
+ .single();
439
603
 
440
- // Merge: baseline entities are the foundation; policy-specific fields
441
- // are overlaid on top so more precise extractions take precedence.
442
- const mergedExtracted = { ...baselineEntities, ...result.extractedData };
443
- let finalTrace = [...baselineTrace, ...(result.trace || [])];
604
+ if (isMultimodalFastPath && multimodalModality) {
605
+ const embeddingMeta = this.queueVlmSemanticEmbedding({
606
+ ingestionId: ingestion.id,
607
+ userId,
608
+ filename,
609
+ finalStatus,
610
+ policyName,
611
+ extracted: mergedExtracted,
612
+ tags: autoTags,
613
+ modality: multimodalModality,
614
+ supabase,
615
+ embedSettings,
616
+ });
617
+ finalTrace = [
618
+ ...finalTrace,
619
+ {
620
+ timestamp: new Date().toISOString(),
621
+ step: "Queued synthetic VLM embedding",
622
+ details: embeddingMeta,
623
+ }
624
+ ];
625
+ await supabase
626
+ .from("ingestions")
627
+ .update({ trace: finalTrace })
628
+ .eq("id", ingestion.id);
629
+ }
444
630
 
445
- const { data: updatedIngestion } = await supabase
446
- .from("ingestions")
447
- .update({
448
- status: finalStatus,
449
- policy_id: result.matchedPolicy,
450
- policy_name: policyName,
451
- extracted: mergedExtracted,
452
- actions_taken: result.actionsExecuted,
453
- trace: finalTrace,
454
- tags: autoTags,
455
- baseline_config_id: baselineConfig?.id ?? null,
456
- })
457
- .eq("id", ingestion.id)
458
- .select()
459
- .single();
631
+ if (isMultimodalFastPath && multimodalModality) {
632
+ await ModelCapabilityService.learnVisionSuccess({
633
+ supabase,
634
+ userId,
635
+ provider: resolvedProvider,
636
+ model: resolvedModel,
637
+ modality: multimodalModality,
638
+ });
639
+ }
460
640
 
461
- if (isVlmFastPath) {
462
- const embeddingMeta = this.queueVlmSemanticEmbedding({
463
- ingestionId: ingestion.id,
464
- userId,
465
- filename,
466
- finalStatus,
467
- policyName,
468
- extracted: mergedExtracted,
469
- tags: autoTags,
470
- supabase,
471
- embedSettings,
472
- });
473
- finalTrace = [
474
- ...finalTrace,
475
- {
476
- timestamp: new Date().toISOString(),
477
- step: "Queued synthetic VLM embedding",
478
- details: embeddingMeta,
479
- }
480
- ];
481
- await supabase
482
- .from("ingestions")
483
- .update({ trace: finalTrace })
484
- .eq("id", ingestion.id);
641
+ return updatedIngestion as Ingestion;
642
+ };
643
+
644
+ let terminalError: unknown = null;
645
+ try {
646
+ return await runFastPathAttempt(extractionContent, "primary");
647
+ } catch (primaryErr) {
648
+ terminalError = primaryErr;
485
649
  }
486
650
 
487
- if (isVlmFastPath) {
488
- await ModelCapabilityService.learnVisionSuccess({
489
- supabase,
490
- userId,
491
- provider: llmSettings.llm_provider ?? llmProvider,
492
- model: llmSettings.llm_model ?? llmModel,
651
+ if (isMultimodalFastPath && multimodalModality === "image") {
652
+ const retryMarker = await this.maybeBuildImageRetryMarker({
653
+ error: terminalError,
654
+ filePath,
655
+ filename,
656
+ provider: resolvedProvider,
657
+ model: resolvedModel,
658
+ phase: "ingest",
493
659
  });
660
+ if (retryMarker) {
661
+ this.bumpImageReencodeRetryMetric("attempted", {
662
+ phase: "ingest",
663
+ provider: resolvedProvider,
664
+ model: resolvedModel,
665
+ filename,
666
+ });
667
+ Actuator.logEvent(ingestion.id, userId, "info", "Processing", {
668
+ action: "Retrying VLM with re-encoded image payload",
669
+ provider: resolvedProvider,
670
+ model: resolvedModel,
671
+ }, supabase);
672
+ try {
673
+ const retryResult = await runFastPathAttempt(retryMarker, "reencoded_image_retry");
674
+ this.bumpImageReencodeRetryMetric("succeeded", {
675
+ phase: "ingest",
676
+ provider: resolvedProvider,
677
+ model: resolvedModel,
678
+ filename,
679
+ });
680
+ Actuator.logEvent(ingestion.id, userId, "analysis", "Processing", {
681
+ action: "VLM re-encoded image retry succeeded",
682
+ provider: resolvedProvider,
683
+ model: resolvedModel,
684
+ }, supabase);
685
+ return retryResult;
686
+ } catch (retryErr) {
687
+ this.bumpImageReencodeRetryMetric("failed", {
688
+ phase: "ingest",
689
+ provider: resolvedProvider,
690
+ model: resolvedModel,
691
+ filename,
692
+ });
693
+ Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
694
+ action: "VLM re-encoded image retry failed",
695
+ provider: resolvedProvider,
696
+ model: resolvedModel,
697
+ error: this.errorToMessage(retryErr),
698
+ }, supabase);
699
+ terminalError = retryErr;
700
+ }
701
+ }
494
702
  }
495
703
 
496
- return updatedIngestion as Ingestion;
497
-
498
- } catch (err) {
499
- const msg = err instanceof Error ? err.message : String(err);
500
-
501
- if (isVlmFastPath) {
704
+ const msg = this.errorToMessage(terminalError);
705
+ if (isMultimodalFastPath && multimodalModality) {
502
706
  const learnedState = await ModelCapabilityService.learnVisionFailure({
503
707
  supabase,
504
708
  userId,
505
- provider: llmProvider,
506
- model: llmModel,
507
- error: err,
709
+ provider: resolvedProvider,
710
+ model: resolvedModel,
711
+ error: terminalError,
712
+ modality: multimodalModality,
508
713
  });
509
714
  logger.warn(`VLM extraction failed for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
510
715
  Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
@@ -524,6 +729,16 @@ export class IngestionService {
524
729
  .single();
525
730
  return updatedIngestion as Ingestion;
526
731
  }
732
+ } catch (err) {
733
+ const msg = this.errorToMessage(err);
734
+ Actuator.logEvent(ingestion.id, userId, "error", "Processing", { error: msg }, supabase);
735
+ const { data: updatedIngestion } = await supabase
736
+ .from("ingestions")
737
+ .update({ status: "error", error_message: msg })
738
+ .eq("id", ingestion.id)
739
+ .select()
740
+ .single();
741
+ return updatedIngestion as Ingestion;
527
742
  }
528
743
  }
529
744
 
@@ -587,38 +802,38 @@ export class IngestionService {
587
802
  if (!filePath) throw new Error("No storage path found for this ingestion");
588
803
 
589
804
  let isFastPath = false;
590
- let isVlmFastPath = false;
805
+ let isMultimodalFastPath = false;
806
+ let multimodalModality: VisionCapabilityModality | null = null;
591
807
  let extractionContent = "";
592
808
  const ext = filename.toLowerCase().split('.').pop() || '';
593
- const fastExts = ['txt', 'md', 'csv', 'json'];
594
- const imageExts = ['png', 'jpg', 'jpeg', 'webp'];
595
809
 
596
810
  const { data: triageSettingsRow } = await supabase
597
811
  .from("user_settings")
598
812
  .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
599
813
  .eq("user_id", userId)
600
814
  .maybeSingle();
601
- const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
602
- const llmModel = visionResolution.model;
603
- const llmProvider = visionResolution.provider;
815
+ const imageResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "image");
816
+ const pdfResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "pdf");
817
+ const llmModel = imageResolution.model;
818
+ const llmProvider = imageResolution.provider;
604
819
 
605
- if (fastExts.includes(ext)) {
820
+ if (this.FAST_EXTS.includes(ext as typeof this.FAST_EXTS[number])) {
606
821
  isFastPath = true;
607
822
  extractionContent = await fs.readFile(filePath, "utf-8");
608
- } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
823
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number]) && imageResolution.shouldAttempt) {
609
824
  try {
610
- const buffer = await fs.readFile(filePath);
611
- const base64 = buffer.toString('base64');
612
825
  const mimeTypeActual = `image/${ext === 'jpg' ? 'jpeg' : ext}`;
613
- extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
826
+ const dataUrl = await this.fileToDataUrl(filePath, mimeTypeActual);
827
+ extractionContent = this.buildVlmPayloadMarker("image", dataUrl);
614
828
  isFastPath = true;
615
- isVlmFastPath = true;
829
+ isMultimodalFastPath = true;
830
+ multimodalModality = "image";
616
831
  logger.info(`Smart Triage: Re-run image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
617
832
  Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
618
833
  } catch (err) {
619
834
  logger.warn(`Failed to read VLM image ${filename} during rerun. Routing to Heavy Path.`, { err });
620
835
  }
621
- } else if (imageExts.includes(ext)) {
836
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number])) {
622
837
  logger.info(`Smart Triage: Re-run image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
623
838
  Actuator.logEvent(ingestionId, userId, "info", "Triage", {
624
839
  action: "VLM skipped (model marked unsupported)",
@@ -634,10 +849,32 @@ export class IngestionService {
634
849
  if (isPdfTextExtractable(pdfData)) {
635
850
  isFastPath = true;
636
851
  extractionContent = pdfData.text;
852
+ } else if (pdfResolution.shouldAttempt) {
853
+ // Reuse the already-loaded parse buffer; avoid a second readFile in fileToDataUrl.
854
+ const dataUrl = `data:application/pdf;base64,${buffer.toString("base64")}`;
855
+ extractionContent = this.buildVlmPayloadMarker("pdf", dataUrl);
856
+ isFastPath = true;
857
+ isMultimodalFastPath = true;
858
+ multimodalModality = "pdf";
859
+ logger.info(`Smart Triage: Re-run PDF ${filename} routed to multimodal Fast Path using native VLM (${llmModel}).`);
860
+ Actuator.logEvent(ingestionId, userId, "info", "Triage", {
861
+ action: "VLM Fast Path selected",
862
+ type: "pdf",
863
+ modality: "pdf",
864
+ model: llmModel,
865
+ }, supabase);
866
+ } else {
867
+ logger.info(`Smart Triage: Re-run PDF ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked PDF-unsupported.`);
868
+ Actuator.logEvent(ingestionId, userId, "info", "Triage", {
869
+ action: "VLM skipped (model marked unsupported)",
870
+ type: "pdf",
871
+ modality: "pdf",
872
+ model: llmModel,
873
+ provider: llmProvider
874
+ }, supabase);
637
875
  }
638
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
639
876
  } catch (err) {
640
- // ignore
877
+ logger.warn(`Failed to parse PDF ${filename} during rerun. Routing to Heavy Path.`, { err });
641
878
  }
642
879
  }
643
880
 
@@ -655,53 +892,60 @@ export class IngestionService {
655
892
  embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
656
893
  embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
657
894
  };
658
- const doc = { filePath, text: extractionContent, ingestionId, userId, supabase };
659
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
660
- const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];
895
+ const resolvedProvider = llmSettings.llm_provider ?? llmProvider;
896
+ const resolvedModel = llmSettings.llm_model ?? llmModel;
897
+
898
+ const runFastPathAttempt = async (
899
+ attemptContent: string,
900
+ attemptType: "primary" | "reencoded_image_retry"
901
+ ): Promise<boolean> => {
902
+ const doc = { filePath, text: attemptContent, ingestionId, userId, supabase };
903
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
904
+ const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];
661
905
 
662
- // Fire and forget Semantic Embedding Storage for re-runs
663
- RAGService.chunkAndEmbed(ingestionId, userId, doc.text, supabase, embedSettings).catch(err => {
664
- logger.error(`RAG embedding failed during rerun for ${ingestionId}`, err);
665
- });
906
+ // Fire and forget Semantic Embedding Storage for re-runs
907
+ RAGService.chunkAndEmbed(ingestionId, userId, doc.text, supabase, embedSettings).catch(err => {
908
+ logger.error(`RAG embedding failed during rerun for ${ingestionId}`, err);
909
+ });
666
910
 
667
- baselineTrace.push({
668
- timestamp: new Date().toISOString(),
669
- step: "LLM request (baseline extraction)",
670
- details: {
671
- provider: llmSettings.llm_provider ?? llmProvider,
672
- model: llmSettings.llm_model ?? llmModel,
673
- mode: isVlmFastPath ? "vision" : "text",
674
- }
675
- });
911
+ baselineTrace.push({
912
+ timestamp: new Date().toISOString(),
913
+ step: "LLM request (baseline extraction)",
914
+ details: {
915
+ provider: resolvedProvider,
916
+ model: resolvedModel,
917
+ mode: isMultimodalFastPath
918
+ ? `vision:${multimodalModality ?? "image"}${attemptType === "reencoded_image_retry" ? ":reencoded" : ""}`
919
+ : "text",
920
+ }
921
+ });
676
922
 
677
- const baselineResult = await PolicyEngine.extractBaseline(
678
- doc,
679
- { context: baselineConfig?.context, fields: baselineConfig?.fields },
680
- llmSettings
681
- );
682
- const baselineEntities = baselineResult.entities;
683
- const autoTags = baselineResult.tags;
684
- baselineTrace.push({
685
- timestamp: new Date().toISOString(),
686
- step: "LLM response (baseline extraction)",
687
- details: {
688
- entities_count: Object.keys(baselineEntities).length,
689
- uncertain_count: baselineResult.uncertain_fields.length,
690
- tags_count: autoTags.length,
691
- }
692
- });
923
+ const baselineResult = await PolicyEngine.extractBaseline(
924
+ doc,
925
+ { context: baselineConfig?.context, fields: baselineConfig?.fields },
926
+ llmSettings
927
+ );
928
+ const baselineEntities = baselineResult.entities;
929
+ const autoTags = baselineResult.tags;
930
+ baselineTrace.push({
931
+ timestamp: new Date().toISOString(),
932
+ step: "LLM response (baseline extraction)",
933
+ details: {
934
+ entities_count: Object.keys(baselineEntities).length,
935
+ uncertain_count: baselineResult.uncertain_fields.length,
936
+ tags_count: autoTags.length,
937
+ }
938
+ });
693
939
 
694
- const entityLines = Object.entries(baselineEntities)
695
- .filter(([, v]) => v != null)
696
- .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
697
- const enrichedDoc = entityLines.length > 0
698
- ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
699
- : doc;
940
+ const entityLines = Object.entries(baselineEntities)
941
+ .filter(([, v]) => v != null)
942
+ .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
943
+ const enrichedDoc = entityLines.length > 0
944
+ ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
945
+ : doc;
700
946
 
701
- let finalStatus = "no_match";
702
- let result: import("./PolicyEngine.js").ProcessingResult;
703
- let policyName;
704
- try {
947
+ let finalStatus = "no_match";
948
+ let result: import("./PolicyEngine.js").ProcessingResult;
705
949
  const forcedPolicyId = opts.forcedPolicyId?.trim();
706
950
  const activePolicies = forcedPolicyId
707
951
  ? userPolicies.filter((policy) => policy.metadata.id === forcedPolicyId)
@@ -726,7 +970,7 @@ export class IngestionService {
726
970
  result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
727
971
  }
728
972
 
729
- policyName = result.matchedPolicy ? activePolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name : undefined;
973
+ const policyName = result.matchedPolicy ? activePolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name : undefined;
730
974
  finalStatus = result.status === "fallback" ? "no_match" : result.status;
731
975
  const mergedExtracted = { ...baselineEntities, ...result.extractedData };
732
976
 
@@ -754,7 +998,7 @@ export class IngestionService {
754
998
  })
755
999
  .eq("id", ingestionId);
756
1000
 
757
- if (isVlmFastPath) {
1001
+ if (isMultimodalFastPath && multimodalModality) {
758
1002
  const embeddingMeta = this.queueVlmSemanticEmbedding({
759
1003
  ingestionId,
760
1004
  userId,
@@ -763,6 +1007,7 @@ export class IngestionService {
763
1007
  policyName,
764
1008
  extracted: mergedExtracted,
765
1009
  tags: mergedTags,
1010
+ modality: multimodalModality,
766
1011
  supabase,
767
1012
  embedSettings,
768
1013
  });
@@ -780,37 +1025,99 @@ export class IngestionService {
780
1025
  .eq("id", ingestionId);
781
1026
  }
782
1027
 
783
- if (isVlmFastPath) {
1028
+ if (isMultimodalFastPath && multimodalModality) {
784
1029
  await ModelCapabilityService.learnVisionSuccess({
785
1030
  supabase,
786
1031
  userId,
787
- provider: llmSettings.llm_provider ?? llmProvider,
788
- model: llmSettings.llm_model ?? llmModel,
1032
+ provider: resolvedProvider,
1033
+ model: resolvedModel,
1034
+ modality: multimodalModality,
789
1035
  });
790
1036
  }
791
1037
 
792
1038
  return finalStatus === "matched";
793
- } catch (err: unknown) {
794
- const msg = err instanceof Error ? err.message : String(err);
795
- if (isVlmFastPath) {
796
- const learnedState = await ModelCapabilityService.learnVisionFailure({
797
- supabase,
798
- userId,
799
- provider: llmProvider,
800
- model: llmModel,
801
- error: err,
1039
+ };
1040
+
1041
+ let terminalError: unknown = null;
1042
+ try {
1043
+ return await runFastPathAttempt(extractionContent, "primary");
1044
+ } catch (primaryErr) {
1045
+ terminalError = primaryErr;
1046
+ }
1047
+
1048
+ if (isMultimodalFastPath && multimodalModality === "image") {
1049
+ const retryMarker = await this.maybeBuildImageRetryMarker({
1050
+ error: terminalError,
1051
+ filePath,
1052
+ filename,
1053
+ provider: resolvedProvider,
1054
+ model: resolvedModel,
1055
+ phase: "rerun",
1056
+ });
1057
+ if (retryMarker) {
1058
+ this.bumpImageReencodeRetryMetric("attempted", {
1059
+ phase: "rerun",
1060
+ provider: resolvedProvider,
1061
+ model: resolvedModel,
1062
+ filename,
802
1063
  });
803
- logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
804
- Actuator.logEvent(ingestionId, userId, "error", "Processing", {
805
- action: "VLM Failed, Fallback to Heavy",
806
- error: msg,
807
- learned_state: learnedState,
1064
+ Actuator.logEvent(ingestionId, userId, "info", "Processing", {
1065
+ action: "Retrying VLM with re-encoded image payload",
1066
+ provider: resolvedProvider,
1067
+ model: resolvedModel,
808
1068
  }, supabase);
809
- isFastPath = false; // Trigger heavy path fallthrough
810
- } else {
811
- throw err; // Re-throw to caller
1069
+ try {
1070
+ const retryResult = await runFastPathAttempt(retryMarker, "reencoded_image_retry");
1071
+ this.bumpImageReencodeRetryMetric("succeeded", {
1072
+ phase: "rerun",
1073
+ provider: resolvedProvider,
1074
+ model: resolvedModel,
1075
+ filename,
1076
+ });
1077
+ Actuator.logEvent(ingestionId, userId, "analysis", "Processing", {
1078
+ action: "VLM re-encoded image retry succeeded",
1079
+ provider: resolvedProvider,
1080
+ model: resolvedModel,
1081
+ }, supabase);
1082
+ return retryResult;
1083
+ } catch (retryErr) {
1084
+ this.bumpImageReencodeRetryMetric("failed", {
1085
+ phase: "rerun",
1086
+ provider: resolvedProvider,
1087
+ model: resolvedModel,
1088
+ filename,
1089
+ });
1090
+ Actuator.logEvent(ingestionId, userId, "error", "Processing", {
1091
+ action: "VLM re-encoded image retry failed",
1092
+ provider: resolvedProvider,
1093
+ model: resolvedModel,
1094
+ error: this.errorToMessage(retryErr),
1095
+ }, supabase);
1096
+ terminalError = retryErr;
1097
+ }
812
1098
  }
813
1099
  }
1100
+
1101
+ const msg = this.errorToMessage(terminalError);
1102
+ if (isMultimodalFastPath && multimodalModality) {
1103
+ const learnedState = await ModelCapabilityService.learnVisionFailure({
1104
+ supabase,
1105
+ userId,
1106
+ provider: resolvedProvider,
1107
+ model: resolvedModel,
1108
+ error: terminalError,
1109
+ modality: multimodalModality,
1110
+ });
1111
+ logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
1112
+ Actuator.logEvent(ingestionId, userId, "error", "Processing", {
1113
+ action: "VLM Failed, Fallback to Heavy",
1114
+ error: msg,
1115
+ learned_state: learnedState,
1116
+ }, supabase);
1117
+ isFastPath = false; // Trigger heavy path fallthrough
1118
+ } else {
1119
+ throw terminalError instanceof Error ? terminalError : new Error(msg); // Re-throw to caller
1120
+ }
814
1121
  }
815
1122
 
816
1123
  // Re-delegate to rtx_activities