@realtimex/folio 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ import { Actuator } from "../utils/Actuator.js";
11
11
  import { extractLlmResponse, previewLlmText } from "../utils/llmResponse.js";
12
12
  import { RAGService } from "./RAGService.js";
13
13
  import { SDKService } from "./SDKService.js";
14
- import { ModelCapabilityService } from "./ModelCapabilityService.js";
14
+ import { ModelCapabilityService, type VisionCapabilityModality } from "./ModelCapabilityService.js";
15
15
 
16
16
  const logger = createLogger("IngestionService");
17
17
 
@@ -89,6 +89,9 @@ export interface Ingestion {
89
89
  }
90
90
 
91
91
  export class IngestionService {
92
+ private static readonly FAST_EXTS = ["txt", "md", "csv", "json"] as const;
93
+ private static readonly IMAGE_EXTS = ["png", "jpg", "jpeg", "webp"] as const;
94
+
92
95
  private static readonly NON_IDEMPOTENT_ACTION_TYPES = new Set([
93
96
  "append_to_google_sheet",
94
97
  "webhook",
@@ -129,11 +132,12 @@ export class IngestionService {
129
132
  policyName?: string;
130
133
  extracted: Record<string, unknown>;
131
134
  tags: string[];
135
+ modality: VisionCapabilityModality;
132
136
  }): string {
133
- const { filename, finalStatus, policyName, extracted, tags } = opts;
137
+ const { filename, finalStatus, policyName, extracted, tags, modality } = opts;
134
138
  const lines: string[] = [
135
139
  `Document filename: ${filename}`,
136
- "Document source: VLM image extraction",
140
+ `Document source: VLM ${modality} extraction`,
137
141
  `Processing status: ${finalStatus}`,
138
142
  ];
139
143
 
@@ -186,6 +190,7 @@ export class IngestionService {
186
190
  policyName?: string;
187
191
  extracted: Record<string, unknown>;
188
192
  tags: string[];
193
+ modality: VisionCapabilityModality;
189
194
  supabase: SupabaseClient;
190
195
  embedSettings: { embedding_provider?: string; embedding_model?: string };
191
196
  }): { synthetic_chars: number; extracted_fields: number; tags_count: number } {
@@ -195,6 +200,7 @@ export class IngestionService {
195
200
  policyName: opts.policyName,
196
201
  extracted: opts.extracted,
197
202
  tags: opts.tags,
203
+ modality: opts.modality,
198
204
  });
199
205
  const details = {
200
206
  synthetic_chars: syntheticText.length,
@@ -231,6 +237,17 @@ export class IngestionService {
231
237
  return details;
232
238
  }
233
239
 
/**
 * Wraps a data URL in the bracketed marker string that is assigned to
 * `extractionContent` on the multimodal fast path.
 *
 * The result has the shape `[<PREFIX>:<dataUrl>]`. The inline image code this
 * helper replaces was annotated "Special marker for PolicyEngine"; the consumer
 * of the marker is not visible in this diff (NOTE(review): confirm the
 * downstream parser recognises the `VLM_PDF_DATA` prefix as well).
 *
 * @param modality Which kind of payload is being wrapped; call sites in this
 *   diff pass "image" or "pdf".
 * @param dataUrl The already-encoded `data:` URL to embed verbatim.
 * @returns The marker string containing the prefix and the data URL.
 */
240
+ private static buildVlmPayloadMarker(modality: VisionCapabilityModality, dataUrl: string): string {
// Only "pdf" selects the PDF prefix; any other modality value falls back to the image prefix.
241
+ const prefix = modality === "pdf" ? "VLM_PDF_DATA" : "VLM_IMAGE_DATA";
242
+ return `[${prefix}:${dataUrl}]`;
243
+ }
244
+
/**
 * Reads a file from disk and returns its contents as a base64 `data:` URL.
 *
 * There is no error handling here: if the read rejects, the rejection
 * propagates to the caller (the image triage call sites in this diff wrap the
 * call in try/catch and fall back to the Heavy Path).
 *
 * @param filePath Path of the file to read.
 * @param mimeType MIME type placed verbatim into the URL header; it is not validated.
 * @returns A string of the form `data:<mimeType>;base64,<payload>`.
 */
245
+ private static async fileToDataUrl(filePath: string, mimeType: string): Promise<string> {
// The whole file is read in one call with no size check.
// `fs` is imported outside this view — presumably Node's promise-based fs API; confirm.
246
+ const buffer = await fs.readFile(filePath);
247
+ const base64 = buffer.toString("base64");
248
+ return `data:${mimeType};base64,${base64}`;
249
+ }
250
+
234
251
  /**
235
252
  * Ingest a document using Hybrid Routing Architecture.
236
253
  */
@@ -303,11 +320,10 @@ export class IngestionService {
303
320
 
304
321
  // 2. Document Triage
305
322
  let isFastPath = false;
306
- let isVlmFastPath = false;
323
+ let isMultimodalFastPath = false;
324
+ let multimodalModality: VisionCapabilityModality | null = null;
307
325
  let extractionContent = content;
308
326
  const ext = filename.toLowerCase().split('.').pop() || '';
309
- const fastExts = ['txt', 'md', 'csv', 'json'];
310
- const imageExts = ['png', 'jpg', 'jpeg', 'webp'];
311
327
 
312
328
  // Pre-fetch settings to decide whether we should attempt VLM.
313
329
  const { data: triageSettingsRow } = await supabase
@@ -315,27 +331,27 @@ export class IngestionService {
315
331
  .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
316
332
  .eq("user_id", userId)
317
333
  .maybeSingle();
318
- const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
319
- const llmModel = visionResolution.model;
320
- const llmProvider = visionResolution.provider;
334
+ const imageResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "image");
335
+ const pdfResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "pdf");
336
+ const llmModel = imageResolution.model;
337
+ const llmProvider = imageResolution.provider;
321
338
 
322
- if (fastExts.includes(ext)) {
339
+ if (this.FAST_EXTS.includes(ext as typeof this.FAST_EXTS[number])) {
323
340
  isFastPath = true;
324
- } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
341
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number]) && imageResolution.shouldAttempt) {
325
342
  try {
326
- const buffer = await fs.readFile(filePath);
327
- const base64 = buffer.toString('base64');
328
343
  const mimeTypeActual = mimeType || `image/${ext === 'jpg' ? 'jpeg' : ext}`;
329
- // Special marker for PolicyEngine
330
- extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
344
+ const dataUrl = await this.fileToDataUrl(filePath, mimeTypeActual);
345
+ extractionContent = this.buildVlmPayloadMarker("image", dataUrl);
331
346
  isFastPath = true;
332
- isVlmFastPath = true;
347
+ isMultimodalFastPath = true;
348
+ multimodalModality = "image";
333
349
  logger.info(`Smart Triage: Image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
334
350
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
335
351
  } catch (err) {
336
352
  logger.warn(`Failed to read VLM image ${filename}. Routing to Heavy Path.`, { err });
337
353
  }
338
- } else if (imageExts.includes(ext)) {
354
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number])) {
339
355
  logger.info(`Smart Triage: Image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
340
356
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
341
357
  action: "VLM skipped (model marked unsupported)",
@@ -353,9 +369,29 @@ export class IngestionService {
353
369
  extractionContent = pdfData.text;
354
370
  logger.info(`Smart Triage: PDF ${filename} passed text quality check (${pdfData.pages.filter(p => p.text.trim().length > 30).length}/${pdfData.total} pages with text). Routing to Fast Path.`);
355
371
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage passed", type: "pdf", fast_path: true }, supabase);
372
+ } else if (pdfResolution.shouldAttempt) {
373
+ // Reuse the already-loaded parse buffer; avoid a second readFile in fileToDataUrl.
374
+ const dataUrl = `data:application/pdf;base64,${buffer.toString("base64")}`;
375
+ extractionContent = this.buildVlmPayloadMarker("pdf", dataUrl);
376
+ isFastPath = true;
377
+ isMultimodalFastPath = true;
378
+ multimodalModality = "pdf";
379
+ logger.info(`Smart Triage: PDF ${filename} routed to multimodal Fast Path using native VLM (${llmModel}).`);
380
+ Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
381
+ action: "VLM Fast Path selected",
382
+ type: "pdf",
383
+ modality: "pdf",
384
+ model: llmModel,
385
+ }, supabase);
356
386
  } else {
357
- logger.info(`Smart Triage: PDF ${filename} failed text quality check. Routing to Heavy Path.`);
358
- Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage failed", type: "pdf", fast_path: false }, supabase);
387
+ logger.info(`Smart Triage: PDF ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked PDF-unsupported.`);
388
+ Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
389
+ action: "VLM skipped (model marked unsupported)",
390
+ type: "pdf",
391
+ modality: "pdf",
392
+ model: llmModel,
393
+ provider: llmProvider,
394
+ }, supabase);
359
395
  }
360
396
  } catch (err) {
361
397
  logger.warn(`Failed to parse PDF ${filename}. Routing to Heavy Path.`, { err });
@@ -395,7 +431,7 @@ export class IngestionService {
395
431
  details: {
396
432
  provider: llmSettings.llm_provider ?? llmProvider,
397
433
  model: llmSettings.llm_model ?? llmModel,
398
- mode: isVlmFastPath ? "vision" : "text",
434
+ mode: isMultimodalFastPath ? `vision:${multimodalModality ?? "image"}` : "text",
399
435
  }
400
436
  });
401
437
 
@@ -458,7 +494,7 @@ export class IngestionService {
458
494
  .select()
459
495
  .single();
460
496
 
461
- if (isVlmFastPath) {
497
+ if (isMultimodalFastPath && multimodalModality) {
462
498
  const embeddingMeta = this.queueVlmSemanticEmbedding({
463
499
  ingestionId: ingestion.id,
464
500
  userId,
@@ -467,6 +503,7 @@ export class IngestionService {
467
503
  policyName,
468
504
  extracted: mergedExtracted,
469
505
  tags: autoTags,
506
+ modality: multimodalModality,
470
507
  supabase,
471
508
  embedSettings,
472
509
  });
@@ -484,12 +521,13 @@ export class IngestionService {
484
521
  .eq("id", ingestion.id);
485
522
  }
486
523
 
487
- if (isVlmFastPath) {
524
+ if (isMultimodalFastPath && multimodalModality) {
488
525
  await ModelCapabilityService.learnVisionSuccess({
489
526
  supabase,
490
527
  userId,
491
528
  provider: llmSettings.llm_provider ?? llmProvider,
492
529
  model: llmSettings.llm_model ?? llmModel,
530
+ modality: multimodalModality,
493
531
  });
494
532
  }
495
533
 
@@ -498,13 +536,14 @@ export class IngestionService {
498
536
  } catch (err) {
499
537
  const msg = err instanceof Error ? err.message : String(err);
500
538
 
501
- if (isVlmFastPath) {
539
+ if (isMultimodalFastPath && multimodalModality) {
502
540
  const learnedState = await ModelCapabilityService.learnVisionFailure({
503
541
  supabase,
504
542
  userId,
505
543
  provider: llmProvider,
506
544
  model: llmModel,
507
545
  error: err,
546
+ modality: multimodalModality,
508
547
  });
509
548
  logger.warn(`VLM extraction failed for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
510
549
  Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
@@ -587,38 +626,38 @@ export class IngestionService {
587
626
  if (!filePath) throw new Error("No storage path found for this ingestion");
588
627
 
589
628
  let isFastPath = false;
590
- let isVlmFastPath = false;
629
+ let isMultimodalFastPath = false;
630
+ let multimodalModality: VisionCapabilityModality | null = null;
591
631
  let extractionContent = "";
592
632
  const ext = filename.toLowerCase().split('.').pop() || '';
593
- const fastExts = ['txt', 'md', 'csv', 'json'];
594
- const imageExts = ['png', 'jpg', 'jpeg', 'webp'];
595
633
 
596
634
  const { data: triageSettingsRow } = await supabase
597
635
  .from("user_settings")
598
636
  .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
599
637
  .eq("user_id", userId)
600
638
  .maybeSingle();
601
- const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
602
- const llmModel = visionResolution.model;
603
- const llmProvider = visionResolution.provider;
639
+ const imageResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "image");
640
+ const pdfResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow, "pdf");
641
+ const llmModel = imageResolution.model;
642
+ const llmProvider = imageResolution.provider;
604
643
 
605
- if (fastExts.includes(ext)) {
644
+ if (this.FAST_EXTS.includes(ext as typeof this.FAST_EXTS[number])) {
606
645
  isFastPath = true;
607
646
  extractionContent = await fs.readFile(filePath, "utf-8");
608
- } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
647
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number]) && imageResolution.shouldAttempt) {
609
648
  try {
610
- const buffer = await fs.readFile(filePath);
611
- const base64 = buffer.toString('base64');
612
649
  const mimeTypeActual = `image/${ext === 'jpg' ? 'jpeg' : ext}`;
613
- extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
650
+ const dataUrl = await this.fileToDataUrl(filePath, mimeTypeActual);
651
+ extractionContent = this.buildVlmPayloadMarker("image", dataUrl);
614
652
  isFastPath = true;
615
- isVlmFastPath = true;
653
+ isMultimodalFastPath = true;
654
+ multimodalModality = "image";
616
655
  logger.info(`Smart Triage: Re-run image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
617
656
  Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
618
657
  } catch (err) {
619
658
  logger.warn(`Failed to read VLM image ${filename} during rerun. Routing to Heavy Path.`, { err });
620
659
  }
621
- } else if (imageExts.includes(ext)) {
660
+ } else if (this.IMAGE_EXTS.includes(ext as typeof this.IMAGE_EXTS[number])) {
622
661
  logger.info(`Smart Triage: Re-run image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
623
662
  Actuator.logEvent(ingestionId, userId, "info", "Triage", {
624
663
  action: "VLM skipped (model marked unsupported)",
@@ -634,10 +673,32 @@ export class IngestionService {
634
673
  if (isPdfTextExtractable(pdfData)) {
635
674
  isFastPath = true;
636
675
  extractionContent = pdfData.text;
676
+ } else if (pdfResolution.shouldAttempt) {
677
+ // Reuse the already-loaded parse buffer; avoid a second readFile in fileToDataUrl.
678
+ const dataUrl = `data:application/pdf;base64,${buffer.toString("base64")}`;
679
+ extractionContent = this.buildVlmPayloadMarker("pdf", dataUrl);
680
+ isFastPath = true;
681
+ isMultimodalFastPath = true;
682
+ multimodalModality = "pdf";
683
+ logger.info(`Smart Triage: Re-run PDF ${filename} routed to multimodal Fast Path using native VLM (${llmModel}).`);
684
+ Actuator.logEvent(ingestionId, userId, "info", "Triage", {
685
+ action: "VLM Fast Path selected",
686
+ type: "pdf",
687
+ modality: "pdf",
688
+ model: llmModel,
689
+ }, supabase);
690
+ } else {
691
+ logger.info(`Smart Triage: Re-run PDF ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked PDF-unsupported.`);
692
+ Actuator.logEvent(ingestionId, userId, "info", "Triage", {
693
+ action: "VLM skipped (model marked unsupported)",
694
+ type: "pdf",
695
+ modality: "pdf",
696
+ model: llmModel,
697
+ provider: llmProvider
698
+ }, supabase);
637
699
  }
638
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
639
700
  } catch (err) {
640
- // ignore
701
+ logger.warn(`Failed to parse PDF ${filename} during rerun. Routing to Heavy Path.`, { err });
641
702
  }
642
703
  }
643
704
 
@@ -667,12 +728,12 @@ export class IngestionService {
667
728
  baselineTrace.push({
668
729
  timestamp: new Date().toISOString(),
669
730
  step: "LLM request (baseline extraction)",
670
- details: {
671
- provider: llmSettings.llm_provider ?? llmProvider,
672
- model: llmSettings.llm_model ?? llmModel,
673
- mode: isVlmFastPath ? "vision" : "text",
674
- }
675
- });
731
+ details: {
732
+ provider: llmSettings.llm_provider ?? llmProvider,
733
+ model: llmSettings.llm_model ?? llmModel,
734
+ mode: isMultimodalFastPath ? `vision:${multimodalModality ?? "image"}` : "text",
735
+ }
736
+ });
676
737
 
677
738
  const baselineResult = await PolicyEngine.extractBaseline(
678
739
  doc,
@@ -754,7 +815,7 @@ export class IngestionService {
754
815
  })
755
816
  .eq("id", ingestionId);
756
817
 
757
- if (isVlmFastPath) {
818
+ if (isMultimodalFastPath && multimodalModality) {
758
819
  const embeddingMeta = this.queueVlmSemanticEmbedding({
759
820
  ingestionId,
760
821
  userId,
@@ -763,6 +824,7 @@ export class IngestionService {
763
824
  policyName,
764
825
  extracted: mergedExtracted,
765
826
  tags: mergedTags,
827
+ modality: multimodalModality,
766
828
  supabase,
767
829
  embedSettings,
768
830
  });
@@ -780,25 +842,27 @@ export class IngestionService {
780
842
  .eq("id", ingestionId);
781
843
  }
782
844
 
783
- if (isVlmFastPath) {
845
+ if (isMultimodalFastPath && multimodalModality) {
784
846
  await ModelCapabilityService.learnVisionSuccess({
785
847
  supabase,
786
848
  userId,
787
849
  provider: llmSettings.llm_provider ?? llmProvider,
788
850
  model: llmSettings.llm_model ?? llmModel,
851
+ modality: multimodalModality,
789
852
  });
790
853
  }
791
854
 
792
855
  return finalStatus === "matched";
793
856
  } catch (err: unknown) {
794
857
  const msg = err instanceof Error ? err.message : String(err);
795
- if (isVlmFastPath) {
858
+ if (isMultimodalFastPath && multimodalModality) {
796
859
  const learnedState = await ModelCapabilityService.learnVisionFailure({
797
860
  supabase,
798
861
  userId,
799
862
  provider: llmProvider,
800
863
  model: llmModel,
801
864
  error: err,
865
+ modality: multimodalModality,
802
866
  });
803
867
  logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
804
868
  Actuator.logEvent(ingestionId, userId, "error", "Processing", {