@realtimex/folio 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,20 +8,21 @@ export class ModelCapabilityService {
8
8
  static UNSUPPORTED_CONFIRMATION_WINDOW_MS = 24 * 60 * 60 * 1000;
9
9
  static UNSUPPORTED_CONFIRMATION_FAILURES = 2;
10
10
  static UNSUPPORTED_SCORE_THRESHOLD = 3;
11
- static resolveVisionSupport(settingsRow) {
11
+ static resolveVisionSupport(settingsRow, modality = "image") {
12
12
  const provider = (settingsRow?.llm_provider || SDKService.DEFAULT_LLM_PROVIDER).trim();
13
13
  const model = (settingsRow?.llm_model || SDKService.DEFAULT_LLM_MODEL).trim();
14
- const state = this.getVisionState(settingsRow?.vision_model_capabilities, provider, model);
14
+ const state = this.getVisionState(settingsRow?.vision_model_capabilities, provider, model, modality);
15
15
  return {
16
16
  provider,
17
17
  model,
18
+ modality,
18
19
  state,
19
20
  shouldAttempt: state !== "unsupported",
20
21
  };
21
22
  }
22
- static getVisionState(rawMap, provider, model) {
23
+ static getVisionState(rawMap, provider, model, modality = "image") {
23
24
  const map = this.normalizeCapabilityMap(rawMap);
24
- const entry = map[this.capabilityKey(provider, model)];
25
+ const entry = map[this.capabilityKey(provider, model, modality)];
25
26
  if (!entry || this.isExpired(entry))
26
27
  return "unknown";
27
28
  if (entry.state === "pending_unsupported")
@@ -31,18 +32,21 @@ export class ModelCapabilityService {
31
32
  static async learnVisionSuccess(opts) {
32
33
  await this.writeCapability({
33
34
  ...opts,
35
+ modality: opts.modality ?? "image",
34
36
  state: "supported",
35
37
  reason: "vision_request_succeeded",
36
38
  ttlMs: this.SUPPORTED_TTL_MS,
37
39
  });
38
40
  }
39
41
  static async learnVisionFailure(opts) {
42
+ const modality = opts.modality ?? "image";
40
43
  const classification = this.classifyVisionFailure({
41
44
  error: opts.error,
42
45
  provider: opts.provider,
46
+ modality,
43
47
  });
44
48
  if (!classification.isCapabilityError) {
45
- logger.info(`Vision failure for ${opts.provider}/${opts.model} treated as non-capability; leaving capability unknown`, {
49
+ logger.info(`Vision failure for ${opts.provider}/${opts.model} (${modality}) treated as non-capability; leaving capability unknown`, {
46
50
  reason: classification.reason,
47
51
  score: classification.score,
48
52
  evidence: classification.evidence,
@@ -53,7 +57,7 @@ export class ModelCapabilityService {
53
57
  if (!map) {
54
58
  return "unknown";
55
59
  }
56
- const key = this.capabilityKey(opts.provider, opts.model);
60
+ const key = this.capabilityKey(opts.provider, opts.model, modality);
57
61
  const now = new Date();
58
62
  const failureCount = this.nextFailureCount(map[key], now.getTime());
59
63
  if (failureCount < this.UNSUPPORTED_CONFIRMATION_FAILURES) {
@@ -62,6 +66,7 @@ export class ModelCapabilityService {
62
66
  userId: opts.userId,
63
67
  provider: opts.provider,
64
68
  model: opts.model,
69
+ modality,
65
70
  state: "pending_unsupported",
66
71
  reason: "capability_signal_pending_confirmation",
67
72
  ttlMs: this.PENDING_UNSUPPORTED_TTL_MS,
@@ -77,6 +82,7 @@ export class ModelCapabilityService {
77
82
  userId: opts.userId,
78
83
  provider: opts.provider,
79
84
  model: opts.model,
85
+ modality,
80
86
  state: "unsupported",
81
87
  reason: classification.reason,
82
88
  ttlMs: this.UNSUPPORTED_TTL_MS,
@@ -113,13 +119,13 @@ export class ModelCapabilityService {
113
119
  return true;
114
120
  }
115
121
  static async writeCapability(opts) {
116
- const { supabase, userId, provider, model, state, reason, ttlMs, preloadedMap, failureCount, lastFailureAt, evidence, } = opts;
122
+ const { supabase, userId, provider, model, modality, state, reason, ttlMs, preloadedMap, failureCount, lastFailureAt, evidence, } = opts;
117
123
  const map = preloadedMap ?? (await this.readCapabilityMap(supabase, userId));
118
124
  if (!map) {
119
125
  return;
120
126
  }
121
127
  const now = new Date();
122
- const key = this.capabilityKey(provider, model);
128
+ const key = this.capabilityKey(provider, model, modality);
123
129
  const nextEntry = {
124
130
  state,
125
131
  learned_at: now.toISOString(),
@@ -140,7 +146,7 @@ export class ModelCapabilityService {
140
146
  if (!persisted) {
141
147
  return;
142
148
  }
143
- logger.info(`Updated model capability for ${provider}/${model}: ${state}`, {
149
+ logger.info(`Updated model capability for ${provider}/${model} (${modality}): ${state}`, {
144
150
  reason,
145
151
  ttlMs,
146
152
  failureCount,
@@ -191,9 +197,15 @@ export class ModelCapabilityService {
191
197
  }
192
198
  return normalized;
193
199
  }
194
- static capabilityKey(provider, model) {
200
+ static capabilityBaseKey(provider, model) {
195
201
  return `${provider.toLowerCase().trim()}:${model.toLowerCase().trim()}`;
196
202
  }
203
+ static capabilityKey(provider, model, modality = "image") {
204
+ const base = this.capabilityBaseKey(provider, model);
205
+ if (modality === "image")
206
+ return base;
207
+ return `${base}:${modality}`;
208
+ }
197
209
  static isExpired(entry) {
198
210
  if (!entry.expires_at)
199
211
  return false;
@@ -230,7 +242,7 @@ export class ModelCapabilityService {
230
242
  evidence: transientEvidence,
231
243
  };
232
244
  }
233
- const documentEvidence = this.matchDocumentSpecific(signal);
245
+ const documentEvidence = this.matchDocumentSpecific(signal, opts.modality);
234
246
  if (documentEvidence.length > 0) {
235
247
  return {
236
248
  isCapabilityError: false,
@@ -239,7 +251,7 @@ export class ModelCapabilityService {
239
251
  evidence: documentEvidence,
240
252
  };
241
253
  }
242
- const capability = this.scoreCapabilitySignal(signal, opts.provider);
254
+ const capability = this.scoreCapabilitySignal(signal, opts.provider, opts.modality);
243
255
  if (capability.score >= this.UNSUPPORTED_SCORE_THRESHOLD) {
244
256
  return {
245
257
  isCapabilityError: true,
@@ -373,8 +385,8 @@ export class ModelCapabilityService {
373
385
  ...messageMatches.map((match) => `msg:${match}`),
374
386
  ];
375
387
  }
376
- static matchDocumentSpecific(signal) {
377
- const codeMatches = this.matchCodes(signal.codes, [
388
+ static matchDocumentSpecific(signal, modality) {
389
+ const imageCodeHints = [
378
390
  "image_too_large",
379
391
  "invalid_base64",
380
392
  "invalid_image",
@@ -382,8 +394,8 @@ export class ModelCapabilityService {
382
394
  "malformed_image",
383
395
  "invalid_image_url",
384
396
  "image_decode_failed",
385
- ]);
386
- const messageMatches = this.matchMessage(signal.message, [
397
+ ];
398
+ const imageMessageHints = [
387
399
  "image too large",
388
400
  "invalid base64",
389
401
  "malformed image",
@@ -391,7 +403,30 @@ export class ModelCapabilityService {
391
403
  "unable to decode image",
392
404
  "failed to decode image",
393
405
  "invalid image url",
394
- ]);
406
+ ];
407
+ const pdfCodeHints = [
408
+ "invalid_pdf",
409
+ "malformed_pdf",
410
+ "corrupt_pdf",
411
+ "encrypted_pdf",
412
+ "password_protected_pdf",
413
+ "pdf_parse_error",
414
+ "file_too_large",
415
+ ];
416
+ const pdfMessageHints = [
417
+ "invalid pdf",
418
+ "malformed pdf",
419
+ "corrupt pdf",
420
+ "encrypted pdf",
421
+ "password protected pdf",
422
+ "failed to parse pdf",
423
+ "unable to parse pdf",
424
+ "pdf is corrupted",
425
+ "pdf too large",
426
+ "file too large",
427
+ ];
428
+ const codeMatches = this.matchCodes(signal.codes, modality === "pdf" ? pdfCodeHints : imageCodeHints);
429
+ const messageMatches = this.matchMessage(signal.message, modality === "pdf" ? pdfMessageHints : imageMessageHints);
395
430
  const statusMatches = Array.from(signal.statusCodes).filter((status) => {
396
431
  if (status === 413)
397
432
  return true;
@@ -406,54 +441,87 @@ export class ModelCapabilityService {
406
441
  ...messageMatches.map((match) => `msg:${match}`),
407
442
  ];
408
443
  }
409
- static scoreCapabilitySignal(signal, provider) {
444
+ static scoreCapabilitySignal(signal, provider, modality) {
410
445
  const evidence = [];
411
446
  let score = 0;
412
- const explicitCapabilityCodes = this.matchCodes(signal.codes, [
413
- "vision_not_supported",
414
- "unsupported_vision",
415
- "model_not_vision_capable",
416
- "image_not_supported",
417
- "unsupported_message_content",
418
- "unsupported_content_type_for_model",
419
- "unsupported_image_input",
420
- "invalid_model_for_vision",
421
- ]);
447
+ const explicitCapabilityCodes = this.matchCodes(signal.codes, modality === "pdf"
448
+ ? [
449
+ "pdf_not_supported",
450
+ "unsupported_pdf_input",
451
+ "unsupported_document_input",
452
+ "unsupported_file_input",
453
+ "input_file_not_supported",
454
+ "unsupported_file_type",
455
+ "model_not_document_capable",
456
+ ]
457
+ : [
458
+ "vision_not_supported",
459
+ "unsupported_vision",
460
+ "model_not_vision_capable",
461
+ "image_not_supported",
462
+ "unsupported_message_content",
463
+ "unsupported_content_type_for_model",
464
+ "unsupported_image_input",
465
+ "invalid_model_for_vision",
466
+ ]);
422
467
  if (explicitCapabilityCodes.length > 0) {
423
468
  score += 3;
424
469
  evidence.push(...explicitCapabilityCodes.map((match) => `code:${match}`));
425
470
  }
426
- const highPrecisionMessageMatches = this.matchMessage(signal.message, [
427
- "does not support images",
428
- "does not support image inputs",
429
- "model does not support image",
430
- "this model cannot process images",
431
- "text-only model",
432
- "images are not supported for this model",
433
- "vision is not supported for this model",
434
- "vision is not supported",
435
- "vision not supported",
436
- "image_url is only supported by certain models",
437
- ]);
471
+ const highPrecisionMessageMatches = this.matchMessage(signal.message, modality === "pdf"
472
+ ? [
473
+ "this model does not support pdf",
474
+ "model does not support pdf",
475
+ "pdf is not supported for this model",
476
+ "file input is not supported for this model",
477
+ "input_file is not supported",
478
+ "unsupported file type: application/pdf",
479
+ "application/pdf is not supported for this model",
480
+ ]
481
+ : [
482
+ "does not support images",
483
+ "does not support image inputs",
484
+ "model does not support image",
485
+ "this model cannot process images",
486
+ "text-only model",
487
+ "images are not supported for this model",
488
+ "vision is not supported for this model",
489
+ "vision is not supported",
490
+ "vision not supported",
491
+ "image_url is only supported by certain models",
492
+ ]);
438
493
  if (highPrecisionMessageMatches.length > 0) {
439
494
  score += 3;
440
495
  evidence.push(...highPrecisionMessageMatches.map((match) => `msg:${match}`));
441
496
  }
442
- const providerSpecificMatches = this.matchMessage(signal.message, this.providerCapabilityHints(provider));
497
+ const providerSpecificMatches = this.matchMessage(signal.message, this.providerCapabilityHints(provider, modality));
443
498
  if (providerSpecificMatches.length > 0) {
444
499
  score += 2;
445
500
  evidence.push(...providerSpecificMatches.map((match) => `provider:${match}`));
446
501
  }
447
- const weakCapabilityHints = this.matchMessage(signal.message, [
448
- "vision",
449
- "unsupported content type",
450
- "unsupported message content",
451
- "invalid content type",
452
- "unrecognized content type",
453
- "image_url",
454
- "multimodal",
455
- "multi-modal",
456
- ]);
502
+ const weakCapabilityHints = this.matchMessage(signal.message, modality === "pdf"
503
+ ? [
504
+ "pdf input",
505
+ "pdf support",
506
+ "pdf not supported",
507
+ "application/pdf",
508
+ "input_file",
509
+ "file input",
510
+ "document input",
511
+ "unsupported file type",
512
+ "unsupported content type",
513
+ "invalid content type",
514
+ ]
515
+ : [
516
+ "vision",
517
+ "unsupported content type",
518
+ "unsupported message content",
519
+ "invalid content type",
520
+ "unrecognized content type",
521
+ "image_url",
522
+ "multimodal",
523
+ "multi-modal",
524
+ ]);
457
525
  const hasClientValidationStatus = Array.from(signal.statusCodes).some((status) => [400, 415, 422].includes(status));
458
526
  if (weakCapabilityHints.length > 0 && hasClientValidationStatus) {
459
527
  score += 1;
@@ -468,8 +536,36 @@ export class ModelCapabilityService {
468
536
  evidence: Array.from(new Set(evidence)).slice(0, 8),
469
537
  };
470
538
  }
471
- static providerCapabilityHints(provider) {
539
+ static providerCapabilityHints(provider, modality) {
472
540
  const normalized = provider.toLowerCase().trim();
541
+ if (modality === "pdf") {
542
+ if (normalized.includes("openai")) {
543
+ return [
544
+ "input_file is not supported",
545
+ "unsupported file type: application/pdf",
546
+ "application/pdf is not supported for this model",
547
+ ];
548
+ }
549
+ if (normalized.includes("anthropic")) {
550
+ return [
551
+ "pdf is not supported for this model",
552
+ "file input is not supported for this model",
553
+ ];
554
+ }
555
+ if (normalized.includes("google") || normalized.includes("gemini")) {
556
+ return [
557
+ "unsupported document input",
558
+ "pdf input is not supported",
559
+ ];
560
+ }
561
+ if (normalized.includes("realtimex")) {
562
+ return [
563
+ "unsupported file input",
564
+ "invalid model",
565
+ ];
566
+ }
567
+ return [];
568
+ }
473
569
  if (normalized.includes("openai")) {
474
570
  return [
475
571
  "image_url is only supported by certain models",
@@ -8,20 +8,32 @@ import { extractLlmResponse, normalizeLlmContent, previewLlmText } from "../util
8
8
  import { DEFAULT_BASELINE_FIELDS } from "./BaselineConfigService.js";
9
9
  const logger = createLogger("PolicyEngine");
10
10
  /**
11
- * Helper to build LLM message content. If the text contains the VLM marker
12
- * generated by IngestionService, it casts the payload to an OpenAI-compatible
13
- * Vision array structure so the underlying SDK bridge can transmit the image.
11
+ * Helper to build LLM message content. If the text contains a VLM marker
12
+ * generated by IngestionService, it casts the payload to multimodal blocks.
14
13
  */
15
14
  function extractVlmPayload(text) {
16
- const marker = text.match(/\[VLM_IMAGE_DATA:(data:[^;]+;base64,[^\]]+)\]/);
17
- if (!marker)
18
- return null;
19
- const markerText = marker[0];
20
- const supplementalText = text.replace(markerText, "").trim().slice(0, 4000);
21
- return {
22
- imageDataUrl: marker[1],
23
- supplementalText,
24
- };
15
+ const imageMarker = text.match(/\[VLM_IMAGE_DATA:(data:[^;]+;base64,[^\]]+)\]/);
16
+ if (imageMarker) {
17
+ const markerText = imageMarker[0];
18
+ return {
19
+ kind: "image",
20
+ dataUrl: imageMarker[1],
21
+ supplementalText: text.replace(markerText, "").trim().slice(0, 4000),
22
+ };
23
+ }
24
+ const pdfMarker = text.match(/\[VLM_PDF_DATA:(data:[^;]+;base64,[^\]]+)\]/);
25
+ if (pdfMarker) {
26
+ const markerText = pdfMarker[0];
27
+ return {
28
+ kind: "pdf",
29
+ dataUrl: pdfMarker[1],
30
+ supplementalText: text.replace(markerText, "").trim().slice(0, 4000),
31
+ };
32
+ }
33
+ return null;
34
+ }
35
+ function hasVlmPayload(text) {
36
+ return text.includes("[VLM_IMAGE_DATA:") || text.includes("[VLM_PDF_DATA:");
25
37
  }
26
38
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
27
39
  function buildMessageContent(prompt, text, textFirst = false) {
@@ -30,10 +42,12 @@ function buildMessageContent(prompt, text, textFirst = false) {
30
42
  const textPrompt = vlmPayload.supplementalText
31
43
  ? `${prompt}\n\nSupplemental extracted fields:\n${vlmPayload.supplementalText}`
32
44
  : prompt;
33
- return [
34
- { type: "text", text: textPrompt },
35
- { type: "image_url", image_url: { url: vlmPayload.imageDataUrl } }
36
- ];
45
+ // `input_file` is not provider-agnostic (e.g. Anthropic-style block); providers
46
+ // that don't accept it will fail, and IngestionService will learn unsupported pdf modality.
47
+ const assetBlock = vlmPayload.kind === "pdf"
48
+ ? { type: "input_file", file_url: vlmPayload.dataUrl }
49
+ : { type: "image_url", image_url: { url: vlmPayload.dataUrl } };
50
+ return [{ type: "text", text: textPrompt }, assetBlock];
37
51
  }
38
52
  // Standard text payload
39
53
  return textFirst
@@ -340,7 +354,7 @@ async function evaluateCondition(condition, doc, trace, settings = {}) {
340
354
  model,
341
355
  condition_type: condition.type,
342
356
  prompt_preview: prompt.slice(0, 180),
343
- vision_payload: doc.text.includes("[VLM_IMAGE_DATA:")
357
+ vision_payload: hasVlmPayload(doc.text)
344
358
  }
345
359
  });
346
360
  Actuator.logEvent(doc.ingestionId, doc.userId, "analysis", "Policy Matching", {
@@ -349,7 +363,7 @@ async function evaluateCondition(condition, doc, trace, settings = {}) {
349
363
  model,
350
364
  condition_type: condition.type,
351
365
  prompt_preview: prompt.slice(0, 180),
352
- vision_payload: doc.text.includes("[VLM_IMAGE_DATA:")
366
+ vision_payload: hasVlmPayload(doc.text)
353
367
  }, doc.supabase);
354
368
  const result = await sdk.llm.chat([
355
369
  {
@@ -443,7 +457,7 @@ async function extractData(fields, doc, trace, settings = {}) {
443
457
  Fields to extract:
444
458
  ${fieldDescriptions}`;
445
459
  try {
446
- const isVlmPayload = doc.text.startsWith("[VLM_IMAGE_DATA:");
460
+ const isVlmPayload = hasVlmPayload(doc.text);
447
461
  const mixedPrompt = isVlmPayload
448
462
  ? `You are a precise data extraction engine. Return only valid JSON.\n\n${prompt}`
449
463
  : prompt;
@@ -593,7 +607,7 @@ Rules:
593
607
  model,
594
608
  known_fields_count: Object.keys(contractData).length,
595
609
  }, doc.supabase);
596
- const isVlmPayload = doc.text.startsWith("[VLM_IMAGE_DATA:");
610
+ const isVlmPayload = hasVlmPayload(doc.text);
597
611
  const mixedPrompt = isVlmPayload
598
612
  ? `You are a precise data extraction engine. Return only valid JSON.\n\n${prompt}`
599
613
  : prompt;
@@ -821,7 +835,9 @@ export class PolicyEngine {
821
835
  const allowLearnedFallback = opts.allowLearnedFallback !== false && !forcedPolicyId;
822
836
  if (allowLearnedFallback && doc.supabase && policies.length > 0) {
823
837
  try {
824
- const learningText = doc.text.replace(/\[VLM_IMAGE_DATA:[^\]]+\]/g, "");
838
+ const learningText = doc.text
839
+ .replace(/\[VLM_IMAGE_DATA:[^\]]+\]/g, "")
840
+ .replace(/\[VLM_PDF_DATA:[^\]]+\]/g, "");
825
841
  const learned = await PolicyLearningService.resolveLearnedCandidate({
826
842
  supabase: doc.supabase,
827
843
  userId: doc.userId,
@@ -923,7 +939,7 @@ export class PolicyEngine {
923
939
  `Include the calendar year if clearly present. Prefer hyphenated multi-word tags.\n` +
924
940
  `No markdown, no explanation — only the JSON object.`;
925
941
  const userPrompt = `Extract the following fields from the document:\n${fieldList}`;
926
- const isVlmPayload = doc.text.startsWith("[VLM_IMAGE_DATA:");
942
+ const isVlmPayload = hasVlmPayload(doc.text);
927
943
  const mixedPrompt = isVlmPayload ? `${systemPrompt}\n\n${userPrompt}` : userPrompt;
928
944
  try {
929
945
  Actuator.logEvent(doc.ingestionId, doc.userId, "analysis", "Baseline Extraction", {
@@ -90,8 +90,8 @@ export class RAGService {
90
90
  * Process an ingested document's raw text: chunk it, embed it, and store in DB.
91
91
  */
92
92
  static async chunkAndEmbed(ingestionId, userId, rawText, supabase, settings) {
93
- if (rawText.startsWith("[VLM_IMAGE_DATA:")) {
94
- logger.info(`Skipping chunking and embedding for VLM base64 image data (Ingestion: ${ingestionId})`);
93
+ if (/^\[VLM_(IMAGE|PDF)_DATA:/.test(rawText)) {
94
+ logger.info(`Skipping chunking and embedding for VLM base64 multimodal data (Ingestion: ${ingestionId})`);
95
95
  return;
96
96
  }
97
97
  const chunks = this.chunkText(rawText);