@realtimex/folio 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +20 -0
- package/README.md +63 -0
- package/api/server.ts +130 -0
- package/api/src/config/index.ts +96 -0
- package/api/src/middleware/auth.ts +128 -0
- package/api/src/middleware/errorHandler.ts +88 -0
- package/api/src/middleware/index.ts +4 -0
- package/api/src/middleware/rateLimit.ts +71 -0
- package/api/src/middleware/validation.ts +58 -0
- package/api/src/routes/accounts.ts +142 -0
- package/api/src/routes/baseline-config.ts +124 -0
- package/api/src/routes/chat.ts +154 -0
- package/api/src/routes/health.ts +61 -0
- package/api/src/routes/index.ts +35 -0
- package/api/src/routes/ingestions.ts +275 -0
- package/api/src/routes/migrate.ts +112 -0
- package/api/src/routes/policies.ts +121 -0
- package/api/src/routes/processing.ts +90 -0
- package/api/src/routes/rules.ts +11 -0
- package/api/src/routes/sdk.ts +100 -0
- package/api/src/routes/settings.ts +80 -0
- package/api/src/routes/setup.ts +389 -0
- package/api/src/routes/stats.ts +81 -0
- package/api/src/routes/tts.ts +190 -0
- package/api/src/services/BaselineConfigService.ts +208 -0
- package/api/src/services/ChatService.ts +204 -0
- package/api/src/services/GoogleDriveService.ts +331 -0
- package/api/src/services/GoogleSheetsService.ts +1107 -0
- package/api/src/services/IngestionService.ts +1187 -0
- package/api/src/services/ModelCapabilityService.ts +248 -0
- package/api/src/services/PolicyEngine.ts +1625 -0
- package/api/src/services/PolicyLearningService.ts +527 -0
- package/api/src/services/PolicyLoader.ts +249 -0
- package/api/src/services/RAGService.ts +391 -0
- package/api/src/services/SDKService.ts +249 -0
- package/api/src/services/supabase.ts +113 -0
- package/api/src/utils/Actuator.ts +284 -0
- package/api/src/utils/actions/ActionHandler.ts +34 -0
- package/api/src/utils/actions/AppendToGSheetAction.ts +260 -0
- package/api/src/utils/actions/AutoRenameAction.ts +58 -0
- package/api/src/utils/actions/CopyAction.ts +120 -0
- package/api/src/utils/actions/CopyToGDriveAction.ts +64 -0
- package/api/src/utils/actions/LogCsvAction.ts +48 -0
- package/api/src/utils/actions/NotifyAction.ts +39 -0
- package/api/src/utils/actions/RenameAction.ts +57 -0
- package/api/src/utils/actions/WebhookAction.ts +58 -0
- package/api/src/utils/actions/utils.ts +293 -0
- package/api/src/utils/llmResponse.ts +61 -0
- package/api/src/utils/logger.ts +67 -0
- package/bin/folio-deploy.js +12 -0
- package/bin/folio-setup.js +45 -0
- package/bin/folio.js +65 -0
- package/dist/api/server.js +106 -0
- package/dist/api/src/config/index.js +81 -0
- package/dist/api/src/middleware/auth.js +93 -0
- package/dist/api/src/middleware/errorHandler.js +73 -0
- package/dist/api/src/middleware/index.js +4 -0
- package/dist/api/src/middleware/rateLimit.js +43 -0
- package/dist/api/src/middleware/validation.js +54 -0
- package/dist/api/src/routes/accounts.js +110 -0
- package/dist/api/src/routes/baseline-config.js +91 -0
- package/dist/api/src/routes/chat.js +114 -0
- package/dist/api/src/routes/health.js +52 -0
- package/dist/api/src/routes/index.js +31 -0
- package/dist/api/src/routes/ingestions.js +207 -0
- package/dist/api/src/routes/migrate.js +91 -0
- package/dist/api/src/routes/policies.js +86 -0
- package/dist/api/src/routes/processing.js +75 -0
- package/dist/api/src/routes/rules.js +8 -0
- package/dist/api/src/routes/sdk.js +80 -0
- package/dist/api/src/routes/settings.js +68 -0
- package/dist/api/src/routes/setup.js +315 -0
- package/dist/api/src/routes/stats.js +62 -0
- package/dist/api/src/routes/tts.js +178 -0
- package/dist/api/src/services/BaselineConfigService.js +168 -0
- package/dist/api/src/services/ChatService.js +166 -0
- package/dist/api/src/services/GoogleDriveService.js +280 -0
- package/dist/api/src/services/GoogleSheetsService.js +795 -0
- package/dist/api/src/services/IngestionService.js +990 -0
- package/dist/api/src/services/ModelCapabilityService.js +179 -0
- package/dist/api/src/services/PolicyEngine.js +1353 -0
- package/dist/api/src/services/PolicyLearningService.js +397 -0
- package/dist/api/src/services/PolicyLoader.js +159 -0
- package/dist/api/src/services/RAGService.js +295 -0
- package/dist/api/src/services/SDKService.js +212 -0
- package/dist/api/src/services/supabase.js +72 -0
- package/dist/api/src/utils/Actuator.js +225 -0
- package/dist/api/src/utils/actions/ActionHandler.js +1 -0
- package/dist/api/src/utils/actions/AppendToGSheetAction.js +191 -0
- package/dist/api/src/utils/actions/AutoRenameAction.js +49 -0
- package/dist/api/src/utils/actions/CopyAction.js +112 -0
- package/dist/api/src/utils/actions/CopyToGDriveAction.js +55 -0
- package/dist/api/src/utils/actions/LogCsvAction.js +42 -0
- package/dist/api/src/utils/actions/NotifyAction.js +32 -0
- package/dist/api/src/utils/actions/RenameAction.js +51 -0
- package/dist/api/src/utils/actions/WebhookAction.js +51 -0
- package/dist/api/src/utils/actions/utils.js +237 -0
- package/dist/api/src/utils/llmResponse.js +63 -0
- package/dist/api/src/utils/logger.js +51 -0
- package/dist/assets/index-DzN8-j-e.css +1 -0
- package/dist/assets/index-Uy-ai3Dh.js +113 -0
- package/dist/favicon.svg +31 -0
- package/dist/folio-logo.svg +46 -0
- package/dist/index.html +14 -0
- package/docs-dev/FPE-spec.md +196 -0
- package/docs-dev/folio-prd.md +47 -0
- package/docs-dev/foundation-checklist.md +30 -0
- package/docs-dev/hybrid-routing-architecture.md +205 -0
- package/docs-dev/ingestion-engine.md +69 -0
- package/docs-dev/port-from-email-automator.md +32 -0
- package/docs-dev/tech-spec.md +98 -0
- package/index.html +13 -0
- package/package.json +101 -0
- package/public/favicon.svg +31 -0
- package/public/folio-logo.svg +46 -0
- package/scripts/dev-task.mjs +51 -0
- package/scripts/get-latest-migration-timestamp.mjs +34 -0
- package/scripts/migrate.sh +91 -0
- package/supabase/.temp/cli-latest +1 -0
- package/supabase/.temp/gotrue-version +1 -0
- package/supabase/.temp/pooler-url +1 -0
- package/supabase/.temp/postgres-version +1 -0
- package/supabase/.temp/project-ref +1 -0
- package/supabase/.temp/rest-version +1 -0
- package/supabase/.temp/storage-migration +1 -0
- package/supabase/.temp/storage-version +1 -0
- package/supabase/config.toml +64 -0
- package/supabase/functions/_shared/auth.ts +35 -0
- package/supabase/functions/_shared/cors.ts +12 -0
- package/supabase/functions/_shared/supabaseAdmin.ts +17 -0
- package/supabase/functions/api-v1-settings/index.ts +66 -0
- package/supabase/functions/setup/index.ts +91 -0
- package/supabase/migrations/20260223000000_initial_foundation.sql +136 -0
- package/supabase/migrations/20260223000001_add_migration_rpc.sql +10 -0
- package/supabase/migrations/20260224000002_add_init_state_view.sql +20 -0
- package/supabase/migrations/20260224000003_port_user_creation_parity.sql +139 -0
- package/supabase/migrations/20260224000004_add_avatars_storage.sql +26 -0
- package/supabase/migrations/20260224000005_add_tts_and_embed_settings.sql +24 -0
- package/supabase/migrations/20260224000006_add_policies_table.sql +48 -0
- package/supabase/migrations/20260224000007_fix_migration_rpc.sql +9 -0
- package/supabase/migrations/20260224000008_add_ingestions_table.sql +42 -0
- package/supabase/migrations/20260225000000_setup_compatible_mode.sql +119 -0
- package/supabase/migrations/20260225000001_restore_ingestions.sql +49 -0
- package/supabase/migrations/20260225000002_add_ingestion_trace.sql +2 -0
- package/supabase/migrations/20260225000003_add_baseline_configs.sql +35 -0
- package/supabase/migrations/20260226000000_add_processing_events.sql +26 -0
- package/supabase/migrations/20260226000001_add_ingestion_file_hash.sql +10 -0
- package/supabase/migrations/20260226000002_add_dynamic_rag.sql +150 -0
- package/supabase/migrations/20260226000003_add_ingestion_summary.sql +4 -0
- package/supabase/migrations/20260226000004_add_ingestion_tags.sql +7 -0
- package/supabase/migrations/20260226000005_add_chat_tables.sql +60 -0
- package/supabase/migrations/20260227000000_harden_chat_messages_rls.sql +25 -0
- package/supabase/migrations/20260228000000_add_vision_model_capabilities.sql +8 -0
- package/supabase/migrations/20260228000001_add_policy_match_feedback.sql +51 -0
- package/supabase/migrations/29991231235959_test_migration.sql +0 -0
- package/supabase/templates/confirmation.html +76 -0
- package/supabase/templates/email-change.html +76 -0
- package/supabase/templates/invite.html +72 -0
- package/supabase/templates/magic-link.html +68 -0
- package/supabase/templates/recovery.html +82 -0
- package/tsconfig.api.json +16 -0
- package/tsconfig.json +25 -0
- package/vite.config.ts +146 -0
|
@@ -0,0 +1,1187 @@
|
|
|
1
|
+
import type { SupabaseClient } from "@supabase/supabase-js";
|
|
2
|
+
import fs from "fs/promises";
|
|
3
|
+
import { PDFParse } from "pdf-parse";
|
|
4
|
+
import { createLogger } from "../utils/logger.js";
|
|
5
|
+
import { PolicyLoader } from "./PolicyLoader.js";
|
|
6
|
+
import type { FolioPolicy } from "./PolicyLoader.js";
|
|
7
|
+
import { PolicyEngine } from "./PolicyEngine.js";
|
|
8
|
+
import { PolicyLearningService } from "./PolicyLearningService.js";
|
|
9
|
+
import { BaselineConfigService } from "./BaselineConfigService.js";
|
|
10
|
+
import { Actuator } from "../utils/Actuator.js";
|
|
11
|
+
import { extractLlmResponse, previewLlmText } from "../utils/llmResponse.js";
|
|
12
|
+
import { RAGService } from "./RAGService.js";
|
|
13
|
+
import { SDKService } from "./SDKService.js";
|
|
14
|
+
import { ModelCapabilityService } from "./ModelCapabilityService.js";
|
|
15
|
+
|
|
16
|
+
const logger = createLogger("IngestionService");
|
|
17
|
+
|
|
18
|
+
/**
 * Decides whether the text pdf-parse pulled out of a PDF is good enough to
 * skip OCR and feed the document straight to the LLM (Fast Path).
 *
 * Every one of the following checks has to pass:
 *
 *  - Content:   >= 100 characters once runs of whitespace are collapsed to a
 *               single space and the ends are trimmed.
 *  - Words:     >= 20 tokens made of two or more consecutive Unicode letters
 *               (`\p{L}`), so documents made only of digits/symbols fail.
 *  - Garbage:   control characters plus U+FFFD may make up at most 2 % of the
 *               raw text; more than that is treated as a decoding failure.
 *  - Coverage:  only when `total > 2` and at least one page entry exists —
 *               at least 40 % of `total` pages must carry more than 30
 *               non-whitespace characters.
 *
 * @param pdfData Whole-document text, per-page text and the page count.
 * @returns `true` when all checks pass, otherwise `false`.
 */
function isPdfTextExtractable(pdfData: {
  text: string;
  pages: Array<{ num: number; text: string }>;
  total: number;
}): boolean {
  const fullText = pdfData.text ?? '';

  // Content check — measured on whitespace-normalised text.
  const collapsed = fullText.replace(/\s+/g, ' ').trim();
  if (collapsed.length < 100) return false;

  // Word check — a "word" is a run of two or more Unicode letters.
  const wordTokens = fullText.match(/\p{L}{2,}/gu);
  const wordCount = wordTokens === null ? 0 : wordTokens.length;
  if (wordCount < 20) return false;

  // Garbage check — ratio is taken against the raw (un-normalised) length.
  // eslint-disable-next-line no-control-regex
  const garbageMatches = fullText.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uFFFD]/g);
  const garbageCount = garbageMatches === null ? 0 : garbageMatches.length;
  if (fullText.length > 0 && garbageCount / fullText.length > 0.02) return false;

  // Coverage check — documents of one or two pages are exempt.
  const needsCoverageCheck = pdfData.total > 2 && pdfData.pages.length > 0;
  if (!needsCoverageCheck) return true;

  let pagesWithText = 0;
  for (const page of pdfData.pages) {
    const dense = (page.text ?? '').replace(/\s/g, '');
    if (dense.length > 30) pagesWithText += 1;
  }
  return pagesWithText / pdfData.total >= 0.4;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Lifecycle state of an ingestion row.
 *
 * As used by `IngestionService.ingest`: a row is inserted as "processing",
 * ends as "matched" / "no_match" / "error" on the Fast Path, is set to
 * "pending" when handed to the Heavy Path, and is inserted as "duplicate"
 * when the file hash matches an earlier "matched" ingestion.
 */
export type IngestionStatus = "pending" | "processing" | "matched" | "no_match" | "error" | "duplicate";

/** Where the document came from. `ingest` defaults this to "upload". */
export type IngestionSource = "upload" | "dropzone" | "email" | "url";

/** One row of the `ingestions` table as read and written by IngestionService. */
export interface Ingestion {
  id: string;
  user_id: string;
  source: IngestionSource;
  filename: string;
  mime_type?: string;
  file_size?: number;
  /** Content hash supplied by the caller; used for duplicate detection. */
  file_hash?: string;
  status: IngestionStatus;
  /** Id of the matched policy, when one matched. */
  policy_id?: string;
  /** Display name of the matched policy, when one matched. */
  policy_name?: string;
  /**
   * Extracted field values. For processed documents this is the baseline
   * entities overlaid with policy-specific fields; for duplicates it holds
   * `duplicate_of` and `original_filename`.
   */
  extracted?: Record<string, unknown>;
  actions_taken?: string[];
  error_message?: string;
  /** Path of the stored file (`filePath` passed to `ingest`). */
  storage_path?: string;
  /** Ordered processing steps recorded while the document was handled. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  trace?: Array<{ timestamp: string; step: string; details?: any }>;
  /** Tags produced by baseline extraction. */
  tags?: string[];
  summary?: string | null;
  /**
   * Id of the baseline config that was active when the document was
   * processed, or `null` when none was. Written by `ingest` on the Fast Path.
   */
  baseline_config_id?: string | null;
  // NOTE(review): presumably database-managed timestamps — confirm against the migration.
  created_at: string;
  updated_at: string;
}
|
|
90
|
+
|
|
91
|
+
export class IngestionService {
|
|
92
|
+
/**
 * Action types this service classifies as non-idempotent.
 * NOTE(review): presumably these must not be blindly repeated on a re-run —
 * the consumer of this set is outside the visible part of the file.
 */
private static readonly NON_IDEMPOTENT_ACTION_TYPES = new Set([
  "append_to_google_sheet",
  "webhook",
  "copy_to_gdrive",
  "copy",
  "log_csv",
  "notify",
]);

/**
 * Lists the distinct non-idempotent action types a policy declares.
 *
 * @param policy Policy whose `spec.actions` is inspected.
 * @returns Trimmed action type names found in NON_IDEMPOTENT_ACTION_TYPES,
 *          de-duplicated, in order of first appearance. Empty when the
 *          policy has no action array.
 */
private static listNonIdempotentPolicyActions(policy: FolioPolicy): string[] {
  const declared = policy.spec.actions;
  if (!Array.isArray(declared)) return [];

  const found = new Set<string>();
  for (const action of declared) {
    // Missing/blank types normalise to "" and are ignored.
    const kind = String(action?.type ?? "").trim();
    if (kind && this.NON_IDEMPOTENT_ACTION_TYPES.has(kind)) {
      found.add(kind);
    }
  }
  return Array.from(found);
}
|
|
107
|
+
|
|
108
|
+
/**
 * Renders an arbitrary extracted value as flat text.
 *
 * - `null` / `undefined` become "".
 * - Arrays are rendered element by element (recursively); empty renderings
 *   are dropped and the rest joined with ", ".
 * - Other objects are JSON-serialised, falling back to `String(value)` if
 *   serialisation throws.
 * - Everything else goes through `String(value)`.
 */
private static valueToSemanticText(value: unknown): string {
  if (value === null || value === undefined) return "";

  if (Array.isArray(value)) {
    const parts: string[] = [];
    for (const item of value) {
      const rendered = this.valueToSemanticText(item);
      if (rendered) parts.push(rendered);
    }
    return parts.join(", ");
  }

  if (typeof value !== "object") return String(value);

  try {
    return JSON.stringify(value);
  } catch {
    return String(value);
  }
}
|
|
125
|
+
|
|
126
|
+
/**
 * Builds a newline-separated text description of a VLM-processed document
 * from its metadata and extracted fields.
 *
 * Layout: filename, fixed source line, status, optional matched policy,
 * optional tags, up to 80 non-empty extracted fields (the `_enrichment` key
 * is excluded from that list), the key names of `_enrichment` when it is a
 * non-array object with keys, and a fixed closing line.
 *
 * @returns The assembled text.
 */
private static buildVlmSemanticText(opts: {
  filename: string;
  finalStatus: string;
  policyName?: string;
  extracted: Record<string, unknown>;
  tags: string[];
}): string {
  const out: string[] = [];
  out.push(`Document filename: ${opts.filename}`);
  out.push("Document source: VLM image extraction");
  out.push(`Processing status: ${opts.finalStatus}`);

  if (opts.policyName) out.push(`Matched policy: ${opts.policyName}`);
  if (opts.tags.length > 0) out.push(`Tags: ${opts.tags.join(", ")}`);

  // Collect at most 80 populated fields, in object key order.
  const rendered: string[] = [];
  for (const [key, value] of Object.entries(opts.extracted)) {
    if (rendered.length >= 80) break;
    if (key === "_enrichment") continue;
    const text = this.valueToSemanticText(value).trim();
    if (text.length === 0) continue;
    rendered.push(`- ${key}: ${text}`);
  }

  if (rendered.length === 0) {
    out.push("Extracted fields: none");
  } else {
    out.push("Extracted fields:", ...rendered);
  }

  // Only the key names of the enrichment object are included, not its values.
  const enrichment = opts.extracted["_enrichment"];
  const isEnrichmentObject =
    Boolean(enrichment) && typeof enrichment === "object" && !Array.isArray(enrichment);
  if (isEnrichmentObject) {
    const names = Object.keys(enrichment as Record<string, unknown>);
    if (names.length > 0) out.push(`Enrichment fields: ${names.join(", ")}`);
  }

  out.push("Synthetic semantic text generated from VLM output for retrieval.");
  return out.join("\n");
}
|
|
172
|
+
|
|
173
|
+
/**
 * Counts extracted fields that render to non-blank text.
 * The `_enrichment` key is never counted.
 */
private static countExtractedSemanticFields(extracted: Record<string, unknown>): number {
  let populated = 0;
  for (const [key, value] of Object.entries(extracted)) {
    if (key === "_enrichment") continue;
    if (this.valueToSemanticText(value).trim().length > 0) populated += 1;
  }
  return populated;
}
|
|
180
|
+
|
|
181
|
+
/**
 * Starts (without awaiting) an embedding run over synthetic text derived
 * from a VLM extraction, and records "queued" / "completed" / "failed"
 * events through `Actuator.logEvent`.
 *
 * @returns Size metadata about the synthetic text, available immediately;
 *          the embedding itself finishes later in the background.
 */
private static queueVlmSemanticEmbedding(opts: {
  ingestionId: string;
  userId: string;
  filename: string;
  finalStatus: string;
  policyName?: string;
  extracted: Record<string, unknown>;
  tags: string[];
  supabase: SupabaseClient;
  embedSettings: { embedding_provider?: string; embedding_model?: string };
}): { synthetic_chars: number; extracted_fields: number; tags_count: number } {
  const { ingestionId, userId, supabase, embedSettings, extracted, tags } = opts;

  const syntheticText = this.buildVlmSemanticText({
    filename: opts.filename,
    finalStatus: opts.finalStatus,
    policyName: opts.policyName,
    extracted,
    tags,
  });
  const details = {
    synthetic_chars: syntheticText.length,
    extracted_fields: this.countExtractedSemanticFields(extracted),
    tags_count: tags.length,
  };

  Actuator.logEvent(
    ingestionId,
    userId,
    "analysis",
    "RAG Embedding",
    { action: "Queued synthetic VLM embedding", ...details },
    supabase
  );

  // Deliberately not awaited: the caller gets `details` back right away and
  // the outcome is reported through events only.
  const embedding = RAGService.chunkAndEmbed(ingestionId, userId, syntheticText, supabase, embedSettings);
  embedding
    .then(() => {
      Actuator.logEvent(
        ingestionId,
        userId,
        "analysis",
        "RAG Embedding",
        { action: "Completed synthetic VLM embedding", ...details },
        supabase
      );
    })
    .catch((err) => {
      logger.error(`RAG embedding failed for synthetic VLM text ${ingestionId}`, err);
      const errorMessage = err instanceof Error ? err.message : String(err);
      Actuator.logEvent(
        ingestionId,
        userId,
        "error",
        "RAG Embedding",
        { action: "Synthetic VLM embedding failed", error: errorMessage, ...details },
        supabase
      );
    });

  return details;
}
|
|
233
|
+
|
|
234
|
+
/**
 * Ingest a document using Hybrid Routing Architecture.
 *
 * Steps:
 *  1. Duplicate detection — when `fileHash` is given and an earlier "matched"
 *     ingestion of the same user has the same hash, a "duplicate" row is
 *     inserted and returned; nothing else runs.
 *  2. A "processing" row is inserted.
 *  3. Triage picks a path:
 *     - txt / md / csv / json              -> Fast Path with `content`.
 *     - png / jpg / jpeg / webp            -> Fast Path with an inline
 *       `[VLM_IMAGE_DATA:...]` marker when the configured model should be
 *       tried for vision, otherwise Heavy Path.
 *     - pdf                                -> Fast Path with the parsed text
 *       when `isPdfTextExtractable` accepts it, otherwise Heavy Path.
 *     - anything else, or any read/parse failure -> Heavy Path.
 *  4. Fast Path: baseline extraction, then policy matching, then the row is
 *     updated with the merged result. A failure on the VLM variant is
 *     recorded via `ModelCapabilityService.learnVisionFailure` and falls
 *     through to the Heavy Path; any other failure marks the row "error".
 *  5. Heavy Path: an `rtx_activities` row is inserted and the ingestion is
 *     set to "pending". If that insert fails the ingestion is marked "error"
 *     instead, because nothing would ever pick it up.
 *
 * @param opts.supabase  Client used for every read and write.
 * @param opts.userId    Owner of the ingestion.
 * @param opts.filename  Original file name; its extension drives triage.
 * @param opts.mimeType  Optional MIME type (used for the image data URL).
 * @param opts.fileSize  Optional size, stored as-is.
 * @param opts.source    Origin of the document; defaults to "upload".
 * @param opts.filePath  Location of the file on disk; also stored as `storage_path`.
 * @param opts.content   Text content used for plain-text file types.
 * @param opts.fileHash  Optional content hash enabling duplicate detection.
 * @returns The ingestion row in its final state for this call.
 * @throws Error when the ingestion (or duplicate) row cannot be inserted.
 */
static async ingest(opts: {
  supabase: SupabaseClient;
  userId: string;
  filename: string;
  mimeType?: string;
  fileSize?: number;
  source?: IngestionSource;
  filePath: string;
  content: string;
  fileHash?: string;
}): Promise<Ingestion> {
  const { supabase, userId, filename, mimeType, fileSize, source = "upload", filePath, content, fileHash } = opts;

  // Duplicate detection — check if this exact file content was already ingested
  if (fileHash) {
    const { data: existing } = await supabase
      .from("ingestions")
      .select("id, filename, created_at")
      .eq("user_id", userId)
      .eq("file_hash", fileHash)
      .eq("status", "matched")
      .order("created_at", { ascending: true })
      .limit(1)
      .maybeSingle();

    if (existing) {
      logger.info(`Duplicate file detected: '${filename}' matches ingestion ${existing.id} ('${existing.filename}')`);
      const { data: dupIngestion, error: dupErr } = await supabase
        .from("ingestions")
        .insert({
          user_id: userId,
          source,
          filename,
          mime_type: mimeType,
          file_size: fileSize,
          storage_path: filePath,
          file_hash: fileHash,
          status: "duplicate",
          extracted: { duplicate_of: existing.id, original_filename: existing.filename },
        })
        .select()
        .single();

      // Same contract as the main insert below: never hand back a null row.
      if (dupErr || !dupIngestion) {
        throw new Error(`Failed to create duplicate ingestion record: ${dupErr?.message}`);
      }
      return dupIngestion as Ingestion;
    }
  }

  // 1. Insert into ingestions
  const { data: ingestion, error: insertErr } = await supabase
    .from("ingestions")
    .insert({
      user_id: userId,
      source,
      filename,
      mime_type: mimeType,
      file_size: fileSize,
      storage_path: filePath,
      file_hash: fileHash ?? null,
      status: "processing"
    })
    .select()
    .single();

  if (insertErr || !ingestion) throw new Error(`Failed to create ingestion record: ${insertErr?.message}`);

  logger.info(`Processing ingestion ${ingestion.id}: ${filename}`);
  Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Ingestion started", source, filename, fileSize }, supabase);

  // 2. Document Triage
  let isFastPath = false;
  let isVlmFastPath = false;
  let extractionContent = content;
  const ext = filename.toLowerCase().split('.').pop() || '';
  const fastExts = ['txt', 'md', 'csv', 'json'];
  const imageExts = ['png', 'jpg', 'jpeg', 'webp'];

  // Pre-fetch settings to decide whether we should attempt VLM.
  const { data: triageSettingsRow } = await supabase
    .from("user_settings")
    .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
    .eq("user_id", userId)
    .maybeSingle();
  const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
  const llmModel = visionResolution.model;
  const llmProvider = visionResolution.provider;

  if (fastExts.includes(ext)) {
    isFastPath = true;
  } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
    try {
      const buffer = await fs.readFile(filePath);
      const base64 = buffer.toString('base64');
      const mimeTypeActual = mimeType || `image/${ext === 'jpg' ? 'jpeg' : ext}`;
      // Special marker for PolicyEngine
      extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
      isFastPath = true;
      isVlmFastPath = true;
      logger.info(`Smart Triage: Image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
      Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
    } catch (err) {
      // Both flags are still false here, so the image ends up on the Heavy Path.
      logger.warn(`Failed to read VLM image ${filename}. Routing to Heavy Path.`, { err });
    }
  } else if (imageExts.includes(ext)) {
    logger.info(`Smart Triage: Image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
    Actuator.logEvent(ingestion.id, userId, "info", "Triage", {
      action: "VLM skipped (model marked unsupported)",
      type: ext,
      model: llmModel,
      provider: llmProvider
    }, supabase);
  } else if (ext === 'pdf') {
    try {
      const buffer = await fs.readFile(filePath);
      const parser = new PDFParse({ data: buffer });
      const pdfData = await parser.getText();
      if (isPdfTextExtractable(pdfData)) {
        isFastPath = true;
        extractionContent = pdfData.text;
        // Count pages the same way the classifier does (non-whitespace chars,
        // null-safe) so the logged figure matches the decision that was made.
        const pagesWithText = pdfData.pages.filter(
          (p) => (p.text ?? '').replace(/\s/g, '').length > 30
        ).length;
        logger.info(`Smart Triage: PDF ${filename} passed text quality check (${pagesWithText}/${pdfData.total} pages with text). Routing to Fast Path.`);
        Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage passed", type: "pdf", fast_path: true }, supabase);
      } else {
        logger.info(`Smart Triage: PDF ${filename} failed text quality check. Routing to Heavy Path.`);
        Actuator.logEvent(ingestion.id, userId, "info", "Triage", { action: "Smart Triage failed", type: "pdf", fast_path: false }, supabase);
      }
    } catch (err) {
      logger.warn(`Failed to parse PDF ${filename}. Routing to Heavy Path.`, { err });
      Actuator.logEvent(ingestion.id, userId, "error", "Triage", { action: "PDF parse failed", error: String(err) }, supabase);
    }
  }

  if (isFastPath) {
    try {
      // 3. Fast Path — fetch all dependencies in parallel
      const [userPolicies, processingSettingsRow, baselineConfig] = await Promise.all([
        PolicyLoader.load(false, supabase),
        supabase.from("user_settings").select("llm_provider, llm_model, embedding_provider, embedding_model").eq("user_id", userId).maybeSingle(),
        BaselineConfigService.getActive(supabase, userId),
      ]);
      const llmSettings = {
        llm_provider: processingSettingsRow.data?.llm_provider ?? undefined,
        llm_model: processingSettingsRow.data?.llm_model ?? undefined,
      };
      const embedSettings = {
        embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
        embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
      };
      const doc = { filePath: filePath, text: extractionContent, ingestionId: ingestion.id, userId, supabase };
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];

      // Fire and forget Semantic Embedding Storage.
      // Skipped on the VLM path: there `doc.text` is the base64 image marker,
      // not prose; VLM documents are embedded later from synthetic text via
      // queueVlmSemanticEmbedding.
      if (!isVlmFastPath) {
        RAGService.chunkAndEmbed(ingestion.id, userId, doc.text, supabase, embedSettings).catch(err => {
          logger.error(`RAG embedding failed for ${ingestion.id}`, err);
        });
      }

      // 4. Stage 1: Baseline extraction (always runs, LLM call 1 of max 2)
      baselineTrace.push({
        timestamp: new Date().toISOString(),
        step: "LLM request (baseline extraction)",
        details: {
          provider: llmSettings.llm_provider ?? llmProvider,
          model: llmSettings.llm_model ?? llmModel,
          mode: isVlmFastPath ? "vision" : "text",
        }
      });

      const baselineResult = await PolicyEngine.extractBaseline(
        doc,
        { context: baselineConfig?.context, fields: baselineConfig?.fields },
        llmSettings
      );
      const baselineEntities = baselineResult.entities;
      const autoTags = baselineResult.tags;
      baselineTrace.push({
        timestamp: new Date().toISOString(),
        step: "LLM response (baseline extraction)",
        details: {
          entities_count: Object.keys(baselineEntities).length,
          uncertain_count: baselineResult.uncertain_fields.length,
          tags_count: autoTags.length,
        }
      });

      // Enrich the document with extracted entities so policy keyword/semantic
      // conditions can match against semantic field values (e.g. document_type:
      // "invoice") even when those exact words don't appear in the raw text.
      const entityLines = Object.entries(baselineEntities)
        .filter(([, v]) => v != null)
        .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
      const enrichedDoc = entityLines.length > 0
        ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
        : doc;

      // 5. Stage 2: Policy matching + policy-specific field extraction
      let result;
      if (userPolicies.length > 0) {
        result = await PolicyEngine.processWithPolicies(enrichedDoc, userPolicies, llmSettings, baselineEntities);
      } else {
        result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
      }

      const policyName = userPolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name;
      // The engine's "fallback" outcome is stored as "no_match".
      const finalStatus = result.status === "fallback" ? "no_match" : result.status;

      // Merge: baseline entities are the foundation; policy-specific fields
      // are overlaid on top so more precise extractions take precedence.
      const mergedExtracted = { ...baselineEntities, ...result.extractedData };
      let finalTrace = [...baselineTrace, ...(result.trace || [])];

      const resultPatch = {
        status: finalStatus,
        policy_id: result.matchedPolicy,
        policy_name: policyName,
        extracted: mergedExtracted,
        actions_taken: result.actionsExecuted,
        trace: finalTrace,
        tags: autoTags,
        baseline_config_id: baselineConfig?.id ?? null,
      };
      const { data: updatedIngestion, error: updateErr } = await supabase
        .from("ingestions")
        .update(resultPatch)
        .eq("id", ingestion.id)
        .select()
        .single();

      if (updateErr) {
        // Not thrown: processing (and possibly actions) already happened, and
        // on the VLM path a throw here would be misread as a vision failure.
        logger.error(`Failed to persist result for ingestion ${ingestion.id}`, { updateErr });
      }

      if (isVlmFastPath) {
        const embeddingMeta = this.queueVlmSemanticEmbedding({
          ingestionId: ingestion.id,
          userId,
          filename,
          finalStatus,
          policyName,
          extracted: mergedExtracted,
          tags: autoTags,
          supabase,
          embedSettings,
        });
        finalTrace = [
          ...finalTrace,
          {
            timestamp: new Date().toISOString(),
            step: "Queued synthetic VLM embedding",
            details: embeddingMeta,
          }
        ];
        await supabase
          .from("ingestions")
          .update({ trace: finalTrace })
          .eq("id", ingestion.id);

        await ModelCapabilityService.learnVisionSuccess({
          supabase,
          userId,
          provider: llmSettings.llm_provider ?? llmProvider,
          model: llmSettings.llm_model ?? llmModel,
        });
      }

      // Fall back to a locally patched copy so callers never receive null.
      return (updatedIngestion ?? { ...ingestion, ...resultPatch }) as Ingestion;

    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);

      if (isVlmFastPath) {
        const learnedState = await ModelCapabilityService.learnVisionFailure({
          supabase,
          userId,
          provider: llmProvider,
          model: llmModel,
          error: err,
        });
        logger.warn(`VLM extraction failed for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
        Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
          action: "VLM Failed, Fallback to Heavy",
          error: msg,
          learned_state: learnedState,
        }, supabase);
        // Fall back to Heavy Path
        isFastPath = false;
      } else {
        Actuator.logEvent(ingestion.id, userId, "error", "Processing", { error: msg }, supabase);
        const failurePatch = { status: "error", error_message: msg };
        const { data: updatedIngestion, error: updateErr } = await supabase
          .from("ingestions")
          .update(failurePatch)
          .eq("id", ingestion.id)
          .select()
          .single();
        if (updateErr) {
          logger.error(`Failed to persist error state for ingestion ${ingestion.id}`, { updateErr });
        }
        return (updatedIngestion ?? { ...ingestion, ...failurePatch }) as Ingestion;
      }
    }
  }

  // 4. Heavy Path (Delegate to RealTimeX)
  const { error: rtxErr } = await supabase
    .from("rtx_activities")
    .insert({
      user_id: userId,
      status: "pending", // Waiting for RealTimeX
      raw_data: {
        source,
        filename,
        mime_type: mimeType,
        file_size: fileSize,
        file_path: filePath,
        ingestion_id: ingestion.id
      }
    });

  if (rtxErr) {
    logger.error(`Failed to delegate to rtx_activities`, { rtxErr });
    // No activity row exists, so leaving the ingestion "pending" would strand
    // it; surface the failure instead.
    const delegationMsg = `Failed to delegate to RealTimeX: ${rtxErr.message}`;
    Actuator.logEvent(ingestion.id, userId, "error", "Processing", {
      action: "Heavy Path delegation failed",
      error: delegationMsg,
    }, supabase);
    const delegationPatch = { status: "error", error_message: delegationMsg };
    const { data: failedIngestion, error: updateErr } = await supabase
      .from("ingestions")
      .update(delegationPatch)
      .eq("id", ingestion.id)
      .select()
      .single();
    if (updateErr) {
      logger.error(`Failed to persist error state for ingestion ${ingestion.id}`, { updateErr });
    }
    return (failedIngestion ?? { ...ingestion, ...delegationPatch }) as Ingestion;
  }

  const pendingPatch = { status: "pending" }; // UI shows pending
  const { data: pendingIngestion, error: pendingErr } = await supabase
    .from("ingestions")
    .update(pendingPatch)
    .eq("id", ingestion.id)
    .select()
    .single();
  if (pendingErr) {
    logger.error(`Failed to mark ingestion ${ingestion.id} as pending`, { pendingErr });
  }

  return (pendingIngestion ?? { ...ingestion, ...pendingPatch }) as Ingestion;
}
|
|
559
|
+
|
|
560
|
+
/**
 * Re-run an existing ingestion against the currently loaded policies.
 *
 * Steps, in order:
 *  1. Load the ingestion row (scoped to `userId`) and verify it has a `storage_path`.
 *  2. Reset the row to `processing` and clear the previous policy/extraction results.
 *  3. Triage by file extension: plain-text extensions and text-extractable PDFs go
 *     to the Fast Path; images go to the VLM Fast Path when
 *     `ModelCapabilityService.resolveVisionSupport` says to attempt it.
 *  4. Fast Path: run baseline extraction and policy processing, then persist the
 *     result (status, extracted fields, merged tags, appended trace).
 *  5. Everything else, and a failed VLM attempt, is delegated through an
 *     `rtx_activities` row and the ingestion is marked `pending`.
 *
 * @param ingestionId Id of the ingestion to re-run.
 * @param supabase Supabase client used for all reads and writes.
 * @param userId Owner of the ingestion; the initial lookup is filtered by it.
 * @param opts.forcedPolicyId When set, only the policy with this id is considered.
 * @returns `true` when the Fast Path finished with status `matched`, or when the
 *          ingestion was delegated to the Heavy Path; `false` when the Fast Path
 *          finished with any other status.
 * @throws Error when the ingestion is not found, has no storage path, or when
 *         non-VLM Fast Path processing fails (the row is marked `error` first).
 */
static async rerun(
    ingestionId: string,
    supabase: SupabaseClient,
    userId: string,
    opts: { forcedPolicyId?: string } = {}
): Promise<boolean> {
    const { data: ingestion, error } = await supabase
        .from("ingestions")
        .select("*")
        .eq("id", ingestionId)
        .eq("user_id", userId)
        .single();

    if (error || !ingestion) throw new Error("Ingestion not found");

    // Validate BEFORE mutating the row: a record without a stored file cannot be
    // re-run, so it must not be wiped and left in "processing".
    const filename = ingestion.filename;
    const filePath = ingestion.storage_path;
    if (!filePath) throw new Error("No storage path found for this ingestion");

    await supabase
        .from("ingestions")
        .update({ status: "processing", error_message: null, policy_id: null, policy_name: null, extracted: {}, actions_taken: [], summary: null })
        .eq("id", ingestionId);

    Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "Re-run Initiated" }, supabase);

    let isFastPath = false;
    let isVlmFastPath = false;
    let extractionContent = "";
    const ext = filename.toLowerCase().split('.').pop() || '';
    const fastExts = ['txt', 'md', 'csv', 'json'];
    const imageExts = ['png', 'jpg', 'jpeg', 'webp'];

    const { data: triageSettingsRow } = await supabase
        .from("user_settings")
        .select("llm_provider, llm_model, embedding_provider, embedding_model, vision_model_capabilities")
        .eq("user_id", userId)
        .maybeSingle();
    const visionResolution = ModelCapabilityService.resolveVisionSupport(triageSettingsRow);
    const llmModel = visionResolution.model;
    const llmProvider = visionResolution.provider;

    if (fastExts.includes(ext)) {
        isFastPath = true;
        extractionContent = await fs.readFile(filePath, "utf-8");
    } else if (imageExts.includes(ext) && visionResolution.shouldAttempt) {
        try {
            const buffer = await fs.readFile(filePath);
            const base64 = buffer.toString('base64');
            const mimeTypeActual = `image/${ext === 'jpg' ? 'jpeg' : ext}`;
            // The image travels to the engine as an inline data-URL marker in the text.
            extractionContent = `[VLM_IMAGE_DATA:data:${mimeTypeActual};base64,${base64}]`;
            isFastPath = true;
            isVlmFastPath = true;
            logger.info(`Smart Triage: Re-run image ${filename} routed to Fast Path using native VLM (${llmModel}).`);
            Actuator.logEvent(ingestionId, userId, "info", "Triage", { action: "VLM Fast Path selected", type: ext, model: llmModel }, supabase);
        } catch (err) {
            logger.warn(`Failed to read VLM image ${filename} during rerun. Routing to Heavy Path.`, { err });
        }
    } else if (imageExts.includes(ext)) {
        logger.info(`Smart Triage: Re-run image ${filename} kept on Heavy Path because ${llmProvider}/${llmModel} is marked vision-unsupported.`);
        Actuator.logEvent(ingestionId, userId, "info", "Triage", {
            action: "VLM skipped (model marked unsupported)",
            type: ext,
            model: llmModel,
            provider: llmProvider
        }, supabase);
    } else if (ext === 'pdf') {
        try {
            const buffer = await fs.readFile(filePath);
            const parser = new PDFParse({ data: buffer });
            const pdfData = await parser.getText();
            if (isPdfTextExtractable(pdfData)) {
                isFastPath = true;
                extractionContent = pdfData.text;
            }
        } catch (err) {
            // Best effort: an unreadable PDF still goes to the Heavy Path, but leave a trail.
            logger.warn(`Failed to parse PDF ${filename} during rerun. Routing to Heavy Path.`, { err });
        }
    }

    if (isFastPath) {
        const [userPolicies, processingSettingsRow, baselineConfig] = await Promise.all([
            PolicyLoader.load(false, supabase),
            supabase.from("user_settings").select("llm_provider, llm_model, embedding_provider, embedding_model").eq("user_id", userId).maybeSingle(),
            BaselineConfigService.getActive(supabase, userId),
        ]);
        const llmSettings = {
            llm_provider: processingSettingsRow.data?.llm_provider ?? undefined,
            llm_model: processingSettingsRow.data?.llm_model ?? undefined,
        };
        const embedSettings = {
            embedding_provider: processingSettingsRow.data?.embedding_provider ?? undefined,
            embedding_model: processingSettingsRow.data?.embedding_model ?? undefined,
        };
        const doc = { filePath, text: extractionContent, ingestionId, userId, supabase };
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const baselineTrace: Array<{ timestamp: string; step: string; details?: any }> = [];

        // Fire and forget Semantic Embedding Storage for re-runs
        // NOTE(review): on the VLM path doc.text is the base64 image marker; confirm
        // RAGService.chunkAndEmbed is expected to receive that payload.
        RAGService.chunkAndEmbed(ingestionId, userId, doc.text, supabase, embedSettings).catch(err => {
            logger.error(`RAG embedding failed during rerun for ${ingestionId}`, err);
        });

        baselineTrace.push({
            timestamp: new Date().toISOString(),
            step: "LLM request (baseline extraction)",
            details: {
                provider: llmSettings.llm_provider ?? llmProvider,
                model: llmSettings.llm_model ?? llmModel,
                mode: isVlmFastPath ? "vision" : "text",
            }
        });

        const baselineResult = await PolicyEngine.extractBaseline(
            doc,
            { context: baselineConfig?.context, fields: baselineConfig?.fields },
            llmSettings
        );
        const baselineEntities = baselineResult.entities;
        const autoTags = baselineResult.tags;
        baselineTrace.push({
            timestamp: new Date().toISOString(),
            step: "LLM response (baseline extraction)",
            details: {
                entities_count: Object.keys(baselineEntities).length,
                uncertain_count: baselineResult.uncertain_fields.length,
                tags_count: autoTags.length,
            }
        });

        // Append the baseline entities to the text handed to policy processing.
        const entityLines = Object.entries(baselineEntities)
            .filter(([, v]) => v != null)
            .map(([k, v]) => `${k}: ${Array.isArray(v) ? (v as unknown[]).join(", ") : String(v)}`);
        const enrichedDoc = entityLines.length > 0
            ? { ...doc, text: doc.text + "\n\n[Extracted fields]\n" + entityLines.join("\n") }
            : doc;

        let finalStatus = "no_match";
        let result: import("./PolicyEngine.js").ProcessingResult;
        let policyName;
        try {
            const forcedPolicyId = opts.forcedPolicyId?.trim();
            const activePolicies = forcedPolicyId
                ? userPolicies.filter((policy) => policy.metadata.id === forcedPolicyId)
                : userPolicies;

            if (forcedPolicyId && activePolicies.length === 0) {
                throw new Error(`Policy "${forcedPolicyId}" was not found or is disabled.`);
            }

            if (activePolicies.length > 0) {
                result = await PolicyEngine.processWithPolicies(
                    enrichedDoc,
                    activePolicies,
                    llmSettings,
                    baselineEntities,
                    {
                        ...(forcedPolicyId ? { forcedPolicyId } : {}),
                        allowLearnedFallback: !forcedPolicyId,
                    }
                );
            } else {
                result = await PolicyEngine.process(enrichedDoc, llmSettings, baselineEntities);
            }

            policyName = result.matchedPolicy ? activePolicies.find((p) => p.metadata.id === result.matchedPolicy)?.metadata.name : undefined;
            // "fallback" is stored as "no_match".
            finalStatus = result.status === "fallback" ? "no_match" : result.status;
            const mergedExtracted = { ...baselineEntities, ...result.extractedData };

            // Preserve any human-added tags; merge with freshly generated auto-tags.
            const existingTags: string[] = Array.isArray(ingestion.tags) ? ingestion.tags : [];
            const mergedTags = [...new Set([...autoTags, ...existingTags])];
            let rerunTrace = [
                ...(ingestion.trace || []),
                { timestamp: new Date().toISOString(), step: "--- Re-run Initiated ---" },
                ...baselineTrace,
                ...(result.trace || [])
            ];

            await supabase
                .from("ingestions")
                .update({
                    status: finalStatus,
                    policy_id: result.matchedPolicy,
                    policy_name: policyName,
                    extracted: mergedExtracted,
                    actions_taken: result.actionsExecuted,
                    trace: rerunTrace,
                    tags: mergedTags,
                    baseline_config_id: baselineConfig?.id ?? null,
                })
                .eq("id", ingestionId);

            if (isVlmFastPath) {
                const embeddingMeta = this.queueVlmSemanticEmbedding({
                    ingestionId,
                    userId,
                    filename,
                    finalStatus,
                    policyName,
                    extracted: mergedExtracted,
                    tags: mergedTags,
                    supabase,
                    embedSettings,
                });
                rerunTrace = [
                    ...rerunTrace,
                    {
                        timestamp: new Date().toISOString(),
                        step: "Queued synthetic VLM embedding",
                        details: embeddingMeta,
                    }
                ];
                await supabase
                    .from("ingestions")
                    .update({ trace: rerunTrace })
                    .eq("id", ingestionId);

                await ModelCapabilityService.learnVisionSuccess({
                    supabase,
                    userId,
                    provider: llmSettings.llm_provider ?? llmProvider,
                    model: llmSettings.llm_model ?? llmModel,
                });
            }

            return finalStatus === "matched";
        } catch (err: unknown) {
            const msg = err instanceof Error ? err.message : String(err);
            if (isVlmFastPath) {
                const learnedState = await ModelCapabilityService.learnVisionFailure({
                    supabase,
                    userId,
                    provider: llmProvider,
                    model: llmModel,
                    error: err,
                });
                logger.warn(`VLM extraction failed during rerun for ${filename}. Falling back to Heavy Path. Error: ${msg}`);
                Actuator.logEvent(ingestionId, userId, "error", "Processing", {
                    action: "VLM Failed, Fallback to Heavy",
                    error: msg,
                    learned_state: learnedState,
                }, supabase);
                isFastPath = false; // Trigger heavy path fallthrough
            } else {
                // Record the failure so the row is not left in "processing" forever;
                // matchToPolicy refuses to touch rows that are still "processing".
                await supabase
                    .from("ingestions")
                    .update({ status: "error", error_message: msg })
                    .eq("id", ingestionId);
                throw err; // Re-throw to caller
            }
        }
    }

    // Re-delegate to rtx_activities
    const { error: rtxErr } = await supabase
        .from("rtx_activities")
        .insert({
            user_id: userId,
            status: "pending",
            raw_data: {
                source: ingestion.source,
                filename,
                mime_type: ingestion.mime_type,
                file_size: ingestion.file_size,
                file_path: filePath,
                ingestion_id: ingestion.id
            }
        });

    if (rtxErr) {
        logger.error(`Failed to delegate to rtx_activities`, { rtxErr });
    }

    await supabase
        .from("ingestions")
        .update({ status: "pending" })
        .eq("id", ingestionId);

    return true;
}
|
|
839
|
+
|
|
840
|
+
/**
 * Manually bind an ingestion to a policy.
 *
 * With `opts.rerun` (default on) the ingestion is re-processed with the chosen
 * policy forced and then re-read; otherwise the row is simply stamped as
 * `matched` and a trace entry is appended. With `opts.learn` (default on) the
 * match is handed to `PolicyLearningService.recordManualMatch`.
 *
 * @param ingestionId Ingestion to match.
 * @param policyId Target policy id (surrounding whitespace is trimmed).
 * @param supabase Supabase client used for reads and writes.
 * @param userId Owner of the ingestion; lookups are filtered by it.
 * @param opts.learn Record the match as learning feedback unless explicitly `false`.
 * @param opts.rerun Re-process with the policy unless explicitly `false`.
 * @param opts.allowSideEffects Must be exactly `true` to re-run a policy that has
 *        non-idempotent actions.
 * @returns The ingestion as it stands after the match.
 * @throws Error on an empty policy id, unknown ingestion or policy, an ingestion
 *         that is still `processing`/`pending`, unconfirmed side effects, or a
 *         failed update.
 */
static async matchToPolicy(
    ingestionId: string,
    policyId: string,
    supabase: SupabaseClient,
    userId: string,
    opts: { learn?: boolean; rerun?: boolean; allowSideEffects?: boolean } = {}
): Promise<Ingestion> {
    // Both flags default to on; side effects default to off.
    const shouldLearn = opts.learn !== false;
    const shouldRerun = opts.rerun !== false;
    const sideEffectsConfirmed = opts.allowSideEffects === true;

    const wantedPolicyId = policyId.trim();
    if (!wantedPolicyId) {
        throw new Error("policy_id is required");
    }

    const lookup = await supabase
        .from("ingestions")
        .select("*")
        .eq("id", ingestionId)
        .eq("user_id", userId)
        .single();
    const ingestion = lookup.data;
    if (lookup.error || !ingestion) {
        throw new Error("Ingestion not found");
    }

    const busy = ingestion.status === "processing" || ingestion.status === "pending";
    if (busy) {
        throw new Error("Cannot manually match while ingestion is still processing");
    }

    const loadedPolicies = await PolicyLoader.load(false, supabase);
    const policy = loadedPolicies.find((candidate) => candidate.metadata.id === wantedPolicyId);
    if (!policy) {
        throw new Error(`Policy "${wantedPolicyId}" was not found or is disabled.`);
    }

    // Re-running executes the policy's actions again, so require explicit consent
    // when any of them is not idempotent.
    const riskyActions = this.listNonIdempotentPolicyActions(policy);
    if (shouldRerun && !sideEffectsConfirmed && riskyActions.length > 0) {
        throw new Error(
            `Re-running this policy may trigger side-effect actions (${riskyActions.join(", ")}). Confirm allow_side_effects=true to continue.`
        );
    }

    const matchedId = policy.metadata.id;
    const matchedName = policy.metadata.name;

    let effectiveIngestion: Ingestion;
    if (!shouldRerun) {
        // Stamp-only path: keep existing results, append an audit entry to the trace.
        const priorTrace = Array.isArray(ingestion.trace) ? ingestion.trace : [];
        const overrideEntry = {
            timestamp: new Date().toISOString(),
            step: "Manual policy match override",
            details: {
                policyId: matchedId,
                policyName: matchedName,
                learn: shouldLearn,
                rerun: shouldRerun,
            }
        };

        const stamped = await supabase
            .from("ingestions")
            .update({
                status: "matched",
                policy_id: matchedId,
                policy_name: matchedName,
                error_message: null,
                trace: [...priorTrace, overrideEntry],
            })
            .eq("id", ingestionId)
            .eq("user_id", userId)
            .select("*")
            .single();

        if (stamped.error || !stamped.data) {
            throw new Error(`Failed to update ingestion policy match: ${stamped.error?.message ?? "unknown error"}`);
        }
        effectiveIngestion = stamped.data as Ingestion;
    } else {
        Actuator.logEvent(ingestionId, userId, "info", "Policy Matching", {
            action: "Manual match requested with rerun",
            policyId: matchedId,
            policyName: matchedName,
            learn: shouldLearn,
            risky_actions: riskyActions,
        }, supabase);

        await this.rerun(ingestionId, supabase, userId, { forcedPolicyId: matchedId });

        // rerun only reports a boolean, so re-read the row to get its new state.
        const reloaded = await this.get(ingestionId, supabase, userId);
        if (!reloaded) {
            throw new Error("Ingestion not found after rerun.");
        }
        effectiveIngestion = reloaded;
    }

    const auditAction = shouldRerun
        ? "Manual policy match override + rerun"
        : "Manual policy match override";
    Actuator.logEvent(ingestionId, userId, "info", "Policy Matching", {
        action: auditAction,
        policyId: matchedId,
        policyName: matchedName,
        learn: shouldLearn,
        rerun: shouldRerun,
    }, supabase);

    if (shouldLearn) {
        await PolicyLearningService.recordManualMatch({
            supabase,
            userId,
            ingestion: effectiveIngestion,
            policyId: matchedId,
            policyName: matchedName,
        });
    }

    return effectiveIngestion;
}
|
|
958
|
+
|
|
959
|
+
/**
 * Produce a refinement draft for an existing policy, using one ingestion as
 * evidence. Nothing is persisted here; the draft is returned for the user to
 * review.
 *
 * @param ingestionId Ingestion whose stored fields serve as evidence.
 * @param policyId Policy to refine (surrounding whitespace is trimmed).
 * @param supabase Supabase client used for the lookups and passed to the engine.
 * @param userId Owner of the ingestion; the lookup is filtered by it.
 * @param opts Optional provider/model override forwarded to the engine.
 * @returns The suggested policy and the rationale lines reported by the engine.
 * @throws Error on an empty policy id, unknown ingestion or policy, or when the
 *         engine returns no policy.
 */
static async suggestPolicyRefinement(
    ingestionId: string,
    policyId: string,
    supabase: SupabaseClient,
    userId: string,
    opts: { provider?: string; model?: string } = {}
): Promise<{ policy: FolioPolicy; rationale: string[] }> {
    const wantedPolicyId = policyId.trim();
    if (!wantedPolicyId) {
        throw new Error("policy_id is required");
    }

    // Only the columns used as evidence are fetched.
    const lookup = await supabase
        .from("ingestions")
        .select("id,filename,mime_type,status,tags,summary,extracted,trace")
        .eq("id", ingestionId)
        .eq("user_id", userId)
        .single();
    const ingestion = lookup.data;

    if (lookup.error || !ingestion) {
        throw new Error("Ingestion not found");
    }

    const loadedPolicies = await PolicyLoader.load(false, supabase);
    const targetPolicy = loadedPolicies.find((candidate) => candidate.metadata.id === wantedPolicyId);
    if (!targetPolicy) {
        throw new Error(`Policy "${wantedPolicyId}" was not found or is disabled.`);
    }

    // Normalise nullable / loosely typed columns into the shape the engine takes.
    const evidence = {
        ingestionId,
        filename: ingestion.filename as string,
        mimeType: (ingestion.mime_type as string | null | undefined) ?? null,
        status: String(ingestion.status ?? ""),
        summary: (ingestion.summary as string | null | undefined) ?? null,
        tags: Array.isArray(ingestion.tags) ? ingestion.tags.map((entry) => String(entry)) : [],
        extracted: (ingestion.extracted as Record<string, unknown> | null | undefined) ?? {},
        trace: Array.isArray(ingestion.trace) ? ingestion.trace as Array<{ timestamp: string; step: string; details?: unknown }> : [],
    };
    const engineOptions = {
        provider: opts.provider,
        model: opts.model,
        userId,
        supabase,
    };

    const suggestion = await PolicyEngine.suggestPolicyRefinement(targetPolicy, evidence, engineOptions);

    if (!suggestion.policy) {
        throw new Error(suggestion.error || "Unable to generate policy refinement suggestion.");
    }

    const { policy, rationale } = suggestion;
    return { policy, rationale };
}
|
|
1021
|
+
|
|
1022
|
+
/**
 * List ingestions for a user, newest first.
 *
 * Supports server-side pagination and case-insensitive substring search (ILIKE)
 * across the native text columns `filename`, `policy_name` and `summary`. Tags
 * are handled client-side via the filter bar; extracted JSONB search requires a
 * tsvector migration (deferred).
 *
 * @param supabase Supabase client used for the query.
 * @param userId Only rows owned by this user are returned.
 * @param opts.page 1-based page number (default 1; values below 1 are treated as 1).
 * @param opts.pageSize Rows per page (default 20; values below 1 are treated as 1).
 * @param opts.query Optional search text; blank strings are ignored.
 * @returns The requested page of ingestions and the exact total match count.
 * @throws Error when the query fails.
 */
static async list(
    supabase: SupabaseClient,
    userId: string,
    opts: { page?: number; pageSize?: number; query?: string } = {}
): Promise<{ ingestions: Ingestion[]; total: number }> {
    const { page = 1, pageSize = 20, query } = opts;

    // Guard against 0 / negative / fractional / NaN input, which would otherwise
    // produce an invalid (negative or inverted) range.
    const safePage = Number.isFinite(page) ? Math.max(1, Math.floor(page)) : 1;
    const safePageSize = Number.isFinite(pageSize) ? Math.max(1, Math.floor(pageSize)) : 20;
    const from = (safePage - 1) * safePageSize;
    const to = from + safePageSize - 1;

    let q = supabase
        .from("ingestions")
        .select("*", { count: "exact" })
        .eq("user_id", userId)
        .order("created_at", { ascending: false });

    const needle = query?.trim();
    if (needle) {
        // The term is user input embedded in a PostgREST filter string. Characters
        // such as `,` `(` `)` `.` `:` are reserved there, so the value is wrapped in
        // double quotes, with `\` and `"` backslash-escaped inside the quotes.
        const quoted = needle.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
        const term = `"%${quoted}%"`;
        // PostgREST .or() only supports native column types — no ::cast expressions.
        // Searching filename, policy_name, and summary covers the most practical cases.
        q = q.or(
            `filename.ilike.${term},` +
            `policy_name.ilike.${term},` +
            `summary.ilike.${term}`
        );
    }

    q = q.range(from, to);

    const { data, error, count } = await q;
    if (error) throw new Error(`Failed to list ingestions: ${error.message}`);
    return { ingestions: (data ?? []) as Ingestion[], total: count ?? 0 };
}
|
|
1060
|
+
|
|
1061
|
+
/**
 * Fetch one ingestion by id, restricted to rows owned by `userId`.
 *
 * The query's error is not inspected; whatever `data` the query yields is
 * returned as-is.
 *
 * @returns The ingestion, or `null` when the query produced no data.
 */
static async get(id: string, supabase: SupabaseClient, userId: string): Promise<Ingestion | null> {
    const response = await supabase
        .from("ingestions")
        .select("*")
        .eq("id", id)
        .eq("user_id", userId)
        .single();

    return response.data as Ingestion | null;
}
|
|
1073
|
+
|
|
1074
|
+
/**
 * Remove an ingestion row, restricted to rows owned by `userId`.
 *
 * @returns `true` when at least one row was deleted, `false` otherwise.
 * @throws Error when the delete request fails.
 */
static async delete(id: string, supabase: SupabaseClient, userId: string): Promise<boolean> {
    // Ask for an exact count so a no-op delete can be told apart from a real one.
    const outcome = await supabase
        .from("ingestions")
        .delete({ count: "exact" })
        .eq("id", id)
        .eq("user_id", userId);

    if (outcome.error) {
        throw new Error(`Failed to delete ingestion: ${outcome.error.message}`);
    }

    const removedRows = outcome.count ?? 0;
    return removedRows > 0;
}
|
|
1087
|
+
|
|
1088
|
+
/**
 * Generate (or return cached) a 2-3 sentence prose summary for an ingestion.
 *
 * The prompt is built from the already-extracted entities — no file I/O needed.
 * A non-empty result is written back to `ingestions.summary`, so later calls
 * return the stored value without calling the LLM.
 *
 * @param id Ingestion id.
 * @param supabase Supabase client used for the lookup and the cache write.
 * @param userId Owner of the ingestion; the lookup is filtered by it.
 * @param llmSettings Optional provider/model preference passed to
 *        `SDKService.resolveChatProvider`.
 * @returns The summary, or `null` when the ingestion is still pending/processing,
 *          the SDK is unavailable, there are no usable extracted fields, the LLM
 *          returned blank text, or generation failed.
 * @throws Error when the ingestion cannot be found.
 */
static async summarize(
    id: string,
    supabase: SupabaseClient,
    userId: string,
    llmSettings: { llm_provider?: string; llm_model?: string } = {}
): Promise<string | null> {
    const { data: ing } = await supabase
        .from("ingestions")
        .select("id, filename, extracted, summary, status")
        .eq("id", id)
        .eq("user_id", userId)
        .single();

    if (!ing) throw new Error("Ingestion not found");

    // Return cached summary if available
    if (ing.summary) return ing.summary as string;

    // Cannot summarise documents that haven't been processed yet
    if (ing.status === "pending" || ing.status === "processing") return null;

    const sdk = SDKService.getSDK();
    if (!sdk) {
        logger.warn("SDK unavailable — skipping summary generation");
        return null;
    }

    // Render a field value for the prompt. Plain objects are serialised as JSON
    // because String() would turn them into the useless "[object Object]".
    const renderValue = (value: unknown): string => {
        if (value == null) return "";
        if (Array.isArray(value)) return value.map(renderValue).join(", ");
        if (typeof value === "object") {
            try {
                return JSON.stringify(value);
            } catch {
                // e.g. circular structures — fall back to the default conversion.
                return String(value);
            }
        }
        return String(value);
    };

    const extracted: Record<string, unknown> = ing.extracted ?? {};
    const entityLines = Object.entries(extracted)
        .map(([key, value]) => [key, renderValue(value)] as const)
        .filter(([, text]) => text.trim() !== "")
        .map(([key, text]) => `${key}: ${text}`);

    if (entityLines.length === 0) return null;

    const { provider, model } = await SDKService.resolveChatProvider(llmSettings);

    const userPrompt =
        `Summarize this document:\nFilename: ${ing.filename}\n` +
        entityLines.join("\n");

    try {
        Actuator.logEvent(id, userId, "analysis", "Summary Generation", {
            action: "LLM request (summary generation)",
            provider,
            model,
            extracted_fields_count: entityLines.length,
            filename: ing.filename,
        }, supabase);
        const result = await sdk.llm.chat(
            [
                {
                    role: "system",
                    content:
                        "You are a document assistant. Write a concise 2-3 sentence prose summary of a document " +
                        "based on its extracted metadata. Be specific — name the issuer, amount, date, and purpose " +
                        "where available. Plain prose only, no bullet points or markdown formatting."
                },
                { role: "user", content: userPrompt }
            ],
            { provider, model }
        );

        const summary: string = extractLlmResponse(result);
        Actuator.logEvent(id, userId, "analysis", "Summary Generation", {
            action: "LLM response (summary generation)",
            provider,
            model,
            raw_length: summary.length,
            raw_preview: previewLlmText(summary),
        }, supabase);

        if (!summary.trim()) return null;

        // Cache the result. A failed write is not fatal — the caller still gets the
        // summary — but it must not be reported as cached.
        const { error: cacheError } = await supabase
            .from("ingestions")
            .update({ summary })
            .eq("id", id)
            .eq("user_id", userId);

        if (cacheError) {
            logger.warn(`Summary generated for ingestion ${id} but could not be cached`, { cacheError });
        } else {
            logger.info(`Summary generated and cached for ingestion ${id}`);
        }
        return summary;
    } catch (err) {
        logger.error("Summary generation failed", { err });
        const msg = err instanceof Error ? err.message : String(err);
        Actuator.logEvent(id, userId, "error", "Summary Generation", {
            action: "LLM summary generation failed",
            provider,
            model,
            error: msg,
        }, supabase);
        return null;
    }
}
|
|
1187
|
+
}
|