@geotechcli/core 0.4.21 → 0.4.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/brain.d.ts +1 -5
- package/dist/agents/brain.d.ts.map +1 -1
- package/dist/agents/brain.js +4 -120
- package/dist/agents/brain.js.map +1 -1
- package/dist/agents/data-tools.js +759 -0
- package/dist/agents/data-tools.js.map +1 -1
- package/dist/agents/runtime-bootstrap.d.ts +6 -0
- package/dist/agents/runtime-bootstrap.d.ts.map +1 -0
- package/dist/agents/runtime-bootstrap.js +8 -0
- package/dist/agents/runtime-bootstrap.js.map +1 -0
- package/dist/agents/runtime-fallbacks.d.ts +7 -0
- package/dist/agents/runtime-fallbacks.d.ts.map +1 -0
- package/dist/agents/runtime-fallbacks.js +87 -0
- package/dist/agents/runtime-fallbacks.js.map +1 -0
- package/dist/agents/swarm.d.ts +1 -4
- package/dist/agents/swarm.d.ts.map +1 -1
- package/dist/agents/swarm.js +74 -8
- package/dist/agents/swarm.js.map +1 -1
- package/dist/agents/tool-runtime.d.ts +7 -0
- package/dist/agents/tool-runtime.d.ts.map +1 -0
- package/dist/agents/tool-runtime.js +9 -0
- package/dist/agents/tool-runtime.js.map +1 -0
- package/dist/config/index.d.ts +4 -4
- package/dist/config/index.js +1 -1
- package/dist/config/index.js.map +1 -1
- package/dist/geo/coordinates.d.ts +40 -0
- package/dist/geo/coordinates.d.ts.map +1 -0
- package/dist/geo/coordinates.js +461 -0
- package/dist/geo/coordinates.js.map +1 -0
- package/dist/geo/index.d.ts +1 -0
- package/dist/geo/index.d.ts.map +1 -1
- package/dist/geo/index.js +1 -0
- package/dist/geo/index.js.map +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/dist/ingest/ags.d.ts +3 -0
- package/dist/ingest/ags.d.ts.map +1 -1
- package/dist/ingest/ags.js +98 -9
- package/dist/ingest/ags.js.map +1 -1
- package/dist/ingest/cpt.d.ts +4 -0
- package/dist/ingest/cpt.d.ts.map +1 -1
- package/dist/ingest/cpt.js +87 -25
- package/dist/ingest/cpt.js.map +1 -1
- package/dist/ingest/document-inputs.d.ts +37 -0
- package/dist/ingest/document-inputs.d.ts.map +1 -0
- package/dist/ingest/document-inputs.js +197 -0
- package/dist/ingest/document-inputs.js.map +1 -0
- package/dist/ingest/geotech-document.d.ts +118 -0
- package/dist/ingest/geotech-document.d.ts.map +1 -0
- package/dist/ingest/geotech-document.js +1006 -0
- package/dist/ingest/geotech-document.js.map +1 -0
- package/dist/ingest/geotech-extract.d.ts +86 -0
- package/dist/ingest/geotech-extract.d.ts.map +1 -0
- package/dist/ingest/geotech-extract.js +652 -0
- package/dist/ingest/geotech-extract.js.map +1 -0
- package/dist/ingest/geotech-schemas.d.ts +248 -0
- package/dist/ingest/geotech-schemas.d.ts.map +1 -0
- package/dist/ingest/geotech-schemas.js +150 -0
- package/dist/ingest/geotech-schemas.js.map +1 -0
- package/dist/ingest/index.d.ts +8 -0
- package/dist/ingest/index.d.ts.map +1 -1
- package/dist/ingest/index.js +8 -0
- package/dist/ingest/index.js.map +1 -1
- package/dist/ingest/ingest-job-child.d.ts +2 -0
- package/dist/ingest/ingest-job-child.d.ts.map +1 -0
- package/dist/ingest/ingest-job-child.js +45 -0
- package/dist/ingest/ingest-job-child.js.map +1 -0
- package/dist/ingest/job-store.d.ts +117 -0
- package/dist/ingest/job-store.d.ts.map +1 -0
- package/dist/ingest/job-store.js +541 -0
- package/dist/ingest/job-store.js.map +1 -0
- package/dist/ingest/job-worker.d.ts +24 -0
- package/dist/ingest/job-worker.d.ts.map +1 -0
- package/dist/ingest/job-worker.js +1129 -0
- package/dist/ingest/job-worker.js.map +1 -0
- package/dist/ingest/pdf.d.ts +102 -0
- package/dist/ingest/pdf.d.ts.map +1 -0
- package/dist/ingest/pdf.js +1544 -0
- package/dist/ingest/pdf.js.map +1 -0
- package/dist/ingest/review-store.d.ts +215 -0
- package/dist/ingest/review-store.d.ts.map +1 -0
- package/dist/ingest/review-store.js +1995 -0
- package/dist/ingest/review-store.js.map +1 -0
- package/dist/llm/capabilities.d.ts +8 -0
- package/dist/llm/capabilities.d.ts.map +1 -0
- package/dist/llm/capabilities.js +73 -0
- package/dist/llm/capabilities.js.map +1 -0
- package/dist/llm/index.d.ts +3 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/providers/anthropic.d.ts +6 -0
- package/dist/llm/providers/anthropic.d.ts.map +1 -1
- package/dist/llm/providers/anthropic.js +10 -1
- package/dist/llm/providers/anthropic.js.map +1 -1
- package/dist/llm/providers/hosted-beta.d.ts +6 -0
- package/dist/llm/providers/hosted-beta.d.ts.map +1 -1
- package/dist/llm/providers/hosted-beta.js +40 -10
- package/dist/llm/providers/hosted-beta.js.map +1 -1
- package/dist/llm/providers/huggingface.d.ts +6 -0
- package/dist/llm/providers/huggingface.d.ts.map +1 -1
- package/dist/llm/providers/huggingface.js +21 -1
- package/dist/llm/providers/huggingface.js.map +1 -1
- package/dist/llm/providers/openai-compatible.d.ts +6 -0
- package/dist/llm/providers/openai-compatible.d.ts.map +1 -1
- package/dist/llm/providers/openai-compatible.js +21 -1
- package/dist/llm/providers/openai-compatible.js.map +1 -1
- package/dist/llm/providers/zhipu.d.ts +6 -0
- package/dist/llm/providers/zhipu.d.ts.map +1 -1
- package/dist/llm/providers/zhipu.js +15 -1
- package/dist/llm/providers/zhipu.js.map +1 -1
- package/dist/llm/router.d.ts +7 -0
- package/dist/llm/router.d.ts.map +1 -1
- package/dist/llm/router.js +33 -13
- package/dist/llm/router.js.map +1 -1
- package/dist/llm/types.d.ts +22 -4
- package/dist/llm/types.d.ts.map +1 -1
- package/dist/llm/types.js.map +1 -1
- package/dist/meta/metadata.json +1 -1
- package/dist/report/html.d.ts +3 -0
- package/dist/report/html.d.ts.map +1 -0
- package/dist/report/html.js +626 -0
- package/dist/report/html.js.map +1 -0
- package/dist/report/index.d.ts +2 -0
- package/dist/report/index.d.ts.map +1 -1
- package/dist/report/index.js +2 -0
- package/dist/report/index.js.map +1 -1
- package/dist/report/ingest-dossier.d.ts +81 -0
- package/dist/report/ingest-dossier.d.ts.map +1 -0
- package/dist/report/ingest-dossier.js +324 -0
- package/dist/report/ingest-dossier.js.map +1 -0
- package/dist/storage/index.d.ts +5 -0
- package/dist/storage/index.d.ts.map +1 -1
- package/dist/storage/index.js +12 -6
- package/dist/storage/index.js.map +1 -1
- package/dist/vision/geotech-document.d.ts +46 -0
- package/dist/vision/geotech-document.d.ts.map +1 -0
- package/dist/vision/geotech-document.js +576 -0
- package/dist/vision/geotech-document.js.map +1 -0
- package/dist/vision/index.d.ts +31 -0
- package/dist/vision/index.d.ts.map +1 -1
- package/dist/vision/index.js +659 -27
- package/dist/vision/index.js.map +1 -1
- package/dist/vision/ocr.d.ts +29 -0
- package/dist/vision/ocr.d.ts.map +1 -0
- package/dist/vision/ocr.js +287 -0
- package/dist/vision/ocr.js.map +1 -0
- package/dist/vision/preprocess.d.ts +26 -0
- package/dist/vision/preprocess.d.ts.map +1 -0
- package/dist/vision/preprocess.js +194 -0
- package/dist/vision/preprocess.js.map +1 -0
- package/package.json +5 -1
|
@@ -0,0 +1,1129 @@
|
|
|
1
|
+
import { basename } from 'node:path';
|
|
2
|
+
import { buildLLMConfig } from '../config/index.js';
|
|
3
|
+
import { readDocumentPdfPageInputs } from './document-inputs.js';
|
|
4
|
+
import { inspectPdfDocument } from './pdf.js';
|
|
5
|
+
import { ingestBoreholeLogDocument, summarizeBoreholeIngestInspection, } from './geotech-extract.js';
|
|
6
|
+
import { buildPreflightLowYieldInsight, ingestGeotechDocument, inferPreflightLowYieldPageRole, } from './geotech-document.js';
|
|
7
|
+
import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
|
|
8
|
+
import { interpretBoreholeLogWithContext, transcribeDocumentImageText, } from '../vision/index.js';
|
|
9
|
+
import { recoverDocumentTextHint } from '../vision/ocr.js';
|
|
10
|
+
import { persistBoreholeIngestReview } from './review-store.js';
|
|
11
|
+
import { loadPersistedIngestJob, savePersistedIngestJob, } from './job-store.js';
|
|
12
|
+
/**
 * Case-insensitive error-message signatures consulted by
 * `isSlowVisualPageError`: timeouts, busy or unavailable providers,
 * empty completions, and the literal status codes 503 and 504.
 */
const SLOW_VISUAL_ERROR_PATTERNS = [
    // Timeouts and busy providers.
    /timeout/i,
    /provider is busy/i,
    // Responses that came back without usable assistant text.
    /returned no content/i,
    /did not contain assistant text/i,
    /no completion choices/i,
    /empty completion/i,
    // Availability failures, by phrase or by status code.
    /temporarily unavailable/i,
    /\b503\b/i,
    /\b504\b/i,
];
|
|
23
|
+
/**
 * Case-insensitive error-message signatures consulted by
 * `isFatalProviderStopError`: exhausted daily limits, quota failures,
 * rate limiting, and the literal status code 429.
 */
const FATAL_PROVIDER_STOP_PATTERNS = [
    // Daily allowance exhausted.
    /daily limit reached/i,
    /remaining today:\s*0/i,
    // Quota failures ("insufficient_quota", "insufficient quota", ...).
    /insufficient[_\s-]?quota/i,
    /quota exceeded/i,
    // Rate limiting, by phrase or by status code.
    /rate limit/i,
    /\b429\b/i,
];
|
|
31
|
+
/**
 * Formats the current instant as an ISO-8601 string.
 *
 * @param {(() => Date) | null | undefined} now Optional clock; when nullish,
 *   a fresh `new Date()` is used.
 * @returns {string} `toISOString()` of the date produced by the clock.
 */
function nowIso(now) {
    const clock = now ?? (() => new Date());
    return clock().toISOString();
}
|
|
34
|
+
/**
 * Keeps the string entries that contain non-whitespace characters and removes
 * duplicates, preserving first-seen order. Entries are not trimmed.
 *
 * @param {unknown[]} values Candidate values of any type.
 * @returns {string[]} The distinct non-blank strings.
 */
function uniqueStrings(values) {
    const isNonBlankString = (candidate) => typeof candidate === 'string' && candidate.trim().length > 0;
    const distinct = new Set(values.filter(isNonBlankString));
    return Array.from(distinct);
}
|
|
37
|
+
/**
 * Tells whether an ingest result is tagged as a borehole log.
 *
 * @param {{ documentType?: unknown }} result Ingest result to inspect.
 * @returns {boolean} True when `documentType` is exactly `'borehole-log'`.
 */
function isBoreholeResult(result) {
    const { documentType } = result;
    return documentType === 'borehole-log';
}
|
|
40
|
+
/**
 * Collapses whitespace runs in a text hint to single spaces, trims it, and
 * caps it at 1600 characters.
 *
 * @param {unknown} value Raw hint; anything that is not a string is rejected.
 * @returns {string | undefined} The normalized hint, or `undefined` when the
 *   input is not a string or is blank after normalization.
 */
function normalizeTextHint(value) {
    if (typeof value === 'string') {
        const collapsed = value.replace(/\s+/g, ' ').trim();
        if (collapsed) {
            return collapsed.slice(0, 1600);
        }
    }
    return undefined;
}
|
|
47
|
+
/**
 * Maps a page classification to a page source kind.
 *
 * @param {unknown} classification Page classification label.
 * @returns {'raster-image' | 'pdf-page'} `'raster-image'` for `'image-only'`
 *   and `'text-unreadable'`; `'pdf-page'` for everything else.
 */
function mapPageSourceKind(classification) {
    switch (classification) {
        case 'image-only':
        case 'text-unreadable':
            return 'raster-image';
        default:
            return 'pdf-page';
    }
}
|
|
50
|
+
/**
 * Maps `values` through an async `iterator` with a bounded number of
 * in-flight calls, returning results in input order.
 *
 * @param {unknown[]} values Items to process.
 * @param {number} concurrency Maximum number of concurrent iterator calls.
 *   Clamped to the range 1..values.length. A value that is not a number
 *   (undefined, NaN) falls back to 1 instead of silently starting no workers.
 * @param {(value: unknown, index: number) => Promise<unknown>} iterator
 *   Async mapper invoked once per item.
 * @returns {Promise<unknown[]>} Results positioned at their input index.
 *   Rejects with the first iterator rejection.
 */
async function mapWithConcurrency(values, concurrency, iterator) {
    if (values.length === 0) {
        return [];
    }
    // Math.min/Math.max propagate NaN, and Array.from({ length: NaN }) is
    // empty: without this guard no worker would run and every result slot
    // would stay unfilled.
    const requested = Number(concurrency);
    const safeConcurrency = Math.max(1, Math.min(Number.isNaN(requested) ? 1 : requested, values.length));
    const results = new Array(values.length);
    let cursor = 0;
    const workers = Array.from({ length: safeConcurrency }, async () => {
        // Each worker claims the next unprocessed index until none remain.
        while (cursor < values.length) {
            const index = cursor;
            cursor += 1;
            results[index] = await iterator(values[index], index);
        }
    });
    await Promise.all(workers);
    return results;
}
|
|
67
|
+
/**
 * Heuristically checks whether a page text hint contains borehole-log
 * vocabulary: the word "borehole", BH/TP/CPT identifiers followed by digits,
 * "SPT", coordinate or groundwater keywords, or a "<number> - <number> m"
 * range.
 *
 * @param {string | null | undefined} value Text hint for the page.
 * @returns {boolean} True when any signature matches; false for an empty or
 *   missing hint.
 */
function pageTextHintLooksBoreholeLike(value) {
    if (!value) {
        return false;
    }
    const boreholeSignatures = [
        /\bborehole\b/i,
        /\bBH[-\s_/]?\d+\b/i,
        /\bTP[-\s_/]?\d+\b/i,
        /\bCPT[-\s_/]?\d+\b/i,
        /\bSPT\b/i,
        /\b(?:easting|northing|latitude|longitude|groundwater)\b/i,
        /\b\d+(?:\.\d+)?\s*-\s*\d+(?:\.\d+)?\s*m\b/i,
    ];
    return boreholeSignatures.some((signature) => signature.test(value));
}
|
|
79
|
+
/**
 * Trims a detected borehole id and rejects placeholders.
 *
 * @param {unknown} value Raw borehole id.
 * @returns {string | null} The trimmed id, or `null` when the input is not a
 *   string, is blank, or equals the `'BH-unknown'` placeholder.
 */
function normalizeKnownBoreholeId(value) {
    const candidate = typeof value === 'string' ? value.trim() : '';
    const isPlaceholder = candidate === '' || candidate === 'BH-unknown';
    return isPlaceholder ? null : candidate;
}
|
|
89
|
+
/**
 * Finds the smallest finite `depthFrom`/`depthTo` value across a result's
 * layers.
 *
 * @param {{ layers: Iterable<{ depthFrom?: unknown, depthTo?: unknown }> }} result
 *   Result whose layers are scanned.
 * @returns {number | null} The minimum finite depth, or `null` when no layer
 *   carries one.
 */
function minimumLayerDepth(result) {
    let shallowest = null;
    for (const { depthFrom, depthTo } of result.layers) {
        for (const depth of [depthFrom, depthTo]) {
            // Number.isFinite rejects null, undefined, NaN, infinities and non-numbers.
            if (!Number.isFinite(depth)) {
                continue;
            }
            shallowest = shallowest === null ? depth : Math.min(shallowest, depth);
        }
    }
    return shallowest;
}
|
|
99
|
+
/**
 * Tells whether a page result carries any borehole information at all.
 *
 * @param {object} result Page extraction result; `layers` is read only when
 *   no borehole id was detected.
 * @param {string | null | undefined} detectedBoreholeId Normalized id, if any.
 * @returns {boolean} True when an id was detected, at least one layer exists,
 *   or any of the scalar borehole fields is non-nullish.
 */
function hasUsableBoreholeSignal(result, detectedBoreholeId) {
    if (detectedBoreholeId != null) {
        return true;
    }
    if (result.layers.length > 0) {
        return true;
    }
    const scalarFields = [
        'totalDepth',
        'waterTableDepth',
        'location',
        'groundElevation',
        'dateDrilled',
        'drillingMethod',
    ];
    return scalarFields.some((field) => result[field] != null);
}
|
|
109
|
+
/**
 * Decides whether a page should be skipped as not being part of a borehole
 * log: it has no usable borehole signal and its text hint does not look
 * borehole-like.
 *
 * @param {object} result Page extraction result.
 * @param {string | null} detectedBoreholeId Normalized borehole id, if any.
 * @param {string | undefined} pageTextHint Text hint for the page.
 * @returns {boolean} True when the page can be ignored.
 */
function shouldIgnoreNonLogPage(result, detectedBoreholeId, pageTextHint) {
    const carriesBoreholeEvidence = hasUsableBoreholeSignal(result, detectedBoreholeId)
        || pageTextHintLooksBoreholeLike(pageTextHint);
    return !carriesBoreholeEvidence;
}
|
|
113
|
+
/**
 * Detects a depth reset between consecutive pages that carry no borehole id:
 * the previous page continued to a depth of at least 3 while the current
 * page's shallowest layer depth is at most 0.5.
 *
 * @param {object} result Current page result; its layers are always scanned.
 * @param {{ hasCurrentGroup: unknown, priorContinuationDepth: number | null | undefined }} state
 *   Running grouping state.
 * @returns {unknown} Truthy when a new anonymous group should start. When
 *   `state.hasCurrentGroup` is falsy that value itself is returned.
 */
function shouldStartNewAnonymousGroup(result, state) {
    const currentStartDepth = minimumLayerDepth(result);
    const previousDepth = state.priorContinuationDepth;
    const previousPageRanDeep = previousDepth != null && previousDepth >= 3;
    const currentPageRestartsShallow = currentStartDepth != null && currentStartDepth <= 0.5;
    return state.hasCurrentGroup && previousPageRanDeep && currentPageRestartsShallow;
}
|
|
121
|
+
/**
 * Produces the next page-grouping state after a page result, without
 * mutating `state`.
 *
 * Pages that `shouldIgnoreNonLogPage` rejects leave the state untouched
 * (a shallow copy is returned). Otherwise the current group id is resolved
 * in this order: an explicit override wins; else a detected id wins; else a
 * first group starts with no id; else the group id is cleared when
 * `shouldStartNewAnonymousGroup` reports a depth reset; else it is kept.
 *
 * @param {object} state Previous state (`hasCurrentGroup`,
 *   `currentGroupBoreholeId`, `lastResolvedBoreholeId`, `priorContinuationDepth`).
 * @param {object} result Page extraction result.
 * @param {string | undefined} pageTextHint Text hint for the page.
 * @param {string | null | undefined} overrideBoreholeId Id that takes
 *   precedence over anything detected on the page.
 * @returns {object} The updated copy of the state.
 */
function advanceBoreholeProcessingState(state, result, pageTextHint, overrideBoreholeId) {
    const updated = { ...state };
    const detectedId = normalizeKnownBoreholeId(result.boreholeId);
    if (shouldIgnoreNonLogPage(result, detectedId, pageTextHint)) {
        return updated;
    }
    const hadGroup = Boolean(updated.hasCurrentGroup);
    updated.hasCurrentGroup = true;
    if (overrideBoreholeId) {
        updated.currentGroupBoreholeId = overrideBoreholeId;
    }
    else if (detectedId || !hadGroup) {
        // A detected id always becomes the group id; a first page without an
        // id opens a group whose id is null.
        updated.currentGroupBoreholeId = detectedId;
    }
    else if (shouldStartNewAnonymousGroup(result, updated)) {
        updated.currentGroupBoreholeId = null;
    }
    updated.lastResolvedBoreholeId = overrideBoreholeId ?? updated.currentGroupBoreholeId ?? undefined;
    updated.priorContinuationDepth = result.continuationDepth ?? null;
    return updated;
}
|
|
154
|
+
/**
 * Tells whether an error message describes a slow or unavailable provider on
 * a visually processed page.
 *
 * @param {string} message Error message to classify.
 * @param {string | undefined} classification Page classification.
 * @param {string | undefined} sourceKind Page source kind.
 * @returns {boolean} True only when the page is visual (`'raster-image'`
 *   source, or `'image-only'` / `'text-unreadable'` classification) and the
 *   message matches one of `SLOW_VISUAL_ERROR_PATTERNS`.
 */
function isSlowVisualPageError(message, classification, sourceKind) {
    const visualClassifications = ['image-only', 'text-unreadable'];
    const isVisualPage = sourceKind === 'raster-image' || visualClassifications.includes(classification);
    if (!isVisualPage) {
        return false;
    }
    return SLOW_VISUAL_ERROR_PATTERNS.some((signature) => signature.test(message));
}
|
|
158
|
+
/**
 * Tells whether an error message matches any of
 * `FATAL_PROVIDER_STOP_PATTERNS` (quota, daily-limit or rate-limit failures).
 *
 * @param {string} message Error message to classify.
 * @returns {boolean} True on the first matching pattern, false otherwise.
 */
function isFatalProviderStopError(message) {
    for (const signature of FATAL_PROVIDER_STOP_PATTERNS) {
        if (signature.test(message)) {
            return true;
        }
    }
    return false;
}
|
|
161
|
+
/**
 * Strips up to four leading "Page <n>:" prefixes (case-insensitive) from an
 * error message, trimming surrounding whitespace after each pass.
 *
 * @param {string} message Error message, possibly prefixed repeatedly.
 * @returns {string} The message without the stripped prefixes.
 */
function normalizeCheckpointErrorMessage(message) {
    const pagePrefix = /^Page \d+:\s*/i;
    let current = message.trim();
    let remainingPasses = 4;
    while (remainingPasses > 0) {
        const stripped = current.replace(pagePrefix, '').trim();
        if (stripped === current) {
            // Nothing left to strip.
            return current;
        }
        current = stripped;
        remainingPasses -= 1;
    }
    return current;
}
|
|
172
|
+
/**
 * Races a promise against a deadline.
 *
 * @param {Promise<unknown>} promise Work to wait for.
 * @param {number} timeoutMs Budget in milliseconds. A non-finite or
 *   non-positive value disables the deadline and the promise is awaited as-is.
 * @param {string} errorMessage Message of the Error used to reject when the
 *   deadline elapses first.
 * @returns {Promise<unknown>} The settled value of `promise`.
 * @throws {Error} With `errorMessage` when the deadline wins; otherwise
 *   whatever `promise` rejects with.
 */
async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
    if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
        return promise;
    }
    let timer;
    const deadline = new Promise((_, reject) => {
        timer = setTimeout(() => {
            reject(new Error(errorMessage));
        }, timeoutMs);
        // Do not let a pending deadline keep the process alive on its own.
        timer.unref?.();
    });
    try {
        return await Promise.race([promise, deadline]);
    }
    finally {
        // Release the timer as soon as the race settles; otherwise every
        // processed page leaves a timer pending for the whole budget.
        clearTimeout(timer);
    }
}
|
|
186
|
+
/**
 * Resolves the per-phase timeout budget for a page.
 *
 * @param {{ timeout?: number | null }} config LLM configuration; `timeout`
 *   defaults to 120000 when nullish or NaN.
 * @param {{ sourceKind?: string, classification?: string }} input Page traits.
 * @returns {number} 180000 for visually heavy pages (`'raster-image'` source,
 *   or `'image-only'` / `'text-unreadable'` classification); otherwise the
 *   configured timeout clamped to the range 60000..120000.
 */
function resolveWorkerPhaseTimeoutMs(config, input) {
    const requestedTimeoutMs = Number(config.timeout ?? 120000);
    // Math.max/Math.min propagate NaN, and withWorkerPageTimeout treats a
    // non-finite budget as "no deadline"; fall back to the default instead.
    const configuredTimeoutMs = Number.isNaN(requestedTimeoutMs) ? 120000 : requestedTimeoutMs;
    const baseTimeoutMs = Math.min(Math.max(configuredTimeoutMs, 60000), 120000);
    const isHeavyVisualPage = input.sourceKind === 'raster-image'
        || input.classification === 'image-only'
        || input.classification === 'text-unreadable';
    // The base never exceeds 120000, so the heavy-page budget is a fixed 180000.
    return isHeavyVisualPage ? 180000 : baseTimeoutMs;
}
|
|
195
|
+
/**
 * Widens the text-extraction timeout for long text hints.
 *
 * @param {number} baseTimeoutMs Budget used when the hint is short or absent.
 * @param {string | undefined} textHint Text hint whose length drives the tier.
 * @returns {number} For hints of 1800+ characters, the base raised to at
 *   least 150000 and capped at 180000; for 1000+ characters, the base raised
 *   to at least 120000; otherwise the base unchanged.
 */
function resolveWorkerTextExtractionTimeoutMs(baseTimeoutMs, textHint) {
    const hintLength = textHint ? textHint.length : 0;
    if (hintLength >= 1800) {
        const raised = Math.max(baseTimeoutMs, 150000);
        return Math.min(raised, 180000);
    }
    if (hintLength >= 1000) {
        return Math.max(baseTimeoutMs, 120000);
    }
    return baseTimeoutMs;
}
|
|
207
|
+
/**
 * Reads the per-page inputs of a PDF and projects each page onto the fields
 * the worker consumes.
 *
 * @param {string} filePath Path of the PDF.
 * @param {object} inspection Inspection forwarded to the page reader.
 * @param {number} concurrency Concurrency passed to `mapWithConcurrency`.
 * @param {object} [dependencies] Optional overrides:
 *   `readDocumentPdfPageInputs` replaces the page reader and
 *   `extractPrimaryPdfPageImages` is forwarded to it as `extractPageImages`.
 * @returns {Promise<object[]>} One entry per page with `base64`, `mimeType`,
 *   `fileBytes`, `pageNumber`, `totalPages` and `sourceKind` (defaulting to
 *   `'pdf-page'`).
 */
async function preparePdfPageInputs(filePath, inspection, concurrency, dependencies = {}) {
    const { readDocumentPdfPageInputs: readerOverride, extractPrimaryPdfPageImages } = dependencies;
    const loadPageInputs = readerOverride ?? readDocumentPdfPageInputs;
    const readerOptions = {
        inspection,
        dependencies: {
            extractPageImages: extractPrimaryPdfPageImages,
        },
    };
    const rawPages = await loadPageInputs(filePath, readerOptions);
    const toWorkerPageInput = async ({ base64, mimeType, fileBytes, pageNumber, totalPages, sourceKind }) => ({
        base64,
        mimeType,
        fileBytes,
        pageNumber,
        totalPages,
        sourceKind: sourceKind ?? 'pdf-page',
    });
    return mapWithConcurrency(rawPages, concurrency, toWorkerPageInput);
}
|
|
224
|
+
/**
 * Aggregates page-level inspection data into summary counters.
 *
 * @param {{ pages: Iterable<object> } | null | undefined} inspection PDF
 *   inspection; each page exposes `classification`,
 *   `capabilities.nativeTextExtraction` and `degradation.level`.
 * @returns {object | null} `null` when there is no inspection; otherwise the
 *   per-classification counts plus the number of image-heavy pages
 *   (`'image-only'` / `'text-unreadable'`), pages whose native text
 *   extraction is not `'unavailable'`, and pages whose degradation level is
 *   not `'none'`. `ocrRecoveredPageCount` is always 0 here.
 */
function buildInspectionSummary(inspection) {
    if (!inspection) {
        return null;
    }
    const summary = {
        pageClassificationCounts: {},
        imageHeavyPageCount: 0,
        nativeTextPageCount: 0,
        degradedPageCount: 0,
        ocrRecoveredPageCount: 0,
    };
    const tally = summary.pageClassificationCounts;
    for (const { classification, capabilities, degradation } of inspection.pages) {
        tally[classification] = (tally[classification] ?? 0) + 1;
        const isImageHeavy = classification === 'image-only' || classification === 'text-unreadable';
        summary.imageHeavyPageCount += isImageHeavy ? 1 : 0;
        summary.nativeTextPageCount += capabilities.nativeTextExtraction !== 'unavailable' ? 1 : 0;
        summary.degradedPageCount += degradation.level !== 'none' ? 1 : 0;
    }
    return summary;
}
|
|
252
|
+
/**
 * Collects the distinct messages of every finding whose severity is not
 * `'advisory'`.
 *
 * @param {Array<{ severity: string, message: string }>} findings Review findings.
 * @returns {string[]} De-duplicated messages, as filtered by `uniqueStrings`.
 */
function summarizeReviewReasons(findings) {
    const actionableFindings = findings.filter(({ severity }) => severity !== 'advisory');
    const messages = actionableFindings.map(({ message }) => message);
    return uniqueStrings(messages);
}
|
|
257
|
+
/**
 * Derives the failure bookkeeping shared by the synthetic result builders
 * from a job's page checkpoints.
 *
 * @param {object} job Persisted ingest job; only `checkpoints.pages` is read.
 * @returns {{ pageFailures: string[], reviewFindings: object[] }} One failure
 *   message and one page-scoped finding per failed page, followed by
 *   document-scoped findings summarizing downgraded and hard failures.
 */
function collectSyntheticFailureFindings(job) {
    const failedPages = job.checkpoints.pages.filter((page) => page.status === 'failed');
    const pageFailures = failedPages.map((page) => page.error ?? `Page ${page.pageNumber} failed during async ingest.`);
    const reviewFindings = failedPages.map((page) => ({
        code: page.downgraded ? 'page_visual_ingest_downgraded' : 'page_ingest_failed',
        severity: page.downgraded ? 'review' : 'blocking',
        scope: 'page',
        message: page.downgraded
            ? `Page ${page.pageNumber} exceeded the slow visual budget and was downgraded to manual review.`
            : page.error ?? `Page ${page.pageNumber} failed during ingest.`,
        pageNumber: page.pageNumber,
    }));
    const downgradedCount = failedPages.filter((page) => page.downgraded).length;
    const hardFailureCount = failedPages.length - downgradedCount;
    if (downgradedCount > 0) {
        reviewFindings.push({
            code: 'slow_visual_pages_present',
            severity: 'review',
            scope: 'document',
            message: `${downgradedCount} slow visual page(s) were downgraded to manual review.`,
        });
    }
    if (hardFailureCount > 0) {
        reviewFindings.push({
            code: 'page_failures_present',
            severity: 'blocking',
            scope: 'document',
            message: `${hardFailureCount} page(s) failed during ingest and should be reviewed.`,
        });
    }
    return { pageFailures, reviewFindings };
}
/**
 * Builds the `source` section shared by the synthetic results. Synthetic
 * results always report zero successful pages.
 *
 * @param {object} job Persisted ingest job; `source.filePath` and
 *   `source.totalPages` are read.
 * @param {number} failedPageCount Number of failed pages.
 * @returns {object} The source descriptor.
 */
function buildSyntheticSourceSummary(job, failedPageCount) {
    return {
        filePath: job.source.filePath,
        fileName: basename(job.source.filePath),
        inputKind: 'pdf',
        totalPages: job.source.totalPages,
        successfulPages: 0,
        failedPages: failedPageCount,
    };
}
/**
 * Builds a placeholder borehole-log ingest result from a job's checkpoints.
 * It carries no boreholes, has zero confidence and never allows automatic
 * processing; failed pages are surfaced as failures and review findings.
 *
 * @param {object} job Persisted ingest job (`source`, `checkpoints.pages`).
 * @param {object} inspection PDF inspection, embedded as-is and summarized
 *   with `summarizeBoreholeIngestInspection`.
 * @param {(() => Date) | undefined} now Optional clock for `generatedAt`.
 * @returns {object} A `'geotech-ingest-result'` of document type `'borehole-log'`.
 */
function buildSyntheticBoreholeResult(job, inspection, now) {
    const { pageFailures, reviewFindings } = collectSyntheticFailureFindings(job);
    return {
        kind: 'geotech-ingest-result',
        schemaVersion: 1,
        documentType: 'borehole-log',
        generatedAt: nowIso(now),
        source: buildSyntheticSourceSummary(job, pageFailures.length),
        inspection,
        inspectionSummary: summarizeBoreholeIngestInspection(inspection),
        boreholes: [],
        pageAudits: job.checkpoints.pages.map((page) => ({
            pageNumber: page.pageNumber,
            detectedBoreholeId: null,
            assignedGroup: 'unassigned',
            classification: page.classification,
            textHintSource: page.ocrSource ?? 'none',
            // A completed checkpoint still yields no data here, hence 'partial'.
            parseStatus: page.status === 'completed' ? 'partial' : 'failed',
            confidence: 0,
            continuationDepth: null,
            warnings: page.error ? [page.error] : [],
        })),
        pageFailures,
        warnings: uniqueStrings(pageFailures),
        reviewFindings,
        reviewReasons: summarizeReviewReasons(reviewFindings),
        reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
        confidence: 0,
        canAutoProceed: false,
    };
}
/**
 * Builds a placeholder geotech-document ingest result from a job's
 * checkpoints. It carries no extracted facts, is marked `'failed'`, has zero
 * confidence and never allows automatic processing.
 *
 * @param {object} job Persisted ingest job (`source`, `checkpoints.pages`).
 * @param {object | null} inspection PDF inspection, embedded as-is and
 *   summarized with `buildInspectionSummary`.
 * @param {(() => Date) | undefined} now Optional clock for `generatedAt`.
 * @returns {object} A `'geotech-ingest-result'` of document type
 *   `'geotech-document'`.
 */
function buildSyntheticGeotechDocumentResult(job, inspection, now) {
    const { pageFailures, reviewFindings } = collectSyntheticFailureFindings(job);
    return {
        kind: 'geotech-ingest-result',
        schemaVersion: 1,
        documentType: 'geotech-document',
        generatedAt: nowIso(now),
        source: buildSyntheticSourceSummary(job, pageFailures.length),
        inspection,
        inspectionSummary: buildInspectionSummary(inspection),
        documentClass: null,
        title: null,
        summary: null,
        materials: [],
        classifications: [],
        parameters: [],
        risks: [],
        recommendations: [],
        pageAudits: job.checkpoints.pages.map((page) => ({
            pageNumber: page.pageNumber,
            classification: page.classification,
            textHintSource: page.ocrSource ?? 'none',
            // A completed checkpoint still yields no data here, hence 'partial'.
            parseStatus: page.status === 'completed' ? 'partial' : 'failed',
            confidence: 0,
            materialCount: 0,
            classificationCount: 0,
            parameterCount: 0,
            warnings: page.error ? [page.error] : [],
        })),
        pageFailures,
        warnings: uniqueStrings(pageFailures),
        reviewFindings,
        reviewReasons: summarizeReviewReasons(reviewFindings),
        parseStatus: 'failed',
        confidence: 0,
        reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
        canAutoProceed: false,
    };
}
|
|
407
|
+
/**
 * Removes duplicate review findings. Two findings are duplicates when their
 * code, severity, scope, message, page number, borehole id and material
 * description all match. The first occurrence fixes the position in the
 * output; the last occurrence supplies the retained object.
 *
 * @param {object[]} reviewFindings Findings, possibly with repeats.
 * @returns {object[]} The distinct findings.
 */
function dedupeReviewFindings(reviewFindings) {
    const findingsByIdentity = new Map();
    reviewFindings.forEach((finding) => {
        const identityParts = [
            finding.code,
            finding.severity,
            finding.scope,
            finding.message,
            finding.pageNumber ?? '',
            'boreholeId' in finding ? finding.boreholeId ?? '' : '',
            'materialDescription' in finding ? finding.materialDescription ?? '' : '',
        ];
        findingsByIdentity.set(identityParts.join('|'), finding);
    });
    return Array.from(findingsByIdentity.values());
}
|
|
423
|
+
/**
 * Shared implementation of the failure-downgrade pass. Rewrites review
 * findings for pages whose failed checkpoint is flagged `downgraded`:
 * their `'page_ingest_failed'` finding becomes a review-level
 * `'page_visual_ingest_downgraded'`, and when no hard failure remains the
 * document-level `'page_failures_present'` finding becomes
 * `'slow_visual_pages_present'`.
 *
 * @param {object} result Ingest result carrying `reviewFindings`.
 * @param {object[]} checkpoints Page checkpoints (`status`, `downgraded`,
 *   `pageNumber`).
 * @returns {object} `result` itself when no page was downgraded; otherwise a
 *   copy with de-duplicated findings, recomputed review reasons and
 *   `canAutoProceed` forced to false.
 */
function applySlowVisualFailureDowngrades(result, checkpoints) {
    const failedPages = checkpoints.filter((page) => page.status === 'failed');
    const downgradedPages = failedPages.filter((page) => page.downgraded);
    const downgradedPageNumbers = new Set(downgradedPages.map((page) => page.pageNumber));
    if (downgradedPageNumbers.size === 0) {
        return result;
    }
    const downgradedFailureCount = downgradedPages.length;
    const nonDowngradedFailureCount = failedPages.length - downgradedFailureCount;
    const reviewFindings = result.reviewFindings.map((finding) => {
        if (finding.code === 'page_ingest_failed'
            && typeof finding.pageNumber === 'number'
            && downgradedPageNumbers.has(finding.pageNumber)) {
            return {
                ...finding,
                code: 'page_visual_ingest_downgraded',
                severity: 'review',
                message: `Page ${finding.pageNumber} exceeded the slow visual budget and was downgraded to manual review.`,
            };
        }
        if (finding.code === 'page_failures_present' && nonDowngradedFailureCount === 0) {
            return {
                ...finding,
                code: 'slow_visual_pages_present',
                severity: 'review',
                message: `${downgradedFailureCount} slow visual page(s) were downgraded to manual review.`,
            };
        }
        return finding;
    });
    // Rewriting can make previously distinct findings identical.
    const nextReviewFindings = dedupeReviewFindings(reviewFindings);
    const reviewReasons = summarizeReviewReasons(nextReviewFindings);
    return {
        ...result,
        reviewFindings: nextReviewFindings,
        reviewReasons,
        reviewRequired: reviewReasons.length > 0,
        canAutoProceed: false,
    };
}
/**
 * Applies the slow-visual failure downgrade pass to a borehole ingest result.
 *
 * @param {object} result Borehole ingest result carrying `reviewFindings`.
 * @param {object[]} checkpoints Page checkpoints of the job.
 * @returns {object} `result` unchanged when no failed page is downgraded,
 *   otherwise an updated copy.
 */
function applyBoreholeFailureDowngrades(result, checkpoints) {
    return applySlowVisualFailureDowngrades(result, checkpoints);
}
/**
 * Applies the slow-visual failure downgrade pass to a geotech-document
 * ingest result.
 *
 * @param {object} result Geotech-document ingest result carrying `reviewFindings`.
 * @param {object[]} checkpoints Page checkpoints of the job.
 * @returns {object} `result` unchanged when no failed page is downgraded,
 *   otherwise an updated copy.
 */
function applyGeotechFailureDowngrades(result, checkpoints) {
    return applySlowVisualFailureDowngrades(result, checkpoints);
}
|
|
505
|
+
/**
 * Builds the LLM configuration for a job by layering the job's persisted
 * settings over the runtime configuration.
 *
 * @param {{ config: object }} job Persisted ingest job; `config.provider`,
 *   `baseUrl`, `modelId`, `visionModelId` and `timeout` are read.
 * @param {{ buildLLMConfig?: () => object }} [dependencies] Optional override
 *   for the runtime configuration factory.
 * @returns {object} The runtime configuration with every job-level field
 *   that is non-nullish taking precedence.
 */
function buildJobConfig(job, dependencies = {}) {
    const buildConfig = dependencies.buildLLMConfig ?? buildLLMConfig;
    const runtimeConfig = buildConfig();
    return {
        ...runtimeConfig,
        // Fall back like the sibling fields so a job persisted without a
        // provider does not overwrite the runtime provider with undefined.
        provider: job.config.provider ?? runtimeConfig.provider,
        baseUrl: job.config.baseUrl ?? runtimeConfig.baseUrl,
        modelId: job.config.modelId ?? runtimeConfig.modelId,
        visionModelId: job.config.visionModelId ?? runtimeConfig.visionModelId,
        timeout: job.config.timeout ?? runtimeConfig.timeout,
    };
}
|
|
517
|
+
/**
 * Looks up the checkpoint recorded for a page of a persisted ingest job.
 *
 * @param {{ jobId: string, checkpoints: { pages: object[] } }} job Persisted job.
 * @param {number} pageNumber Page number to match with strict equality.
 * @returns {object} The first checkpoint whose `pageNumber` matches.
 * @throws {Error} When the job has no checkpoint for that page.
 */
function findCheckpoint(job, pageNumber) {
    const { pages } = job.checkpoints;
    const position = pages.findIndex(({ pageNumber: candidate }) => candidate === pageNumber);
    if (position === -1) {
        throw new Error(`Persisted ingest job "${job.jobId}" is missing checkpoint metadata for page ${pageNumber}.`);
    }
    return pages[position];
}
|
|
524
|
+
/**
 * Processes one page of a geotech-document ingest job.
 *
 * Flow:
 *  1. Resolve the page's inspection record (from `job.inspection`, else by inspecting the PDF).
 *  2. If the page is classified as low-yield by `inferPreflightLowYieldPageRole`, return a
 *     preflight summary and skip OCR and model extraction entirely.
 *  3. Otherwise attempt OCR/text recovery under the phase timeout. A failure here is
 *     recorded as a warning and does not fail the page.
 *  4. With a text hint, run text-fact extraction; without one, run visual page interpretation.
 *
 * Every collaborator can be replaced through `dependencies`; the module-level implementation
 * is used when the corresponding key is null or undefined.
 *
 * @param {object} job - Persisted ingest job (reads `inspection` and `source.filePath`).
 * @param {object} pageInput - Prepared page (`pageNumber`, `totalPages`, `sourceKind`, `base64`, `mimeType`).
 * @param {object} config - LLM configuration for the job.
 * @param {object} dependencies - Injection points for the collaborators listed above.
 * @returns {Promise<{result: *, ocrTextHint: (string|undefined), ocrSource: *, ocrWarnings: *}>}
 */
async function processGeotechDocumentPage(job, pageInput, config, dependencies) {
    const inspect = dependencies.inspectPdfDocument ?? inspectPdfDocument;
    const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
    const interpretation = dependencies.interpretGeotechDocumentPage ?? interpretGeotechDocumentPage;
    const extractTextFacts = dependencies.extractGeotechDocumentFactsFromText ?? extractGeotechDocumentFactsFromText;
    const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;

    const { pageNumber, totalPages, sourceKind } = pageInput;
    const pageIndex = pageNumber - 1;
    const toSeconds = (milliseconds) => Math.round(milliseconds / 1000);

    // Prefer the inspection persisted with the job; only re-inspect the PDF when it has no entry for this page.
    const inspectionPage = job.inspection?.pages[pageIndex] ?? inspect(job.source.filePath).pages[pageIndex];

    const lowYieldRole = inferPreflightLowYieldPageRole({
        inspectionPage,
        previousInspectionPage: job.inspection?.pages[pageIndex - 1],
        pageNumber,
        totalPages,
        sourceKind,
    });
    if (lowYieldRole && inspectionPage) {
        const skipNotice = lowYieldRole === 'administrative'
            ? 'Administrative/cover page was summarized without a full multimodal extraction call.'
            : 'Figure/appendix page was summarized without a full multimodal extraction call.';
        return {
            result: buildPreflightLowYieldInsight({
                role: lowYieldRole,
                inspectionPage,
                pageNumber,
                totalPages,
            }),
            ocrTextHint: undefined,
            ocrSource: 'none',
            ocrWarnings: [skipNotice],
        };
    }

    const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
        classification: inspectionPage?.classification,
        sourceKind,
    });
    const phaseConfig = { ...config, timeout: phaseTimeoutMs };

    let pageTextHint;
    let ocrSource = 'none';
    let ocrWarnings = [];
    try {
        const recoveryRequest = {
            existingTextHint: inspectionPage?.normalizedArtifact?.nativeText ?? inspectionPage?.normalizedText,
            existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
            imageBase64: pageInput.base64,
            mimeType: pageInput.mimeType,
            config: phaseConfig,
            pdfFilePath: job.source.filePath,
            pdfPageNumber: pageNumber,
            visionTranscribe: transcribe,
        };
        const recovery = await withWorkerPageTimeout(
            recoverTextHint(recoveryRequest),
            phaseTimeoutMs,
            `Page ${pageNumber}: OCR/text recovery timed out after ${toSeconds(phaseTimeoutMs)}s`,
        );
        pageTextHint = normalizeTextHint(recovery.textHint);
        ocrSource = recovery.source;
        ocrWarnings = recovery.warnings;
    }
    catch (error) {
        // Best effort: a failed recovery is surfaced as a warning and the page continues
        // without a text hint (which routes it to visual interpretation below).
        const reason = error instanceof Error ? error.message : String(error);
        ocrWarnings = [
            `OCR/text recovery failed (${normalizeCheckpointErrorMessage(reason)}); proceeded with direct page interpretation.`,
        ];
    }

    const context = {
        pageNumber,
        totalPages,
        pageClassification: inspectionPage?.classification,
        pageTextHint,
    };
    const extractionTimeoutMs = resolveWorkerTextExtractionTimeoutMs(phaseTimeoutMs, pageTextHint);

    let result;
    if (pageTextHint) {
        // The text path uses the job config with its own extraction timeout.
        const extractionConfig = { ...config, timeout: extractionTimeoutMs };
        result = await withWorkerPageTimeout(
            extractTextFacts(pageTextHint, extractionConfig, context),
            extractionTimeoutMs,
            `Page ${pageNumber}: text extraction timed out after ${toSeconds(extractionTimeoutMs)}s`,
        );
    }
    else {
        result = await withWorkerPageTimeout(
            interpretation(pageInput.base64, pageInput.mimeType, phaseConfig, context),
            phaseTimeoutMs,
            `Page ${pageNumber}: visual page interpretation timed out after ${toSeconds(phaseTimeoutMs)}s`,
        );
    }

    return {
        result,
        ocrTextHint: pageTextHint,
        ocrSource,
        ocrWarnings,
    };
}
|
|
608
|
+
/**
 * Processes one page of a borehole-log ingest job.
 *
 * Runs OCR/text recovery under the phase timeout, then visual interpretation with the
 * running borehole context, and finally advances the cross-page processing state.
 * Recovery is not wrapped in a try/catch here: a recovery error or timeout rejects the
 * returned promise and is handled by the caller.
 *
 * @param {object} job - Persisted ingest job (reads `inspection`, `source.filePath`, `request.overrideBoreholeId`).
 * @param {object} pageInput - Prepared page (`pageNumber`, `totalPages`, `sourceKind`, `base64`, `mimeType`).
 * @param {object} config - LLM configuration for the job.
 * @param {object} state - Processing state carried over from earlier pages
 *   (reads `lastResolvedBoreholeId` and `priorContinuationDepth`).
 * @param {object} dependencies - Optional replacements for the recovery, transcription and interpretation collaborators.
 * @returns {Promise<{result: *, nextState: *, ocrTextHint: *, ocrSource: *, ocrWarnings: *}>}
 */
async function processBoreholePage(job, pageInput, config, state, dependencies) {
    const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
    const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
    const interpret = dependencies.interpretBoreholeLogWithContext ?? interpretBoreholeLogWithContext;

    const { pageNumber, totalPages, sourceKind } = pageInput;
    const inspectionPage = job.inspection?.pages[pageNumber - 1];
    const pageClassification = inspectionPage?.classification;

    const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
        classification: pageClassification,
        sourceKind,
    });
    const phaseConfig = { ...config, timeout: phaseTimeoutMs };
    const timeoutSeconds = Math.round(phaseTimeoutMs / 1000);

    // Seed the hint with the inspection text only when it is a string.
    let pageTextHint;
    if (typeof inspectionPage?.normalizedText === 'string') {
        pageTextHint = inspectionPage.normalizedText;
    }

    const recovery = await withWorkerPageTimeout(
        recoverTextHint({
            existingTextHint: pageTextHint,
            existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
            imageBase64: pageInput.base64,
            mimeType: pageInput.mimeType,
            config: phaseConfig,
            pdfFilePath: job.source.filePath,
            pdfPageNumber: pageNumber,
            visionTranscribe: transcribe,
        }),
        phaseTimeoutMs,
        `Page ${pageNumber}: OCR/text recovery timed out after ${timeoutSeconds}s`,
    );
    // A recovered hint replaces the inspection text; an empty one leaves it in place.
    if (recovery.textHint) {
        pageTextHint = recovery.textHint;
    }

    const context = {
        boreholeId: state.lastResolvedBoreholeId,
        pageNumber,
        totalPages,
        priorContinuationDepth: state.priorContinuationDepth,
        pageClassification,
        pageTextHint,
    };
    const result = await withWorkerPageTimeout(
        interpret(pageInput.base64, pageInput.mimeType, phaseConfig, context),
        phaseTimeoutMs,
        `Page ${pageNumber}: visual page interpretation timed out after ${timeoutSeconds}s`,
    );
    const nextState = advanceBoreholeProcessingState(state, result, pageTextHint, job.request.overrideBoreholeId);

    // Note: the reported OCR hint is the recovery output, not the merged `pageTextHint`.
    return {
        result,
        nextState,
        ocrTextHint: recovery.textHint,
        ocrSource: recovery.source,
        ocrWarnings: recovery.warnings,
    };
}
|
|
653
|
+
/**
 * Assembles the final ingest result for a job from its per-page checkpoints.
 *
 * When no page completed, a synthetic result is built from the inspection alone.
 * Otherwise the regular document ingest routine for the job's document type is run in
 * "replay" mode: its interpretation, text-extraction and transcription callbacks return
 * what the checkpoints already hold, and throw the recorded error for pages that did not
 * complete. The outcome is then passed through the matching failure-downgrade step.
 *
 * @param {object} job - Persisted ingest job with populated `checkpoints.pages`.
 * @param {Array<object>} pageInputs - Prepared page inputs (`pageNumber`, `base64`, `mimeType`, ...).
 * @param {object} config - LLM configuration forwarded to the ingest routine.
 * @param {object} dependencies - Injection points; only `now` is read here.
 * @returns {Promise<*>} The ingest result, or a synthetic result when nothing could be ingested.
 * @throws Re-throws any ingest error other than "No pages could be ingested successfully".
 */
async function finalizeJobResult(job, pageInputs, config, dependencies) {
    const isBoreholeLog = job.documentType === 'borehole-log';
    const buildSyntheticResult = () => (isBoreholeLog
        ? buildSyntheticBoreholeResult(job, job.inspection, dependencies.now)
        : buildSyntheticGeotechDocumentResult(job, job.inspection, dependencies.now));

    const hasCompletedPage = job.checkpoints.pages.some((page) => page.status === 'completed');
    if (!hasCompletedPage) {
        return buildSyntheticResult();
    }

    // Returns the stored result for a completed page, otherwise throws the recorded failure.
    const replayPageResult = (pageNumber, missingContextMessage) => {
        if (!pageNumber) {
            throw new Error(missingContextMessage);
        }
        const checkpoint = findCheckpoint(job, pageNumber);
        if (checkpoint.status === 'completed' && checkpoint.result) {
            return checkpoint.result;
        }
        throw new Error(normalizeCheckpointErrorMessage(checkpoint.error ?? `Page ${pageNumber} failed during async ingest.`));
    };

    // The transcription callback receives no page context, so the page is identified by
    // matching the image payload and MIME type against the prepared inputs.
    const replayTranscription = async (_base64, _mimeType, _config) => {
        const matchedInput = pageInputs.find((page) => page.base64 === _base64 && page.mimeType === _mimeType);
        const pageNumber = matchedInput?.pageNumber;
        const checkpoint = pageNumber ? findCheckpoint(job, pageNumber) : undefined;
        return {
            text: checkpoint?.ocrTextHint ?? '',
            warnings: checkpoint?.ocrWarnings ?? [],
            usedFallback: false,
            latencyMs: 0,
        };
    };

    const numericPageNumber = (context) => (typeof context?.pageNumber === 'number' ? context.pageNumber : undefined);
    const geotechMissingContext = 'Replay geotech-document ingest requires a page number context.';

    const inputsByPageNumber = new Map(pageInputs.map((page) => [page.pageNumber, page]));
    // Replay pages in checkpoint order, dropping checkpoints with no prepared input.
    const replayPages = () => job.checkpoints.pages
        .map((checkpoint) => inputsByPageNumber.get(checkpoint.pageNumber))
        .filter((page) => Boolean(page));
    const replaySource = () => ({
        filePath: job.source.filePath,
        fileName: basename(job.source.filePath),
        inputKind: 'pdf',
    });

    try {
        if (isBoreholeLog) {
            const boreholeResult = await ingestBoreholeLogDocument({
                config,
                source: replaySource(),
                overrideBoreholeId: job.request.overrideBoreholeId,
                inspection: job.inspection,
                pages: replayPages(),
                interpretPageWithContext: async (_base64, _mimeType, _config, context) => replayPageResult(context?.pageNumber, 'Replay borehole ingest requires a page number context.'),
                transcribePageImageText: replayTranscription,
                now: dependencies.now,
            });
            return applyBoreholeFailureDowngrades(boreholeResult, job.checkpoints.pages);
        }
        const geotechResult = await ingestGeotechDocument({
            config,
            source: replaySource(),
            inspection: job.inspection,
            pages: replayPages(),
            interpretPage: async (_base64, _mimeType, _config, context) => replayPageResult(numericPageNumber(context), geotechMissingContext),
            extractTextFacts: async (_pageText, _config, context) => replayPageResult(numericPageNumber(context), geotechMissingContext),
            transcribePageImageText: replayTranscription,
            now: dependencies.now,
        });
        return applyGeotechFailureDowngrades(geotechResult, job.checkpoints.pages);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        // Only the "nothing ingested" outcome degrades to a synthetic result; anything else propagates.
        if (/No pages could be ingested successfully/i.test(message)) {
            return buildSyntheticResult();
        }
        throw error;
    }
}
|
|
765
|
+
/**
 * Reports whether cancellation has been requested for a persisted ingest job.
 *
 * The job is re-read through `loadPersistedIngestJob` on every call.
 *
 * @param {string} jobId - Identifier of the persisted job.
 * @returns {boolean} `true` only when the job exists and `execution.cancelRequested` is exactly `true`.
 */
function isCancelled(jobId) {
    const persistedJob = loadPersistedIngestJob(jobId);
    return persistedJob?.execution.cancelRequested === true;
}
|
|
768
|
+
/**
 * Runs (or resumes) the worker for a persisted ingest job.
 *
 * Steps:
 *  1. Load the job; return it untouched if it is already `completed`.
 *  2. Mark it `running` (pid, heartbeat, cleared `lastError` / `cancelRequested`).
 *  3. Ensure an inspection with at least one page exists, creating page checkpoints for
 *     pages that have none.
 *  4. Process every page that is not yet completed:
 *     - `geotech-document`: pages run through `mapWithConcurrency`;
 *     - otherwise (borehole logs): pages run sequentially because each page depends on
 *       the state produced by the previous one.
 *     A fatal provider error marks all still-pending pages as failed/skipped.
 *  5. If cancellation was requested, mark the job `canceled`; otherwise finalize the
 *     result, optionally persist a review, and mark the job `completed`.
 *
 * Any error thrown during steps 3-5 is recorded on the job (`status: 'failed'`,
 * `execution.lastError`) instead of being re-thrown.
 *
 * @param {string} jobId - Identifier of the persisted ingest job.
 * @param {object} [dependencies] - Optional injection points (`inspectPdfDocument`,
 *   `persistReview`, `now`, `stopAfterNewPages`, plus those read by the page processors).
 * @returns {Promise<object>} The job as last persisted (falling back to the in-memory copy).
 * @throws {Error} When no job with `jobId` exists, or when persisting a job mutation
 *   outside the guarded section (or the final failure record itself) fails.
 */
export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
    const inspect = dependencies.inspectPdfDocument ?? inspectPdfDocument;
    const persistReview = dependencies.persistReview ?? persistBoreholeIngestReview;
    let currentJob = loadPersistedIngestJob(jobId);
    if (!currentJob) {
        throw new Error(`No persisted ingest job named "${jobId}" was found.`);
    }
    if (currentJob.status === 'completed') {
        return currentJob;
    }

    const timestamp = () => nowIso(dependencies.now);
    const pageWeight = (classification) => (classification === 'image-only' || classification === 'text-unreadable' ? 2 : 1);

    // Mutations are serialized through a single tail promise. The tail never rejects, so a
    // failed mutation is reported to its own caller only and later mutations still run.
    // Chaining directly onto a rejected promise would skip every later mutation, including
    // the final "mark job failed" write.
    let mutationTail = null;
    const mutateJob = async (mutator) => {
        const run = async () => {
            // Always start from the persisted copy so external changes (e.g. a cancel request) are kept.
            const latest = loadPersistedIngestJob(jobId) ?? currentJob;
            currentJob = mutator(latest);
            savePersistedIngestJob(currentJob);
        };
        const next = mutationTail ? mutationTail.then(run) : run();
        mutationTail = next.catch(() => undefined);
        await next;
    };

    // Refreshes the heartbeat and rewrites the checkpoint list through `mapCheckpoint`.
    const updateCheckpoints = (mapCheckpoint) => mutateJob((job) => ({
        ...job,
        updatedAt: timestamp(),
        execution: {
            ...job.execution,
            lastHeartbeatAt: timestamp(),
        },
        checkpoints: {
            pages: job.checkpoints.pages.map((checkpoint) => mapCheckpoint(checkpoint)),
        },
    }));

    // Applies `buildPatch(checkpoint)` to the checkpoint of one page, leaving the others untouched.
    const patchPage = (pageNumber, buildPatch) => updateCheckpoints((checkpoint) => (checkpoint.pageNumber === pageNumber
        ? { ...checkpoint, ...buildPatch(checkpoint) }
        : checkpoint));

    const recordAttempt = (pageNumber) => patchPage(pageNumber, (checkpoint) => ({
        attempts: checkpoint.attempts + 1,
        updatedAt: timestamp(),
    }));

    const recordSuccess = (pageNumber, processed) => patchPage(pageNumber, () => ({
        status: 'completed',
        updatedAt: timestamp(),
        completedAt: timestamp(),
        error: undefined,
        downgraded: false,
        ocrTextHint: processed.ocrTextHint,
        ocrSource: processed.ocrSource,
        ocrWarnings: processed.ocrWarnings,
        result: processed.result,
    }));

    // Classification and source kind are read from the checkpoint being patched, so both
    // document types decide `downgraded` the same way.
    const recordFailure = (pageNumber, normalizedMessage) => patchPage(pageNumber, (checkpoint) => ({
        status: 'failed',
        updatedAt: timestamp(),
        error: normalizedMessage,
        downgraded: isSlowVisualPageError(normalizedMessage, checkpoint.classification, checkpoint.sourceKind),
    }));

    // Marks still-pending checkpoints as failed after a fatal provider error. With
    // `onlyPageNumber` set, only that page is considered; otherwise every pending page is.
    const skipPendingAfterProviderStop = (stopMessage, onlyPageNumber) => updateCheckpoints((checkpoint) => {
        const isTargeted = onlyPageNumber === undefined || checkpoint.pageNumber === onlyPageNumber;
        if (!isTargeted || checkpoint.status !== 'pending') {
            return checkpoint;
        }
        return {
            ...checkpoint,
            status: 'failed',
            updatedAt: timestamp(),
            error: `skipped after upstream provider stop. ${normalizeCheckpointErrorMessage(stopMessage)}`,
            downgraded: false,
        };
    });

    await mutateJob((job) => ({
        ...job,
        status: 'running',
        startedAt: job.startedAt ?? timestamp(),
        updatedAt: timestamp(),
        execution: {
            ...job.execution,
            pid: process.pid,
            lastHeartbeatAt: timestamp(),
            lastError: undefined,
            cancelRequested: false,
        },
    }));

    try {
        // One flag drives both "inspect" and "persist the inspection", so a stored inspection
        // without a positive page count is always replaced rather than inspected and discarded.
        const needsInspection = !(currentJob.inspection && currentJob.inspection.totalPages > 0);
        if (needsInspection) {
            const inspection = inspect(currentJob.source.filePath);
            await mutateJob((job) => ({
                ...job,
                inspection,
                source: {
                    ...job.source,
                    totalPages: inspection.totalPages,
                    weightedPageCost: inspection.pages.reduce((sum, page) => sum + pageWeight(page.classification), 0),
                },
                checkpoints: {
                    // Keep existing checkpoints (resume); create pending ones for new pages.
                    pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
                        pageNumber: page.pageNumber,
                        classification: page.classification,
                        sourceKind: mapPageSourceKind(page.classification),
                        weight: pageWeight(page.classification),
                        status: 'pending',
                        attempts: 0,
                        updatedAt: timestamp(),
                    })),
                },
            }));
        }

        const config = buildJobConfig(currentJob, dependencies);
        const pageInputs = await preparePdfPageInputs(currentJob.source.filePath, currentJob.inspection, currentJob.processing.pagePreprocessingConcurrency, dependencies);
        let processedNewPages = 0;

        if (currentJob.documentType === 'geotech-document') {
            const pendingPages = pageInputs
                .filter((page) => findCheckpoint(currentJob, page.pageNumber).status !== 'completed')
                .sort((left, right) => left.pageNumber - right.pageNumber);
            let fatalProviderStopMessage = null;

            await mapWithConcurrency(pendingPages, currentJob.processing.chunkExtractionConcurrency, async (page) => {
                if (isCancelled(jobId)) {
                    return;
                }
                if (fatalProviderStopMessage) {
                    // A sibling page already hit a fatal provider error: skip instead of calling the provider again.
                    await skipPendingAfterProviderStop(fatalProviderStopMessage, page.pageNumber);
                    return;
                }
                await recordAttempt(page.pageNumber);
                try {
                    const processed = await processGeotechDocumentPage(currentJob, page, config, dependencies);
                    processedNewPages += 1;
                    await recordSuccess(page.pageNumber, processed);
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    const normalizedMessage = normalizeCheckpointErrorMessage(message);
                    if (!fatalProviderStopMessage && isFatalProviderStopError(normalizedMessage)) {
                        fatalProviderStopMessage = normalizedMessage;
                    }
                    await recordFailure(page.pageNumber, normalizedMessage);
                }
            });

            if (fatalProviderStopMessage) {
                await skipPendingAfterProviderStop(fatalProviderStopMessage);
            }
        }
        else {
            let state = {
                currentGroupBoreholeId: null,
                hasCurrentGroup: false,
                lastResolvedBoreholeId: currentJob.request.overrideBoreholeId,
                priorContinuationDepth: null,
            };
            // Sorted in place (as before), so finalization sees the same page order.
            for (const page of pageInputs.sort((left, right) => left.pageNumber - right.pageNumber)) {
                const checkpoint = findCheckpoint(currentJob, page.pageNumber);
                if (checkpoint.status === 'completed' && checkpoint.result) {
                    // Resume: replay the stored result so the running state matches a fresh run.
                    state = advanceBoreholeProcessingState(state, checkpoint.result, checkpoint.ocrTextHint, currentJob.request.overrideBoreholeId);
                    continue;
                }
                if (isCancelled(jobId)) {
                    break;
                }
                if (dependencies.stopAfterNewPages != null && processedNewPages >= dependencies.stopAfterNewPages) {
                    throw new Error('Ingest job worker interrupted after checkpoint for test harness.');
                }
                await recordAttempt(page.pageNumber);
                try {
                    const processed = await processBoreholePage(currentJob, page, config, state, dependencies);
                    processedNewPages += 1;
                    state = processed.nextState;
                    await recordSuccess(page.pageNumber, processed);
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    const normalizedMessage = normalizeCheckpointErrorMessage(message);
                    await recordFailure(page.pageNumber, normalizedMessage);
                    if (isFatalProviderStopError(normalizedMessage)) {
                        await skipPendingAfterProviderStop(message);
                        break;
                    }
                }
            }
        }

        currentJob = loadPersistedIngestJob(jobId) ?? currentJob;
        if (currentJob.execution.cancelRequested) {
            await mutateJob((job) => ({
                ...job,
                status: 'canceled',
                updatedAt: timestamp(),
                canceledAt: timestamp(),
                execution: {
                    ...job.execution,
                    pid: undefined,
                },
            }));
            return loadPersistedIngestJob(jobId) ?? currentJob;
        }

        const finalResult = await finalizeJobResult(currentJob, pageInputs, config, dependencies);
        const persistedReview = currentJob.request.projectId
            ? persistReview(currentJob.request.projectId, finalResult, {
                title: currentJob.request.reviewTitle,
            })
            : null;
        await mutateJob((job) => ({
            ...job,
            status: 'completed',
            updatedAt: timestamp(),
            completedAt: timestamp(),
            execution: {
                ...job.execution,
                pid: undefined,
                lastHeartbeatAt: timestamp(),
            },
            result: {
                ingestResult: finalResult,
                persistedReview: persistedReview
                    ? {
                        datasetName: persistedReview.datasetName,
                        reviewId: persistedReview.reviewId,
                        createdAt: persistedReview.createdAt,
                    }
                    : undefined,
            },
        }));
    }
    catch (error) {
        // Failures are recorded on the job rather than re-thrown; callers inspect the returned job.
        await mutateJob((job) => ({
            ...job,
            status: 'failed',
            updatedAt: timestamp(),
            execution: {
                ...job.execution,
                pid: undefined,
                lastHeartbeatAt: timestamp(),
                lastError: error instanceof Error ? error.message : String(error),
            },
        }));
    }
    return loadPersistedIngestJob(jobId) ?? currentJob;
}
|
|
1129
|
+
//# sourceMappingURL=job-worker.js.map
|