@geotechcli/core 0.4.21 → 0.4.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/dist/agents/brain.d.ts +1 -5
  2. package/dist/agents/brain.d.ts.map +1 -1
  3. package/dist/agents/brain.js +4 -120
  4. package/dist/agents/brain.js.map +1 -1
  5. package/dist/agents/data-tools.js +759 -0
  6. package/dist/agents/data-tools.js.map +1 -1
  7. package/dist/agents/runtime-bootstrap.d.ts +6 -0
  8. package/dist/agents/runtime-bootstrap.d.ts.map +1 -0
  9. package/dist/agents/runtime-bootstrap.js +8 -0
  10. package/dist/agents/runtime-bootstrap.js.map +1 -0
  11. package/dist/agents/runtime-fallbacks.d.ts +7 -0
  12. package/dist/agents/runtime-fallbacks.d.ts.map +1 -0
  13. package/dist/agents/runtime-fallbacks.js +87 -0
  14. package/dist/agents/runtime-fallbacks.js.map +1 -0
  15. package/dist/agents/swarm.d.ts +1 -4
  16. package/dist/agents/swarm.d.ts.map +1 -1
  17. package/dist/agents/swarm.js +74 -8
  18. package/dist/agents/swarm.js.map +1 -1
  19. package/dist/agents/tool-runtime.d.ts +7 -0
  20. package/dist/agents/tool-runtime.d.ts.map +1 -0
  21. package/dist/agents/tool-runtime.js +9 -0
  22. package/dist/agents/tool-runtime.js.map +1 -0
  23. package/dist/config/index.d.ts +4 -4
  24. package/dist/config/index.js +1 -1
  25. package/dist/config/index.js.map +1 -1
  26. package/dist/geo/coordinates.d.ts +40 -0
  27. package/dist/geo/coordinates.d.ts.map +1 -0
  28. package/dist/geo/coordinates.js +461 -0
  29. package/dist/geo/coordinates.js.map +1 -0
  30. package/dist/geo/index.d.ts +1 -0
  31. package/dist/geo/index.d.ts.map +1 -1
  32. package/dist/geo/index.js +1 -0
  33. package/dist/geo/index.js.map +1 -1
  34. package/dist/index.d.ts +3 -2
  35. package/dist/index.d.ts.map +1 -1
  36. package/dist/index.js +3 -2
  37. package/dist/index.js.map +1 -1
  38. package/dist/ingest/ags.d.ts +3 -0
  39. package/dist/ingest/ags.d.ts.map +1 -1
  40. package/dist/ingest/ags.js +98 -9
  41. package/dist/ingest/ags.js.map +1 -1
  42. package/dist/ingest/cpt.d.ts +4 -0
  43. package/dist/ingest/cpt.d.ts.map +1 -1
  44. package/dist/ingest/cpt.js +87 -25
  45. package/dist/ingest/cpt.js.map +1 -1
  46. package/dist/ingest/document-inputs.d.ts +37 -0
  47. package/dist/ingest/document-inputs.d.ts.map +1 -0
  48. package/dist/ingest/document-inputs.js +197 -0
  49. package/dist/ingest/document-inputs.js.map +1 -0
  50. package/dist/ingest/geotech-document.d.ts +118 -0
  51. package/dist/ingest/geotech-document.d.ts.map +1 -0
  52. package/dist/ingest/geotech-document.js +1006 -0
  53. package/dist/ingest/geotech-document.js.map +1 -0
  54. package/dist/ingest/geotech-extract.d.ts +86 -0
  55. package/dist/ingest/geotech-extract.d.ts.map +1 -0
  56. package/dist/ingest/geotech-extract.js +652 -0
  57. package/dist/ingest/geotech-extract.js.map +1 -0
  58. package/dist/ingest/geotech-schemas.d.ts +248 -0
  59. package/dist/ingest/geotech-schemas.d.ts.map +1 -0
  60. package/dist/ingest/geotech-schemas.js +150 -0
  61. package/dist/ingest/geotech-schemas.js.map +1 -0
  62. package/dist/ingest/index.d.ts +8 -0
  63. package/dist/ingest/index.d.ts.map +1 -1
  64. package/dist/ingest/index.js +8 -0
  65. package/dist/ingest/index.js.map +1 -1
  66. package/dist/ingest/ingest-job-child.d.ts +2 -0
  67. package/dist/ingest/ingest-job-child.d.ts.map +1 -0
  68. package/dist/ingest/ingest-job-child.js +45 -0
  69. package/dist/ingest/ingest-job-child.js.map +1 -0
  70. package/dist/ingest/job-store.d.ts +117 -0
  71. package/dist/ingest/job-store.d.ts.map +1 -0
  72. package/dist/ingest/job-store.js +541 -0
  73. package/dist/ingest/job-store.js.map +1 -0
  74. package/dist/ingest/job-worker.d.ts +24 -0
  75. package/dist/ingest/job-worker.d.ts.map +1 -0
  76. package/dist/ingest/job-worker.js +1129 -0
  77. package/dist/ingest/job-worker.js.map +1 -0
  78. package/dist/ingest/pdf.d.ts +102 -0
  79. package/dist/ingest/pdf.d.ts.map +1 -0
  80. package/dist/ingest/pdf.js +1544 -0
  81. package/dist/ingest/pdf.js.map +1 -0
  82. package/dist/ingest/review-store.d.ts +215 -0
  83. package/dist/ingest/review-store.d.ts.map +1 -0
  84. package/dist/ingest/review-store.js +1995 -0
  85. package/dist/ingest/review-store.js.map +1 -0
  86. package/dist/llm/capabilities.d.ts +8 -0
  87. package/dist/llm/capabilities.d.ts.map +1 -0
  88. package/dist/llm/capabilities.js +73 -0
  89. package/dist/llm/capabilities.js.map +1 -0
  90. package/dist/llm/index.d.ts +3 -2
  91. package/dist/llm/index.d.ts.map +1 -1
  92. package/dist/llm/index.js +2 -1
  93. package/dist/llm/index.js.map +1 -1
  94. package/dist/llm/providers/anthropic.d.ts +6 -0
  95. package/dist/llm/providers/anthropic.d.ts.map +1 -1
  96. package/dist/llm/providers/anthropic.js +10 -1
  97. package/dist/llm/providers/anthropic.js.map +1 -1
  98. package/dist/llm/providers/hosted-beta.d.ts +6 -0
  99. package/dist/llm/providers/hosted-beta.d.ts.map +1 -1
  100. package/dist/llm/providers/hosted-beta.js +40 -10
  101. package/dist/llm/providers/hosted-beta.js.map +1 -1
  102. package/dist/llm/providers/huggingface.d.ts +6 -0
  103. package/dist/llm/providers/huggingface.d.ts.map +1 -1
  104. package/dist/llm/providers/huggingface.js +21 -1
  105. package/dist/llm/providers/huggingface.js.map +1 -1
  106. package/dist/llm/providers/openai-compatible.d.ts +6 -0
  107. package/dist/llm/providers/openai-compatible.d.ts.map +1 -1
  108. package/dist/llm/providers/openai-compatible.js +21 -1
  109. package/dist/llm/providers/openai-compatible.js.map +1 -1
  110. package/dist/llm/providers/zhipu.d.ts +6 -0
  111. package/dist/llm/providers/zhipu.d.ts.map +1 -1
  112. package/dist/llm/providers/zhipu.js +15 -1
  113. package/dist/llm/providers/zhipu.js.map +1 -1
  114. package/dist/llm/router.d.ts +7 -0
  115. package/dist/llm/router.d.ts.map +1 -1
  116. package/dist/llm/router.js +33 -13
  117. package/dist/llm/router.js.map +1 -1
  118. package/dist/llm/types.d.ts +22 -4
  119. package/dist/llm/types.d.ts.map +1 -1
  120. package/dist/llm/types.js.map +1 -1
  121. package/dist/meta/metadata.json +1 -1
  122. package/dist/report/html.d.ts +3 -0
  123. package/dist/report/html.d.ts.map +1 -0
  124. package/dist/report/html.js +626 -0
  125. package/dist/report/html.js.map +1 -0
  126. package/dist/report/index.d.ts +2 -0
  127. package/dist/report/index.d.ts.map +1 -1
  128. package/dist/report/index.js +2 -0
  129. package/dist/report/index.js.map +1 -1
  130. package/dist/report/ingest-dossier.d.ts +81 -0
  131. package/dist/report/ingest-dossier.d.ts.map +1 -0
  132. package/dist/report/ingest-dossier.js +324 -0
  133. package/dist/report/ingest-dossier.js.map +1 -0
  134. package/dist/storage/index.d.ts +5 -0
  135. package/dist/storage/index.d.ts.map +1 -1
  136. package/dist/storage/index.js +12 -6
  137. package/dist/storage/index.js.map +1 -1
  138. package/dist/vision/geotech-document.d.ts +46 -0
  139. package/dist/vision/geotech-document.d.ts.map +1 -0
  140. package/dist/vision/geotech-document.js +576 -0
  141. package/dist/vision/geotech-document.js.map +1 -0
  142. package/dist/vision/index.d.ts +31 -0
  143. package/dist/vision/index.d.ts.map +1 -1
  144. package/dist/vision/index.js +659 -27
  145. package/dist/vision/index.js.map +1 -1
  146. package/dist/vision/ocr.d.ts +29 -0
  147. package/dist/vision/ocr.d.ts.map +1 -0
  148. package/dist/vision/ocr.js +287 -0
  149. package/dist/vision/ocr.js.map +1 -0
  150. package/dist/vision/preprocess.d.ts +26 -0
  151. package/dist/vision/preprocess.d.ts.map +1 -0
  152. package/dist/vision/preprocess.js +194 -0
  153. package/dist/vision/preprocess.js.map +1 -0
  154. package/package.json +5 -1
@@ -0,0 +1,1129 @@
1
+ import { basename } from 'node:path';
2
+ import { buildLLMConfig } from '../config/index.js';
3
+ import { readDocumentPdfPageInputs } from './document-inputs.js';
4
+ import { inspectPdfDocument } from './pdf.js';
5
+ import { ingestBoreholeLogDocument, summarizeBoreholeIngestInspection, } from './geotech-extract.js';
6
+ import { buildPreflightLowYieldInsight, ingestGeotechDocument, inferPreflightLowYieldPageRole, } from './geotech-document.js';
7
+ import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
8
+ import { interpretBoreholeLogWithContext, transcribeDocumentImageText, } from '../vision/index.js';
9
+ import { recoverDocumentTextHint } from '../vision/ocr.js';
10
+ import { persistBoreholeIngestReview } from './review-store.js';
11
+ import { loadPersistedIngestJob, savePersistedIngestJob, } from './job-store.js';
12
/**
 * Error-message signatures matched by isSlowVisualPageError: timeouts, busy
 * providers, empty completions and 503/504 responses.
 */
const SLOW_VISUAL_ERROR_PATTERNS = [
    // Timeouts and overloaded providers.
    /timeout/i,
    /provider is busy/i,
    // Responses that arrived without usable assistant output.
    /returned no content/i,
    /did not contain assistant text/i,
    /no completion choices/i,
    /empty completion/i,
    // Transient availability problems.
    /temporarily unavailable/i,
    /\b503\b/i,
    /\b504\b/i,
];

/**
 * Error-message signatures matched by isFatalProviderStopError: exhausted
 * quotas, daily limits and rate limiting (including HTTP 429).
 */
const FATAL_PROVIDER_STOP_PATTERNS = [
    // Quota and daily allowance exhaustion.
    /daily limit reached/i,
    /remaining today:\s*0/i,
    /insufficient[_\s-]?quota/i,
    /quota exceeded/i,
    // Rate limiting.
    /rate limit/i,
    /\b429\b/i,
];
31
/**
 * Formats the current instant as an ISO-8601 string.
 *
 * @param {(() => Date) | null | undefined} now Optional clock; when nullish a
 *   fresh `new Date()` is used.
 * @returns {string} Result of `toISOString()` on the clock's date.
 */
function nowIso(now) {
    const clock = now ?? (() => new Date());
    const instant = clock();
    return instant.toISOString();
}
34
/**
 * Returns the distinct non-blank strings from `values`, in first-seen order.
 * Non-string entries and strings that are empty after trimming are dropped;
 * kept strings are returned untrimmed.
 *
 * @param {unknown[]} values Candidate values.
 * @returns {string[]} De-duplicated strings.
 */
function uniqueStrings(values) {
    const distinct = new Set();
    for (const candidate of values) {
        const isNonBlankString = typeof candidate === 'string' && candidate.trim().length > 0;
        if (isNonBlankString) {
            distinct.add(candidate);
        }
    }
    return Array.from(distinct);
}
37
/**
 * Tells whether an ingest result is tagged as a borehole log.
 *
 * @param {{ documentType?: string }} result Ingest result to inspect.
 * @returns {boolean} True when `documentType` is exactly 'borehole-log'.
 */
function isBoreholeResult(result) {
    const { documentType } = result;
    return documentType === 'borehole-log';
}
40
/**
 * Collapses whitespace in a text hint and caps it at 1600 characters.
 *
 * @param {unknown} value Raw hint; anything other than a string is rejected.
 * @returns {string | undefined} The single-spaced, trimmed and truncated hint,
 *   or undefined when the input is not a string or is blank.
 */
function normalizeTextHint(value) {
    if (typeof value !== 'string') {
        return undefined;
    }
    const collapsed = value.replace(/\s+/g, ' ').trim();
    if (collapsed.length === 0) {
        return undefined;
    }
    return collapsed.slice(0, 1600);
}
47
/**
 * Maps a page classification to the source kind used for that page.
 *
 * @param {string | undefined} classification Page classification label.
 * @returns {'raster-image' | 'pdf-page'} 'raster-image' for 'image-only' and
 *   'text-unreadable' pages, 'pdf-page' for everything else.
 */
function mapPageSourceKind(classification) {
    switch (classification) {
        case 'image-only':
        case 'text-unreadable':
            return 'raster-image';
        default:
            return 'pdf-page';
    }
}
50
/**
 * Maps `values` through an async `iterator` with at most `concurrency` calls
 * in flight, returning results in input order.
 *
 * A `concurrency` that is missing or not numeric (NaN after coercion) now
 * falls back to sequential processing. Previously it produced a worker count
 * of NaN, so no worker ran and a sparse array of holes was returned without
 * the iterator ever being called.
 *
 * @param {unknown[]} values Items to process.
 * @param {number} concurrency Maximum simultaneous iterator calls; clamped to
 *   the range [1, values.length].
 * @param {(value: unknown, index: number) => unknown} iterator Mapper invoked
 *   once per item; its result is awaited.
 * @returns {Promise<unknown[]>} Results positioned at their input index.
 *   Rejects with the first iterator error.
 */
async function mapWithConcurrency(values, concurrency, iterator) {
    const total = values.length;
    if (total === 0) {
        return [];
    }
    const requested = Number(concurrency);
    const workerCount = Number.isNaN(requested)
        ? 1
        : Math.max(1, Math.min(requested, total));
    const results = new Array(total);
    let cursor = 0;
    const runWorker = async () => {
        // Workers pull the next unclaimed index; claiming is synchronous, so
        // no two workers can take the same item.
        while (cursor < total) {
            const index = cursor;
            cursor += 1;
            results[index] = await iterator(values[index], index);
        }
    };
    await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
    return results;
}
67
/**
 * Heuristically checks whether page text reads like a borehole/test log.
 *
 * @param {string | undefined | null} value Page text hint.
 * @returns {boolean} True when any signature matches; false for empty input.
 */
function pageTextHintLooksBoreholeLike(value) {
    if (!value) {
        return false;
    }
    const boreholeSignatures = [
        /\bborehole\b/i,
        // Hole identifiers such as BH-1, TP 3, CPT_07.
        /\bBH[-\s_/]?\d+\b/i,
        /\bTP[-\s_/]?\d+\b/i,
        /\bCPT[-\s_/]?\d+\b/i,
        /\bSPT\b/i,
        /\b(?:easting|northing|latitude|longitude|groundwater)\b/i,
        // Depth intervals such as "1.5 - 3.0 m".
        /\b\d+(?:\.\d+)?\s*-\s*\d+(?:\.\d+)?\s*m\b/i,
    ];
    return boreholeSignatures.some((signature) => signature.test(value));
}
79
/**
 * Normalizes a detected borehole id, treating placeholders as "unknown".
 *
 * @param {unknown} value Raw borehole id.
 * @returns {string | null} The trimmed id, or null when the value is not a
 *   string, is blank, or is the 'BH-unknown' placeholder.
 */
function normalizeKnownBoreholeId(value) {
    const candidate = typeof value === 'string' ? value.trim() : '';
    const isPlaceholder = candidate === '' || candidate === 'BH-unknown';
    return isPlaceholder ? null : candidate;
}
89
/**
 * Finds the smallest finite depth among all layer boundaries of a result.
 *
 * @param {{ layers: Array<{ depthFrom?: unknown, depthTo?: unknown }> }} result
 *   Result whose layers expose `depthFrom` and `depthTo`.
 * @returns {number | null} The minimum finite depth, or null when no layer
 *   carries a finite numeric boundary.
 */
function minimumLayerDepth(result) {
    return result.layers
        .flatMap((layer) => [layer.depthFrom, layer.depthTo])
        .filter((depth) => depth != null && Number.isFinite(depth))
        .reduce((shallowest, depth) => (shallowest == null ? depth : Math.min(shallowest, depth)), null);
}
99
/**
 * Reports whether a page interpretation carries any borehole information.
 *
 * @param {object} result Page interpretation; `layers` plus the listed
 *   metadata fields are examined.
 * @param {string | null | undefined} detectedBoreholeId Normalized id, if any.
 * @returns {boolean} True when an id was detected, at least one layer exists,
 *   or any of the metadata fields is non-nullish.
 */
function hasUsableBoreholeSignal(result, detectedBoreholeId) {
    if (detectedBoreholeId != null || result.layers.length > 0) {
        return true;
    }
    const metadataFields = [
        result.totalDepth,
        result.waterTableDepth,
        result.location,
        result.groundElevation,
        result.dateDrilled,
        result.drillingMethod,
    ];
    return metadataFields.some((field) => field != null);
}
109
/**
 * Decides whether a page should be skipped as "not a borehole log".
 *
 * @param {object} result Page interpretation.
 * @param {string | null} detectedBoreholeId Normalized borehole id, if any.
 * @param {string | undefined} pageTextHint Text recovered for the page.
 * @returns {boolean} True only when the interpretation has no usable borehole
 *   signal and the text hint does not look borehole-like either.
 */
function shouldIgnoreNonLogPage(result, detectedBoreholeId, pageTextHint) {
    const looksLikeLogPage = hasUsableBoreholeSignal(result, detectedBoreholeId)
        || pageTextHintLooksBoreholeLike(pageTextHint);
    return !looksLikeLogPage;
}
113
/**
 * Detects an unlabeled page that appears to begin a new borehole: a group is
 * already open, the previous page's continuation depth was at least 3, and
 * this page's shallowest layer boundary is at or above 0.5.
 *
 * @param {object} result Page interpretation (its layers supply the depth).
 * @param {{ hasCurrentGroup: unknown, priorContinuationDepth: number | null | undefined }} state
 *   Running grouping state.
 * @returns {unknown} Truthy when a new anonymous group should start. The
 *   value is `state.hasCurrentGroup` itself when that is falsy, otherwise a
 *   boolean.
 */
function shouldStartNewAnonymousGroup(result, state) {
    const { hasCurrentGroup, priorContinuationDepth: previousDepth } = state;
    const startDepth = minimumLayerDepth(result);
    const previousPageWentDeep = previousDepth != null && previousDepth >= 3;
    const restartsNearSurface = startDepth != null && startDepth <= 0.5;
    return hasCurrentGroup && previousPageWentDeep && restartsNearSurface;
}
121
/**
 * Produces the next borehole grouping state after one page was interpreted.
 * The input state is never mutated; a shallow copy is returned.
 *
 * Resolution of the current group's borehole id, in priority order:
 *   1. an explicit `overrideBoreholeId`;
 *   2. the id detected on this page;
 *   3. null when no group was open yet, or when the page looks like the start
 *      of a new anonymous borehole;
 *   4. otherwise the existing id is kept.
 *
 * @param {object} state Current state (`hasCurrentGroup`,
 *   `currentGroupBoreholeId`, `lastResolvedBoreholeId`, `priorContinuationDepth`).
 * @param {object} result Page interpretation (`boreholeId`, `continuationDepth`, layers, ...).
 * @param {string | undefined} pageTextHint Text recovered for the page.
 * @param {string | undefined | null} overrideBoreholeId Caller-forced id.
 * @returns {object} Updated copy of the state; an unchanged copy when the
 *   page is ignored as a non-log page.
 */
function advanceBoreholeProcessingState(state, result, pageTextHint, overrideBoreholeId) {
    const updated = { ...state };
    const detectedId = normalizeKnownBoreholeId(result.boreholeId);
    if (shouldIgnoreNonLogPage(result, detectedId, pageTextHint)) {
        return updated;
    }
    const hadOpenGroup = updated.hasCurrentGroup;
    updated.hasCurrentGroup = true;
    if (overrideBoreholeId) {
        updated.currentGroupBoreholeId = overrideBoreholeId;
    }
    else if (detectedId) {
        updated.currentGroupBoreholeId = detectedId;
    }
    else if (!hadOpenGroup || shouldStartNewAnonymousGroup(result, updated)) {
        // No id on the page: a first page or a depth restart opens an
        // anonymous group; any other page keeps the current group id.
        updated.currentGroupBoreholeId = null;
    }
    updated.lastResolvedBoreholeId = overrideBoreholeId ?? updated.currentGroupBoreholeId ?? undefined;
    updated.priorContinuationDepth = result.continuationDepth ?? null;
    return updated;
}
154
/**
 * Tells whether an error on a visual page matches a slow-provider signature.
 *
 * @param {string} message Error message to classify.
 * @param {string | undefined} classification Page classification.
 * @param {string | undefined} sourceKind Page source kind.
 * @returns {boolean} True when the page is visual (raster source, or
 *   classified 'image-only' / 'text-unreadable') and the message matches one
 *   of SLOW_VISUAL_ERROR_PATTERNS.
 */
function isSlowVisualPageError(message, classification, sourceKind) {
    const isVisualPage = sourceKind === 'raster-image'
        || ['image-only', 'text-unreadable'].includes(classification);
    if (!isVisualPage) {
        return false;
    }
    return SLOW_VISUAL_ERROR_PATTERNS.some((signature) => signature.test(message));
}
158
/**
 * Tells whether an error message matches one of FATAL_PROVIDER_STOP_PATTERNS
 * (quota, daily-limit or rate-limit signatures).
 *
 * @param {string} message Error message to classify.
 * @returns {boolean} True on the first matching signature.
 */
function isFatalProviderStopError(message) {
    for (const signature of FATAL_PROVIDER_STOP_PATTERNS) {
        if (signature.test(message)) {
            return true;
        }
    }
    return false;
}
161
/**
 * Strips up to four leading "Page N:" prefixes from an error message.
 *
 * @param {string} message Raw error message.
 * @returns {string} Trimmed message without the leading page prefixes; a
 *   fifth and later prefix is left in place.
 */
function normalizeCheckpointErrorMessage(message) {
    const pagePrefix = /^Page \d+:\s*/i;
    let text = message.trim();
    let strippedPrefixes = 0;
    while (strippedPrefixes < 4 && pagePrefix.test(text)) {
        text = text.replace(pagePrefix, '').trim();
        strippedPrefixes += 1;
    }
    return text;
}
172
/**
 * Awaits `promise`, rejecting with `errorMessage` if it has not settled
 * within `timeoutMs`.
 *
 * The timer is now cleared as soon as the race settles. Previously it stayed
 * scheduled for the full timeout after the work had already finished, leaving
 * one live timer per page phase.
 *
 * @param {Promise<unknown>} promise Work to guard. It is not cancelled on
 *   timeout; only the returned promise rejects.
 * @param {number} timeoutMs Budget in milliseconds. A non-finite or
 *   non-positive value disables the timeout.
 * @param {string} errorMessage Message of the Error raised on timeout.
 * @returns {Promise<unknown>} The settled value of `promise`.
 * @throws {Error} With `errorMessage` on timeout, or whatever `promise`
 *   rejects with.
 */
async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
    if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
        return promise;
    }
    let timer;
    const deadline = new Promise((_, reject) => {
        timer = setTimeout(() => {
            reject(new Error(errorMessage));
        }, timeoutMs);
        // Never let the watchdog alone keep the process alive.
        timer.unref?.();
    });
    try {
        return await Promise.race([promise, deadline]);
    }
    finally {
        clearTimeout(timer);
    }
}
186
/**
 * Chooses the per-phase timeout for one page.
 *
 * - Heavy visual pages (raster source, or classified 'image-only' /
 *   'text-unreadable') always get 180 000 ms.
 * - Other pages get `config.timeout` clamped to [60 000, 120 000] ms,
 *   defaulting to 120 000 ms when the setting is missing.
 *
 * A `config.timeout` that is not numeric (NaN after coercion) used to
 * propagate as NaN, which silently disabled the page timeout downstream; it
 * now falls back to the default.
 *
 * @param {{ timeout?: number | null }} config LLM configuration.
 * @param {{ sourceKind?: string, classification?: string }} input Page traits.
 * @returns {number} Timeout in milliseconds.
 */
function resolveWorkerPhaseTimeoutMs(config, input) {
    const DEFAULT_TIMEOUT_MS = 120000;
    const MIN_TIMEOUT_MS = 60000;
    const HEAVY_VISUAL_TIMEOUT_MS = 180000;
    const isHeavyVisualPage = input.sourceKind === 'raster-image'
        || input.classification === 'image-only'
        || input.classification === 'text-unreadable';
    if (isHeavyVisualPage) {
        return HEAVY_VISUAL_TIMEOUT_MS;
    }
    const requested = Number(config.timeout ?? DEFAULT_TIMEOUT_MS);
    if (Number.isNaN(requested)) {
        return DEFAULT_TIMEOUT_MS;
    }
    return Math.min(Math.max(requested, MIN_TIMEOUT_MS), DEFAULT_TIMEOUT_MS);
}
195
/**
 * Extends the phase timeout for text extraction on long text hints.
 *
 * - hint of 1800+ characters: base raised to at least 150 000 ms, capped at 180 000 ms;
 * - hint of 1000+ characters: base raised to at least 120 000 ms;
 * - shorter or missing hint: base unchanged.
 *
 * NOTE(review): normalizeTextHint in this file truncates hints to 1600
 * characters, so hints that passed through it can never reach the
 * 1800-character tier — confirm whether the threshold or the cap is intended.
 *
 * @param {number} baseTimeoutMs Phase timeout in milliseconds.
 * @param {string | undefined} textHint Text that will be sent for extraction.
 * @returns {number} Timeout in milliseconds for the extraction call.
 */
function resolveWorkerTextExtractionTimeoutMs(baseTimeoutMs, textHint) {
    const hintLength = textHint ? textHint.length : 0;
    if (hintLength >= 1800) {
        const raised = Math.max(baseTimeoutMs, 150000);
        return Math.min(raised, 180000);
    }
    if (hintLength >= 1000) {
        return Math.max(baseTimeoutMs, 120000);
    }
    return baseTimeoutMs;
}
207
/**
 * Reads the per-page inputs of a PDF and projects each page onto the fields
 * the worker consumes, defaulting `sourceKind` to 'pdf-page'.
 *
 * The projection is purely synchronous, so it is now a plain mapping instead
 * of a pass through the async worker pool. That also removes a failure mode
 * where a missing/non-numeric `concurrency` yielded an array of empty slots.
 *
 * @param {string} filePath Path of the PDF.
 * @param {object} inspection Inspection forwarded to the page reader.
 * @param {number} concurrency Retained for interface compatibility; no longer
 *   affects the result.
 * @param {object} [dependencies] Optional overrides:
 *   `readDocumentPdfPageInputs` (page reader) and
 *   `extractPrimaryPdfPageImages` (forwarded as `extractPageImages`).
 * @returns {Promise<Array<object>>} One entry per page with `base64`,
 *   `mimeType`, `fileBytes`, `pageNumber`, `totalPages` and `sourceKind`.
 */
async function preparePdfPageInputs(filePath, inspection, concurrency, dependencies = {}) {
    const readPageInputs = dependencies.readDocumentPdfPageInputs ?? readDocumentPdfPageInputs;
    const normalizedPageInputs = await readPageInputs(filePath, {
        inspection,
        dependencies: {
            extractPageImages: dependencies.extractPrimaryPdfPageImages,
        },
    });
    return Array.from(normalizedPageInputs, (page) => ({
        base64: page.base64,
        mimeType: page.mimeType,
        fileBytes: page.fileBytes,
        pageNumber: page.pageNumber,
        totalPages: page.totalPages,
        sourceKind: page.sourceKind ?? 'pdf-page',
    }));
}
224
/**
 * Aggregates page-level inspection data into document-level counters.
 *
 * @param {{ pages: Array<object> } | null | undefined} inspection PDF
 *   inspection; each page exposes `classification`,
 *   `capabilities.nativeTextExtraction` and `degradation.level`.
 * @returns {object | null} Null when no inspection is given; otherwise the
 *   per-classification counts, the number of image-heavy pages ('image-only'
 *   or 'text-unreadable'), pages whose native text extraction is not
 *   'unavailable', pages whose degradation level is not 'none', and
 *   `ocrRecoveredPageCount` fixed at 0.
 */
function buildInspectionSummary(inspection) {
    if (!inspection) {
        return null;
    }
    const summary = {
        pageClassificationCounts: {},
        imageHeavyPageCount: 0,
        nativeTextPageCount: 0,
        degradedPageCount: 0,
        ocrRecoveredPageCount: 0,
    };
    const imageHeavyClassifications = ['image-only', 'text-unreadable'];
    for (const page of inspection.pages) {
        const { classification } = page;
        const seenSoFar = summary.pageClassificationCounts[classification] ?? 0;
        summary.pageClassificationCounts[classification] = seenSoFar + 1;
        if (imageHeavyClassifications.includes(classification)) {
            summary.imageHeavyPageCount += 1;
        }
        if (page.capabilities.nativeTextExtraction !== 'unavailable') {
            summary.nativeTextPageCount += 1;
        }
        if (page.degradation.level !== 'none') {
            summary.degradedPageCount += 1;
        }
    }
    return summary;
}
252
/**
 * Collects the distinct messages of all findings that are not advisory.
 *
 * @param {Array<{ severity: string, message: unknown }>} findings Review findings.
 * @returns {string[]} Unique non-blank messages, in first-seen order.
 */
function summarizeReviewReasons(findings) {
    const actionableMessages = [];
    for (const finding of findings) {
        if (finding.severity !== 'advisory') {
            actionableMessages.push(finding.message);
        }
    }
    return uniqueStrings(actionableMessages);
}
257
/**
 * Builds a placeholder borehole-log ingest result purely from a job's page
 * checkpoints. The result carries no boreholes, reports zero successful pages
 * and zero confidence, and never allows auto-proceed.
 *
 * Every failed page yields a page-scoped finding: 'review' severity when the
 * checkpoint is flagged `downgraded`, 'blocking' otherwise. Document-scoped
 * findings summarize how many pages fell into each bucket.
 *
 * @param {object} job Persisted ingest job (`source`, `checkpoints.pages`).
 * @param {object} inspection PDF inspection embedded in the result and summarized.
 * @param {(() => Date) | undefined} now Optional clock for `generatedAt`.
 * @returns {object} A 'geotech-ingest-result' of document type 'borehole-log'.
 */
function buildSyntheticBoreholeResult(job, inspection, now) {
    const checkpointPages = job.checkpoints.pages;
    const failedPages = checkpointPages.filter((page) => page.status === 'failed');
    const pageFailures = failedPages.map((page) => page.error ?? `Page ${page.pageNumber} failed during async ingest.`);
    const downgradedCount = failedPages.filter((page) => page.downgraded).length;
    const hardFailureCount = failedPages.length - downgradedCount;
    const reviewFindings = failedPages.map((page) => {
        if (page.downgraded) {
            return {
                code: 'page_visual_ingest_downgraded',
                severity: 'review',
                scope: 'page',
                message: `Page ${page.pageNumber} exceeded the slow visual budget and was downgraded to manual review.`,
                pageNumber: page.pageNumber,
            };
        }
        return {
            code: 'page_ingest_failed',
            severity: 'blocking',
            scope: 'page',
            message: page.error ?? `Page ${page.pageNumber} failed during ingest.`,
            pageNumber: page.pageNumber,
        };
    });
    if (downgradedCount > 0) {
        reviewFindings.push({
            code: 'slow_visual_pages_present',
            severity: 'review',
            scope: 'document',
            message: `${downgradedCount} slow visual page(s) were downgraded to manual review.`,
        });
    }
    if (hardFailureCount > 0) {
        reviewFindings.push({
            code: 'page_failures_present',
            severity: 'blocking',
            scope: 'document',
            message: `${hardFailureCount} page(s) failed during ingest and should be reviewed.`,
        });
    }
    // One audit row per checkpoint; nothing was parsed, so ids and depths stay empty.
    const toPageAudit = (page) => ({
        pageNumber: page.pageNumber,
        detectedBoreholeId: null,
        assignedGroup: 'unassigned',
        classification: page.classification,
        textHintSource: page.ocrSource ?? 'none',
        parseStatus: page.status === 'completed' ? 'partial' : 'failed',
        confidence: 0,
        continuationDepth: null,
        warnings: page.error ? [page.error] : [],
    });
    const { filePath, totalPages } = job.source;
    return {
        kind: 'geotech-ingest-result',
        schemaVersion: 1,
        documentType: 'borehole-log',
        generatedAt: nowIso(now),
        source: {
            filePath,
            fileName: basename(filePath),
            inputKind: 'pdf',
            totalPages,
            successfulPages: 0,
            failedPages: pageFailures.length,
        },
        inspection,
        inspectionSummary: summarizeBoreholeIngestInspection(inspection),
        boreholes: [],
        pageAudits: checkpointPages.map(toPageAudit),
        pageFailures,
        warnings: uniqueStrings(pageFailures),
        reviewFindings,
        reviewReasons: summarizeReviewReasons(reviewFindings),
        reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
        confidence: 0,
        canAutoProceed: false,
    };
}
328
/**
 * Builds a placeholder geotech-document ingest result purely from a job's
 * page checkpoints. All extracted collections are empty, the parse status is
 * 'failed', confidence is zero and auto-proceed is disabled.
 *
 * Every failed page yields a page-scoped finding: 'review' severity when the
 * checkpoint is flagged `downgraded`, 'blocking' otherwise. Document-scoped
 * findings summarize how many pages fell into each bucket.
 *
 * @param {object} job Persisted ingest job (`source`, `checkpoints.pages`).
 * @param {object} inspection PDF inspection embedded in the result and summarized.
 * @param {(() => Date) | undefined} now Optional clock for `generatedAt`.
 * @returns {object} A 'geotech-ingest-result' of document type 'geotech-document'.
 */
function buildSyntheticGeotechDocumentResult(job, inspection, now) {
    const checkpointPages = job.checkpoints.pages;
    const failedPages = checkpointPages.filter((page) => page.status === 'failed');
    const pageFailures = failedPages.map((page) => page.error ?? `Page ${page.pageNumber} failed during async ingest.`);
    const downgradedCount = failedPages.filter((page) => page.downgraded).length;
    const hardFailureCount = failedPages.length - downgradedCount;
    const reviewFindings = failedPages.map((page) => {
        if (page.downgraded) {
            return {
                code: 'page_visual_ingest_downgraded',
                severity: 'review',
                scope: 'page',
                message: `Page ${page.pageNumber} exceeded the slow visual budget and was downgraded to manual review.`,
                pageNumber: page.pageNumber,
            };
        }
        return {
            code: 'page_ingest_failed',
            severity: 'blocking',
            scope: 'page',
            message: page.error ?? `Page ${page.pageNumber} failed during ingest.`,
            pageNumber: page.pageNumber,
        };
    });
    if (downgradedCount > 0) {
        reviewFindings.push({
            code: 'slow_visual_pages_present',
            severity: 'review',
            scope: 'document',
            message: `${downgradedCount} slow visual page(s) were downgraded to manual review.`,
        });
    }
    if (hardFailureCount > 0) {
        reviewFindings.push({
            code: 'page_failures_present',
            severity: 'blocking',
            scope: 'document',
            message: `${hardFailureCount} page(s) failed during ingest and should be reviewed.`,
        });
    }
    // One audit row per checkpoint; nothing was extracted, so all counts are zero.
    const toPageAudit = (page) => ({
        pageNumber: page.pageNumber,
        classification: page.classification,
        textHintSource: page.ocrSource ?? 'none',
        parseStatus: page.status === 'completed' ? 'partial' : 'failed',
        confidence: 0,
        materialCount: 0,
        classificationCount: 0,
        parameterCount: 0,
        warnings: page.error ? [page.error] : [],
    });
    const { filePath, totalPages } = job.source;
    return {
        kind: 'geotech-ingest-result',
        schemaVersion: 1,
        documentType: 'geotech-document',
        generatedAt: nowIso(now),
        source: {
            filePath,
            fileName: basename(filePath),
            inputKind: 'pdf',
            totalPages,
            successfulPages: 0,
            failedPages: pageFailures.length,
        },
        inspection,
        inspectionSummary: buildInspectionSummary(inspection),
        documentClass: null,
        title: null,
        summary: null,
        materials: [],
        classifications: [],
        parameters: [],
        risks: [],
        recommendations: [],
        pageAudits: checkpointPages.map(toPageAudit),
        pageFailures,
        warnings: uniqueStrings(pageFailures),
        reviewFindings,
        reviewReasons: summarizeReviewReasons(reviewFindings),
        parseStatus: 'failed',
        confidence: 0,
        reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
        canAutoProceed: false,
    };
}
407
/**
 * Removes duplicate review findings.
 *
 * Two findings are duplicates when code, severity, scope, message, page
 * number, borehole id and material description all agree (missing values
 * count as empty). A duplicate keeps the position of its first occurrence but
 * the object of its last occurrence.
 *
 * @param {Array<object>} reviewFindings Findings, possibly with repeats.
 * @returns {Array<object>} De-duplicated findings.
 */
function dedupeReviewFindings(reviewFindings) {
    const findingsByIdentity = new Map();
    for (const finding of reviewFindings) {
        const boreholePart = 'boreholeId' in finding ? finding.boreholeId ?? '' : '';
        const materialPart = 'materialDescription' in finding ? finding.materialDescription ?? '' : '';
        const identity = [
            finding.code,
            finding.severity,
            finding.scope,
            finding.message,
            finding.pageNumber ?? '',
            boreholePart,
            materialPart,
        ].join('|');
        findingsByIdentity.set(identity, finding);
    }
    return Array.from(findingsByIdentity.values());
}
423
/**
 * Rewrites failure findings of a borehole ingest result for pages whose
 * checkpoint failed but was flagged `downgraded`.
 *
 * - 'page_ingest_failed' findings on downgraded pages become
 *   'page_visual_ingest_downgraded' with 'review' severity.
 * - The document-level 'page_failures_present' finding becomes
 *   'slow_visual_pages_present' only when every failed page was downgraded.
 * Findings are then de-duplicated and the review reasons recomputed.
 *
 * @param {object} result Borehole ingest result (`reviewFindings`, ...).
 * @param {Array<object>} checkpoints Page checkpoints (`status`, `downgraded`, `pageNumber`).
 * @returns {object} The same `result` object when no page was downgraded;
 *   otherwise a copy with updated findings and `canAutoProceed: false`.
 */
function applyBoreholeFailureDowngrades(result, checkpoints) {
    const failedPages = checkpoints.filter((page) => page.status === 'failed');
    const downgradedPages = failedPages.filter((page) => page.downgraded);
    const downgradedPageNumbers = new Set(downgradedPages.map((page) => page.pageNumber));
    if (downgradedPageNumbers.size === 0) {
        return result;
    }
    const downgradedFailureCount = downgradedPages.length;
    const nonDowngradedFailureCount = failedPages.length - downgradedFailureCount;
    const rewriteFinding = (finding) => {
        const isDowngradedPageFailure = finding.code === 'page_ingest_failed'
            && typeof finding.pageNumber === 'number'
            && downgradedPageNumbers.has(finding.pageNumber);
        if (isDowngradedPageFailure) {
            return {
                ...finding,
                code: 'page_visual_ingest_downgraded',
                severity: 'review',
                message: `Page ${finding.pageNumber} exceeded the slow visual budget and was downgraded to manual review.`,
            };
        }
        const onlySlowPagesFailed = finding.code === 'page_failures_present' && nonDowngradedFailureCount === 0;
        if (onlySlowPagesFailed) {
            return {
                ...finding,
                code: 'slow_visual_pages_present',
                severity: 'review',
                message: `${downgradedFailureCount} slow visual page(s) were downgraded to manual review.`,
            };
        }
        return finding;
    };
    const nextReviewFindings = dedupeReviewFindings(result.reviewFindings.map(rewriteFinding));
    const reviewReasons = summarizeReviewReasons(nextReviewFindings);
    return {
        ...result,
        reviewFindings: nextReviewFindings,
        reviewReasons,
        reviewRequired: reviewReasons.length > 0,
        canAutoProceed: false,
    };
}
464
/**
 * Rewrites failure findings of a geotech-document ingest result for pages
 * whose checkpoint failed but was flagged `downgraded`.
 *
 * - 'page_ingest_failed' findings on downgraded pages become
 *   'page_visual_ingest_downgraded' with 'review' severity.
 * - The document-level 'page_failures_present' finding becomes
 *   'slow_visual_pages_present' only when every failed page was downgraded.
 * Findings are then de-duplicated and the review reasons recomputed.
 *
 * @param {object} result Geotech-document ingest result (`reviewFindings`, ...).
 * @param {Array<object>} checkpoints Page checkpoints (`status`, `downgraded`, `pageNumber`).
 * @returns {object} The same `result` object when no page was downgraded;
 *   otherwise a copy with updated findings and `canAutoProceed: false`.
 */
function applyGeotechFailureDowngrades(result, checkpoints) {
    const failedPages = checkpoints.filter((page) => page.status === 'failed');
    const downgradedPages = failedPages.filter((page) => page.downgraded);
    const downgradedPageNumbers = new Set(downgradedPages.map((page) => page.pageNumber));
    if (downgradedPageNumbers.size === 0) {
        return result;
    }
    const downgradedFailureCount = downgradedPages.length;
    const nonDowngradedFailureCount = failedPages.length - downgradedFailureCount;
    const rewriteFinding = (finding) => {
        const isDowngradedPageFailure = finding.code === 'page_ingest_failed'
            && typeof finding.pageNumber === 'number'
            && downgradedPageNumbers.has(finding.pageNumber);
        if (isDowngradedPageFailure) {
            return {
                ...finding,
                code: 'page_visual_ingest_downgraded',
                severity: 'review',
                message: `Page ${finding.pageNumber} exceeded the slow visual budget and was downgraded to manual review.`,
            };
        }
        const onlySlowPagesFailed = finding.code === 'page_failures_present' && nonDowngradedFailureCount === 0;
        if (onlySlowPagesFailed) {
            return {
                ...finding,
                code: 'slow_visual_pages_present',
                severity: 'review',
                message: `${downgradedFailureCount} slow visual page(s) were downgraded to manual review.`,
            };
        }
        return finding;
    };
    const nextReviewFindings = dedupeReviewFindings(result.reviewFindings.map(rewriteFinding));
    const reviewReasons = summarizeReviewReasons(nextReviewFindings);
    return {
        ...result,
        reviewFindings: nextReviewFindings,
        reviewReasons,
        reviewRequired: reviewReasons.length > 0,
        canAutoProceed: false,
    };
}
505
/**
 * Merges the job's stored LLM settings over the runtime LLM configuration.
 *
 * `provider` always comes from the job. `baseUrl`, `modelId`, `visionModelId`
 * and `timeout` come from the job when set (non-nullish) and otherwise fall
 * back to the runtime configuration; all other runtime keys pass through.
 *
 * @param {{ config: object }} job Persisted ingest job.
 * @param {{ buildLLMConfig?: () => object }} [dependencies] Optional override
 *   for the runtime-config factory.
 * @returns {object} Effective LLM configuration for the job.
 */
function buildJobConfig(job, dependencies = {}) {
    const runtimeConfig = (dependencies.buildLLMConfig ?? buildLLMConfig)();
    const { provider, baseUrl, modelId, visionModelId, timeout } = job.config;
    return {
        ...runtimeConfig,
        provider,
        baseUrl: baseUrl ?? runtimeConfig.baseUrl,
        modelId: modelId ?? runtimeConfig.modelId,
        visionModelId: visionModelId ?? runtimeConfig.visionModelId,
        timeout: timeout ?? runtimeConfig.timeout,
    };
}
517
/**
 * Looks up the checkpoint entry of a page in a persisted ingest job.
 *
 * @param {{ jobId: string, checkpoints: { pages: Array<{ pageNumber: number }> } }} job
 *   Persisted ingest job.
 * @param {number} pageNumber Page number to find.
 * @returns {object} The first checkpoint whose `pageNumber` matches.
 * @throws {Error} When the job has no checkpoint for that page.
 */
function findCheckpoint(job, pageNumber) {
    for (const page of job.checkpoints.pages) {
        if (page.pageNumber === pageNumber) {
            return page;
        }
    }
    throw new Error(`Persisted ingest job "${job.jobId}" is missing checkpoint metadata for page ${pageNumber}.`);
}
524
/**
 * Processes one page of a geotech document.
 *
 * Steps:
 *   1. Resolve the page's inspection record (from the job, else by inspecting the file).
 *   2. If the page is inferred to be low-yield (administrative or
 *      figure/appendix), return a preflight insight without calling the model.
 *   3. Try to recover a text hint (OCR / native text). Failures here are
 *      recorded as a warning and processing continues.
 *   4. With a text hint, extract facts from the text; without one, interpret
 *      the page image directly. Each call is bounded by a page timeout.
 *
 * NOTE(review): the fallback in step 1 reads `.pages` directly off the value
 * returned by the inspect function, so it assumes that function is
 * synchronous — confirm against inspectPdfDocument.
 *
 * @param {object} job Persisted ingest job (`inspection`, `source.filePath`).
 * @param {object} pageInput Page payload (`base64`, `mimeType`, `pageNumber`,
 *   `totalPages`, `sourceKind`).
 * @param {object} config LLM configuration; `timeout` is overridden per phase.
 * @param {object} dependencies Optional overrides for the inspection, OCR
 *   recovery, transcription, text-extraction and page-interpretation functions.
 * @returns {Promise<{ result: object, ocrTextHint: string | undefined, ocrSource: string, ocrWarnings: string[] }>}
 *   The page result plus OCR provenance.
 * @throws {Error} When extraction/interpretation fails or exceeds its timeout.
 */
async function processGeotechDocumentPage(job, pageInput, config, dependencies) {
    const inspect = dependencies.inspectPdfDocument ?? inspectPdfDocument;
    const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
    const interpretation = dependencies.interpretGeotechDocumentPage ?? interpretGeotechDocumentPage;
    const extractTextFacts = dependencies.extractGeotechDocumentFactsFromText ?? extractGeotechDocumentFactsFromText;
    const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
    const { pageNumber, totalPages, sourceKind } = pageInput;
    const pageIndex = pageNumber - 1;
    const inspectionPage = job.inspection?.pages[pageIndex] ?? inspect(job.source.filePath).pages[pageIndex];
    const lowYieldRole = inferPreflightLowYieldPageRole({
        inspectionPage,
        previousInspectionPage: job.inspection?.pages[pageIndex - 1],
        pageNumber,
        totalPages,
        sourceKind,
    });
    if (lowYieldRole && inspectionPage) {
        const skipNotice = lowYieldRole === 'administrative'
            ? 'Administrative/cover page was summarized without a full multimodal extraction call.'
            : 'Figure/appendix page was summarized without a full multimodal extraction call.';
        return {
            result: buildPreflightLowYieldInsight({
                role: lowYieldRole,
                inspectionPage,
                pageNumber,
                totalPages,
            }),
            ocrTextHint: undefined,
            ocrSource: 'none',
            ocrWarnings: [skipNotice],
        };
    }
    const toSeconds = (milliseconds) => Math.round(milliseconds / 1000);
    const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
        classification: inspectionPage?.classification,
        sourceKind,
    });
    const phaseConfig = { ...config, timeout: phaseTimeoutMs };
    let pageTextHint;
    let ocrSource = 'none';
    let ocrWarnings = [];
    try {
        const recoveryRequest = {
            existingTextHint: inspectionPage?.normalizedArtifact?.nativeText ?? inspectionPage?.normalizedText,
            existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
            imageBase64: pageInput.base64,
            mimeType: pageInput.mimeType,
            config: phaseConfig,
            pdfFilePath: job.source.filePath,
            pdfPageNumber: pageNumber,
            visionTranscribe: transcribe,
        };
        const recovery = await withWorkerPageTimeout(
            recoverTextHint(recoveryRequest),
            phaseTimeoutMs,
            `Page ${pageNumber}: OCR/text recovery timed out after ${toSeconds(phaseTimeoutMs)}s`,
        );
        pageTextHint = normalizeTextHint(recovery.textHint);
        ocrSource = recovery.source;
        ocrWarnings = recovery.warnings;
    }
    catch (error) {
        // Text recovery is best-effort: note the failure and fall through to
        // interpreting the page directly.
        const message = error instanceof Error ? error.message : String(error);
        ocrWarnings = [
            `OCR/text recovery failed (${normalizeCheckpointErrorMessage(message)}); proceeded with direct page interpretation.`,
        ];
    }
    const context = {
        pageNumber,
        totalPages,
        pageClassification: inspectionPage?.classification,
        pageTextHint,
    };
    const extractionTimeoutMs = resolveWorkerTextExtractionTimeoutMs(phaseTimeoutMs, pageTextHint);
    const extractionConfig = { ...config, timeout: extractionTimeoutMs };
    let result;
    if (pageTextHint) {
        result = await withWorkerPageTimeout(
            extractTextFacts(pageTextHint, extractionConfig, context),
            extractionTimeoutMs,
            `Page ${pageNumber}: text extraction timed out after ${toSeconds(extractionTimeoutMs)}s`,
        );
    }
    else {
        result = await withWorkerPageTimeout(
            interpretation(pageInput.base64, pageInput.mimeType, phaseConfig, context),
            phaseTimeoutMs,
            `Page ${pageNumber}: visual page interpretation timed out after ${toSeconds(phaseTimeoutMs)}s`,
        );
    }
    return {
        result,
        ocrTextHint: pageTextHint,
        ocrSource,
        ocrWarnings,
    };
}
608
/**
 * Processes one borehole-log page: recovers a text hint (native text or OCR),
 * runs the visual interpreter with that hint, and advances the cross-page
 * borehole grouping state.
 *
 * OCR/text recovery is best-effort. If it rejects or times out, the page is
 * still interpreted using whatever native inspection text is available, and
 * the failure is reported through `ocrWarnings`; this mirrors how the
 * geotech-document page processor in this file handles the same step.
 *
 * @param {object} job - Persisted ingest job; reads `inspection`, `source.filePath`
 *   and `request.overrideBoreholeId`.
 * @param {object} pageInput - Prepared page input; reads `pageNumber`, `totalPages`,
 *   `base64`, `mimeType` and `sourceKind`.
 * @param {object} config - Base provider config; a copy with `timeout` overridden is
 *   passed to recovery and interpretation.
 * @param {object} state - Borehole processing state carried from the previous page.
 * @param {object} dependencies - Optional overrides for `recoverDocumentTextHint`,
 *   `transcribeDocumentImageText` and `interpretBoreholeLogWithContext`.
 * @returns {Promise<{result: object, nextState: object, ocrTextHint: (string|undefined),
 *   ocrSource: string, ocrWarnings: string[]}>}
 * @throws Propagates interpretation failures and interpretation timeouts.
 */
async function processBoreholePage(job, pageInput, config, state, dependencies) {
    const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
    const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
    const interpret = dependencies.interpretBoreholeLogWithContext ?? interpretBoreholeLogWithContext;
    const inspectionPage = job.inspection?.pages[pageInput.pageNumber - 1];
    const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
        classification: inspectionPage?.classification,
        sourceKind: pageInput.sourceKind,
    });
    const phaseConfig = {
        ...config,
        timeout: phaseTimeoutMs,
    };
    // Start from the native text found during inspection; recovery may replace it.
    let pageTextHint = typeof inspectionPage?.normalizedText === 'string' ? inspectionPage.normalizedText : undefined;
    let ocrTextHint;
    let ocrSource = 'none';
    let ocrWarnings = [];
    try {
        const recovery = await withWorkerPageTimeout(recoverTextHint({
            existingTextHint: pageTextHint,
            existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
            imageBase64: pageInput.base64,
            mimeType: pageInput.mimeType,
            config: phaseConfig,
            pdfFilePath: job.source.filePath,
            pdfPageNumber: pageInput.pageNumber,
            visionTranscribe: transcribe,
        }), phaseTimeoutMs, `Page ${pageInput.pageNumber}: OCR/text recovery timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
        ocrTextHint = recovery.textHint;
        ocrSource = recovery.source;
        ocrWarnings = recovery.warnings;
        if (recovery.textHint) {
            pageTextHint = recovery.textHint;
        }
    }
    catch (error) {
        // The hint is only an aid to the visual interpreter, so a recovery failure
        // must not fail the page; keep the native hint (if any) and record a warning.
        const message = error instanceof Error ? error.message : String(error);
        ocrWarnings = [
            `OCR/text recovery failed (${normalizeCheckpointErrorMessage(message)}); proceeded with direct page interpretation.`,
        ];
    }
    const context = {
        boreholeId: state.lastResolvedBoreholeId,
        pageNumber: pageInput.pageNumber,
        totalPages: pageInput.totalPages,
        priorContinuationDepth: state.priorContinuationDepth,
        pageClassification: inspectionPage?.classification,
        pageTextHint,
    };
    const result = await withWorkerPageTimeout(interpret(pageInput.base64, pageInput.mimeType, phaseConfig, context), phaseTimeoutMs, `Page ${pageInput.pageNumber}: visual page interpretation timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
    const nextState = advanceBoreholeProcessingState(state, result, pageTextHint, job.request.overrideBoreholeId);
    // NOTE(review): state is advanced with `pageTextHint` (which can fall back to the
    // native inspection text) while `ocrTextHint` reports only what recovery returned.
    // A resumed run that replays state from the stored `ocrTextHint` may therefore see
    // a different hint than the live run did — confirm whether that is intended.
    return {
        result,
        nextState,
        ocrTextHint,
        ocrSource,
        ocrWarnings,
    };
}
653
/**
 * Returns the stored result of a completed page checkpoint so the synchronous
 * ingest pipeline can be replayed without calling a provider again.
 *
 * @param {object} job - Persisted ingest job holding the checkpoints.
 * @param {*} pageNumber - Page number taken from the replay context.
 * @param {string} documentLabel - Label used in the "missing context" error text.
 * @returns {object} The checkpointed page result.
 * @throws {Error} When no page number was supplied, or the page did not complete
 *   (the checkpoint's own error message is reused when present).
 */
function replayCheckpointedPageResult(job, pageNumber, documentLabel) {
    if (!pageNumber) {
        throw new Error(`Replay ${documentLabel} ingest requires a page number context.`);
    }
    const checkpoint = findCheckpoint(job, pageNumber);
    if (checkpoint.status === 'completed' && checkpoint.result) {
        return checkpoint.result;
    }
    throw new Error(normalizeCheckpointErrorMessage(checkpoint.error ?? `Page ${pageNumber} failed during async ingest.`));
}

/**
 * Answers a replayed transcription request from the checkpointed OCR hint.
 * The transcription callback receives no page number, so the page is located
 * by matching the image payload and MIME type against the prepared inputs.
 *
 * @param {object} job - Persisted ingest job holding the checkpoints.
 * @param {object[]} pageInputs - Prepared page inputs.
 * @param {string} base64 - Image payload passed to the transcriber.
 * @param {string} mimeType - MIME type passed to the transcriber.
 * @returns {{text: string, warnings: string[], usedFallback: boolean, latencyMs: number}}
 */
function replayCheckpointedTranscription(job, pageInputs, base64, mimeType) {
    const pageNumber = pageInputs.find((page) => page.base64 === base64 && page.mimeType === mimeType)?.pageNumber;
    const checkpoint = pageNumber ? findCheckpoint(job, pageNumber) : undefined;
    return {
        text: checkpoint?.ocrTextHint ?? '',
        warnings: checkpoint?.ocrWarnings ?? [],
        usedFallback: false,
        latencyMs: 0,
    };
}

/**
 * Lists the prepared page inputs in checkpoint order, dropping checkpoints
 * that have no prepared input.
 */
function selectCheckpointedPageInputs(job, pageInputs) {
    const inputsByPageNumber = new Map(pageInputs.map((page) => [page.pageNumber, page]));
    return job.checkpoints.pages
        .map((checkpoint) => inputsByPageNumber.get(checkpoint.pageNumber))
        .filter((page) => Boolean(page));
}

/** Tells whether an ingest error means "nothing could be ingested" rather than a real failure. */
function isNoIngestablePagesError(error) {
    const message = error instanceof Error ? error.message : String(error);
    return /No pages could be ingested successfully/i.test(message);
}

/**
 * Builds the final ingest result for a job by replaying the regular ingest
 * pipeline against the per-page checkpoints instead of live provider calls.
 *
 * When no page completed, or the pipeline reports that no page could be
 * ingested, a synthetic result for the job's document type is returned.
 * Otherwise the pipeline result is passed through the matching
 * failure-downgrade step together with all checkpoints.
 *
 * @param {object} job - Persisted ingest job (reads `documentType`, `source`,
 *   `request`, `inspection`, `checkpoints`).
 * @param {object[]} pageInputs - Prepared page inputs for the document.
 * @param {object} config - Provider configuration forwarded to the pipeline.
 * @param {object} dependencies - Injected dependencies; `now` is forwarded.
 * @returns {Promise<object>} The final (possibly synthetic) ingest result.
 * @throws Rethrows any pipeline error other than "no pages could be ingested".
 */
async function finalizeJobResult(job, pageInputs, config, dependencies) {
    const isBoreholeLog = job.documentType === 'borehole-log';
    const buildSyntheticResult = () => (isBoreholeLog
        ? buildSyntheticBoreholeResult(job, job.inspection, dependencies.now)
        : buildSyntheticGeotechDocumentResult(job, job.inspection, dependencies.now));
    const hasCompletedPage = job.checkpoints.pages.some((page) => page.status === 'completed');
    if (!hasCompletedPage) {
        return buildSyntheticResult();
    }
    const source = {
        filePath: job.source.filePath,
        fileName: basename(job.source.filePath),
        inputKind: 'pdf',
    };
    const transcribePageImageText = async (_base64, _mimeType, _config) => replayCheckpointedTranscription(job, pageInputs, _base64, _mimeType);
    try {
        if (isBoreholeLog) {
            const result = await ingestBoreholeLogDocument({
                config,
                source,
                overrideBoreholeId: job.request.overrideBoreholeId,
                inspection: job.inspection,
                pages: selectCheckpointedPageInputs(job, pageInputs),
                interpretPageWithContext: async (_base64, _mimeType, _config, context) => replayCheckpointedPageResult(job, context?.pageNumber, 'borehole'),
                transcribePageImageText,
                now: dependencies.now,
            });
            return applyBoreholeFailureDowngrades(result, job.checkpoints.pages);
        }
        // The geotech pipeline only trusts numeric page numbers from the context.
        const replayGeotechPage = async (_input, _config, context) => replayCheckpointedPageResult(job, typeof context?.pageNumber === 'number' ? context.pageNumber : undefined, 'geotech-document');
        const result = await ingestGeotechDocument({
            config,
            source,
            inspection: job.inspection,
            pages: selectCheckpointedPageInputs(job, pageInputs),
            interpretPage: async (_base64, _mimeType, _config, context) => replayGeotechPage(_base64, _config, context),
            extractTextFacts: async (_pageText, _config, context) => replayGeotechPage(_pageText, _config, context),
            transcribePageImageText,
            now: dependencies.now,
        });
        return applyGeotechFailureDowngrades(result, job.checkpoints.pages);
    }
    catch (error) {
        if (isNoIngestablePagesError(error)) {
            return buildSyntheticResult();
        }
        throw error;
    }
}
765
/**
 * Reports whether cancellation has been requested for a persisted job.
 * The job is re-read from storage on every call; a job that cannot be
 * loaded counts as not cancelled.
 *
 * @param {string} jobId - Identifier of the persisted ingest job.
 * @returns {boolean} `true` only when the stored flag is exactly `true`.
 */
function isCancelled(jobId) {
    const persistedJob = loadPersistedIngestJob(jobId);
    if (persistedJob == null) {
        return false;
    }
    return persistedJob.execution.cancelRequested === true;
}
768
/**
 * Runs (or resumes) a persisted ingest job inside the current process.
 *
 * The job is marked `running`, inspected if it has no usable inspection yet,
 * and then processed page by page: geotech documents through
 * `mapWithConcurrency`, borehole logs sequentially because each page depends
 * on the state produced by the previous one. Every step is checkpointed, so
 * pages already `completed` are not reprocessed. After the page phase the job
 * is finalized (`completed`), or marked `canceled` if cancellation was
 * requested, or marked `failed` if anything inside the run threw.
 *
 * Failures inside the run are recorded on the job (`execution.lastError`) and
 * not rethrown; the function rejects only when the job does not exist or when
 * persisting the initial or the failure state itself fails.
 *
 * @param {string} jobId - Identifier of the persisted ingest job.
 * @param {object} [dependencies] - Optional overrides. This function reads
 *   `inspectPdfDocument`, `persistReview`, `now` and `stopAfterNewPages`
 *   (borehole branch only), and forwards the object to the helpers it calls.
 * @returns {Promise<object>} The job as last persisted.
 * @throws {Error} When no persisted job with `jobId` exists.
 */
export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
    const inspect = dependencies.inspectPdfDocument ?? inspectPdfDocument;
    const persistReview = dependencies.persistReview ?? persistBoreholeIngestReview;
    let currentJob = loadPersistedIngestJob(jobId);
    if (!currentJob) {
        throw new Error(`No persisted ingest job named "${jobId}" was found.`);
    }
    if (currentJob.status === 'completed') {
        return currentJob;
    }
    const timestamp = () => nowIso(dependencies.now);
    const pageWeight = (classification) => (classification === 'image-only' || classification === 'text-unreadable' ? 2 : 1);
    // Mutations are serialized through a single promise tail. The tail swallows
    // rejections so that one failed save does not prevent later mutations (in
    // particular the one that records the failure) from running; the caller of
    // the failed mutation still observes its rejection.
    let mutationTail = Promise.resolve();
    const mutateJob = (mutator) => {
        const applied = mutationTail.then(() => {
            const latest = loadPersistedIngestJob(jobId) ?? currentJob;
            currentJob = mutator(latest);
            savePersistedIngestJob(currentJob);
        });
        mutationTail = applied.catch(() => undefined);
        return applied;
    };
    // Refreshes the heartbeat and merges `buildPatch(checkpoint)` into every
    // checkpoint accepted by `shouldPatch`.
    const patchCheckpoints = (shouldPatch, buildPatch) => mutateJob((job) => ({
        ...job,
        updatedAt: timestamp(),
        execution: {
            ...job.execution,
            lastHeartbeatAt: timestamp(),
        },
        checkpoints: {
            pages: job.checkpoints.pages.map((pageCheckpoint) => (shouldPatch(pageCheckpoint)
                ? { ...pageCheckpoint, ...buildPatch(pageCheckpoint) }
                : pageCheckpoint)),
        },
    }));
    const isPage = (pageNumber) => (pageCheckpoint) => pageCheckpoint.pageNumber === pageNumber;
    const isPending = (pageCheckpoint) => pageCheckpoint.status === 'pending';
    const recordAttempt = (pageNumber) => patchCheckpoints(isPage(pageNumber), (pageCheckpoint) => ({
        attempts: pageCheckpoint.attempts + 1,
        updatedAt: timestamp(),
    }));
    const recordCompletion = (pageNumber, processed) => patchCheckpoints(isPage(pageNumber), () => ({
        status: 'completed',
        updatedAt: timestamp(),
        completedAt: timestamp(),
        error: undefined,
        downgraded: false,
        ocrTextHint: processed.ocrTextHint,
        ocrSource: processed.ocrSource,
        ocrWarnings: processed.ocrWarnings,
        result: processed.result,
    }));
    const skippedAfterProviderStop = (stopMessage) => () => ({
        status: 'failed',
        updatedAt: timestamp(),
        error: `skipped after upstream provider stop. ${normalizeCheckpointErrorMessage(stopMessage)}`,
        downgraded: false,
    });
    await mutateJob((job) => ({
        ...job,
        status: 'running',
        startedAt: job.startedAt ?? timestamp(),
        updatedAt: timestamp(),
        execution: {
            ...job.execution,
            pid: process.pid,
            lastHeartbeatAt: timestamp(),
            lastError: undefined,
            cancelRequested: false,
        },
    }));
    try {
        const inspection = currentJob.inspection && currentJob.inspection.totalPages > 0
            ? currentJob.inspection
            : inspect(currentJob.source.filePath);
        if (!currentJob.inspection || currentJob.inspection.totalPages === 0) {
            // First run: persist the inspection and seed one pending checkpoint per
            // page, keeping any checkpoint that already exists for a page.
            await mutateJob((job) => ({
                ...job,
                inspection,
                source: {
                    ...job.source,
                    totalPages: inspection.totalPages,
                    weightedPageCost: inspection.pages.reduce((sum, page) => sum + pageWeight(page.classification), 0),
                },
                checkpoints: {
                    pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
                        pageNumber: page.pageNumber,
                        classification: page.classification,
                        sourceKind: mapPageSourceKind(page.classification),
                        weight: pageWeight(page.classification),
                        status: 'pending',
                        attempts: 0,
                        updatedAt: timestamp(),
                    })),
                },
            }));
        }
        const config = buildJobConfig(currentJob, dependencies);
        const pageInputs = await preparePdfPageInputs(currentJob.source.filePath, currentJob.inspection, currentJob.processing.pagePreprocessingConcurrency, dependencies);
        let processedNewPages = 0;
        if (currentJob.documentType === 'geotech-document') {
            const pendingPages = pageInputs
                .filter((page) => findCheckpoint(currentJob, page.pageNumber).status !== 'completed')
                .sort((left, right) => left.pageNumber - right.pageNumber);
            let fatalProviderStopMessage = null;
            await mapWithConcurrency(pendingPages, currentJob.processing.chunkExtractionConcurrency, async (page) => {
                if (isCancelled(jobId)) {
                    return;
                }
                if (fatalProviderStopMessage) {
                    // A sibling page hit a fatal provider error: skip this page without calling the provider.
                    await patchCheckpoints((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber && pageCheckpoint.status === 'pending', skippedAfterProviderStop(fatalProviderStopMessage));
                    return;
                }
                await recordAttempt(page.pageNumber);
                try {
                    const processed = await processGeotechDocumentPage(currentJob, page, config, dependencies);
                    processedNewPages += 1;
                    await recordCompletion(page.pageNumber, processed);
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    const normalizedMessage = normalizeCheckpointErrorMessage(message);
                    const checkpoint = findCheckpoint(currentJob, page.pageNumber);
                    if (!fatalProviderStopMessage && isFatalProviderStopError(normalizedMessage)) {
                        fatalProviderStopMessage = normalizedMessage;
                    }
                    await patchCheckpoints(isPage(page.pageNumber), () => ({
                        status: 'failed',
                        updatedAt: timestamp(),
                        error: normalizedMessage,
                        downgraded: isSlowVisualPageError(normalizedMessage, checkpoint.classification, checkpoint.sourceKind),
                    }));
                }
            });
            if (fatalProviderStopMessage) {
                await patchCheckpoints(isPending, skippedAfterProviderStop(fatalProviderStopMessage ?? ''));
            }
        }
        else {
            let state = {
                currentGroupBoreholeId: null,
                hasCurrentGroup: false,
                lastResolvedBoreholeId: currentJob.request.overrideBoreholeId,
                priorContinuationDepth: null,
            };
            // Sort a copy: `pageInputs` is handed to finalizeJobResult below and must
            // not be reordered as a side effect of this loop.
            const boreholePageInputs = [...pageInputs].sort((left, right) => left.pageNumber - right.pageNumber);
            for (const page of boreholePageInputs) {
                const checkpoint = findCheckpoint(currentJob, page.pageNumber);
                if (checkpoint.status === 'completed' && checkpoint.result) {
                    // Already done in an earlier run: only replay its effect on the state.
                    state = advanceBoreholeProcessingState(state, checkpoint.result, checkpoint.ocrTextHint, currentJob.request.overrideBoreholeId);
                    continue;
                }
                if (isCancelled(jobId)) {
                    break;
                }
                if (dependencies.stopAfterNewPages != null && processedNewPages >= dependencies.stopAfterNewPages) {
                    throw new Error('Ingest job worker interrupted after checkpoint for test harness.');
                }
                await recordAttempt(page.pageNumber);
                try {
                    const processed = await processBoreholePage(currentJob, page, config, state, dependencies);
                    processedNewPages += 1;
                    state = processed.nextState;
                    await recordCompletion(page.pageNumber, processed);
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    const normalizedMessage = normalizeCheckpointErrorMessage(message);
                    await patchCheckpoints(isPage(page.pageNumber), (pageCheckpoint) => ({
                        status: 'failed',
                        updatedAt: timestamp(),
                        error: normalizedMessage,
                        downgraded: isSlowVisualPageError(normalizedMessage, pageCheckpoint.classification, pageCheckpoint.sourceKind),
                    }));
                    if (isFatalProviderStopError(normalizedMessage)) {
                        await patchCheckpoints(isPending, skippedAfterProviderStop(message));
                        break;
                    }
                }
            }
        }
        currentJob = loadPersistedIngestJob(jobId) ?? currentJob;
        if (currentJob.execution.cancelRequested) {
            await mutateJob((job) => ({
                ...job,
                status: 'canceled',
                updatedAt: timestamp(),
                canceledAt: timestamp(),
                execution: {
                    ...job.execution,
                    pid: undefined,
                },
            }));
            return loadPersistedIngestJob(jobId) ?? currentJob;
        }
        const finalResult = await finalizeJobResult(currentJob, pageInputs, config, dependencies);
        const persistedReview = currentJob.request.projectId
            ? persistReview(currentJob.request.projectId, finalResult, {
                title: currentJob.request.reviewTitle,
            })
            : null;
        await mutateJob((job) => ({
            ...job,
            status: 'completed',
            updatedAt: timestamp(),
            completedAt: timestamp(),
            execution: {
                ...job.execution,
                pid: undefined,
                lastHeartbeatAt: timestamp(),
            },
            result: {
                ingestResult: finalResult,
                persistedReview: persistedReview
                    ? {
                        datasetName: persistedReview.datasetName,
                        reviewId: persistedReview.reviewId,
                        createdAt: persistedReview.createdAt,
                    }
                    : undefined,
            },
        }));
    }
    catch (error) {
        // Deliberately not rethrown: the failure is reported through the persisted job.
        await mutateJob((job) => ({
            ...job,
            status: 'failed',
            updatedAt: timestamp(),
            execution: {
                ...job.execution,
                pid: undefined,
                lastHeartbeatAt: timestamp(),
                lastError: error instanceof Error ? error.message : String(error),
            },
        }));
    }
    return loadPersistedIngestJob(jobId) ?? currentJob;
}
1129
+ //# sourceMappingURL=job-worker.js.map