@geotechcli/core 0.4.24 → 0.4.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/dist/agents/sandbox.d.ts.map +1 -1
  2. package/dist/agents/sandbox.js +13 -8
  3. package/dist/agents/sandbox.js.map +1 -1
  4. package/dist/agents/swarm.d.ts.map +1 -1
  5. package/dist/agents/swarm.js +21 -4
  6. package/dist/agents/swarm.js.map +1 -1
  7. package/dist/ingest/geotech-document.d.ts +4 -0
  8. package/dist/ingest/geotech-document.d.ts.map +1 -1
  9. package/dist/ingest/geotech-document.js +109 -5
  10. package/dist/ingest/geotech-document.js.map +1 -1
  11. package/dist/ingest/geotech-extract.d.ts +3 -0
  12. package/dist/ingest/geotech-extract.d.ts.map +1 -1
  13. package/dist/ingest/geotech-extract.js +29 -1
  14. package/dist/ingest/geotech-extract.js.map +1 -1
  15. package/dist/ingest/index.d.ts +2 -1
  16. package/dist/ingest/index.d.ts.map +1 -1
  17. package/dist/ingest/index.js +2 -1
  18. package/dist/ingest/index.js.map +1 -1
  19. package/dist/ingest/job-store.d.ts +18 -3
  20. package/dist/ingest/job-store.d.ts.map +1 -1
  21. package/dist/ingest/job-store.js +180 -11
  22. package/dist/ingest/job-store.js.map +1 -1
  23. package/dist/ingest/job-worker.d.ts.map +1 -1
  24. package/dist/ingest/job-worker.js +534 -63
  25. package/dist/ingest/job-worker.js.map +1 -1
  26. package/dist/ingest/pdf.d.ts +1 -0
  27. package/dist/ingest/pdf.d.ts.map +1 -1
  28. package/dist/ingest/pdf.js +11 -0
  29. package/dist/ingest/pdf.js.map +1 -1
  30. package/dist/ingest/segmentation.d.ts +38 -0
  31. package/dist/ingest/segmentation.d.ts.map +1 -0
  32. package/dist/ingest/segmentation.js +145 -0
  33. package/dist/ingest/segmentation.js.map +1 -0
  34. package/dist/meta/metadata.json +1 -1
  35. package/dist/report/html.d.ts.map +1 -1
  36. package/dist/report/html.js +17 -34
  37. package/dist/report/html.js.map +1 -1
  38. package/dist/report/ingest-dossier.d.ts.map +1 -1
  39. package/dist/report/ingest-dossier.js +19 -0
  40. package/dist/report/ingest-dossier.js.map +1 -1
  41. package/dist/skills/index.d.ts.map +1 -1
  42. package/dist/skills/index.js +12 -4
  43. package/dist/skills/index.js.map +1 -1
  44. package/dist/vision/geotech-document.d.ts +1 -0
  45. package/dist/vision/geotech-document.d.ts.map +1 -1
  46. package/dist/vision/geotech-document.js +82 -14
  47. package/dist/vision/geotech-document.js.map +1 -1
  48. package/dist/vision/index.d.ts.map +1 -1
  49. package/dist/vision/index.js +47 -10
  50. package/dist/vision/index.js.map +1 -1
  51. package/package.json +1 -1
@@ -1,16 +1,20 @@
1
- import { basename } from 'node:path';
1
+ import { basename, join } from 'node:path';
2
+ import { homedir } from 'node:os';
2
3
  import { buildLLMConfig } from '../config/index.js';
3
- import { readDocumentPdfPageInputs } from './document-inputs.js';
4
- import { inspectPdfDocument } from './pdf.js';
4
+ import { countDocumentPdfPages, readDocumentPdfPageInputs } from './document-inputs.js';
5
+ import { inferPdfDocumentPageCountFallback, inspectPdfDocument, } from './pdf.js';
5
6
  import { ingestBoreholeLogDocument, summarizeBoreholeIngestInspection, } from './geotech-extract.js';
6
7
  import { buildPreflightLowYieldInsight, ingestGeotechDocument, inferPreflightLowYieldPageRole, } from './geotech-document.js';
7
8
  import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
8
9
  import { interpretBoreholeLogWithContext, transcribeDocumentImageText, } from '../vision/index.js';
9
10
  import { recoverDocumentTextHint } from '../vision/ocr.js';
10
11
  import { persistBoreholeIngestReview } from './review-store.js';
11
- import { loadPersistedIngestJob, savePersistedIngestJob, } from './job-store.js';
12
+ import { buildPersistedIngestJobSegments, createPersistedIngestJob, loadPersistedIngestJob, resolvePersistedIngestJobExtractionConcurrency, savePersistedIngestJob, } from './job-store.js';
13
+ import { HOSTED_BETA_EFFECTIVE_PAGE_LIMIT, slicePdfInspectionToRange, writePdfPageSubset, } from './segmentation.js';
12
14
  const SLOW_VISUAL_ERROR_PATTERNS = [
13
15
  /timeout/i,
16
+ /\b524\b/i,
17
+ /upstream(?: request)? (?:timed out|timeout|failed)/i,
14
18
  /provider is busy/i,
15
19
  /returned no content/i,
16
20
  /did not contain assistant text/i,
@@ -20,6 +24,15 @@ const SLOW_VISUAL_ERROR_PATTERNS = [
20
24
  /\b503\b/i,
21
25
  /\b504\b/i,
22
26
  ];
/**
 * Error-message patterns that mark a page failure as a transient upstream
 * problem (timeouts, HTTP 503/504/524, busy or temporarily unavailable
 * provider). `isRetryableUpstreamPageError` tests messages against this list.
 *
 * Frozen so the shared module-level list cannot be modified at runtime.
 */
const RETRYABLE_UPSTREAM_ERROR_PATTERNS = Object.freeze([
    /timeout/i,
    /\b524\b/i,
    /upstream(?: request)? (?:timed out|timeout|failed)/i,
    /provider is busy/i,
    /temporarily unavailable/i,
    /\b503\b/i,
    /\b504\b/i,
]);
23
36
  const FATAL_PROVIDER_STOP_PATTERNS = [
24
37
  /daily limit reached/i,
25
38
  /remaining today:\s*0/i,
@@ -34,6 +47,310 @@ function nowIso(now) {
/**
 * Returns the distinct non-blank strings of `values` in first-seen order.
 * Entries that are not strings, or that are empty after trimming, are dropped.
 * Kept strings are returned as-is (not trimmed).
 *
 * @param {Array<*>} values
 * @returns {string[]}
 */
function uniqueStrings(values) {
    const isNonBlankString = (candidate) => typeof candidate === 'string' && candidate.trim().length > 0;
    const kept = values.filter(isNonBlankString);
    return Array.from(new Set(kept));
}
/**
 * Path reported for a job's source document: `source.originalFilePath` when
 * it is set, otherwise `source.filePath`.
 *
 * @param {object} job
 * @returns {string}
 */
function getJobSourceDisplayPath(job) {
    const { originalFilePath, filePath } = job.source;
    return originalFilePath ?? filePath;
}
/**
 * File name reported for a job's source document: `source.originalFileName`
 * when it is set, otherwise the base name of the display path.
 *
 * @param {object} job
 * @returns {string}
 */
function getJobSourceDisplayName(job) {
    const { originalFileName } = job.source;
    if (originalFileName !== undefined && originalFileName !== null) {
        return originalFileName;
    }
    return basename(getJobSourceDisplayPath(job));
}
/**
 * Copies a segmentation summary so the result shares no `pageRange` array or
 * segment objects with the input. Other properties are copied shallowly.
 *
 * A summary without `pageRange` is copied as-is instead of throwing; other
 * code in this module already reads `segmentation.pageRange` as optional.
 *
 * @param {object|null|undefined} segmentation
 * @returns {object|undefined} The copy, or `undefined` for a falsy input.
 */
function cloneSegmentationSummary(segmentation) {
    if (!segmentation) {
        return undefined;
    }
    const { pageRange, segments } = segmentation;
    return {
        ...segmentation,
        // Spreading a missing range would throw, so only copy when present.
        pageRange: pageRange ? [...pageRange] : pageRange,
        segments: segments?.map((segment) => ({ ...segment })),
    };
}
/**
 * Builds the `source` descriptor embedded in an ingest result for `job`.
 *
 * @param {object} job - Job whose source and segmentation are described.
 * @param {{successfulPages: number, failedPages: number}} counts - Page tallies copied into the descriptor.
 * @returns {object} Descriptor with display path/name, `inputKind: 'pdf'`,
 *   page totals, the source page range and a copy of the segmentation summary.
 */
function buildJobResultSource(job, counts) {
    const { successfulPages, failedPages } = counts;
    const { totalPages, pageRange } = job.source;
    return {
        filePath: getJobSourceDisplayPath(job),
        fileName: getJobSourceDisplayName(job),
        inputKind: 'pdf',
        totalPages,
        successfulPages,
        failedPages,
        pageRange,
        segmentation: cloneSegmentationSummary(job.segmentation),
    };
}
/**
 * Rewrites whole-word "Page N" / "page N" references in `message` from a
 * segment-local page number to the original document's page number.
 * Matching is case-sensitive on everything but the leading letter, so other
 * spellings such as "PAGE N" are left untouched.
 *
 * @param {string|undefined|null} message
 * @param {number} localPageNumber - Page number to look for.
 * @param {number} originalPageNumber - Page number to substitute.
 * @returns {string|undefined} The rewritten text, or `undefined` for an empty/missing message.
 */
function remapPageReferenceText(message, localPageNumber, originalPageNumber) {
    if (!message) {
        return undefined;
    }
    // One pass handles both capitalizations; the captured first letter is kept.
    const pageReference = new RegExp(`\\b(P|p)age ${localPageNumber}\\b`, 'g');
    return message.replace(pageReference, (_match, firstLetter) => `${firstLetter}age ${originalPageNumber}`);
}
/**
 * Re-expresses a page insight produced by a segment child job in the parent
 * document's page numbering.
 *
 * @param {object} insight - Child insight; its `warnings` array is read.
 * @param {number} localPageNumber - Page number inside the segment packet.
 * @param {number} originalPageNumber - Page number in the parent document.
 * @param {number} parentTotalPages - Value stored as the insight's `totalPages`.
 * @returns {object} Copy of `insight` with `pageNumber`, `totalPages` and
 *   de-duplicated, page-remapped `warnings` replaced.
 */
function remapChildGeotechInsightToOriginalPage(insight, localPageNumber, originalPageNumber, parentTotalPages) {
    const remappedWarnings = insight.warnings.map((warning) => remapPageReferenceText(warning, localPageNumber, originalPageNumber));
    return {
        ...insight,
        pageNumber: originalPageNumber,
        totalPages: parentTotalPages,
        warnings: uniqueStrings(remappedWarnings),
    };
}
/**
 * Returns a copy of `segmentation` in which the segment whose `segmentIndex`
 * matches has `patch` merged over it. Other segments are reused unchanged.
 * When there is no `segments` list the input is returned as-is.
 *
 * @param {object|undefined} segmentation
 * @param {number} segmentIndex
 * @param {object} patch - Properties to overlay on the matching segment.
 * @returns {object|undefined}
 */
function updateSegmentSummaryStatus(segmentation, segmentIndex, patch) {
    const existingSegments = segmentation?.segments;
    if (!existingSegments) {
        return segmentation;
    }
    const applyPatch = (segment) => {
        if (segment.segmentIndex !== segmentIndex) {
            return segment;
        }
        return { ...segment, ...patch };
    };
    return {
        ...segmentation,
        segments: existingSegments.map((segment) => applyPatch(segment)),
    };
}
/**
 * Root directory for ingest job artifacts: the `GEOTECHCLI_CONFIG_DIR`
 * environment variable when it is defined, otherwise `~/.geotechcli`.
 *
 * @returns {string}
 */
function getIngestJobArtifactsRoot() {
    const configuredRoot = process.env.GEOTECHCLI_CONFIG_DIR;
    if (configuredRoot !== undefined && configuredRoot !== null) {
        return configuredRoot;
    }
    return join(homedir(), '.geotechcli');
}
/**
 * Directory path for the segment artifacts of a parent job:
 * `<artifacts root>/ingest-jobs/<parentJobId>/segments`.
 *
 * @param {string} parentJobId
 * @returns {string}
 */
function getSegmentArtifactsDir(parentJobId) {
    const artifactsRoot = getIngestJobArtifactsRoot();
    return join(artifactsRoot, 'ingest-jobs', parentJobId, 'segments');
}
/**
 * Resolves the page range a job is scoped to. Preference order:
 * `source.pageRange`, then `segmentation.pageRange`, then the whole document
 * (`1..source.totalPages`).
 *
 * @param {object} job
 * @returns {{startPage: number, endPage: number}}
 */
function getJobScopedPageRange(job) {
    const scopedRange = job.source.pageRange
        ?? job.segmentation?.pageRange
        ?? [1, job.source.totalPages];
    return {
        startPage: scopedRange[0],
        endPage: scopedRange[1],
    };
}
/**
 * Largest page count implied by the job: the maximum of `source.totalPages`
 * and the end page of the source and segmentation ranges (each treated as 0
 * when absent).
 *
 * @param {object} job
 * @returns {number}
 */
function getJobOriginalTotalPages(job) {
    const sourceRangeEnd = job.source.pageRange?.[1] ?? 0;
    const segmentationRangeEnd = job.segmentation?.pageRange?.[1] ?? 0;
    return Math.max(job.source.totalPages, sourceRangeEnd, segmentationRangeEnd);
}
/**
 * Keeps only the page inputs whose `pageNumber` is in `selectedPageNumbers`,
 * preserving the order of `pageInputs`.
 *
 * @param {Array<{pageNumber: number}>} pageInputs
 * @param {Iterable<number>} selectedPageNumbers
 * @returns {Array<{pageNumber: number}>}
 */
function filterPageInputsToSelectedPages(pageInputs, selectedPageNumbers) {
    const wantedPageNumbers = new Set(selectedPageNumbers);
    return pageInputs.filter(({ pageNumber }) => wantedPageNumbers.has(pageNumber));
}
/**
 * Tallies completed and failed page checkpoints inside an inclusive page range.
 * Checkpoints in any other status are ignored.
 *
 * @param {Iterable<{pageNumber: number, status: string}>} pages
 * @param {{startPage: number, endPage: number}} range
 * @returns {{completedPages: number, failedPages: number}}
 */
function countCheckpointStatuses(pages, range) {
    const tally = { completedPages: 0, failedPages: 0 };
    for (const { pageNumber, status } of pages) {
        const outsideRange = pageNumber < range.startPage || pageNumber > range.endPage;
        if (outsideRange) {
            continue;
        }
        switch (status) {
            case 'completed':
                tally.completedPages += 1;
                break;
            case 'failed':
                tally.failedPages += 1;
                break;
            default:
                break;
        }
    }
    return tally;
}
/**
 * Reports whether no checkpoint inside the inclusive page range is still
 * `'pending'`. A range that contains no checkpoints counts as resolved.
 *
 * @param {object} job - Job whose `checkpoints.pages` are inspected.
 * @param {{startPage: number, endPage: number}} range
 * @returns {boolean}
 */
function isSegmentResolved(job, range) {
    const hasPendingPage = job.checkpoints.pages.some((page) => page.pageNumber >= range.startPage
        && page.pageNumber <= range.endPage
        && page.status === 'pending');
    return !hasPendingPage;
}
/**
 * Normalizes a child checkpoint error message and rewrites its page
 * references from segment-local to original page numbering.
 *
 * @param {string|undefined|null} message - Missing messages are treated as ''.
 * @param {number} localPageNumber
 * @param {number} originalPageNumber
 * @returns {string|undefined} Whatever `remapPageReferenceText` returns for the normalized message.
 */
function remapSegmentCheckpointError(message, localPageNumber, originalPageNumber) {
    const rawMessage = message ?? '';
    const normalizedMessage = normalizeCheckpointErrorMessage(rawMessage);
    return remapPageReferenceText(normalizedMessage, localPageNumber, originalPageNumber);
}
/**
 * Folds the page checkpoints of a finished segment child job back into the
 * parent job, translating segment-local page numbers to parent page numbers.
 *
 * Parent checkpoints outside `range` are returned untouched. Inside the
 * range, a page with no child checkpoint, or one still `'pending'`, is
 * marked `'failed'`. Otherwise the child's status, OCR fields, error and
 * result are copied over. For `'geotech-document'` jobs a completed result
 * is remapped to the parent's page numbering.
 *
 * @param {object} parentJob
 * @param {object} childJob - Child job whose `checkpoints.pages` are numbered from 1 within the segment.
 * @param {{startPage: number, endPage: number}} range - Inclusive parent page range covered by the child.
 * @param {*} now - Clock argument forwarded to `nowIso`.
 * @returns {object} A new parent job object; `parentJob` is not mutated.
 */
function mergeSegmentChildJobIntoParentJob(parentJob, childJob, range, now) {
    const timestamp = nowIso(now);
    const originalTotalPages = getJobOriginalTotalPages(parentJob);
    // Index the child checkpoints once instead of scanning the list for every
    // parent page. The first checkpoint per page number wins, as with Array#find.
    const childCheckpointsByLocalPage = new Map();
    for (const childPage of childJob.checkpoints.pages) {
        if (!childCheckpointsByLocalPage.has(childPage.pageNumber)) {
            childCheckpointsByLocalPage.set(childPage.pageNumber, childPage);
        }
    }
    const mergePage = (pageCheckpoint) => {
        if (pageCheckpoint.pageNumber < range.startPage || pageCheckpoint.pageNumber > range.endPage) {
            return pageCheckpoint;
        }
        const localPageNumber = pageCheckpoint.pageNumber - range.startPage + 1;
        const childCheckpoint = childCheckpointsByLocalPage.get(localPageNumber);
        if (!childCheckpoint || childCheckpoint.status === 'pending') {
            // The child never resolved this page, so record it as failed on the parent.
            return {
                ...pageCheckpoint,
                status: 'failed',
                updatedAt: timestamp,
                error: `Page ${pageCheckpoint.pageNumber} did not complete within segment ${range.startPage}-${range.endPage}.`,
                downgraded: false,
            };
        }
        const shouldRemapResult = parentJob.documentType === 'geotech-document'
            && childCheckpoint.status === 'completed'
            && childCheckpoint.result;
        const remappedResult = shouldRemapResult
            ? remapChildGeotechInsightToOriginalPage(childCheckpoint.result, localPageNumber, pageCheckpoint.pageNumber, originalTotalPages)
            : childCheckpoint.result;
        return {
            ...pageCheckpoint,
            status: childCheckpoint.status,
            attempts: Math.max(pageCheckpoint.attempts, childCheckpoint.attempts),
            updatedAt: timestamp,
            completedAt: childCheckpoint.completedAt,
            error: remapSegmentCheckpointError(childCheckpoint.error, localPageNumber, pageCheckpoint.pageNumber),
            downgraded: childCheckpoint.downgraded,
            ocrTextHint: childCheckpoint.ocrTextHint,
            ocrSource: childCheckpoint.ocrSource,
            ocrWarnings: childCheckpoint.ocrWarnings,
            result: remappedResult,
        };
    };
    return {
        ...parentJob,
        updatedAt: timestamp,
        execution: {
            ...parentJob.execution,
            lastHeartbeatAt: timestamp,
        },
        checkpoints: {
            pages: parentJob.checkpoints.pages.map((pageCheckpoint) => mergePage(pageCheckpoint)),
        },
    };
}
/**
 * Runs a segmented parent ingest job. It plans segments over the scoped page
 * range (unless the job already has them), runs one child ingest job per
 * unresolved segment, merges each child's checkpoints into the parent, and
 * finally builds and stores the parent's ingest result.
 *
 * @param {string} jobId - Id of the persisted parent job.
 * @param {object} currentJob - Parent job snapshot; re-read from the store as the run progresses.
 * @param {Function} mutateJob - Async updater invoked with a `(job) => nextJob` callback.
 * @param {object} config - Config forwarded to `finalizeJobResult`.
 * @param {object} dependencies - Worker dependencies (`now`, optional `persistReview`, ...).
 * @returns {Promise<object>} The parent job re-read from the store, or the last in-memory snapshot.
 * @throws {Error} When the parent has no inspection, or a segment's inspection cannot be sliced.
 */
async function runSegmentedParentGeotechJob(jobId, currentJob, mutateJob, config, dependencies) {
    const stamp = () => nowIso(dependencies.now);
    const reloadParent = () => loadPersistedIngestJob(jobId) ?? currentJob;
    const effectivePageLimit = currentJob.segmentation?.effectivePageLimit
        ?? HOSTED_BETA_EFFECTIVE_PAGE_LIMIT;
    const scopedRange = getJobScopedPageRange(currentJob);
    const baseInspection = currentJob.inspection;
    if (!baseInspection) {
        throw new Error('Segmented parent ingest requires PDF inspection metadata.');
    }
    let baseSegments;
    if (currentJob.segmentation?.segments?.length) {
        baseSegments = currentJob.segmentation.segments;
    }
    else {
        // First run: plan the segments and persist the plan on the parent.
        const plannedSegments = buildPersistedIngestJobSegments(baseInspection, {
            pageRange: scopedRange,
            effectivePageLimit,
        });
        baseSegments = plannedSegments.map((segment, index) => ({
            ...segment,
            segmentIndex: index + 1,
            segmentCount: plannedSegments.length,
            status: 'queued',
        }));
        await mutateJob((job) => ({
            ...job,
            updatedAt: stamp(),
            segmentation: {
                mode: 'segmented-parent',
                pageRange: [scopedRange.startPage, scopedRange.endPage],
                effectivePageLimit,
                segmentCount: baseSegments.length,
                segments: baseSegments,
            },
        }));
        currentJob = reloadParent();
    }
    const segmentArtifactsDir = getSegmentArtifactsDir(jobId);
    const segmentsToRun = currentJob.segmentation?.segments ?? baseSegments;
    for (const segment of segmentsToRun) {
        if (isCancelled(jobId)) {
            break;
        }
        currentJob = reloadParent();
        const latestSegment = currentJob.segmentation?.segments?.find((entry) => entry.segmentIndex === segment.segmentIndex) ?? segment;
        const { segmentIndex } = latestSegment;
        const range = {
            startPage: latestSegment.startPage,
            endPage: latestSegment.endPage,
        };
        // Skip segments that already finished in an earlier run of this job.
        if (isSegmentResolved(currentJob, range) && latestSegment.status === 'completed') {
            continue;
        }
        const paddedIndex = String(segmentIndex).padStart(2, '0');
        const packetPath = join(segmentArtifactsDir, `segment-${paddedIndex}-pages-${range.startPage}-${range.endPage}.pdf`);
        await writePdfPageSubset(getJobSourceDisplayPath(currentJob), range, packetPath);
        const childInspection = slicePdfInspectionToRange(baseInspection, range, { rebasePageNumbers: true });
        if (!childInspection) {
            throw new Error(`Unable to create a scoped inspection for segment ${segmentIndex}.`);
        }
        let childJob = latestSegment.childJobId ? loadPersistedIngestJob(latestSegment.childJobId) : null;
        const needsFreshChild = !childJob || childJob.status === 'failed' || childJob.status === 'canceled';
        if (needsFreshChild) {
            childJob = createPersistedIngestJob({
                documentType: currentJob.documentType,
                filePath: packetPath,
                inspection: childInspection,
                config: currentJob.config,
                overrideBoreholeId: currentJob.request.overrideBoreholeId,
                originalFilePath: getJobSourceDisplayPath(currentJob),
                originalFileName: getJobSourceDisplayName(currentJob),
                segmentation: {
                    mode: 'segment-child',
                    pageRange: [range.startPage, range.endPage],
                    effectivePageLimit,
                    segmentCount: currentJob.segmentation?.segmentCount ?? baseSegments.length,
                    segmentIndex,
                    parentJobId: currentJob.jobId,
                },
                now: dependencies.now,
            });
        }
        const childJobId = childJob.jobId;
        const startedAt = Date.now();
        await mutateJob((job) => ({
            ...job,
            updatedAt: stamp(),
            execution: {
                ...job.execution,
                lastHeartbeatAt: stamp(),
            },
            segmentation: updateSegmentSummaryStatus(job.segmentation, segmentIndex, {
                childJobId,
                status: 'running',
            }),
        }));
        const completedChild = await runPersistedIngestJobWorker(childJobId, dependencies);
        const durationMs = Date.now() - startedAt;
        await mutateJob((job) => {
            const mergedJob = mergeSegmentChildJobIntoParentJob(job, completedChild, range, dependencies.now);
            const counts = countCheckpointStatuses(mergedJob.checkpoints.pages, range);
            // Any child status other than completed/canceled is recorded as failed.
            const childStatus = completedChild.status === 'completed' || completedChild.status === 'canceled'
                ? completedChild.status
                : 'failed';
            return {
                ...mergedJob,
                segmentation: updateSegmentSummaryStatus(mergedJob.segmentation, segmentIndex, {
                    childJobId: completedChild.jobId,
                    status: childStatus,
                    completedPages: counts.completedPages,
                    failedPages: counts.failedPages,
                    durationMs,
                }),
            };
        });
    }
    currentJob = reloadParent();
    if (currentJob.execution.cancelRequested) {
        await mutateJob((job) => ({
            ...job,
            status: 'canceled',
            updatedAt: stamp(),
            canceledAt: stamp(),
            execution: {
                ...job.execution,
                pid: undefined,
            },
        }));
        return reloadParent();
    }
    // Re-read page inputs from the parent file, limited to checkpointed pages.
    const allPageInputs = await preparePdfPageInputs(currentJob.source.filePath, null, currentJob.processing.pagePreprocessingConcurrency, dependencies);
    const checkpointedPageNumbers = currentJob.checkpoints.pages.map((page) => page.pageNumber);
    const replayPageInputs = filterPageInputsToSelectedPages(allPageInputs, checkpointedPageNumbers);
    const finalResult = await finalizeJobResult(currentJob, replayPageInputs, config, dependencies);
    let persistedReview = null;
    if (currentJob.request.projectId) {
        const persistReview = dependencies.persistReview ?? persistBoreholeIngestReview;
        persistedReview = persistReview(currentJob.request.projectId, finalResult, {
            title: currentJob.request.reviewTitle,
        });
    }
    const persistedReviewSummary = persistedReview
        ? {
            datasetName: persistedReview.datasetName,
            reviewId: persistedReview.reviewId,
            createdAt: persistedReview.createdAt,
        }
        : undefined;
    await mutateJob((job) => ({
        ...job,
        status: 'completed',
        updatedAt: stamp(),
        completedAt: stamp(),
        execution: {
            ...job.execution,
            pid: undefined,
            lastHeartbeatAt: stamp(),
        },
        result: {
            ingestResult: finalResult,
            persistedReview: persistedReviewSummary,
        },
    }));
    return reloadParent();
}
/**
 * Reports whether an ingest result has `documentType` `'borehole-log'`.
 *
 * @param {{documentType: string}} result
 * @returns {boolean}
 */
function isBoreholeResult(result) {
    const { documentType } = result;
    return documentType === 'borehole-log';
}
@@ -155,6 +472,9 @@ function isSlowVisualPageError(message, classification, sourceKind) {
155
472
  const looksVisual = sourceKind === 'raster-image' || classification === 'image-only' || classification === 'text-unreadable';
156
473
  return looksVisual && SLOW_VISUAL_ERROR_PATTERNS.some((pattern) => pattern.test(message));
157
474
  }
/**
 * Reports whether `message` matches any pattern in
 * `RETRYABLE_UPSTREAM_ERROR_PATTERNS`.
 *
 * @param {string} message
 * @returns {boolean}
 */
function isRetryableUpstreamPageError(message) {
    for (const pattern of RETRYABLE_UPSTREAM_ERROR_PATTERNS) {
        if (pattern.test(message)) {
            return true;
        }
    }
    return false;
}
/**
 * Reports whether `message` matches any pattern in
 * `FATAL_PROVIDER_STOP_PATTERNS`.
 *
 * @param {string} message
 * @returns {boolean}
 */
function isFatalProviderStopError(message) {
    for (const pattern of FATAL_PROVIDER_STOP_PATTERNS) {
        if (pattern.test(message)) {
            return true;
        }
    }
    return false;
}
@@ -169,6 +489,13 @@ function normalizeCheckpointErrorMessage(message) {
169
489
  }
170
490
  return normalized;
171
491
  }
/**
 * Waits before retrying a page: 100 ms for attempt 0 or 1, doubling per
 * attempt after that, capped at 1000 ms.
 *
 * An attempt that does not coerce to a number (for example `undefined`) is
 * treated as the first attempt. Without this the delay would be NaN, which
 * `setTimeout` treats as roughly 1 ms, so the backoff would silently vanish.
 *
 * NOTE(review): the timer is unref'd, so this wait alone does not keep the
 * Node.js process alive. Confirm the worker always holds another handle.
 *
 * @param {number} attempt - Attempts already recorded for the page.
 * @returns {Promise<void>}
 */
async function waitForCheckpointRetryBackoff(attempt) {
    const numericAttempt = Number(attempt);
    const safeAttempt = Number.isNaN(numericAttempt) ? 1 : numericAttempt;
    // 2 ** (n >= 0) is always >= 1, so the base delay never drops below 100 ms.
    const delayMs = Math.min(1000, 100 * 2 ** Math.max(0, safeAttempt - 1));
    await new Promise((resolvePromise) => {
        const timer = setTimeout(resolvePromise, delayMs);
        timer.unref?.();
    });
}
172
499
  async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
173
500
  if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
174
501
  return promise;
@@ -249,6 +576,59 @@ function buildInspectionSummary(inspection) {
249
576
  ocrRecoveredPageCount: 0,
250
577
  };
251
578
  }
/**
 * Counts the checkpoints whose `ocrSource` is `'local-ocr'` or `'vision-ocr'`.
 *
 * @param {Array<{ocrSource?: string}>} checkpoints
 * @returns {number}
 */
function countCheckpointOcrRecoveredPages(checkpoints) {
    return checkpoints.filter((page) => page.ocrSource === 'local-ocr' || page.ocrSource === 'vision-ocr').length;
}
/**
 * Raises `inspectionSummary.ocrRecoveredPageCount` on an ingest result to at
 * least the number of OCR-recovered checkpoints. The input is returned
 * unchanged when no checkpoint was OCR-recovered or the result has no
 * `inspectionSummary`; otherwise a new result object is returned.
 *
 * A summary with no `ocrRecoveredPageCount` is treated as 0. Without that
 * default, `Math.max(undefined, n)` would store NaN in the summary.
 *
 * @param {object} result - Ingest result.
 * @param {Array<{ocrSource?: string}>} checkpoints - Page checkpoints of the job.
 * @returns {object}
 */
function applyCheckpointOcrRecoveredSummary(result, checkpoints) {
    const recoveredPageCount = countCheckpointOcrRecoveredPages(checkpoints);
    if (recoveredPageCount === 0 || !result.inspectionSummary) {
        return result;
    }
    const previousCount = result.inspectionSummary.ocrRecoveredPageCount ?? 0;
    return {
        ...result,
        inspectionSummary: {
            ...result.inspectionSummary,
            ocrRecoveredPageCount: Math.max(previousCount, recoveredPageCount),
        },
    };
}
/**
 * Returns the job's existing page checkpoints when there are any; otherwise
 * builds `pageCount` pending placeholder checkpoints. Placeholder numbering
 * starts at the first page of `source.pageRange`, else of
 * `segmentation.pageRange`, else at 1.
 *
 * @param {object} job
 * @param {number} pageCount - Number of placeholders to create.
 * @param {string} timestamp - Value stored as each placeholder's `updatedAt`.
 * @returns {object[]}
 */
function buildFallbackWorkerCheckpoints(job, pageCount, timestamp) {
    const existingCheckpoints = job.checkpoints.pages;
    if (existingCheckpoints.length > 0) {
        return existingCheckpoints;
    }
    const firstPageNumber = job.source.pageRange?.[0] ?? job.segmentation?.pageRange?.[0] ?? 1;
    const buildPlaceholder = (offset) => ({
        pageNumber: firstPageNumber + offset,
        classification: null,
        sourceKind: 'pdf-page',
        weight: 1,
        status: 'pending',
        attempts: 0,
        updatedAt: timestamp,
    });
    return Array.from({ length: pageCount }, (_unused, offset) => buildPlaceholder(offset));
}
/**
 * Best-effort page count for a job's PDF. Returns `source.totalPages` when it
 * is already positive. Otherwise it counts pages with `countDocumentPdfPages`,
 * falls back to `inferPdfDocumentPageCountFallback` if that throws, and
 * returns 0 if both throw. When the job has a `source.pageRange`, a counted
 * total is reduced to the number of pages inside that range.
 *
 * @param {object} job
 * @returns {Promise<number>}
 */
async function inferWorkerPdfPageCount(job) {
    const { totalPages, filePath, pageRange } = job.source;
    if (totalPages > 0) {
        return totalPages;
    }
    // Pages of the full document that fall inside the requested range.
    const clampToRequestedRange = (fullPageCount) => (pageRange
        ? Math.max(0, Math.min(fullPageCount, pageRange[1]) - pageRange[0] + 1)
        : fullPageCount);
    try {
        return clampToRequestedRange(await countDocumentPdfPages(filePath));
    }
    catch {
        try {
            return clampToRequestedRange(inferPdfDocumentPageCountFallback(filePath));
        }
        catch {
            // Neither strategy worked; the caller treats 0 as "unknown".
            return 0;
        }
    }
}
252
632
  function summarizeReviewReasons(findings) {
253
633
  return uniqueStrings(findings
254
634
  .filter((finding) => finding.severity !== 'advisory')
@@ -289,19 +669,15 @@ function buildSyntheticBoreholeResult(job, inspection, now) {
289
669
  message: `${normalFailedPages.length} page(s) failed during ingest and should be reviewed.`,
290
670
  });
291
671
  }
292
- return {
672
+ return applyCheckpointOcrRecoveredSummary({
293
673
  kind: 'geotech-ingest-result',
294
674
  schemaVersion: 1,
295
675
  documentType: 'borehole-log',
296
676
  generatedAt: nowIso(now),
297
- source: {
298
- filePath: job.source.filePath,
299
- fileName: basename(job.source.filePath),
300
- inputKind: 'pdf',
301
- totalPages: job.source.totalPages,
677
+ source: buildJobResultSource(job, {
302
678
  successfulPages: 0,
303
679
  failedPages: pageFailures.length,
304
- },
680
+ }),
305
681
  inspection,
306
682
  inspectionSummary: summarizeBoreholeIngestInspection(inspection),
307
683
  boreholes: [],
@@ -323,7 +699,7 @@ function buildSyntheticBoreholeResult(job, inspection, now) {
323
699
  reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
324
700
  confidence: 0,
325
701
  canAutoProceed: false,
326
- };
702
+ }, job.checkpoints.pages);
327
703
  }
328
704
  function buildSyntheticGeotechDocumentResult(job, inspection, now) {
329
705
  const pageFailures = job.checkpoints.pages
@@ -360,19 +736,15 @@ function buildSyntheticGeotechDocumentResult(job, inspection, now) {
360
736
  message: `${normalFailedPages.length} page(s) failed during ingest and should be reviewed.`,
361
737
  });
362
738
  }
363
- return {
739
+ return applyCheckpointOcrRecoveredSummary({
364
740
  kind: 'geotech-ingest-result',
365
741
  schemaVersion: 1,
366
742
  documentType: 'geotech-document',
367
743
  generatedAt: nowIso(now),
368
- source: {
369
- filePath: job.source.filePath,
370
- fileName: basename(job.source.filePath),
371
- inputKind: 'pdf',
372
- totalPages: job.source.totalPages,
744
+ source: buildJobResultSource(job, {
373
745
  successfulPages: 0,
374
746
  failedPages: pageFailures.length,
375
- },
747
+ }),
376
748
  inspection,
377
749
  inspectionSummary: buildInspectionSummary(inspection),
378
750
  documentClass: null,
@@ -402,7 +774,7 @@ function buildSyntheticGeotechDocumentResult(job, inspection, now) {
402
774
  confidence: 0,
403
775
  reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
404
776
  canAutoProceed: false,
405
- };
777
+ }, job.checkpoints.pages);
406
778
  }
407
779
  function dedupeReviewFindings(reviewFindings) {
408
780
  return [
@@ -531,6 +903,7 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
531
903
  const lowYieldRole = inferPreflightLowYieldPageRole({
532
904
  inspectionPage,
533
905
  previousInspectionPage: job.inspection?.pages[pageInput.pageNumber - 2],
906
+ nextInspectionPage: job.inspection?.pages[pageInput.pageNumber],
534
907
  pageNumber: pageInput.pageNumber,
535
908
  totalPages: pageInput.totalPages,
536
909
  sourceKind: pageInput.sourceKind,
@@ -662,11 +1035,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
662
1035
  try {
663
1036
  const result = await ingestBoreholeLogDocument({
664
1037
  config,
665
- source: {
666
- filePath: job.source.filePath,
667
- fileName: basename(job.source.filePath),
668
- inputKind: 'pdf',
669
- },
1038
+ source: buildJobResultSource(job, { successfulPages: 0, failedPages: 0 }),
670
1039
  overrideBoreholeId: job.request.overrideBoreholeId,
671
1040
  inspection: job.inspection,
672
1041
  pages: job.checkpoints.pages
@@ -695,7 +1064,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
695
1064
  },
696
1065
  now: dependencies.now,
697
1066
  });
698
- return applyBoreholeFailureDowngrades(result, job.checkpoints.pages);
1067
+ return applyCheckpointOcrRecoveredSummary(applyBoreholeFailureDowngrades(result, job.checkpoints.pages), job.checkpoints.pages);
699
1068
  }
700
1069
  catch (error) {
701
1070
  const message = error instanceof Error ? error.message : String(error);
@@ -709,11 +1078,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
709
1078
  try {
710
1079
  const result = await ingestGeotechDocument({
711
1080
  config,
712
- source: {
713
- filePath: job.source.filePath,
714
- fileName: basename(job.source.filePath),
715
- inputKind: 'pdf',
716
- },
1081
+ source: buildJobResultSource(job, { successfulPages: 0, failedPages: 0 }),
717
1082
  inspection: job.inspection,
718
1083
  pages: job.checkpoints.pages
719
1084
  .map((checkpoint) => geotechPageInputMap.get(checkpoint.pageNumber))
@@ -752,7 +1117,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
752
1117
  },
753
1118
  now: dependencies.now,
754
1119
  });
755
- return applyGeotechFailureDowngrades(result, job.checkpoints.pages);
1120
+ return applyCheckpointOcrRecoveredSummary(applyGeotechFailureDowngrades(result, job.checkpoints.pages), job.checkpoints.pages);
756
1121
  }
757
1122
  catch (error) {
758
1123
  const message = error instanceof Error ? error.message : String(error);
@@ -805,28 +1170,62 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
805
1170
  ? currentJob.inspection
806
1171
  : inspect(currentJob.source.filePath);
807
1172
  if (!currentJob.inspection || currentJob.inspection.totalPages === 0) {
808
- await mutateJob((job) => ({
809
- ...job,
810
- inspection,
811
- source: {
812
- ...job.source,
813
- totalPages: inspection.totalPages,
814
- weightedPageCost: inspection.pages.reduce((sum, page) => sum + (page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1), 0),
815
- },
816
- checkpoints: {
817
- pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
818
- pageNumber: page.pageNumber,
819
- classification: page.classification,
820
- sourceKind: mapPageSourceKind(page.classification),
821
- weight: page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1,
822
- status: 'pending',
823
- attempts: 0,
824
- updatedAt: nowIso(dependencies.now),
825
- })),
826
- },
827
- }));
1173
+ if (inspection.totalPages > 0) {
1174
+ await mutateJob((job) => ({
1175
+ ...job,
1176
+ inspection,
1177
+ source: {
1178
+ ...job.source,
1179
+ totalPages: inspection.totalPages,
1180
+ weightedPageCost: inspection.pages.reduce((sum, page) => sum + (page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1), 0),
1181
+ },
1182
+ processing: {
1183
+ ...job.processing,
1184
+ chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, inspection),
1185
+ },
1186
+ checkpoints: {
1187
+ pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
1188
+ pageNumber: page.pageNumber,
1189
+ classification: page.classification,
1190
+ sourceKind: mapPageSourceKind(page.classification),
1191
+ weight: page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1,
1192
+ status: 'pending',
1193
+ attempts: 0,
1194
+ updatedAt: nowIso(dependencies.now),
1195
+ })),
1196
+ },
1197
+ }));
1198
+ }
1199
+ else {
1200
+ const inferredPageCount = await inferWorkerPdfPageCount(currentJob);
1201
+ if (inferredPageCount > 0) {
1202
+ await mutateJob((job) => {
1203
+ const timestamp = nowIso(dependencies.now);
1204
+ return {
1205
+ ...job,
1206
+ inspection: null,
1207
+ source: {
1208
+ ...job.source,
1209
+ totalPages: Math.max(job.source.totalPages, inferredPageCount),
1210
+ weightedPageCost: Math.max(job.source.weightedPageCost, inferredPageCount),
1211
+ },
1212
+ processing: {
1213
+ ...job.processing,
1214
+ chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, null),
1215
+ },
1216
+ checkpoints: {
1217
+ pages: buildFallbackWorkerCheckpoints(job, inferredPageCount, timestamp),
1218
+ },
1219
+ };
1220
+ });
1221
+ }
1222
+ }
828
1223
  }
1224
+ currentJob = loadPersistedIngestJob(jobId) ?? currentJob;
829
1225
  const config = buildJobConfig(currentJob, dependencies);
1226
+ if (currentJob.documentType === 'geotech-document' && currentJob.segmentation?.mode === 'segmented-parent') {
1227
+ return await runSegmentedParentGeotechJob(jobId, currentJob, mutateJob, config, dependencies);
1228
+ }
830
1229
  const pageInputs = await preparePdfPageInputs(currentJob.source.filePath, currentJob.inspection, currentJob.processing.pagePreprocessingConcurrency, dependencies);
831
1230
  let processedNewPages = 0;
832
1231
  if (currentJob.documentType === 'geotech-document') {
@@ -878,8 +1277,45 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
878
1277
  : checkpoint),
879
1278
  },
880
1279
  }));
881
- try {
882
- const processed = await processGeotechDocumentPage(currentJob, page, config, dependencies);
1280
+ let processed = null;
1281
+ let finalErrorMessage = '';
1282
+ for (let attemptIndex = 0; attemptIndex < 2; attemptIndex += 1) {
1283
+ try {
1284
+ processed = await processGeotechDocumentPage(currentJob, page, config, dependencies);
1285
+ break;
1286
+ }
1287
+ catch (error) {
1288
+ const message = error instanceof Error ? error.message : String(error);
1289
+ const normalizedMessage = normalizeCheckpointErrorMessage(message);
1290
+ if (attemptIndex === 0
1291
+ && isRetryableUpstreamPageError(normalizedMessage)
1292
+ && !isFatalProviderStopError(normalizedMessage)) {
1293
+ await waitForCheckpointRetryBackoff(findCheckpoint(currentJob, page.pageNumber).attempts);
1294
+ await mutateJob((job) => ({
1295
+ ...job,
1296
+ updatedAt: nowIso(dependencies.now),
1297
+ execution: {
1298
+ ...job.execution,
1299
+ lastHeartbeatAt: nowIso(dependencies.now),
1300
+ },
1301
+ checkpoints: {
1302
+ pages: job.checkpoints.pages.map((checkpoint) => checkpoint.pageNumber === page.pageNumber
1303
+ ? {
1304
+ ...checkpoint,
1305
+ attempts: checkpoint.attempts + 1,
1306
+ updatedAt: nowIso(dependencies.now),
1307
+ error: `retrying after upstream timeout: ${normalizedMessage}`,
1308
+ }
1309
+ : checkpoint),
1310
+ },
1311
+ }));
1312
+ continue;
1313
+ }
1314
+ finalErrorMessage = normalizedMessage;
1315
+ break;
1316
+ }
1317
+ }
1318
+ if (processed) {
883
1319
  processedNewPages += 1;
884
1320
  await mutateJob((job) => ({
885
1321
  ...job,
@@ -906,9 +1342,8 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
906
1342
  },
907
1343
  }));
908
1344
  }
909
- catch (error) {
910
- const message = error instanceof Error ? error.message : String(error);
911
- const normalizedMessage = normalizeCheckpointErrorMessage(message);
1345
+ else {
1346
+ const normalizedMessage = finalErrorMessage || `Page ${page.pageNumber} failed during async ingest.`;
912
1347
  const checkpoint = findCheckpoint(currentJob, page.pageNumber);
913
1348
  if (!fatalProviderStopMessage && isFatalProviderStopError(normalizedMessage)) {
914
1349
  fatalProviderStopMessage = normalizedMessage;
@@ -993,8 +1428,45 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
993
1428
  : pageCheckpoint),
994
1429
  },
995
1430
  }));
996
- try {
997
- const processed = await processBoreholePage(currentJob, page, config, state, dependencies);
1431
+ let processed = null;
1432
+ let finalErrorMessage = '';
1433
+ for (let attemptIndex = 0; attemptIndex < 2; attemptIndex += 1) {
1434
+ try {
1435
+ processed = await processBoreholePage(currentJob, page, config, state, dependencies);
1436
+ break;
1437
+ }
1438
+ catch (error) {
1439
+ const message = error instanceof Error ? error.message : String(error);
1440
+ const normalizedMessage = normalizeCheckpointErrorMessage(message);
1441
+ if (attemptIndex === 0
1442
+ && isRetryableUpstreamPageError(normalizedMessage)
1443
+ && !isFatalProviderStopError(normalizedMessage)) {
1444
+ await waitForCheckpointRetryBackoff(findCheckpoint(currentJob, page.pageNumber).attempts);
1445
+ await mutateJob((job) => ({
1446
+ ...job,
1447
+ updatedAt: nowIso(dependencies.now),
1448
+ execution: {
1449
+ ...job.execution,
1450
+ lastHeartbeatAt: nowIso(dependencies.now),
1451
+ },
1452
+ checkpoints: {
1453
+ pages: job.checkpoints.pages.map((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber
1454
+ ? {
1455
+ ...pageCheckpoint,
1456
+ attempts: pageCheckpoint.attempts + 1,
1457
+ updatedAt: nowIso(dependencies.now),
1458
+ error: `retrying after upstream timeout: ${normalizedMessage}`,
1459
+ }
1460
+ : pageCheckpoint),
1461
+ },
1462
+ }));
1463
+ continue;
1464
+ }
1465
+ finalErrorMessage = normalizedMessage;
1466
+ break;
1467
+ }
1468
+ }
1469
+ if (processed) {
998
1470
  processedNewPages += 1;
999
1471
  state = processed.nextState;
1000
1472
  await mutateJob((job) => ({
@@ -1022,9 +1494,8 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1022
1494
  },
1023
1495
  }));
1024
1496
  }
1025
- catch (error) {
1026
- const message = error instanceof Error ? error.message : String(error);
1027
- const normalizedMessage = normalizeCheckpointErrorMessage(message);
1497
+ else {
1498
+ const normalizedMessage = finalErrorMessage || `Page ${page.pageNumber} failed during async ingest.`;
1028
1499
  await mutateJob((job) => ({
1029
1500
  ...job,
1030
1501
  updatedAt: nowIso(dependencies.now),
@@ -1058,7 +1529,7 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1058
1529
  ...pageCheckpoint,
1059
1530
  status: 'failed',
1060
1531
  updatedAt: nowIso(dependencies.now),
1061
- error: `skipped after upstream provider stop. ${normalizeCheckpointErrorMessage(message)}`,
1532
+ error: `skipped after upstream provider stop. ${normalizedMessage}`,
1062
1533
  downgraded: false,
1063
1534
  }
1064
1535
  : pageCheckpoint),