@geotechcli/core 0.4.25 → 0.4.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/agents/sandbox.d.ts.map +1 -1
  2. package/dist/agents/sandbox.js +13 -8
  3. package/dist/agents/sandbox.js.map +1 -1
  4. package/dist/agents/swarm.d.ts.map +1 -1
  5. package/dist/agents/swarm.js +21 -4
  6. package/dist/agents/swarm.js.map +1 -1
  7. package/dist/ingest/geotech-document.d.ts.map +1 -1
  8. package/dist/ingest/geotech-document.js +62 -3
  9. package/dist/ingest/geotech-document.js.map +1 -1
  10. package/dist/ingest/geotech-extract.d.ts.map +1 -1
  11. package/dist/ingest/geotech-extract.js +29 -1
  12. package/dist/ingest/geotech-extract.js.map +1 -1
  13. package/dist/ingest/job-store.d.ts +3 -2
  14. package/dist/ingest/job-store.d.ts.map +1 -1
  15. package/dist/ingest/job-store.js +77 -10
  16. package/dist/ingest/job-store.js.map +1 -1
  17. package/dist/ingest/job-worker.d.ts +2 -1
  18. package/dist/ingest/job-worker.d.ts.map +1 -1
  19. package/dist/ingest/job-worker.js +397 -71
  20. package/dist/ingest/job-worker.js.map +1 -1
  21. package/dist/ingest/pdf.d.ts +1 -0
  22. package/dist/ingest/pdf.d.ts.map +1 -1
  23. package/dist/ingest/pdf.js +11 -0
  24. package/dist/ingest/pdf.js.map +1 -1
  25. package/dist/meta/metadata.json +1 -1
  26. package/dist/report/html.d.ts.map +1 -1
  27. package/dist/report/html.js +17 -34
  28. package/dist/report/html.js.map +1 -1
  29. package/dist/skills/index.d.ts.map +1 -1
  30. package/dist/skills/index.js +12 -4
  31. package/dist/skills/index.js.map +1 -1
  32. package/dist/vision/geotech-document.d.ts +4 -0
  33. package/dist/vision/geotech-document.d.ts.map +1 -1
  34. package/dist/vision/geotech-document.js +74 -17
  35. package/dist/vision/geotech-document.js.map +1 -1
  36. package/dist/vision/index.d.ts.map +1 -1
  37. package/dist/vision/index.js +47 -10
  38. package/dist/vision/index.js.map +1 -1
  39. package/dist/vision/ocr.d.ts +1 -0
  40. package/dist/vision/ocr.d.ts.map +1 -1
  41. package/dist/vision/ocr.js +5 -1
  42. package/dist/vision/ocr.js.map +1 -1
  43. package/package.json +1 -1
@@ -1,11 +1,11 @@
1
1
  import { basename, join } from 'node:path';
2
2
  import { homedir } from 'node:os';
3
3
  import { buildLLMConfig } from '../config/index.js';
4
- import { readDocumentPdfPageInputs } from './document-inputs.js';
5
- import { inspectPdfDocument } from './pdf.js';
4
+ import { countDocumentPdfPages, readDocumentPdfPageInputs } from './document-inputs.js';
5
+ import { inferPdfDocumentPageCountFallback, inspectPdfDocument, } from './pdf.js';
6
6
  import { ingestBoreholeLogDocument, summarizeBoreholeIngestInspection, } from './geotech-extract.js';
7
7
  import { buildPreflightLowYieldInsight, ingestGeotechDocument, inferPreflightLowYieldPageRole, } from './geotech-document.js';
8
- import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
8
+ import { extractGeotechDocumentDeterministicFactsFromText, extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
9
9
  import { interpretBoreholeLogWithContext, transcribeDocumentImageText, } from '../vision/index.js';
10
10
  import { recoverDocumentTextHint } from '../vision/ocr.js';
11
11
  import { persistBoreholeIngestReview } from './review-store.js';
@@ -13,6 +13,8 @@ import { buildPersistedIngestJobSegments, createPersistedIngestJob, loadPersiste
13
13
  import { HOSTED_BETA_EFFECTIVE_PAGE_LIMIT, slicePdfInspectionToRange, writePdfPageSubset, } from './segmentation.js';
14
14
  const SLOW_VISUAL_ERROR_PATTERNS = [
15
15
  /timeout/i,
16
+ /\b524\b/i,
17
+ /upstream(?: request)? (?:timed out|timeout|failed)/i,
16
18
  /provider is busy/i,
17
19
  /returned no content/i,
18
20
  /did not contain assistant text/i,
@@ -22,6 +24,15 @@ const SLOW_VISUAL_ERROR_PATTERNS = [
22
24
  /\b503\b/i,
23
25
  /\b504\b/i,
24
26
  ];
27
+ const RETRYABLE_UPSTREAM_ERROR_PATTERNS = [
28
+ /timeout/i,
29
+ /\b524\b/i,
30
+ /upstream(?: request)? (?:timed out|timeout|failed)/i,
31
+ /provider is busy/i,
32
+ /temporarily unavailable/i,
33
+ /\b503\b/i,
34
+ /\b504\b/i,
35
+ ];
25
36
  const FATAL_PROVIDER_STOP_PATTERNS = [
26
37
  /daily limit reached/i,
27
38
  /remaining today:\s*0/i,
@@ -30,6 +41,8 @@ const FATAL_PROVIDER_STOP_PATTERNS = [
30
41
  /rate limit/i,
31
42
  /\b429\b/i,
32
43
  ];
44
+ const VISUAL_TAIL_RUN_MIN_PAGES = 3;
45
+ const VISUAL_TAIL_RUN_SLOW_FAILURE_THRESHOLD = 2;
33
46
  function nowIso(now) {
34
47
  return (now ?? (() => new Date()))().toISOString();
35
48
  }
@@ -458,12 +471,123 @@ function advanceBoreholeProcessingState(state, result, pageTextHint, overrideBor
458
471
  return next;
459
472
  }
460
473
  function isSlowVisualPageError(message, classification, sourceKind) {
461
- const looksVisual = sourceKind === 'raster-image' || classification === 'image-only' || classification === 'text-unreadable';
474
+ const looksVisual = sourceKind === 'raster-image' || isVisualRunClassification(classification);
462
475
  return looksVisual && SLOW_VISUAL_ERROR_PATTERNS.some((pattern) => pattern.test(message));
463
476
  }
477
+ function isVisualRunClassification(classification) {
478
+ return classification === 'image-only' || classification === 'text-unreadable' || classification === 'graphics-only';
479
+ }
480
+ function isVisualRunCheckpoint(page) {
481
+ return page.sourceKind === 'raster-image' || isVisualRunClassification(page.classification);
482
+ }
483
+ function getInspectionLeadText(inspectionPage) {
484
+ if (!inspectionPage) {
485
+ return '';
486
+ }
487
+ return uniqueStrings([
488
+ ...(inspectionPage.normalizedArtifact?.headingHints ?? []),
489
+ inspectionPage.normalizedArtifact?.nativeText,
490
+ inspectionPage.normalizedText,
491
+ ])
492
+ .join('\n')
493
+ .replace(/\s+/g, ' ')
494
+ .trim()
495
+ .toLowerCase();
496
+ }
497
+ function hasBoreholeAppendixCue(text) {
498
+ return /\b(record of boreholes?|borehole logs?|borehole records?|test pits?|cpt|cone penetration|standard penetration test|spt)\b/i.test(text);
499
+ }
500
+ function hasVisualAppendixCue(text) {
501
+ return (/\bappendix\b/i.test(text)
502
+ && /\b(figures?|plates?|photos?|photographs?|drawings?|plans?|sketches?|site layout|certificates?|laboratory)\b/i.test(text)
503
+ && !hasBoreholeAppendixCue(text));
504
+ }
505
+ function findConsecutiveVisualRun(job, pageNumber) {
506
+ const pages = [...job.checkpoints.pages].sort((left, right) => left.pageNumber - right.pageNumber);
507
+ const centerIndex = pages.findIndex((page) => page.pageNumber === pageNumber);
508
+ if (centerIndex < 0 || !isVisualRunCheckpoint(pages[centerIndex])) {
509
+ return [];
510
+ }
511
+ let startIndex = centerIndex;
512
+ while (startIndex > 0 && isVisualRunCheckpoint(pages[startIndex - 1])) {
513
+ startIndex -= 1;
514
+ }
515
+ let endIndex = centerIndex;
516
+ while (endIndex + 1 < pages.length && isVisualRunCheckpoint(pages[endIndex + 1])) {
517
+ endIndex += 1;
518
+ }
519
+ return pages.slice(startIndex, endIndex + 1);
520
+ }
521
+ function hasNearbyVisualAppendixCue(job, runStartPage) {
522
+ if (!job.inspection) {
523
+ return false;
524
+ }
525
+ const firstCandidatePage = Math.max(1, runStartPage - 3);
526
+ for (let pageNumber = firstCandidatePage; pageNumber <= runStartPage; pageNumber += 1) {
527
+ const inspectionPage = job.inspection.pages.find((page) => page.pageNumber === pageNumber);
528
+ if (hasVisualAppendixCue(getInspectionLeadText(inspectionPage))) {
529
+ return true;
530
+ }
531
+ }
532
+ return false;
533
+ }
534
+ function isTailOrAppendixVisualRun(job, run) {
535
+ if (run.length < VISUAL_TAIL_RUN_MIN_PAGES) {
536
+ return false;
537
+ }
538
+ const startPage = run[0].pageNumber;
539
+ const endPage = run[run.length - 1].pageNumber;
540
+ const totalPages = Math.max(job.source.totalPages, job.inspection?.totalPages ?? 0);
541
+ const touchesTail = totalPages > 0
542
+ && endPage >= totalPages
543
+ && startPage >= Math.max(2, Math.ceil(totalPages * 0.55));
544
+ return touchesTail || hasNearbyVisualAppendixCue(job, startPage);
545
+ }
546
+ function applyConsecutiveVisualRunDowngrades(job, triggerPageNumber, now) {
547
+ const run = findConsecutiveVisualRun(job, triggerPageNumber);
548
+ if (!isTailOrAppendixVisualRun(job, run)) {
549
+ return job;
550
+ }
551
+ const slowFailureCount = run.filter((page) => page.status === 'failed'
552
+ && page.downgraded
553
+ && isSlowVisualPageError(page.error ?? '', page.classification, page.sourceKind)).length;
554
+ if (slowFailureCount < VISUAL_TAIL_RUN_SLOW_FAILURE_THRESHOLD) {
555
+ return job;
556
+ }
557
+ const pendingPageNumbers = new Set(run
558
+ .filter((page) => page.status === 'pending' && page.pageNumber > triggerPageNumber)
559
+ .map((page) => page.pageNumber));
560
+ if (pendingPageNumbers.size === 0) {
561
+ return job;
562
+ }
563
+ const timestamp = nowIso(now);
564
+ return {
565
+ ...job,
566
+ updatedAt: timestamp,
567
+ checkpoints: {
568
+ pages: job.checkpoints.pages.map((page) => pendingPageNumbers.has(page.pageNumber)
569
+ ? {
570
+ ...page,
571
+ status: 'failed',
572
+ updatedAt: timestamp,
573
+ error: `Skipped page ${page.pageNumber} after ${slowFailureCount} slow visual failures in a consecutive image-only tail/appendix run.`,
574
+ downgraded: true,
575
+ }
576
+ : page),
577
+ },
578
+ };
579
+ }
580
+ function isRetryableUpstreamPageError(message) {
581
+ return RETRYABLE_UPSTREAM_ERROR_PATTERNS.some((pattern) => pattern.test(message));
582
+ }
464
583
  function isFatalProviderStopError(message) {
465
584
  return FATAL_PROVIDER_STOP_PATTERNS.some((pattern) => pattern.test(message));
466
585
  }
586
+ function isTextExtractionTimeoutError(message) {
587
+ return (/text extraction timed out/i.test(message)
588
+ || (/hosted beta request timed out|timed out after \d+s/i.test(message)
589
+ && !/\b524\b|upstream request failed/i.test(message)));
590
+ }
467
591
  function normalizeCheckpointErrorMessage(message) {
468
592
  let normalized = message.trim();
469
593
  for (let iteration = 0; iteration < 4; iteration += 1) {
@@ -475,6 +599,13 @@ function normalizeCheckpointErrorMessage(message) {
475
599
  }
476
600
  return normalized;
477
601
  }
602
+ async function waitForCheckpointRetryBackoff(attempt) {
603
+ const delayMs = Math.min(1000, 100 * Math.max(1, 2 ** Math.max(0, attempt - 1)));
604
+ await new Promise((resolvePromise) => {
605
+ const timer = setTimeout(resolvePromise, delayMs);
606
+ timer.unref?.();
607
+ });
608
+ }
478
609
  async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
479
610
  if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
480
611
  return promise;
@@ -489,14 +620,17 @@ async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
489
620
  }),
490
621
  ]);
491
622
  }
492
- function resolveWorkerPhaseTimeoutMs(config, input) {
623
+ function resolveWorkerPhaseTimeoutMs(config, input, options) {
493
624
  const baseTimeoutMs = Math.min(Math.max(config.timeout ?? 120000, 60000), 120000);
494
625
  const isHeavyVisualPage = input.sourceKind === 'raster-image'
495
626
  || input.classification === 'image-only'
496
627
  || input.classification === 'text-unreadable';
497
- return isHeavyVisualPage
628
+ const phaseTimeoutMs = isHeavyVisualPage
498
629
  ? Math.min(Math.max(baseTimeoutMs, 180000), 180000)
499
630
  : baseTimeoutMs;
631
+ return options?.cheapRetry
632
+ ? Math.min(phaseTimeoutMs, isHeavyVisualPage ? 60000 : 90000)
633
+ : phaseTimeoutMs;
500
634
  }
501
635
  function resolveWorkerTextExtractionTimeoutMs(baseTimeoutMs, textHint) {
502
636
  if (!textHint) {
@@ -555,6 +689,59 @@ function buildInspectionSummary(inspection) {
555
689
  ocrRecoveredPageCount: 0,
556
690
  };
557
691
  }
692
+ function countCheckpointOcrRecoveredPages(checkpoints) {
693
+ return checkpoints.filter((page) => page.ocrSource === 'local-ocr' || page.ocrSource === 'vision-ocr').length;
694
+ }
695
+ function applyCheckpointOcrRecoveredSummary(result, checkpoints) {
696
+ const recoveredPageCount = countCheckpointOcrRecoveredPages(checkpoints);
697
+ if (recoveredPageCount === 0 || !result.inspectionSummary) {
698
+ return result;
699
+ }
700
+ return {
701
+ ...result,
702
+ inspectionSummary: {
703
+ ...result.inspectionSummary,
704
+ ocrRecoveredPageCount: Math.max(result.inspectionSummary.ocrRecoveredPageCount, recoveredPageCount),
705
+ },
706
+ };
707
+ }
708
+ function buildFallbackWorkerCheckpoints(job, pageCount, timestamp) {
709
+ if (job.checkpoints.pages.length > 0) {
710
+ return job.checkpoints.pages;
711
+ }
712
+ const startPage = job.source.pageRange?.[0] ?? job.segmentation?.pageRange?.[0] ?? 1;
713
+ return Array.from({ length: pageCount }, (_, index) => ({
714
+ pageNumber: startPage + index,
715
+ classification: null,
716
+ sourceKind: 'pdf-page',
717
+ weight: 1,
718
+ status: 'pending',
719
+ attempts: 0,
720
+ updatedAt: timestamp,
721
+ }));
722
+ }
723
+ async function inferWorkerPdfPageCount(job) {
724
+ if (job.source.totalPages > 0) {
725
+ return job.source.totalPages;
726
+ }
727
+ try {
728
+ const fullPageCount = await countDocumentPdfPages(job.source.filePath);
729
+ return job.source.pageRange
730
+ ? Math.max(0, Math.min(fullPageCount, job.source.pageRange[1]) - job.source.pageRange[0] + 1)
731
+ : fullPageCount;
732
+ }
733
+ catch {
734
+ try {
735
+ const fullPageCount = inferPdfDocumentPageCountFallback(job.source.filePath);
736
+ return job.source.pageRange
737
+ ? Math.max(0, Math.min(fullPageCount, job.source.pageRange[1]) - job.source.pageRange[0] + 1)
738
+ : fullPageCount;
739
+ }
740
+ catch {
741
+ return 0;
742
+ }
743
+ }
744
+ }
558
745
  function summarizeReviewReasons(findings) {
559
746
  return uniqueStrings(findings
560
747
  .filter((finding) => finding.severity !== 'advisory')
@@ -595,7 +782,7 @@ function buildSyntheticBoreholeResult(job, inspection, now) {
595
782
  message: `${normalFailedPages.length} page(s) failed during ingest and should be reviewed.`,
596
783
  });
597
784
  }
598
- return {
785
+ return applyCheckpointOcrRecoveredSummary({
599
786
  kind: 'geotech-ingest-result',
600
787
  schemaVersion: 1,
601
788
  documentType: 'borehole-log',
@@ -625,7 +812,7 @@ function buildSyntheticBoreholeResult(job, inspection, now) {
625
812
  reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
626
813
  confidence: 0,
627
814
  canAutoProceed: false,
628
- };
815
+ }, job.checkpoints.pages);
629
816
  }
630
817
  function buildSyntheticGeotechDocumentResult(job, inspection, now) {
631
818
  const pageFailures = job.checkpoints.pages
@@ -662,7 +849,7 @@ function buildSyntheticGeotechDocumentResult(job, inspection, now) {
662
849
  message: `${normalFailedPages.length} page(s) failed during ingest and should be reviewed.`,
663
850
  });
664
851
  }
665
- return {
852
+ return applyCheckpointOcrRecoveredSummary({
666
853
  kind: 'geotech-ingest-result',
667
854
  schemaVersion: 1,
668
855
  documentType: 'geotech-document',
@@ -700,7 +887,7 @@ function buildSyntheticGeotechDocumentResult(job, inspection, now) {
700
887
  confidence: 0,
701
888
  reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
702
889
  canAutoProceed: false,
703
- };
890
+ }, job.checkpoints.pages);
704
891
  }
705
892
  function dedupeReviewFindings(reviewFindings) {
706
893
  return [
@@ -819,11 +1006,13 @@ function findCheckpoint(job, pageNumber) {
819
1006
  }
820
1007
  return checkpoint;
821
1008
  }
822
- async function processGeotechDocumentPage(job, pageInput, config, dependencies) {
1009
+ async function processGeotechDocumentPage(job, pageInput, config, dependencies, options = {}) {
823
1010
  const inspect = dependencies.inspectPdfDocument ?? inspectPdfDocument;
824
1011
  const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
825
1012
  const interpretation = dependencies.interpretGeotechDocumentPage ?? interpretGeotechDocumentPage;
826
1013
  const extractTextFacts = dependencies.extractGeotechDocumentFactsFromText ?? extractGeotechDocumentFactsFromText;
1014
+ const extractDeterministicFacts = dependencies.extractGeotechDocumentDeterministicFactsFromText
1015
+ ?? extractGeotechDocumentDeterministicFactsFromText;
827
1016
  const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
828
1017
  const inspectionPage = job.inspection?.pages[pageInput.pageNumber - 1] ?? inspect(job.source.filePath).pages[pageInput.pageNumber - 1];
829
1018
  const lowYieldRole = inferPreflightLowYieldPageRole({
@@ -854,6 +1043,8 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
854
1043
  const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
855
1044
  classification: inspectionPage?.classification,
856
1045
  sourceKind: pageInput.sourceKind,
1046
+ }, {
1047
+ cheapRetry: options.cheapRetry,
857
1048
  });
858
1049
  const phaseConfig = {
859
1050
  ...config,
@@ -862,7 +1053,9 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
862
1053
  let pageTextHint;
863
1054
  let ocrSource = 'none';
864
1055
  let ocrWarnings = [];
1056
+ let textRecoveryAttempted = false;
865
1057
  try {
1058
+ textRecoveryAttempted = true;
866
1059
  const recovery = await withWorkerPageTimeout(recoverTextHint({
867
1060
  existingTextHint: inspectionPage?.normalizedArtifact?.nativeText ?? inspectionPage?.normalizedText,
868
1061
  existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
@@ -871,6 +1064,7 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
871
1064
  config: phaseConfig,
872
1065
  pdfFilePath: job.source.filePath,
873
1066
  pdfPageNumber: pageInput.pageNumber,
1067
+ allowVisionOcr: !(options.cheapRetry && pageInput.sourceKind === 'raster-image'),
874
1068
  visionTranscribe: transcribe,
875
1069
  }), phaseTimeoutMs, `Page ${pageInput.pageNumber}: OCR/text recovery timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
876
1070
  pageTextHint = normalizeTextHint(recovery.textHint);
@@ -888,15 +1082,35 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
888
1082
  totalPages: pageInput.totalPages,
889
1083
  pageClassification: inspectionPage?.classification,
890
1084
  pageTextHint,
1085
+ textRecoveryAttempted: !pageTextHint && textRecoveryAttempted,
891
1086
  };
892
1087
  const extractionTimeoutMs = resolveWorkerTextExtractionTimeoutMs(phaseTimeoutMs, pageTextHint);
893
1088
  const extractionConfig = {
894
1089
  ...config,
895
1090
  timeout: extractionTimeoutMs,
896
1091
  };
897
- const result = pageTextHint
898
- ? await withWorkerPageTimeout(extractTextFacts(pageTextHint, extractionConfig, context), extractionTimeoutMs, `Page ${pageInput.pageNumber}: text extraction timed out after ${Math.round(extractionTimeoutMs / 1000)}s`)
899
- : await withWorkerPageTimeout(interpretation(pageInput.base64, pageInput.mimeType, phaseConfig, context), phaseTimeoutMs, `Page ${pageInput.pageNumber}: visual page interpretation timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
1092
+ let result;
1093
+ if (pageTextHint) {
1094
+ try {
1095
+ result = await withWorkerPageTimeout(extractTextFacts(pageTextHint, extractionConfig, context), extractionTimeoutMs, `Page ${pageInput.pageNumber}: text extraction timed out after ${Math.round(extractionTimeoutMs / 1000)}s`);
1096
+ }
1097
+ catch (error) {
1098
+ const message = normalizeCheckpointErrorMessage(error instanceof Error ? error.message : String(error));
1099
+ const deterministic = isTextExtractionTimeoutError(message)
1100
+ ? extractDeterministicFacts(pageTextHint, context, {
1101
+ forcePartial: true,
1102
+ warning: `Text extraction timed out (${message}); used deterministic partial extraction instead.`,
1103
+ })
1104
+ : null;
1105
+ if (!deterministic) {
1106
+ throw error;
1107
+ }
1108
+ result = deterministic;
1109
+ }
1110
+ }
1111
+ else {
1112
+ result = await withWorkerPageTimeout(interpretation(pageInput.base64, pageInput.mimeType, phaseConfig, context), phaseTimeoutMs, `Page ${pageInput.pageNumber}: visual page interpretation timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
1113
+ }
900
1114
  return {
901
1115
  result,
902
1116
  ocrTextHint: pageTextHint,
@@ -904,7 +1118,7 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
904
1118
  ocrWarnings,
905
1119
  };
906
1120
  }
907
- async function processBoreholePage(job, pageInput, config, state, dependencies) {
1121
+ async function processBoreholePage(job, pageInput, config, state, dependencies, options = {}) {
908
1122
  const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
909
1123
  const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
910
1124
  const interpret = dependencies.interpretBoreholeLogWithContext ?? interpretBoreholeLogWithContext;
@@ -912,6 +1126,8 @@ async function processBoreholePage(job, pageInput, config, state, dependencies)
912
1126
  const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
913
1127
  classification: inspectionPage?.classification,
914
1128
  sourceKind: pageInput.sourceKind,
1129
+ }, {
1130
+ cheapRetry: options.cheapRetry,
915
1131
  });
916
1132
  const phaseConfig = {
917
1133
  ...config,
@@ -926,6 +1142,7 @@ async function processBoreholePage(job, pageInput, config, state, dependencies)
926
1142
  config: phaseConfig,
927
1143
  pdfFilePath: job.source.filePath,
928
1144
  pdfPageNumber: pageInput.pageNumber,
1145
+ allowVisionOcr: !(options.cheapRetry && pageInput.sourceKind === 'raster-image'),
929
1146
  visionTranscribe: transcribe,
930
1147
  }), phaseTimeoutMs, `Page ${pageInput.pageNumber}: OCR/text recovery timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
931
1148
  if (recovery.textHint) {
@@ -990,7 +1207,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
990
1207
  },
991
1208
  now: dependencies.now,
992
1209
  });
993
- return applyBoreholeFailureDowngrades(result, job.checkpoints.pages);
1210
+ return applyCheckpointOcrRecoveredSummary(applyBoreholeFailureDowngrades(result, job.checkpoints.pages), job.checkpoints.pages);
994
1211
  }
995
1212
  catch (error) {
996
1213
  const message = error instanceof Error ? error.message : String(error);
@@ -1043,7 +1260,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
1043
1260
  },
1044
1261
  now: dependencies.now,
1045
1262
  });
1046
- return applyGeotechFailureDowngrades(result, job.checkpoints.pages);
1263
+ return applyCheckpointOcrRecoveredSummary(applyGeotechFailureDowngrades(result, job.checkpoints.pages), job.checkpoints.pages);
1047
1264
  }
1048
1265
  catch (error) {
1049
1266
  const message = error instanceof Error ? error.message : String(error);
@@ -1096,30 +1313,56 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1096
1313
  ? currentJob.inspection
1097
1314
  : inspect(currentJob.source.filePath);
1098
1315
  if (!currentJob.inspection || currentJob.inspection.totalPages === 0) {
1099
- await mutateJob((job) => ({
1100
- ...job,
1101
- inspection,
1102
- source: {
1103
- ...job.source,
1104
- totalPages: inspection.totalPages,
1105
- weightedPageCost: inspection.pages.reduce((sum, page) => sum + (page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1), 0),
1106
- },
1107
- processing: {
1108
- ...job.processing,
1109
- chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, inspection),
1110
- },
1111
- checkpoints: {
1112
- pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
1113
- pageNumber: page.pageNumber,
1114
- classification: page.classification,
1115
- sourceKind: mapPageSourceKind(page.classification),
1116
- weight: page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1,
1117
- status: 'pending',
1118
- attempts: 0,
1119
- updatedAt: nowIso(dependencies.now),
1120
- })),
1121
- },
1122
- }));
1316
+ if (inspection.totalPages > 0) {
1317
+ await mutateJob((job) => ({
1318
+ ...job,
1319
+ inspection,
1320
+ source: {
1321
+ ...job.source,
1322
+ totalPages: inspection.totalPages,
1323
+ weightedPageCost: inspection.pages.reduce((sum, page) => sum + (page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1), 0),
1324
+ },
1325
+ processing: {
1326
+ ...job.processing,
1327
+ chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, inspection, job.segmentation),
1328
+ },
1329
+ checkpoints: {
1330
+ pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
1331
+ pageNumber: page.pageNumber,
1332
+ classification: page.classification,
1333
+ sourceKind: mapPageSourceKind(page.classification),
1334
+ weight: page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1,
1335
+ status: 'pending',
1336
+ attempts: 0,
1337
+ updatedAt: nowIso(dependencies.now),
1338
+ })),
1339
+ },
1340
+ }));
1341
+ }
1342
+ else {
1343
+ const inferredPageCount = await inferWorkerPdfPageCount(currentJob);
1344
+ if (inferredPageCount > 0) {
1345
+ await mutateJob((job) => {
1346
+ const timestamp = nowIso(dependencies.now);
1347
+ return {
1348
+ ...job,
1349
+ inspection: null,
1350
+ source: {
1351
+ ...job.source,
1352
+ totalPages: Math.max(job.source.totalPages, inferredPageCount),
1353
+ weightedPageCost: Math.max(job.source.weightedPageCost, inferredPageCount),
1354
+ },
1355
+ processing: {
1356
+ ...job.processing,
1357
+ chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, null, job.segmentation),
1358
+ },
1359
+ checkpoints: {
1360
+ pages: buildFallbackWorkerCheckpoints(job, inferredPageCount, timestamp),
1361
+ },
1362
+ };
1363
+ });
1364
+ }
1365
+ }
1123
1366
  }
1124
1367
  currentJob = loadPersistedIngestJob(jobId) ?? currentJob;
1125
1368
  const config = buildJobConfig(currentJob, dependencies);
@@ -1138,6 +1381,9 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1138
1381
  if (isCancelled(jobId)) {
1139
1382
  return;
1140
1383
  }
1384
+ if (findCheckpoint(currentJob, page.pageNumber).status !== 'pending') {
1385
+ return;
1386
+ }
1141
1387
  if (fatalProviderStopMessage) {
1142
1388
  await mutateJob((job) => ({
1143
1389
  ...job,
@@ -1177,8 +1423,47 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1177
1423
  : checkpoint),
1178
1424
  },
1179
1425
  }));
1180
- try {
1181
- const processed = await processGeotechDocumentPage(currentJob, page, config, dependencies);
1426
+ let processed = null;
1427
+ let finalErrorMessage = '';
1428
+ for (let attemptIndex = 0; attemptIndex < 2; attemptIndex += 1) {
1429
+ try {
1430
+ processed = await processGeotechDocumentPage(currentJob, page, config, dependencies, {
1431
+ cheapRetry: attemptIndex > 0,
1432
+ });
1433
+ break;
1434
+ }
1435
+ catch (error) {
1436
+ const message = error instanceof Error ? error.message : String(error);
1437
+ const normalizedMessage = normalizeCheckpointErrorMessage(message);
1438
+ if (attemptIndex === 0
1439
+ && isRetryableUpstreamPageError(normalizedMessage)
1440
+ && !isFatalProviderStopError(normalizedMessage)) {
1441
+ await waitForCheckpointRetryBackoff(findCheckpoint(currentJob, page.pageNumber).attempts);
1442
+ await mutateJob((job) => ({
1443
+ ...job,
1444
+ updatedAt: nowIso(dependencies.now),
1445
+ execution: {
1446
+ ...job.execution,
1447
+ lastHeartbeatAt: nowIso(dependencies.now),
1448
+ },
1449
+ checkpoints: {
1450
+ pages: job.checkpoints.pages.map((checkpoint) => checkpoint.pageNumber === page.pageNumber
1451
+ ? {
1452
+ ...checkpoint,
1453
+ attempts: checkpoint.attempts + 1,
1454
+ updatedAt: nowIso(dependencies.now),
1455
+ error: `retrying after upstream timeout: ${normalizedMessage}`,
1456
+ }
1457
+ : checkpoint),
1458
+ },
1459
+ }));
1460
+ continue;
1461
+ }
1462
+ finalErrorMessage = normalizedMessage;
1463
+ break;
1464
+ }
1465
+ }
1466
+ if (processed) {
1182
1467
  processedNewPages += 1;
1183
1468
  await mutateJob((job) => ({
1184
1469
  ...job,
@@ -1205,32 +1490,35 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1205
1490
  },
1206
1491
  }));
1207
1492
  }
1208
- catch (error) {
1209
- const message = error instanceof Error ? error.message : String(error);
1210
- const normalizedMessage = normalizeCheckpointErrorMessage(message);
1493
+ else {
1494
+ const normalizedMessage = finalErrorMessage || `Page ${page.pageNumber} failed during async ingest.`;
1211
1495
  const checkpoint = findCheckpoint(currentJob, page.pageNumber);
1212
1496
  if (!fatalProviderStopMessage && isFatalProviderStopError(normalizedMessage)) {
1213
1497
  fatalProviderStopMessage = normalizedMessage;
1214
1498
  }
1215
- await mutateJob((job) => ({
1216
- ...job,
1217
- updatedAt: nowIso(dependencies.now),
1218
- execution: {
1219
- ...job.execution,
1220
- lastHeartbeatAt: nowIso(dependencies.now),
1221
- },
1222
- checkpoints: {
1223
- pages: job.checkpoints.pages.map((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber
1224
- ? {
1225
- ...pageCheckpoint,
1226
- status: 'failed',
1227
- updatedAt: nowIso(dependencies.now),
1228
- error: normalizedMessage,
1229
- downgraded: isSlowVisualPageError(normalizedMessage, checkpoint.classification, checkpoint.sourceKind),
1230
- }
1231
- : pageCheckpoint),
1232
- },
1233
- }));
1499
+ await mutateJob((job) => {
1500
+ const timestamp = nowIso(dependencies.now);
1501
+ const failedJob = {
1502
+ ...job,
1503
+ updatedAt: timestamp,
1504
+ execution: {
1505
+ ...job.execution,
1506
+ lastHeartbeatAt: timestamp,
1507
+ },
1508
+ checkpoints: {
1509
+ pages: job.checkpoints.pages.map((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber
1510
+ ? {
1511
+ ...pageCheckpoint,
1512
+ status: 'failed',
1513
+ updatedAt: timestamp,
1514
+ error: normalizedMessage,
1515
+ downgraded: isSlowVisualPageError(normalizedMessage, checkpoint.classification, checkpoint.sourceKind),
1516
+ }
1517
+ : pageCheckpoint),
1518
+ },
1519
+ };
1520
+ return applyConsecutiveVisualRunDowngrades(failedJob, page.pageNumber, dependencies.now);
1521
+ });
1234
1522
  }
1235
1523
  });
1236
1524
  if (fatalProviderStopMessage) {
@@ -1292,8 +1580,47 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1292
1580
  : pageCheckpoint),
1293
1581
  },
1294
1582
  }));
1295
- try {
1296
- const processed = await processBoreholePage(currentJob, page, config, state, dependencies);
1583
+ let processed = null;
1584
+ let finalErrorMessage = '';
1585
+ for (let attemptIndex = 0; attemptIndex < 2; attemptIndex += 1) {
1586
+ try {
1587
+ processed = await processBoreholePage(currentJob, page, config, state, dependencies, {
1588
+ cheapRetry: attemptIndex > 0,
1589
+ });
1590
+ break;
1591
+ }
1592
+ catch (error) {
1593
+ const message = error instanceof Error ? error.message : String(error);
1594
+ const normalizedMessage = normalizeCheckpointErrorMessage(message);
1595
+ if (attemptIndex === 0
1596
+ && isRetryableUpstreamPageError(normalizedMessage)
1597
+ && !isFatalProviderStopError(normalizedMessage)) {
1598
+ await waitForCheckpointRetryBackoff(findCheckpoint(currentJob, page.pageNumber).attempts);
1599
+ await mutateJob((job) => ({
1600
+ ...job,
1601
+ updatedAt: nowIso(dependencies.now),
1602
+ execution: {
1603
+ ...job.execution,
1604
+ lastHeartbeatAt: nowIso(dependencies.now),
1605
+ },
1606
+ checkpoints: {
1607
+ pages: job.checkpoints.pages.map((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber
1608
+ ? {
1609
+ ...pageCheckpoint,
1610
+ attempts: pageCheckpoint.attempts + 1,
1611
+ updatedAt: nowIso(dependencies.now),
1612
+ error: `retrying after upstream timeout: ${normalizedMessage}`,
1613
+ }
1614
+ : pageCheckpoint),
1615
+ },
1616
+ }));
1617
+ continue;
1618
+ }
1619
+ finalErrorMessage = normalizedMessage;
1620
+ break;
1621
+ }
1622
+ }
1623
+ if (processed) {
1297
1624
  processedNewPages += 1;
1298
1625
  state = processed.nextState;
1299
1626
  await mutateJob((job) => ({
@@ -1321,9 +1648,8 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1321
1648
  },
1322
1649
  }));
1323
1650
  }
1324
- catch (error) {
1325
- const message = error instanceof Error ? error.message : String(error);
1326
- const normalizedMessage = normalizeCheckpointErrorMessage(message);
1651
+ else {
1652
+ const normalizedMessage = finalErrorMessage || `Page ${page.pageNumber} failed during async ingest.`;
1327
1653
  await mutateJob((job) => ({
1328
1654
  ...job,
1329
1655
  updatedAt: nowIso(dependencies.now),
@@ -1357,7 +1683,7 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
1357
1683
  ...pageCheckpoint,
1358
1684
  status: 'failed',
1359
1685
  updatedAt: nowIso(dependencies.now),
1360
- error: `skipped after upstream provider stop. ${normalizeCheckpointErrorMessage(message)}`,
1686
+ error: `skipped after upstream provider stop. ${normalizedMessage}`,
1361
1687
  downgraded: false,
1362
1688
  }
1363
1689
  : pageCheckpoint),