@geotechcli/core 0.4.25 → 0.4.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/sandbox.d.ts.map +1 -1
- package/dist/agents/sandbox.js +13 -8
- package/dist/agents/sandbox.js.map +1 -1
- package/dist/agents/swarm.d.ts.map +1 -1
- package/dist/agents/swarm.js +21 -4
- package/dist/agents/swarm.js.map +1 -1
- package/dist/ingest/geotech-document.d.ts.map +1 -1
- package/dist/ingest/geotech-document.js +62 -3
- package/dist/ingest/geotech-document.js.map +1 -1
- package/dist/ingest/geotech-extract.d.ts.map +1 -1
- package/dist/ingest/geotech-extract.js +29 -1
- package/dist/ingest/geotech-extract.js.map +1 -1
- package/dist/ingest/job-store.d.ts +3 -2
- package/dist/ingest/job-store.d.ts.map +1 -1
- package/dist/ingest/job-store.js +77 -10
- package/dist/ingest/job-store.js.map +1 -1
- package/dist/ingest/job-worker.d.ts +2 -1
- package/dist/ingest/job-worker.d.ts.map +1 -1
- package/dist/ingest/job-worker.js +397 -71
- package/dist/ingest/job-worker.js.map +1 -1
- package/dist/ingest/pdf.d.ts +1 -0
- package/dist/ingest/pdf.d.ts.map +1 -1
- package/dist/ingest/pdf.js +11 -0
- package/dist/ingest/pdf.js.map +1 -1
- package/dist/meta/metadata.json +1 -1
- package/dist/report/html.d.ts.map +1 -1
- package/dist/report/html.js +17 -34
- package/dist/report/html.js.map +1 -1
- package/dist/skills/index.d.ts.map +1 -1
- package/dist/skills/index.js +12 -4
- package/dist/skills/index.js.map +1 -1
- package/dist/vision/geotech-document.d.ts +4 -0
- package/dist/vision/geotech-document.d.ts.map +1 -1
- package/dist/vision/geotech-document.js +74 -17
- package/dist/vision/geotech-document.js.map +1 -1
- package/dist/vision/index.d.ts.map +1 -1
- package/dist/vision/index.js +47 -10
- package/dist/vision/index.js.map +1 -1
- package/dist/vision/ocr.d.ts +1 -0
- package/dist/vision/ocr.d.ts.map +1 -1
- package/dist/vision/ocr.js +5 -1
- package/dist/vision/ocr.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { basename, join } from 'node:path';
|
|
2
2
|
import { homedir } from 'node:os';
|
|
3
3
|
import { buildLLMConfig } from '../config/index.js';
|
|
4
|
-
import { readDocumentPdfPageInputs } from './document-inputs.js';
|
|
5
|
-
import { inspectPdfDocument } from './pdf.js';
|
|
4
|
+
import { countDocumentPdfPages, readDocumentPdfPageInputs } from './document-inputs.js';
|
|
5
|
+
import { inferPdfDocumentPageCountFallback, inspectPdfDocument, } from './pdf.js';
|
|
6
6
|
import { ingestBoreholeLogDocument, summarizeBoreholeIngestInspection, } from './geotech-extract.js';
|
|
7
7
|
import { buildPreflightLowYieldInsight, ingestGeotechDocument, inferPreflightLowYieldPageRole, } from './geotech-document.js';
|
|
8
|
-
import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
|
|
8
|
+
import { extractGeotechDocumentDeterministicFactsFromText, extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
|
|
9
9
|
import { interpretBoreholeLogWithContext, transcribeDocumentImageText, } from '../vision/index.js';
|
|
10
10
|
import { recoverDocumentTextHint } from '../vision/ocr.js';
|
|
11
11
|
import { persistBoreholeIngestReview } from './review-store.js';
|
|
@@ -13,6 +13,8 @@ import { buildPersistedIngestJobSegments, createPersistedIngestJob, loadPersiste
|
|
|
13
13
|
import { HOSTED_BETA_EFFECTIVE_PAGE_LIMIT, slicePdfInspectionToRange, writePdfPageSubset, } from './segmentation.js';
|
|
14
14
|
const SLOW_VISUAL_ERROR_PATTERNS = [
|
|
15
15
|
/timeout/i,
|
|
16
|
+
/\b524\b/i,
|
|
17
|
+
/upstream(?: request)? (?:timed out|timeout|failed)/i,
|
|
16
18
|
/provider is busy/i,
|
|
17
19
|
/returned no content/i,
|
|
18
20
|
/did not contain assistant text/i,
|
|
@@ -22,6 +24,15 @@ const SLOW_VISUAL_ERROR_PATTERNS = [
|
|
|
22
24
|
/\b503\b/i,
|
|
23
25
|
/\b504\b/i,
|
|
24
26
|
];
|
|
27
|
+
/**
 * Error-message signatures that isRetryableUpstreamPageError() treats as a
 * retryable upstream failure. Matching is case-insensitive; the numeric
 * entries are word-bounded so that e.g. "5240" does not match 524.
 */
const RETRYABLE_UPSTREAM_ERROR_PATTERNS = [
    // Generic and upstream-specific timeouts.
    /timeout/i,
    /\b524\b/i,
    /upstream(?: request)? (?:timed out|timeout|failed)/i,
    // Provider availability messages.
    /provider is busy/i,
    /temporarily unavailable/i,
    // Numeric status codes embedded in the message.
    /\b503\b/i,
    /\b504\b/i,
];
|
|
25
36
|
const FATAL_PROVIDER_STOP_PATTERNS = [
|
|
26
37
|
/daily limit reached/i,
|
|
27
38
|
/remaining today:\s*0/i,
|
|
@@ -30,6 +41,8 @@ const FATAL_PROVIDER_STOP_PATTERNS = [
|
|
|
30
41
|
/rate limit/i,
|
|
31
42
|
/\b429\b/i,
|
|
32
43
|
];
|
|
44
|
+
// Minimum length of a consecutive visual-page run before it can be treated as
// a tail/appendix run (see isTailOrAppendixVisualRun).
const VISUAL_TAIL_RUN_MIN_PAGES = 3;
// Number of downgraded slow-visual failures inside such a run that triggers
// skipping of the run's remaining pending pages.
const VISUAL_TAIL_RUN_SLOW_FAILURE_THRESHOLD = 2;
|
|
33
46
|
/**
 * Returns the current time as an ISO-8601 string.
 *
 * @param {(() => Date) | null | undefined} now - Optional clock factory; when
 *   nullish, a factory returning `new Date()` is used instead.
 * @returns {string} Result of `toISOString()` on the produced Date.
 */
function nowIso(now) {
    const clock = now ?? (() => new Date());
    return clock().toISOString();
}
|
|
@@ -458,12 +471,123 @@ function advanceBoreholeProcessingState(state, result, pageTextHint, overrideBor
|
|
|
458
471
|
return next;
|
|
459
472
|
}
|
|
460
473
|
/**
 * Tells whether a page failure looks like a slow visual-processing error.
 *
 * The page must be visual (a raster image source, or a classification accepted
 * by isVisualRunClassification) and the message must match at least one entry
 * of SLOW_VISUAL_ERROR_PATTERNS.
 *
 * @param {string} message - Error message to test.
 * @param {string | null | undefined} classification - Page classification.
 * @param {string | undefined} sourceKind - Page source kind.
 * @returns {boolean}
 */
function isSlowVisualPageError(message, classification, sourceKind) {
    if (sourceKind !== 'raster-image' && !isVisualRunClassification(classification)) {
        return false;
    }
    return SLOW_VISUAL_ERROR_PATTERNS.some((pattern) => pattern.test(message));
}
|
|
477
|
+
/**
 * Tells whether a page classification counts as "visual" for run detection.
 *
 * @param {unknown} classification - Page classification value.
 * @returns {boolean} True only for 'image-only', 'text-unreadable' or
 *   'graphics-only' (strict comparison).
 */
function isVisualRunClassification(classification) {
    switch (classification) {
        case 'image-only':
        case 'text-unreadable':
        case 'graphics-only':
            return true;
        default:
            return false;
    }
}
|
|
480
|
+
/**
 * Tells whether a page checkpoint belongs to a visual run: either its source
 * is a raster image or its classification is a visual one.
 *
 * @param {{ sourceKind?: string, classification?: string | null }} page
 * @returns {boolean}
 */
function isVisualRunCheckpoint(page) {
    if (page.sourceKind === 'raster-image') {
        return true;
    }
    return isVisualRunClassification(page.classification);
}
|
|
483
|
+
/**
 * Builds a single lower-cased, whitespace-collapsed string from an inspection
 * page's heading hints, native text and normalized text, for cue matching.
 *
 * @param {object | null | undefined} inspectionPage - Inspection page record.
 * @returns {string} '' when no page is given; otherwise the combined text.
 */
function getInspectionLeadText(inspectionPage) {
    if (!inspectionPage) {
        return '';
    }
    const artifact = inspectionPage.normalizedArtifact;
    const fragments = uniqueStrings([
        ...(artifact?.headingHints ?? []),
        artifact?.nativeText,
        inspectionPage.normalizedText,
    ]);
    // Join first, then collapse every whitespace run (including the joining
    // newlines) into a single space.
    const collapsed = fragments.join('\n').replace(/\s+/g, ' ');
    return collapsed.trim().toLowerCase();
}
|
|
497
|
+
/**
 * Tells whether the text mentions borehole / test-pit / CPT / SPT material.
 *
 * @param {string} text - Text to scan (case-insensitive, whole-word match).
 * @returns {boolean}
 */
function hasBoreholeAppendixCue(text) {
    const boreholeCuePattern = /\b(record of boreholes?|borehole logs?|borehole records?|test pits?|cpt|cone penetration|standard penetration test|spt)\b/i;
    return boreholeCuePattern.test(text);
}
|
|
500
|
+
/**
 * Tells whether the text reads like a visual appendix: it mentions
 * "appendix", names visual/supporting material (figures, plates, photos,
 * drawings, plans, sketches, site layout, certificates, laboratory), and does
 * not carry a borehole cue.
 *
 * @param {string} text - Text to scan (case-insensitive).
 * @returns {boolean}
 */
function hasVisualAppendixCue(text) {
    if (!/\bappendix\b/i.test(text)) {
        return false;
    }
    const mentionsVisualMaterial = /\b(figures?|plates?|photos?|photographs?|drawings?|plans?|sketches?|site layout|certificates?|laboratory)\b/i.test(text);
    if (!mentionsVisualMaterial) {
        return false;
    }
    // Borehole appendices are excluded even when they also mention figures.
    return !hasBoreholeAppendixCue(text);
}
|
|
505
|
+
/**
 * Finds the maximal run of visual checkpoints surrounding a given page.
 *
 * Checkpoints are copied and sorted by pageNumber; the run grows outwards
 * from the requested page while neighbouring entries are visual checkpoints.
 * Adjacency is by position in the sorted checkpoint list, not by page-number
 * contiguity.
 *
 * @param {{ checkpoints: { pages: Array<object> } }} job - Job with checkpoints.
 * @param {number} pageNumber - Page whose surrounding run is wanted.
 * @returns {Array<object>} The run in page order, or [] when the page has no
 *   checkpoint or is not itself visual.
 */
function findConsecutiveVisualRun(job, pageNumber) {
    const ordered = Array.from(job.checkpoints.pages).sort((a, b) => a.pageNumber - b.pageNumber);
    const anchor = ordered.findIndex((candidate) => candidate.pageNumber === pageNumber);
    if (anchor < 0 || !isVisualRunCheckpoint(ordered[anchor])) {
        return [];
    }
    let first = anchor;
    while (first >= 1 && isVisualRunCheckpoint(ordered[first - 1])) {
        first -= 1;
    }
    let last = anchor;
    while (last < ordered.length - 1 && isVisualRunCheckpoint(ordered[last + 1])) {
        last += 1;
    }
    return ordered.slice(first, last + 1);
}
|
|
521
|
+
/**
 * Looks for a visual-appendix cue on the run's first page or on up to three
 * pages before it.
 *
 * @param {{ inspection?: { pages: Array<object> } | null }} job - Job whose
 *   inspection pages are scanned.
 * @param {number} runStartPage - Page number at which the visual run starts.
 * @returns {boolean} False when the job has no inspection or no cue is found.
 */
function hasNearbyVisualAppendixCue(job, runStartPage) {
    const inspection = job.inspection;
    if (!inspection) {
        return false;
    }
    const windowStart = Math.max(1, runStartPage - 3);
    for (let candidate = windowStart; candidate <= runStartPage; candidate += 1) {
        const inspectionPage = inspection.pages.find((page) => page.pageNumber === candidate);
        const leadText = getInspectionLeadText(inspectionPage);
        if (hasVisualAppendixCue(leadText)) {
            return true;
        }
    }
    return false;
}
|
|
534
|
+
/**
 * Decides whether a visual run qualifies as a document tail or appendix run.
 *
 * A run qualifies when it has at least VISUAL_TAIL_RUN_MIN_PAGES pages and
 * either (a) reaches the last page of the document while starting at or after
 * max(2, ceil(55% of total pages)), or (b) a visual-appendix cue is found near
 * the start of the run.
 *
 * @param {object} job - Job providing source/inspection page totals.
 * @param {Array<{ pageNumber: number }>} run - Run in ascending page order.
 * @returns {boolean}
 */
function isTailOrAppendixVisualRun(job, run) {
    if (run.length < VISUAL_TAIL_RUN_MIN_PAGES) {
        return false;
    }
    const firstPage = run[0].pageNumber;
    const lastPage = run[run.length - 1].pageNumber;
    // Use whichever page total is larger: the job source's or the inspection's.
    const documentPages = Math.max(job.source.totalPages, job.inspection?.totalPages ?? 0);
    if (documentPages > 0
        && lastPage >= documentPages
        && firstPage >= Math.max(2, Math.ceil(documentPages * 0.55))) {
        return true;
    }
    return hasNearbyVisualAppendixCue(job, firstPage);
}
|
|
546
|
+
/**
 * After a page failure, marks the remaining pending pages of a slow
 * tail/appendix visual run as failed-and-downgraded so they are skipped.
 *
 * Nothing changes (the same job object is returned) unless all hold:
 *  - the run around the trigger page is a tail/appendix visual run;
 *  - it contains at least VISUAL_TAIL_RUN_SLOW_FAILURE_THRESHOLD pages that
 *    are failed, downgraded and carry a slow-visual error;
 *  - it still has pending pages after the trigger page.
 *
 * @param {object} job - Current job; not mutated.
 * @param {number} triggerPageNumber - Page whose failure prompted the check.
 * @param {(() => Date) | undefined} now - Optional clock passed to nowIso.
 * @returns {object} The original job, or a copy with updated checkpoints.
 */
function applyConsecutiveVisualRunDowngrades(job, triggerPageNumber, now) {
    const run = findConsecutiveVisualRun(job, triggerPageNumber);
    if (!isTailOrAppendixVisualRun(job, run)) {
        return job;
    }
    let slowFailureCount = 0;
    for (const page of run) {
        if (page.status === 'failed'
            && page.downgraded
            && isSlowVisualPageError(page.error ?? '', page.classification, page.sourceKind)) {
            slowFailureCount += 1;
        }
    }
    if (slowFailureCount < VISUAL_TAIL_RUN_SLOW_FAILURE_THRESHOLD) {
        return job;
    }
    // Only pages after the trigger that have not been attempted are skipped.
    const pendingPageNumbers = new Set();
    for (const page of run) {
        if (page.status === 'pending' && page.pageNumber > triggerPageNumber) {
            pendingPageNumbers.add(page.pageNumber);
        }
    }
    if (pendingPageNumbers.size === 0) {
        return job;
    }
    const timestamp = nowIso(now);
    const pages = job.checkpoints.pages.map((page) => {
        if (!pendingPageNumbers.has(page.pageNumber)) {
            return page;
        }
        return {
            ...page,
            status: 'failed',
            updatedAt: timestamp,
            error: `Skipped page ${page.pageNumber} after ${slowFailureCount} slow visual failures in a consecutive image-only tail/appendix run.`,
            downgraded: true,
        };
    });
    return {
        ...job,
        updatedAt: timestamp,
        checkpoints: { pages },
    };
}
|
|
580
|
+
/**
 * Tells whether an error message matches any retryable-upstream pattern.
 *
 * @param {string} message - Error message to test.
 * @returns {boolean}
 */
function isRetryableUpstreamPageError(message) {
    for (const pattern of RETRYABLE_UPSTREAM_ERROR_PATTERNS) {
        if (pattern.test(message)) {
            return true;
        }
    }
    return false;
}
|
|
464
583
|
/**
 * Tells whether an error message matches any fatal provider-stop pattern.
 *
 * @param {string} message - Error message to test.
 * @returns {boolean}
 */
function isFatalProviderStopError(message) {
    const matchIndex = FATAL_PROVIDER_STOP_PATTERNS.findIndex((pattern) => pattern.test(message));
    return matchIndex !== -1;
}
|
|
586
|
+
/**
 * Tells whether an error message describes a text-extraction timeout.
 *
 * An explicit "text extraction timed out" always counts. A generic
 * "hosted beta request timed out" / "timed out after <n>s" counts only when
 * the message does not also mention 524 or "upstream request failed".
 *
 * @param {string} message - Error message to test.
 * @returns {boolean}
 */
function isTextExtractionTimeoutError(message) {
    if (/text extraction timed out/i.test(message)) {
        return true;
    }
    if (!/hosted beta request timed out|timed out after \d+s/i.test(message)) {
        return false;
    }
    return !/\b524\b|upstream request failed/i.test(message);
}
|
|
467
591
|
function normalizeCheckpointErrorMessage(message) {
|
|
468
592
|
let normalized = message.trim();
|
|
469
593
|
for (let iteration = 0; iteration < 4; iteration += 1) {
|
|
@@ -475,6 +599,13 @@ function normalizeCheckpointErrorMessage(message) {
|
|
|
475
599
|
}
|
|
476
600
|
return normalized;
|
|
477
601
|
}
|
|
602
|
+
/**
 * Sleeps for an exponential backoff delay before a page retry.
 *
 * Delay is 100ms * 2^(attempt - 1), with the exponent floored at 0 and the
 * total capped at 1000ms (attempt 1 -> 100ms, 2 -> 200ms, ... 5+ -> 1000ms).
 *
 * @param {number} attempt - Attempt counter used to scale the delay.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
async function waitForCheckpointRetryBackoff(attempt) {
    const doublings = Math.max(0, attempt - 1);
    const multiplier = Math.max(1, 2 ** doublings);
    const delayMs = Math.min(1000, 100 * multiplier);
    await new Promise((resolvePromise) => {
        // The timer is unref'd where supported, so it does not by itself keep
        // the process alive.
        setTimeout(resolvePromise, delayMs).unref?.();
    });
}
|
|
478
609
|
async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
|
|
479
610
|
if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) {
|
|
480
611
|
return promise;
|
|
@@ -489,14 +620,17 @@ async function withWorkerPageTimeout(promise, timeoutMs, errorMessage) {
|
|
|
489
620
|
}),
|
|
490
621
|
]);
|
|
491
622
|
}
|
|
492
|
-
/**
 * Resolves the per-phase timeout (in milliseconds) for processing one page.
 *
 * - The base timeout is `config.timeout` clamped to [60000, 120000]; it
 *   defaults to 120000 when the setting is nullish or not a number.
 * - Heavy visual pages (raster-image source, or 'image-only' /
 *   'text-unreadable' classification) always get 180000.
 * - With `options.cheapRetry`, the result is capped at 60000 for heavy visual
 *   pages and 90000 otherwise.
 *
 * A non-numeric `config.timeout` previously produced NaN, which made
 * withWorkerPageTimeout skip its timeout guard entirely; the result is now
 * always a finite number.
 *
 * NOTE(review): 'graphics-only' is not treated as heavy here although
 * isVisualRunClassification accepts it — confirm this is intended.
 *
 * @param {{ timeout?: number | null }} config - Config carrying the timeout.
 * @param {{ sourceKind?: string, classification?: string | null }} input
 * @param {{ cheapRetry?: boolean } | undefined} [options]
 * @returns {number} Timeout in milliseconds.
 */
function resolveWorkerPhaseTimeoutMs(config, input, options) {
    const requestedTimeoutMs = Number(config.timeout ?? 120000);
    // Math.min/Math.max propagate NaN, so fall back to the default up front.
    const baseTimeoutMs = Number.isNaN(requestedTimeoutMs)
        ? 120000
        : Math.min(Math.max(requestedTimeoutMs, 60000), 120000);
    const isHeavyVisualPage = input.sourceKind === 'raster-image'
        || input.classification === 'image-only'
        || input.classification === 'text-unreadable';
    // The original clamp min(max(base, 180000), 180000) is always 180000.
    const phaseTimeoutMs = isHeavyVisualPage ? 180000 : baseTimeoutMs;
    if (!options?.cheapRetry) {
        return phaseTimeoutMs;
    }
    return Math.min(phaseTimeoutMs, isHeavyVisualPage ? 60000 : 90000);
}
|
|
501
635
|
function resolveWorkerTextExtractionTimeoutMs(baseTimeoutMs, textHint) {
|
|
502
636
|
if (!textHint) {
|
|
@@ -555,6 +689,59 @@ function buildInspectionSummary(inspection) {
|
|
|
555
689
|
ocrRecoveredPageCount: 0,
|
|
556
690
|
};
|
|
557
691
|
}
|
|
692
|
+
/**
 * Counts checkpoints whose text was recovered through OCR, i.e. whose
 * `ocrSource` is 'local-ocr' or 'vision-ocr'.
 *
 * @param {Array<{ ocrSource?: string }>} checkpoints - Page checkpoints.
 * @returns {number}
 */
function countCheckpointOcrRecoveredPages(checkpoints) {
    return checkpoints.filter((page) => page.ocrSource === 'local-ocr' || page.ocrSource === 'vision-ocr').length;
}
/**
 * Raises `result.inspectionSummary.ocrRecoveredPageCount` to at least the
 * number of OCR-recovered checkpoints.
 *
 * The input is returned untouched when no checkpoint was OCR-recovered or the
 * result has no inspection summary. Otherwise a shallow copy is returned; the
 * input object is never mutated.
 *
 * An existing count that is missing or not a finite number is treated as 0.
 * Math.max(undefined, n) would otherwise yield NaN, which serialises to null
 * in JSON.
 *
 * @param {{ inspectionSummary?: { ocrRecoveredPageCount?: number } | null }} result
 * @param {Array<{ ocrSource?: string }>} checkpoints - Page checkpoints.
 * @returns {object} `result` itself or an updated shallow copy.
 */
function applyCheckpointOcrRecoveredSummary(result, checkpoints) {
    const recoveredPageCount = countCheckpointOcrRecoveredPages(checkpoints);
    if (recoveredPageCount === 0 || !result.inspectionSummary) {
        return result;
    }
    const existingCount = result.inspectionSummary.ocrRecoveredPageCount;
    const safeExistingCount = Number.isFinite(existingCount) ? existingCount : 0;
    return {
        ...result,
        inspectionSummary: {
            ...result.inspectionSummary,
            ocrRecoveredPageCount: Math.max(safeExistingCount, recoveredPageCount),
        },
    };
}
|
|
708
|
+
/**
 * Supplies page checkpoints when no inspection is available.
 *
 * Existing checkpoints are returned as-is. Otherwise `pageCount` pending
 * placeholders are created, numbered consecutively from the first page of
 * `job.source.pageRange`, else `job.segmentation.pageRange`, else 1.
 *
 * @param {object} job - Job with `checkpoints.pages`, `source` and optional
 *   `segmentation`.
 * @param {number} pageCount - Number of placeholder checkpoints to create.
 * @param {string} timestamp - Value stored as each placeholder's `updatedAt`.
 * @returns {Array<object>} Existing or freshly built checkpoints.
 */
function buildFallbackWorkerCheckpoints(job, pageCount, timestamp) {
    const existingCheckpoints = job.checkpoints.pages;
    if (existingCheckpoints.length > 0) {
        return existingCheckpoints;
    }
    const firstPage = job.source.pageRange?.[0] ?? job.segmentation?.pageRange?.[0] ?? 1;
    const makePlaceholder = (offset) => ({
        pageNumber: firstPage + offset,
        classification: null,
        sourceKind: 'pdf-page',
        weight: 1,
        status: 'pending',
        attempts: 0,
        updatedAt: timestamp,
    });
    return Array.from({ length: pageCount }, (_unused, offset) => makePlaceholder(offset));
}
|
|
723
|
+
/**
 * Best-effort page count for a job whose inspection yielded no pages.
 *
 * Resolution order:
 *  1. `job.source.totalPages` when it is already positive;
 *  2. countDocumentPdfPages on the source file;
 *  3. inferPdfDocumentPageCountFallback on the source file;
 *  4. 0 when both counting strategies throw.
 *
 * For strategies 2 and 3 the count is limited to the pages covered by
 * `job.source.pageRange` when a range is set.
 *
 * @param {{ source: { totalPages: number, filePath: string, pageRange?: [number, number] } }} job
 * @returns {Promise<number>} Inferred page count (0 when unknown).
 */
async function inferWorkerPdfPageCount(job) {
    const knownTotal = job.source.totalPages;
    if (knownTotal > 0) {
        return knownTotal;
    }
    // Restrict a whole-document count to the job's page range, if any.
    const clampToPageRange = (fullPageCount) => {
        const range = job.source.pageRange;
        if (!range) {
            return fullPageCount;
        }
        return Math.max(0, Math.min(fullPageCount, range[1]) - range[0] + 1);
    };
    try {
        return clampToPageRange(await countDocumentPdfPages(job.source.filePath));
    }
    catch {
        // Primary counter failed; try the fallback before giving up.
        try {
            return clampToPageRange(inferPdfDocumentPageCountFallback(job.source.filePath));
        }
        catch {
            return 0;
        }
    }
}
|
|
558
745
|
function summarizeReviewReasons(findings) {
|
|
559
746
|
return uniqueStrings(findings
|
|
560
747
|
.filter((finding) => finding.severity !== 'advisory')
|
|
@@ -595,7 +782,7 @@ function buildSyntheticBoreholeResult(job, inspection, now) {
|
|
|
595
782
|
message: `${normalFailedPages.length} page(s) failed during ingest and should be reviewed.`,
|
|
596
783
|
});
|
|
597
784
|
}
|
|
598
|
-
return {
|
|
785
|
+
return applyCheckpointOcrRecoveredSummary({
|
|
599
786
|
kind: 'geotech-ingest-result',
|
|
600
787
|
schemaVersion: 1,
|
|
601
788
|
documentType: 'borehole-log',
|
|
@@ -625,7 +812,7 @@ function buildSyntheticBoreholeResult(job, inspection, now) {
|
|
|
625
812
|
reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
|
|
626
813
|
confidence: 0,
|
|
627
814
|
canAutoProceed: false,
|
|
628
|
-
};
|
|
815
|
+
}, job.checkpoints.pages);
|
|
629
816
|
}
|
|
630
817
|
function buildSyntheticGeotechDocumentResult(job, inspection, now) {
|
|
631
818
|
const pageFailures = job.checkpoints.pages
|
|
@@ -662,7 +849,7 @@ function buildSyntheticGeotechDocumentResult(job, inspection, now) {
|
|
|
662
849
|
message: `${normalFailedPages.length} page(s) failed during ingest and should be reviewed.`,
|
|
663
850
|
});
|
|
664
851
|
}
|
|
665
|
-
return {
|
|
852
|
+
return applyCheckpointOcrRecoveredSummary({
|
|
666
853
|
kind: 'geotech-ingest-result',
|
|
667
854
|
schemaVersion: 1,
|
|
668
855
|
documentType: 'geotech-document',
|
|
@@ -700,7 +887,7 @@ function buildSyntheticGeotechDocumentResult(job, inspection, now) {
|
|
|
700
887
|
confidence: 0,
|
|
701
888
|
reviewRequired: reviewFindings.some((finding) => finding.severity !== 'advisory'),
|
|
702
889
|
canAutoProceed: false,
|
|
703
|
-
};
|
|
890
|
+
}, job.checkpoints.pages);
|
|
704
891
|
}
|
|
705
892
|
function dedupeReviewFindings(reviewFindings) {
|
|
706
893
|
return [
|
|
@@ -819,11 +1006,13 @@ function findCheckpoint(job, pageNumber) {
|
|
|
819
1006
|
}
|
|
820
1007
|
return checkpoint;
|
|
821
1008
|
}
|
|
822
|
-
async function processGeotechDocumentPage(job, pageInput, config, dependencies) {
|
|
1009
|
+
async function processGeotechDocumentPage(job, pageInput, config, dependencies, options = {}) {
|
|
823
1010
|
const inspect = dependencies.inspectPdfDocument ?? inspectPdfDocument;
|
|
824
1011
|
const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
|
|
825
1012
|
const interpretation = dependencies.interpretGeotechDocumentPage ?? interpretGeotechDocumentPage;
|
|
826
1013
|
const extractTextFacts = dependencies.extractGeotechDocumentFactsFromText ?? extractGeotechDocumentFactsFromText;
|
|
1014
|
+
const extractDeterministicFacts = dependencies.extractGeotechDocumentDeterministicFactsFromText
|
|
1015
|
+
?? extractGeotechDocumentDeterministicFactsFromText;
|
|
827
1016
|
const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
|
|
828
1017
|
const inspectionPage = job.inspection?.pages[pageInput.pageNumber - 1] ?? inspect(job.source.filePath).pages[pageInput.pageNumber - 1];
|
|
829
1018
|
const lowYieldRole = inferPreflightLowYieldPageRole({
|
|
@@ -854,6 +1043,8 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
|
|
|
854
1043
|
const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
|
|
855
1044
|
classification: inspectionPage?.classification,
|
|
856
1045
|
sourceKind: pageInput.sourceKind,
|
|
1046
|
+
}, {
|
|
1047
|
+
cheapRetry: options.cheapRetry,
|
|
857
1048
|
});
|
|
858
1049
|
const phaseConfig = {
|
|
859
1050
|
...config,
|
|
@@ -862,7 +1053,9 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
|
|
|
862
1053
|
let pageTextHint;
|
|
863
1054
|
let ocrSource = 'none';
|
|
864
1055
|
let ocrWarnings = [];
|
|
1056
|
+
let textRecoveryAttempted = false;
|
|
865
1057
|
try {
|
|
1058
|
+
textRecoveryAttempted = true;
|
|
866
1059
|
const recovery = await withWorkerPageTimeout(recoverTextHint({
|
|
867
1060
|
existingTextHint: inspectionPage?.normalizedArtifact?.nativeText ?? inspectionPage?.normalizedText,
|
|
868
1061
|
existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
|
|
@@ -871,6 +1064,7 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
|
|
|
871
1064
|
config: phaseConfig,
|
|
872
1065
|
pdfFilePath: job.source.filePath,
|
|
873
1066
|
pdfPageNumber: pageInput.pageNumber,
|
|
1067
|
+
allowVisionOcr: !(options.cheapRetry && pageInput.sourceKind === 'raster-image'),
|
|
874
1068
|
visionTranscribe: transcribe,
|
|
875
1069
|
}), phaseTimeoutMs, `Page ${pageInput.pageNumber}: OCR/text recovery timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
|
|
876
1070
|
pageTextHint = normalizeTextHint(recovery.textHint);
|
|
@@ -888,15 +1082,35 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
|
|
|
888
1082
|
totalPages: pageInput.totalPages,
|
|
889
1083
|
pageClassification: inspectionPage?.classification,
|
|
890
1084
|
pageTextHint,
|
|
1085
|
+
textRecoveryAttempted: !pageTextHint && textRecoveryAttempted,
|
|
891
1086
|
};
|
|
892
1087
|
const extractionTimeoutMs = resolveWorkerTextExtractionTimeoutMs(phaseTimeoutMs, pageTextHint);
|
|
893
1088
|
const extractionConfig = {
|
|
894
1089
|
...config,
|
|
895
1090
|
timeout: extractionTimeoutMs,
|
|
896
1091
|
};
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
1092
|
+
let result;
|
|
1093
|
+
if (pageTextHint) {
|
|
1094
|
+
try {
|
|
1095
|
+
result = await withWorkerPageTimeout(extractTextFacts(pageTextHint, extractionConfig, context), extractionTimeoutMs, `Page ${pageInput.pageNumber}: text extraction timed out after ${Math.round(extractionTimeoutMs / 1000)}s`);
|
|
1096
|
+
}
|
|
1097
|
+
catch (error) {
|
|
1098
|
+
const message = normalizeCheckpointErrorMessage(error instanceof Error ? error.message : String(error));
|
|
1099
|
+
const deterministic = isTextExtractionTimeoutError(message)
|
|
1100
|
+
? extractDeterministicFacts(pageTextHint, context, {
|
|
1101
|
+
forcePartial: true,
|
|
1102
|
+
warning: `Text extraction timed out (${message}); used deterministic partial extraction instead.`,
|
|
1103
|
+
})
|
|
1104
|
+
: null;
|
|
1105
|
+
if (!deterministic) {
|
|
1106
|
+
throw error;
|
|
1107
|
+
}
|
|
1108
|
+
result = deterministic;
|
|
1109
|
+
}
|
|
1110
|
+
}
|
|
1111
|
+
else {
|
|
1112
|
+
result = await withWorkerPageTimeout(interpretation(pageInput.base64, pageInput.mimeType, phaseConfig, context), phaseTimeoutMs, `Page ${pageInput.pageNumber}: visual page interpretation timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
|
|
1113
|
+
}
|
|
900
1114
|
return {
|
|
901
1115
|
result,
|
|
902
1116
|
ocrTextHint: pageTextHint,
|
|
@@ -904,7 +1118,7 @@ async function processGeotechDocumentPage(job, pageInput, config, dependencies)
|
|
|
904
1118
|
ocrWarnings,
|
|
905
1119
|
};
|
|
906
1120
|
}
|
|
907
|
-
async function processBoreholePage(job, pageInput, config, state, dependencies) {
|
|
1121
|
+
async function processBoreholePage(job, pageInput, config, state, dependencies, options = {}) {
|
|
908
1122
|
const recoverTextHint = dependencies.recoverDocumentTextHint ?? recoverDocumentTextHint;
|
|
909
1123
|
const transcribe = dependencies.transcribeDocumentImageText ?? transcribeDocumentImageText;
|
|
910
1124
|
const interpret = dependencies.interpretBoreholeLogWithContext ?? interpretBoreholeLogWithContext;
|
|
@@ -912,6 +1126,8 @@ async function processBoreholePage(job, pageInput, config, state, dependencies)
|
|
|
912
1126
|
const phaseTimeoutMs = resolveWorkerPhaseTimeoutMs(config, {
|
|
913
1127
|
classification: inspectionPage?.classification,
|
|
914
1128
|
sourceKind: pageInput.sourceKind,
|
|
1129
|
+
}, {
|
|
1130
|
+
cheapRetry: options.cheapRetry,
|
|
915
1131
|
});
|
|
916
1132
|
const phaseConfig = {
|
|
917
1133
|
...config,
|
|
@@ -926,6 +1142,7 @@ async function processBoreholePage(job, pageInput, config, state, dependencies)
|
|
|
926
1142
|
config: phaseConfig,
|
|
927
1143
|
pdfFilePath: job.source.filePath,
|
|
928
1144
|
pdfPageNumber: pageInput.pageNumber,
|
|
1145
|
+
allowVisionOcr: !(options.cheapRetry && pageInput.sourceKind === 'raster-image'),
|
|
929
1146
|
visionTranscribe: transcribe,
|
|
930
1147
|
}), phaseTimeoutMs, `Page ${pageInput.pageNumber}: OCR/text recovery timed out after ${Math.round(phaseTimeoutMs / 1000)}s`);
|
|
931
1148
|
if (recovery.textHint) {
|
|
@@ -990,7 +1207,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
|
|
|
990
1207
|
},
|
|
991
1208
|
now: dependencies.now,
|
|
992
1209
|
});
|
|
993
|
-
return applyBoreholeFailureDowngrades(result, job.checkpoints.pages);
|
|
1210
|
+
return applyCheckpointOcrRecoveredSummary(applyBoreholeFailureDowngrades(result, job.checkpoints.pages), job.checkpoints.pages);
|
|
994
1211
|
}
|
|
995
1212
|
catch (error) {
|
|
996
1213
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1043,7 +1260,7 @@ async function finalizeJobResult(job, pageInputs, config, dependencies) {
|
|
|
1043
1260
|
},
|
|
1044
1261
|
now: dependencies.now,
|
|
1045
1262
|
});
|
|
1046
|
-
return applyGeotechFailureDowngrades(result, job.checkpoints.pages);
|
|
1263
|
+
return applyCheckpointOcrRecoveredSummary(applyGeotechFailureDowngrades(result, job.checkpoints.pages), job.checkpoints.pages);
|
|
1047
1264
|
}
|
|
1048
1265
|
catch (error) {
|
|
1049
1266
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1096,30 +1313,56 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1096
1313
|
? currentJob.inspection
|
|
1097
1314
|
: inspect(currentJob.source.filePath);
|
|
1098
1315
|
if (!currentJob.inspection || currentJob.inspection.totalPages === 0) {
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1316
|
+
if (inspection.totalPages > 0) {
|
|
1317
|
+
await mutateJob((job) => ({
|
|
1318
|
+
...job,
|
|
1319
|
+
inspection,
|
|
1320
|
+
source: {
|
|
1321
|
+
...job.source,
|
|
1322
|
+
totalPages: inspection.totalPages,
|
|
1323
|
+
weightedPageCost: inspection.pages.reduce((sum, page) => sum + (page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1), 0),
|
|
1324
|
+
},
|
|
1325
|
+
processing: {
|
|
1326
|
+
...job.processing,
|
|
1327
|
+
chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, inspection, job.segmentation),
|
|
1328
|
+
},
|
|
1329
|
+
checkpoints: {
|
|
1330
|
+
pages: inspection.pages.map((page) => job.checkpoints.pages.find((existing) => existing.pageNumber === page.pageNumber) ?? ({
|
|
1331
|
+
pageNumber: page.pageNumber,
|
|
1332
|
+
classification: page.classification,
|
|
1333
|
+
sourceKind: mapPageSourceKind(page.classification),
|
|
1334
|
+
weight: page.classification === 'image-only' || page.classification === 'text-unreadable' ? 2 : 1,
|
|
1335
|
+
status: 'pending',
|
|
1336
|
+
attempts: 0,
|
|
1337
|
+
updatedAt: nowIso(dependencies.now),
|
|
1338
|
+
})),
|
|
1339
|
+
},
|
|
1340
|
+
}));
|
|
1341
|
+
}
|
|
1342
|
+
else {
|
|
1343
|
+
const inferredPageCount = await inferWorkerPdfPageCount(currentJob);
|
|
1344
|
+
if (inferredPageCount > 0) {
|
|
1345
|
+
await mutateJob((job) => {
|
|
1346
|
+
const timestamp = nowIso(dependencies.now);
|
|
1347
|
+
return {
|
|
1348
|
+
...job,
|
|
1349
|
+
inspection: null,
|
|
1350
|
+
source: {
|
|
1351
|
+
...job.source,
|
|
1352
|
+
totalPages: Math.max(job.source.totalPages, inferredPageCount),
|
|
1353
|
+
weightedPageCost: Math.max(job.source.weightedPageCost, inferredPageCount),
|
|
1354
|
+
},
|
|
1355
|
+
processing: {
|
|
1356
|
+
...job.processing,
|
|
1357
|
+
chunkExtractionConcurrency: resolvePersistedIngestJobExtractionConcurrency(job.config, null, job.segmentation),
|
|
1358
|
+
},
|
|
1359
|
+
checkpoints: {
|
|
1360
|
+
pages: buildFallbackWorkerCheckpoints(job, inferredPageCount, timestamp),
|
|
1361
|
+
},
|
|
1362
|
+
};
|
|
1363
|
+
});
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1123
1366
|
}
|
|
1124
1367
|
currentJob = loadPersistedIngestJob(jobId) ?? currentJob;
|
|
1125
1368
|
const config = buildJobConfig(currentJob, dependencies);
|
|
@@ -1138,6 +1381,9 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1138
1381
|
if (isCancelled(jobId)) {
|
|
1139
1382
|
return;
|
|
1140
1383
|
}
|
|
1384
|
+
if (findCheckpoint(currentJob, page.pageNumber).status !== 'pending') {
|
|
1385
|
+
return;
|
|
1386
|
+
}
|
|
1141
1387
|
if (fatalProviderStopMessage) {
|
|
1142
1388
|
await mutateJob((job) => ({
|
|
1143
1389
|
...job,
|
|
@@ -1177,8 +1423,47 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1177
1423
|
: checkpoint),
|
|
1178
1424
|
},
|
|
1179
1425
|
}));
|
|
1180
|
-
|
|
1181
|
-
|
|
1426
|
+
let processed = null;
|
|
1427
|
+
let finalErrorMessage = '';
|
|
1428
|
+
for (let attemptIndex = 0; attemptIndex < 2; attemptIndex += 1) {
|
|
1429
|
+
try {
|
|
1430
|
+
processed = await processGeotechDocumentPage(currentJob, page, config, dependencies, {
|
|
1431
|
+
cheapRetry: attemptIndex > 0,
|
|
1432
|
+
});
|
|
1433
|
+
break;
|
|
1434
|
+
}
|
|
1435
|
+
catch (error) {
|
|
1436
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1437
|
+
const normalizedMessage = normalizeCheckpointErrorMessage(message);
|
|
1438
|
+
if (attemptIndex === 0
|
|
1439
|
+
&& isRetryableUpstreamPageError(normalizedMessage)
|
|
1440
|
+
&& !isFatalProviderStopError(normalizedMessage)) {
|
|
1441
|
+
await waitForCheckpointRetryBackoff(findCheckpoint(currentJob, page.pageNumber).attempts);
|
|
1442
|
+
await mutateJob((job) => ({
|
|
1443
|
+
...job,
|
|
1444
|
+
updatedAt: nowIso(dependencies.now),
|
|
1445
|
+
execution: {
|
|
1446
|
+
...job.execution,
|
|
1447
|
+
lastHeartbeatAt: nowIso(dependencies.now),
|
|
1448
|
+
},
|
|
1449
|
+
checkpoints: {
|
|
1450
|
+
pages: job.checkpoints.pages.map((checkpoint) => checkpoint.pageNumber === page.pageNumber
|
|
1451
|
+
? {
|
|
1452
|
+
...checkpoint,
|
|
1453
|
+
attempts: checkpoint.attempts + 1,
|
|
1454
|
+
updatedAt: nowIso(dependencies.now),
|
|
1455
|
+
error: `retrying after upstream timeout: ${normalizedMessage}`,
|
|
1456
|
+
}
|
|
1457
|
+
: checkpoint),
|
|
1458
|
+
},
|
|
1459
|
+
}));
|
|
1460
|
+
continue;
|
|
1461
|
+
}
|
|
1462
|
+
finalErrorMessage = normalizedMessage;
|
|
1463
|
+
break;
|
|
1464
|
+
}
|
|
1465
|
+
}
|
|
1466
|
+
if (processed) {
|
|
1182
1467
|
processedNewPages += 1;
|
|
1183
1468
|
await mutateJob((job) => ({
|
|
1184
1469
|
...job,
|
|
@@ -1205,32 +1490,35 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1205
1490
|
},
|
|
1206
1491
|
}));
|
|
1207
1492
|
}
|
|
1208
|
-
|
|
1209
|
-
const
|
|
1210
|
-
const normalizedMessage = normalizeCheckpointErrorMessage(message);
|
|
1493
|
+
else {
|
|
1494
|
+
const normalizedMessage = finalErrorMessage || `Page ${page.pageNumber} failed during async ingest.`;
|
|
1211
1495
|
const checkpoint = findCheckpoint(currentJob, page.pageNumber);
|
|
1212
1496
|
if (!fatalProviderStopMessage && isFatalProviderStopError(normalizedMessage)) {
|
|
1213
1497
|
fatalProviderStopMessage = normalizedMessage;
|
|
1214
1498
|
}
|
|
1215
|
-
await mutateJob((job) =>
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1499
|
+
await mutateJob((job) => {
|
|
1500
|
+
const timestamp = nowIso(dependencies.now);
|
|
1501
|
+
const failedJob = {
|
|
1502
|
+
...job,
|
|
1503
|
+
updatedAt: timestamp,
|
|
1504
|
+
execution: {
|
|
1505
|
+
...job.execution,
|
|
1506
|
+
lastHeartbeatAt: timestamp,
|
|
1507
|
+
},
|
|
1508
|
+
checkpoints: {
|
|
1509
|
+
pages: job.checkpoints.pages.map((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber
|
|
1510
|
+
? {
|
|
1511
|
+
...pageCheckpoint,
|
|
1512
|
+
status: 'failed',
|
|
1513
|
+
updatedAt: timestamp,
|
|
1514
|
+
error: normalizedMessage,
|
|
1515
|
+
downgraded: isSlowVisualPageError(normalizedMessage, checkpoint.classification, checkpoint.sourceKind),
|
|
1516
|
+
}
|
|
1517
|
+
: pageCheckpoint),
|
|
1518
|
+
},
|
|
1519
|
+
};
|
|
1520
|
+
return applyConsecutiveVisualRunDowngrades(failedJob, page.pageNumber, dependencies.now);
|
|
1521
|
+
});
|
|
1234
1522
|
}
|
|
1235
1523
|
});
|
|
1236
1524
|
if (fatalProviderStopMessage) {
|
|
@@ -1292,8 +1580,47 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1292
1580
|
: pageCheckpoint),
|
|
1293
1581
|
},
|
|
1294
1582
|
}));
|
|
1295
|
-
|
|
1296
|
-
|
|
1583
|
+
let processed = null;
|
|
1584
|
+
let finalErrorMessage = '';
|
|
1585
|
+
for (let attemptIndex = 0; attemptIndex < 2; attemptIndex += 1) {
|
|
1586
|
+
try {
|
|
1587
|
+
processed = await processBoreholePage(currentJob, page, config, state, dependencies, {
|
|
1588
|
+
cheapRetry: attemptIndex > 0,
|
|
1589
|
+
});
|
|
1590
|
+
break;
|
|
1591
|
+
}
|
|
1592
|
+
catch (error) {
|
|
1593
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1594
|
+
const normalizedMessage = normalizeCheckpointErrorMessage(message);
|
|
1595
|
+
if (attemptIndex === 0
|
|
1596
|
+
&& isRetryableUpstreamPageError(normalizedMessage)
|
|
1597
|
+
&& !isFatalProviderStopError(normalizedMessage)) {
|
|
1598
|
+
await waitForCheckpointRetryBackoff(findCheckpoint(currentJob, page.pageNumber).attempts);
|
|
1599
|
+
await mutateJob((job) => ({
|
|
1600
|
+
...job,
|
|
1601
|
+
updatedAt: nowIso(dependencies.now),
|
|
1602
|
+
execution: {
|
|
1603
|
+
...job.execution,
|
|
1604
|
+
lastHeartbeatAt: nowIso(dependencies.now),
|
|
1605
|
+
},
|
|
1606
|
+
checkpoints: {
|
|
1607
|
+
pages: job.checkpoints.pages.map((pageCheckpoint) => pageCheckpoint.pageNumber === page.pageNumber
|
|
1608
|
+
? {
|
|
1609
|
+
...pageCheckpoint,
|
|
1610
|
+
attempts: pageCheckpoint.attempts + 1,
|
|
1611
|
+
updatedAt: nowIso(dependencies.now),
|
|
1612
|
+
error: `retrying after upstream timeout: ${normalizedMessage}`,
|
|
1613
|
+
}
|
|
1614
|
+
: pageCheckpoint),
|
|
1615
|
+
},
|
|
1616
|
+
}));
|
|
1617
|
+
continue;
|
|
1618
|
+
}
|
|
1619
|
+
finalErrorMessage = normalizedMessage;
|
|
1620
|
+
break;
|
|
1621
|
+
}
|
|
1622
|
+
}
|
|
1623
|
+
if (processed) {
|
|
1297
1624
|
processedNewPages += 1;
|
|
1298
1625
|
state = processed.nextState;
|
|
1299
1626
|
await mutateJob((job) => ({
|
|
@@ -1321,9 +1648,8 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1321
1648
|
},
|
|
1322
1649
|
}));
|
|
1323
1650
|
}
|
|
1324
|
-
|
|
1325
|
-
const
|
|
1326
|
-
const normalizedMessage = normalizeCheckpointErrorMessage(message);
|
|
1651
|
+
else {
|
|
1652
|
+
const normalizedMessage = finalErrorMessage || `Page ${page.pageNumber} failed during async ingest.`;
|
|
1327
1653
|
await mutateJob((job) => ({
|
|
1328
1654
|
...job,
|
|
1329
1655
|
updatedAt: nowIso(dependencies.now),
|
|
@@ -1357,7 +1683,7 @@ export async function runPersistedIngestJobWorker(jobId, dependencies = {}) {
|
|
|
1357
1683
|
...pageCheckpoint,
|
|
1358
1684
|
status: 'failed',
|
|
1359
1685
|
updatedAt: nowIso(dependencies.now),
|
|
1360
|
-
error: `skipped after upstream provider stop. ${
|
|
1686
|
+
error: `skipped after upstream provider stop. ${normalizedMessage}`,
|
|
1361
1687
|
downgraded: false,
|
|
1362
1688
|
}
|
|
1363
1689
|
: pageCheckpoint),
|