@geotechcli/core 0.4.37 → 0.4.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ingest/geotech-document.d.ts +16 -0
- package/dist/ingest/geotech-document.d.ts.map +1 -1
- package/dist/ingest/geotech-document.js +227 -16
- package/dist/ingest/geotech-document.js.map +1 -1
- package/dist/ingest/index.d.ts +2 -1
- package/dist/ingest/index.d.ts.map +1 -1
- package/dist/ingest/index.js +1 -0
- package/dist/ingest/index.js.map +1 -1
- package/dist/ingest/job-store.d.ts +2 -1
- package/dist/ingest/job-store.d.ts.map +1 -1
- package/dist/ingest/job-store.js +41 -0
- package/dist/ingest/job-store.js.map +1 -1
- package/dist/ingest/job-worker.d.ts.map +1 -1
- package/dist/ingest/job-worker.js +191 -12
- package/dist/ingest/job-worker.js.map +1 -1
- package/dist/ingest/page-evidence-cache.d.ts +47 -0
- package/dist/ingest/page-evidence-cache.d.ts.map +1 -0
- package/dist/ingest/page-evidence-cache.js +205 -0
- package/dist/ingest/page-evidence-cache.js.map +1 -0
- package/dist/meta/metadata.json +1 -1
- package/dist/report/html.d.ts.map +1 -1
- package/dist/report/html.js +484 -5
- package/dist/report/html.js.map +1 -1
- package/dist/report/ingest-dossier.d.ts +2 -0
- package/dist/report/ingest-dossier.d.ts.map +1 -1
- package/dist/report/ingest-dossier.js +32 -2
- package/dist/report/ingest-dossier.js.map +1 -1
- package/package.json +1 -1
|
@@ -22,6 +22,20 @@ export interface GeotechDocumentSource {
|
|
|
22
22
|
pageRange?: [number, number];
|
|
23
23
|
segmentation?: IngestSegmentationSummary;
|
|
24
24
|
}
|
|
25
|
+
/**
 * Outcome recorded for the page-evidence cache on a single page:
 * `hit` (an entry was read), `miss` (no entry was read), `stored` (a new entry
 * was written) or `skipped` (the cache was not used or a write failed; see
 * `reason` on the audit record).
 */
export type GeotechDocumentPageEvidenceCacheStatus = 'hit' | 'miss' | 'stored' | 'skipped';
/**
 * Audit record attached to a page audit describing how the page-evidence
 * cache was consulted for that page.
 */
export interface GeotechDocumentPageEvidenceCacheAudit {
    /** What happened with the cache for this page. */
    status: GeotechDocumentPageEvidenceCacheStatus;
    /** Short identifier for the cache entry. */
    entryId: string;
    /** Full cache key for the entry. */
    cacheKey: string;
    /** Hash identifying the source file the page belongs to. */
    fileHash: string;
    /** Hash identifying the page within the source file. */
    pageHash: string;
    /** Page number used when building the cache key. */
    pageNumber: number;
    /** Model version component of the cache key. */
    modelVersion: string;
    /** Preprocessing version component of the cache key. */
    preprocessingVersion: string;
    /** Cache schema version component of the cache key. */
    schemaVersion: number;
    /** Creation timestamp of the cache entry, when one is available. */
    createdAt?: string;
    /** Optional explanation for the status (for example why the cache was skipped). */
    reason?: string;
}
|
|
25
39
|
export interface GeotechDocumentPageAudit {
|
|
26
40
|
pageNumber: number;
|
|
27
41
|
classification: PdfPageClassification | null;
|
|
@@ -31,6 +45,7 @@ export interface GeotechDocumentPageAudit {
|
|
|
31
45
|
materialCount: number;
|
|
32
46
|
classificationCount: number;
|
|
33
47
|
parameterCount: number;
|
|
48
|
+
evidenceCache?: GeotechDocumentPageEvidenceCacheAudit;
|
|
34
49
|
warnings: string[];
|
|
35
50
|
}
|
|
36
51
|
export type GeotechDocumentFindingSeverity = 'advisory' | 'review' | 'blocking';
|
|
@@ -118,6 +133,7 @@ export interface IngestGeotechDocumentOptions {
|
|
|
118
133
|
contentChunks?: GeotechDocumentContentChunk[];
|
|
119
134
|
};
|
|
120
135
|
}) => Promise<GeotechDocumentSynthesis | null>;
|
|
136
|
+
usePageEvidenceCache?: boolean;
|
|
121
137
|
pageConcurrency?: number;
|
|
122
138
|
now?: () => Date;
|
|
123
139
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"geotech-document.d.ts","sourceRoot":"","sources":["../../src/ingest/geotech-document.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"geotech-document.d.ts","sourceRoot":"","sources":["../../src/ingest/geotech-document.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAsB,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAIrE,OAAO,EACL,mCAAmC,EACnC,4BAA4B,EAC5B,KAAK,6BAA6B,EAElC,KAAK,sBAAsB,EAC3B,KAAK,0BAA0B,EAC/B,KAAK,2BAA2B,EAChC,KAAK,WAAW,EACjB,MAAM,+BAA+B,CAAC;AACvC,OAAO,EAAE,2BAA2B,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAA2B,KAAK,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AACxF,OAAO,KAAK,EAAE,qBAAqB,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAC7E,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,mBAAmB,CAAC;AAenE,MAAM,WAAW,0BAA0B;IACzC,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,wBAAyB,SAAQ,0BAA0B;IAC1E,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,UAAU,GAAG,cAAc,CAAC;IACzC,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,GAAG,KAAK,CAAC;IAC3B,SAAS,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC7B,YAAY,CAAC,EAAE,yBAAyB,CAAC;CAC1C;AAED,MAAM,MAAM,sCAAsC,GAAG,KAAK,GAAG,MAAM,GAAG,QAAQ,GAAG,SAAS,CAAC;AAE3F,MAAM,WAAW,qCAAqC;IACpD,MAAM,EAAE,sCAAsC,CAAC;IAC/C,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,wBAAwB;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,EAAE,qBAAqB,GAAG,IAAI,CAAC;IAC7C,cAAc,EAAE,sBAAsB,CAAC;IACvC,WAAW,EAAE,WAAW,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,qCAAqC,CAAC;IACtD,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,MAAM,MAAM,8BAA8B,GAAG,UAAU,GAAG,QAAQ,GAAG,UAAU,CAAC;AAChF,MAAM,MAAM,2BAA2B,GAAG,UAAU,GAAG,MAAM,GAAG,UAAU,CAAC;AAE3E,MAAM,WAAW,sBAAsB;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,8BAA8B,CAAC;IACzC,KAAK,EAAE,2BAA2B,CAAC;IACnC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MA
AM,CAAC;IACpB,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,gCAAgC;IAC/C,wBAAwB,EAAE,OAAO,CAAC,MAAM,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,CAAC;IACzE,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,qBAAqB,EAAE,MAAM,CAAC;CAC/B;AAED,MAAM,WAAW,2BAA2B;IAC1C,IAAI,EAAE,uBAAuB,CAAC;IAC9B,aAAa,EAAE,CAAC,CAAC;IACjB,YAAY,EAAE,kBAAkB,CAAC;IACjC,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,qBAAqB,GAAG;QAC9B,UAAU,EAAE,MAAM,CAAC;QACnB,eAAe,EAAE,MAAM,CAAC;QACxB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;IACF,UAAU,EAAE,qBAAqB,GAAG,IAAI,CAAC;IACzC,iBAAiB,EAAE,gCAAgC,GAAG,IAAI,CAAC;IAC3D,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,0BAA0B,EAAE,CAAC;IACxC,eAAe,EAAE,6BAA6B,EAAE,CAAC;IACjD,UAAU,EAAE,2BAA2B,EAAE,CAAC;IAC1C,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,SAAS,CAAC,EAAE,wBAAwB,GAAG,IAAI,CAAC;IAC5C,aAAa,CAAC,EAAE,2BAA2B,EAAE,CAAC;IAC9C,UAAU,EAAE,wBAAwB,EAAE,CAAC;IACvC,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,cAAc,EAAE,sBAAsB,EAAE,CAAC;IACzC,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,WAAW,EAAE,WAAW,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,EAAE,OAAO,CAAC;IACxB,cAAc,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,wBAAwB;IACvC,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,2BAA2B;IAC1C,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC5B,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,KAAK,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,SAAS,CAAC;IAC/C,WAAW,CAAC,EAAE,0BAA0B,CAAC;IACzC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;CACvB;AAED,MAAM,MAAM,0BAA0B,GAClC,gBAAgB,GAChB,SAAS,GACT,cAAc,GACd,YAAY,GACZ,gBAAgB,GAChB,aAAa,GACb,gBAAgB,GAChB,iBAAiB,GACjB,SAAS,CAAC;AAUd,MAAM,WAAW,4BAA4B;IAC3C,MAAM,EAAE,SAAS,CAAC;IAClB,MAAM,EAAE,qBAAqB,CAAC;IAC9B,UAAU,
CAAC,EAAE,qBAAqB,GAAG,IAAI,CAAC;IAC1C,KAAK,CAAC,EAAE,0BAA0B,CAAC;IACnC,KAAK,CAAC,EAAE,wBAAwB,EAAE,CAAC;IACnC,aAAa,CAAC,EAAE,OAAO,4BAA4B,CAAC;IACpD,gBAAgB,CAAC,EAAE,OAAO,mCAAmC,CAAC;IAC9D,uBAAuB,CAAC,EAAE,OAAO,2BAA2B,CAAC;IAC7D,kBAAkB,CAAC,EAAE,CAAC,KAAK,EAAE;QAC3B,MAAM,EAAE,SAAS,CAAC;QAClB,MAAM,EAAE,IAAI,CAAC,2BAA2B,EAAE,WAAW,GAAG,eAAe,CAAC,GAAG;YACzE,aAAa,CAAC,EAAE,2BAA2B,EAAE,CAAC;SAC/C,CAAC;KACH,KAAK,OAAO,CAAC,wBAAwB,GAAG,IAAI,CAAC,CAAC;IAC/C,oBAAoB,CAAC,EAAE,OAAO,CAAC;IAC/B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,GAAG,CAAC,EAAE,MAAM,IAAI,CAAC;CAClB;AA4tCD,wBAAgB,8BAA8B,CAAC,KAAK,EAAE;IACpD,cAAc,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,GAAG,SAAS,CAAC;IACnE,sBAAsB,CAAC,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,GAAG,SAAS,CAAC;IAC5E,kBAAkB,CAAC,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,GAAG,SAAS,CAAC;IACxE,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,UAAU,CAAC,EAAE,wBAAwB,CAAC,YAAY,CAAC,CAAC;CACrD,GAAG,0BAA0B,GAAG,IAAI,CA6EpC;AAED,wBAAgB,6BAA6B,CAAC,KAAK,EAAE;IACnD,IAAI,EAAE,0BAA0B,CAAC;IACjC,cAAc,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC;IACvD,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB,GAAG,sBAAsB,CA4CzB;AAmID,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,4BAA4B,GACpC,OAAO,CAAC,2BAA2B,CAAC,CAmetC"}
|
|
@@ -1,12 +1,127 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
1
2
|
import { generateText } from '../llm/router.js';
|
|
2
3
|
import { resolveProviderCapabilities } from '../llm/index.js';
|
|
3
4
|
import { parseJsonObject } from '../vision/parse.js';
|
|
4
5
|
import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
|
|
5
6
|
import { transcribeDocumentImageText } from '../vision/index.js';
|
|
6
7
|
import { recoverDocumentTextHint } from '../vision/ocr.js';
|
|
8
|
+
import { PAGE_EVIDENCE_CACHE_SCHEMA_VERSION, buildPageEvidenceCacheKey, buildPageEvidenceModelVersion, buildPageEvidencePreprocessingVersion, hashBuffer, hashString, readPageEvidenceCache, writePageEvidenceCache, } from './page-evidence-cache.js';
|
|
7
9
|
/**
 * Returns the distinct strings from `values`, in first-occurrence order.
 *
 * Entries that are not strings, and strings that are empty or contain only
 * whitespace, are dropped. Kept strings are returned as-is (not trimmed), so
 * `'a'` and `' a'` count as different values.
 *
 * @param values Array of candidate values (may contain non-strings).
 * @returns A new array of unique, non-blank strings.
 */
function uniqueStrings(values) {
    const nonBlankStrings = values.filter((value) => typeof value === 'string' && value.trim().length > 0);
    return [...new Set(nonBlankStrings)];
}
|
|
12
|
+
/**
 * Reports whether `value` is a non-null object that is not an array.
 *
 * @param value Any value.
 * @returns `true` for plain objects and other non-array objects, `false` for
 *   `null`, arrays and primitives.
 */
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === 'object';
}
|
|
15
|
+
/**
 * Reports whether `value` is one of the recognised parse statuses:
 * `'parsed'`, `'partial'` or `'failed'` (strict equality, case-sensitive).
 *
 * @param value Any value.
 * @returns `true` only for the three status strings above.
 */
function isParseStatus(value) {
    switch (value) {
        case 'parsed':
        case 'partial':
        case 'failed':
            return true;
        default:
            return false;
    }
}
|
|
18
|
+
/**
 * Validates the shape of a cached extraction result before it is reused.
 *
 * The value is accepted only when it is a non-array object whose
 * `parseStatus` is a recognised status, whose `confidence` is a number, and
 * whose `warnings`, `materials`, `classifications`, `parameters`, `risks`
 * and `recommendations` properties are all arrays. Array elements are not
 * inspected.
 *
 * @param value Candidate cached payload (any value).
 * @returns The same `value` reference when the shape check passes, otherwise `null`.
 */
function asCachedGeotechDocumentInsight(value) {
    if (!isRecord(value)) {
        return null;
    }
    if (!isParseStatus(value.parseStatus) || typeof value.confidence !== 'number') {
        return null;
    }
    const requiredArrayFields = [
        'warnings',
        'materials',
        'classifications',
        'parameters',
        'risks',
        'recommendations',
    ];
    const allArraysPresent = requiredArrayFields.every((field) => Array.isArray(value[field]));
    return allArraysPresent ? value : null;
}
|
|
34
|
+
/**
 * Produces the file-level hash used as part of page-evidence cache keys.
 *
 * The path is taken from `source.filePath`, or failing that from the first
 * page that carries a `filePath`. When a path is available the file is read
 * synchronously and its bytes are hashed. If there is no path, or reading
 * fails, a hash of the source metadata (`filePath`, `fileName`, `inputKind`,
 * `pageRange`) is returned instead.
 *
 * @param source Document source descriptor.
 * @param pages Optional page list; only each page's `filePath` is consulted.
 * @returns Hash string from `hashBuffer` (file bytes) or `hashString` (metadata).
 */
function resolveSourceFileHash(source, pages) {
    const sourceFilePath = source.filePath ?? pages?.find((page) => page.filePath)?.filePath;
    if (sourceFilePath) {
        try {
            return hashBuffer(readFileSync(sourceFilePath));
        }
        catch {
            // Deliberate best-effort: fall through to a stable metadata hash
            // when the file cannot be read (e.g. a synthetic test input).
        }
    }
    // Property order is significant: it determines the serialized string and
    // therefore the resulting hash.
    const metadata = {
        filePath: source.filePath ?? null,
        fileName: source.fileName ?? null,
        inputKind: source.inputKind,
        pageRange: source.pageRange ?? null,
    };
    return hashString(JSON.stringify(metadata));
}
|
|
51
|
+
/**
 * Resolves the page number used in the page-evidence cache key.
 *
 * The page range is read from `source.segmentation.pageRange`, falling back
 * to `source.pageRange`. When a range exists and `page.pageNumber` is below
 * the range start, the number is offset by the range start
 * (`start + pageNumber - 1`); otherwise `page.pageNumber` is returned as-is.
 *
 * NOTE(review): this presumably treats a page number below the range start as
 * segment-relative. A segment-relative number that is >= the range start is
 * returned unchanged — confirm against callers that this cannot occur.
 *
 * @param source Document source descriptor.
 * @param page Page descriptor with a `pageNumber`.
 * @returns The page number to use for cache addressing.
 */
function resolveEvidenceCachePageNumber(source, page) {
    const pageRange = source.segmentation?.pageRange ?? source.pageRange;
    if (!pageRange || !(page.pageNumber < pageRange[0])) {
        return page.pageNumber;
    }
    return pageRange[0] + page.pageNumber - 1;
}
|
|
58
|
+
/**
 * Assembles everything needed to address one page in the page-evidence cache.
 *
 * The page hash depends on the input: for a PDF source whose page is a
 * `pdf-page` (or has MIME type `application/pdf`) it is a hash of the resolved
 * page number only; for anything else it is a hash of the page's MIME type
 * and base64 payload.
 *
 * @param input Object with `source`, `page`, `fileHash`, `modelVersion` and
 *   `preprocessingVersion`.
 * @returns `{ parts, cacheKey, entryId, pageNumber }`, where `parts` are the
 *   key components, `cacheKey` is built from them, and `entryId` is the first
 *   12 characters of `cacheKey`.
 */
function buildPageEvidenceCacheContext(input) {
    const { source, page } = input;
    const pageNumber = resolveEvidenceCachePageNumber(source, page);
    const isPdfPage = source.inputKind === 'pdf'
        && (page.sourceKind === 'pdf-page' || page.mimeType === 'application/pdf');
    let pageHash;
    if (isPdfPage) {
        pageHash = hashString(`pdf-page:${pageNumber}`);
    }
    else {
        pageHash = hashString(`${page.mimeType}\n${page.base64}`);
    }
    const parts = {
        fileHash: input.fileHash,
        pageHash,
        pageNumber,
        modelVersion: input.modelVersion,
        preprocessingVersion: input.preprocessingVersion,
        schemaVersion: PAGE_EVIDENCE_CACHE_SCHEMA_VERSION,
    };
    const cacheKey = buildPageEvidenceCacheKey(parts);
    return {
        parts,
        cacheKey,
        entryId: cacheKey.slice(0, 12),
        pageNumber,
    };
}
|
|
80
|
+
/**
 * Builds the audit record describing a cache interaction for one page.
 *
 * @param context Cache context providing `entryId`, `cacheKey`, `pageNumber`
 *   and the key `parts`.
 * @param status Cache outcome to record.
 * @param entry Optional cache entry; only its `createdAt` is copied.
 * @param reason Optional explanation for the status.
 * @returns A new audit object. `schemaVersion` falls back to
 *   `PAGE_EVIDENCE_CACHE_SCHEMA_VERSION` when the context parts carry none;
 *   `createdAt` and `reason` are `undefined` when not supplied.
 */
function buildPageEvidenceCacheAudit(context, status, entry, reason) {
    const { parts } = context;
    return {
        status,
        entryId: context.entryId,
        cacheKey: context.cacheKey,
        fileHash: parts.fileHash,
        pageHash: parts.pageHash,
        pageNumber: context.pageNumber,
        modelVersion: parts.modelVersion,
        preprocessingVersion: parts.preprocessingVersion,
        schemaVersion: parts.schemaVersion ?? PAGE_EVIDENCE_CACHE_SCHEMA_VERSION,
        createdAt: entry?.createdAt,
        reason,
    };
}
|
|
95
|
+
/**
 * Reads a page-evidence cache entry without letting a read failure propagate.
 *
 * @param context Cache context; its `parts` are passed to `readPageEvidenceCache`.
 * @returns Whatever `readPageEvidenceCache` returns, or `null` if it throws.
 */
function safeReadPageEvidenceCache(context) {
    let cachedEntry = null;
    try {
        cachedEntry = readPageEvidenceCache(context.parts);
    }
    catch {
        // Deliberate best-effort: any read error is reported as "no entry".
        cachedEntry = null;
    }
    return cachedEntry;
}
|
|
103
|
+
/**
 * Writes a page-evidence cache entry without letting a write failure propagate.
 *
 * @param context Cache context; its `parts` are passed to `writePageEvidenceCache`.
 * @param evidence Evidence payload to store.
 * @param now Forwarded to `writePageEvidenceCache` as the `now` option.
 * @returns `{ entry }` with the value returned by `writePageEvidenceCache` on
 *   success, or `{ entry: null, error }` when it throws, where `error` is the
 *   thrown `Error`'s message (or the thrown value converted with `String`).
 */
function safeWritePageEvidenceCache(context, evidence, now) {
    try {
        const entry = writePageEvidenceCache(context.parts, evidence, { now });
        return { entry };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return {
            entry: null,
            error: message,
        };
    }
}
|
|
116
|
+
/**
 * Decides whether the page-evidence cache is used for a run.
 *
 * An explicit boolean `option` always wins. For any other value the cache is
 * enabled unless `process.env.NODE_ENV` is exactly `'test'`.
 *
 * @param option Explicit caller preference, or `undefined` for the default.
 * @returns `true` when the cache should be used.
 */
function shouldUsePageEvidenceCache(option) {
    if (typeof option === 'boolean') {
        return option;
    }
    return process.env.NODE_ENV !== 'test';
}
|
|
10
125
|
function normalizeTextItems(value, limit = 8) {
|
|
11
126
|
if (!Array.isArray(value)) {
|
|
12
127
|
return [];
|
|
@@ -1039,6 +1154,16 @@ export async function ingestGeotechDocument(options) {
|
|
|
1039
1154
|
const pageConcurrency = shouldSeriallyProcessImageHeavyPages(options.config, options.inspection)
|
|
1040
1155
|
? 1
|
|
1041
1156
|
: resolvePageConcurrency(options.config, options.pageConcurrency);
|
|
1157
|
+
const usePageEvidenceCache = shouldUsePageEvidenceCache(options.usePageEvidenceCache);
|
|
1158
|
+
const pageEvidenceFileHash = usePageEvidenceCache
|
|
1159
|
+
? resolveSourceFileHash(options.source, options.pages)
|
|
1160
|
+
: null;
|
|
1161
|
+
const pageEvidenceModelVersion = usePageEvidenceCache
|
|
1162
|
+
? buildPageEvidenceModelVersion(options.config)
|
|
1163
|
+
: '';
|
|
1164
|
+
const pageEvidencePreprocessingVersion = usePageEvidenceCache
|
|
1165
|
+
? buildPageEvidencePreprocessingVersion(options.config)
|
|
1166
|
+
: '';
|
|
1042
1167
|
if (options.pages && options.pages.length > 0) {
|
|
1043
1168
|
const pages = [...options.pages].sort((left, right) => left.pageNumber - right.pageNumber);
|
|
1044
1169
|
const settledPages = await mapWithConcurrency(pages, pageConcurrency, async (page) => {
|
|
@@ -1055,6 +1180,19 @@ export async function ingestGeotechDocument(options) {
|
|
|
1055
1180
|
?? inspectionPage?.normalizedText
|
|
1056
1181
|
?? undefined;
|
|
1057
1182
|
let textHintSource = pageTextHint?.trim() ? 'native-text' : 'none';
|
|
1183
|
+
const cacheContext = usePageEvidenceCache && pageEvidenceFileHash
|
|
1184
|
+
? buildPageEvidenceCacheContext({
|
|
1185
|
+
source: options.source,
|
|
1186
|
+
page,
|
|
1187
|
+
fileHash: pageEvidenceFileHash,
|
|
1188
|
+
modelVersion: pageEvidenceModelVersion,
|
|
1189
|
+
preprocessingVersion: pageEvidencePreprocessingVersion,
|
|
1190
|
+
})
|
|
1191
|
+
: null;
|
|
1192
|
+
const cachedEvidence = cacheContext ? safeReadPageEvidenceCache(cacheContext) : null;
|
|
1193
|
+
let evidenceCache = cacheContext
|
|
1194
|
+
? buildPageEvidenceCacheAudit(cacheContext, cachedEvidence ? 'hit' : 'miss', cachedEvidence)
|
|
1195
|
+
: undefined;
|
|
1058
1196
|
try {
|
|
1059
1197
|
if (lowYieldRole && inspectionPage) {
|
|
1060
1198
|
return {
|
|
@@ -1068,6 +1206,9 @@ export async function ingestGeotechDocument(options) {
|
|
|
1068
1206
|
: 'Figure/appendix page was summarized without a full multimodal extraction call.',
|
|
1069
1207
|
],
|
|
1070
1208
|
ocrRecovered: false,
|
|
1209
|
+
evidenceCache: cacheContext
|
|
1210
|
+
? buildPageEvidenceCacheAudit(cacheContext, 'skipped', null, 'low-yield page classified before model extraction')
|
|
1211
|
+
: evidenceCache,
|
|
1071
1212
|
result: buildPreflightLowYieldInsight({
|
|
1072
1213
|
role: lowYieldRole,
|
|
1073
1214
|
inspectionPage,
|
|
@@ -1076,6 +1217,24 @@ export async function ingestGeotechDocument(options) {
|
|
|
1076
1217
|
}),
|
|
1077
1218
|
};
|
|
1078
1219
|
}
|
|
1220
|
+
const cachedResult = asCachedGeotechDocumentInsight(cachedEvidence?.extractionResult);
|
|
1221
|
+
if (cachedResult) {
|
|
1222
|
+
return {
|
|
1223
|
+
ok: true,
|
|
1224
|
+
pageNumber: page.pageNumber,
|
|
1225
|
+
inspectionPage,
|
|
1226
|
+
textHintSource: cachedEvidence.source,
|
|
1227
|
+
recoveryWarnings: cachedEvidence.warnings,
|
|
1228
|
+
ocrRecovered: cachedEvidence.source === 'local-ocr' || cachedEvidence.source === 'vision-ocr' || cachedEvidence.source === 'glm-ocr',
|
|
1229
|
+
evidenceCache,
|
|
1230
|
+
result: cachedResult,
|
|
1231
|
+
};
|
|
1232
|
+
}
|
|
1233
|
+
const cachedTextHint = cachedEvidence?.textHint;
|
|
1234
|
+
if (cachedTextHint) {
|
|
1235
|
+
pageTextHint = cachedTextHint;
|
|
1236
|
+
textHintSource = cachedEvidence.source;
|
|
1237
|
+
}
|
|
1079
1238
|
const pageTimeoutMs = resolvePagePhaseTimeoutMs(options.config, {
|
|
1080
1239
|
classification: inspectionPage?.classification,
|
|
1081
1240
|
sourceKind: page.sourceKind,
|
|
@@ -1089,7 +1248,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1089
1248
|
page,
|
|
1090
1249
|
inspectionPage,
|
|
1091
1250
|
textHint: pageTextHint,
|
|
1092
|
-
})) {
|
|
1251
|
+
}) && !cachedTextHint) {
|
|
1093
1252
|
const context = {
|
|
1094
1253
|
pageNumber: page.pageNumber,
|
|
1095
1254
|
totalPages: page.totalPages,
|
|
@@ -1097,6 +1256,17 @@ export async function ingestGeotechDocument(options) {
|
|
|
1097
1256
|
directVisualPreferred: true,
|
|
1098
1257
|
};
|
|
1099
1258
|
const result = await withPageTimeout(interpretPage(page.base64, page.mimeType, pagePhaseConfig, context), pageTimeoutMs, `Page ${page.pageNumber}: direct visual page interpretation timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
|
|
1259
|
+
if (cacheContext) {
|
|
1260
|
+
const stored = safeWritePageEvidenceCache(cacheContext, {
|
|
1261
|
+
source: 'vision-visual',
|
|
1262
|
+
warnings: ['Skipped OCR-only transcription and used direct visual extraction for an image-heavy hosted-beta page.'],
|
|
1263
|
+
transformed: false,
|
|
1264
|
+
extractionResult: result,
|
|
1265
|
+
}, now);
|
|
1266
|
+
evidenceCache = stored.entry
|
|
1267
|
+
? buildPageEvidenceCacheAudit(cacheContext, 'stored', stored.entry)
|
|
1268
|
+
: buildPageEvidenceCacheAudit(cacheContext, 'skipped', null, `cache write failed: ${stored.error}`);
|
|
1269
|
+
}
|
|
1100
1270
|
return {
|
|
1101
1271
|
ok: true,
|
|
1102
1272
|
pageNumber: page.pageNumber,
|
|
@@ -1104,23 +1274,36 @@ export async function ingestGeotechDocument(options) {
|
|
|
1104
1274
|
textHintSource: 'vision-visual',
|
|
1105
1275
|
recoveryWarnings: ['Skipped OCR-only transcription and used direct visual extraction for an image-heavy hosted-beta page.'],
|
|
1106
1276
|
ocrRecovered: false,
|
|
1277
|
+
evidenceCache,
|
|
1107
1278
|
result,
|
|
1108
1279
|
};
|
|
1109
1280
|
}
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1281
|
+
let recoveryWarnings = cachedEvidence?.warnings ?? [];
|
|
1282
|
+
let recoverySource = textHintSource;
|
|
1283
|
+
let recoveryTransformed = cachedEvidence?.transformed ?? false;
|
|
1284
|
+
let layoutSummary = cachedEvidence?.layoutSummary;
|
|
1285
|
+
if (!cachedTextHint) {
|
|
1286
|
+
const recovery = await withPageTimeout(recoverDocumentTextHint({
|
|
1287
|
+
existingTextHint: pageTextHint,
|
|
1288
|
+
existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
|
|
1289
|
+
imageBase64: page.base64,
|
|
1290
|
+
mimeType: page.mimeType,
|
|
1291
|
+
config: pagePhaseConfig,
|
|
1292
|
+
pdfFilePath: page.filePath,
|
|
1293
|
+
pdfPageNumber: page.pageNumber,
|
|
1294
|
+
visionTranscribe: transcribePageImageText,
|
|
1295
|
+
}), pageTimeoutMs, `Page ${page.pageNumber}: OCR/text recovery timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
|
|
1296
|
+
if (recovery.textHint) {
|
|
1297
|
+
pageTextHint = recovery.textHint;
|
|
1298
|
+
}
|
|
1299
|
+
recoveryWarnings = recovery.warnings;
|
|
1300
|
+
recoverySource = recovery.source;
|
|
1301
|
+
recoveryTransformed = recovery.transformed;
|
|
1302
|
+
layoutSummary = recovery.layout
|
|
1303
|
+
? `GLM-OCR parsed ${recovery.layout.pages.length} page(s), ${recovery.layout.pages.reduce((sum, layoutPage) => sum + layoutPage.tables.length, 0)} table(s).`
|
|
1304
|
+
: undefined;
|
|
1122
1305
|
}
|
|
1123
|
-
textHintSource =
|
|
1306
|
+
textHintSource = recoverySource;
|
|
1124
1307
|
const context = {
|
|
1125
1308
|
pageNumber: page.pageNumber,
|
|
1126
1309
|
totalPages: page.totalPages,
|
|
@@ -1136,13 +1319,38 @@ export async function ingestGeotechDocument(options) {
|
|
|
1136
1319
|
const result = pageTextHint
|
|
1137
1320
|
? await withPageTimeout(extractTextFacts(pageTextHint, extractionConfig, context), extractionTimeoutMs, `Page ${page.pageNumber}: text extraction timed out after ${Math.round(extractionTimeoutMs / 1000)}s`)
|
|
1138
1321
|
: await withPageTimeout(interpretPage(page.base64, page.mimeType, pagePhaseConfig, context), pageTimeoutMs, `Page ${page.pageNumber}: visual page interpretation timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
|
|
1322
|
+
if (cacheContext && !cachedEvidence) {
|
|
1323
|
+
const stored = safeWritePageEvidenceCache(cacheContext, {
|
|
1324
|
+
textHint: pageTextHint,
|
|
1325
|
+
source: textHintSource,
|
|
1326
|
+
warnings: recoveryWarnings,
|
|
1327
|
+
transformed: recoveryTransformed,
|
|
1328
|
+
layoutSummary,
|
|
1329
|
+
extractionResult: result,
|
|
1330
|
+
}, now);
|
|
1331
|
+
evidenceCache = stored.entry
|
|
1332
|
+
? buildPageEvidenceCacheAudit(cacheContext, 'stored', stored.entry)
|
|
1333
|
+
: buildPageEvidenceCacheAudit(cacheContext, 'skipped', null, `cache write failed: ${stored.error}`);
|
|
1334
|
+
}
|
|
1335
|
+
else if (cacheContext && cachedEvidence && !cachedResult) {
|
|
1336
|
+
safeWritePageEvidenceCache(cacheContext, {
|
|
1337
|
+
textHint: pageTextHint,
|
|
1338
|
+
source: textHintSource,
|
|
1339
|
+
warnings: recoveryWarnings,
|
|
1340
|
+
transformed: recoveryTransformed,
|
|
1341
|
+
layoutSummary,
|
|
1342
|
+
extractionResult: result,
|
|
1343
|
+
createdAt: cachedEvidence.createdAt,
|
|
1344
|
+
}, now);
|
|
1345
|
+
}
|
|
1139
1346
|
return {
|
|
1140
1347
|
ok: true,
|
|
1141
1348
|
pageNumber: page.pageNumber,
|
|
1142
1349
|
inspectionPage,
|
|
1143
1350
|
textHintSource,
|
|
1144
|
-
recoveryWarnings
|
|
1145
|
-
ocrRecovered:
|
|
1351
|
+
recoveryWarnings,
|
|
1352
|
+
ocrRecovered: textHintSource === 'local-ocr' || textHintSource === 'vision-ocr' || textHintSource === 'glm-ocr',
|
|
1353
|
+
evidenceCache,
|
|
1146
1354
|
result,
|
|
1147
1355
|
};
|
|
1148
1356
|
}
|
|
@@ -1152,6 +1360,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1152
1360
|
pageNumber: page.pageNumber,
|
|
1153
1361
|
inspectionPage,
|
|
1154
1362
|
textHintSource,
|
|
1363
|
+
evidenceCache,
|
|
1155
1364
|
error: normalizePageErrorMessage(error instanceof Error ? error.message : String(error)),
|
|
1156
1365
|
};
|
|
1157
1366
|
}
|
|
@@ -1181,6 +1390,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1181
1390
|
materialCount: settled.result.materials.length,
|
|
1182
1391
|
classificationCount: settled.result.classifications.length,
|
|
1183
1392
|
parameterCount: settled.result.parameters.length,
|
|
1393
|
+
evidenceCache: settled.evidenceCache,
|
|
1184
1394
|
warnings: uniqueStrings([
|
|
1185
1395
|
...settled.recoveryWarnings,
|
|
1186
1396
|
...settled.result.warnings,
|
|
@@ -1198,6 +1408,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1198
1408
|
materialCount: 0,
|
|
1199
1409
|
classificationCount: 0,
|
|
1200
1410
|
parameterCount: 0,
|
|
1411
|
+
evidenceCache: settled.evidenceCache,
|
|
1201
1412
|
warnings: [settled.error],
|
|
1202
1413
|
});
|
|
1203
1414
|
}
|