@geotechcli/core 0.4.37 → 0.4.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ingest/geotech-document.d.ts +16 -0
- package/dist/ingest/geotech-document.d.ts.map +1 -1
- package/dist/ingest/geotech-document.js +224 -16
- package/dist/ingest/geotech-document.js.map +1 -1
- package/dist/ingest/index.d.ts +2 -1
- package/dist/ingest/index.d.ts.map +1 -1
- package/dist/ingest/index.js +1 -0
- package/dist/ingest/index.js.map +1 -1
- package/dist/ingest/job-store.d.ts +2 -1
- package/dist/ingest/job-store.d.ts.map +1 -1
- package/dist/ingest/job-store.js +41 -0
- package/dist/ingest/job-store.js.map +1 -1
- package/dist/ingest/job-worker.d.ts.map +1 -1
- package/dist/ingest/job-worker.js +187 -12
- package/dist/ingest/job-worker.js.map +1 -1
- package/dist/ingest/page-evidence-cache.d.ts +47 -0
- package/dist/ingest/page-evidence-cache.d.ts.map +1 -0
- package/dist/ingest/page-evidence-cache.js +205 -0
- package/dist/ingest/page-evidence-cache.js.map +1 -0
- package/dist/meta/metadata.json +1 -1
- package/dist/report/html.d.ts.map +1 -1
- package/dist/report/html.js +8 -1
- package/dist/report/html.js.map +1 -1
- package/dist/report/ingest-dossier.d.ts +2 -0
- package/dist/report/ingest-dossier.d.ts.map +1 -1
- package/dist/report/ingest-dossier.js +32 -2
- package/dist/report/ingest-dossier.js.map +1 -1
- package/package.json +1 -1
|
@@ -22,6 +22,20 @@ export interface GeotechDocumentSource {
|
|
|
22
22
|
pageRange?: [number, number];
|
|
23
23
|
segmentation?: IngestSegmentationSummary;
|
|
24
24
|
}
|
|
25
|
+
export type GeotechDocumentPageEvidenceCacheStatus = 'hit' | 'miss' | 'stored' | 'skipped';
|
|
26
|
+
export interface GeotechDocumentPageEvidenceCacheAudit {
|
|
27
|
+
status: GeotechDocumentPageEvidenceCacheStatus;
|
|
28
|
+
entryId: string;
|
|
29
|
+
cacheKey: string;
|
|
30
|
+
fileHash: string;
|
|
31
|
+
pageHash: string;
|
|
32
|
+
pageNumber: number;
|
|
33
|
+
modelVersion: string;
|
|
34
|
+
preprocessingVersion: string;
|
|
35
|
+
schemaVersion: number;
|
|
36
|
+
createdAt?: string;
|
|
37
|
+
reason?: string;
|
|
38
|
+
}
|
|
25
39
|
export interface GeotechDocumentPageAudit {
|
|
26
40
|
pageNumber: number;
|
|
27
41
|
classification: PdfPageClassification | null;
|
|
@@ -31,6 +45,7 @@ export interface GeotechDocumentPageAudit {
|
|
|
31
45
|
materialCount: number;
|
|
32
46
|
classificationCount: number;
|
|
33
47
|
parameterCount: number;
|
|
48
|
+
evidenceCache?: GeotechDocumentPageEvidenceCacheAudit;
|
|
34
49
|
warnings: string[];
|
|
35
50
|
}
|
|
36
51
|
export type GeotechDocumentFindingSeverity = 'advisory' | 'review' | 'blocking';
|
|
@@ -118,6 +133,7 @@ export interface IngestGeotechDocumentOptions {
|
|
|
118
133
|
contentChunks?: GeotechDocumentContentChunk[];
|
|
119
134
|
};
|
|
120
135
|
}) => Promise<GeotechDocumentSynthesis | null>;
|
|
136
|
+
usePageEvidenceCache?: boolean;
|
|
121
137
|
pageConcurrency?: number;
|
|
122
138
|
now?: () => Date;
|
|
123
139
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"geotech-document.d.ts","sourceRoot":"","sources":["../../src/ingest/geotech-document.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"geotech-document.d.ts","sourceRoot":"","sources":["../../src/ingest/geotech-document.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAsB,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAIrE,OAAO,EACL,mCAAmC,EACnC,4BAA4B,EAC5B,KAAK,6BAA6B,EAElC,KAAK,sBAAsB,EAC3B,KAAK,0BAA0B,EAC/B,KAAK,2BAA2B,EAChC,KAAK,WAAW,EACjB,MAAM,+BAA+B,CAAC;AACvC,OAAO,EAAE,2BAA2B,EAAE,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAA2B,KAAK,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AACxF,OAAO,KAAK,EAAE,qBAAqB,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAC7E,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,mBAAmB,CAAC;AAenE,MAAM,WAAW,0BAA0B;IACzC,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,wBAAyB,SAAQ,0BAA0B;IAC1E,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,UAAU,GAAG,cAAc,CAAC;IACzC,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,GAAG,KAAK,CAAC;IAC3B,SAAS,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC7B,YAAY,CAAC,EAAE,yBAAyB,CAAC;CAC1C;AAED,MAAM,MAAM,sCAAsC,GAAG,KAAK,GAAG,MAAM,GAAG,QAAQ,GAAG,SAAS,CAAC;AAE3F,MAAM,WAAW,qCAAqC;IACpD,MAAM,EAAE,sCAAsC,CAAC;IAC/C,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,wBAAwB;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,EAAE,qBAAqB,GAAG,IAAI,CAAC;IAC7C,cAAc,EAAE,sBAAsB,CAAC;IACvC,WAAW,EAAE,WAAW,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,qCAAqC,CAAC;IACtD,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,MAAM,MAAM,8BAA8B,GAAG,UAAU,GAAG,QAAQ,GAAG,UAAU,CAAC;AAChF,MAAM,MAAM,2BAA2B,GAAG,UAAU,GAAG,MAAM,GAAG,UAAU,CAAC;AAE3E,MAAM,WAAW,sBAAsB;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,8BAA8B,CAAC;IACzC,KAAK,EAAE,2BAA2B,CAAC;IACnC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,gCAAgC;IAC/C,wBAAwB,EAAE,OAAO,CAAC,MAAM,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,CAAC;IACzE,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mBAAmB,EAAE,MAAM,CAAC;IAC5B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,qBAAqB,EAAE,MAAM,CAAC;CAC/B;AAED,MAAM,WAAW,2BAA2B;IAC1C,IAAI,EAAE,uBAAuB,CAAC;IAC9B,aAAa,EAAE,CAAC,CAAC;IACjB,YAAY,EAAE,kBAAkB,CAAC;IACjC,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,qBAAqB,GAAG;QAC9B,UAAU,EAAE,MAAM,CAAC;QACnB,eAAe,EAAE,MAAM,CAAC;QACxB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;IACF,UAAU,EAAE,qBAAqB,GAAG,IAAI,CAAC;IACzC,iBAAiB,EAAE,gCAAgC,GAAG,IAAI,CAAC;IAC3D,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,SAAS,EAAE,0BAA0B,EAAE,CAAC;IACxC,eAAe,EAAE,6BAA6B,EAAE,CAAC;IACjD,UAAU,EAAE,2BAA2B,EAAE,CAAC;IAC1C,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,SAAS,CAAC,EAAE,wBAAwB,GAAG,IAAI,CAAC;IAC5C,aAAa,CAAC,EAAE,2BAA2B,EAAE,CAAC;IAC9C,UAAU,EAAE,wBAAwB,EAAE,CAAC;IACvC,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,cAAc,EAAE,sBAAsB,EAAE,CAAC;IACzC,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,WAAW,EAAE,WAAW,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,EAAE,OAAO,CAAC;IACxB,cAAc,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,wBAAwB;IACvC,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,2BAA2B;IAC1C,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC5B,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,KAAK,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,SAAS,CAAC;IAC/C,WAAW,CAAC,EAAE,0BAA0B,CAAC;IACzC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;CACvB;AAED,MAAM,MAAM,0BAA0B,GAClC,gBAAgB,GAChB,SAAS,GACT,cAAc,GACd,YAAY,GACZ,gBAAgB,GAChB,aAAa,GACb,gBAAgB,GAChB,iBAAiB,GACjB,SAAS,CAAC;AAUd,MAAM,WAAW,4BAA4B;IAC3C,MAAM,EAAE,SAAS,CAAC;IAClB,MAAM,EAAE,qBAAqB,CAAC;IAC9B,UAAU,CAAC,EAAE,qBAAqB,GAAG,IAAI,CAAC;IAC1C,KAAK,CAAC,EAAE,0BAA0B,CAAC;IACnC,KAAK,CAAC,EAAE,wBAAwB,EAAE,CAAC;IACnC,aAAa,CAAC,EAAE,OAAO,4BAA4B,CAAC;IACpD,gBAAgB,CAAC,EAAE,OAAO,mCAAmC,CAAC;IAC9D,uBAAuB,CAAC,EAAE,OAAO,2BAA2B,CAAC;IAC7D,kBAAkB,CAAC,EAAE,CAAC,KAAK,EAAE;QAC3B,MAAM,EAAE,SAAS,CAAC;QAClB,MAAM,EAAE,IAAI,CAAC,2BAA2B,EAAE,WAAW,GAAG,eAAe,CAAC,GAAG;YACzE,aAAa,CAAC,EAAE,2BAA2B,EAAE,CAAC;SAC/C,CAAC;KACH,KAAK,OAAO,CAAC,wBAAwB,GAAG,IAAI,CAAC,CAAC;IAC/C,oBAAoB,CAAC,EAAE,OAAO,CAAC;IAC/B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,GAAG,CAAC,EAAE,MAAM,IAAI,CAAC;CAClB;AAytCD,wBAAgB,8BAA8B,CAAC,KAAK,EAAE;IACpD,cAAc,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,GAAG,SAAS,CAAC;IACnE,sBAAsB,CAAC,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,GAAG,SAAS,CAAC;IAC5E,kBAAkB,CAAC,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,GAAG,SAAS,CAAC;IACxE,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,UAAU,CAAC,EAAE,wBAAwB,CAAC,YAAY,CAAC,CAAC;CACrD,GAAG,0BAA0B,GAAG,IAAI,CA6EpC;AAED,wBAAgB,6BAA6B,CAAC,KAAK,EAAE;IACnD,IAAI,EAAE,0BAA0B,CAAC;IACjC,cAAc,EAAE,qBAAqB,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC;IACvD,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB,GAAG,sBAAsB,CA4CzB;AAmID,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,4BAA4B,GACpC,OAAO,CAAC,2BAA2B,CAAC,CAmetC"}
|
|
@@ -1,12 +1,124 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
1
2
|
import { generateText } from '../llm/router.js';
|
|
2
3
|
import { resolveProviderCapabilities } from '../llm/index.js';
|
|
3
4
|
import { parseJsonObject } from '../vision/parse.js';
|
|
4
5
|
import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
|
|
5
6
|
import { transcribeDocumentImageText } from '../vision/index.js';
|
|
6
7
|
import { recoverDocumentTextHint } from '../vision/ocr.js';
|
|
8
|
+
import { PAGE_EVIDENCE_CACHE_SCHEMA_VERSION, buildPageEvidenceCacheKey, buildPageEvidenceModelVersion, buildPageEvidencePreprocessingVersion, hashBuffer, hashString, readPageEvidenceCache, writePageEvidenceCache, } from './page-evidence-cache.js';
|
|
7
9
|
function uniqueStrings(values) {
|
|
8
10
|
return [...new Set(values.filter((value) => typeof value === 'string' && value.trim().length > 0))];
|
|
9
11
|
}
|
|
12
|
+
function isRecord(value) {
|
|
13
|
+
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
|
14
|
+
}
|
|
15
|
+
function isParseStatus(value) {
|
|
16
|
+
return value === 'parsed' || value === 'partial' || value === 'failed';
|
|
17
|
+
}
|
|
18
|
+
function asCachedGeotechDocumentInsight(value) {
|
|
19
|
+
if (!isRecord(value)) {
|
|
20
|
+
return null;
|
|
21
|
+
}
|
|
22
|
+
if (!isParseStatus(value.parseStatus)
|
|
23
|
+
|| typeof value.confidence !== 'number'
|
|
24
|
+
|| !Array.isArray(value.warnings)
|
|
25
|
+
|| !Array.isArray(value.materials)
|
|
26
|
+
|| !Array.isArray(value.classifications)
|
|
27
|
+
|| !Array.isArray(value.parameters)
|
|
28
|
+
|| !Array.isArray(value.risks)
|
|
29
|
+
|| !Array.isArray(value.recommendations)) {
|
|
30
|
+
return null;
|
|
31
|
+
}
|
|
32
|
+
return value;
|
|
33
|
+
}
|
|
34
|
+
function resolveSourceFileHash(source, pages) {
|
|
35
|
+
const sourceFilePath = source.filePath ?? pages?.find((page) => page.filePath)?.filePath;
|
|
36
|
+
if (sourceFilePath) {
|
|
37
|
+
try {
|
|
38
|
+
return hashBuffer(readFileSync(sourceFilePath));
|
|
39
|
+
}
|
|
40
|
+
catch {
|
|
41
|
+
// Fall through to a stable metadata hash when the source is a synthetic test input.
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
return hashString(JSON.stringify({
|
|
45
|
+
filePath: source.filePath ?? null,
|
|
46
|
+
fileName: source.fileName ?? null,
|
|
47
|
+
inputKind: source.inputKind,
|
|
48
|
+
pageRange: source.pageRange ?? null,
|
|
49
|
+
}));
|
|
50
|
+
}
|
|
51
|
+
function resolveEvidenceCachePageNumber(source, page) {
|
|
52
|
+
const pageRange = source.segmentation?.pageRange ?? source.pageRange;
|
|
53
|
+
if (pageRange && page.pageNumber < pageRange[0]) {
|
|
54
|
+
return pageRange[0] + page.pageNumber - 1;
|
|
55
|
+
}
|
|
56
|
+
return page.pageNumber;
|
|
57
|
+
}
|
|
58
|
+
function buildPageEvidenceCacheContext(input) {
|
|
59
|
+
const pageNumber = resolveEvidenceCachePageNumber(input.source, input.page);
|
|
60
|
+
const pageHash = hashString(`${input.page.mimeType}\n${input.page.base64}`);
|
|
61
|
+
const parts = {
|
|
62
|
+
fileHash: input.fileHash,
|
|
63
|
+
pageHash,
|
|
64
|
+
pageNumber,
|
|
65
|
+
modelVersion: input.modelVersion,
|
|
66
|
+
preprocessingVersion: input.preprocessingVersion,
|
|
67
|
+
schemaVersion: PAGE_EVIDENCE_CACHE_SCHEMA_VERSION,
|
|
68
|
+
};
|
|
69
|
+
const cacheKey = buildPageEvidenceCacheKey(parts);
|
|
70
|
+
return {
|
|
71
|
+
parts,
|
|
72
|
+
cacheKey,
|
|
73
|
+
entryId: cacheKey.slice(0, 12),
|
|
74
|
+
pageNumber,
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
function buildPageEvidenceCacheAudit(context, status, entry, reason) {
|
|
78
|
+
return {
|
|
79
|
+
status,
|
|
80
|
+
entryId: context.entryId,
|
|
81
|
+
cacheKey: context.cacheKey,
|
|
82
|
+
fileHash: context.parts.fileHash,
|
|
83
|
+
pageHash: context.parts.pageHash,
|
|
84
|
+
pageNumber: context.pageNumber,
|
|
85
|
+
modelVersion: context.parts.modelVersion,
|
|
86
|
+
preprocessingVersion: context.parts.preprocessingVersion,
|
|
87
|
+
schemaVersion: context.parts.schemaVersion ?? PAGE_EVIDENCE_CACHE_SCHEMA_VERSION,
|
|
88
|
+
createdAt: entry?.createdAt,
|
|
89
|
+
reason,
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
function safeReadPageEvidenceCache(context) {
|
|
93
|
+
try {
|
|
94
|
+
return readPageEvidenceCache(context.parts);
|
|
95
|
+
}
|
|
96
|
+
catch {
|
|
97
|
+
return null;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
function safeWritePageEvidenceCache(context, evidence, now) {
|
|
101
|
+
try {
|
|
102
|
+
return {
|
|
103
|
+
entry: writePageEvidenceCache(context.parts, evidence, { now }),
|
|
104
|
+
};
|
|
105
|
+
}
|
|
106
|
+
catch (error) {
|
|
107
|
+
return {
|
|
108
|
+
entry: null,
|
|
109
|
+
error: error instanceof Error ? error.message : String(error),
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
function shouldUsePageEvidenceCache(option) {
|
|
114
|
+
if (option === true) {
|
|
115
|
+
return true;
|
|
116
|
+
}
|
|
117
|
+
if (option === false) {
|
|
118
|
+
return false;
|
|
119
|
+
}
|
|
120
|
+
return process.env.NODE_ENV !== 'test';
|
|
121
|
+
}
|
|
10
122
|
function normalizeTextItems(value, limit = 8) {
|
|
11
123
|
if (!Array.isArray(value)) {
|
|
12
124
|
return [];
|
|
@@ -1039,6 +1151,16 @@ export async function ingestGeotechDocument(options) {
|
|
|
1039
1151
|
const pageConcurrency = shouldSeriallyProcessImageHeavyPages(options.config, options.inspection)
|
|
1040
1152
|
? 1
|
|
1041
1153
|
: resolvePageConcurrency(options.config, options.pageConcurrency);
|
|
1154
|
+
const usePageEvidenceCache = shouldUsePageEvidenceCache(options.usePageEvidenceCache);
|
|
1155
|
+
const pageEvidenceFileHash = usePageEvidenceCache
|
|
1156
|
+
? resolveSourceFileHash(options.source, options.pages)
|
|
1157
|
+
: null;
|
|
1158
|
+
const pageEvidenceModelVersion = usePageEvidenceCache
|
|
1159
|
+
? buildPageEvidenceModelVersion(options.config)
|
|
1160
|
+
: '';
|
|
1161
|
+
const pageEvidencePreprocessingVersion = usePageEvidenceCache
|
|
1162
|
+
? buildPageEvidencePreprocessingVersion(options.config)
|
|
1163
|
+
: '';
|
|
1042
1164
|
if (options.pages && options.pages.length > 0) {
|
|
1043
1165
|
const pages = [...options.pages].sort((left, right) => left.pageNumber - right.pageNumber);
|
|
1044
1166
|
const settledPages = await mapWithConcurrency(pages, pageConcurrency, async (page) => {
|
|
@@ -1055,6 +1177,19 @@ export async function ingestGeotechDocument(options) {
|
|
|
1055
1177
|
?? inspectionPage?.normalizedText
|
|
1056
1178
|
?? undefined;
|
|
1057
1179
|
let textHintSource = pageTextHint?.trim() ? 'native-text' : 'none';
|
|
1180
|
+
const cacheContext = usePageEvidenceCache && pageEvidenceFileHash
|
|
1181
|
+
? buildPageEvidenceCacheContext({
|
|
1182
|
+
source: options.source,
|
|
1183
|
+
page,
|
|
1184
|
+
fileHash: pageEvidenceFileHash,
|
|
1185
|
+
modelVersion: pageEvidenceModelVersion,
|
|
1186
|
+
preprocessingVersion: pageEvidencePreprocessingVersion,
|
|
1187
|
+
})
|
|
1188
|
+
: null;
|
|
1189
|
+
const cachedEvidence = cacheContext ? safeReadPageEvidenceCache(cacheContext) : null;
|
|
1190
|
+
let evidenceCache = cacheContext
|
|
1191
|
+
? buildPageEvidenceCacheAudit(cacheContext, cachedEvidence ? 'hit' : 'miss', cachedEvidence)
|
|
1192
|
+
: undefined;
|
|
1058
1193
|
try {
|
|
1059
1194
|
if (lowYieldRole && inspectionPage) {
|
|
1060
1195
|
return {
|
|
@@ -1068,6 +1203,9 @@ export async function ingestGeotechDocument(options) {
|
|
|
1068
1203
|
: 'Figure/appendix page was summarized without a full multimodal extraction call.',
|
|
1069
1204
|
],
|
|
1070
1205
|
ocrRecovered: false,
|
|
1206
|
+
evidenceCache: cacheContext
|
|
1207
|
+
? buildPageEvidenceCacheAudit(cacheContext, 'skipped', null, 'low-yield page classified before model extraction')
|
|
1208
|
+
: evidenceCache,
|
|
1071
1209
|
result: buildPreflightLowYieldInsight({
|
|
1072
1210
|
role: lowYieldRole,
|
|
1073
1211
|
inspectionPage,
|
|
@@ -1076,6 +1214,24 @@ export async function ingestGeotechDocument(options) {
|
|
|
1076
1214
|
}),
|
|
1077
1215
|
};
|
|
1078
1216
|
}
|
|
1217
|
+
const cachedResult = asCachedGeotechDocumentInsight(cachedEvidence?.extractionResult);
|
|
1218
|
+
if (cachedResult) {
|
|
1219
|
+
return {
|
|
1220
|
+
ok: true,
|
|
1221
|
+
pageNumber: page.pageNumber,
|
|
1222
|
+
inspectionPage,
|
|
1223
|
+
textHintSource: cachedEvidence.source,
|
|
1224
|
+
recoveryWarnings: cachedEvidence.warnings,
|
|
1225
|
+
ocrRecovered: cachedEvidence.source === 'local-ocr' || cachedEvidence.source === 'vision-ocr' || cachedEvidence.source === 'glm-ocr',
|
|
1226
|
+
evidenceCache,
|
|
1227
|
+
result: cachedResult,
|
|
1228
|
+
};
|
|
1229
|
+
}
|
|
1230
|
+
const cachedTextHint = cachedEvidence?.textHint;
|
|
1231
|
+
if (cachedTextHint) {
|
|
1232
|
+
pageTextHint = cachedTextHint;
|
|
1233
|
+
textHintSource = cachedEvidence.source;
|
|
1234
|
+
}
|
|
1079
1235
|
const pageTimeoutMs = resolvePagePhaseTimeoutMs(options.config, {
|
|
1080
1236
|
classification: inspectionPage?.classification,
|
|
1081
1237
|
sourceKind: page.sourceKind,
|
|
@@ -1089,7 +1245,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1089
1245
|
page,
|
|
1090
1246
|
inspectionPage,
|
|
1091
1247
|
textHint: pageTextHint,
|
|
1092
|
-
})) {
|
|
1248
|
+
}) && !cachedTextHint) {
|
|
1093
1249
|
const context = {
|
|
1094
1250
|
pageNumber: page.pageNumber,
|
|
1095
1251
|
totalPages: page.totalPages,
|
|
@@ -1097,6 +1253,17 @@ export async function ingestGeotechDocument(options) {
|
|
|
1097
1253
|
directVisualPreferred: true,
|
|
1098
1254
|
};
|
|
1099
1255
|
const result = await withPageTimeout(interpretPage(page.base64, page.mimeType, pagePhaseConfig, context), pageTimeoutMs, `Page ${page.pageNumber}: direct visual page interpretation timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
|
|
1256
|
+
if (cacheContext) {
|
|
1257
|
+
const stored = safeWritePageEvidenceCache(cacheContext, {
|
|
1258
|
+
source: 'vision-visual',
|
|
1259
|
+
warnings: ['Skipped OCR-only transcription and used direct visual extraction for an image-heavy hosted-beta page.'],
|
|
1260
|
+
transformed: false,
|
|
1261
|
+
extractionResult: result,
|
|
1262
|
+
}, now);
|
|
1263
|
+
evidenceCache = stored.entry
|
|
1264
|
+
? buildPageEvidenceCacheAudit(cacheContext, 'stored', stored.entry)
|
|
1265
|
+
: buildPageEvidenceCacheAudit(cacheContext, 'skipped', null, `cache write failed: ${stored.error}`);
|
|
1266
|
+
}
|
|
1100
1267
|
return {
|
|
1101
1268
|
ok: true,
|
|
1102
1269
|
pageNumber: page.pageNumber,
|
|
@@ -1104,23 +1271,36 @@ export async function ingestGeotechDocument(options) {
|
|
|
1104
1271
|
textHintSource: 'vision-visual',
|
|
1105
1272
|
recoveryWarnings: ['Skipped OCR-only transcription and used direct visual extraction for an image-heavy hosted-beta page.'],
|
|
1106
1273
|
ocrRecovered: false,
|
|
1274
|
+
evidenceCache,
|
|
1107
1275
|
result,
|
|
1108
1276
|
};
|
|
1109
1277
|
}
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1278
|
+
let recoveryWarnings = cachedEvidence?.warnings ?? [];
|
|
1279
|
+
let recoverySource = textHintSource;
|
|
1280
|
+
let recoveryTransformed = cachedEvidence?.transformed ?? false;
|
|
1281
|
+
let layoutSummary = cachedEvidence?.layoutSummary;
|
|
1282
|
+
if (!cachedTextHint) {
|
|
1283
|
+
const recovery = await withPageTimeout(recoverDocumentTextHint({
|
|
1284
|
+
existingTextHint: pageTextHint,
|
|
1285
|
+
existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
|
|
1286
|
+
imageBase64: page.base64,
|
|
1287
|
+
mimeType: page.mimeType,
|
|
1288
|
+
config: pagePhaseConfig,
|
|
1289
|
+
pdfFilePath: page.filePath,
|
|
1290
|
+
pdfPageNumber: page.pageNumber,
|
|
1291
|
+
visionTranscribe: transcribePageImageText,
|
|
1292
|
+
}), pageTimeoutMs, `Page ${page.pageNumber}: OCR/text recovery timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
|
|
1293
|
+
if (recovery.textHint) {
|
|
1294
|
+
pageTextHint = recovery.textHint;
|
|
1295
|
+
}
|
|
1296
|
+
recoveryWarnings = recovery.warnings;
|
|
1297
|
+
recoverySource = recovery.source;
|
|
1298
|
+
recoveryTransformed = recovery.transformed;
|
|
1299
|
+
layoutSummary = recovery.layout
|
|
1300
|
+
? `GLM-OCR parsed ${recovery.layout.pages.length} page(s), ${recovery.layout.pages.reduce((sum, layoutPage) => sum + layoutPage.tables.length, 0)} table(s).`
|
|
1301
|
+
: undefined;
|
|
1122
1302
|
}
|
|
1123
|
-
textHintSource =
|
|
1303
|
+
textHintSource = recoverySource;
|
|
1124
1304
|
const context = {
|
|
1125
1305
|
pageNumber: page.pageNumber,
|
|
1126
1306
|
totalPages: page.totalPages,
|
|
@@ -1136,13 +1316,38 @@ export async function ingestGeotechDocument(options) {
|
|
|
1136
1316
|
const result = pageTextHint
|
|
1137
1317
|
? await withPageTimeout(extractTextFacts(pageTextHint, extractionConfig, context), extractionTimeoutMs, `Page ${page.pageNumber}: text extraction timed out after ${Math.round(extractionTimeoutMs / 1000)}s`)
|
|
1138
1318
|
: await withPageTimeout(interpretPage(page.base64, page.mimeType, pagePhaseConfig, context), pageTimeoutMs, `Page ${page.pageNumber}: visual page interpretation timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
|
|
1319
|
+
if (cacheContext && !cachedEvidence) {
|
|
1320
|
+
const stored = safeWritePageEvidenceCache(cacheContext, {
|
|
1321
|
+
textHint: pageTextHint,
|
|
1322
|
+
source: textHintSource,
|
|
1323
|
+
warnings: recoveryWarnings,
|
|
1324
|
+
transformed: recoveryTransformed,
|
|
1325
|
+
layoutSummary,
|
|
1326
|
+
extractionResult: result,
|
|
1327
|
+
}, now);
|
|
1328
|
+
evidenceCache = stored.entry
|
|
1329
|
+
? buildPageEvidenceCacheAudit(cacheContext, 'stored', stored.entry)
|
|
1330
|
+
: buildPageEvidenceCacheAudit(cacheContext, 'skipped', null, `cache write failed: ${stored.error}`);
|
|
1331
|
+
}
|
|
1332
|
+
else if (cacheContext && cachedEvidence && !cachedResult) {
|
|
1333
|
+
safeWritePageEvidenceCache(cacheContext, {
|
|
1334
|
+
textHint: pageTextHint,
|
|
1335
|
+
source: textHintSource,
|
|
1336
|
+
warnings: recoveryWarnings,
|
|
1337
|
+
transformed: recoveryTransformed,
|
|
1338
|
+
layoutSummary,
|
|
1339
|
+
extractionResult: result,
|
|
1340
|
+
createdAt: cachedEvidence.createdAt,
|
|
1341
|
+
}, now);
|
|
1342
|
+
}
|
|
1139
1343
|
return {
|
|
1140
1344
|
ok: true,
|
|
1141
1345
|
pageNumber: page.pageNumber,
|
|
1142
1346
|
inspectionPage,
|
|
1143
1347
|
textHintSource,
|
|
1144
|
-
recoveryWarnings
|
|
1145
|
-
ocrRecovered:
|
|
1348
|
+
recoveryWarnings,
|
|
1349
|
+
ocrRecovered: textHintSource === 'local-ocr' || textHintSource === 'vision-ocr' || textHintSource === 'glm-ocr',
|
|
1350
|
+
evidenceCache,
|
|
1146
1351
|
result,
|
|
1147
1352
|
};
|
|
1148
1353
|
}
|
|
@@ -1152,6 +1357,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1152
1357
|
pageNumber: page.pageNumber,
|
|
1153
1358
|
inspectionPage,
|
|
1154
1359
|
textHintSource,
|
|
1360
|
+
evidenceCache,
|
|
1155
1361
|
error: normalizePageErrorMessage(error instanceof Error ? error.message : String(error)),
|
|
1156
1362
|
};
|
|
1157
1363
|
}
|
|
@@ -1181,6 +1387,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1181
1387
|
materialCount: settled.result.materials.length,
|
|
1182
1388
|
classificationCount: settled.result.classifications.length,
|
|
1183
1389
|
parameterCount: settled.result.parameters.length,
|
|
1390
|
+
evidenceCache: settled.evidenceCache,
|
|
1184
1391
|
warnings: uniqueStrings([
|
|
1185
1392
|
...settled.recoveryWarnings,
|
|
1186
1393
|
...settled.result.warnings,
|
|
@@ -1198,6 +1405,7 @@ export async function ingestGeotechDocument(options) {
|
|
|
1198
1405
|
materialCount: 0,
|
|
1199
1406
|
classificationCount: 0,
|
|
1200
1407
|
parameterCount: 0,
|
|
1408
|
+
evidenceCache: settled.evidenceCache,
|
|
1201
1409
|
warnings: [settled.error],
|
|
1202
1410
|
});
|
|
1203
1411
|
}
|