@geotechcli/core 0.4.21 → 0.4.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/dist/agents/brain.d.ts +1 -5
  2. package/dist/agents/brain.d.ts.map +1 -1
  3. package/dist/agents/brain.js +4 -120
  4. package/dist/agents/brain.js.map +1 -1
  5. package/dist/agents/data-tools.js +759 -0
  6. package/dist/agents/data-tools.js.map +1 -1
  7. package/dist/agents/runtime-bootstrap.d.ts +6 -0
  8. package/dist/agents/runtime-bootstrap.d.ts.map +1 -0
  9. package/dist/agents/runtime-bootstrap.js +8 -0
  10. package/dist/agents/runtime-bootstrap.js.map +1 -0
  11. package/dist/agents/runtime-fallbacks.d.ts +7 -0
  12. package/dist/agents/runtime-fallbacks.d.ts.map +1 -0
  13. package/dist/agents/runtime-fallbacks.js +87 -0
  14. package/dist/agents/runtime-fallbacks.js.map +1 -0
  15. package/dist/agents/swarm.d.ts +1 -4
  16. package/dist/agents/swarm.d.ts.map +1 -1
  17. package/dist/agents/swarm.js +74 -8
  18. package/dist/agents/swarm.js.map +1 -1
  19. package/dist/agents/tool-runtime.d.ts +7 -0
  20. package/dist/agents/tool-runtime.d.ts.map +1 -0
  21. package/dist/agents/tool-runtime.js +9 -0
  22. package/dist/agents/tool-runtime.js.map +1 -0
  23. package/dist/config/index.d.ts +4 -4
  24. package/dist/config/index.js +1 -1
  25. package/dist/config/index.js.map +1 -1
  26. package/dist/geo/coordinates.d.ts +40 -0
  27. package/dist/geo/coordinates.d.ts.map +1 -0
  28. package/dist/geo/coordinates.js +461 -0
  29. package/dist/geo/coordinates.js.map +1 -0
  30. package/dist/geo/index.d.ts +1 -0
  31. package/dist/geo/index.d.ts.map +1 -1
  32. package/dist/geo/index.js +1 -0
  33. package/dist/geo/index.js.map +1 -1
  34. package/dist/index.d.ts +3 -2
  35. package/dist/index.d.ts.map +1 -1
  36. package/dist/index.js +3 -2
  37. package/dist/index.js.map +1 -1
  38. package/dist/ingest/ags.d.ts +3 -0
  39. package/dist/ingest/ags.d.ts.map +1 -1
  40. package/dist/ingest/ags.js +98 -9
  41. package/dist/ingest/ags.js.map +1 -1
  42. package/dist/ingest/cpt.d.ts +4 -0
  43. package/dist/ingest/cpt.d.ts.map +1 -1
  44. package/dist/ingest/cpt.js +87 -25
  45. package/dist/ingest/cpt.js.map +1 -1
  46. package/dist/ingest/document-inputs.d.ts +37 -0
  47. package/dist/ingest/document-inputs.d.ts.map +1 -0
  48. package/dist/ingest/document-inputs.js +197 -0
  49. package/dist/ingest/document-inputs.js.map +1 -0
  50. package/dist/ingest/geotech-document.d.ts +118 -0
  51. package/dist/ingest/geotech-document.d.ts.map +1 -0
  52. package/dist/ingest/geotech-document.js +1006 -0
  53. package/dist/ingest/geotech-document.js.map +1 -0
  54. package/dist/ingest/geotech-extract.d.ts +86 -0
  55. package/dist/ingest/geotech-extract.d.ts.map +1 -0
  56. package/dist/ingest/geotech-extract.js +652 -0
  57. package/dist/ingest/geotech-extract.js.map +1 -0
  58. package/dist/ingest/geotech-schemas.d.ts +248 -0
  59. package/dist/ingest/geotech-schemas.d.ts.map +1 -0
  60. package/dist/ingest/geotech-schemas.js +150 -0
  61. package/dist/ingest/geotech-schemas.js.map +1 -0
  62. package/dist/ingest/index.d.ts +8 -0
  63. package/dist/ingest/index.d.ts.map +1 -1
  64. package/dist/ingest/index.js +8 -0
  65. package/dist/ingest/index.js.map +1 -1
  66. package/dist/ingest/ingest-job-child.d.ts +2 -0
  67. package/dist/ingest/ingest-job-child.d.ts.map +1 -0
  68. package/dist/ingest/ingest-job-child.js +45 -0
  69. package/dist/ingest/ingest-job-child.js.map +1 -0
  70. package/dist/ingest/job-store.d.ts +117 -0
  71. package/dist/ingest/job-store.d.ts.map +1 -0
  72. package/dist/ingest/job-store.js +541 -0
  73. package/dist/ingest/job-store.js.map +1 -0
  74. package/dist/ingest/job-worker.d.ts +24 -0
  75. package/dist/ingest/job-worker.d.ts.map +1 -0
  76. package/dist/ingest/job-worker.js +1129 -0
  77. package/dist/ingest/job-worker.js.map +1 -0
  78. package/dist/ingest/pdf.d.ts +102 -0
  79. package/dist/ingest/pdf.d.ts.map +1 -0
  80. package/dist/ingest/pdf.js +1544 -0
  81. package/dist/ingest/pdf.js.map +1 -0
  82. package/dist/ingest/review-store.d.ts +215 -0
  83. package/dist/ingest/review-store.d.ts.map +1 -0
  84. package/dist/ingest/review-store.js +1995 -0
  85. package/dist/ingest/review-store.js.map +1 -0
  86. package/dist/llm/capabilities.d.ts +8 -0
  87. package/dist/llm/capabilities.d.ts.map +1 -0
  88. package/dist/llm/capabilities.js +73 -0
  89. package/dist/llm/capabilities.js.map +1 -0
  90. package/dist/llm/index.d.ts +3 -2
  91. package/dist/llm/index.d.ts.map +1 -1
  92. package/dist/llm/index.js +2 -1
  93. package/dist/llm/index.js.map +1 -1
  94. package/dist/llm/providers/anthropic.d.ts +6 -0
  95. package/dist/llm/providers/anthropic.d.ts.map +1 -1
  96. package/dist/llm/providers/anthropic.js +10 -1
  97. package/dist/llm/providers/anthropic.js.map +1 -1
  98. package/dist/llm/providers/hosted-beta.d.ts +6 -0
  99. package/dist/llm/providers/hosted-beta.d.ts.map +1 -1
  100. package/dist/llm/providers/hosted-beta.js +40 -10
  101. package/dist/llm/providers/hosted-beta.js.map +1 -1
  102. package/dist/llm/providers/huggingface.d.ts +6 -0
  103. package/dist/llm/providers/huggingface.d.ts.map +1 -1
  104. package/dist/llm/providers/huggingface.js +21 -1
  105. package/dist/llm/providers/huggingface.js.map +1 -1
  106. package/dist/llm/providers/openai-compatible.d.ts +6 -0
  107. package/dist/llm/providers/openai-compatible.d.ts.map +1 -1
  108. package/dist/llm/providers/openai-compatible.js +21 -1
  109. package/dist/llm/providers/openai-compatible.js.map +1 -1
  110. package/dist/llm/providers/zhipu.d.ts +6 -0
  111. package/dist/llm/providers/zhipu.d.ts.map +1 -1
  112. package/dist/llm/providers/zhipu.js +15 -1
  113. package/dist/llm/providers/zhipu.js.map +1 -1
  114. package/dist/llm/router.d.ts +7 -0
  115. package/dist/llm/router.d.ts.map +1 -1
  116. package/dist/llm/router.js +33 -13
  117. package/dist/llm/router.js.map +1 -1
  118. package/dist/llm/types.d.ts +22 -4
  119. package/dist/llm/types.d.ts.map +1 -1
  120. package/dist/llm/types.js.map +1 -1
  121. package/dist/meta/metadata.json +1 -1
  122. package/dist/report/html.d.ts +3 -0
  123. package/dist/report/html.d.ts.map +1 -0
  124. package/dist/report/html.js +626 -0
  125. package/dist/report/html.js.map +1 -0
  126. package/dist/report/index.d.ts +2 -0
  127. package/dist/report/index.d.ts.map +1 -1
  128. package/dist/report/index.js +2 -0
  129. package/dist/report/index.js.map +1 -1
  130. package/dist/report/ingest-dossier.d.ts +81 -0
  131. package/dist/report/ingest-dossier.d.ts.map +1 -0
  132. package/dist/report/ingest-dossier.js +324 -0
  133. package/dist/report/ingest-dossier.js.map +1 -0
  134. package/dist/storage/index.d.ts +5 -0
  135. package/dist/storage/index.d.ts.map +1 -1
  136. package/dist/storage/index.js +12 -6
  137. package/dist/storage/index.js.map +1 -1
  138. package/dist/vision/geotech-document.d.ts +46 -0
  139. package/dist/vision/geotech-document.d.ts.map +1 -0
  140. package/dist/vision/geotech-document.js +576 -0
  141. package/dist/vision/geotech-document.js.map +1 -0
  142. package/dist/vision/index.d.ts +31 -0
  143. package/dist/vision/index.d.ts.map +1 -1
  144. package/dist/vision/index.js +659 -27
  145. package/dist/vision/index.js.map +1 -1
  146. package/dist/vision/ocr.d.ts +29 -0
  147. package/dist/vision/ocr.d.ts.map +1 -0
  148. package/dist/vision/ocr.js +287 -0
  149. package/dist/vision/ocr.js.map +1 -0
  150. package/dist/vision/preprocess.d.ts +26 -0
  151. package/dist/vision/preprocess.d.ts.map +1 -0
  152. package/dist/vision/preprocess.js +194 -0
  153. package/dist/vision/preprocess.js.map +1 -0
  154. package/package.json +5 -1
@@ -0,0 +1,1006 @@
1
+ import { resolveProviderCapabilities } from '../llm/index.js';
2
+ import { extractGeotechDocumentFactsFromText, interpretGeotechDocumentPage, } from '../vision/geotech-document.js';
3
+ import { transcribeDocumentImageText } from '../vision/index.js';
4
+ import { recoverDocumentTextHint } from '../vision/ocr.js';
5
/**
 * Collects the distinct, non-blank strings found in `values`.
 *
 * Entries that are not strings, or that contain only whitespace, are dropped.
 * Surviving strings are kept untrimmed and in first-seen order.
 *
 * @param {Array<unknown>} values - Candidate values.
 * @returns {string[]} De-duplicated strings.
 */
function uniqueStrings(values) {
    const kept = new Set();
    for (const candidate of values) {
        if (typeof candidate !== 'string') {
            continue;
        }
        if (candidate.trim().length === 0) {
            continue;
        }
        kept.add(candidate);
    }
    return Array.from(kept);
}
8
/**
 * Builds the pipe-delimited identity string used to de-duplicate findings.
 *
 * The key is made of severity, scope, code, message, page number and material
 * description, in that order. A missing page number or material description
 * contributes an empty segment.
 *
 * @param {object} finding - Finding record.
 * @returns {string} Identity key.
 */
function createFindingKey(finding) {
    const { severity, scope, code, message, pageNumber, materialDescription } = finding;
    const segments = [
        severity,
        scope,
        code,
        message,
        pageNumber ?? '',
        materialDescription ?? '',
    ];
    // Array#join renders null/undefined segments as empty strings.
    return segments.join('|');
}
18
/**
 * Removes duplicate findings, keeping the first occurrence of each.
 *
 * Two findings are duplicates when `createFindingKey` yields the same key.
 *
 * @param {Iterable<object>} findings - Findings in priority order.
 * @returns {object[]} Findings with duplicates removed, original order kept.
 */
function uniqueFindings(findings) {
    const seenKeys = new Set();
    return Array.from(findings).filter((finding) => {
        const key = createFindingKey(finding);
        if (seenKeys.has(key)) {
            return false;
        }
        seenKeys.add(key);
        return true;
    });
}
31
/**
 * Tells whether a finding needs review: every severity except `'advisory'` does.
 *
 * @param {{ severity: string }} finding - Finding record.
 * @returns {boolean} `true` unless the finding is advisory.
 */
function findingRequiresReview(finding) {
    const isAdvisory = finding.severity === 'advisory';
    return !isAdvisory;
}
34
/**
 * Lists the distinct messages of all findings that require review.
 *
 * @param {object[]} findings - Findings to summarize.
 * @returns {string[]} Unique, non-blank messages of non-advisory findings.
 */
function summarizeReviewReasons(findings) {
    const reviewMessages = findings.reduce((messages, finding) => {
        if (findingRequiresReview(finding)) {
            messages.push(finding.message);
        }
        return messages;
    }, []);
    return uniqueStrings(reviewMessages);
}
39
/**
 * Strips leading "Page N:" prefixes (case-insensitive) from an error message.
 *
 * At most four stacked prefixes are removed; any further ones are left alone.
 *
 * @param {string} message - Raw error message.
 * @returns {string} Trimmed message without the leading page prefixes.
 */
function normalizePageErrorMessage(message) {
    const pagePrefix = /^Page \d+:\s*/i;
    let text = message.trim();
    let passesLeft = 4;
    while (passesLeft > 0 && pagePrefix.test(text)) {
        text = text.replace(pagePrefix, '').trim();
        passesLeft -= 1;
    }
    return text;
}
50
/**
 * Aggregates per-page inspection data into document-level counters.
 *
 * @param {object|null|undefined} inspection - Inspection result with a `pages` list.
 * @param {number} [ocrRecoveredPageCount=0] - Passed through unchanged into the summary.
 * @returns {object|null} `null` when there is no inspection; otherwise the counts of
 *   pages per classification, image-heavy pages (`image-only` / `text-unreadable`),
 *   pages whose native text extraction is not `'unavailable'`, and pages whose
 *   degradation level is not `'none'`.
 */
function summarizeInspection(inspection, ocrRecoveredPageCount = 0) {
    if (!inspection) {
        return null;
    }
    const pageClassificationCounts = {};
    const totals = { imageHeavy: 0, nativeText: 0, degraded: 0 };
    for (const page of inspection.pages) {
        const { classification } = page;
        const seenSoFar = pageClassificationCounts[classification];
        pageClassificationCounts[classification] = (seenSoFar == null ? 0 : seenSoFar) + 1;
        const isImageHeavy = classification === 'image-only' || classification === 'text-unreadable';
        if (isImageHeavy) {
            totals.imageHeavy += 1;
        }
        if (page.capabilities.nativeTextExtraction !== 'unavailable') {
            totals.nativeText += 1;
        }
        if (page.degradation.level !== 'none') {
            totals.degraded += 1;
        }
    }
    return {
        pageClassificationCounts,
        imageHeavyPageCount: totals.imageHeavy,
        nativeTextPageCount: totals.nativeText,
        degradedPageCount: totals.degraded,
        ocrRecoveredPageCount,
    };
}
78
/**
 * Gathers the inspection's own warnings plus one extra warning for every page
 * classified as `image-only` or `text-unreadable`.
 *
 * @param {object|null|undefined} inspection - Inspection result with `warnings` and `pages`.
 * @returns {string[]} De-duplicated warnings; empty when there is no inspection.
 */
function buildInspectionWarnings(inspection) {
    if (!inspection) {
        return [];
    }
    const collected = Array.from(inspection.warnings);
    for (const { classification, pageNumber } of inspection.pages) {
        const lacksNativeText = classification === 'image-only' || classification === 'text-unreadable';
        if (!lacksNativeText) {
            continue;
        }
        collected.push(`PDF page ${pageNumber} is ${classification}. Native text was not recovered from the PDF parser, so OCR-style transcription may be needed.`);
    }
    return uniqueStrings(collected);
}
90
/**
 * Reduces a list of per-page parse statuses to one document-level status.
 *
 * @param {string[]} statuses - Per-page statuses.
 * @returns {'parsed'|'partial'|'failed'} `'failed'` for an empty list or when every
 *   entry is `'failed'`; `'parsed'` when every entry is `'parsed'`; otherwise `'partial'`.
 */
function mergeParseStatus(statuses) {
    if (statuses.length === 0) {
        return 'failed';
    }
    const hasUnparsed = statuses.some((status) => status !== 'parsed');
    if (!hasUnparsed) {
        return 'parsed';
    }
    const allFailed = statuses.every((status) => status === 'failed');
    return allFailed ? 'failed' : 'partial';
}
102
/**
 * Flattens the `materials` of every result into one list without duplicates.
 *
 * Materials are considered equal when kind, lower-cased description, USCS symbol
 * and lower-cased lithology all match. The first occurrence wins.
 *
 * @param {Array<{ materials: object[] }>} results - Per-page results.
 * @returns {object[]} Distinct materials in first-seen order.
 */
function mergeMaterials(results) {
    const firstByKey = new Map();
    for (const { materials } of results) {
        for (const material of materials) {
            const identity = [
                material.kind,
                material.description.toLowerCase(),
                material.uscsSymbol ?? '',
                material.lithology?.toLowerCase() ?? '',
            ].join('|');
            if (!firstByKey.has(identity)) {
                firstByKey.set(identity, material);
            }
        }
    }
    return [...firstByKey.values()];
}
122
/**
 * Flattens the `classifications` of every result into one list without duplicates.
 *
 * Entries are considered equal when system, value and context match
 * case-insensitively. The first occurrence wins.
 *
 * @param {Array<{ classifications: object[] }>} results - Per-page results.
 * @returns {object[]} Distinct classifications in first-seen order.
 */
function mergeClassifications(results) {
    const firstByKey = new Map();
    for (const { classifications } of results) {
        for (const classification of classifications) {
            const identity = [
                classification.system.toLowerCase(),
                classification.value.toLowerCase(),
                classification.context?.toLowerCase() ?? '',
            ].join('|');
            if (!firstByKey.has(identity)) {
                firstByKey.set(identity, classification);
            }
        }
    }
    return [...firstByKey.values()];
}
141
/**
 * Flattens the `parameters` of every result into one list without duplicates.
 *
 * Entries are considered equal when name, value text, unit, material and context
 * match case-insensitively. The first occurrence wins.
 *
 * @param {Array<{ parameters: object[] }>} results - Per-page results.
 * @returns {object[]} Distinct parameters in first-seen order.
 */
function mergeParameters(results) {
    const firstByKey = new Map();
    for (const { parameters } of results) {
        for (const parameter of parameters) {
            const identity = [
                parameter.name.toLowerCase(),
                parameter.valueText.toLowerCase(),
                parameter.unit?.toLowerCase() ?? '',
                parameter.material?.toLowerCase() ?? '',
                parameter.context?.toLowerCase() ?? '',
            ].join('|');
            if (!firstByKey.has(identity)) {
                firstByKey.set(identity, parameter);
            }
        }
    }
    return [...firstByKey.values()];
}
162
/**
 * Decides how many pages are processed in parallel.
 *
 * A finite `requestedConcurrency` is truncated to an integer and clamped to 1..4.
 * Otherwise the `hosted-beta` provider gets 2, and any other provider gets 3 when
 * its resolved capabilities report `visionImages`, else 2.
 *
 * @param {object} config - LLM configuration (reads `provider`).
 * @param {number|null|undefined} requestedConcurrency - Explicit caller override.
 * @returns {number} Page concurrency.
 */
function resolvePageConcurrency(config, requestedConcurrency) {
    const hasExplicitRequest = requestedConcurrency != null && Number.isFinite(requestedConcurrency);
    if (hasExplicitRequest) {
        const wholeNumber = Math.trunc(requestedConcurrency);
        return Math.min(4, Math.max(1, wholeNumber));
    }
    const capabilities = resolveProviderCapabilities(config);
    const isHostedBeta = config.provider === 'hosted-beta';
    return !isHostedBeta && capabilities.visionImages ? 3 : 2;
}
172
/**
 * Maps `items` through an async `mapper` with a bounded number of concurrent calls.
 *
 * Workers pull the next index from a shared counter, so results land at the same
 * index as their input regardless of completion order. The first mapper rejection
 * rejects the returned promise.
 *
 * @param {Array<unknown>} items - Inputs to map.
 * @param {number} concurrency - Desired number of workers. Values below 1 are raised
 *   to 1, and the count never exceeds `items.length`. A value that is not a number
 *   (NaN, `undefined`, a non-numeric string) falls back to a single worker.
 * @param {(item: unknown, index: number) => Promise<unknown>} mapper - Async transform.
 * @returns {Promise<unknown[]>} Mapped results in input order.
 */
async function mapWithConcurrency(items, concurrency, mapper) {
    const results = new Array(items.length);
    // A NaN worker count would make Array.from build zero workers and return an
    // unmapped sparse array, so fall back to sequential processing instead.
    const requested = Number(concurrency);
    const usableConcurrency = Number.isNaN(requested) ? 1 : requested;
    const workerCount = Math.max(1, Math.min(usableConcurrency, items.length || 1));
    let nextIndex = 0;
    async function worker() {
        while (nextIndex < items.length) {
            // Claim the index synchronously, before awaiting, so no two workers share it.
            const currentIndex = nextIndex;
            nextIndex += 1;
            results[currentIndex] = await mapper(items[currentIndex], currentIndex);
        }
    }
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}
185
/**
 * Awaits `task`, rejecting with `new Error(message)` if it has not settled within
 * `timeoutMs`. The timer is always cleared once the race settles.
 *
 * Note: the underlying task is not cancelled when the timeout wins.
 *
 * @param {Promise<unknown>} task - Work to await.
 * @param {number} timeoutMs - Deadline in milliseconds.
 * @param {string} message - Error message used on timeout.
 * @returns {Promise<unknown>} The task's result.
 */
async function withPageTimeout(task, timeoutMs, message) {
    let timer = null;
    const deadline = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(new Error(message));
        }, timeoutMs);
    });
    try {
        return await Promise.race([task, deadline]);
    }
    finally {
        if (timer) {
            clearTimeout(timer);
        }
    }
}
201
/**
 * Picks the timeout for one page-processing phase.
 *
 * Visually heavy pages (raster-image sources, or pages classified `image-only` /
 * `text-unreadable`) always get 180000 ms. Other pages get `config.timeout`
 * clamped to 60000..120000 ms, defaulting to 120000 ms when it is missing or is
 * not a number.
 *
 * @param {{ timeout?: number }} config - LLM configuration.
 * @param {{ sourceKind?: string, classification?: string }} input - Page descriptor.
 * @returns {number} Timeout in milliseconds.
 */
function resolvePagePhaseTimeoutMs(config, input) {
    const defaultTimeoutMs = 120000;
    const minTimeoutMs = 60000;
    const maxTimeoutMs = 120000;
    const heavyPageTimeoutMs = 180000;
    // A NaN timeout would flow through Math.min/Math.max into the timer and make
    // every page time out almost immediately, so treat it as "not configured".
    const configured = Number(config.timeout ?? defaultTimeoutMs);
    const requested = Number.isNaN(configured) ? defaultTimeoutMs : configured;
    const isHeavyVisualPage = input.sourceKind === 'raster-image'
        || input.classification === 'image-only'
        || input.classification === 'text-unreadable';
    if (isHeavyVisualPage) {
        return heavyPageTimeoutMs;
    }
    return Math.min(Math.max(requested, minTimeoutMs), maxTimeoutMs);
}
210
/**
 * Scales the text-extraction timeout with the size of the text hint.
 *
 * Hints of 1800+ characters get at least 150000 ms, capped at 180000 ms; hints of
 * 1000+ characters get at least 120000 ms; anything shorter, or no hint at all,
 * keeps the base timeout.
 *
 * @param {number} baseTimeoutMs - Timeout used for short or missing hints.
 * @param {string|null|undefined} textHint - Text passed along to extraction.
 * @returns {number} Timeout in milliseconds.
 */
function resolveTextExtractionTimeoutMs(baseTimeoutMs, textHint) {
    const hintLength = textHint ? textHint.length : 0;
    if (hintLength >= 1800) {
        const raised = Math.max(baseTimeoutMs, 150000);
        return Math.min(raised, 180000);
    }
    return hintLength >= 1000 ? Math.max(baseTimeoutMs, 120000) : baseTimeoutMs;
}
222
/**
 * Canonicalizes a heading so that equal headings compare equal.
 *
 * The text is lower-cased, every character other than word characters,
 * whitespace and `: / ( ) -` is removed, whitespace runs are collapsed to a
 * single space, and the result is trimmed.
 *
 * @param {string} value - Raw heading text.
 * @returns {string} Normalized heading.
 */
function normalizeHeadingText(value) {
    return value
        .toLowerCase()
        // Strip symbols before collapsing whitespace: removing a symbol that sat
        // between two spaces would otherwise leave a double space behind.
        .replace(/[^\w\s:/()-]+/g, '')
        .replace(/\s+/g, ' ')
        .trim();
}
229
/**
 * Joins a heading trail into one comparable key.
 *
 * Each heading is normalized; headings that normalize to an empty string are
 * skipped and the rest are joined with `' > '`.
 *
 * @param {string[]} headings - Heading trail.
 * @returns {string} Key, or an empty string when no heading survives.
 */
function buildHeadingKey(headings) {
    const normalizedParts = headings.reduce((parts, heading) => {
        const normalized = normalizeHeadingText(heading);
        if (normalized) {
            parts.push(normalized);
        }
        return parts;
    }, []);
    return normalizedParts.join(' > ');
}
235
/**
 * Renders one extraction result as newline-separated text.
 *
 * Lines appear in this order: title, summary, material descriptions,
 * classifications ("system value [context]"), parameters
 * ("name valueText [unit] [context]"), risks, recommendations. Entries that are
 * not strings or are blank are skipped.
 *
 * @param {object} result - Per-page extraction result.
 * @returns {string} Combined, trimmed text.
 */
function collectResultTextSignals(result) {
    const describeClassification = ({ system, value, context }) => {
        const contextSuffix = context ? ` ${context}` : '';
        return `${system} ${value}${contextSuffix}`.trim();
    };
    const describeParameter = ({ name, valueText, unit, context }) => {
        const unitSuffix = unit ? ` ${unit}` : '';
        const contextSuffix = context ? ` ${context}` : '';
        return `${name} ${valueText}${unitSuffix}${contextSuffix}`.trim();
    };
    const candidates = [
        result.title,
        result.summary,
        ...result.materials.map((material) => material.description),
        ...result.classifications.map((classification) => describeClassification(classification)),
        ...result.parameters.map((parameter) => describeParameter(parameter)),
        ...result.risks,
        ...result.recommendations,
    ];
    const lines = candidates.filter((entry) => typeof entry === 'string' && entry.trim().length > 0);
    return lines.join('\n').trim();
}
249
/**
 * Derives a chunk scope from a page's normalized artifact.
 *
 * Precedence: detected tables → `'table'`, detected figures → `'figure'`, at
 * least one heading hint → `'section'`, otherwise `'page'`. A missing page,
 * artifact or `headingHints` list yields `'page'`.
 *
 * @param {object|undefined} inspectionPage - Inspection record for the page.
 * @returns {'table'|'figure'|'section'|'page'} Chunk scope.
 */
function inferChunkScope(inspectionPage) {
    const artifact = inspectionPage?.normalizedArtifact;
    if (artifact?.tablesDetected) {
        return 'table';
    }
    if (artifact?.figuresDetected) {
        return 'figure';
    }
    // headingHints is treated as optional elsewhere in this file (`?? []`), so an
    // artifact without it must not throw here either.
    const headingCount = artifact?.headingHints?.length ?? 0;
    return headingCount > 0 ? 'section' : 'page';
}
258
/**
 * Classifies a page result into a section type.
 *
 * Results whose document class is already `administrative-document` or
 * `visual-appendix-document` keep that label. Otherwise keyword cues in the page
 * text and the presence of extracted data are checked in a fixed order:
 * administrative, summary, recommendation, groundwater, classification,
 * ground-model, laboratory, visual-appendix, and finally `'general'`.
 * The administrative and visual-appendix cues only count when neither the
 * extraction nor the page itself shows engineering signals.
 *
 * @param {{ result: object, inspectionPage?: object, scope?: string }} input - Page context.
 * @returns {string} Section type.
 */
function inferSectionType(input) {
    const { result, inspectionPage } = input;
    switch (result.documentClass) {
        case 'administrative-document':
            return 'administrative';
        case 'visual-appendix-document':
            return 'visual-appendix';
        default:
            break;
    }
    const administrativeCue = /\b(cover|title page|document control|distribution|revision|issue register|transmittal|copyright|prepared for|prepared by|checked by|approved by|table of contents|contents)\b/;
    const summaryCue = /\b(executive summary|summary|conclusion|overview|scope of works?)\b/;
    const recommendationCue = /\b(recommend|mitigation|next step|monitor|confirm|verify|further investigation)\b/;
    const groundwaterCue = /\b(groundwater|water table|seepage|piezometric|aquifer|dewatering)\b/;
    const classificationCue = /\b(uscs|rmr|rqd|q-system|classification)\b/;
    const groundModelCue = /\b(geology|geological|lithology|lithological|strata|stratigraphy|formation|borehole|ground conditions?|subsurface|rock mass|weathered|fill|standard penetration test|spt|field methods?)\b/;
    const laboratoryCue = /\b(laboratory|lab report|triaxial|atterberg|plasticity|permeability|moisture content|ucs|uniaxial|direct shear|ccil|chain of custody)\b/;
    const visualAppendixCue = /\b(figure|fig\.|plate|photo|sketch|section view|appendix|appendices|photo log|drawing)\b/;
    const artifact = inspectionPage?.normalizedArtifact;
    // Short lead-in of the page (accepted text only), used for the cover and appendix cues.
    const leadingSignalText = buildPageLeadText(inspectionPage, {
        bodyLineLimit: 8,
        bodyCharacterLimit: 420,
        acceptedOnly: true,
    }).toLowerCase();
    // Everything known about the page, used for the topical cues.
    const signalParts = [
        result.title,
        result.summary,
        ...(artifact?.headingHints ?? []),
        artifact?.nativeText ?? '',
    ];
    const signalText = signalParts
        .filter((part) => typeof part === 'string' && part.trim().length > 0)
        .join('\n')
        .toLowerCase();
    const extractedEngineeringSignals = result.materials.length > 0
        || result.classifications.length > 0
        || result.parameters.length > 0;
    const pageEngineeringSignals = hasEngineeringPageSignals(inspectionPage);
    const lacksEngineeringSignals = !(extractedEngineeringSignals || pageEngineeringSignals);
    if (lacksEngineeringSignals && administrativeCue.test(leadingSignalText)) {
        return 'administrative';
    }
    if (summaryCue.test(signalText)) {
        return 'summary';
    }
    if (result.recommendations.length > 0 || recommendationCue.test(signalText)) {
        return 'recommendation';
    }
    if (groundwaterCue.test(signalText)) {
        return 'groundwater';
    }
    if (result.classifications.length > 0 || classificationCue.test(signalText)) {
        return 'classification';
    }
    if (result.materials.length > 0 || groundModelCue.test(signalText)) {
        return 'ground-model';
    }
    if (result.parameters.length > 0 || laboratoryCue.test(signalText)) {
        return 'laboratory';
    }
    if (lacksEngineeringSignals
        && (input.scope === 'figure' || visualAppendixCue.test(leadingSignalText))) {
        return 'visual-appendix';
    }
    return 'general';
}
316
/**
 * Scores how much a chunk matters for the document summary, on a 0..100 scale.
 *
 * The score adds capped contributions for extracted materials, classifications,
 * parameters, risks and recommendations; bonuses for having a summary and for
 * the parse status; an adjustment per section type; and an adjustment per scope.
 *
 * @param {{ result: object, sectionType: string, scope: string }} input - Chunk context.
 * @returns {number} Significance clamped to 0..100.
 */
function scoreChunkSignificance(input) {
    const { result, sectionType, scope } = input;
    const sectionAdjustments = new Map([
        ['ground-model', 12],
        ['laboratory', 14],
        ['classification', 10],
        ['groundwater', 9],
        ['recommendation', 7],
        ['summary', 5],
        ['visual-appendix', -6],
        ['administrative', -18],
    ]);
    const capped = (limit, count, pointsEach) => Math.min(limit, count * pointsEach);
    let total = capped(16, result.materials.length, 3)
        + capped(15, result.classifications.length, 4)
        + capped(24, result.parameters.length, 5)
        + capped(8, result.risks.length, 2)
        + capped(8, result.recommendations.length, 2);
    if (result.summary) {
        total += 6;
    }
    if (result.parseStatus === 'parsed') {
        total += 8;
    }
    else if (result.parseStatus === 'partial') {
        total += 3;
    }
    // Section types not listed in the table leave the score unchanged.
    total += sectionAdjustments.get(sectionType) ?? 0;
    if (scope === 'table') {
        total += 5;
    }
    else if (scope === 'figure') {
        total -= 4;
    }
    return Math.max(0, Math.min(100, total));
}
368
/**
 * Turns per-page extraction results into single-page content chunks.
 *
 * Results without a page number are skipped, and so are pages whose collected
 * text is blank. Each chunk carries the page's scope, section type, significance
 * score and heading hints.
 *
 * @param {object[]} results - Per-page extraction results.
 * @param {object|null|undefined} inspection - Document inspection; `pages` is indexed by page number minus one.
 * @returns {object[]} One chunk per usable page, in input order.
 */
function buildPreparedPageChunks(results, inspection) {
    const toChunk = (result) => {
        const pageNumber = result.pageNumber ?? 0;
        const inspectionPage = inspection?.pages[pageNumber - 1];
        const text = collectResultTextSignals(result);
        const scope = inferChunkScope(inspectionPage);
        const sectionType = inferSectionType({ result, inspectionPage, scope });
        const significance = scoreChunkSignificance({ result, scope, sectionType });
        const headingAncestry = inspectionPage?.normalizedArtifact?.headingHints ?? [];
        return {
            chunkId: `page-${pageNumber}`,
            pageRange: [pageNumber, pageNumber],
            headingAncestry,
            scope,
            sectionType,
            significance,
            text,
            sourcePages: [pageNumber],
            documentClass: result.documentClass,
            title: result.title,
            summary: result.summary,
        };
    };
    const pagedResults = results.filter(({ pageNumber }) => pageNumber != null);
    const pageChunks = pagedResults.map((result) => toChunk(result));
    return pageChunks.filter(({ text }) => text.trim().length > 0);
}
394
/**
 * Decides whether `current` can be appended to the `previous` chunk.
 *
 * Merging requires directly consecutive pages and the same section type, and is
 * refused for administrative or visual-appendix sections and whenever either
 * chunk is a figure. When both chunks have a heading key the keys must match;
 * otherwise the scopes must match.
 *
 * @param {object} previous - Chunk accumulated so far.
 * @param {object} current - Candidate chunk.
 * @returns {boolean} `true` when the two chunks belong together.
 */
function shouldMergePreparedChunks(previous, current) {
    const isNextPage = current.pageRange[0] === previous.pageRange[1] + 1;
    if (!isNextPage || previous.sectionType !== current.sectionType) {
        return false;
    }
    const isStandaloneSection = previous.sectionType === 'administrative'
        || previous.sectionType === 'visual-appendix';
    const involvesFigure = previous.scope === 'figure' || current.scope === 'figure';
    if (isStandaloneSection || involvesFigure) {
        return false;
    }
    const previousHeadingKey = buildHeadingKey(previous.headingAncestry);
    const currentHeadingKey = buildHeadingKey(current.headingAncestry);
    const bothHaveHeadings = Boolean(previousHeadingKey && currentHeadingKey);
    return bothHaveHeadings
        ? previousHeadingKey === currentHeadingKey
        : previous.scope === current.scope;
}
414
/**
 * Folds consecutive page chunks that belong together into multi-page chunks.
 *
 * A chunk either starts a new entry (a shallow copy with its own `sourcePages`
 * array) or extends the last entry: ids are joined with `+`, the page range and
 * source pages grow, the longer heading trail is kept, texts are concatenated,
 * significance is summed (capped at 100), missing summary/title/documentClass
 * are filled in, and a `'page'` scope is upgraded to `'section'` or `'table'`.
 *
 * @param {object[]} chunks - Single-page chunks in page order.
 * @returns {object[]} Merged chunks; the input chunk objects are not mutated.
 */
function mergePreparedChunks(chunks) {
    const merged = [];
    for (const chunk of chunks) {
        const tail = merged[merged.length - 1];
        const extendsTail = Boolean(tail) && shouldMergePreparedChunks(tail, chunk);
        if (!extendsTail) {
            merged.push({ ...chunk, sourcePages: [...chunk.sourcePages] });
            continue;
        }
        tail.chunkId = `${tail.chunkId}+${chunk.chunkId}`;
        tail.pageRange = [tail.pageRange[0], chunk.pageRange[1]];
        const combinedPages = new Set(tail.sourcePages);
        for (const page of chunk.sourcePages) {
            combinedPages.add(page);
        }
        tail.sourcePages = [...combinedPages];
        // Ties keep the trail already on the merged chunk.
        if (chunk.headingAncestry.length > tail.headingAncestry.length) {
            tail.headingAncestry = chunk.headingAncestry;
        }
        tail.text = `${tail.text}\n\n${chunk.text}`.trim();
        tail.significance = Math.min(100, tail.significance + chunk.significance);
        for (const field of ['summary', 'title', 'documentClass']) {
            if (!tail[field] && chunk[field]) {
                tail[field] = chunk[field];
            }
        }
        const isUpgrade = chunk.scope === 'section' || chunk.scope === 'table';
        if (tail.scope === 'page' && isUpgrade) {
            tail.scope = chunk.scope;
        }
    }
    return merged;
}
451
/**
 * Builds the document's content chunks: one chunk per usable page, then merges
 * consecutive pages that belong together.
 *
 * @param {object[]} results - Per-page extraction results.
 * @param {object|null|undefined} inspection - Document inspection.
 * @returns {object[]} Merged content chunks.
 */
function buildContentChunks(results, inspection) {
    const perPageChunks = buildPreparedPageChunks(results, inspection);
    return mergePreparedChunks(perPageChunks);
}
454
/**
 * Tells whether a title is too generic to name the document.
 *
 * Missing or blank titles are generic, as is any title containing a boilerplate
 * phrase such as "cover page", "table of contents" or "appendix"
 * (case-insensitive).
 *
 * @param {string|null|undefined} value - Candidate title.
 * @returns {boolean} `true` when the title should not be used.
 */
function isGenericDocumentTitle(value) {
    if (value == null || !value.trim()) {
        return true;
    }
    const boilerplatePhrase = /\b(cover sheet|cover page|table of contents|contents|appendix|drawing register|revision history|project information)\b/i;
    return boilerplatePhrase.test(value);
}
460
/**
 * Picks the document class for the whole document from its chunks.
 *
 * Chunks in administrative or visual-appendix sections, and chunks whose class
 * is missing or `unknown`, do not vote. Every other chunk votes for its class
 * with a weight of at least 1 (its significance), minus 3 per vote for the
 * catch-all `geotechnical-document` class. The top-ranked class wins unless the
 * lead over the runner-up is 12 or less, in which case the class of the first
 * meaningful chunk is preferred.
 *
 * @param {object[]} chunks - Content chunks in document order.
 * @returns {string|null} Chosen class, or `null` when no chunk offers one.
 */
function chooseDocumentClass(chunks) {
    const isLowValueSection = ({ sectionType }) => sectionType === 'administrative'
        || sectionType === 'visual-appendix';
    const firstMeaningfulChunk = chunks.find((chunk) => {
        if (isLowValueSection(chunk)) {
            return false;
        }
        const { documentClass } = chunk;
        if (typeof documentClass !== 'string' || documentClass.trim().length === 0) {
            return false;
        }
        return documentClass.toLowerCase() !== 'unknown';
    });
    const firstMeaningfulClass = firstMeaningfulChunk?.documentClass?.trim() ?? null;
    const scores = new Map();
    for (const chunk of chunks) {
        const documentClass = chunk.documentClass?.trim();
        if (!documentClass || documentClass.toLowerCase() === 'unknown') {
            continue;
        }
        if (isLowValueSection(chunk)) {
            continue;
        }
        const weight = Math.max(1, chunk.significance);
        const penalty = documentClass === 'geotechnical-document' ? 3 : 0;
        scores.set(documentClass, (scores.get(documentClass) ?? 0) + weight - penalty);
    }
    const ranked = Array.from(scores).sort(([, leftScore], [, rightScore]) => rightScore - leftScore);
    if (ranked.length === 0) {
        return firstMeaningfulClass;
    }
    const [leader, runnerUp] = ranked;
    const isCloseRace = runnerUp !== undefined && leader[1] - runnerUp[1] <= 12;
    if (firstMeaningfulClass && isCloseRace) {
        return firstMeaningfulClass;
    }
    return leader[0];
}
493
/**
 * Picks the document title: the title of the most significant chunk that has a
 * non-blank, non-generic title.
 *
 * @param {object[]} chunks - Content chunks; the input array is not reordered.
 * @returns {string|null} Chosen title, or `null` when no chunk qualifies.
 */
function chooseDocumentTitle(chunks) {
    const bySignificance = Array.from(chunks);
    bySignificance.sort((left, right) => right.significance - left.significance);
    for (const { title } of bySignificance) {
        const isUsable = typeof title === 'string'
            && title.trim().length > 0
            && !isGenericDocumentTitle(title);
        if (isUsable) {
            return title;
        }
    }
    return null;
}
500
/**
 * Builds the document summary from the three most significant distinct chunk
 * summaries, joined with single spaces. Administrative chunks are ignored.
 *
 * @param {object[]} chunks - Content chunks; the input array is not reordered.
 * @returns {string|null} Combined summary, or `null` when there is none.
 */
function chooseDocumentSummary(chunks) {
    const candidates = Array.from(chunks).filter((chunk) => chunk.sectionType !== 'administrative');
    candidates.sort((left, right) => right.significance - left.significance);
    const usableSummaries = [];
    for (const { summary } of candidates) {
        if (typeof summary === 'string' && summary.trim().length > 0) {
            usableSummaries.push(summary);
        }
    }
    const topSummaries = uniqueStrings(usableSummaries).slice(0, 3);
    return topSummaries.length === 0 ? null : topSummaries.join(' ');
}
509
/**
 * Produces a short lead-in for a page: its heading hints followed by the first
 * few non-blank lines of body text.
 *
 * @param {object|null|undefined} inspectionPage - Inspection record for the page.
 * @param {object} [options]
 * @param {number} [options.bodyLineLimit=8] - Maximum number of body lines.
 * @param {number} [options.bodyCharacterLimit=480] - Maximum body length in characters.
 * @param {boolean} [options.acceptedOnly=false] - When `true`, body text is used
 *   only if the artifact's text quality is marked as accepted.
 * @returns {string} Newline-joined lead text; empty when there is no page.
 */
function buildPageLeadText(inspectionPage, options) {
    if (!inspectionPage) {
        return '';
    }
    const bodyLineLimit = options?.bodyLineLimit ?? 8;
    const bodyCharacterLimit = options?.bodyCharacterLimit ?? 480;
    const acceptedOnly = options?.acceptedOnly ?? false;
    const artifact = inspectionPage.normalizedArtifact;
    const accepted = artifact?.textQuality.accepted ?? false;
    let bodySource = '';
    if (!acceptedOnly || accepted) {
        bodySource = artifact?.nativeText ?? inspectionPage.normalizedText ?? '';
    }
    const nonBlankLines = bodySource
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter((line) => line.length > 0);
    const bodyPreview = nonBlankLines
        .slice(0, bodyLineLimit)
        .join('\n')
        .slice(0, bodyCharacterLimit);
    const headingHints = artifact?.headingHints ?? [];
    const leadParts = uniqueStrings([...headingHints, bodyPreview]);
    return leadParts.join('\n').trim();
}
536
/**
 * Checks whether a page shows signs of engineering content.
 *
 * A page qualifies when tables were detected on it, or when its lead text (up to
 * 18 lines / 1200 characters, regardless of text quality) mentions a geotechnical
 * cue such as a borehole id, SPT, laboratory tests or groundwater.
 *
 * @param {object|null|undefined} inspectionPage - Inspection record for the page.
 * @returns {boolean} `true` when engineering cues are present.
 */
function hasEngineeringPageSignals(inspectionPage) {
    if (!inspectionPage) {
        return false;
    }
    if (inspectionPage.normalizedArtifact?.tablesDetected) {
        return true;
    }
    const engineeringCue = /\b(bh\d{1,5}[a-z0-9-]*|borehole|northing|easting|elevation|masl|m asl|standard penetration test|spt|astm|soil sample|laboratory|triaxial|consolidation|plasticity|liquid limit|grain size|shear|friction angle|cohesion|unit weight|moisture content|permeability|groundwater|rqd|rmr|q-system|ucs)\b/;
    const leadOptions = {
        bodyLineLimit: 18,
        bodyCharacterLimit: 1200,
        acceptedOnly: false,
    };
    const cueText = buildPageLeadText(inspectionPage, leadOptions).toLowerCase();
    return engineeringCue.test(cueText);
}
550
/**
 * Labels a page as administrative or visual-appendix based on its lead text.
 *
 * Pages that show engineering signals never get a role. Otherwise cover,
 * contents or document-control wording yields `'administrative'`, and detected
 * figures or figure/appendix wording yields `'visual-appendix'`.
 *
 * @param {object|null|undefined} inspectionPage - Inspection record for the page.
 * @returns {'administrative'|'visual-appendix'|null} Role, or `null` when none applies.
 */
function inferNonCriticalPageRole(inspectionPage) {
    if (!inspectionPage) {
        return null;
    }
    const administrativeCue = /\b(cover|title page|table of contents|contents|document control|revision|distribution|transmittal|copyright)\b/;
    const visualCue = /\b(appendix|figure|fig\.|plate|photo|sketch|drawing)\b/;
    const engineeringSignals = hasEngineeringPageSignals(inspectionPage);
    const headingText = buildPageLeadText(inspectionPage, {
        bodyLineLimit: 6,
        bodyCharacterLimit: 360,
        acceptedOnly: true,
    }).toLowerCase();
    if (engineeringSignals) {
        return null;
    }
    if (administrativeCue.test(headingText)) {
        return 'administrative';
    }
    const looksVisual = inspectionPage.normalizedArtifact?.figuresDetected
        || visualCue.test(headingText);
    return looksVisual ? 'visual-appendix' : null;
}
571
/**
 * Detects an appendix divider page that announces visual content.
 *
 * The page's lead text (accepted text only) must mention "appendix" and a visual
 * keyword such as figures, plates, photos, drawings or plans. Every other page —
 * including dividers for borehole records, test pits or CPT data — yields `null`.
 *
 * @param {object|null|undefined} inspectionPage - Inspection record for the page.
 * @returns {'visual-appendix'|null} Divider role, or `null`.
 */
function inferAppendixDividerRole(inspectionPage) {
    if (!inspectionPage) {
        return null;
    }
    const leadText = buildPageLeadText(inspectionPage, {
        bodyLineLimit: 6,
        bodyCharacterLimit: 320,
        acceptedOnly: true,
    }).toLowerCase();
    if (!leadText || !/\bappendix\b/.test(leadText)) {
        return null;
    }
    // The former explicit check for borehole/test-pit/CPT dividers returned null,
    // exactly like the fall-through below, so it was removed as dead code.
    const announcesVisualContent = /\b(figures?|plates?|photos?|photographs?|drawings?|plans?|sketches?|site layout)\b/.test(leadText);
    return announcesVisualContent ? 'visual-appendix' : null;
}
591
/**
 * Decides, before any extraction runs, whether a page is low-yield.
 *
 * A role inferred from the page's own lead text wins. Otherwise these
 * heuristics apply, in order:
 * - an `empty` page is administrative on page 1 and visual-appendix elsewhere;
 * - an image-only first page with 1..80 low-quality native words and no detected
 *   tables or figures is administrative;
 * - an image-only or graphics-only page with at most 120 low-quality native
 *   words, no engineering signals, and a visual appendix divider right before it
 *   is visual-appendix;
 * - a graphics-only page is visual-appendix when it is the last page, or when it
 *   comes from a raster image and has at most 80 words.
 *
 * @param {object} input - Reads `inspectionPage`, `previousInspectionPage`,
 *   `pageNumber`, `totalPages` and `sourceKind`.
 * @returns {'administrative'|'visual-appendix'|null} Role, or `null` when the page
 *   should be processed normally.
 */
export function inferPreflightLowYieldPageRole(input) {
    const { inspectionPage } = input;
    const explicitRole = inferNonCriticalPageRole(inspectionPage);
    if (explicitRole) {
        return explicitRole;
    }
    if (!inspectionPage) {
        return null;
    }
    const { classification } = inspectionPage;
    const isFirstPage = input.pageNumber === 1;
    if (classification === 'empty') {
        return isFirstPage ? 'administrative' : 'visual-appendix';
    }
    const artifact = inspectionPage.normalizedArtifact;
    const wordCount = inspectionPage.metadata?.wordCount ?? 0;
    const hasLowQualityNativeText = artifact?.textSource === 'native-text-low-quality';
    const previousDividerRole = inferAppendixDividerRole(input.previousInspectionPage);
    const isImageOnly = classification === 'image-only';
    const isGraphicsOnly = classification === 'graphics-only';
    const looksLikeScannedCover = isFirstPage
        && isImageOnly
        && hasLowQualityNativeText
        && wordCount > 0
        && wordCount <= 80
        && !artifact?.tablesDetected
        && !artifact?.figuresDetected;
    if (looksLikeScannedCover) {
        return 'administrative';
    }
    const followsVisualDivider = previousDividerRole === 'visual-appendix'
        && (isImageOnly || isGraphicsOnly)
        && hasLowQualityNativeText
        && wordCount <= 120
        && !hasEngineeringPageSignals(inspectionPage);
    if (followsVisualDivider) {
        return 'visual-appendix';
    }
    if (isGraphicsOnly) {
        const isLastPage = input.pageNumber === input.totalPages;
        const isSparseRaster = input.sourceKind === 'raster-image' && wordCount <= 80;
        if (isLastPage || isSparseRaster) {
            return 'visual-appendix';
        }
    }
    return null;
}
633
/**
 * Builds the placeholder extraction result for a page that was classified as
 * low-yield and therefore skipped.
 *
 * The result carries no materials, classifications, parameters, risks or
 * recommendations. It is marked `partial`, cannot auto-proceed, and uses the
 * page's lead text (up to 200 characters) as its title when available.
 *
 * @param {object} input - Reads `role`, `inspectionPage`, `pageNumber` and `totalPages`.
 * @returns {object} Synthetic per-page result.
 */
export function buildPreflightLowYieldInsight(input) {
    const { pageNumber, totalPages } = input;
    const leadText = buildPageLeadText(input.inspectionPage, {
        bodyLineLimit: 6,
        bodyCharacterLimit: 200,
        acceptedOnly: true,
    });
    const headingSummary = leadText.replace(/\s+/g, ' ').trim().slice(0, 200);
    let documentClass;
    let fallbackTitle;
    let summary;
    let warning;
    let confidence;
    if (input.role === 'administrative') {
        documentClass = 'administrative-document';
        fallbackTitle = 'Administrative / cover page';
        summary = `Page ${pageNumber} of ${totalPages} appears to be an administrative or cover-style page. No reliable engineering data was extracted, so this page was summarized without an expensive vision pass.`;
        warning = 'Low-yield administrative/cover page was short-circuited before multimodal extraction.';
        confidence = 48;
    }
    else {
        documentClass = 'visual-appendix-document';
        fallbackTitle = 'Figure / appendix page';
        summary = `Page ${pageNumber} of ${totalPages} appears to be a graphics or appendix page. It was summarized as low-yield visual content without a full multimodal extraction pass.`;
        warning = 'Low-yield figure/appendix page was short-circuited before multimodal extraction.';
        confidence = 55;
    }
    return {
        documentClass,
        title: headingSummary || fallbackTitle,
        summary,
        materials: [],
        classifications: [],
        parameters: [],
        risks: [],
        recommendations: [],
        pageNumber,
        totalPages,
        rawLLMText: '',
        latencyMs: 0,
        parseStatus: 'partial',
        confidence,
        warnings: [warning],
        canAutoProceed: false,
    };
}
672
/**
 * Turns per-page outcomes and page failures into a list of review findings,
 * then appends at most one document-level finding about missing content.
 *
 * Pages that `inferPreflightLowYieldPageRole` identifies as administrative or
 * visual-appendix get role-specific codes and messages. Their failures are
 * reported as `advisory` only when the document produced some engineering
 * content; partial results on such pages are always `advisory`.
 *
 * @param {Array<object>} results - Page results (read: `pageNumber`, `totalPages`, `parseStatus`, `confidence`).
 * @param {Array<string>} pageFailures - Failure strings; a leading `Page N:` is parsed for the page number.
 * @param {Array} materials - Merged materials; only `length` is read.
 * @param {Array} classifications - Merged classifications; only `length` is read.
 * @param {Array} parameters - Merged parameters; only `length` is read.
 * @param {object} [inspection] - Optional inspection; `pages` and `totalPages` are read.
 * @returns {*} Whatever `uniqueFindings` returns for the collected findings.
 */
function deriveDocumentFindings(results, pageFailures, materials, classifications, parameters, inspection) {
    const hasEngineeringContent = materials.length > 0 || classifications.length > 0 || parameters.length > 0;
    const collected = [];
    // Selects the variant for the inferred role; any other role value uses the generic one.
    const byRole = (role, administrative, visualAppendix, generic) => {
        if (role === 'administrative') {
            return administrative;
        }
        if (role === 'visual-appendix') {
            return visualAppendix;
        }
        return generic;
    };

    // Pass 1: pages that returned a result but failed, were partial, or scored below 60.
    for (const result of results) {
        const inspectionPage = result.pageNumber != null ? inspection?.pages[result.pageNumber - 1] : undefined;
        const role = inferPreflightLowYieldPageRole({
            inspectionPage,
            pageNumber: result.pageNumber,
            totalPages: result.totalPages,
        });
        const label = result.pageNumber ?? '?';
        const reportedPage = result.pageNumber ?? undefined;
        if (result.parseStatus === 'failed') {
            collected.push({
                code: byRole(role, 'administrative_page_low_yield', 'visual_appendix_low_yield', 'page_geotech_extraction_failed'),
                severity: role && hasEngineeringContent ? 'advisory' : 'blocking',
                scope: 'page',
                message: byRole(role, `Page ${label} looked administrative (cover/contents/control) and did not contribute engineering content.`, `Page ${label} looked like a figure/appendix page and did not contribute structured engineering content.`, `Page ${label} did not yield usable geotechnical content.`),
                pageNumber: reportedPage,
            });
            continue;
        }
        const isWeak = result.parseStatus === 'partial' || result.confidence < 60;
        if (!isWeak) {
            continue;
        }
        collected.push({
            code: byRole(role, 'administrative_page_partial', 'visual_appendix_partial', 'page_geotech_extraction_partial'),
            severity: role ? 'advisory' : 'review',
            scope: 'page',
            message: byRole(role, `Page ${label} appears administrative and produced only low-yield extraction output.`, `Page ${label} appears to be a figure/appendix page and produced only partial engineering yield.`, `Page ${label} produced partial or low-confidence geotechnical interpretation.`),
            pageNumber: reportedPage,
        });
    }

    // Pass 2: pages that produced no result at all, described only by a failure string.
    for (const failure of pageFailures) {
        const pageMatch = failure.match(/^Page (\d+):/);
        const parsedPage = Number(pageMatch ? pageMatch[1] : undefined);
        const hasPage = Number.isFinite(parsedPage);
        const isTimeout = /timed out/i.test(failure);
        const role = inferPreflightLowYieldPageRole({
            inspectionPage: hasPage ? inspection?.pages[parsedPage - 1] : undefined,
            pageNumber: hasPage ? parsedPage : null,
            totalPages: inspection?.totalPages ?? null,
        });
        let severity;
        if (role && hasEngineeringContent) {
            severity = 'advisory';
        }
        else if (isTimeout) {
            severity = 'review';
        }
        else {
            severity = 'blocking';
        }
        collected.push({
            code: byRole(role, 'administrative_page_failed', isTimeout ? 'visual_appendix_timeout' : 'visual_appendix_failed', isTimeout ? 'visual_page_timeout' : 'page_ingest_failed'),
            severity,
            scope: 'page',
            message: byRole(role, `${failure} This page appears administrative and was downgraded from a blocking failure.`, `${failure} This page appears to be a figure/appendix page and was downgraded from a blocking failure.`, failure),
            pageNumber: hasPage ? parsedPage : undefined,
        });
    }

    // Document level: only the first matching gap is reported.
    const nothingExtracted = materials.length === 0 && classifications.length === 0 && parameters.length === 0;
    if (nothingExtracted) {
        collected.push({
            code: 'no_geotech_content_detected',
            severity: 'blocking',
            scope: 'document',
            message: 'No usable geology, lithology, or geotechnical engineering parameters were extracted from the supplied pages.',
        });
    }
    else if (parameters.length === 0) {
        collected.push({
            code: 'parameters_not_detected',
            severity: 'review',
            scope: 'document',
            message: 'Ground descriptions were found, but no explicit engineering parameters were extracted. Manual review may still be needed.',
        });
    }
    else if (classifications.length === 0) {
        collected.push({
            code: 'classifications_not_detected',
            severity: 'advisory',
            scope: 'document',
            message: 'Engineering parameters were found, but no formal soil/rock classification was extracted.',
        });
    }
    return uniqueFindings(collected);
}
/**
 * Ingests a geotechnical document supplied either as a list of page inputs or
 * as a single image, and returns one merged `geotech-ingest-result` object.
 *
 * Page path (`options.pages` non-empty): pages are processed in page-number
 * order through `mapWithConcurrency`. A page that `inferPreflightLowYieldPageRole`
 * flags (and that has an inspection record) is summarized without any
 * extraction call. Otherwise a text hint is recovered, then the page goes to
 * text extraction when a hint exists, or to visual interpretation when not.
 * A page that throws is recorded as a failure and does not abort the run.
 *
 * Image path (`options.image`, used only when there are no page inputs): the
 * image is interpreted once as page 1 of 1; an error here propagates.
 *
 * @param {object} options
 * @param {Array<object>} [options.pages] - Page inputs (read: `pageNumber`, `totalPages`,
 *   `sourceKind`, `base64`, `mimeType`, `filePath`).
 * @param {object} [options.image] - Single image input (`base64`, `mimeType`).
 * @param {object} [options.inspection] - Optional inspection; `pages[pageNumber - 1]` is used per page.
 * @param {object} [options.config] - Config spread into the per-phase configs with a `timeout` override.
 * @param {number} [options.pageConcurrency] - Passed to `resolvePageConcurrency`.
 * @param {object} [options.source] - Spread into the result's `source` field.
 * @param {Function} [options.interpretPage] - Override for visual page interpretation.
 * @param {Function} [options.extractTextFacts] - Override for text-based extraction.
 * @param {Function} [options.transcribePageImageText] - Override handed to text-hint recovery.
 * @param {Function} [options.now] - Clock override returning a `Date`.
 * @returns {Promise<object>} The merged ingest result.
 * @throws {Error} When neither an image nor any page is supplied, or when no page produced a result.
 */
export async function ingestGeotechDocument(options) {
    const interpretPage = options.interpretPage ?? interpretGeotechDocumentPage;
    const extractTextFacts = options.extractTextFacts ?? extractGeotechDocumentFactsFromText;
    const transcribePageImageText = options.transcribePageImageText ?? transcribeDocumentImageText;
    const now = options.now ?? (() => new Date());
    if (!options.image && (!options.pages || options.pages.length === 0)) {
        throw new Error('Geotechnical document ingest requires either a single image input or one or more PDF page inputs.');
    }
    // One flag selects the processing path AND the reported page count, so an
    // empty `pages` array passed alongside `image` is counted as one page, not zero.
    const hasPageInputs = Boolean(options.pages && options.pages.length > 0);
    const pageAudits = [];
    const pageFailures = [];
    const pageResults = [];
    const documentWarnings = buildInspectionWarnings(options.inspection);
    const recoveredOcrPages = new Set();
    const pageConcurrency = resolvePageConcurrency(options.config, options.pageConcurrency);
    if (hasPageInputs) {
        // Sort a copy so the caller's array is not reordered.
        const pages = [...options.pages].sort((left, right) => left.pageNumber - right.pageNumber);
        const settledPages = await mapWithConcurrency(pages, pageConcurrency, async (page) => {
            const inspectionPage = options.inspection?.pages[page.pageNumber - 1];
            const lowYieldRole = inferPreflightLowYieldPageRole({
                inspectionPage,
                pageNumber: page.pageNumber,
                totalPages: page.totalPages,
                sourceKind: page.sourceKind,
            });
            let pageTextHint = inspectionPage?.normalizedArtifact?.nativeText
                ?? inspectionPage?.normalizedText
                ?? undefined;
            let textHintSource = pageTextHint?.trim() ? 'native-text' : 'none';
            try {
                if (lowYieldRole && inspectionPage) {
                    // Low-yield page: return a canned insight instead of calling any extractor.
                    return {
                        ok: true,
                        pageNumber: page.pageNumber,
                        inspectionPage,
                        textHintSource,
                        recoveryWarnings: [
                            lowYieldRole === 'administrative'
                                ? 'Administrative/cover page was summarized without a full multimodal extraction call.'
                                : 'Figure/appendix page was summarized without a full multimodal extraction call.',
                        ],
                        ocrRecovered: false,
                        result: buildPreflightLowYieldInsight({
                            role: lowYieldRole,
                            inspectionPage,
                            pageNumber: page.pageNumber,
                            totalPages: page.totalPages,
                        }),
                    };
                }
                const pageTimeoutMs = resolvePagePhaseTimeoutMs(options.config, {
                    classification: inspectionPage?.classification,
                    sourceKind: page.sourceKind,
                });
                const pagePhaseConfig = {
                    ...options.config,
                    timeout: pageTimeoutMs,
                };
                const recovery = await withPageTimeout(recoverDocumentTextHint({
                    existingTextHint: pageTextHint,
                    existingTextAccepted: inspectionPage?.normalizedArtifact?.textQuality.accepted ?? true,
                    imageBase64: page.base64,
                    mimeType: page.mimeType,
                    config: pagePhaseConfig,
                    pdfFilePath: page.filePath,
                    pdfPageNumber: page.pageNumber,
                    visionTranscribe: transcribePageImageText,
                }), pageTimeoutMs, `Page ${page.pageNumber}: OCR/text recovery timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
                if (recovery.textHint) {
                    pageTextHint = recovery.textHint;
                }
                textHintSource = recovery.source;
                const context = {
                    pageNumber: page.pageNumber,
                    totalPages: page.totalPages,
                    pageClassification: inspectionPage?.classification,
                    pageTextHint,
                };
                const extractionTimeoutMs = resolveTextExtractionTimeoutMs(pageTimeoutMs, pageTextHint);
                const extractionConfig = {
                    ...options.config,
                    timeout: extractionTimeoutMs,
                };
                // With a text hint use text extraction; without one fall back to visual interpretation.
                const result = pageTextHint
                    ? await withPageTimeout(extractTextFacts(pageTextHint, extractionConfig, context), extractionTimeoutMs, `Page ${page.pageNumber}: text extraction timed out after ${Math.round(extractionTimeoutMs / 1000)}s`)
                    : await withPageTimeout(interpretPage(page.base64, page.mimeType, pagePhaseConfig, context), pageTimeoutMs, `Page ${page.pageNumber}: visual page interpretation timed out after ${Math.round(pageTimeoutMs / 1000)}s`);
                return {
                    ok: true,
                    pageNumber: page.pageNumber,
                    inspectionPage,
                    textHintSource,
                    recoveryWarnings: recovery.warnings,
                    ocrRecovered: recovery.source === 'local-ocr' || recovery.source === 'vision-ocr',
                    result,
                };
            }
            catch (error) {
                // A failing page becomes a failure record rather than rejecting the whole batch.
                return {
                    ok: false,
                    pageNumber: page.pageNumber,
                    inspectionPage,
                    textHintSource,
                    error: normalizePageErrorMessage(error instanceof Error ? error.message : String(error)),
                };
            }
        });
        settledPages.sort((left, right) => left.pageNumber - right.pageNumber);
        for (const settled of settledPages) {
            if (settled.ok) {
                if (settled.ocrRecovered) {
                    recoveredOcrPages.add(settled.pageNumber);
                    documentWarnings.push(`Recovered ${settled.textHintSource === 'local-ocr' ? 'local OCR' : 'OCR-style'} text hint for page ${settled.pageNumber}.`);
                }
                else if (settled.textHintSource === 'pdfjs-text') {
                    documentWarnings.push(`Recovered high-fidelity PDF text for page ${settled.pageNumber} without a multimodal OCR call.`);
                }
                documentWarnings.push(...settled.recoveryWarnings.map((warning) => `Page ${settled.pageNumber}: ${warning}`));
                pageResults.push(settled.result);
                pageAudits.push({
                    pageNumber: settled.pageNumber,
                    classification: settled.inspectionPage?.classification ?? null,
                    textHintSource: settled.textHintSource,
                    parseStatus: settled.result.parseStatus,
                    confidence: settled.result.confidence,
                    materialCount: settled.result.materials.length,
                    classificationCount: settled.result.classifications.length,
                    parameterCount: settled.result.parameters.length,
                    warnings: uniqueStrings([
                        ...settled.recoveryWarnings,
                        ...settled.result.warnings,
                    ]),
                });
                continue;
            }
            pageFailures.push(`Page ${settled.pageNumber}: ${settled.error}`);
            pageAudits.push({
                pageNumber: settled.pageNumber,
                classification: settled.inspectionPage?.classification ?? null,
                textHintSource: settled.textHintSource,
                parseStatus: 'failed',
                confidence: 0,
                materialCount: 0,
                classificationCount: 0,
                parameterCount: 0,
                warnings: [settled.error],
            });
        }
    }
    else if (options.image) {
        const result = await interpretPage(options.image.base64, options.image.mimeType, options.config, { pageNumber: 1, totalPages: 1 });
        pageResults.push(result);
        pageAudits.push({
            pageNumber: 1,
            classification: null,
            textHintSource: 'none',
            parseStatus: result.parseStatus,
            confidence: result.confidence,
            materialCount: result.materials.length,
            classificationCount: result.classifications.length,
            parameterCount: result.parameters.length,
            warnings: result.warnings,
        });
    }
    if (pageResults.length === 0) {
        throw new Error(pageFailures.length > 0
            ? `No pages could be ingested successfully.\n${pageFailures.join('\n')}`
            : 'No pages could be ingested successfully.');
    }
    const materials = mergeMaterials(pageResults);
    const classifications = mergeClassifications(pageResults);
    const parameters = mergeParameters(pageResults);
    const risks = uniqueStrings(pageResults.flatMap((result) => result.risks));
    const recommendations = uniqueStrings(pageResults.flatMap((result) => result.recommendations));
    const summaries = uniqueStrings(pageResults.map((result) => result.summary));
    const contentChunks = buildContentChunks(pageResults, options.inspection);
    const title = chooseDocumentTitle(contentChunks);
    const documentClass = chooseDocumentClass(contentChunks);
    const partialAuditCount = pageAudits.filter((audit) => audit.parseStatus === 'partial').length;
    const failedAuditCount = pageAudits.filter((audit) => audit.parseStatus === 'failed').length;
    // Any page failure makes the document 'partial' regardless of the successful pages.
    const parseStatus = pageFailures.length > 0
        ? 'partial'
        : mergeParseStatus(pageResults.map((result) => result.parseStatus));
    // Confidence = mean over successful pages, minus 12 per failed page (max 30)
    // and 6 per partial page (max 18), capped at 68 with failures or 78 with partials.
    const baseConfidence = Math.round(pageResults.reduce((sum, result) => sum + result.confidence, 0) / pageResults.length);
    const failurePenalty = Math.min(30, failedAuditCount * 12);
    const partialPenalty = Math.min(18, partialAuditCount * 6);
    const confidenceCap = failedAuditCount > 0
        ? 68
        : partialAuditCount > 0
            ? 78
            : 100;
    const confidence = Math.max(0, Math.min(confidenceCap, Math.round(baseConfidence - failurePenalty - partialPenalty)));
    const reviewFindings = deriveDocumentFindings(pageResults, pageFailures, materials, classifications, parameters, options.inspection);
    const reviewRequired = reviewFindings.some(findingRequiresReview);
    const allPagesParsed = pageAudits.length > 0 && pageAudits.every((audit) => audit.parseStatus === 'parsed');
    const warnings = uniqueStrings([
        ...documentWarnings,
        ...pageResults.flatMap((result) => result.warnings),
    ]);
    return {
        kind: 'geotech-ingest-result',
        schemaVersion: 1,
        documentType: 'geotech-document',
        generatedAt: now().toISOString(),
        source: {
            ...options.source,
            // `pages?.length ?? 1` would report 0 for `pages: []` on the image path.
            totalPages: hasPageInputs ? options.pages.length : 1,
            successfulPages: pageResults.length,
            failedPages: pageFailures.length,
        },
        inspection: options.inspection ?? null,
        inspectionSummary: summarizeInspection(options.inspection, recoveredOcrPages.size),
        documentClass,
        title,
        summary: chooseDocumentSummary(contentChunks) ?? (summaries.length > 0 ? summaries.join(' ') : null),
        materials,
        classifications,
        parameters,
        risks,
        recommendations,
        contentChunks: contentChunks.map((chunk) => ({
            chunkId: chunk.chunkId,
            pageRange: chunk.pageRange,
            headingAncestry: chunk.headingAncestry,
            scope: chunk.scope,
            sectionType: chunk.sectionType,
            significance: chunk.significance,
            text: chunk.text,
            sourcePages: chunk.sourcePages,
        })),
        pageAudits,
        pageFailures,
        warnings,
        reviewFindings,
        reviewReasons: summarizeReviewReasons(reviewFindings),
        parseStatus,
        confidence,
        reviewRequired,
        canAutoProceed: !reviewRequired && parseStatus === 'parsed' && confidence >= 70 && allPagesParsed && pageFailures.length === 0,
    };
}
+ //# sourceMappingURL=geotech-document.js.map