aiex-cli 0.0.6-beta.1 → 0.0.6-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/cli.mjs +324 -87
- package/dist/{doctor-collector-hWEvJ4lw.mjs → doctor-collector-abgpqc5T.mjs} +31 -58
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-Dbma0Oku.js +264 -0
- package/dist/web/assets/ExtractionViewer-BEYHgPw2.js +1 -0
- package/dist/web/assets/index-D0So2rJE.css +2 -0
- package/dist/web/assets/{index-Dlze68g1.js → index-D7eI2nAX.js} +38 -38
- package/dist/web/index.html +2 -2
- package/dist/{zh-CN-Qcn0DHFh.mjs → zh-CN-wEUNhuHM.mjs} +3 -9
- package/package.json +2 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +0 -1
- package/dist/web/assets/index-CvY9TGny.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { C as description, E as version, O as doctorDiagnosticsTableRows, S as seedConfig, T as package_default, _ as DEFAULT_PROMPT_CONFIG, a as parseJsonSchema, b as AIConfigSchema, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MINERU_CONFIG, h as DEFAULT_MINERU_API_CONFIG, i as JsonSchemaDefinitionSchema, k as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as PLACEHOLDER_SCHEMA, w as name, x as createConfig, y as PLACEHOLDER_TEXT } from "./doctor-collector-abgpqc5T.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -21,6 +21,8 @@ import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
|
21
21
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
22
22
|
import pRetry from "p-retry";
|
|
23
23
|
import mime from "mime";
|
|
24
|
+
import { TextDecoder, promisify } from "node:util";
|
|
25
|
+
import { fileTypeFromBuffer, fileTypeFromFile } from "file-type";
|
|
24
26
|
import { jsonrepair } from "jsonrepair";
|
|
25
27
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
28
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
@@ -31,7 +33,6 @@ import { glob, globSync } from "tinyglobby";
|
|
|
31
33
|
import { extractText, getDocumentProxy, getMeta } from "unpdf";
|
|
32
34
|
import AdmZip from "adm-zip";
|
|
33
35
|
import { execFile } from "node:child_process";
|
|
34
|
-
import { promisify } from "node:util";
|
|
35
36
|
import * as chokidar from "chokidar";
|
|
36
37
|
import { serve } from "@hono/node-server";
|
|
37
38
|
import open from "open";
|
|
@@ -12859,13 +12860,65 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
|
|
|
12859
12860
|
});
|
|
12860
12861
|
}
|
|
12861
12862
|
|
|
12863
|
+
//#endregion
|
|
12864
|
+
//#region src/core/input-file-kind.ts
|
|
12865
|
+
const UTF8_DECODER = new TextDecoder("utf-8", { fatal: true });
|
|
12866
|
+
const SVG_START_RE = /^\s*<svg[\s>]/i;
|
|
12867
|
+
const SVG_ANY_RE = /<svg[\s>]/i;
|
|
12868
|
+
function isSupportedImageMime(mime$1) {
|
|
12869
|
+
return !!mime$1 && [
|
|
12870
|
+
"image/png",
|
|
12871
|
+
"image/jpeg",
|
|
12872
|
+
"image/webp"
|
|
12873
|
+
].includes(mime$1);
|
|
12874
|
+
}
|
|
12875
|
+
function detectTextKind(buffer) {
|
|
12876
|
+
try {
|
|
12877
|
+
const text$1 = UTF8_DECODER.decode(buffer);
|
|
12878
|
+
if (SVG_START_RE.test(text$1) || SVG_ANY_RE.test(text$1.slice(0, 4096))) return {
|
|
12879
|
+
kind: "unsupported",
|
|
12880
|
+
mime: "image/svg+xml"
|
|
12881
|
+
};
|
|
12882
|
+
return {
|
|
12883
|
+
kind: "text",
|
|
12884
|
+
mime: "text/plain"
|
|
12885
|
+
};
|
|
12886
|
+
} catch {
|
|
12887
|
+
return { kind: "unsupported" };
|
|
12888
|
+
}
|
|
12889
|
+
}
|
|
12890
|
+
async function detectInputFileKind(filePath) {
|
|
12891
|
+
const detected = await fileTypeFromFile(filePath);
|
|
12892
|
+
if (detected?.mime === "application/pdf") return {
|
|
12893
|
+
kind: "pdf",
|
|
12894
|
+
mime: detected.mime
|
|
12895
|
+
};
|
|
12896
|
+
if (isSupportedImageMime(detected?.mime)) return {
|
|
12897
|
+
kind: "image",
|
|
12898
|
+
mime: detected?.mime
|
|
12899
|
+
};
|
|
12900
|
+
return detectTextKind(await fs.readFile(filePath));
|
|
12901
|
+
}
|
|
12902
|
+
async function detectInputBufferKind(buffer) {
|
|
12903
|
+
const detected = await fileTypeFromBuffer(buffer);
|
|
12904
|
+
if (detected?.mime === "application/pdf") return {
|
|
12905
|
+
kind: "pdf",
|
|
12906
|
+
mime: detected.mime
|
|
12907
|
+
};
|
|
12908
|
+
if (isSupportedImageMime(detected?.mime)) return {
|
|
12909
|
+
kind: "image",
|
|
12910
|
+
mime: detected?.mime
|
|
12911
|
+
};
|
|
12912
|
+
return detectTextKind(buffer);
|
|
12913
|
+
}
|
|
12914
|
+
|
|
12862
12915
|
//#endregion
|
|
12863
12916
|
//#region src/core/ai-extraction/file-utils.ts
|
|
12864
|
-
function detectMimeType(filePath) {
|
|
12865
|
-
return mime.getType(filePath) ?? "application/octet-stream";
|
|
12917
|
+
async function detectMimeType(filePath) {
|
|
12918
|
+
return (await detectInputFileKind(filePath)).mime ?? mime.getType(filePath) ?? "application/octet-stream";
|
|
12866
12919
|
}
|
|
12867
12920
|
async function readFilePart(filePath) {
|
|
12868
|
-
const mimeStr = detectMimeType(filePath);
|
|
12921
|
+
const mimeStr = await detectMimeType(filePath);
|
|
12869
12922
|
const buffer = await fs.readFile(filePath);
|
|
12870
12923
|
const name$1 = path.basename(filePath);
|
|
12871
12924
|
if (mimeStr.startsWith("image/")) return {
|
|
@@ -13205,14 +13258,48 @@ function validateExtractedData(schema, data) {
|
|
|
13205
13258
|
//#endregion
|
|
13206
13259
|
//#region src/core/ai-extraction/extractor.ts
|
|
13207
13260
|
const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
|
|
13261
|
+
function expectedExtractionFields(schema) {
|
|
13262
|
+
return Object.entries(schema.properties).filter(([, prop]) => !(prop.primary && prop.autoIncrement)).map(([name$1]) => name$1);
|
|
13263
|
+
}
|
|
13264
|
+
function calculateMissingFields(schema, data) {
|
|
13265
|
+
const expected = expectedExtractionFields(schema);
|
|
13266
|
+
if (expected.length === 0) return {
|
|
13267
|
+
fields: [],
|
|
13268
|
+
rate: 0
|
|
13269
|
+
};
|
|
13270
|
+
if (!data || typeof data !== "object" || Array.isArray(data)) return {
|
|
13271
|
+
fields: expected,
|
|
13272
|
+
rate: 1
|
|
13273
|
+
};
|
|
13274
|
+
const record = data;
|
|
13275
|
+
const fields = expected.filter((field) => {
|
|
13276
|
+
const value = record[field];
|
|
13277
|
+
return value === void 0 || value === null || value === "";
|
|
13278
|
+
});
|
|
13279
|
+
return {
|
|
13280
|
+
fields,
|
|
13281
|
+
rate: fields.length / expected.length
|
|
13282
|
+
};
|
|
13283
|
+
}
|
|
13208
13284
|
async function extractStructuredData(input) {
|
|
13209
13285
|
const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
|
|
13286
|
+
let apiRetryCount = 0;
|
|
13287
|
+
const onApiRetry = (info) => {
|
|
13288
|
+
apiRetryCount += 1;
|
|
13289
|
+
input.onRetry?.(info);
|
|
13290
|
+
};
|
|
13210
13291
|
if (!config.provider.apiKey) return {
|
|
13211
13292
|
success: false,
|
|
13212
|
-
error: t("errors.ai.apiKeyMissing")
|
|
13293
|
+
error: t("errors.ai.apiKeyMissing"),
|
|
13294
|
+
quality: { ai: {
|
|
13295
|
+
validationPassed: false,
|
|
13296
|
+
attempts: 0,
|
|
13297
|
+
selfCorrectionCount: 0,
|
|
13298
|
+
apiRetryCount
|
|
13299
|
+
} }
|
|
13213
13300
|
};
|
|
13214
13301
|
const useFileContent = !!file;
|
|
13215
|
-
const isImageFile = useFileContent
|
|
13302
|
+
const isImageFile = (useFileContent ? await detectMimeType(file) : "").startsWith("image/");
|
|
13216
13303
|
const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
|
|
13217
13304
|
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13218
13305
|
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
@@ -13228,7 +13315,13 @@ async function extractStructuredData(input) {
|
|
|
13228
13315
|
} catch (e) {
|
|
13229
13316
|
return {
|
|
13230
13317
|
success: false,
|
|
13231
|
-
error: e.message
|
|
13318
|
+
error: e.message,
|
|
13319
|
+
quality: { ai: {
|
|
13320
|
+
validationPassed: false,
|
|
13321
|
+
attempts: 0,
|
|
13322
|
+
selfCorrectionCount: 0,
|
|
13323
|
+
apiRetryCount
|
|
13324
|
+
} }
|
|
13232
13325
|
};
|
|
13233
13326
|
}
|
|
13234
13327
|
const useStructuredOutput = selected.capabilities.structuredOutput;
|
|
@@ -13286,7 +13379,7 @@ async function extractStructuredData(input) {
|
|
|
13286
13379
|
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13287
13380
|
};
|
|
13288
13381
|
if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
|
|
13289
|
-
result = await withRetry(() => generateText(fileOpts),
|
|
13382
|
+
result = await withRetry(() => generateText(fileOpts), onApiRetry);
|
|
13290
13383
|
} else {
|
|
13291
13384
|
const textOpts = {
|
|
13292
13385
|
model: provider.chatModel(selected.name),
|
|
@@ -13297,7 +13390,7 @@ async function extractStructuredData(input) {
|
|
|
13297
13390
|
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13298
13391
|
};
|
|
13299
13392
|
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13300
|
-
result = await withRetry(() => generateText(textOpts),
|
|
13393
|
+
result = await withRetry(() => generateText(textOpts), onApiRetry);
|
|
13301
13394
|
}
|
|
13302
13395
|
if (result.usage) {
|
|
13303
13396
|
totalPromptTokens += result.usage.inputTokens ?? 0;
|
|
@@ -13315,6 +13408,7 @@ async function extractStructuredData(input) {
|
|
|
13315
13408
|
if (!parseError && data !== void 0) {
|
|
13316
13409
|
const validation = validateExtractedData(schema, data);
|
|
13317
13410
|
if (validation.success) {
|
|
13411
|
+
const missing = calculateMissingFields(schema, data);
|
|
13318
13412
|
const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
|
|
13319
13413
|
await fs.mkdir(outputDir, { recursive: true });
|
|
13320
13414
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
@@ -13332,7 +13426,15 @@ async function extractStructuredData(input) {
|
|
|
13332
13426
|
prompt: totalPromptTokens,
|
|
13333
13427
|
completion: totalCompletionTokens,
|
|
13334
13428
|
total: totalPromptTokens + totalCompletionTokens
|
|
13335
|
-
}
|
|
13429
|
+
},
|
|
13430
|
+
quality: { ai: {
|
|
13431
|
+
validationPassed: true,
|
|
13432
|
+
attempts: attempt,
|
|
13433
|
+
selfCorrectionCount: attempt - 1,
|
|
13434
|
+
apiRetryCount,
|
|
13435
|
+
missingFields: missing.fields,
|
|
13436
|
+
missingFieldRate: missing.rate
|
|
13437
|
+
} }
|
|
13336
13438
|
};
|
|
13337
13439
|
} else validationError = validation.error;
|
|
13338
13440
|
}
|
|
@@ -13365,12 +13467,26 @@ Please output the corrected JSON object now:`;
|
|
|
13365
13467
|
}
|
|
13366
13468
|
return {
|
|
13367
13469
|
success: false,
|
|
13368
|
-
error: lastError || "Extraction failed after self-reflection retries"
|
|
13470
|
+
error: lastError || "Extraction failed after self-reflection retries",
|
|
13471
|
+
quality: { ai: {
|
|
13472
|
+
validationPassed: false,
|
|
13473
|
+
attempts: maxAttempts,
|
|
13474
|
+
selfCorrectionCount: maxAttempts - 1,
|
|
13475
|
+
apiRetryCount,
|
|
13476
|
+
validationError: lastError
|
|
13477
|
+
} }
|
|
13369
13478
|
};
|
|
13370
13479
|
} catch (error) {
|
|
13371
13480
|
return {
|
|
13372
13481
|
success: false,
|
|
13373
|
-
error: getErrorMessage(error)
|
|
13482
|
+
error: getErrorMessage(error),
|
|
13483
|
+
quality: { ai: {
|
|
13484
|
+
validationPassed: false,
|
|
13485
|
+
attempts: 0,
|
|
13486
|
+
selfCorrectionCount: 0,
|
|
13487
|
+
apiRetryCount,
|
|
13488
|
+
validationError: getErrorMessage(error)
|
|
13489
|
+
} }
|
|
13374
13490
|
};
|
|
13375
13491
|
}
|
|
13376
13492
|
}
|
|
@@ -13953,30 +14069,10 @@ const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
|
|
|
13953
14069
|
const MAX_UPLOAD_SIZE_TEXT = "30MB";
|
|
13954
14070
|
const SUPPORTED_FILE_TYPES_TEXT = "images, PDF, text, markdown, CSV, JSON, HTML, XML, YAML";
|
|
13955
14071
|
const MISSING_UPLOAD_FILE_TEXT = t("errors.file.missingUpload");
|
|
13956
|
-
const SUPPORTED_MIME_TYPES = new Set([
|
|
13957
|
-
"image/png",
|
|
13958
|
-
"image/jpeg",
|
|
13959
|
-
"image/gif",
|
|
13960
|
-
"image/webp",
|
|
13961
|
-
"image/bmp",
|
|
13962
|
-
"image/svg+xml",
|
|
13963
|
-
"application/pdf",
|
|
13964
|
-
"text/plain",
|
|
13965
|
-
"text/markdown",
|
|
13966
|
-
"text/csv",
|
|
13967
|
-
"application/json",
|
|
13968
|
-
"text/html",
|
|
13969
|
-
"text/xml",
|
|
13970
|
-
"application/x-yaml",
|
|
13971
|
-
"text/yaml"
|
|
13972
|
-
]);
|
|
13973
14072
|
const MIME_TO_EXT = {
|
|
13974
14073
|
"image/png": "png",
|
|
13975
14074
|
"image/jpeg": "jpg",
|
|
13976
|
-
"image/gif": "gif",
|
|
13977
14075
|
"image/webp": "webp",
|
|
13978
|
-
"image/bmp": "bmp",
|
|
13979
|
-
"image/svg+xml": "svg",
|
|
13980
14076
|
"application/pdf": "pdf",
|
|
13981
14077
|
"text/plain": "txt",
|
|
13982
14078
|
"text/markdown": "md",
|
|
@@ -13993,8 +14089,8 @@ function bytesToMB(bytes) {
|
|
|
13993
14089
|
function getExtensionFromMime(mimeType) {
|
|
13994
14090
|
return MIME_TO_EXT[mimeType];
|
|
13995
14091
|
}
|
|
13996
|
-
function
|
|
13997
|
-
return
|
|
14092
|
+
function getExtensionForDetectedFile(mimeType) {
|
|
14093
|
+
return mimeType ? getExtensionFromMime(mimeType) ?? "txt" : "txt";
|
|
13998
14094
|
}
|
|
13999
14095
|
function unsupportedFileTypeMessage(mimeType) {
|
|
14000
14096
|
return t("errors.file.unsupportedType", {
|
|
@@ -14011,14 +14107,16 @@ var FileValidationError = class extends Error {
|
|
|
14011
14107
|
this.name = "FileValidationError";
|
|
14012
14108
|
}
|
|
14013
14109
|
};
|
|
14014
|
-
function
|
|
14110
|
+
async function validateFileUploadContent(file, buffer) {
|
|
14015
14111
|
if (file.size === 0) throw new FileValidationError(t("errors.file.empty"));
|
|
14016
14112
|
if (file.size > MAX_UPLOAD_SIZE) throw new FileValidationError(t("errors.file.sizeExceeded", {
|
|
14017
14113
|
size: bytesToMB(file.size).toFixed(1),
|
|
14018
14114
|
limit: MAX_UPLOAD_SIZE_TEXT,
|
|
14019
14115
|
file: file.name
|
|
14020
14116
|
}));
|
|
14021
|
-
|
|
14117
|
+
const detected = await detectInputBufferKind(buffer);
|
|
14118
|
+
if (detected.kind === "unsupported") throw new FileValidationError(unsupportedFileTypeMessage(detected.mime ?? (file.type || "application/octet-stream")));
|
|
14119
|
+
return detected.mime ?? "text/plain";
|
|
14022
14120
|
}
|
|
14023
14121
|
|
|
14024
14122
|
//#endregion
|
|
@@ -14280,14 +14378,6 @@ function createPdfConverter(config) {
|
|
|
14280
14378
|
return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
|
|
14281
14379
|
}
|
|
14282
14380
|
if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
|
|
14283
|
-
if (config.converter === "markitdown") {
|
|
14284
|
-
const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
|
|
14285
|
-
return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
|
|
14286
|
-
}
|
|
14287
|
-
if (config.converter === "marker") {
|
|
14288
|
-
const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
|
|
14289
|
-
return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
|
|
14290
|
-
}
|
|
14291
14381
|
if (config.converter === "external") {
|
|
14292
14382
|
if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
|
|
14293
14383
|
return new ExternalCommandPdfConverter("external", config.external);
|
|
@@ -14309,12 +14399,32 @@ const FILE_PART_EXTENSIONS = new Set([
|
|
|
14309
14399
|
"png",
|
|
14310
14400
|
"jpg",
|
|
14311
14401
|
"jpeg",
|
|
14312
|
-
"
|
|
14313
|
-
"webp",
|
|
14314
|
-
"bmp",
|
|
14315
|
-
"svg"
|
|
14402
|
+
"webp"
|
|
14316
14403
|
]);
|
|
14317
14404
|
const PDF_EXT_RE = /\.pdf$/i;
|
|
14405
|
+
async function describeExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14406
|
+
const detected = await detectInputFileKind(filePath);
|
|
14407
|
+
if (detected.kind === "image") return {
|
|
14408
|
+
kind: "image",
|
|
14409
|
+
mime: detected.mime,
|
|
14410
|
+
handler: shouldUseImageOcrFallback(aiConfig, modelOverride) ? "image_local_ocr" : "image_vision"
|
|
14411
|
+
};
|
|
14412
|
+
if (detected.kind === "pdf") {
|
|
14413
|
+
const converter = createPdfConverter(aiConfig?.pdf);
|
|
14414
|
+
return {
|
|
14415
|
+
kind: "pdf",
|
|
14416
|
+
mime: detected.mime,
|
|
14417
|
+
handler: "pdf_converter",
|
|
14418
|
+
converter: converter.name
|
|
14419
|
+
};
|
|
14420
|
+
}
|
|
14421
|
+
if (detected.kind === "text") return {
|
|
14422
|
+
kind: "text",
|
|
14423
|
+
mime: detected.mime,
|
|
14424
|
+
handler: "text"
|
|
14425
|
+
};
|
|
14426
|
+
throw new Error(unsupportedFileTypeMessage(detected.mime ?? "application/octet-stream"));
|
|
14427
|
+
}
|
|
14318
14428
|
async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14319
14429
|
const stat = fs$1.statSync(filePath);
|
|
14320
14430
|
if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
|
|
@@ -14322,19 +14432,34 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
|
14322
14432
|
limit: MAX_UPLOAD_SIZE_TEXT,
|
|
14323
14433
|
file: filePath
|
|
14324
14434
|
}));
|
|
14325
|
-
const
|
|
14326
|
-
if (
|
|
14327
|
-
if (
|
|
14328
|
-
const result = await recognizeImageText(filePath
|
|
14435
|
+
const inputProcessing = await describeExtractFileInput(filePath, aiConfig, modelOverride);
|
|
14436
|
+
if (inputProcessing.kind === "image") {
|
|
14437
|
+
if (inputProcessing.handler === "image_local_ocr") {
|
|
14438
|
+
const result = await recognizeImageText(filePath);
|
|
14329
14439
|
consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
|
|
14330
|
-
return {
|
|
14440
|
+
return {
|
|
14441
|
+
text: result.text,
|
|
14442
|
+
inputProcessing,
|
|
14443
|
+
quality: { input: {
|
|
14444
|
+
kind: "image",
|
|
14445
|
+
textLength: result.text.length,
|
|
14446
|
+
emptyText: result.text.trim().length === 0,
|
|
14447
|
+
ocr: {
|
|
14448
|
+
confidence: result.confidence,
|
|
14449
|
+
textLength: result.text.length,
|
|
14450
|
+
platform: process.platform
|
|
14451
|
+
}
|
|
14452
|
+
} }
|
|
14453
|
+
};
|
|
14331
14454
|
}
|
|
14332
14455
|
return {
|
|
14333
14456
|
text: "",
|
|
14334
|
-
filePath
|
|
14457
|
+
filePath,
|
|
14458
|
+
inputProcessing,
|
|
14459
|
+
quality: { input: { kind: "image" } }
|
|
14335
14460
|
};
|
|
14336
14461
|
}
|
|
14337
|
-
if (
|
|
14462
|
+
if (inputProcessing.kind === "pdf") {
|
|
14338
14463
|
const buffer = await fs.readFile(filePath);
|
|
14339
14464
|
const converter = createPdfConverter(aiConfig?.pdf);
|
|
14340
14465
|
const result = await converter.convert(buffer, filePath);
|
|
@@ -14352,9 +14477,37 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
|
14352
14477
|
await fs.writeFile(fallbackMd, result.text);
|
|
14353
14478
|
consola.info(t("command.extract.file.markdownSaved", { path: fallbackMd }));
|
|
14354
14479
|
}
|
|
14355
|
-
|
|
14480
|
+
const textLength = result.text.length;
|
|
14481
|
+
return {
|
|
14482
|
+
text: result.text,
|
|
14483
|
+
inputProcessing,
|
|
14484
|
+
quality: { input: {
|
|
14485
|
+
kind: "pdf",
|
|
14486
|
+
textLength,
|
|
14487
|
+
emptyText: result.text.trim().length === 0,
|
|
14488
|
+
pdf: {
|
|
14489
|
+
pageCount: result.pageCount,
|
|
14490
|
+
textLength,
|
|
14491
|
+
emptyText: result.text.trim().length === 0,
|
|
14492
|
+
fallbackUsed: result.metadata?.fallback === "true",
|
|
14493
|
+
converter: result.metadata?.converter ?? converter.name
|
|
14494
|
+
}
|
|
14495
|
+
} }
|
|
14496
|
+
};
|
|
14356
14497
|
}
|
|
14357
|
-
|
|
14498
|
+
if (inputProcessing.kind === "text") {
|
|
14499
|
+
const text$1 = await fs.readFile(filePath, "utf-8");
|
|
14500
|
+
return {
|
|
14501
|
+
text: text$1,
|
|
14502
|
+
inputProcessing,
|
|
14503
|
+
quality: { input: {
|
|
14504
|
+
kind: "text",
|
|
14505
|
+
textLength: text$1.length,
|
|
14506
|
+
emptyText: text$1.trim().length === 0
|
|
14507
|
+
} }
|
|
14508
|
+
};
|
|
14509
|
+
}
|
|
14510
|
+
throw new Error(unsupportedFileTypeMessage(inputProcessing.mime ?? "application/octet-stream"));
|
|
14358
14511
|
}
|
|
14359
14512
|
|
|
14360
14513
|
//#endregion
|
|
@@ -14548,7 +14701,9 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14548
14701
|
}
|
|
14549
14702
|
return {
|
|
14550
14703
|
success: false,
|
|
14551
|
-
error: result.error || t("common.unknownError")
|
|
14704
|
+
error: result.error || t("common.unknownError"),
|
|
14705
|
+
quality: result.quality,
|
|
14706
|
+
failureStage: "ai_extraction"
|
|
14552
14707
|
};
|
|
14553
14708
|
}
|
|
14554
14709
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
@@ -14567,7 +14722,9 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14567
14722
|
consola.error(dbError);
|
|
14568
14723
|
return {
|
|
14569
14724
|
success: false,
|
|
14570
|
-
error: dbError
|
|
14725
|
+
error: dbError,
|
|
14726
|
+
quality: result.quality,
|
|
14727
|
+
failureStage: "db_insert"
|
|
14571
14728
|
};
|
|
14572
14729
|
}
|
|
14573
14730
|
try {
|
|
@@ -14581,14 +14738,17 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14581
14738
|
outputPath: result.outputPath,
|
|
14582
14739
|
data: result.data,
|
|
14583
14740
|
tablesInserted: insertResult.tablesInserted,
|
|
14584
|
-
tokensUsed: result.tokensUsed
|
|
14741
|
+
tokensUsed: result.tokensUsed,
|
|
14742
|
+
quality: result.quality
|
|
14585
14743
|
};
|
|
14586
14744
|
} else {
|
|
14587
14745
|
if (!options?.quiet) s2.stop(t("command.extract.file.dbInsertFail"));
|
|
14588
14746
|
consola.error(insertResult.error || t("common.unknownError"));
|
|
14589
14747
|
return {
|
|
14590
14748
|
success: false,
|
|
14591
|
-
error: insertResult.error
|
|
14749
|
+
error: insertResult.error,
|
|
14750
|
+
quality: result.quality,
|
|
14751
|
+
failureStage: "db_insert"
|
|
14592
14752
|
};
|
|
14593
14753
|
}
|
|
14594
14754
|
} finally {
|
|
@@ -14599,7 +14759,9 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14599
14759
|
consola.error(e instanceof Error ? e.message : String(e));
|
|
14600
14760
|
return {
|
|
14601
14761
|
success: false,
|
|
14602
|
-
error: String(e)
|
|
14762
|
+
error: String(e),
|
|
14763
|
+
quality: result.quality,
|
|
14764
|
+
failureStage: "db_insert"
|
|
14603
14765
|
};
|
|
14604
14766
|
}
|
|
14605
14767
|
}
|
|
@@ -14607,9 +14769,29 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14607
14769
|
success: true,
|
|
14608
14770
|
outputPath: result.outputPath,
|
|
14609
14771
|
data: result.data,
|
|
14610
|
-
tokensUsed: result.tokensUsed
|
|
14772
|
+
tokensUsed: result.tokensUsed,
|
|
14773
|
+
quality: result.quality
|
|
14611
14774
|
};
|
|
14612
14775
|
}
|
|
14776
|
+
function formatInputProcessing$1(input) {
|
|
14777
|
+
const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
|
|
14778
|
+
return `${input.mime ?? input.kind} -> ${handler}`;
|
|
14779
|
+
}
|
|
14780
|
+
function mergeQuality(inputQuality, aiQuality) {
|
|
14781
|
+
if (!inputQuality && !aiQuality) return void 0;
|
|
14782
|
+
return {
|
|
14783
|
+
input: inputQuality?.input,
|
|
14784
|
+
ai: aiQuality?.ai
|
|
14785
|
+
};
|
|
14786
|
+
}
|
|
14787
|
+
function classifyInputError(error, inputProcessing) {
|
|
14788
|
+
if (inputProcessing?.handler === "pdf_converter") return "file_conversion";
|
|
14789
|
+
if (inputProcessing?.handler === "image_local_ocr") return "ocr";
|
|
14790
|
+
const message = (error instanceof Error ? error.message : String(error)).toLowerCase();
|
|
14791
|
+
if (message.includes("ocr")) return "ocr";
|
|
14792
|
+
if (message.includes("pdf") || message.includes("converter")) return "file_conversion";
|
|
14793
|
+
return "input_detection";
|
|
14794
|
+
}
|
|
14613
14795
|
async function runAuditedExtraction(options) {
|
|
14614
14796
|
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
|
|
14615
14797
|
let fileHash;
|
|
@@ -14650,7 +14832,10 @@ async function runAuditedExtraction(options) {
|
|
|
14650
14832
|
outputName: existing.outputName,
|
|
14651
14833
|
tablesInserted: existing.tablesInserted,
|
|
14652
14834
|
notionPages: existing.notionPages,
|
|
14653
|
-
tokensUsed: existing.tokensUsed
|
|
14835
|
+
tokensUsed: existing.tokensUsed,
|
|
14836
|
+
inputProcessing: existing.inputProcessing,
|
|
14837
|
+
quality: existing.quality,
|
|
14838
|
+
failureStage: existing.failureStage
|
|
14654
14839
|
};
|
|
14655
14840
|
}
|
|
14656
14841
|
}
|
|
@@ -14669,6 +14854,8 @@ async function runAuditedExtraction(options) {
|
|
|
14669
14854
|
},
|
|
14670
14855
|
retryOf
|
|
14671
14856
|
});
|
|
14857
|
+
let inputProcessing;
|
|
14858
|
+
let inputQuality;
|
|
14672
14859
|
try {
|
|
14673
14860
|
let text$1 = "";
|
|
14674
14861
|
let filePath;
|
|
@@ -14676,6 +14863,13 @@ async function runAuditedExtraction(options) {
|
|
|
14676
14863
|
const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
|
|
14677
14864
|
text$1 = input.text;
|
|
14678
14865
|
filePath = input.filePath;
|
|
14866
|
+
inputProcessing = input.inputProcessing;
|
|
14867
|
+
inputQuality = input.quality;
|
|
14868
|
+
if (!quiet) consola.info(`Input: ${formatInputProcessing$1(inputProcessing)}`);
|
|
14869
|
+
await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
14870
|
+
inputProcessing,
|
|
14871
|
+
quality: inputQuality
|
|
14872
|
+
});
|
|
14679
14873
|
} else text$1 = source.text;
|
|
14680
14874
|
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
14681
14875
|
quiet,
|
|
@@ -14693,6 +14887,8 @@ async function runAuditedExtraction(options) {
|
|
|
14693
14887
|
outputName: r.outputPath ? path.basename(r.outputPath) : void 0,
|
|
14694
14888
|
tablesInserted: r.tablesInserted,
|
|
14695
14889
|
tokensUsed: r.tokensUsed,
|
|
14890
|
+
quality: mergeQuality(inputQuality, r.quality),
|
|
14891
|
+
failureStage: "integration",
|
|
14696
14892
|
error: error instanceof Error ? error.message : String(error)
|
|
14697
14893
|
});
|
|
14698
14894
|
if (!quiet) consola.error(t("command.extract.file.notionSyncFail", { error: error instanceof Error ? error.message : String(error) }));
|
|
@@ -14701,7 +14897,10 @@ async function runAuditedExtraction(options) {
|
|
|
14701
14897
|
success: false,
|
|
14702
14898
|
error: error instanceof Error ? error.message : String(error),
|
|
14703
14899
|
auditId: audit.id,
|
|
14704
|
-
fileHash
|
|
14900
|
+
fileHash,
|
|
14901
|
+
inputProcessing,
|
|
14902
|
+
quality: mergeQuality(inputQuality, r.quality),
|
|
14903
|
+
failureStage: "integration"
|
|
14705
14904
|
};
|
|
14706
14905
|
}
|
|
14707
14906
|
const updated = await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
@@ -14710,7 +14909,8 @@ async function runAuditedExtraction(options) {
|
|
|
14710
14909
|
outputName: r.outputPath ? path.basename(r.outputPath) : void 0,
|
|
14711
14910
|
tablesInserted: r.tablesInserted,
|
|
14712
14911
|
notionPages,
|
|
14713
|
-
tokensUsed: r.tokensUsed
|
|
14912
|
+
tokensUsed: r.tokensUsed,
|
|
14913
|
+
quality: mergeQuality(inputQuality, r.quality)
|
|
14714
14914
|
});
|
|
14715
14915
|
await triggerWebhook(aiConfig, audit.id, schemaName, "extraction.success", source, r.data, void 0, r.tokensUsed, quiet);
|
|
14716
14916
|
return {
|
|
@@ -14721,12 +14921,17 @@ async function runAuditedExtraction(options) {
|
|
|
14721
14921
|
notionPages: updated.notionPages,
|
|
14722
14922
|
tokensUsed: updated.tokensUsed,
|
|
14723
14923
|
auditId: updated.id,
|
|
14724
|
-
fileHash
|
|
14924
|
+
fileHash,
|
|
14925
|
+
inputProcessing: updated.inputProcessing,
|
|
14926
|
+
quality: updated.quality,
|
|
14927
|
+
failureStage: updated.failureStage
|
|
14725
14928
|
};
|
|
14726
14929
|
} else {
|
|
14727
14930
|
await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
14728
14931
|
status: "failed",
|
|
14729
|
-
error: r.error || "Extraction failed"
|
|
14932
|
+
error: r.error || "Extraction failed",
|
|
14933
|
+
quality: mergeQuality(inputQuality, r.quality),
|
|
14934
|
+
failureStage: r.failureStage ?? "ai_extraction"
|
|
14730
14935
|
});
|
|
14731
14936
|
if (!quiet) consola.error(t("command.extract.file.extractionFailed", { error: r.error }));
|
|
14732
14937
|
await triggerWebhook(aiConfig, audit.id, schemaName, "extraction.failed", source, void 0, r.error || "Extraction failed", void 0, quiet);
|
|
@@ -14734,13 +14939,19 @@ async function runAuditedExtraction(options) {
|
|
|
14734
14939
|
success: false,
|
|
14735
14940
|
error: r.error,
|
|
14736
14941
|
auditId: audit.id,
|
|
14737
|
-
fileHash
|
|
14942
|
+
fileHash,
|
|
14943
|
+
inputProcessing,
|
|
14944
|
+
quality: mergeQuality(inputQuality, r.quality),
|
|
14945
|
+
failureStage: r.failureStage ?? "ai_extraction"
|
|
14738
14946
|
};
|
|
14739
14947
|
}
|
|
14740
14948
|
} catch (e) {
|
|
14949
|
+
const failureStage = classifyInputError(e, inputProcessing);
|
|
14741
14950
|
await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
14742
14951
|
status: "failed",
|
|
14743
|
-
error: e instanceof Error ? e.message : String(e)
|
|
14952
|
+
error: e instanceof Error ? e.message : String(e),
|
|
14953
|
+
quality: inputQuality,
|
|
14954
|
+
failureStage
|
|
14744
14955
|
});
|
|
14745
14956
|
if (!quiet) {
|
|
14746
14957
|
const name$1 = source.type === "file" ? path.basename(source.filePath) : "text input";
|
|
@@ -14754,7 +14965,10 @@ async function runAuditedExtraction(options) {
|
|
|
14754
14965
|
success: false,
|
|
14755
14966
|
error: e instanceof Error ? e.message : String(e),
|
|
14756
14967
|
auditId: audit.id,
|
|
14757
|
-
fileHash
|
|
14968
|
+
fileHash,
|
|
14969
|
+
inputProcessing,
|
|
14970
|
+
quality: inputQuality,
|
|
14971
|
+
failureStage
|
|
14758
14972
|
};
|
|
14759
14973
|
}
|
|
14760
14974
|
}
|
|
@@ -14938,6 +15152,18 @@ function isExtractSubCommand(rawArgs) {
|
|
|
14938
15152
|
function formatSource(source) {
|
|
14939
15153
|
return source.type === "file" ? source.fileName || "file" : "unknown";
|
|
14940
15154
|
}
|
|
15155
|
+
function formatInputProcessing(input) {
|
|
15156
|
+
if (!input) return "";
|
|
15157
|
+
const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
|
|
15158
|
+
return ` [${input.mime ?? input.kind} -> ${handler}]`;
|
|
15159
|
+
}
|
|
15160
|
+
function formatQuality(quality, failureStage) {
|
|
15161
|
+
if (failureStage) return ` [failed:${failureStage}]`;
|
|
15162
|
+
if (quality?.input?.pdf) return ` [pdf:${quality.input.pdf.pageCount}p/${quality.input.pdf.textLength}chars${quality.input.pdf.fallbackUsed ? "/fallback" : ""}]`;
|
|
15163
|
+
if (quality?.input?.ocr) return ` [ocr:${Math.round(quality.input.ocr.confidence * 100)}%/${quality.input.ocr.textLength}chars]`;
|
|
15164
|
+
if (quality?.ai?.missingFieldRate !== void 0) return ` [missing:${Math.round(quality.ai.missingFieldRate * 100)}%]`;
|
|
15165
|
+
return "";
|
|
15166
|
+
}
|
|
14941
15167
|
async function loadConfiguredAI(aiexDir) {
|
|
14942
15168
|
const aiConfig = await readAIConfig(aiexDir);
|
|
14943
15169
|
if (!aiConfig) {
|
|
@@ -14980,7 +15206,7 @@ const historyCommand = defineCommand({
|
|
|
14980
15206
|
}
|
|
14981
15207
|
for (const record of records) {
|
|
14982
15208
|
const suffix = record.error ? ` — ${record.error}` : record.outputName ? ` — ${record.outputName}` : "";
|
|
14983
|
-
consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${suffix}`);
|
|
15209
|
+
consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${formatInputProcessing(record.inputProcessing)}${formatQuality(record.quality, record.failureStage)}${suffix}`);
|
|
14984
15210
|
}
|
|
14985
15211
|
}
|
|
14986
15212
|
});
|
|
@@ -15493,10 +15719,7 @@ const SUPPORTED_EXTENSIONS = new Set([
|
|
|
15493
15719
|
"png",
|
|
15494
15720
|
"jpg",
|
|
15495
15721
|
"jpeg",
|
|
15496
|
-
"gif",
|
|
15497
15722
|
"webp",
|
|
15498
|
-
"bmp",
|
|
15499
|
-
"svg",
|
|
15500
15723
|
"pdf",
|
|
15501
15724
|
"txt",
|
|
15502
15725
|
"md",
|
|
@@ -15909,7 +16132,10 @@ async function listExtractions(config) {
|
|
|
15909
16132
|
modifiedAt: stat.mtime.toISOString(),
|
|
15910
16133
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
15911
16134
|
notionPages,
|
|
15912
|
-
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
16135
|
+
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0,
|
|
16136
|
+
inputProcessing: audit?.inputProcessing,
|
|
16137
|
+
quality: audit?.quality,
|
|
16138
|
+
failureStage: audit?.failureStage
|
|
15913
16139
|
});
|
|
15914
16140
|
} catch {
|
|
15915
16141
|
continue;
|
|
@@ -16180,10 +16406,9 @@ function getFormFile(value) {
|
|
|
16180
16406
|
function safeUploadName(name$1) {
|
|
16181
16407
|
return path.basename(name$1).replace(/[^\w.-]/g, "_") || "upload.txt";
|
|
16182
16408
|
}
|
|
16183
|
-
function safeUploadNameForMime(file) {
|
|
16409
|
+
function safeUploadNameForMime(file, mimeType) {
|
|
16184
16410
|
const safeName = safeUploadName(file.name);
|
|
16185
|
-
const ext =
|
|
16186
|
-
if (!ext) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
|
|
16411
|
+
const ext = getExtensionForDetectedFile(mimeType);
|
|
16187
16412
|
return `${path.parse(safeName).name || "upload"}.${ext}`;
|
|
16188
16413
|
}
|
|
16189
16414
|
function jsonResponse(body, status) {
|
|
@@ -16193,10 +16418,10 @@ function jsonResponse(body, status) {
|
|
|
16193
16418
|
});
|
|
16194
16419
|
}
|
|
16195
16420
|
async function saveUploadToFile(file, uploadsDir, id) {
|
|
16196
|
-
validateFileUpload(file);
|
|
16197
|
-
await fs.mkdir(uploadsDir, { recursive: true });
|
|
16198
|
-
const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file)}`);
|
|
16199
16421
|
const buffer = Buffer.from(await file.arrayBuffer());
|
|
16422
|
+
const mimeType = await validateFileUploadContent(file, buffer);
|
|
16423
|
+
await fs.mkdir(uploadsDir, { recursive: true });
|
|
16424
|
+
const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file, mimeType)}`);
|
|
16200
16425
|
await fs.writeFile(filePath, buffer);
|
|
16201
16426
|
return filePath;
|
|
16202
16427
|
}
|
|
@@ -16277,7 +16502,10 @@ function extractRoutes(config) {
|
|
|
16277
16502
|
if (!result.success) return jsonResponse({
|
|
16278
16503
|
success: false,
|
|
16279
16504
|
error: result.error,
|
|
16280
|
-
auditId: result.auditId
|
|
16505
|
+
auditId: result.auditId,
|
|
16506
|
+
inputProcessing: result.inputProcessing,
|
|
16507
|
+
quality: result.quality,
|
|
16508
|
+
failureStage: result.failureStage
|
|
16281
16509
|
}, 500);
|
|
16282
16510
|
return jsonResponse({
|
|
16283
16511
|
success: true,
|
|
@@ -16286,7 +16514,10 @@ function extractRoutes(config) {
|
|
|
16286
16514
|
tablesInserted: result.tablesInserted,
|
|
16287
16515
|
notionPages: result.notionPages,
|
|
16288
16516
|
tokensUsed: result.tokensUsed,
|
|
16289
|
-
auditId: result.auditId
|
|
16517
|
+
auditId: result.auditId,
|
|
16518
|
+
inputProcessing: result.inputProcessing,
|
|
16519
|
+
quality: result.quality,
|
|
16520
|
+
failureStage: result.failureStage
|
|
16290
16521
|
}, 200);
|
|
16291
16522
|
} catch (error) {
|
|
16292
16523
|
if (isMissingUploadFileError(error)) return c.json({
|
|
@@ -16344,7 +16575,10 @@ function extractRoutes(config) {
|
|
|
16344
16575
|
if (!result.success) return jsonResponse({
|
|
16345
16576
|
success: false,
|
|
16346
16577
|
error: result.error,
|
|
16347
|
-
auditId: result.auditId
|
|
16578
|
+
auditId: result.auditId,
|
|
16579
|
+
inputProcessing: result.inputProcessing,
|
|
16580
|
+
quality: result.quality,
|
|
16581
|
+
failureStage: result.failureStage
|
|
16348
16582
|
}, 500);
|
|
16349
16583
|
return jsonResponse({
|
|
16350
16584
|
success: true,
|
|
@@ -16353,7 +16587,10 @@ function extractRoutes(config) {
|
|
|
16353
16587
|
tablesInserted: result.tablesInserted,
|
|
16354
16588
|
notionPages: result.notionPages,
|
|
16355
16589
|
tokensUsed: result.tokensUsed,
|
|
16356
|
-
auditId: result.auditId
|
|
16590
|
+
auditId: result.auditId,
|
|
16591
|
+
inputProcessing: result.inputProcessing,
|
|
16592
|
+
quality: result.quality,
|
|
16593
|
+
failureStage: result.failureStage
|
|
16357
16594
|
}, 200);
|
|
16358
16595
|
});
|
|
16359
16596
|
app.delete("/extract/records/:id", async (c) => {
|