aiex-cli 0.0.6-beta.1 → 0.0.6-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/cli.mjs +142 -68
- package/dist/{doctor-collector-hWEvJ4lw.mjs → doctor-collector-CGo5dgHm.mjs} +31 -58
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-Dbma0Oku.js +264 -0
- package/dist/web/assets/ExtractionViewer-CrQMLtX7.js +1 -0
- package/dist/web/assets/{index-Dlze68g1.js → index-CdQgz6dJ.js} +8 -8
- package/dist/web/assets/index-D0So2rJE.css +2 -0
- package/dist/web/index.html +2 -2
- package/dist/{zh-CN-Qcn0DHFh.mjs → zh-CN-wEUNhuHM.mjs} +3 -9
- package/package.json +2 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +0 -1
- package/dist/web/assets/index-CvY9TGny.css +0 -2
package/README.md
CHANGED
|
@@ -208,6 +208,7 @@ aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI
|
|
|
208
208
|
|
|
209
209
|
- **Provider** — Set your base URL and API key
|
|
210
210
|
- **Models** — Add models with vision and/or structured output capabilities
|
|
211
|
+
- **Documents** — Choose a PDF converter (`unpdf`, `mineru`, `mineru_api`, or `external`); image input automatically uses a vision model when available, otherwise system OCR on supported platforms
|
|
211
212
|
- **Prompts** — Customize system and user prompt templates with `{schema}` and `{text}` placeholders
|
|
212
213
|
- **Integrations** — Optionally connect Notion from AI Settings; use Connect & Map to bind a schema to an existing Notion data source
|
|
213
214
|
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { C as description, E as version, O as doctorDiagnosticsTableRows, S as seedConfig, T as package_default, _ as DEFAULT_PROMPT_CONFIG, a as parseJsonSchema, b as AIConfigSchema, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MINERU_CONFIG, h as DEFAULT_MINERU_API_CONFIG, i as JsonSchemaDefinitionSchema, k as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as PLACEHOLDER_SCHEMA, w as name, x as createConfig, y as PLACEHOLDER_TEXT } from "./doctor-collector-CGo5dgHm.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -21,6 +21,8 @@ import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
|
21
21
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
22
22
|
import pRetry from "p-retry";
|
|
23
23
|
import mime from "mime";
|
|
24
|
+
import { TextDecoder, promisify } from "node:util";
|
|
25
|
+
import { fileTypeFromBuffer, fileTypeFromFile } from "file-type";
|
|
24
26
|
import { jsonrepair } from "jsonrepair";
|
|
25
27
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
28
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
@@ -31,7 +33,6 @@ import { glob, globSync } from "tinyglobby";
|
|
|
31
33
|
import { extractText, getDocumentProxy, getMeta } from "unpdf";
|
|
32
34
|
import AdmZip from "adm-zip";
|
|
33
35
|
import { execFile } from "node:child_process";
|
|
34
|
-
import { promisify } from "node:util";
|
|
35
36
|
import * as chokidar from "chokidar";
|
|
36
37
|
import { serve } from "@hono/node-server";
|
|
37
38
|
import open from "open";
|
|
@@ -12859,13 +12860,65 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
|
|
|
12859
12860
|
});
|
|
12860
12861
|
}
|
|
12861
12862
|
|
|
12863
|
+
//#endregion
|
|
12864
|
+
//#region src/core/input-file-kind.ts
|
|
12865
|
+
const UTF8_DECODER = new TextDecoder("utf-8", { fatal: true });
|
|
12866
|
+
const SVG_START_RE = /^\s*<svg[\s>]/i;
|
|
12867
|
+
const SVG_ANY_RE = /<svg[\s>]/i;
|
|
12868
|
+
function isSupportedImageMime(mime$1) {
|
|
12869
|
+
return !!mime$1 && [
|
|
12870
|
+
"image/png",
|
|
12871
|
+
"image/jpeg",
|
|
12872
|
+
"image/webp"
|
|
12873
|
+
].includes(mime$1);
|
|
12874
|
+
}
|
|
12875
|
+
function detectTextKind(buffer) {
|
|
12876
|
+
try {
|
|
12877
|
+
const text$1 = UTF8_DECODER.decode(buffer);
|
|
12878
|
+
if (SVG_START_RE.test(text$1) || SVG_ANY_RE.test(text$1.slice(0, 4096))) return {
|
|
12879
|
+
kind: "unsupported",
|
|
12880
|
+
mime: "image/svg+xml"
|
|
12881
|
+
};
|
|
12882
|
+
return {
|
|
12883
|
+
kind: "text",
|
|
12884
|
+
mime: "text/plain"
|
|
12885
|
+
};
|
|
12886
|
+
} catch {
|
|
12887
|
+
return { kind: "unsupported" };
|
|
12888
|
+
}
|
|
12889
|
+
}
|
|
12890
|
+
async function detectInputFileKind(filePath) {
|
|
12891
|
+
const detected = await fileTypeFromFile(filePath);
|
|
12892
|
+
if (detected?.mime === "application/pdf") return {
|
|
12893
|
+
kind: "pdf",
|
|
12894
|
+
mime: detected.mime
|
|
12895
|
+
};
|
|
12896
|
+
if (isSupportedImageMime(detected?.mime)) return {
|
|
12897
|
+
kind: "image",
|
|
12898
|
+
mime: detected?.mime
|
|
12899
|
+
};
|
|
12900
|
+
return detectTextKind(await fs.readFile(filePath));
|
|
12901
|
+
}
|
|
12902
|
+
async function detectInputBufferKind(buffer) {
|
|
12903
|
+
const detected = await fileTypeFromBuffer(buffer);
|
|
12904
|
+
if (detected?.mime === "application/pdf") return {
|
|
12905
|
+
kind: "pdf",
|
|
12906
|
+
mime: detected.mime
|
|
12907
|
+
};
|
|
12908
|
+
if (isSupportedImageMime(detected?.mime)) return {
|
|
12909
|
+
kind: "image",
|
|
12910
|
+
mime: detected?.mime
|
|
12911
|
+
};
|
|
12912
|
+
return detectTextKind(buffer);
|
|
12913
|
+
}
|
|
12914
|
+
|
|
12862
12915
|
//#endregion
|
|
12863
12916
|
//#region src/core/ai-extraction/file-utils.ts
|
|
12864
|
-
function detectMimeType(filePath) {
|
|
12865
|
-
return mime.getType(filePath) ?? "application/octet-stream";
|
|
12917
|
+
async function detectMimeType(filePath) {
|
|
12918
|
+
return (await detectInputFileKind(filePath)).mime ?? mime.getType(filePath) ?? "application/octet-stream";
|
|
12866
12919
|
}
|
|
12867
12920
|
async function readFilePart(filePath) {
|
|
12868
|
-
const mimeStr = detectMimeType(filePath);
|
|
12921
|
+
const mimeStr = await detectMimeType(filePath);
|
|
12869
12922
|
const buffer = await fs.readFile(filePath);
|
|
12870
12923
|
const name$1 = path.basename(filePath);
|
|
12871
12924
|
if (mimeStr.startsWith("image/")) return {
|
|
@@ -13212,7 +13265,7 @@ async function extractStructuredData(input) {
|
|
|
13212
13265
|
error: t("errors.ai.apiKeyMissing")
|
|
13213
13266
|
};
|
|
13214
13267
|
const useFileContent = !!file;
|
|
13215
|
-
const isImageFile = useFileContent
|
|
13268
|
+
const isImageFile = (useFileContent ? await detectMimeType(file) : "").startsWith("image/");
|
|
13216
13269
|
const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
|
|
13217
13270
|
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13218
13271
|
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
@@ -13953,30 +14006,10 @@ const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
|
|
|
13953
14006
|
const MAX_UPLOAD_SIZE_TEXT = "30MB";
|
|
13954
14007
|
const SUPPORTED_FILE_TYPES_TEXT = "images, PDF, text, markdown, CSV, JSON, HTML, XML, YAML";
|
|
13955
14008
|
const MISSING_UPLOAD_FILE_TEXT = t("errors.file.missingUpload");
|
|
13956
|
-
const SUPPORTED_MIME_TYPES = new Set([
|
|
13957
|
-
"image/png",
|
|
13958
|
-
"image/jpeg",
|
|
13959
|
-
"image/gif",
|
|
13960
|
-
"image/webp",
|
|
13961
|
-
"image/bmp",
|
|
13962
|
-
"image/svg+xml",
|
|
13963
|
-
"application/pdf",
|
|
13964
|
-
"text/plain",
|
|
13965
|
-
"text/markdown",
|
|
13966
|
-
"text/csv",
|
|
13967
|
-
"application/json",
|
|
13968
|
-
"text/html",
|
|
13969
|
-
"text/xml",
|
|
13970
|
-
"application/x-yaml",
|
|
13971
|
-
"text/yaml"
|
|
13972
|
-
]);
|
|
13973
14009
|
const MIME_TO_EXT = {
|
|
13974
14010
|
"image/png": "png",
|
|
13975
14011
|
"image/jpeg": "jpg",
|
|
13976
|
-
"image/gif": "gif",
|
|
13977
14012
|
"image/webp": "webp",
|
|
13978
|
-
"image/bmp": "bmp",
|
|
13979
|
-
"image/svg+xml": "svg",
|
|
13980
14013
|
"application/pdf": "pdf",
|
|
13981
14014
|
"text/plain": "txt",
|
|
13982
14015
|
"text/markdown": "md",
|
|
@@ -13993,8 +14026,8 @@ function bytesToMB(bytes) {
|
|
|
13993
14026
|
function getExtensionFromMime(mimeType) {
|
|
13994
14027
|
return MIME_TO_EXT[mimeType];
|
|
13995
14028
|
}
|
|
13996
|
-
function
|
|
13997
|
-
return
|
|
14029
|
+
function getExtensionForDetectedFile(mimeType) {
|
|
14030
|
+
return mimeType ? getExtensionFromMime(mimeType) ?? "txt" : "txt";
|
|
13998
14031
|
}
|
|
13999
14032
|
function unsupportedFileTypeMessage(mimeType) {
|
|
14000
14033
|
return t("errors.file.unsupportedType", {
|
|
@@ -14011,14 +14044,16 @@ var FileValidationError = class extends Error {
|
|
|
14011
14044
|
this.name = "FileValidationError";
|
|
14012
14045
|
}
|
|
14013
14046
|
};
|
|
14014
|
-
function validateFileUpload(file) {
|
|
14047
|
+
async function validateFileUploadContent(file, buffer) {
|
|
14015
14048
|
if (file.size === 0) throw new FileValidationError(t("errors.file.empty"));
|
|
14016
14049
|
if (file.size > MAX_UPLOAD_SIZE) throw new FileValidationError(t("errors.file.sizeExceeded", {
|
|
14017
14050
|
size: bytesToMB(file.size).toFixed(1),
|
|
14018
14051
|
limit: MAX_UPLOAD_SIZE_TEXT,
|
|
14019
14052
|
file: file.name
|
|
14020
14053
|
}));
|
|
14021
|
-
|
|
14054
|
+
const detected = await detectInputBufferKind(buffer);
|
|
14055
|
+
if (detected.kind === "unsupported") throw new FileValidationError(unsupportedFileTypeMessage(detected.mime ?? (file.type || "application/octet-stream")));
|
|
14056
|
+
return detected.mime ?? "text/plain";
|
|
14022
14057
|
}
|
|
14023
14058
|
|
|
14024
14059
|
//#endregion
|
|
@@ -14280,14 +14315,6 @@ function createPdfConverter(config) {
|
|
|
14280
14315
|
return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
|
|
14281
14316
|
}
|
|
14282
14317
|
if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
|
|
14283
|
-
if (config.converter === "markitdown") {
|
|
14284
|
-
const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
|
|
14285
|
-
return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
|
|
14286
|
-
}
|
|
14287
|
-
if (config.converter === "marker") {
|
|
14288
|
-
const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
|
|
14289
|
-
return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
|
|
14290
|
-
}
|
|
14291
14318
|
if (config.converter === "external") {
|
|
14292
14319
|
if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
|
|
14293
14320
|
return new ExternalCommandPdfConverter("external", config.external);
|
|
@@ -14309,12 +14336,32 @@ const FILE_PART_EXTENSIONS = new Set([
|
|
|
14309
14336
|
"png",
|
|
14310
14337
|
"jpg",
|
|
14311
14338
|
"jpeg",
|
|
14312
|
-
"
|
|
14313
|
-
"webp",
|
|
14314
|
-
"bmp",
|
|
14315
|
-
"svg"
|
|
14339
|
+
"webp"
|
|
14316
14340
|
]);
|
|
14317
14341
|
const PDF_EXT_RE = /\.pdf$/i;
|
|
14342
|
+
async function describeExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14343
|
+
const detected = await detectInputFileKind(filePath);
|
|
14344
|
+
if (detected.kind === "image") return {
|
|
14345
|
+
kind: "image",
|
|
14346
|
+
mime: detected.mime,
|
|
14347
|
+
handler: shouldUseImageOcrFallback(aiConfig, modelOverride) ? "image_local_ocr" : "image_vision"
|
|
14348
|
+
};
|
|
14349
|
+
if (detected.kind === "pdf") {
|
|
14350
|
+
const converter = createPdfConverter(aiConfig?.pdf);
|
|
14351
|
+
return {
|
|
14352
|
+
kind: "pdf",
|
|
14353
|
+
mime: detected.mime,
|
|
14354
|
+
handler: "pdf_converter",
|
|
14355
|
+
converter: converter.name
|
|
14356
|
+
};
|
|
14357
|
+
}
|
|
14358
|
+
if (detected.kind === "text") return {
|
|
14359
|
+
kind: "text",
|
|
14360
|
+
mime: detected.mime,
|
|
14361
|
+
handler: "text"
|
|
14362
|
+
};
|
|
14363
|
+
throw new Error(unsupportedFileTypeMessage(detected.mime ?? "application/octet-stream"));
|
|
14364
|
+
}
|
|
14318
14365
|
async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14319
14366
|
const stat = fs$1.statSync(filePath);
|
|
14320
14367
|
if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
|
|
@@ -14322,19 +14369,23 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
|
14322
14369
|
limit: MAX_UPLOAD_SIZE_TEXT,
|
|
14323
14370
|
file: filePath
|
|
14324
14371
|
}));
|
|
14325
|
-
const
|
|
14326
|
-
if (
|
|
14327
|
-
if (
|
|
14328
|
-
const result = await recognizeImageText(filePath, aiConfig?.image);
|
|
14372
|
+
const inputProcessing = await describeExtractFileInput(filePath, aiConfig, modelOverride);
|
|
14373
|
+
if (inputProcessing.kind === "image") {
|
|
14374
|
+
if (inputProcessing.handler === "image_local_ocr") {
|
|
14375
|
+
const result = await recognizeImageText(filePath);
|
|
14329
14376
|
consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
|
|
14330
|
-
return { text: result.text };
|
|
14377
|
+
return {
|
|
14378
|
+
text: result.text,
|
|
14379
|
+
inputProcessing
|
|
14380
|
+
};
|
|
14331
14381
|
}
|
|
14332
14382
|
return {
|
|
14333
14383
|
text: "",
|
|
14334
|
-
filePath
|
|
14384
|
+
filePath,
|
|
14385
|
+
inputProcessing
|
|
14335
14386
|
};
|
|
14336
14387
|
}
|
|
14337
|
-
if (
|
|
14388
|
+
if (inputProcessing.kind === "pdf") {
|
|
14338
14389
|
const buffer = await fs.readFile(filePath);
|
|
14339
14390
|
const converter = createPdfConverter(aiConfig?.pdf);
|
|
14340
14391
|
const result = await converter.convert(buffer, filePath);
|
|
@@ -14352,9 +14403,16 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
|
14352
14403
|
await fs.writeFile(fallbackMd, result.text);
|
|
14353
14404
|
consola.info(t("command.extract.file.markdownSaved", { path: fallbackMd }));
|
|
14354
14405
|
}
|
|
14355
|
-
return { text: result.text };
|
|
14406
|
+
return {
|
|
14407
|
+
text: result.text,
|
|
14408
|
+
inputProcessing
|
|
14409
|
+
};
|
|
14356
14410
|
}
|
|
14357
|
-
|
|
14411
|
+
if (inputProcessing.kind === "text") return {
|
|
14412
|
+
text: await fs.readFile(filePath, "utf-8"),
|
|
14413
|
+
inputProcessing
|
|
14414
|
+
};
|
|
14415
|
+
throw new Error(unsupportedFileTypeMessage(inputProcessing.mime ?? "application/octet-stream"));
|
|
14358
14416
|
}
|
|
14359
14417
|
|
|
14360
14418
|
//#endregion
|
|
@@ -14610,6 +14668,10 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14610
14668
|
tokensUsed: result.tokensUsed
|
|
14611
14669
|
};
|
|
14612
14670
|
}
|
|
14671
|
+
function formatInputProcessing$1(input) {
|
|
14672
|
+
const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
|
|
14673
|
+
return `${input.mime ?? input.kind} -> ${handler}`;
|
|
14674
|
+
}
|
|
14613
14675
|
async function runAuditedExtraction(options) {
|
|
14614
14676
|
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
|
|
14615
14677
|
let fileHash;
|
|
@@ -14650,7 +14712,8 @@ async function runAuditedExtraction(options) {
|
|
|
14650
14712
|
outputName: existing.outputName,
|
|
14651
14713
|
tablesInserted: existing.tablesInserted,
|
|
14652
14714
|
notionPages: existing.notionPages,
|
|
14653
|
-
tokensUsed: existing.tokensUsed
|
|
14715
|
+
tokensUsed: existing.tokensUsed,
|
|
14716
|
+
inputProcessing: existing.inputProcessing
|
|
14654
14717
|
};
|
|
14655
14718
|
}
|
|
14656
14719
|
}
|
|
@@ -14672,10 +14735,14 @@ async function runAuditedExtraction(options) {
|
|
|
14672
14735
|
try {
|
|
14673
14736
|
let text$1 = "";
|
|
14674
14737
|
let filePath;
|
|
14738
|
+
let inputProcessing;
|
|
14675
14739
|
if (source.type === "file") {
|
|
14676
14740
|
const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
|
|
14677
14741
|
text$1 = input.text;
|
|
14678
14742
|
filePath = input.filePath;
|
|
14743
|
+
inputProcessing = input.inputProcessing;
|
|
14744
|
+
if (!quiet) consola.info(`Input: ${formatInputProcessing$1(inputProcessing)}`);
|
|
14745
|
+
await updateExtractionAuditRecord(aiexDir, audit.id, { inputProcessing });
|
|
14679
14746
|
} else text$1 = source.text;
|
|
14680
14747
|
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
14681
14748
|
quiet,
|
|
@@ -14701,7 +14768,8 @@ async function runAuditedExtraction(options) {
|
|
|
14701
14768
|
success: false,
|
|
14702
14769
|
error: error instanceof Error ? error.message : String(error),
|
|
14703
14770
|
auditId: audit.id,
|
|
14704
|
-
fileHash
|
|
14771
|
+
fileHash,
|
|
14772
|
+
inputProcessing
|
|
14705
14773
|
};
|
|
14706
14774
|
}
|
|
14707
14775
|
const updated = await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
@@ -14721,7 +14789,8 @@ async function runAuditedExtraction(options) {
|
|
|
14721
14789
|
notionPages: updated.notionPages,
|
|
14722
14790
|
tokensUsed: updated.tokensUsed,
|
|
14723
14791
|
auditId: updated.id,
|
|
14724
|
-
fileHash
|
|
14792
|
+
fileHash,
|
|
14793
|
+
inputProcessing: updated.inputProcessing
|
|
14725
14794
|
};
|
|
14726
14795
|
} else {
|
|
14727
14796
|
await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
@@ -14734,7 +14803,8 @@ async function runAuditedExtraction(options) {
|
|
|
14734
14803
|
success: false,
|
|
14735
14804
|
error: r.error,
|
|
14736
14805
|
auditId: audit.id,
|
|
14737
|
-
fileHash
|
|
14806
|
+
fileHash,
|
|
14807
|
+
inputProcessing
|
|
14738
14808
|
};
|
|
14739
14809
|
}
|
|
14740
14810
|
} catch (e) {
|
|
@@ -14938,6 +15008,11 @@ function isExtractSubCommand(rawArgs) {
|
|
|
14938
15008
|
function formatSource(source) {
|
|
14939
15009
|
return source.type === "file" ? source.fileName || "file" : "unknown";
|
|
14940
15010
|
}
|
|
15011
|
+
function formatInputProcessing(input) {
|
|
15012
|
+
if (!input) return "";
|
|
15013
|
+
const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
|
|
15014
|
+
return ` [${input.mime ?? input.kind} -> ${handler}]`;
|
|
15015
|
+
}
|
|
14941
15016
|
async function loadConfiguredAI(aiexDir) {
|
|
14942
15017
|
const aiConfig = await readAIConfig(aiexDir);
|
|
14943
15018
|
if (!aiConfig) {
|
|
@@ -14980,7 +15055,7 @@ const historyCommand = defineCommand({
|
|
|
14980
15055
|
}
|
|
14981
15056
|
for (const record of records) {
|
|
14982
15057
|
const suffix = record.error ? ` — ${record.error}` : record.outputName ? ` — ${record.outputName}` : "";
|
|
14983
|
-
consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${suffix}`);
|
|
15058
|
+
consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${formatInputProcessing(record.inputProcessing)}${suffix}`);
|
|
14984
15059
|
}
|
|
14985
15060
|
}
|
|
14986
15061
|
});
|
|
@@ -15493,10 +15568,7 @@ const SUPPORTED_EXTENSIONS = new Set([
|
|
|
15493
15568
|
"png",
|
|
15494
15569
|
"jpg",
|
|
15495
15570
|
"jpeg",
|
|
15496
|
-
"gif",
|
|
15497
15571
|
"webp",
|
|
15498
|
-
"bmp",
|
|
15499
|
-
"svg",
|
|
15500
15572
|
"pdf",
|
|
15501
15573
|
"txt",
|
|
15502
15574
|
"md",
|
|
@@ -15909,7 +15981,8 @@ async function listExtractions(config) {
|
|
|
15909
15981
|
modifiedAt: stat.mtime.toISOString(),
|
|
15910
15982
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
15911
15983
|
notionPages,
|
|
15912
|
-
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
15984
|
+
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0,
|
|
15985
|
+
inputProcessing: audit?.inputProcessing
|
|
15913
15986
|
});
|
|
15914
15987
|
} catch {
|
|
15915
15988
|
continue;
|
|
@@ -16180,10 +16253,9 @@ function getFormFile(value) {
|
|
|
16180
16253
|
function safeUploadName(name$1) {
|
|
16181
16254
|
return path.basename(name$1).replace(/[^\w.-]/g, "_") || "upload.txt";
|
|
16182
16255
|
}
|
|
16183
|
-
function safeUploadNameForMime(file) {
|
|
16256
|
+
function safeUploadNameForMime(file, mimeType) {
|
|
16184
16257
|
const safeName = safeUploadName(file.name);
|
|
16185
|
-
const ext = getExtensionFromMime(file.type);
|
|
16186
|
-
if (!ext) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
|
|
16258
|
+
const ext = getExtensionForDetectedFile(mimeType);
|
|
16187
16259
|
return `${path.parse(safeName).name || "upload"}.${ext}`;
|
|
16188
16260
|
}
|
|
16189
16261
|
function jsonResponse(body, status) {
|
|
@@ -16193,10 +16265,10 @@ function jsonResponse(body, status) {
|
|
|
16193
16265
|
});
|
|
16194
16266
|
}
|
|
16195
16267
|
async function saveUploadToFile(file, uploadsDir, id) {
|
|
16196
|
-
validateFileUpload(file);
|
|
16197
|
-
await fs.mkdir(uploadsDir, { recursive: true });
|
|
16198
|
-
const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file)}`);
|
|
16199
16268
|
const buffer = Buffer.from(await file.arrayBuffer());
|
|
16269
|
+
const mimeType = await validateFileUploadContent(file, buffer);
|
|
16270
|
+
await fs.mkdir(uploadsDir, { recursive: true });
|
|
16271
|
+
const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file, mimeType)}`);
|
|
16200
16272
|
await fs.writeFile(filePath, buffer);
|
|
16201
16273
|
return filePath;
|
|
16202
16274
|
}
|
|
@@ -16286,7 +16358,8 @@ function extractRoutes(config) {
|
|
|
16286
16358
|
tablesInserted: result.tablesInserted,
|
|
16287
16359
|
notionPages: result.notionPages,
|
|
16288
16360
|
tokensUsed: result.tokensUsed,
|
|
16289
|
-
auditId: result.auditId
|
|
16361
|
+
auditId: result.auditId,
|
|
16362
|
+
inputProcessing: result.inputProcessing
|
|
16290
16363
|
}, 200);
|
|
16291
16364
|
} catch (error) {
|
|
16292
16365
|
if (isMissingUploadFileError(error)) return c.json({
|
|
@@ -16353,7 +16426,8 @@ function extractRoutes(config) {
|
|
|
16353
16426
|
tablesInserted: result.tablesInserted,
|
|
16354
16427
|
notionPages: result.notionPages,
|
|
16355
16428
|
tokensUsed: result.tokensUsed,
|
|
16356
|
-
auditId: result.auditId
|
|
16429
|
+
auditId: result.auditId,
|
|
16430
|
+
inputProcessing: result.inputProcessing
|
|
16357
16431
|
}, 200);
|
|
16358
16432
|
});
|
|
16359
16433
|
app.delete("/extract/records/:id", async (c) => {
|
|
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
|
|
|
74
74
|
//#endregion
|
|
75
75
|
//#region package.json
|
|
76
76
|
var name = "aiex-cli";
|
|
77
|
-
var version = "0.0.6-beta.1";
|
|
77
|
+
var version = "0.0.6-beta.2";
|
|
78
78
|
var description = "JSON Schema → SQLite with AI-powered data extraction";
|
|
79
79
|
var package_default = {
|
|
80
80
|
name,
|
|
@@ -155,6 +155,7 @@ var package_default = {
|
|
|
155
155
|
"es-toolkit": "catalog:",
|
|
156
156
|
"esbuild": "catalog:",
|
|
157
157
|
"execa": "catalog:",
|
|
158
|
+
"file-type": "catalog:",
|
|
158
159
|
"hono": "catalog:",
|
|
159
160
|
"i18next": "catalog:",
|
|
160
161
|
"i18next-fs-backend": "catalog:",
|
|
@@ -229,12 +230,13 @@ const PromptConfigSchema = z.object({
|
|
|
229
230
|
userTemplate: z.string().min(1)
|
|
230
231
|
});
|
|
231
232
|
const ExtractionConfigSchema = z.object({ outputDir: z.string().min(1) });
|
|
233
|
+
const ImageOcrFallbackSchema = z.preprocess((value) => [
|
|
234
|
+
"auto",
|
|
235
|
+
"off",
|
|
236
|
+
"local"
|
|
237
|
+
].includes(String(value)) ? "localAuto" : value, z.literal("localAuto").default("localAuto").optional());
|
|
232
238
|
const ImageOcrConfigSchema = z.object({
|
|
233
|
-
ocrFallback: z.enum([
|
|
234
|
-
"auto",
|
|
235
|
-
"off",
|
|
236
|
-
"local"
|
|
237
|
-
]).default("auto").optional(),
|
|
239
|
+
ocrFallback: ImageOcrFallbackSchema,
|
|
238
240
|
ocrLanguages: z.string().min(1).optional(),
|
|
239
241
|
ocrMinConfidence: z.number().min(0).max(1).optional()
|
|
240
242
|
});
|
|
@@ -253,21 +255,30 @@ const MineruApiPdfConverterConfigSchema = z.object({
|
|
|
253
255
|
enableFormula: z.boolean().optional(),
|
|
254
256
|
enableTable: z.boolean().optional()
|
|
255
257
|
});
|
|
256
|
-
const PdfConfigSchema = z.object({
|
|
258
|
+
const PdfConfigSchema = z.preprocess((value) => {
|
|
259
|
+
if (!value || typeof value !== "object") return value;
|
|
260
|
+
const config = { ...value };
|
|
261
|
+
if (config.converter === "markitdown" && config.markitdown) {
|
|
262
|
+
config.converter = "external";
|
|
263
|
+
config.external = config.markitdown;
|
|
264
|
+
} else if (config.converter === "marker" && config.marker) {
|
|
265
|
+
config.converter = "external";
|
|
266
|
+
config.external = config.marker;
|
|
267
|
+
} else if (config.converter === "markitdown" || config.converter === "marker") config.converter = "unpdf";
|
|
268
|
+
delete config.markitdown;
|
|
269
|
+
delete config.marker;
|
|
270
|
+
return config;
|
|
271
|
+
}, z.object({
|
|
257
272
|
converter: z.enum([
|
|
258
273
|
"unpdf",
|
|
259
274
|
"mineru",
|
|
260
275
|
"mineru_api",
|
|
261
|
-
"markitdown",
|
|
262
|
-
"marker",
|
|
263
276
|
"external"
|
|
264
277
|
]),
|
|
265
278
|
mineru: ExternalPdfConverterConfigSchema.optional(),
|
|
266
279
|
mineruApi: MineruApiPdfConverterConfigSchema.optional(),
|
|
267
|
-
markitdown: ExternalPdfConverterConfigSchema.optional(),
|
|
268
|
-
marker: ExternalPdfConverterConfigSchema.optional(),
|
|
269
280
|
external: ExternalPdfConverterConfigSchema.optional()
|
|
270
|
-
});
|
|
281
|
+
}));
|
|
271
282
|
const LangfuseConfigSchema = z.object({
|
|
272
283
|
publicKey: z.string(),
|
|
273
284
|
secretKey: z.string(),
|
|
@@ -336,11 +347,6 @@ Extraction requirements:
|
|
|
336
347
|
{text}`
|
|
337
348
|
};
|
|
338
349
|
const DEFAULT_EXTRACTION_CONFIG = { outputDir: ".aiex/extracted" };
|
|
339
|
-
const DEFAULT_IMAGE_OCR_CONFIG = {
|
|
340
|
-
ocrFallback: "auto",
|
|
341
|
-
ocrLanguages: "en-US, zh-Hans",
|
|
342
|
-
ocrMinConfidence: 0
|
|
343
|
-
};
|
|
344
350
|
const DEFAULT_MINERU_CONFIG = {
|
|
345
351
|
command: "mineru",
|
|
346
352
|
args: [
|
|
@@ -352,26 +358,6 @@ const DEFAULT_MINERU_CONFIG = {
|
|
|
352
358
|
timeout: 600,
|
|
353
359
|
fallbackToUnpdf: true
|
|
354
360
|
};
|
|
355
|
-
const DEFAULT_MARKITDOWN_CONFIG = {
|
|
356
|
-
command: "markitdown",
|
|
357
|
-
args: [
|
|
358
|
-
"{input}",
|
|
359
|
-
"-o",
|
|
360
|
-
"{outputDir}/{basename}.md"
|
|
361
|
-
],
|
|
362
|
-
timeout: 600,
|
|
363
|
-
fallbackToUnpdf: true
|
|
364
|
-
};
|
|
365
|
-
const DEFAULT_MARKER_CONFIG = {
|
|
366
|
-
command: "marker_single",
|
|
367
|
-
args: [
|
|
368
|
-
"{input}",
|
|
369
|
-
"--output_dir",
|
|
370
|
-
"{outputDir}"
|
|
371
|
-
],
|
|
372
|
-
timeout: 600,
|
|
373
|
-
fallbackToUnpdf: true
|
|
374
|
-
};
|
|
375
361
|
const DEFAULT_MINERU_API_CONFIG = {
|
|
376
362
|
token: "",
|
|
377
363
|
baseURL: "https://mineru.net/api/v4",
|
|
@@ -383,15 +369,12 @@ const DEFAULT_MINERU_API_CONFIG = {
|
|
|
383
369
|
const DEFAULT_PDF_CONFIG = {
|
|
384
370
|
converter: "unpdf",
|
|
385
371
|
mineru: DEFAULT_MINERU_CONFIG,
|
|
386
|
-
mineruApi: DEFAULT_MINERU_API_CONFIG,
|
|
387
|
-
markitdown: DEFAULT_MARKITDOWN_CONFIG,
|
|
388
|
-
marker: DEFAULT_MARKER_CONFIG
|
|
372
|
+
mineruApi: DEFAULT_MINERU_API_CONFIG
|
|
389
373
|
};
|
|
390
374
|
const DEFAULT_AI_CONFIG = {
|
|
391
375
|
provider: DEFAULT_PROVIDER_CONFIG,
|
|
392
376
|
prompt: DEFAULT_PROMPT_CONFIG,
|
|
393
377
|
extraction: DEFAULT_EXTRACTION_CONFIG,
|
|
394
|
-
image: DEFAULT_IMAGE_OCR_CONFIG,
|
|
395
378
|
pdf: DEFAULT_PDF_CONFIG,
|
|
396
379
|
webhook: {
|
|
397
380
|
enabled: false,
|
|
@@ -791,7 +774,7 @@ const en = {
|
|
|
791
774
|
imageInput: "Image Input",
|
|
792
775
|
imageInputSummary: {
|
|
793
776
|
visionModel: "Image files will use your configured vision model first.",
|
|
794
|
-
ocrFallback: "No vision model is configured, and local OCR
|
|
777
|
+
ocrFallback: "No vision model is configured, and local OCR is unavailable.",
|
|
795
778
|
ocrLocal: "No vision model is configured. Image text will require local OCR on macOS or Windows.",
|
|
796
779
|
ocrAuto: "No vision model is configured. On macOS or Windows, local OCR will be tried automatically for text-heavy images."
|
|
797
780
|
},
|
|
@@ -799,7 +782,7 @@ const en = {
|
|
|
799
782
|
noVisionModel: "No vision model",
|
|
800
783
|
advancedImageSettings: "Advanced image settings",
|
|
801
784
|
hideAdvancedImageSettings: "Hide advanced image settings",
|
|
802
|
-
ocrFallback: "OCR fallback",
|
|
785
|
+
ocrFallback: "Local OCR fallback",
|
|
803
786
|
ocrLanguages: "Languages",
|
|
804
787
|
ocrMinConfidence: "Minimum confidence",
|
|
805
788
|
ocrHint: "Image extraction always prefers a vision model. OCR fallback is only used when no vision model is available.",
|
|
@@ -877,15 +860,9 @@ const en = {
|
|
|
877
860
|
converterOptions: {
|
|
878
861
|
unpdf: "Built-in text extraction (unpdf)",
|
|
879
862
|
mineru: "MinerU (mineru)",
|
|
880
|
-
markitdown: "MarkItDown (markitdown)",
|
|
881
|
-
marker: "Marker (marker_single)",
|
|
882
863
|
external: "Custom External Command"
|
|
883
864
|
},
|
|
884
|
-
ocrFallbackOptions: {
|
|
885
|
-
auto: "Auto on macOS or Windows when no vision model exists",
|
|
886
|
-
off: "Off",
|
|
887
|
-
local: "Require local OCR"
|
|
888
|
-
}
|
|
865
|
+
ocrFallbackOptions: { localAuto: "Vision model or local OCR" }
|
|
889
866
|
},
|
|
890
867
|
prompt: {
|
|
891
868
|
defaultSystem: `You are a professional data extraction assistant. Your task is to extract structured data from text and return a JSON object based on the data structure definition provided below.
|
|
@@ -956,7 +933,7 @@ async function initI18n(lng) {
|
|
|
956
933
|
fallbackLng: "en",
|
|
957
934
|
resources: {
|
|
958
935
|
"en": { translation: en },
|
|
959
|
-
"zh-CN": { translation: await import("./zh-CN-Qcn0DHFh.mjs").then((m) => m.zhCN) }
|
|
936
|
+
"zh-CN": { translation: await import("./zh-CN-wEUNhuHM.mjs").then((m) => m.zhCN) }
|
|
960
937
|
},
|
|
961
938
|
interpolation: { escapeValue: false },
|
|
962
939
|
returnNull: false
|
|
@@ -981,7 +958,7 @@ const defaultRuntime = {
|
|
|
981
958
|
}
|
|
982
959
|
};
|
|
983
960
|
function imageOcrMode(config) {
|
|
984
|
-
return config?.ocrFallback ?? "auto";
|
|
961
|
+
return config?.ocrFallback ?? "localAuto";
|
|
985
962
|
}
|
|
986
963
|
function hasVisionModel(aiConfig, modelOverride) {
|
|
987
964
|
if (modelOverride) return modelOverride.capabilities.vision;
|
|
@@ -989,9 +966,7 @@ function hasVisionModel(aiConfig, modelOverride) {
|
|
|
989
966
|
}
|
|
990
967
|
function shouldUseImageOcrFallback(aiConfig, modelOverride, runtime = defaultRuntime) {
|
|
991
968
|
if (hasVisionModel(aiConfig, modelOverride)) return false;
|
|
992
|
-
|
|
993
|
-
if (mode === "off") return false;
|
|
994
|
-
if (mode === "local") return true;
|
|
969
|
+
if (imageOcrMode(aiConfig?.image) === "localAuto") return isLocalOcrPlatform(runtime.platform);
|
|
995
970
|
return isLocalOcrPlatform(runtime.platform);
|
|
996
971
|
}
|
|
997
972
|
function isLocalOcrPlatform(platform) {
|
|
@@ -1001,9 +976,7 @@ function parseOcrLanguages(languages) {
|
|
|
1001
976
|
return (languages ?? DEFAULT_OCR_LANGUAGES).split(",").map((language) => language.trim()).filter(Boolean);
|
|
1002
977
|
}
|
|
1003
978
|
async function recognizeImageText(imagePath, config, runtime = defaultRuntime) {
|
|
1004
|
-
const mode = imageOcrMode(config);
|
|
1005
979
|
if (!isLocalOcrPlatform(runtime.platform)) throw new Error(t("errors.ocr.platformUnsupported", { platform: runtime.platform }));
|
|
1006
|
-
if (mode === "off") throw new Error(t("errors.ocr.disabled"));
|
|
1007
980
|
let localOcr;
|
|
1008
981
|
try {
|
|
1009
982
|
localOcr = await runtime.loadLocalOcr();
|
|
@@ -1536,4 +1509,4 @@ async function collectDoctorDiagnostics(options = {}) {
|
|
|
1536
1509
|
}
|
|
1537
1510
|
|
|
1538
1511
|
//#endregion
|
|
1539
|
-
export {
|
|
1512
|
+
export { description as C, buildDoctorDiagnostics as D, version as E, doctorDiagnosticsTableRows as O, seedConfig as S, package_default as T, DEFAULT_PROMPT_CONFIG as _, parseJsonSchema as a, AIConfigSchema as b, recognizeImageText as c, t as d, getDefaultAIConfig as f, DEFAULT_MINERU_CONFIG as g, DEFAULT_MINERU_API_CONFIG as h, JsonSchemaDefinitionSchema as i, formatDoctorDiagnosticsJson as k, shouldUseImageOcrFallback as l, writeAIConfig as m, createMigrationConfig as n, toSnakeCase as o, readAIConfig as p, generateDrizzleConfig as r, generateDrizzleSchema as s, collectDoctorDiagnostics as t, initI18n as u, PLACEHOLDER_SCHEMA as v, name as w, createConfig as x, PLACEHOLDER_TEXT as y };
|
package/dist/index.mjs
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { D as buildDoctorDiagnostics, O as doctorDiagnosticsTableRows, a as parseJsonSchema, i as JsonSchemaDefinitionSchema, k as formatDoctorDiagnosticsJson, n as createMigrationConfig, r as generateDrizzleConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics } from "./doctor-collector-CGo5dgHm.mjs";
|
|
2
2
|
|
|
3
3
|
export { JsonSchemaDefinitionSchema, buildDoctorDiagnostics, collectDoctorDiagnostics, createMigrationConfig, doctorDiagnosticsTableRows, formatDoctorDiagnosticsJson, generateDrizzleConfig, generateDrizzleSchema, parseJsonSchema };
|