aiex-cli 0.0.5-beta.4 → 0.0.5-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/cli.mjs +785 -460
- package/dist/{doctor-collector-Cv7RArla.mjs → doctor-collector-BpqhXNcO.mjs} +30 -92
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-sVI4PTNB.js +264 -0
- package/dist/web/assets/{DataBrowser-GAA-pGq0.js → DataBrowser-BGkZb9FV.js} +1 -1
- package/dist/web/assets/{ExtractionViewer-BhhWrBs2.js → ExtractionViewer-DNrkSECj.js} +1 -1
- package/dist/web/assets/{api-client-b4ZBXpNH.js → api-client-gQAAOw0v.js} +1 -1
- package/dist/web/assets/{index-CKV2X6sS.js → index-BQKZKzzP.js} +3 -3
- package/dist/web/assets/index-BU58oIRd.css +2 -0
- package/dist/web/index.html +3 -3
- package/dist/{zh-CN-CyL-61Ow.mjs → zh-CN-DkillGHx.mjs} +11 -24
- package/package.json +3 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272
- package/dist/web/assets/index-Csdgio76.css +0 -2
|
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
|
|
|
74
74
|
//#endregion
|
|
75
75
|
//#region package.json
|
|
76
76
|
var name = "aiex-cli";
|
|
77
|
-
var version = "0.0.5-beta.4";
|
|
77
|
+
var version = "0.0.5-beta.6";
|
|
78
78
|
var description = "JSON Schema → SQLite with AI-powered data extraction";
|
|
79
79
|
var package_default = {
|
|
80
80
|
name,
|
|
@@ -158,9 +158,11 @@ var package_default = {
|
|
|
158
158
|
"hono": "catalog:",
|
|
159
159
|
"i18next": "catalog:",
|
|
160
160
|
"i18next-fs-backend": "catalog:",
|
|
161
|
+
"js-tiktoken": "catalog:",
|
|
161
162
|
"jsonfile": "catalog:",
|
|
162
163
|
"jsonrepair": "catalog:",
|
|
163
164
|
"kysely": "catalog:",
|
|
165
|
+
"marked": "catalog:",
|
|
164
166
|
"mime": "catalog:",
|
|
165
167
|
"open": "catalog:",
|
|
166
168
|
"p-retry": "catalog:",
|
|
@@ -209,7 +211,6 @@ function seedConfig(config = createConfig()) {
|
|
|
209
211
|
//#endregion
|
|
210
212
|
//#region src/core/ai-extraction/schemas.ts
|
|
211
213
|
const ModelCapabilitiesSchema = z.object({
|
|
212
|
-
vision: z.boolean(),
|
|
213
214
|
structuredOutput: z.boolean(),
|
|
214
215
|
maxTokens: z.number().int().positive().optional(),
|
|
215
216
|
maxOutputTokens: z.number().int().positive().optional()
|
|
@@ -232,17 +233,15 @@ const ExtractionConfigSchema = z.object({
|
|
|
232
233
|
outputDir: z.string().min(1),
|
|
233
234
|
mode: z.enum(["pipeline"]).default("pipeline").optional(),
|
|
234
235
|
concurrency: z.number().int().min(1).optional(),
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
preFilteringLimit: z.number().int().min(1).optional()
|
|
236
|
+
maxTokens: z.number().int().positive().default(8e3).optional(),
|
|
237
|
+
overlapSize: z.number().int().nonnegative().optional()
|
|
238
238
|
});
|
|
239
239
|
const ImageOcrConfigSchema = z.object({
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
ocrLanguages: z.string().min(1).optional(),
|
|
240
|
+
imageConversion: z.enum(["vision", "local"]).default("local").optional(),
|
|
241
|
+
visionBaseURL: z.string().url().optional(),
|
|
242
|
+
visionApiKey: z.string().optional(),
|
|
243
|
+
imageModelName: z.string().min(1).optional(),
|
|
244
|
+
ocrLanguages: z.string().optional(),
|
|
246
245
|
ocrMinConfidence: z.number().min(0).max(1).optional()
|
|
247
246
|
});
|
|
248
247
|
const ExternalPdfConverterConfigSchema = z.object({
|
|
@@ -265,14 +264,10 @@ const PdfConfigSchema = z.object({
|
|
|
265
264
|
"unpdf",
|
|
266
265
|
"mineru",
|
|
267
266
|
"mineru_api",
|
|
268
|
-
"markitdown",
|
|
269
|
-
"marker",
|
|
270
267
|
"external"
|
|
271
268
|
]),
|
|
272
269
|
mineru: ExternalPdfConverterConfigSchema.optional(),
|
|
273
270
|
mineruApi: MineruApiPdfConverterConfigSchema.optional(),
|
|
274
|
-
markitdown: ExternalPdfConverterConfigSchema.optional(),
|
|
275
|
-
marker: ExternalPdfConverterConfigSchema.optional(),
|
|
276
271
|
external: ExternalPdfConverterConfigSchema.optional()
|
|
277
272
|
});
|
|
278
273
|
const LangfuseConfigSchema = z.object({
|
|
@@ -312,16 +307,7 @@ const PLACEHOLDER_SCHEMA = "{schema}";
|
|
|
312
307
|
const PLACEHOLDER_TEXT = "{text}";
|
|
313
308
|
const DEFAULT_MODELS = [{
|
|
314
309
|
name: "qwen-plus",
|
|
315
|
-
capabilities: {
|
|
316
|
-
vision: false,
|
|
317
|
-
structuredOutput: true
|
|
318
|
-
}
|
|
319
|
-
}, {
|
|
320
|
-
name: "qwen-vl-plus",
|
|
321
|
-
capabilities: {
|
|
322
|
-
vision: true,
|
|
323
|
-
structuredOutput: true
|
|
324
|
-
}
|
|
310
|
+
capabilities: { structuredOutput: true }
|
|
325
311
|
}];
|
|
326
312
|
const DEFAULT_PROVIDER_CONFIG = {
|
|
327
313
|
baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
|
@@ -338,7 +324,10 @@ Extraction requirements:
|
|
|
338
324
|
1. Extract strictly according to the field names and types defined in the structure
|
|
339
325
|
2. If the text lacks information for a field, set that field to null
|
|
340
326
|
3. Do not add fields that do not exist in the structure definition
|
|
341
|
-
4.
|
|
327
|
+
4. Use only facts present in the source text; do not infer, guess, or complete missing values from outside knowledge
|
|
328
|
+
5. Normalize values to the target type: numbers as JSON numbers, booleans as true/false, dates and formatted strings exactly as requested by the field format
|
|
329
|
+
6. For repeated or conflicting mentions, prefer the most specific final value in the source text and ignore placeholder values such as N/A, unknown, TBD, or empty strings
|
|
330
|
+
7. Maintain data accuracy and completeness`,
|
|
342
331
|
userTemplate: `Please extract data from the following text:
|
|
343
332
|
{text}`
|
|
344
333
|
};
|
|
@@ -347,7 +336,7 @@ const DEFAULT_EXTRACTION_CONFIG = {
|
|
|
347
336
|
mode: "pipeline"
|
|
348
337
|
};
|
|
349
338
|
const DEFAULT_IMAGE_OCR_CONFIG = {
|
|
350
|
-
|
|
339
|
+
imageConversion: "local",
|
|
351
340
|
ocrLanguages: "en-US, zh-Hans",
|
|
352
341
|
ocrMinConfidence: 0
|
|
353
342
|
};
|
|
@@ -362,26 +351,6 @@ const DEFAULT_MINERU_CONFIG = {
|
|
|
362
351
|
timeout: 600,
|
|
363
352
|
fallbackToUnpdf: true
|
|
364
353
|
};
|
|
365
|
-
const DEFAULT_MARKITDOWN_CONFIG = {
|
|
366
|
-
command: "markitdown",
|
|
367
|
-
args: [
|
|
368
|
-
"{input}",
|
|
369
|
-
"-o",
|
|
370
|
-
"{outputDir}/{basename}.md"
|
|
371
|
-
],
|
|
372
|
-
timeout: 600,
|
|
373
|
-
fallbackToUnpdf: true
|
|
374
|
-
};
|
|
375
|
-
const DEFAULT_MARKER_CONFIG = {
|
|
376
|
-
command: "marker_single",
|
|
377
|
-
args: [
|
|
378
|
-
"{input}",
|
|
379
|
-
"--output_dir",
|
|
380
|
-
"{outputDir}"
|
|
381
|
-
],
|
|
382
|
-
timeout: 600,
|
|
383
|
-
fallbackToUnpdf: true
|
|
384
|
-
};
|
|
385
354
|
const DEFAULT_MINERU_API_CONFIG = {
|
|
386
355
|
token: "",
|
|
387
356
|
baseURL: "https://mineru.net/api/v4",
|
|
@@ -393,9 +362,7 @@ const DEFAULT_MINERU_API_CONFIG = {
|
|
|
393
362
|
const DEFAULT_PDF_CONFIG = {
|
|
394
363
|
converter: "unpdf",
|
|
395
364
|
mineru: DEFAULT_MINERU_CONFIG,
|
|
396
|
-
mineruApi: DEFAULT_MINERU_API_CONFIG,
|
|
397
|
-
markitdown: DEFAULT_MARKITDOWN_CONFIG,
|
|
398
|
-
marker: DEFAULT_MARKER_CONFIG
|
|
365
|
+
mineruApi: DEFAULT_MINERU_API_CONFIG
|
|
399
366
|
};
|
|
400
367
|
const DEFAULT_AI_CONFIG = {
|
|
401
368
|
provider: DEFAULT_PROVIDER_CONFIG,
|
|
@@ -567,6 +534,8 @@ const en = {
|
|
|
567
534
|
file: {
|
|
568
535
|
hashWarning: "Failed to calculate file hash for {{file}}: {{error}}",
|
|
569
536
|
alreadyProcessed: "File {{file}} (hash: {{hash}}) has already been processed successfully. Skipping.",
|
|
537
|
+
visionTranscribed: "Transcribed image text via AI vision model ({{model}})",
|
|
538
|
+
visionTranscribeFailed: "Vision model transcription failed for {{model}}, falling back to local OCR",
|
|
570
539
|
ocrText: "Extracted image text via local OCR (confidence: {{confidence}}%)",
|
|
571
540
|
pdfFallback: "Fell back to unpdf — {{count}} page(s) extracted",
|
|
572
541
|
pdfConverted: "Converted PDF via {{name}}, {{count}} page(s)",
|
|
@@ -577,9 +546,8 @@ const en = {
|
|
|
577
546
|
extractFail: "Extraction failed",
|
|
578
547
|
extractComplete: "Extraction complete",
|
|
579
548
|
extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
580
|
-
chunking: "Input text
|
|
549
|
+
chunking: "Input text ({{length}} tokens) exceeds limit ({{limit}} tokens). Splitting into chunks...",
|
|
581
550
|
chunksCount: "Split into {{count}} chunk(s).",
|
|
582
|
-
preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
|
|
583
551
|
extractingChunk: "Extracting chunk {{current}}/{{total}}...",
|
|
584
552
|
extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
585
553
|
extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
|
|
@@ -691,14 +659,10 @@ const en = {
|
|
|
691
659
|
ai: {
|
|
692
660
|
apiKeyMissing: "API Key not configured. Please configure AI settings in the web UI.",
|
|
693
661
|
extractionNotObject: "Extraction result is not an object and cannot be written to Notion.",
|
|
694
|
-
noModels: "No AI models configured. Please add at least one model in AI Settings.",
|
|
695
|
-
noVisionModel: "Image input requires a model with vision capability{{hint}}.",
|
|
696
|
-
noVisionModelContext: "No vision-capable model with sufficient context window (≥{{tokens}} tokens) found{{hint}}.",
|
|
697
|
-
addSuitableModel: " Please add a suitable vision-capable model in AI Settings."
|
|
662
|
+
noModels: "No AI models configured. Please add at least one model in AI Settings."
|
|
698
663
|
},
|
|
699
664
|
ocr: {
|
|
700
665
|
platformUnsupported: "Local OCR is only available on macOS or Windows. Current platform: {{platform}}.",
|
|
701
|
-
disabled: "Image OCR fallback is disabled in AI settings.",
|
|
702
666
|
unavailable: "Local OCR is unavailable. Install optional dependency @napi-rs/system-ocr and approve its native build scripts. {{error}}",
|
|
703
667
|
noText: "Local OCR did not recognize any text in the image.",
|
|
704
668
|
lowConfidence: "Local OCR confidence {{confidence}}% is below the configured minimum {{min}}%."
|
|
@@ -800,26 +764,19 @@ const en = {
|
|
|
800
764
|
models: "Models",
|
|
801
765
|
addModel: "Add Model",
|
|
802
766
|
modelName: "Model name (e.g. gpt-4o)",
|
|
803
|
-
structuredOutput: "Structured Output",
|
|
804
|
-
textOnlyOutput: "Text-only Output",
|
|
805
|
-
visionSupported: "Vision Supported",
|
|
806
|
-
visionUnsupported: "Vision Unsupported",
|
|
807
767
|
subscribe: "Registry",
|
|
808
768
|
imageInput: "Image Input",
|
|
809
769
|
imageInputSummary: {
|
|
810
|
-
visionModel: "Image
|
|
811
|
-
|
|
812
|
-
ocrLocal: "
|
|
813
|
-
ocrAuto: "No vision model is configured. On macOS or Windows, local OCR will be tried automatically for text-heavy images."
|
|
770
|
+
visionModel: "Image text is transcribed via {{model}} before structured extraction.",
|
|
771
|
+
ocrNoModel: "No vision model selected. Image text will be read through local OCR.",
|
|
772
|
+
ocrLocal: "Image text will be read through local OCR."
|
|
814
773
|
},
|
|
815
|
-
visionModelConfigured: "Vision model configured",
|
|
816
|
-
noVisionModel: "No vision model",
|
|
817
774
|
advancedImageSettings: "Advanced image settings",
|
|
818
775
|
hideAdvancedImageSettings: "Hide advanced image settings",
|
|
819
|
-
ocrFallback: "
|
|
776
|
+
ocrFallback: "Image input mode",
|
|
820
777
|
ocrLanguages: "Languages",
|
|
821
778
|
ocrMinConfidence: "Minimum confidence",
|
|
822
|
-
ocrHint: "
|
|
779
|
+
ocrHint: "Images are converted to text before structured extraction.",
|
|
823
780
|
pdfConversion: "PDF Conversion",
|
|
824
781
|
converter: "Converter",
|
|
825
782
|
command: "Command",
|
|
@@ -894,14 +851,11 @@ const en = {
|
|
|
894
851
|
converterOptions: {
|
|
895
852
|
unpdf: "Built-in text extraction (unpdf)",
|
|
896
853
|
mineru: "MinerU (mineru)",
|
|
897
|
-
markitdown: "MarkItDown (markitdown)",
|
|
898
|
-
marker: "Marker (marker_single)",
|
|
899
854
|
external: "Custom External Command"
|
|
900
855
|
},
|
|
901
856
|
ocrFallbackOptions: {
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
local: "Require local OCR"
|
|
857
|
+
vision: "Vision model (fallback to OCR)",
|
|
858
|
+
local: "Local OCR only"
|
|
905
859
|
}
|
|
906
860
|
},
|
|
907
861
|
prompt: {
|
|
@@ -973,7 +927,7 @@ async function initI18n(lng) {
|
|
|
973
927
|
fallbackLng: "en",
|
|
974
928
|
resources: {
|
|
975
929
|
"en": { translation: en },
|
|
976
|
-
"zh-CN": { translation: await import("./zh-CN-CyL-61Ow.mjs").then((m) => m.zhCN) }
|
|
930
|
+
"zh-CN": { translation: await import("./zh-CN-DkillGHx.mjs").then((m) => m.zhCN) }
|
|
977
931
|
},
|
|
978
932
|
interpolation: { escapeValue: false },
|
|
979
933
|
returnNull: false
|
|
@@ -997,20 +951,6 @@ const defaultRuntime = {
|
|
|
997
951
|
return await import("@napi-rs/system-ocr");
|
|
998
952
|
}
|
|
999
953
|
};
|
|
1000
|
-
function imageOcrMode(config) {
|
|
1001
|
-
return config?.ocrFallback ?? "auto";
|
|
1002
|
-
}
|
|
1003
|
-
function hasVisionModel(aiConfig, modelOverride) {
|
|
1004
|
-
if (modelOverride) return modelOverride.capabilities.vision;
|
|
1005
|
-
return aiConfig?.provider.models.some((model) => model.capabilities.vision) ?? true;
|
|
1006
|
-
}
|
|
1007
|
-
function shouldUseImageOcrFallback(aiConfig, modelOverride, runtime = defaultRuntime) {
|
|
1008
|
-
if (hasVisionModel(aiConfig, modelOverride)) return false;
|
|
1009
|
-
const mode = imageOcrMode(aiConfig?.image);
|
|
1010
|
-
if (mode === "off") return false;
|
|
1011
|
-
if (mode === "local") return true;
|
|
1012
|
-
return isLocalOcrPlatform(runtime.platform);
|
|
1013
|
-
}
|
|
1014
954
|
function isLocalOcrPlatform(platform) {
|
|
1015
955
|
return platform === "darwin" || platform === "win32";
|
|
1016
956
|
}
|
|
@@ -1018,9 +958,7 @@ function parseOcrLanguages(languages) {
|
|
|
1018
958
|
return (languages ?? DEFAULT_OCR_LANGUAGES).split(",").map((language) => language.trim()).filter(Boolean);
|
|
1019
959
|
}
|
|
1020
960
|
async function recognizeImageText(imagePath, config, runtime = defaultRuntime) {
|
|
1021
|
-
const mode = imageOcrMode(config);
|
|
1022
961
|
if (!isLocalOcrPlatform(runtime.platform)) throw new Error(t("errors.ocr.platformUnsupported", { platform: runtime.platform }));
|
|
1023
|
-
if (mode === "off") throw new Error(t("errors.ocr.disabled"));
|
|
1024
962
|
let localOcr;
|
|
1025
963
|
try {
|
|
1026
964
|
localOcr = await runtime.loadLocalOcr();
|
|
@@ -1553,4 +1491,4 @@ async function collectDoctorDiagnostics(options = {}) {
|
|
|
1553
1491
|
}
|
|
1554
1492
|
|
|
1555
1493
|
//#endregion
|
|
1556
|
-
export {
|
|
1494
|
+
export { name as C, doctorDiagnosticsTableRows as D, buildDoctorDiagnostics as E, formatDoctorDiagnosticsJson as O, description as S, version as T, PLACEHOLDER_SCHEMA as _, parseJsonSchema as a, createConfig as b, recognizeImageText as c, getDefaultAIConfig as d, readAIConfig as f, DEFAULT_PROMPT_CONFIG as g, DEFAULT_MINERU_CONFIG as h, JsonSchemaDefinitionSchema as i, initI18n as l, DEFAULT_MINERU_API_CONFIG as m, createMigrationConfig as n, toSnakeCase as o, writeAIConfig as p, generateDrizzleConfig as r, generateDrizzleSchema as s, collectDoctorDiagnostics as t, t as u, PLACEHOLDER_TEXT as v, package_default as w, seedConfig as x, AIConfigSchema as y };
|
package/dist/index.mjs
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { D as doctorDiagnosticsTableRows, E as buildDoctorDiagnostics, O as formatDoctorDiagnosticsJson, a as parseJsonSchema, i as JsonSchemaDefinitionSchema, n as createMigrationConfig, r as generateDrizzleConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics } from "./doctor-collector-BpqhXNcO.mjs";
|
|
2
2
|
|
|
3
3
|
export { JsonSchemaDefinitionSchema, buildDoctorDiagnostics, collectDoctorDiagnostics, createMigrationConfig, doctorDiagnosticsTableRows, formatDoctorDiagnosticsJson, generateDrizzleConfig, generateDrizzleSchema, parseJsonSchema };
|