aiex-cli 0.0.6-beta.1 → 0.0.6-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-hWEvJ4lw.mjs";
1
+ import { C as description, E as version, O as doctorDiagnosticsTableRows, S as seedConfig, T as package_default, _ as DEFAULT_PROMPT_CONFIG, a as parseJsonSchema, b as AIConfigSchema, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MINERU_CONFIG, h as DEFAULT_MINERU_API_CONFIG, i as JsonSchemaDefinitionSchema, k as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as PLACEHOLDER_SCHEMA, w as name, x as createConfig, y as PLACEHOLDER_TEXT } from "./doctor-collector-abgpqc5T.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -21,6 +21,8 @@ import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
21
21
  import { APICallError, Output, generateText, jsonSchema } from "ai";
22
22
  import pRetry from "p-retry";
23
23
  import mime from "mime";
24
+ import { TextDecoder, promisify } from "node:util";
25
+ import { fileTypeFromBuffer, fileTypeFromFile } from "file-type";
24
26
  import { jsonrepair } from "jsonrepair";
25
27
  import { LangfuseSpanProcessor } from "@langfuse/otel";
26
28
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
@@ -31,7 +33,6 @@ import { glob, globSync } from "tinyglobby";
31
33
  import { extractText, getDocumentProxy, getMeta } from "unpdf";
32
34
  import AdmZip from "adm-zip";
33
35
  import { execFile } from "node:child_process";
34
- import { promisify } from "node:util";
35
36
  import * as chokidar from "chokidar";
36
37
  import { serve } from "@hono/node-server";
37
38
  import open from "open";
@@ -12859,13 +12860,65 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
12859
12860
  });
12860
12861
  }
12861
12862
 
12863
+ //#endregion
12864
+ //#region src/core/input-file-kind.ts
12865
+ const UTF8_DECODER = new TextDecoder("utf-8", { fatal: true });
12866
+ const SVG_START_RE = /^\s*<svg[\s>]/i;
12867
+ const SVG_ANY_RE = /<svg[\s>]/i;
12868
+ function isSupportedImageMime(mime$1) {
12869
+ return !!mime$1 && [
12870
+ "image/png",
12871
+ "image/jpeg",
12872
+ "image/webp"
12873
+ ].includes(mime$1);
12874
+ }
12875
+ function detectTextKind(buffer) {
12876
+ try {
12877
+ const text$1 = UTF8_DECODER.decode(buffer);
12878
+ if (SVG_START_RE.test(text$1) || SVG_ANY_RE.test(text$1.slice(0, 4096))) return {
12879
+ kind: "unsupported",
12880
+ mime: "image/svg+xml"
12881
+ };
12882
+ return {
12883
+ kind: "text",
12884
+ mime: "text/plain"
12885
+ };
12886
+ } catch {
12887
+ return { kind: "unsupported" };
12888
+ }
12889
+ }
12890
+ async function detectInputFileKind(filePath) {
12891
+ const detected = await fileTypeFromFile(filePath);
12892
+ if (detected?.mime === "application/pdf") return {
12893
+ kind: "pdf",
12894
+ mime: detected.mime
12895
+ };
12896
+ if (isSupportedImageMime(detected?.mime)) return {
12897
+ kind: "image",
12898
+ mime: detected?.mime
12899
+ };
12900
+ return detectTextKind(await fs.readFile(filePath));
12901
+ }
12902
+ async function detectInputBufferKind(buffer) {
12903
+ const detected = await fileTypeFromBuffer(buffer);
12904
+ if (detected?.mime === "application/pdf") return {
12905
+ kind: "pdf",
12906
+ mime: detected.mime
12907
+ };
12908
+ if (isSupportedImageMime(detected?.mime)) return {
12909
+ kind: "image",
12910
+ mime: detected?.mime
12911
+ };
12912
+ return detectTextKind(buffer);
12913
+ }
12914
+
12862
12915
  //#endregion
12863
12916
  //#region src/core/ai-extraction/file-utils.ts
12864
- function detectMimeType(filePath) {
12865
- return mime.getType(filePath) ?? "application/octet-stream";
12917
+ async function detectMimeType(filePath) {
12918
+ return (await detectInputFileKind(filePath)).mime ?? mime.getType(filePath) ?? "application/octet-stream";
12866
12919
  }
12867
12920
  async function readFilePart(filePath) {
12868
- const mimeStr = detectMimeType(filePath);
12921
+ const mimeStr = await detectMimeType(filePath);
12869
12922
  const buffer = await fs.readFile(filePath);
12870
12923
  const name$1 = path.basename(filePath);
12871
12924
  if (mimeStr.startsWith("image/")) return {
@@ -13205,14 +13258,48 @@ function validateExtractedData(schema, data) {
13205
13258
  //#endregion
13206
13259
  //#region src/core/ai-extraction/extractor.ts
13207
13260
  const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
13261
+ function expectedExtractionFields(schema) {
13262
+ return Object.entries(schema.properties).filter(([, prop]) => !(prop.primary && prop.autoIncrement)).map(([name$1]) => name$1);
13263
+ }
13264
+ function calculateMissingFields(schema, data) {
13265
+ const expected = expectedExtractionFields(schema);
13266
+ if (expected.length === 0) return {
13267
+ fields: [],
13268
+ rate: 0
13269
+ };
13270
+ if (!data || typeof data !== "object" || Array.isArray(data)) return {
13271
+ fields: expected,
13272
+ rate: 1
13273
+ };
13274
+ const record = data;
13275
+ const fields = expected.filter((field) => {
13276
+ const value = record[field];
13277
+ return value === void 0 || value === null || value === "";
13278
+ });
13279
+ return {
13280
+ fields,
13281
+ rate: fields.length / expected.length
13282
+ };
13283
+ }
13208
13284
  async function extractStructuredData(input) {
13209
13285
  const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
13286
+ let apiRetryCount = 0;
13287
+ const onApiRetry = (info) => {
13288
+ apiRetryCount += 1;
13289
+ input.onRetry?.(info);
13290
+ };
13210
13291
  if (!config.provider.apiKey) return {
13211
13292
  success: false,
13212
- error: t("errors.ai.apiKeyMissing")
13293
+ error: t("errors.ai.apiKeyMissing"),
13294
+ quality: { ai: {
13295
+ validationPassed: false,
13296
+ attempts: 0,
13297
+ selfCorrectionCount: 0,
13298
+ apiRetryCount
13299
+ } }
13213
13300
  };
13214
13301
  const useFileContent = !!file;
13215
- const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
13302
+ const isImageFile = (useFileContent ? await detectMimeType(file) : "").startsWith("image/");
13216
13303
  const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
13217
13304
  const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13218
13305
  const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
@@ -13228,7 +13315,13 @@ async function extractStructuredData(input) {
13228
13315
  } catch (e) {
13229
13316
  return {
13230
13317
  success: false,
13231
- error: e.message
13318
+ error: e.message,
13319
+ quality: { ai: {
13320
+ validationPassed: false,
13321
+ attempts: 0,
13322
+ selfCorrectionCount: 0,
13323
+ apiRetryCount
13324
+ } }
13232
13325
  };
13233
13326
  }
13234
13327
  const useStructuredOutput = selected.capabilities.structuredOutput;
@@ -13286,7 +13379,7 @@ async function extractStructuredData(input) {
13286
13379
  experimental_telemetry: { isEnabled: useTelemetry }
13287
13380
  };
13288
13381
  if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
13289
- result = await withRetry(() => generateText(fileOpts), input.onRetry);
13382
+ result = await withRetry(() => generateText(fileOpts), onApiRetry);
13290
13383
  } else {
13291
13384
  const textOpts = {
13292
13385
  model: provider.chatModel(selected.name),
@@ -13297,7 +13390,7 @@ async function extractStructuredData(input) {
13297
13390
  experimental_telemetry: { isEnabled: useTelemetry }
13298
13391
  };
13299
13392
  if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13300
- result = await withRetry(() => generateText(textOpts), input.onRetry);
13393
+ result = await withRetry(() => generateText(textOpts), onApiRetry);
13301
13394
  }
13302
13395
  if (result.usage) {
13303
13396
  totalPromptTokens += result.usage.inputTokens ?? 0;
@@ -13315,6 +13408,7 @@ async function extractStructuredData(input) {
13315
13408
  if (!parseError && data !== void 0) {
13316
13409
  const validation = validateExtractedData(schema, data);
13317
13410
  if (validation.success) {
13411
+ const missing = calculateMissingFields(schema, data);
13318
13412
  const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
13319
13413
  await fs.mkdir(outputDir, { recursive: true });
13320
13414
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
@@ -13332,7 +13426,15 @@ async function extractStructuredData(input) {
13332
13426
  prompt: totalPromptTokens,
13333
13427
  completion: totalCompletionTokens,
13334
13428
  total: totalPromptTokens + totalCompletionTokens
13335
- }
13429
+ },
13430
+ quality: { ai: {
13431
+ validationPassed: true,
13432
+ attempts: attempt,
13433
+ selfCorrectionCount: attempt - 1,
13434
+ apiRetryCount,
13435
+ missingFields: missing.fields,
13436
+ missingFieldRate: missing.rate
13437
+ } }
13336
13438
  };
13337
13439
  } else validationError = validation.error;
13338
13440
  }
@@ -13365,12 +13467,26 @@ Please output the corrected JSON object now:`;
13365
13467
  }
13366
13468
  return {
13367
13469
  success: false,
13368
- error: lastError || "Extraction failed after self-reflection retries"
13470
+ error: lastError || "Extraction failed after self-reflection retries",
13471
+ quality: { ai: {
13472
+ validationPassed: false,
13473
+ attempts: maxAttempts,
13474
+ selfCorrectionCount: maxAttempts - 1,
13475
+ apiRetryCount,
13476
+ validationError: lastError
13477
+ } }
13369
13478
  };
13370
13479
  } catch (error) {
13371
13480
  return {
13372
13481
  success: false,
13373
- error: getErrorMessage(error)
13482
+ error: getErrorMessage(error),
13483
+ quality: { ai: {
13484
+ validationPassed: false,
13485
+ attempts: 0,
13486
+ selfCorrectionCount: 0,
13487
+ apiRetryCount,
13488
+ validationError: getErrorMessage(error)
13489
+ } }
13374
13490
  };
13375
13491
  }
13376
13492
  }
@@ -13953,30 +14069,10 @@ const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
13953
14069
  const MAX_UPLOAD_SIZE_TEXT = "30MB";
13954
14070
  const SUPPORTED_FILE_TYPES_TEXT = "images, PDF, text, markdown, CSV, JSON, HTML, XML, YAML";
13955
14071
  const MISSING_UPLOAD_FILE_TEXT = t("errors.file.missingUpload");
13956
- const SUPPORTED_MIME_TYPES = new Set([
13957
- "image/png",
13958
- "image/jpeg",
13959
- "image/gif",
13960
- "image/webp",
13961
- "image/bmp",
13962
- "image/svg+xml",
13963
- "application/pdf",
13964
- "text/plain",
13965
- "text/markdown",
13966
- "text/csv",
13967
- "application/json",
13968
- "text/html",
13969
- "text/xml",
13970
- "application/x-yaml",
13971
- "text/yaml"
13972
- ]);
13973
14072
  const MIME_TO_EXT = {
13974
14073
  "image/png": "png",
13975
14074
  "image/jpeg": "jpg",
13976
- "image/gif": "gif",
13977
14075
  "image/webp": "webp",
13978
- "image/bmp": "bmp",
13979
- "image/svg+xml": "svg",
13980
14076
  "application/pdf": "pdf",
13981
14077
  "text/plain": "txt",
13982
14078
  "text/markdown": "md",
@@ -13993,8 +14089,8 @@ function bytesToMB(bytes) {
13993
14089
  function getExtensionFromMime(mimeType) {
13994
14090
  return MIME_TO_EXT[mimeType];
13995
14091
  }
13996
- function isAllowedMimeType(mimeType) {
13997
- return SUPPORTED_MIME_TYPES.has(mimeType);
14092
+ function getExtensionForDetectedFile(mimeType) {
14093
+ return mimeType ? getExtensionFromMime(mimeType) ?? "txt" : "txt";
13998
14094
  }
13999
14095
  function unsupportedFileTypeMessage(mimeType) {
14000
14096
  return t("errors.file.unsupportedType", {
@@ -14011,14 +14107,16 @@ var FileValidationError = class extends Error {
14011
14107
  this.name = "FileValidationError";
14012
14108
  }
14013
14109
  };
14014
- function validateFileUpload(file) {
14110
+ async function validateFileUploadContent(file, buffer) {
14015
14111
  if (file.size === 0) throw new FileValidationError(t("errors.file.empty"));
14016
14112
  if (file.size > MAX_UPLOAD_SIZE) throw new FileValidationError(t("errors.file.sizeExceeded", {
14017
14113
  size: bytesToMB(file.size).toFixed(1),
14018
14114
  limit: MAX_UPLOAD_SIZE_TEXT,
14019
14115
  file: file.name
14020
14116
  }));
14021
- if (!isAllowedMimeType(file.type)) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
14117
+ const detected = await detectInputBufferKind(buffer);
14118
+ if (detected.kind === "unsupported") throw new FileValidationError(unsupportedFileTypeMessage(detected.mime ?? (file.type || "application/octet-stream")));
14119
+ return detected.mime ?? "text/plain";
14022
14120
  }
14023
14121
 
14024
14122
  //#endregion
@@ -14280,14 +14378,6 @@ function createPdfConverter(config) {
14280
14378
  return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
14281
14379
  }
14282
14380
  if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
14283
- if (config.converter === "markitdown") {
14284
- const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
14285
- return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
14286
- }
14287
- if (config.converter === "marker") {
14288
- const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
14289
- return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
14290
- }
14291
14381
  if (config.converter === "external") {
14292
14382
  if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
14293
14383
  return new ExternalCommandPdfConverter("external", config.external);
@@ -14309,12 +14399,32 @@ const FILE_PART_EXTENSIONS = new Set([
14309
14399
  "png",
14310
14400
  "jpg",
14311
14401
  "jpeg",
14312
- "gif",
14313
- "webp",
14314
- "bmp",
14315
- "svg"
14402
+ "webp"
14316
14403
  ]);
14317
14404
  const PDF_EXT_RE = /\.pdf$/i;
14405
+ async function describeExtractFileInput(filePath, aiConfig, modelOverride) {
14406
+ const detected = await detectInputFileKind(filePath);
14407
+ if (detected.kind === "image") return {
14408
+ kind: "image",
14409
+ mime: detected.mime,
14410
+ handler: shouldUseImageOcrFallback(aiConfig, modelOverride) ? "image_local_ocr" : "image_vision"
14411
+ };
14412
+ if (detected.kind === "pdf") {
14413
+ const converter = createPdfConverter(aiConfig?.pdf);
14414
+ return {
14415
+ kind: "pdf",
14416
+ mime: detected.mime,
14417
+ handler: "pdf_converter",
14418
+ converter: converter.name
14419
+ };
14420
+ }
14421
+ if (detected.kind === "text") return {
14422
+ kind: "text",
14423
+ mime: detected.mime,
14424
+ handler: "text"
14425
+ };
14426
+ throw new Error(unsupportedFileTypeMessage(detected.mime ?? "application/octet-stream"));
14427
+ }
14318
14428
  async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14319
14429
  const stat = fs$1.statSync(filePath);
14320
14430
  if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
@@ -14322,19 +14432,34 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14322
14432
  limit: MAX_UPLOAD_SIZE_TEXT,
14323
14433
  file: filePath
14324
14434
  }));
14325
- const ext = path.extname(filePath).toLowerCase().replace(".", "");
14326
- if (FILE_PART_EXTENSIONS.has(ext)) {
14327
- if (shouldUseImageOcrFallback(aiConfig, modelOverride)) {
14328
- const result = await recognizeImageText(filePath, aiConfig?.image);
14435
+ const inputProcessing = await describeExtractFileInput(filePath, aiConfig, modelOverride);
14436
+ if (inputProcessing.kind === "image") {
14437
+ if (inputProcessing.handler === "image_local_ocr") {
14438
+ const result = await recognizeImageText(filePath);
14329
14439
  consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14330
- return { text: result.text };
14440
+ return {
14441
+ text: result.text,
14442
+ inputProcessing,
14443
+ quality: { input: {
14444
+ kind: "image",
14445
+ textLength: result.text.length,
14446
+ emptyText: result.text.trim().length === 0,
14447
+ ocr: {
14448
+ confidence: result.confidence,
14449
+ textLength: result.text.length,
14450
+ platform: process.platform
14451
+ }
14452
+ } }
14453
+ };
14331
14454
  }
14332
14455
  return {
14333
14456
  text: "",
14334
- filePath
14457
+ filePath,
14458
+ inputProcessing,
14459
+ quality: { input: { kind: "image" } }
14335
14460
  };
14336
14461
  }
14337
- if (ext === "pdf") {
14462
+ if (inputProcessing.kind === "pdf") {
14338
14463
  const buffer = await fs.readFile(filePath);
14339
14464
  const converter = createPdfConverter(aiConfig?.pdf);
14340
14465
  const result = await converter.convert(buffer, filePath);
@@ -14352,9 +14477,37 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14352
14477
  await fs.writeFile(fallbackMd, result.text);
14353
14478
  consola.info(t("command.extract.file.markdownSaved", { path: fallbackMd }));
14354
14479
  }
14355
- return { text: result.text };
14480
+ const textLength = result.text.length;
14481
+ return {
14482
+ text: result.text,
14483
+ inputProcessing,
14484
+ quality: { input: {
14485
+ kind: "pdf",
14486
+ textLength,
14487
+ emptyText: result.text.trim().length === 0,
14488
+ pdf: {
14489
+ pageCount: result.pageCount,
14490
+ textLength,
14491
+ emptyText: result.text.trim().length === 0,
14492
+ fallbackUsed: result.metadata?.fallback === "true",
14493
+ converter: result.metadata?.converter ?? converter.name
14494
+ }
14495
+ } }
14496
+ };
14356
14497
  }
14357
- return { text: await fs.readFile(filePath, "utf-8") };
14498
+ if (inputProcessing.kind === "text") {
14499
+ const text$1 = await fs.readFile(filePath, "utf-8");
14500
+ return {
14501
+ text: text$1,
14502
+ inputProcessing,
14503
+ quality: { input: {
14504
+ kind: "text",
14505
+ textLength: text$1.length,
14506
+ emptyText: text$1.trim().length === 0
14507
+ } }
14508
+ };
14509
+ }
14510
+ throw new Error(unsupportedFileTypeMessage(inputProcessing.mime ?? "application/octet-stream"));
14358
14511
  }
14359
14512
 
14360
14513
  //#endregion
@@ -14548,7 +14701,9 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14548
14701
  }
14549
14702
  return {
14550
14703
  success: false,
14551
- error: result.error || t("common.unknownError")
14704
+ error: result.error || t("common.unknownError"),
14705
+ quality: result.quality,
14706
+ failureStage: "ai_extraction"
14552
14707
  };
14553
14708
  }
14554
14709
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
@@ -14567,7 +14722,9 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14567
14722
  consola.error(dbError);
14568
14723
  return {
14569
14724
  success: false,
14570
- error: dbError
14725
+ error: dbError,
14726
+ quality: result.quality,
14727
+ failureStage: "db_insert"
14571
14728
  };
14572
14729
  }
14573
14730
  try {
@@ -14581,14 +14738,17 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14581
14738
  outputPath: result.outputPath,
14582
14739
  data: result.data,
14583
14740
  tablesInserted: insertResult.tablesInserted,
14584
- tokensUsed: result.tokensUsed
14741
+ tokensUsed: result.tokensUsed,
14742
+ quality: result.quality
14585
14743
  };
14586
14744
  } else {
14587
14745
  if (!options?.quiet) s2.stop(t("command.extract.file.dbInsertFail"));
14588
14746
  consola.error(insertResult.error || t("common.unknownError"));
14589
14747
  return {
14590
14748
  success: false,
14591
- error: insertResult.error
14749
+ error: insertResult.error,
14750
+ quality: result.quality,
14751
+ failureStage: "db_insert"
14592
14752
  };
14593
14753
  }
14594
14754
  } finally {
@@ -14599,7 +14759,9 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14599
14759
  consola.error(e instanceof Error ? e.message : String(e));
14600
14760
  return {
14601
14761
  success: false,
14602
- error: String(e)
14762
+ error: String(e),
14763
+ quality: result.quality,
14764
+ failureStage: "db_insert"
14603
14765
  };
14604
14766
  }
14605
14767
  }
@@ -14607,9 +14769,29 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14607
14769
  success: true,
14608
14770
  outputPath: result.outputPath,
14609
14771
  data: result.data,
14610
- tokensUsed: result.tokensUsed
14772
+ tokensUsed: result.tokensUsed,
14773
+ quality: result.quality
14611
14774
  };
14612
14775
  }
14776
+ function formatInputProcessing$1(input) {
14777
+ const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
14778
+ return `${input.mime ?? input.kind} -> ${handler}`;
14779
+ }
14780
+ function mergeQuality(inputQuality, aiQuality) {
14781
+ if (!inputQuality && !aiQuality) return void 0;
14782
+ return {
14783
+ input: inputQuality?.input,
14784
+ ai: aiQuality?.ai
14785
+ };
14786
+ }
14787
+ function classifyInputError(error, inputProcessing) {
14788
+ if (inputProcessing?.handler === "pdf_converter") return "file_conversion";
14789
+ if (inputProcessing?.handler === "image_local_ocr") return "ocr";
14790
+ const message = (error instanceof Error ? error.message : String(error)).toLowerCase();
14791
+ if (message.includes("ocr")) return "ocr";
14792
+ if (message.includes("pdf") || message.includes("converter")) return "file_conversion";
14793
+ return "input_detection";
14794
+ }
14613
14795
  async function runAuditedExtraction(options) {
14614
14796
  const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
14615
14797
  let fileHash;
@@ -14650,7 +14832,10 @@ async function runAuditedExtraction(options) {
14650
14832
  outputName: existing.outputName,
14651
14833
  tablesInserted: existing.tablesInserted,
14652
14834
  notionPages: existing.notionPages,
14653
- tokensUsed: existing.tokensUsed
14835
+ tokensUsed: existing.tokensUsed,
14836
+ inputProcessing: existing.inputProcessing,
14837
+ quality: existing.quality,
14838
+ failureStage: existing.failureStage
14654
14839
  };
14655
14840
  }
14656
14841
  }
@@ -14669,6 +14854,8 @@ async function runAuditedExtraction(options) {
14669
14854
  },
14670
14855
  retryOf
14671
14856
  });
14857
+ let inputProcessing;
14858
+ let inputQuality;
14672
14859
  try {
14673
14860
  let text$1 = "";
14674
14861
  let filePath;
@@ -14676,6 +14863,13 @@ async function runAuditedExtraction(options) {
14676
14863
  const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
14677
14864
  text$1 = input.text;
14678
14865
  filePath = input.filePath;
14866
+ inputProcessing = input.inputProcessing;
14867
+ inputQuality = input.quality;
14868
+ if (!quiet) consola.info(`Input: ${formatInputProcessing$1(inputProcessing)}`);
14869
+ await updateExtractionAuditRecord(aiexDir, audit.id, {
14870
+ inputProcessing,
14871
+ quality: inputQuality
14872
+ });
14679
14873
  } else text$1 = source.text;
14680
14874
  const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
14681
14875
  quiet,
@@ -14693,6 +14887,8 @@ async function runAuditedExtraction(options) {
14693
14887
  outputName: r.outputPath ? path.basename(r.outputPath) : void 0,
14694
14888
  tablesInserted: r.tablesInserted,
14695
14889
  tokensUsed: r.tokensUsed,
14890
+ quality: mergeQuality(inputQuality, r.quality),
14891
+ failureStage: "integration",
14696
14892
  error: error instanceof Error ? error.message : String(error)
14697
14893
  });
14698
14894
  if (!quiet) consola.error(t("command.extract.file.notionSyncFail", { error: error instanceof Error ? error.message : String(error) }));
@@ -14701,7 +14897,10 @@ async function runAuditedExtraction(options) {
14701
14897
  success: false,
14702
14898
  error: error instanceof Error ? error.message : String(error),
14703
14899
  auditId: audit.id,
14704
- fileHash
14900
+ fileHash,
14901
+ inputProcessing,
14902
+ quality: mergeQuality(inputQuality, r.quality),
14903
+ failureStage: "integration"
14705
14904
  };
14706
14905
  }
14707
14906
  const updated = await updateExtractionAuditRecord(aiexDir, audit.id, {
@@ -14710,7 +14909,8 @@ async function runAuditedExtraction(options) {
14710
14909
  outputName: r.outputPath ? path.basename(r.outputPath) : void 0,
14711
14910
  tablesInserted: r.tablesInserted,
14712
14911
  notionPages,
14713
- tokensUsed: r.tokensUsed
14912
+ tokensUsed: r.tokensUsed,
14913
+ quality: mergeQuality(inputQuality, r.quality)
14714
14914
  });
14715
14915
  await triggerWebhook(aiConfig, audit.id, schemaName, "extraction.success", source, r.data, void 0, r.tokensUsed, quiet);
14716
14916
  return {
@@ -14721,12 +14921,17 @@ async function runAuditedExtraction(options) {
14721
14921
  notionPages: updated.notionPages,
14722
14922
  tokensUsed: updated.tokensUsed,
14723
14923
  auditId: updated.id,
14724
- fileHash
14924
+ fileHash,
14925
+ inputProcessing: updated.inputProcessing,
14926
+ quality: updated.quality,
14927
+ failureStage: updated.failureStage
14725
14928
  };
14726
14929
  } else {
14727
14930
  await updateExtractionAuditRecord(aiexDir, audit.id, {
14728
14931
  status: "failed",
14729
- error: r.error || "Extraction failed"
14932
+ error: r.error || "Extraction failed",
14933
+ quality: mergeQuality(inputQuality, r.quality),
14934
+ failureStage: r.failureStage ?? "ai_extraction"
14730
14935
  });
14731
14936
  if (!quiet) consola.error(t("command.extract.file.extractionFailed", { error: r.error }));
14732
14937
  await triggerWebhook(aiConfig, audit.id, schemaName, "extraction.failed", source, void 0, r.error || "Extraction failed", void 0, quiet);
@@ -14734,13 +14939,19 @@ async function runAuditedExtraction(options) {
14734
14939
  success: false,
14735
14940
  error: r.error,
14736
14941
  auditId: audit.id,
14737
- fileHash
14942
+ fileHash,
14943
+ inputProcessing,
14944
+ quality: mergeQuality(inputQuality, r.quality),
14945
+ failureStage: r.failureStage ?? "ai_extraction"
14738
14946
  };
14739
14947
  }
14740
14948
  } catch (e) {
14949
+ const failureStage = classifyInputError(e, inputProcessing);
14741
14950
  await updateExtractionAuditRecord(aiexDir, audit.id, {
14742
14951
  status: "failed",
14743
- error: e instanceof Error ? e.message : String(e)
14952
+ error: e instanceof Error ? e.message : String(e),
14953
+ quality: inputQuality,
14954
+ failureStage
14744
14955
  });
14745
14956
  if (!quiet) {
14746
14957
  const name$1 = source.type === "file" ? path.basename(source.filePath) : "text input";
@@ -14754,7 +14965,10 @@ async function runAuditedExtraction(options) {
14754
14965
  success: false,
14755
14966
  error: e instanceof Error ? e.message : String(e),
14756
14967
  auditId: audit.id,
14757
- fileHash
14968
+ fileHash,
14969
+ inputProcessing,
14970
+ quality: inputQuality,
14971
+ failureStage
14758
14972
  };
14759
14973
  }
14760
14974
  }
@@ -14938,6 +15152,18 @@ function isExtractSubCommand(rawArgs) {
14938
15152
  function formatSource(source) {
14939
15153
  return source.type === "file" ? source.fileName || "file" : "unknown";
14940
15154
  }
15155
+ function formatInputProcessing(input) {
15156
+ if (!input) return "";
15157
+ const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
15158
+ return ` [${input.mime ?? input.kind} -> ${handler}]`;
15159
+ }
15160
+ function formatQuality(quality, failureStage) {
15161
+ if (failureStage) return ` [failed:${failureStage}]`;
15162
+ if (quality?.input?.pdf) return ` [pdf:${quality.input.pdf.pageCount}p/${quality.input.pdf.textLength}chars${quality.input.pdf.fallbackUsed ? "/fallback" : ""}]`;
15163
+ if (quality?.input?.ocr) return ` [ocr:${Math.round(quality.input.ocr.confidence * 100)}%/${quality.input.ocr.textLength}chars]`;
15164
+ if (quality?.ai?.missingFieldRate !== void 0) return ` [missing:${Math.round(quality.ai.missingFieldRate * 100)}%]`;
15165
+ return "";
15166
+ }
14941
15167
  async function loadConfiguredAI(aiexDir) {
14942
15168
  const aiConfig = await readAIConfig(aiexDir);
14943
15169
  if (!aiConfig) {
@@ -14980,7 +15206,7 @@ const historyCommand = defineCommand({
14980
15206
  }
14981
15207
  for (const record of records) {
14982
15208
  const suffix = record.error ? ` — ${record.error}` : record.outputName ? ` — ${record.outputName}` : "";
14983
- consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${suffix}`);
15209
+ consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${formatInputProcessing(record.inputProcessing)}${formatQuality(record.quality, record.failureStage)}${suffix}`);
14984
15210
  }
14985
15211
  }
14986
15212
  });
@@ -15493,10 +15719,7 @@ const SUPPORTED_EXTENSIONS = new Set([
15493
15719
  "png",
15494
15720
  "jpg",
15495
15721
  "jpeg",
15496
- "gif",
15497
15722
  "webp",
15498
- "bmp",
15499
- "svg",
15500
15723
  "pdf",
15501
15724
  "txt",
15502
15725
  "md",
@@ -15909,7 +16132,10 @@ async function listExtractions(config) {
15909
16132
  modifiedAt: stat.mtime.toISOString(),
15910
16133
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
15911
16134
  notionPages,
15912
- notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
16135
+ notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0,
16136
+ inputProcessing: audit?.inputProcessing,
16137
+ quality: audit?.quality,
16138
+ failureStage: audit?.failureStage
15913
16139
  });
15914
16140
  } catch {
15915
16141
  continue;
@@ -16180,10 +16406,9 @@ function getFormFile(value) {
16180
16406
  function safeUploadName(name$1) {
16181
16407
  return path.basename(name$1).replace(/[^\w.-]/g, "_") || "upload.txt";
16182
16408
  }
16183
- function safeUploadNameForMime(file) {
16409
+ function safeUploadNameForMime(file, mimeType) {
16184
16410
  const safeName = safeUploadName(file.name);
16185
- const ext = getExtensionFromMime(file.type);
16186
- if (!ext) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
16411
+ const ext = getExtensionForDetectedFile(mimeType);
16187
16412
  return `${path.parse(safeName).name || "upload"}.${ext}`;
16188
16413
  }
16189
16414
  function jsonResponse(body, status) {
@@ -16193,10 +16418,10 @@ function jsonResponse(body, status) {
16193
16418
  });
16194
16419
  }
16195
16420
  async function saveUploadToFile(file, uploadsDir, id) {
16196
- validateFileUpload(file);
16197
- await fs.mkdir(uploadsDir, { recursive: true });
16198
- const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file)}`);
16199
16421
  const buffer = Buffer.from(await file.arrayBuffer());
16422
+ const mimeType = await validateFileUploadContent(file, buffer);
16423
+ await fs.mkdir(uploadsDir, { recursive: true });
16424
+ const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file, mimeType)}`);
16200
16425
  await fs.writeFile(filePath, buffer);
16201
16426
  return filePath;
16202
16427
  }
@@ -16277,7 +16502,10 @@ function extractRoutes(config) {
16277
16502
  if (!result.success) return jsonResponse({
16278
16503
  success: false,
16279
16504
  error: result.error,
16280
- auditId: result.auditId
16505
+ auditId: result.auditId,
16506
+ inputProcessing: result.inputProcessing,
16507
+ quality: result.quality,
16508
+ failureStage: result.failureStage
16281
16509
  }, 500);
16282
16510
  return jsonResponse({
16283
16511
  success: true,
@@ -16286,7 +16514,10 @@ function extractRoutes(config) {
16286
16514
  tablesInserted: result.tablesInserted,
16287
16515
  notionPages: result.notionPages,
16288
16516
  tokensUsed: result.tokensUsed,
16289
- auditId: result.auditId
16517
+ auditId: result.auditId,
16518
+ inputProcessing: result.inputProcessing,
16519
+ quality: result.quality,
16520
+ failureStage: result.failureStage
16290
16521
  }, 200);
16291
16522
  } catch (error) {
16292
16523
  if (isMissingUploadFileError(error)) return c.json({
@@ -16344,7 +16575,10 @@ function extractRoutes(config) {
16344
16575
  if (!result.success) return jsonResponse({
16345
16576
  success: false,
16346
16577
  error: result.error,
16347
- auditId: result.auditId
16578
+ auditId: result.auditId,
16579
+ inputProcessing: result.inputProcessing,
16580
+ quality: result.quality,
16581
+ failureStage: result.failureStage
16348
16582
  }, 500);
16349
16583
  return jsonResponse({
16350
16584
  success: true,
@@ -16353,7 +16587,10 @@ function extractRoutes(config) {
16353
16587
  tablesInserted: result.tablesInserted,
16354
16588
  notionPages: result.notionPages,
16355
16589
  tokensUsed: result.tokensUsed,
16356
- auditId: result.auditId
16590
+ auditId: result.auditId,
16591
+ inputProcessing: result.inputProcessing,
16592
+ quality: result.quality,
16593
+ failureStage: result.failureStage
16357
16594
  }, 200);
16358
16595
  });
16359
16596
  app.delete("/extract/records/:id", async (c) => {