aiex-cli 0.0.5-beta.6 → 0.0.6-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
1
+ import { C as description, E as version, O as doctorDiagnosticsTableRows, S as seedConfig, T as package_default, _ as DEFAULT_PROMPT_CONFIG, a as parseJsonSchema, b as AIConfigSchema, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MINERU_CONFIG, h as DEFAULT_MINERU_API_CONFIG, i as JsonSchemaDefinitionSchema, k as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as PLACEHOLDER_SCHEMA, w as name, x as createConfig, y as PLACEHOLDER_TEXT } from "./doctor-collector-CGo5dgHm.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -17,14 +17,15 @@ import Database from "better-sqlite3";
17
17
  import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
- import { getEncoding } from "js-tiktoken";
21
20
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
22
21
  import { APICallError, Output, generateText, jsonSchema } from "ai";
23
22
  import pRetry from "p-retry";
23
+ import mime from "mime";
24
+ import { TextDecoder, promisify } from "node:util";
25
+ import { fileTypeFromBuffer, fileTypeFromFile } from "file-type";
24
26
  import { jsonrepair } from "jsonrepair";
25
27
  import { LangfuseSpanProcessor } from "@langfuse/otel";
26
28
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
27
- import { marked } from "marked";
28
29
  import crypto from "node:crypto";
29
30
  import { Client, extractNotionId } from "@notionhq/client";
30
31
  import { execa } from "execa";
@@ -32,7 +33,6 @@ import { glob, globSync } from "tinyglobby";
32
33
  import { extractText, getDocumentProxy, getMeta } from "unpdf";
33
34
  import AdmZip from "adm-zip";
34
35
  import { execFile } from "node:child_process";
35
- import { promisify } from "node:util";
36
36
  import * as chokidar from "chokidar";
37
37
  import { serve } from "@hono/node-server";
38
38
  import open from "open";
@@ -12860,6 +12860,80 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
12860
12860
  });
12861
12861
  }
12862
12862
 
12863
+ //#endregion
12864
+ //#region src/core/input-file-kind.ts
12865
+ const UTF8_DECODER = new TextDecoder("utf-8", { fatal: true });
12866
+ const SVG_START_RE = /^\s*<svg[\s>]/i;
12867
+ const SVG_ANY_RE = /<svg[\s>]/i;
12868
+ function isSupportedImageMime(mime$1) {
12869
+ return !!mime$1 && [
12870
+ "image/png",
12871
+ "image/jpeg",
12872
+ "image/webp"
12873
+ ].includes(mime$1);
12874
+ }
12875
+ function detectTextKind(buffer) {
12876
+ try {
12877
+ const text$1 = UTF8_DECODER.decode(buffer);
12878
+ if (SVG_START_RE.test(text$1) || SVG_ANY_RE.test(text$1.slice(0, 4096))) return {
12879
+ kind: "unsupported",
12880
+ mime: "image/svg+xml"
12881
+ };
12882
+ return {
12883
+ kind: "text",
12884
+ mime: "text/plain"
12885
+ };
12886
+ } catch {
12887
+ return { kind: "unsupported" };
12888
+ }
12889
+ }
12890
+ async function detectInputFileKind(filePath) {
12891
+ const detected = await fileTypeFromFile(filePath);
12892
+ if (detected?.mime === "application/pdf") return {
12893
+ kind: "pdf",
12894
+ mime: detected.mime
12895
+ };
12896
+ if (isSupportedImageMime(detected?.mime)) return {
12897
+ kind: "image",
12898
+ mime: detected?.mime
12899
+ };
12900
+ return detectTextKind(await fs.readFile(filePath));
12901
+ }
12902
+ async function detectInputBufferKind(buffer) {
12903
+ const detected = await fileTypeFromBuffer(buffer);
12904
+ if (detected?.mime === "application/pdf") return {
12905
+ kind: "pdf",
12906
+ mime: detected.mime
12907
+ };
12908
+ if (isSupportedImageMime(detected?.mime)) return {
12909
+ kind: "image",
12910
+ mime: detected?.mime
12911
+ };
12912
+ return detectTextKind(buffer);
12913
+ }
12914
+
12915
+ //#endregion
12916
+ //#region src/core/ai-extraction/file-utils.ts
12917
+ async function detectMimeType(filePath) {
12918
+ return (await detectInputFileKind(filePath)).mime ?? mime.getType(filePath) ?? "application/octet-stream";
12919
+ }
12920
+ async function readFilePart(filePath) {
12921
+ const mimeStr = await detectMimeType(filePath);
12922
+ const buffer = await fs.readFile(filePath);
12923
+ const name$1 = path.basename(filePath);
12924
+ if (mimeStr.startsWith("image/")) return {
12925
+ type: "image",
12926
+ image: buffer,
12927
+ mimeType: mimeStr
12928
+ };
12929
+ return {
12930
+ type: "file",
12931
+ data: buffer,
12932
+ mediaType: mimeStr,
12933
+ filename: name$1
12934
+ };
12935
+ }
12936
+
12863
12937
  //#endregion
12864
12938
  //#region src/core/ai-extraction/json-utils.ts
12865
12939
  function parseJsonLike(text$1) {
@@ -12920,10 +12994,25 @@ function filterCompatible(models, inputTokens, outputTokens) {
12920
12994
  });
12921
12995
  }
12922
12996
  function selectModel(input) {
12923
- const { models, inputTokens, outputTokens } = input;
12997
+ const { models, isImage, fileName, inputTokens, outputTokens } = input;
12924
12998
  if (models.length === 0) throw new Error(t("errors.ai.noModels"));
12925
12999
  let candidates = filterCompatible(models, inputTokens, outputTokens);
12926
13000
  if (candidates.length === 0) candidates = models;
13001
+ if (isImage) {
13002
+ const visionModel = candidates.find((m) => m.capabilities.vision);
13003
+ if (!visionModel) {
13004
+ const hint = fileName ? ` (${fileName})` : "";
13005
+ const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
13006
+ tokens: inputTokens,
13007
+ hint
13008
+ }) : t("errors.ai.noVisionModel", { hint });
13009
+ throw new Error(msg + t("errors.ai.addSuitableModel"));
13010
+ }
13011
+ return {
13012
+ name: visionModel.name,
13013
+ capabilities: visionModel.capabilities
13014
+ };
13015
+ }
12927
13016
  const soModel = candidates.find((m) => m.capabilities.structuredOutput);
12928
13017
  if (soModel) return {
12929
13018
  name: soModel.name,
@@ -12937,46 +13026,36 @@ function selectModel(input) {
12937
13026
 
12938
13027
  //#endregion
12939
13028
  //#region src/core/ai-extraction/prompt-generator.ts
12940
- const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
12941
- const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
12942
- function splitIdentifier(name$1) {
12943
- return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
12944
- }
12945
- function propertyToDescription(name$1, prop, indent = "", required = false) {
13029
+ function propertyToDescription(name$1, prop, indent = "") {
12946
13030
  const lines = [];
12947
13031
  let typeStr = prop.type;
12948
13032
  if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
12949
- lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
12950
- const terms = splitIdentifier(name$1);
12951
- if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
12952
- if (prop.description) lines.push(`${indent} description: ${prop.description}`);
13033
+ lines.push(`${indent}- ${name$1}: ${typeStr}`);
12953
13034
  if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
12954
- if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
12955
13035
  if (prop.format) lines.push(`${indent} format: ${prop.format}`);
12956
13036
  if (prop.unique) lines.push(`${indent} unique: true`);
12957
13037
  if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
12958
13038
  return lines.join("\n");
12959
13039
  }
12960
- function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
13040
+ function nestedPropertyToDescription(name$1, prop, indent = "") {
12961
13041
  const lines = [];
12962
- const isRequired = requiredFields.includes(name$1);
12963
13042
  if (prop.nested?.enabled && prop.type === "object") {
12964
13043
  const relation = prop.nested.relation || "has-one";
12965
- lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12966
- if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
13044
+ lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
13045
+ if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12967
13046
  return lines.join("\n");
12968
13047
  }
12969
13048
  if (prop.type === "array" && prop.items?.nested?.enabled) {
12970
13049
  const relation = prop.items.nested.relation || "has-many";
12971
- lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12972
- if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
13050
+ lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
13051
+ if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12973
13052
  return lines.join("\n");
12974
13053
  }
12975
- lines.push(propertyToDescription(name$1, prop, indent, isRequired));
12976
- if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
13054
+ lines.push(propertyToDescription(name$1, prop, indent));
13055
+ if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12977
13056
  if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
12978
13057
  lines.push(`${indent} item fields:`);
12979
- for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
13058
+ for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12980
13059
  }
12981
13060
  return lines.join("\n");
12982
13061
  }
@@ -12988,7 +13067,7 @@ function schemaToDescription(schema) {
12988
13067
  lines.push("Fields:");
12989
13068
  for (const [name$1, prop] of Object.entries(schema.properties)) {
12990
13069
  const property = prop;
12991
- lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
13070
+ lines.push(nestedPropertyToDescription(name$1, property));
12992
13071
  }
12993
13072
  if (schema.examples && schema.examples.length > 0) {
12994
13073
  lines.push("");
@@ -13033,6 +13112,33 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
13033
13112
  ].join("\n");
13034
13113
  }
13035
13114
 
13115
+ //#endregion
13116
+ //#region src/core/ai-extraction/snapshot.ts
13117
+ const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
13118
+ const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
13119
+ async function loadPromptSnapshot(aiexDir, tableName) {
13120
+ const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
13121
+ try {
13122
+ const content = await fs.readFile(snapshotPath, "utf-8");
13123
+ const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
13124
+ const userMatch = content.match(USER_PROMPT_REGEX);
13125
+ if (systemMatch && userMatch) return {
13126
+ system: systemMatch[1].trim(),
13127
+ user: userMatch[1].trim()
13128
+ };
13129
+ } catch {}
13130
+ return null;
13131
+ }
13132
+ async function savePromptSnapshot(schema, aiexDir) {
13133
+ const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13134
+ const outputDir = path.join(aiexDir, "extracted");
13135
+ await fs.mkdir(outputDir, { recursive: true });
13136
+ const fileName = `${schema.table.name}.prompt.md`;
13137
+ const outputPath = path.join(outputDir, fileName);
13138
+ await fs.writeFile(outputPath, content);
13139
+ return outputPath;
13140
+ }
13141
+
13036
13142
  //#endregion
13037
13143
  //#region src/core/ai-extraction/telemetry.ts
13038
13144
  let langfuseInitialized = false;
@@ -13075,7 +13181,7 @@ function propertyToExtractionSchema(property) {
13075
13181
  }
13076
13182
  return { type: nullableType(property.type) };
13077
13183
  }
13078
- function isRecord$2(value) {
13184
+ function isRecord(value) {
13079
13185
  return typeof value === "object" && value !== null && !Array.isArray(value);
13080
13186
  }
13081
13187
  function schemaToExtractionOutputSchema(schema) {
@@ -13113,7 +13219,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13113
13219
  }
13114
13220
  return;
13115
13221
  case "object":
13116
- if (!isRecord$2(value)) {
13222
+ if (!isRecord(value)) {
13117
13223
  issues.push(`${path$1}: expected object or null`);
13118
13224
  return;
13119
13225
  }
@@ -13136,7 +13242,7 @@ function validateProperties(basePath, properties, data, issues) {
13136
13242
  }
13137
13243
  }
13138
13244
  function validateExtractedData(schema, data) {
13139
- if (!isRecord$2(data)) return {
13245
+ if (!isRecord(data)) return {
13140
13246
  success: false,
13141
13247
  error: "Extracted data must be a JSON object."
13142
13248
  };
@@ -13153,11 +13259,13 @@ function validateExtractedData(schema, data) {
13153
13259
  //#region src/core/ai-extraction/extractor.ts
13154
13260
  const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
13155
13261
  async function extractStructuredData(input) {
13156
- const { config, schema, text: text$1, modelOverride } = input;
13262
+ const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
13157
13263
  if (!config.provider.apiKey) return {
13158
13264
  success: false,
13159
13265
  error: t("errors.ai.apiKeyMissing")
13160
13266
  };
13267
+ const useFileContent = !!file;
13268
+ const isImageFile = (useFileContent ? await detectMimeType(file) : "").startsWith("image/");
13161
13269
  const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
13162
13270
  const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13163
13271
  const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
@@ -13165,6 +13273,8 @@ async function extractStructuredData(input) {
13165
13273
  try {
13166
13274
  selected = modelOverride ?? selectModel({
13167
13275
  models: config.provider.models,
13276
+ isImage: isImageFile,
13277
+ fileName: file,
13168
13278
  inputTokens,
13169
13279
  outputTokens
13170
13280
  });
@@ -13184,7 +13294,18 @@ async function extractStructuredData(input) {
13184
13294
  apiKey: config.provider.apiKey,
13185
13295
  supportsStructuredOutputs: useStructuredOutput
13186
13296
  });
13187
- const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13297
+ let system;
13298
+ let user;
13299
+ const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
13300
+ const promptText = file ? PLACEHOLDER_TEXT : text$1;
13301
+ if (snapshot) {
13302
+ system = snapshot.system;
13303
+ user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
13304
+ } else {
13305
+ const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13306
+ system = generated.system;
13307
+ user = generated.user;
13308
+ }
13188
13309
  const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
13189
13310
  const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
13190
13311
  let systemPrompt = system;
@@ -13199,16 +13320,38 @@ async function extractStructuredData(input) {
13199
13320
  let parseError;
13200
13321
  let validationError;
13201
13322
  try {
13202
- const textOpts = {
13203
- model: provider.chatModel(selected.name),
13204
- system: systemPrompt,
13205
- prompt: userPrompt,
13206
- abortSignal: AbortSignal.timeout(timeoutMs),
13207
- maxRetries: 0,
13208
- experimental_telemetry: { isEnabled: useTelemetry }
13209
- };
13210
- if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13211
- result = await withRetry(() => generateText(textOpts), input.onRetry);
13323
+ if (useFileContent) {
13324
+ const filePart = await readFilePart(file);
13325
+ const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
13326
+ const contentParts = [{
13327
+ type: "text",
13328
+ text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
13329
+ }, filePart];
13330
+ const fileOpts = {
13331
+ model: provider.chatModel(selected.name),
13332
+ system: systemPrompt,
13333
+ messages: [{
13334
+ role: "user",
13335
+ content: contentParts
13336
+ }],
13337
+ abortSignal: AbortSignal.timeout(timeoutMs),
13338
+ maxRetries: 0,
13339
+ experimental_telemetry: { isEnabled: useTelemetry }
13340
+ };
13341
+ if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
13342
+ result = await withRetry(() => generateText(fileOpts), input.onRetry);
13343
+ } else {
13344
+ const textOpts = {
13345
+ model: provider.chatModel(selected.name),
13346
+ system: systemPrompt,
13347
+ prompt: userPrompt,
13348
+ abortSignal: AbortSignal.timeout(timeoutMs),
13349
+ maxRetries: 0,
13350
+ experimental_telemetry: { isEnabled: useTelemetry }
13351
+ };
13352
+ if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13353
+ result = await withRetry(() => generateText(textOpts), input.onRetry);
13354
+ }
13212
13355
  if (result.usage) {
13213
13356
  totalPromptTokens += result.usage.inputTokens ?? 0;
13214
13357
  totalCompletionTokens += result.usage.outputTokens ?? 0;
@@ -13224,16 +13367,27 @@ async function extractStructuredData(input) {
13224
13367
  }
13225
13368
  if (!parseError && data !== void 0) {
13226
13369
  const validation = validateExtractedData(schema, data);
13227
- if (validation.success) return {
13228
- success: true,
13229
- data,
13230
- tokensUsed: {
13231
- prompt: totalPromptTokens,
13232
- completion: totalCompletionTokens,
13233
- total: totalPromptTokens + totalCompletionTokens
13234
- }
13235
- };
13236
- else validationError = validation.error;
13370
+ if (validation.success) {
13371
+ const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
13372
+ await fs.mkdir(outputDir, { recursive: true });
13373
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
13374
+ const outputFileName = `${schema.table.name}-${timestamp}.json`;
13375
+ const outputPath = path.join(outputDir, outputFileName);
13376
+ await writeFile(outputPath, data, {
13377
+ spaces: 2,
13378
+ EOL: "\n"
13379
+ });
13380
+ return {
13381
+ success: true,
13382
+ outputPath,
13383
+ data,
13384
+ tokensUsed: {
13385
+ prompt: totalPromptTokens,
13386
+ completion: totalCompletionTokens,
13387
+ total: totalPromptTokens + totalCompletionTokens
13388
+ }
13389
+ };
13390
+ } else validationError = validation.error;
13237
13391
  }
13238
13392
  const errorMsg = parseError || validationError || "Unknown validation error";
13239
13393
  lastError = errorMsg;
@@ -13244,14 +13398,11 @@ async function extractStructuredData(input) {
13244
13398
  CRITICAL RULES:
13245
13399
  1. Only correct the fields that failed validation.
13246
13400
  2. Preserve all other correctly extracted fields and their values exactly.
13247
- 3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
13248
- 4. Remove any fields not defined by the JSON Schema.
13249
- 5. Normalize values to the expected JSON type without changing the intended meaning.
13250
- 6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13401
+ 3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13251
13402
  userPrompt = `The JSON data you generated previously failed validation. Please correct it.
13252
13403
 
13253
13404
  [Original Text]
13254
- ${text$1 || "Original text is empty."}
13405
+ ${text$1 || "Data is contained in the attached file."}
13255
13406
 
13256
13407
  [JSON Schema Definition]
13257
13408
  ${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
@@ -13262,11 +13413,6 @@ ${invalidJson}
13262
13413
  [Validation Error Details]
13263
13414
  ${errorMsg}
13264
13415
 
13265
- Correction checklist:
13266
- - Fix each field path mentioned in the validation error.
13267
- - Keep schema-valid fields unchanged.
13268
- - Do not invent missing facts; use null when the original text does not support a value.
13269
-
13270
13416
  Please output the corrected JSON object now:`;
13271
13417
  }
13272
13418
  }
@@ -13419,343 +13565,6 @@ function insertExtractedData(db, schema, data) {
13419
13565
  }
13420
13566
  }
13421
13567
 
13422
- //#endregion
13423
- //#region src/core/ai-extraction/json-merger.ts
13424
- function isRecord$1(value) {
13425
- return typeof value === "object" && value !== null && !Array.isArray(value);
13426
- }
13427
- function stableKey(value) {
13428
- if (!isRecord$1(value)) return JSON.stringify(value);
13429
- return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
13430
- acc[key] = value[key];
13431
- return acc;
13432
- }, {}));
13433
- }
13434
- function isBlankString(value) {
13435
- return typeof value === "string" && value.trim() === "";
13436
- }
13437
- function isPlaceholderString$1(value) {
13438
- if (typeof value !== "string") return false;
13439
- const normalized = value.trim().toLowerCase();
13440
- return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
13441
- }
13442
- function pickPrimitiveValue(values) {
13443
- const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
13444
- if (meaningful.length === 0) return null;
13445
- if (typeof meaningful[0] === "boolean") {
13446
- const trueCount = meaningful.filter(Boolean).length;
13447
- return trueCount >= meaningful.length - trueCount;
13448
- }
13449
- return meaningful[0];
13450
- }
13451
- function mergePropertyValue(property, values) {
13452
- const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13453
- if (nonNullValues.length === 0) return null;
13454
- if (property.type === "array") {
13455
- const concatenated = [];
13456
- const seen = /* @__PURE__ */ new Set();
13457
- for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
13458
- const key = stableKey(item);
13459
- if (!seen.has(key)) {
13460
- seen.add(key);
13461
- concatenated.push(item);
13462
- }
13463
- }
13464
- return concatenated;
13465
- }
13466
- if (property.type === "object") {
13467
- const childProperties = property.properties;
13468
- if (!childProperties) {
13469
- const mergedObj$1 = {};
13470
- for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
13471
- return mergedObj$1;
13472
- }
13473
- const mergedObj = {};
13474
- for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
13475
- return mergedObj;
13476
- }
13477
- return pickPrimitiveValue(nonNullValues);
13478
- }
13479
- /**
13480
- * Merges structured extraction outputs from multiple document chunks
13481
- * according to the schema properties.
13482
- */
13483
- function mergeExtractionResults(schema, results) {
13484
- if (results.length === 0) return {};
13485
- if (results.length === 1) return results[0];
13486
- const merged = {};
13487
- for (const [propName, propDef] of Object.entries(schema.properties)) {
13488
- if (propDef.primary && propDef.autoIncrement) continue;
13489
- merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
13490
- }
13491
- return merged;
13492
- }
13493
-
13494
- //#endregion
13495
- //#region src/core/ai-extraction/snapshot.ts
13496
- async function savePromptSnapshot(schema, aiexDir) {
13497
- const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13498
- const outputDir = path.join(aiexDir, "extracted");
13499
- await fs.mkdir(outputDir, { recursive: true });
13500
- const fileName = `${schema.table.name}.prompt.md`;
13501
- const outputPath = path.join(outputDir, fileName);
13502
- await fs.writeFile(outputPath, content);
13503
- return outputPath;
13504
- }
13505
-
13506
- //#endregion
13507
- //#region src/core/ai-extraction/text-splitter.ts
13508
- const encoding$1 = getEncoding("cl100k_base");
13509
- const MAX_OVERLAP_RATIO = .15;
13510
- const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
13511
- const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
13512
- const LEADING_TABLE_PIPE_RE = /^\|/;
13513
- const TRAILING_TABLE_PIPE_RE = /\|$/;
13514
- function countTokens(text$1) {
13515
- return encoding$1.encode(text$1).length;
13516
- }
13517
- function calculateChunkTokenBudget(options = {}) {
13518
- const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
13519
- const modelMaxTokens = options.modelMaxTokens;
13520
- if (!modelMaxTokens) return configuredMaxTokens;
13521
- const outputReserveTokens = options.outputReserveTokens ?? 2e3;
13522
- const promptReserveTokens = options.promptReserveTokens ?? 1200;
13523
- const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
13524
- const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
13525
- return Math.max(512, Math.min(configuredMaxTokens, available));
13526
- }
13527
- function formatHeadingContext(headings) {
13528
- const active = headings.filter(Boolean);
13529
- if (active.length === 0) return "";
13530
- return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
13531
- }
13532
- function getMetadata(headings) {
13533
- return {
13534
- h1: headings[0] || void 0,
13535
- h2: headings[1] || void 0,
13536
- h3: headings[2] || void 0,
13537
- h4: headings[3] || void 0
13538
- };
13539
- }
13540
- function getHeadingPath(metadata) {
13541
- return [
13542
- metadata.h1,
13543
- metadata.h2,
13544
- metadata.h3,
13545
- metadata.h4
13546
- ].filter(Boolean);
13547
- }
13548
- function finalizeChunks(chunks, sourceText) {
13549
- let searchStart = 0;
13550
- const totalChunks = chunks.length;
13551
- return chunks.map((chunk, index) => {
13552
- const tokenCount = countTokens(chunk.pageContent);
13553
- let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
13554
- if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
13555
- const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
13556
- if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
13557
- return {
13558
- ...chunk,
13559
- chunkIndex: index,
13560
- totalChunks,
13561
- tokenCount,
13562
- headingPath: getHeadingPath(chunk.metadata),
13563
- charStart: charStart >= 0 ? charStart : void 0,
13564
- charEnd
13565
- };
13566
- });
13567
- }
13568
- function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
13569
- return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
13570
- }
13571
- function splitMarkdownTable(tableText, maxTokens) {
13572
- if (countTokens(tableText) <= maxTokens) return [tableText];
13573
- const lines = tableText.split("\n");
13574
- const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
13575
- const separatorIndex = lines.findIndex((line, index) => {
13576
- if (index <= headerIndex) return false;
13577
- const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
13578
- return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
13579
- });
13580
- if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
13581
- const prefix = lines.slice(0, headerIndex);
13582
- const header = lines[headerIndex];
13583
- const separator = lines[separatorIndex];
13584
- const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
13585
- const chunks = [];
13586
- let currentRows = [];
13587
- const buildTable = (tableRows) => {
13588
- return [
13589
- ...prefix,
13590
- header,
13591
- separator,
13592
- ...tableRows
13593
- ].join("\n");
13594
- };
13595
- for (const row of rows) {
13596
- const candidateRows = [...currentRows, row];
13597
- if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
13598
- chunks.push(buildTable(currentRows));
13599
- currentRows = [row];
13600
- } else currentRows = candidateRows;
13601
- }
13602
- if (currentRows.length > 0) chunks.push(buildTable(currentRows));
13603
- return chunks.length > 0 ? chunks : [tableText];
13604
- }
13605
- /**
13606
- * Splits text recursively using a list of separators.
13607
- * Preserves the separators when re-joining.
13608
- */
13609
- function splitTextRecursively(text$1, maxTokens, separators = [
13610
- "\n\n",
13611
- "\n",
13612
- "。",
13613
- ". ",
13614
- " "
13615
- ]) {
13616
- if (countTokens(text$1) <= maxTokens) return [text$1];
13617
- if (separators.length === 0) {
13618
- const chunks = [];
13619
- let current = "";
13620
- for (const char of text$1) if (countTokens(current + char) > maxTokens) {
13621
- chunks.push(current);
13622
- current = char;
13623
- } else current += char;
13624
- if (current) chunks.push(current);
13625
- return chunks;
13626
- }
13627
- const separator = separators[0];
13628
- const nextSeparators = separators.slice(1);
13629
- const parts = text$1.split(separator);
13630
- const result = [];
13631
- let currentChunk = [];
13632
- let currentChunkTokens = 0;
13633
- for (let i = 0; i < parts.length; i++) {
13634
- const part = parts[i];
13635
- const itemText = part + (i < parts.length - 1 ? separator : "");
13636
- const partTokens = countTokens(itemText);
13637
- if (partTokens > maxTokens) {
13638
- if (currentChunk.length > 0) {
13639
- result.push(currentChunk.join(""));
13640
- currentChunk = [];
13641
- currentChunkTokens = 0;
13642
- }
13643
- const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
13644
- for (let j = 0; j < subParts.length; j++) {
13645
- const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
13646
- result.push(finalSub);
13647
- }
13648
- } else if (currentChunkTokens + partTokens > maxTokens) {
13649
- result.push(currentChunk.join(""));
13650
- currentChunk = [itemText];
13651
- currentChunkTokens = partTokens;
13652
- } else {
13653
- currentChunk.push(itemText);
13654
- currentChunkTokens += partTokens;
13655
- }
13656
- }
13657
- if (currentChunk.length > 0) result.push(currentChunk.join(""));
13658
- return result;
13659
- }
13660
- /**
13661
- * Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
13662
- * Protects tables, list items, and code blocks from being broken.
13663
- */
13664
- function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13665
- const tokens = marked.lexer(text$1);
13666
- const chunks = [];
13667
- const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
13668
- let currentHeadings = [];
13669
- let currentChunkList = [];
13670
- let accumulatedTokens = 0;
13671
- const flushCurrentChunk = (isHeadingChange = false) => {
13672
- if (currentChunkList.length === 0) return;
13673
- const pageContent = currentChunkList.map((item) => item.text).join("");
13674
- const firstHeadings = currentChunkList[0].headings;
13675
- chunks.push({
13676
- pageContent,
13677
- metadata: getMetadata(firstHeadings)
13678
- });
13679
- if (isHeadingChange || effectiveOverlapTokens <= 0) {
13680
- currentChunkList = [];
13681
- accumulatedTokens = 0;
13682
- } else {
13683
- const overlapItems = [];
13684
- let currentOverlapTokens = 0;
13685
- for (let i = currentChunkList.length - 1; i >= 0; i--) {
13686
- const item = currentChunkList[i];
13687
- const itemTokens = countTokens(item.text);
13688
- if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
13689
- overlapItems.unshift(item);
13690
- currentOverlapTokens += itemTokens;
13691
- }
13692
- currentChunkList = [...overlapItems];
13693
- accumulatedTokens = currentOverlapTokens;
13694
- }
13695
- };
13696
- for (const token of tokens) {
13697
- if (token.type === "space") {
13698
- if (currentChunkList.length > 0) {
13699
- currentChunkList[currentChunkList.length - 1].text += token.raw;
13700
- accumulatedTokens += countTokens(token.raw);
13701
- }
13702
- continue;
13703
- }
13704
- if (token.type === "heading") {
13705
- flushCurrentChunk(true);
13706
- const depth = token.depth;
13707
- const title = token.text.trim();
13708
- currentHeadings = currentHeadings.slice(0, depth - 1);
13709
- currentHeadings[depth - 1] = title;
13710
- }
13711
- const rawText = token.raw;
13712
- if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
13713
- else {
13714
- const isAtomic = token.type === "table" || token.type === "code";
13715
- processTextBlock(rawText, currentHeadings, isAtomic);
13716
- }
13717
- }
13718
- flushCurrentChunk(true);
13719
- return finalizeChunks(chunks, text$1);
13720
- function processTextBlock(blockText, headings, isAtomic = false) {
13721
- const blockTokens = countTokens(blockText);
13722
- const contextTokens = countTokens(formatHeadingContext(headings));
13723
- const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
13724
- const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
13725
- if (blockTokens > budgetLimit) if (isAtomic) {
13726
- flushCurrentChunk(false);
13727
- const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
13728
- for (const block of atomicBlocks) {
13729
- currentChunkList.push({
13730
- text: block,
13731
- headings: [...headings]
13732
- });
13733
- accumulatedTokens = countTokens(block);
13734
- flushCurrentChunk(false);
13735
- }
13736
- } else {
13737
- flushCurrentChunk(false);
13738
- const subBlocks = splitTextRecursively(blockText, budgetLimit);
13739
- for (const sub of subBlocks) {
13740
- currentChunkList.push({
13741
- text: sub,
13742
- headings: [...headings]
13743
- });
13744
- accumulatedTokens += countTokens(sub);
13745
- if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
13746
- }
13747
- }
13748
- else {
13749
- if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
13750
- currentChunkList.push({
13751
- text: blockText,
13752
- headings: [...headings]
13753
- });
13754
- accumulatedTokens += blockTokens;
13755
- }
13756
- }
13757
- }
13758
-
13759
13568
  //#endregion
13760
13569
  //#region src/core/extraction-audit.ts
13761
13570
  const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -13906,276 +13715,6 @@ function getFileHash(filePath) {
13906
13715
  });
13907
13716
  }
13908
13717
 
13909
- //#endregion
13910
- //#region src/core/ai-extraction/evidence.ts
13911
- const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
13912
- const FIELD_PATH_PREFIX_RE = /^\$\./;
13913
- function isRecord(value) {
13914
- return typeof value === "object" && value !== null && !Array.isArray(value);
13915
- }
13916
- function stableValueKey(value) {
13917
- return JSON.stringify(value);
13918
- }
13919
- function isPlaceholderString(value) {
13920
- if (typeof value !== "string") return false;
13921
- const normalized = value.trim().toLowerCase();
13922
- return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
13923
- }
13924
- function primitiveToText(value) {
13925
- if (value === null || value === void 0) return null;
13926
- if (typeof value === "string") return value.trim() || null;
13927
- if (typeof value === "number" || typeof value === "boolean") return String(value);
13928
- return null;
13929
- }
13930
- function isMeaningfulValue(value) {
13931
- return primitiveToText(value) !== null && !isPlaceholderString(value);
13932
- }
13933
- function normalizeText(value) {
13934
- return value.toLowerCase().replace(/\s+/g, " ").trim();
13935
- }
13936
- function quoteAround(text$1, start, length) {
13937
- const before = Math.max(0, start - 80);
13938
- const after = Math.min(text$1.length, start + length + 80);
13939
- return text$1.slice(before, after).replace(/\s+/g, " ").trim();
13940
- }
13941
- function findEvidence(value, chunks) {
13942
- const searchText = primitiveToText(value);
13943
- if (!searchText) return null;
13944
- const normalizedSearchText = normalizeText(searchText);
13945
- if (!normalizedSearchText) return null;
13946
- for (const chunk of chunks) {
13947
- if (normalizeText(chunk.text).indexOf(normalizedSearchText) === -1) continue;
13948
- const rawIndex = chunk.text.toLowerCase().indexOf(searchText.toLowerCase());
13949
- const quoteIndex = rawIndex >= 0 ? rawIndex : 0;
13950
- return {
13951
- chunkIndex: chunk.chunkIndex,
13952
- headingPath: chunk.headingPath,
13953
- quote: quoteAround(chunk.text, quoteIndex, searchText.length)
13954
- };
13955
- }
13956
- return null;
13957
- }
13958
- function addEvidenceForProperty(fields, path$1, property, value, chunks) {
13959
- if (property.type === "object" && property.properties) {
13960
- const record = isRecord(value) ? value : {};
13961
- for (const [childName, childProperty] of Object.entries(property.properties)) addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, record[childName], chunks);
13962
- return;
13963
- }
13964
- if (property.type === "array") {
13965
- if (!Array.isArray(value) || value.length === 0) {
13966
- fields.push({
13967
- fieldPath: path$1,
13968
- status: "missing",
13969
- value: null,
13970
- confidence: 0,
13971
- note: "Array field is empty or missing."
13972
- });
13973
- return;
13974
- }
13975
- value.forEach((item, index) => {
13976
- if (property.items?.type === "object" && property.items.properties) {
13977
- const record = isRecord(item) ? item : {};
13978
- for (const [childName, childProperty] of Object.entries(property.items.properties)) addEvidenceForProperty(fields, `${path$1}[${index}].${childName}`, childProperty, record[childName], chunks);
13979
- } else addPrimitiveEvidence(fields, `${path$1}[${index}]`, item, chunks);
13980
- });
13981
- return;
13982
- }
13983
- addPrimitiveEvidence(fields, path$1, value, chunks);
13984
- }
13985
- function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
13986
- if (value === null || value === void 0 || value === "") {
13987
- fields.push({
13988
- fieldPath,
13989
- status: "missing",
13990
- value: null,
13991
- confidence: 0,
13992
- note: "Field is null or empty in final extraction."
13993
- });
13994
- return;
13995
- }
13996
- const found = findEvidence(value, chunks);
13997
- if (found) {
13998
- fields.push({
13999
- fieldPath,
14000
- status: "found",
14001
- value,
14002
- confidence: .8,
14003
- ...found
14004
- });
14005
- return;
14006
- }
14007
- fields.push({
14008
- fieldPath,
14009
- status: "inferred",
14010
- value,
14011
- confidence: .35,
14012
- note: "Final value was not found verbatim in the available source text."
14013
- });
14014
- }
14015
- function sourceChunksFromText(text$1) {
14016
- return text$1 ? [{
14017
- text: text$1,
14018
- chunkIndex: 0,
14019
- headingPath: []
14020
- }] : [];
14021
- }
14022
- function sourceChunksFromMarkdownChunks(chunks) {
14023
- return chunks.map((chunk, index) => ({
14024
- text: chunk.pageContent,
14025
- chunkIndex: chunk.chunkIndex ?? index,
14026
- headingPath: chunk.headingPath ?? []
14027
- }));
14028
- }
14029
- function getPathParts(fieldPath) {
14030
- return fieldPath.replace(FIELD_PATH_PREFIX_RE, "").split(".").filter(Boolean);
14031
- }
14032
- function getValueAtPath$1(data, fieldPath) {
14033
- let current = data;
14034
- for (const part of getPathParts(fieldPath)) {
14035
- if (!isRecord(current)) return void 0;
14036
- current = current[part];
14037
- }
14038
- return current;
14039
- }
14040
- function setValueAtPath(data, fieldPath, value) {
14041
- const parts = getPathParts(fieldPath);
14042
- let current = data;
14043
- for (let i = 0; i < parts.length - 1; i++) {
14044
- const part = parts[i];
14045
- if (!isRecord(current[part])) current[part] = {};
14046
- current = current[part];
14047
- }
14048
- current[parts[parts.length - 1]] = value;
14049
- }
14050
- function collectScalarFields(fields, fieldPath, property) {
14051
- if (property.type === "object" && property.properties) {
14052
- for (const [name$1, childProperty] of Object.entries(property.properties)) collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
14053
- return;
14054
- }
14055
- if (property.type !== "array") fields.push({
14056
- fieldPath,
14057
- property
14058
- });
14059
- }
14060
- function candidateScore(candidate) {
14061
- return (candidate.status === "found" ? 100 : 0) + Math.round(candidate.confidence * 10) + candidate.chunkIndex;
14062
- }
14063
- function selectCandidatesForField(candidates) {
14064
- if (candidates.length === 0) return null;
14065
- candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
14066
- const selected = candidates[0];
14067
- selected.selected = true;
14068
- for (const candidate of candidates.slice(1)) {
14069
- candidate.selected = false;
14070
- candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
14071
- }
14072
- const distinctValues = /* @__PURE__ */ new Map();
14073
- for (const candidate of candidates) distinctValues.set(stableValueKey(candidate.value), candidate.value);
14074
- if (distinctValues.size <= 1) return null;
14075
- return {
14076
- fieldPath: selected.fieldPath,
14077
- selectedValue: selected.value,
14078
- rejectedValues: candidates.slice(1).map((candidate) => candidate.value),
14079
- candidates: [...candidates]
14080
- };
14081
- }
14082
- function buildCandidateMergeReport(input) {
14083
- const scalarFields = [];
14084
- for (const [name$1, property] of Object.entries(input.schema.properties)) {
14085
- if (property.primary && property.autoIncrement) continue;
14086
- collectScalarFields(scalarFields, `$.${name$1}`, property);
14087
- }
14088
- const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
14089
- const candidatesByPath = /* @__PURE__ */ new Map();
14090
- for (const { fieldPath } of scalarFields) for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
14091
- const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
14092
- if (!isMeaningfulValue(value)) continue;
14093
- const sourceChunk = sourceChunks[chunkIndex] ?? {
14094
- text: "",
14095
- chunkIndex
14096
- };
14097
- const found = findEvidence(value, [sourceChunk]);
14098
- const candidate = {
14099
- fieldPath,
14100
- value,
14101
- chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
14102
- headingPath: sourceChunk.headingPath,
14103
- status: found ? "found" : "inferred",
14104
- quote: found?.quote,
14105
- confidence: found ? .85 : .35
14106
- };
14107
- const candidates = candidatesByPath.get(fieldPath) ?? [];
14108
- candidates.push(candidate);
14109
- candidatesByPath.set(fieldPath, candidates);
14110
- }
14111
- const allCandidates = [];
14112
- const conflicts = [];
14113
- for (const candidates of candidatesByPath.values()) {
14114
- const conflict = selectCandidatesForField(candidates);
14115
- allCandidates.push(...candidates);
14116
- if (conflict) conflicts.push(conflict);
14117
- }
14118
- return {
14119
- candidates: allCandidates,
14120
- conflicts
14121
- };
14122
- }
14123
- function applySelectedCandidates(data, report) {
14124
- const merged = structuredClone(data);
14125
- for (const candidate of report.candidates) if (candidate.selected) setValueAtPath(merged, candidate.fieldPath, candidate.value);
14126
- return merged;
14127
- }
14128
- function buildExtractionEvidence(input) {
14129
- const data = isRecord(input.data) ? input.data : {};
14130
- const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
14131
- const fields = [];
14132
- for (const [name$1, property] of Object.entries(input.schema.properties)) {
14133
- if (property.primary && property.autoIncrement) continue;
14134
- addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
14135
- }
14136
- const inferredIssues = fields.filter((field) => field.status === "inferred").map((field) => ({
14137
- fieldPath: field.fieldPath,
14138
- message: field.note ?? "Field value lacks source evidence."
14139
- }));
14140
- const conflictIssues = (input.candidateReport?.conflicts ?? []).map((conflict) => ({
14141
- fieldPath: conflict.fieldPath,
14142
- message: "Multiple chunk candidates disagree for this field."
14143
- }));
14144
- const issues = [...inferredIssues, ...conflictIssues];
14145
- return {
14146
- coverage: {
14147
- path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
14148
- fieldCount: fields.length,
14149
- evidenceCount: fields.filter((field) => field.status === "found").length,
14150
- foundCount: fields.filter((field) => field.status === "found").length,
14151
- missingCount: fields.filter((field) => field.status === "missing").length,
14152
- inferredCount: fields.filter((field) => field.status === "inferred").length,
14153
- conflictCount: input.candidateReport?.conflicts.length ?? 0,
14154
- issueCount: issues.length
14155
- },
14156
- fields,
14157
- candidates: input.candidateReport?.candidates,
14158
- conflicts: input.candidateReport?.conflicts,
14159
- issues
14160
- };
14161
- }
14162
- function evidencePathForOutput(outputPath) {
14163
- return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
14164
- }
14165
- async function writeExtractionEvidence(input) {
14166
- const report = buildExtractionEvidence(input);
14167
- const evidencePath = evidencePathForOutput(input.outputPath);
14168
- report.coverage.path = evidencePath;
14169
- await writeFile(evidencePath, report, {
14170
- spaces: 2,
14171
- EOL: "\n"
14172
- });
14173
- return {
14174
- ...report.coverage,
14175
- path: path.resolve(evidencePath)
14176
- };
14177
- }
14178
-
14179
13718
  //#endregion
14180
13719
  //#region src/core/notion-sink.ts
14181
13720
  const RICH_TEXT_LIMIT = 2e3;
@@ -14461,66 +14000,16 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
14461
14000
  }
14462
14001
  }
14463
14002
 
14464
- //#endregion
14465
- //#region src/core/ai-extraction/transcriber.ts
14466
- const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
14467
- async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
14468
- const provider = createOpenAICompatible({
14469
- baseURL,
14470
- name: "openai-compatible",
14471
- apiKey
14472
- });
14473
- const buffer = await fs.readFile(imagePath);
14474
- const effectiveTimeout = timeoutMs ?? 3e5;
14475
- return {
14476
- text: (await generateText({
14477
- model: provider.chatModel(modelName),
14478
- messages: [{
14479
- role: "user",
14480
- content: [{
14481
- type: "text",
14482
- text: TRANSCRIPTION_PROMPT
14483
- }, {
14484
- type: "image",
14485
- image: buffer
14486
- }]
14487
- }],
14488
- abortSignal: AbortSignal.timeout(effectiveTimeout)
14489
- })).text,
14490
- modelName
14491
- };
14492
- }
14493
-
14494
14003
  //#endregion
14495
14004
  //#region src/core/file-constants.ts
14496
14005
  const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
14497
14006
  const MAX_UPLOAD_SIZE_TEXT = "30MB";
14498
14007
  const SUPPORTED_FILE_TYPES_TEXT = "images, PDF, text, markdown, CSV, JSON, HTML, XML, YAML";
14499
14008
  const MISSING_UPLOAD_FILE_TEXT = t("errors.file.missingUpload");
14500
- const SUPPORTED_MIME_TYPES = new Set([
14501
- "image/png",
14502
- "image/jpeg",
14503
- "image/gif",
14504
- "image/webp",
14505
- "image/bmp",
14506
- "image/svg+xml",
14507
- "application/pdf",
14508
- "text/plain",
14509
- "text/markdown",
14510
- "text/csv",
14511
- "application/json",
14512
- "text/html",
14513
- "text/xml",
14514
- "application/x-yaml",
14515
- "text/yaml"
14516
- ]);
14517
14009
  const MIME_TO_EXT = {
14518
14010
  "image/png": "png",
14519
14011
  "image/jpeg": "jpg",
14520
- "image/gif": "gif",
14521
14012
  "image/webp": "webp",
14522
- "image/bmp": "bmp",
14523
- "image/svg+xml": "svg",
14524
14013
  "application/pdf": "pdf",
14525
14014
  "text/plain": "txt",
14526
14015
  "text/markdown": "md",
@@ -14537,8 +14026,8 @@ function bytesToMB(bytes) {
14537
14026
  function getExtensionFromMime(mimeType) {
14538
14027
  return MIME_TO_EXT[mimeType];
14539
14028
  }
14540
- function isAllowedMimeType(mimeType) {
14541
- return SUPPORTED_MIME_TYPES.has(mimeType);
14029
+ function getExtensionForDetectedFile(mimeType) {
14030
+ return mimeType ? getExtensionFromMime(mimeType) ?? "txt" : "txt";
14542
14031
  }
14543
14032
  function unsupportedFileTypeMessage(mimeType) {
14544
14033
  return t("errors.file.unsupportedType", {
@@ -14555,14 +14044,16 @@ var FileValidationError = class extends Error {
14555
14044
  this.name = "FileValidationError";
14556
14045
  }
14557
14046
  };
14558
- function validateFileUpload(file) {
14047
+ async function validateFileUploadContent(file, buffer) {
14559
14048
  if (file.size === 0) throw new FileValidationError(t("errors.file.empty"));
14560
14049
  if (file.size > MAX_UPLOAD_SIZE) throw new FileValidationError(t("errors.file.sizeExceeded", {
14561
14050
  size: bytesToMB(file.size).toFixed(1),
14562
14051
  limit: MAX_UPLOAD_SIZE_TEXT,
14563
14052
  file: file.name
14564
14053
  }));
14565
- if (!isAllowedMimeType(file.type)) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
14054
+ const detected = await detectInputBufferKind(buffer);
14055
+ if (detected.kind === "unsupported") throw new FileValidationError(unsupportedFileTypeMessage(detected.mime ?? (file.type || "application/octet-stream")));
14056
+ return detected.mime ?? "text/plain";
14566
14057
  }
14567
14058
 
14568
14059
  //#endregion
@@ -14845,39 +14336,56 @@ const FILE_PART_EXTENSIONS = new Set([
14845
14336
  "png",
14846
14337
  "jpg",
14847
14338
  "jpeg",
14848
- "gif",
14849
- "webp",
14850
- "bmp",
14851
- "svg"
14339
+ "webp"
14852
14340
  ]);
14853
14341
  const PDF_EXT_RE = /\.pdf$/i;
14854
- async function readExtractFileInput(filePath, aiConfig) {
14342
+ async function describeExtractFileInput(filePath, aiConfig, modelOverride) {
14343
+ const detected = await detectInputFileKind(filePath);
14344
+ if (detected.kind === "image") return {
14345
+ kind: "image",
14346
+ mime: detected.mime,
14347
+ handler: shouldUseImageOcrFallback(aiConfig, modelOverride) ? "image_local_ocr" : "image_vision"
14348
+ };
14349
+ if (detected.kind === "pdf") {
14350
+ const converter = createPdfConverter(aiConfig?.pdf);
14351
+ return {
14352
+ kind: "pdf",
14353
+ mime: detected.mime,
14354
+ handler: "pdf_converter",
14355
+ converter: converter.name
14356
+ };
14357
+ }
14358
+ if (detected.kind === "text") return {
14359
+ kind: "text",
14360
+ mime: detected.mime,
14361
+ handler: "text"
14362
+ };
14363
+ throw new Error(unsupportedFileTypeMessage(detected.mime ?? "application/octet-stream"));
14364
+ }
14365
+ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14855
14366
  const stat = fs$1.statSync(filePath);
14856
14367
  if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
14857
14368
  size: bytesToMB(stat.size).toFixed(1),
14858
14369
  limit: MAX_UPLOAD_SIZE_TEXT,
14859
14370
  file: filePath
14860
14371
  }));
14861
- const ext = path.extname(filePath).toLowerCase().replace(".", "");
14862
- if (FILE_PART_EXTENSIONS.has(ext)) {
14863
- const image = aiConfig?.image;
14864
- if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
14865
- const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
14866
- const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
14867
- const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
14868
- try {
14869
- const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
14870
- consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
14871
- return { text: result$1.text };
14872
- } catch {
14873
- consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
14874
- }
14372
+ const inputProcessing = await describeExtractFileInput(filePath, aiConfig, modelOverride);
14373
+ if (inputProcessing.kind === "image") {
14374
+ if (inputProcessing.handler === "image_local_ocr") {
14375
+ const result = await recognizeImageText(filePath);
14376
+ consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14377
+ return {
14378
+ text: result.text,
14379
+ inputProcessing
14380
+ };
14875
14381
  }
14876
- const result = await recognizeImageText(filePath, aiConfig?.image);
14877
- consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14878
- return { text: result.text };
14382
+ return {
14383
+ text: "",
14384
+ filePath,
14385
+ inputProcessing
14386
+ };
14879
14387
  }
14880
- if (ext === "pdf") {
14388
+ if (inputProcessing.kind === "pdf") {
14881
14389
  const buffer = await fs.readFile(filePath);
14882
14390
  const converter = createPdfConverter(aiConfig?.pdf);
14883
14391
  const result = await converter.convert(buffer, filePath);
@@ -14895,9 +14403,16 @@ async function readExtractFileInput(filePath, aiConfig) {
14895
14403
  await fs.writeFile(fallbackMd, result.text);
14896
14404
  consola.info(t("command.extract.file.markdownSaved", { path: fallbackMd }));
14897
14405
  }
14898
- return { text: result.text };
14406
+ return {
14407
+ text: result.text,
14408
+ inputProcessing
14409
+ };
14899
14410
  }
14900
- return { text: await fs.readFile(filePath, "utf-8") };
14411
+ if (inputProcessing.kind === "text") return {
14412
+ text: await fs.readFile(filePath, "utf-8"),
14413
+ inputProcessing
14414
+ };
14415
+ throw new Error(unsupportedFileTypeMessage(inputProcessing.mime ?? "application/octet-stream"));
14901
14416
  }
14902
14417
 
14903
14418
  //#endregion
@@ -14996,21 +14511,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14996
14511
 
14997
14512
  //#endregion
14998
14513
  //#region src/core/extract-runner.ts
14999
- const encoding = getEncoding("cl100k_base");
15000
14514
  const JSON_EXT_RE$1 = /\.json$/;
15001
- async function limitConcurrency(concurrency, items, fn) {
15002
- const results = Array.from({ length: items.length });
15003
- let nextIndex = 0;
15004
- async function worker() {
15005
- while (nextIndex < items.length) {
15006
- const currentIndex = nextIndex++;
15007
- results[currentIndex] = await fn(items[currentIndex], currentIndex);
15008
- }
15009
- }
15010
- const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
15011
- await Promise.all(workers);
15012
- return results;
15013
- }
15014
14515
  async function ensureDatabaseReady(dbPath, schema) {
15015
14516
  try {
15016
14517
  await fs.access(dbPath);
@@ -15082,146 +14583,34 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15082
14583
  }
15083
14584
  const s = spinner();
15084
14585
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
15085
- const maxTokens = calculateChunkTokenBudget({
15086
- configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
15087
- modelMaxTokens: modelOverride?.capabilities.maxTokens
15088
- });
15089
- const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
15090
- const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
15091
- if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
15092
- length: totalTokens,
15093
- limit: maxTokens
15094
- }));
15095
- const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
15096
- pageContent: text$1 ?? "",
15097
- metadata: {},
15098
- chunkIndex: 0,
15099
- totalChunks: 1,
15100
- tokenCount: totalTokens,
15101
- headingPath: [],
15102
- charStart: 0,
15103
- charEnd: text$1?.length ?? 0
15104
- }];
15105
- if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
15106
- const chunkResults = Array.from({ length: processedDocs.length });
15107
- const accumulatedTokens = {
15108
- prompt: 0,
15109
- completion: 0,
15110
- total: 0
15111
- };
15112
- let success = true;
15113
- let errorMsg = "";
15114
- const extractionTasks = processedDocs.map((doc, i) => {
15115
- return async () => {
15116
- if (!success) return;
15117
- const headings = doc.headingPath?.length ? doc.headingPath : [
15118
- doc.metadata.h1,
15119
- doc.metadata.h2,
15120
- doc.metadata.h3,
15121
- doc.metadata.h4
15122
- ].filter(Boolean);
15123
- let chunkText = doc.pageContent;
15124
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
15125
- const chunkResult = await extractStructuredData({
15126
- config: aiConfig,
15127
- schema: schemaLoad.schema,
15128
- text: chunkText,
15129
- aiexDir,
15130
- modelOverride,
15131
- onRetry(info) {
15132
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
15133
- current: i + 1,
15134
- total: processedDocs.length,
15135
- code: info.statusCode,
15136
- delay: info.delayMs / 1e3,
15137
- attempt: info.attempt,
15138
- max: info.maxRetries
15139
- }));
15140
- }
15141
- });
15142
- if (!chunkResult.success) {
15143
- success = false;
15144
- errorMsg = chunkResult.error || t("common.unknownError");
15145
- if (!options?.quiet) {
15146
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
15147
- consola.error(errorMsg);
15148
- }
15149
- return;
15150
- }
15151
- if (chunkResult.data) chunkResults[i] = chunkResult.data;
15152
- if (chunkResult.tokensUsed) {
15153
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
15154
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
15155
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
15156
- }
15157
- };
15158
- });
15159
- const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
15160
- if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
15161
- current: 1,
15162
- total: processedDocs.length
15163
- }));
15164
- try {
15165
- await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
15166
- if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
15167
- current: idx + 1,
15168
- total: processedDocs.length
15169
- }));
15170
- await task();
15171
- });
15172
- } catch (e) {
15173
- success = false;
15174
- errorMsg = e instanceof Error ? e.message : String(e);
15175
- }
15176
- if (!success) return {
15177
- success: false,
15178
- error: errorMsg
15179
- };
15180
- const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
15181
- const candidateReport = buildCandidateMergeReport({
14586
+ const result = await extractStructuredData({
14587
+ config: aiConfig,
15182
14588
  schema: schemaLoad.schema,
15183
- chunkResults: successfulChunkResults,
15184
- chunks: processedDocs
14589
+ text: text$1 ?? "",
14590
+ aiexDir,
14591
+ file: filePath,
14592
+ modelOverride,
14593
+ onRetry(info) {
14594
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
14595
+ code: info.statusCode,
14596
+ delay: info.delayMs / 1e3,
14597
+ attempt: info.attempt,
14598
+ max: info.maxRetries
14599
+ }));
14600
+ }
15185
14601
  });
15186
- const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
15187
- const validation = validateExtractedData(schemaLoad.schema, mergedData);
15188
- if (!validation.success) {
15189
- const valError = validation.error || "Merged data validation failed";
14602
+ if (!result.success) {
15190
14603
  if (!options?.quiet) {
15191
- s.stop(t("command.extract.file.validationFail"));
15192
- consola.error(valError);
14604
+ s.stop(t("command.extract.file.extractFail"));
14605
+ consola.error(result.error || t("common.unknownError"));
15193
14606
  }
15194
14607
  return {
15195
14608
  success: false,
15196
- error: valError
14609
+ error: result.error || t("common.unknownError")
15197
14610
  };
15198
14611
  }
15199
- const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
15200
- await fs.mkdir(outputDir, { recursive: true });
15201
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
15202
- const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
15203
- const outputPath = path.join(outputDir, outputFileName);
15204
- await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
15205
- const result = {
15206
- success: true,
15207
- data: mergedData,
15208
- tokensUsed: accumulatedTokens,
15209
- outputPath,
15210
- evidenceSummary: await writeExtractionEvidence({
15211
- schema: schemaLoad.schema,
15212
- data: mergedData,
15213
- outputPath,
15214
- chunks: processedDocs,
15215
- candidateReport
15216
- })
15217
- };
15218
14612
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
15219
14613
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
15220
- if (result.evidenceSummary && !options?.quiet) {
15221
- const summary = result.evidenceSummary;
15222
- const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
15223
- consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
15224
- }
15225
14614
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
15226
14615
  prompt: result.tokensUsed.prompt,
15227
14616
  completion: result.tokensUsed.completion,
@@ -15250,7 +14639,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15250
14639
  outputPath: result.outputPath,
15251
14640
  data: result.data,
15252
14641
  tablesInserted: insertResult.tablesInserted,
15253
- evidenceSummary: result.evidenceSummary,
15254
14642
  tokensUsed: result.tokensUsed
15255
14643
  };
15256
14644
  } else {
@@ -15277,10 +14665,13 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15277
14665
  success: true,
15278
14666
  outputPath: result.outputPath,
15279
14667
  data: result.data,
15280
- evidenceSummary: result.evidenceSummary,
15281
14668
  tokensUsed: result.tokensUsed
15282
14669
  };
15283
14670
  }
14671
+ function formatInputProcessing$1(input) {
14672
+ const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
14673
+ return `${input.mime ?? input.kind} -> ${handler}`;
14674
+ }
15284
14675
  async function runAuditedExtraction(options) {
15285
14676
  const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
15286
14677
  let fileHash;
@@ -15321,7 +14712,8 @@ async function runAuditedExtraction(options) {
15321
14712
  outputName: existing.outputName,
15322
14713
  tablesInserted: existing.tablesInserted,
15323
14714
  notionPages: existing.notionPages,
15324
- tokensUsed: existing.tokensUsed
14715
+ tokensUsed: existing.tokensUsed,
14716
+ inputProcessing: existing.inputProcessing
15325
14717
  };
15326
14718
  }
15327
14719
  }
@@ -15342,9 +14734,17 @@ async function runAuditedExtraction(options) {
15342
14734
  });
15343
14735
  try {
15344
14736
  let text$1 = "";
15345
- if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
15346
- else text$1 = source.text;
15347
- const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
14737
+ let filePath;
14738
+ let inputProcessing;
14739
+ if (source.type === "file") {
14740
+ const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
14741
+ text$1 = input.text;
14742
+ filePath = input.filePath;
14743
+ inputProcessing = input.inputProcessing;
14744
+ if (!quiet) consola.info(`Input: ${formatInputProcessing$1(inputProcessing)}`);
14745
+ await updateExtractionAuditRecord(aiexDir, audit.id, { inputProcessing });
14746
+ } else text$1 = source.text;
14747
+ const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
15348
14748
  quiet,
15349
14749
  insert
15350
14750
  });
@@ -15368,7 +14768,8 @@ async function runAuditedExtraction(options) {
15368
14768
  success: false,
15369
14769
  error: error instanceof Error ? error.message : String(error),
15370
14770
  auditId: audit.id,
15371
- fileHash
14771
+ fileHash,
14772
+ inputProcessing
15372
14773
  };
15373
14774
  }
15374
14775
  const updated = await updateExtractionAuditRecord(aiexDir, audit.id, {
@@ -15386,10 +14787,10 @@ async function runAuditedExtraction(options) {
15386
14787
  outputName: updated.outputName,
15387
14788
  tablesInserted: updated.tablesInserted,
15388
14789
  notionPages: updated.notionPages,
15389
- evidenceSummary: r.evidenceSummary,
15390
14790
  tokensUsed: updated.tokensUsed,
15391
14791
  auditId: updated.id,
15392
- fileHash
14792
+ fileHash,
14793
+ inputProcessing: updated.inputProcessing
15393
14794
  };
15394
14795
  } else {
15395
14796
  await updateExtractionAuditRecord(aiexDir, audit.id, {
@@ -15402,7 +14803,8 @@ async function runAuditedExtraction(options) {
15402
14803
  success: false,
15403
14804
  error: r.error,
15404
14805
  auditId: audit.id,
15405
- fileHash
14806
+ fileHash,
14807
+ inputProcessing
15406
14808
  };
15407
14809
  }
15408
14810
  } catch (e) {
@@ -15606,6 +15008,11 @@ function isExtractSubCommand(rawArgs) {
15606
15008
  function formatSource(source) {
15607
15009
  return source.type === "file" ? source.fileName || "file" : "unknown";
15608
15010
  }
15011
+ function formatInputProcessing(input) {
15012
+ if (!input) return "";
15013
+ const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
15014
+ return ` [${input.mime ?? input.kind} -> ${handler}]`;
15015
+ }
15609
15016
  async function loadConfiguredAI(aiexDir) {
15610
15017
  const aiConfig = await readAIConfig(aiexDir);
15611
15018
  if (!aiConfig) {
@@ -15648,7 +15055,7 @@ const historyCommand = defineCommand({
15648
15055
  }
15649
15056
  for (const record of records) {
15650
15057
  const suffix = record.error ? ` — ${record.error}` : record.outputName ? ` — ${record.outputName}` : "";
15651
- consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${suffix}`);
15058
+ consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${formatInputProcessing(record.inputProcessing)}${suffix}`);
15652
15059
  }
15653
15060
  }
15654
15061
  });
@@ -16161,10 +15568,7 @@ const SUPPORTED_EXTENSIONS = new Set([
16161
15568
  "png",
16162
15569
  "jpg",
16163
15570
  "jpeg",
16164
- "gif",
16165
15571
  "webp",
16166
- "bmp",
16167
- "svg",
16168
15572
  "pdf",
16169
15573
  "txt",
16170
15574
  "md",
@@ -16514,7 +15918,6 @@ function aiRoutes(config) {
16514
15918
  //#endregion
16515
15919
  //#region src/core/data-service.ts
16516
15920
  const FILE_REGEX = /\.json$/;
16517
- const EVIDENCE_FILE_SUFFIX = ".evidence.json";
16518
15921
  const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
16519
15922
  const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
16520
15923
  const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -16530,24 +15933,6 @@ function getAuditNotionStatus(record) {
16530
15933
  if (record.status === "failed") return "failed";
16531
15934
  return "not_synced";
16532
15935
  }
16533
- async function readEvidenceSummary(extractedDir, outputName) {
16534
- const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
16535
- try {
16536
- const coverage = (await readFile(evidencePath))?.coverage;
16537
- if (!coverage || typeof coverage !== "object") return void 0;
16538
- return {
16539
- path: evidencePath,
16540
- fieldCount: Number(coverage.fieldCount) || 0,
16541
- evidenceCount: Number(coverage.evidenceCount) || 0,
16542
- foundCount: Number(coverage.foundCount) || 0,
16543
- missingCount: Number(coverage.missingCount) || 0,
16544
- inferredCount: Number(coverage.inferredCount) || 0,
16545
- issueCount: Number(coverage.issueCount) || 0
16546
- };
16547
- } catch {
16548
- return;
16549
- }
16550
- }
16551
15936
  async function getRowExtractionActions(aiexDir, tableName) {
16552
15937
  const actions = /* @__PURE__ */ new Map();
16553
15938
  const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -16575,7 +15960,7 @@ async function listExtractions(config) {
16575
15960
  const aiexDir = path.dirname(config.schemaPath);
16576
15961
  const extractedDir = path.join(aiexDir, "extracted");
16577
15962
  await fs.mkdir(extractedDir, { recursive: true });
16578
- const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
15963
+ const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
16579
15964
  const auditRecords = await listExtractionAuditRecords(aiexDir);
16580
15965
  const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
16581
15966
  const records = [];
@@ -16594,10 +15979,10 @@ async function listExtractions(config) {
16594
15979
  timestamp,
16595
15980
  fileSize: stat.size,
16596
15981
  modifiedAt: stat.mtime.toISOString(),
16597
- evidenceSummary: await readEvidenceSummary(extractedDir, file),
16598
15982
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
16599
15983
  notionPages,
16600
- notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
15984
+ notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0,
15985
+ inputProcessing: audit?.inputProcessing
16601
15986
  });
16602
15987
  } catch {
16603
15988
  continue;
@@ -16774,7 +16159,6 @@ async function retryNotionSync(config, fileName) {
16774
16159
 
16775
16160
  //#endregion
16776
16161
  //#region src/server/routes/data.ts
16777
- const JSON_FILE_SUFFIX_RE = /\.json$/;
16778
16162
  const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
16779
16163
  const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
16780
16164
  const tableQuerySchema = z.object({
@@ -16827,22 +16211,10 @@ function dataRoutes(config) {
16827
16211
  const filePath = path.join(extractedDir, name$1);
16828
16212
  try {
16829
16213
  const content = await fs.readFile(filePath, "utf-8");
16830
- const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
16831
- let evidenceSummary;
16832
- try {
16833
- const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
16834
- evidenceSummary = evidence?.coverage ? {
16835
- ...evidence.coverage,
16836
- path: evidencePath
16837
- } : void 0;
16838
- } catch {
16839
- evidenceSummary = void 0;
16840
- }
16841
16214
  return c.json({
16842
16215
  success: true,
16843
16216
  content,
16844
- name: name$1,
16845
- evidenceSummary
16217
+ name: name$1
16846
16218
  });
16847
16219
  } catch {
16848
16220
  return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16881,10 +16253,9 @@ function getFormFile(value) {
16881
16253
  function safeUploadName(name$1) {
16882
16254
  return path.basename(name$1).replace(/[^\w.-]/g, "_") || "upload.txt";
16883
16255
  }
16884
- function safeUploadNameForMime(file) {
16256
+ function safeUploadNameForMime(file, mimeType) {
16885
16257
  const safeName = safeUploadName(file.name);
16886
- const ext = getExtensionFromMime(file.type);
16887
- if (!ext) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
16258
+ const ext = getExtensionForDetectedFile(mimeType);
16888
16259
  return `${path.parse(safeName).name || "upload"}.${ext}`;
16889
16260
  }
16890
16261
  function jsonResponse(body, status) {
@@ -16894,10 +16265,10 @@ function jsonResponse(body, status) {
16894
16265
  });
16895
16266
  }
16896
16267
  async function saveUploadToFile(file, uploadsDir, id) {
16897
- validateFileUpload(file);
16898
- await fs.mkdir(uploadsDir, { recursive: true });
16899
- const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file)}`);
16900
16268
  const buffer = Buffer.from(await file.arrayBuffer());
16269
+ const mimeType = await validateFileUploadContent(file, buffer);
16270
+ await fs.mkdir(uploadsDir, { recursive: true });
16271
+ const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file, mimeType)}`);
16901
16272
  await fs.writeFile(filePath, buffer);
16902
16273
  return filePath;
16903
16274
  }
@@ -16986,9 +16357,9 @@ function extractRoutes(config) {
16986
16357
  outputName: result.outputName,
16987
16358
  tablesInserted: result.tablesInserted,
16988
16359
  notionPages: result.notionPages,
16989
- evidenceSummary: result.evidenceSummary,
16990
16360
  tokensUsed: result.tokensUsed,
16991
- auditId: result.auditId
16361
+ auditId: result.auditId,
16362
+ inputProcessing: result.inputProcessing
16992
16363
  }, 200);
16993
16364
  } catch (error) {
16994
16365
  if (isMissingUploadFileError(error)) return c.json({
@@ -17054,9 +16425,9 @@ function extractRoutes(config) {
17054
16425
  outputName: result.outputName,
17055
16426
  tablesInserted: result.tablesInserted,
17056
16427
  notionPages: result.notionPages,
17057
- evidenceSummary: result.evidenceSummary,
17058
16428
  tokensUsed: result.tokensUsed,
17059
- auditId: result.auditId
16429
+ auditId: result.auditId,
16430
+ inputProcessing: result.inputProcessing
17060
16431
  }, 200);
17061
16432
  });
17062
16433
  app.delete("/extract/records/:id", async (c) => {