@clazic/kordoc 2.4.11 → 2.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,10 +6,11 @@ import {
6
6
  KordocError,
7
7
  classifyError,
8
8
  isPathTraversal,
9
+ normalizeKordocError,
9
10
  precheckZipSize,
10
11
  sanitizeHref,
11
12
  toArrayBuffer
12
- } from "./chunk-PJSXZBZB.js";
13
+ } from "./chunk-5R37N6KE.js";
13
14
  import {
14
15
  parsePageRange
15
16
  } from "./chunk-MOL7MDBG.js";
@@ -19,6 +20,10 @@ import {
19
20
  import {
20
21
  createCliOcrProvider
21
22
  } from "./chunk-34WIGIQC.js";
23
+ import {
24
+ createLoggerFromEnv,
25
+ generateRunId
26
+ } from "./chunk-I6YC6ZGK.js";
22
27
  import {
23
28
  __commonJS,
24
29
  __require,
@@ -344,8 +349,8 @@ var require_cfb = __commonJS({
344
349
  }
345
350
  return L.length - R.length;
346
351
  }
347
- function dirname(p) {
348
- if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p : dirname(p.slice(0, -1));
352
+ function dirname2(p) {
353
+ if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p : dirname2(p.slice(0, -1));
349
354
  var c = p.lastIndexOf("/");
350
355
  return c === -1 ? p : p.slice(0, c + 1);
351
356
  }
@@ -766,10 +771,10 @@ var require_cfb = __commonJS({
766
771
  data.push([cfb.FullPaths[i2], cfb.FileIndex[i2]]);
767
772
  }
768
773
  for (i2 = 0; i2 < data.length; ++i2) {
769
- var dad = dirname(data[i2][0]);
774
+ var dad = dirname2(data[i2][0]);
770
775
  s = fullPaths[dad];
771
776
  while (!s) {
772
- while (dirname(dad) && !fullPaths[dirname(dad)]) dad = dirname(dad);
777
+ while (dirname2(dad) && !fullPaths[dirname2(dad)]) dad = dirname2(dad);
773
778
  data.push([dad, {
774
779
  name: filename(dad).replace("/", ""),
775
780
  type: 1,
@@ -779,7 +784,7 @@ var require_cfb = __commonJS({
779
784
  content: null
780
785
  }]);
781
786
  fullPaths[dad] = true;
782
- dad = dirname(data[i2][0]);
787
+ dad = dirname2(data[i2][0]);
783
788
  s = fullPaths[dad];
784
789
  }
785
790
  }
@@ -805,13 +810,13 @@ var require_cfb = __commonJS({
805
810
  elt.size = 0;
806
811
  elt.type = 5;
807
812
  } else if (nm.slice(-1) == "/") {
808
- for (j = i2 + 1; j < data.length; ++j) if (dirname(cfb.FullPaths[j]) == nm) break;
813
+ for (j = i2 + 1; j < data.length; ++j) if (dirname2(cfb.FullPaths[j]) == nm) break;
809
814
  elt.C = j >= data.length ? -1 : j;
810
- for (j = i2 + 1; j < data.length; ++j) if (dirname(cfb.FullPaths[j]) == dirname(nm)) break;
815
+ for (j = i2 + 1; j < data.length; ++j) if (dirname2(cfb.FullPaths[j]) == dirname2(nm)) break;
811
816
  elt.R = j >= data.length ? -1 : j;
812
817
  elt.type = 1;
813
818
  } else {
814
- if (dirname(cfb.FullPaths[i2 + 1] || "") == dirname(nm)) elt.R = i2 + 1;
819
+ if (dirname2(cfb.FullPaths[i2 + 1] || "") == dirname2(nm)) elt.R = i2 + 1;
815
820
  elt.type = 2;
816
821
  }
817
822
  }
@@ -2308,50 +2313,89 @@ function stripDtd(xml) {
2308
2313
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
2309
2314
  }
2310
2315
  async function parseHwpxDocument(buffer, options, existingZip) {
2311
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
2312
- let zip;
2316
+ const logger = createLoggerFromEnv().child({ component: "hwpx/parser.ts", stage: "detect" });
2317
+ logger.log({ level: "info", event: "start", message: "HWPX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
2318
+ let lastParsedSection = 0;
2313
2319
  try {
2314
- zip = existingZip ?? await JSZip2.loadAsync(buffer);
2315
- } catch {
2316
- return await extractFromBrokenZip(buffer);
2317
- }
2318
- const actualEntryCount = Object.keys(zip.files).length;
2319
- if (actualEntryCount > MAX_ZIP_ENTRIES) {
2320
- throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
2321
- }
2322
- const decompressed = { total: 0 };
2323
- const metadata = {};
2324
- await extractHwpxMetadata(zip, metadata, decompressed);
2325
- const styleMap = await extractHwpxStyles(zip, decompressed);
2326
- const warnings = [];
2327
- const sectionPaths = await resolveSectionPaths(zip);
2328
- if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
2329
- metadata.pageCount = sectionPaths.length;
2330
- const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
2331
- const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
2332
- const blocks = [];
2333
- let parsedSections = 0;
2334
- for (let si = 0; si < sectionPaths.length; si++) {
2335
- if (pageFilter && !pageFilter.has(si + 1)) continue;
2336
- const file = zip.file(sectionPaths[si]);
2337
- if (!file) continue;
2320
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
2321
+ let zip;
2338
2322
  try {
2339
- const xml = await file.async("text");
2340
- decompressed.total += xml.length * 2;
2341
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
2342
- blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
2343
- parsedSections++;
2344
- options?.onProgress?.(parsedSections, totalTarget);
2345
- } catch (secErr) {
2346
- if (secErr instanceof KordocError) throw secErr;
2347
- warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
2348
- }
2349
- }
2350
- const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
2351
- detectHwpxHeadings(blocks, styleMap);
2352
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2353
- const markdown = blocksToMarkdown(blocks);
2354
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2323
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
2324
+ } catch {
2325
+ return await extractFromBrokenZip(buffer);
2326
+ }
2327
+ const actualEntryCount = Object.keys(zip.files).length;
2328
+ if (actualEntryCount > MAX_ZIP_ENTRIES) {
2329
+ throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
2330
+ }
2331
+ const decompressed = { total: 0 };
2332
+ const metadata = {};
2333
+ await extractHwpxMetadata(zip, metadata, decompressed);
2334
+ const styleMap = await extractHwpxStyles(zip, decompressed);
2335
+ const warnings = [];
2336
+ const sectionPaths = await resolveSectionPaths(zip);
2337
+ if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
2338
+ metadata.pageCount = sectionPaths.length;
2339
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uACBD\uB85C \uD574\uC11D \uC644\uB8CC", meta: { sections: sectionPaths.length } });
2340
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
2341
+ const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
2342
+ const blocks = [];
2343
+ let parsedSections = 0;
2344
+ for (let si = 0; si < sectionPaths.length; si++) {
2345
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
2346
+ const file = zip.file(sectionPaths[si]);
2347
+ if (!file) continue;
2348
+ try {
2349
+ const xml = await file.async("text");
2350
+ decompressed.total += xml.length * 2;
2351
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
2352
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
2353
+ parsedSections++;
2354
+ options?.onProgress?.(parsedSections, totalTarget);
2355
+ logger.log({
2356
+ level: "debug",
2357
+ stage: "convert",
2358
+ event: "progress",
2359
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
2360
+ meta: { section: si + 1, parsedSections, totalTarget }
2361
+ });
2362
+ lastParsedSection = si + 1;
2363
+ } catch (secErr) {
2364
+ if (secErr instanceof KordocError) throw secErr;
2365
+ warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
2366
+ logger.log({
2367
+ level: "warn",
2368
+ stage: "convert",
2369
+ event: "progress",
2370
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
2371
+ meta: { section: si + 1 },
2372
+ error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
2373
+ });
2374
+ }
2375
+ }
2376
+ const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
2377
+ detectHwpxHeadings(blocks, styleMap);
2378
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2379
+ const markdown = blocksToMarkdown(blocks);
2380
+ logger.log({
2381
+ level: "info",
2382
+ stage: "finalize",
2383
+ event: "done",
2384
+ message: "HWPX \uD30C\uC2F1 \uC644\uB8CC",
2385
+ meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
2386
+ });
2387
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2388
+ } catch (err) {
2389
+ logger.log({
2390
+ level: "error",
2391
+ stage: "finalize",
2392
+ event: "error",
2393
+ message: "HWPX \uD30C\uC2F1 \uC2E4\uD328",
2394
+ meta: { lastParsedSection },
2395
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
2396
+ });
2397
+ throw err;
2398
+ }
2355
2399
  }
2356
2400
  function imageExtToMime(ext) {
2357
2401
  switch (ext.toLowerCase()) {
@@ -4084,71 +4128,110 @@ var CFB = __toESM(require_cfb(), 1);
4084
4128
  var MAX_SECTIONS = 100;
4085
4129
  var MAX_TOTAL_DECOMPRESS = 500 * 1024 * 1024;
4086
4130
  function parseHwp5Document(buffer, options) {
4087
- let cfb = null;
4088
- let lenientCfb = null;
4089
- const warnings = [];
4131
+ const logger = createLoggerFromEnv().child({ component: "hwp5/parser.ts", stage: "detect" });
4132
+ logger.log({ level: "info", event: "start", message: "HWP5 \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.length } });
4133
+ let lastParsedSection = 0;
4090
4134
  try {
4091
- cfb = CFB.parse(buffer);
4092
- } catch {
4135
+ let cfb = null;
4136
+ let lenientCfb = null;
4137
+ const warnings = [];
4093
4138
  try {
4094
- lenientCfb = parseLenientCfb(buffer);
4095
- warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
4139
+ cfb = CFB.parse(buffer);
4096
4140
  } catch {
4097
- throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
4141
+ try {
4142
+ lenientCfb = parseLenientCfb(buffer);
4143
+ warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
4144
+ } catch {
4145
+ throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
4146
+ }
4098
4147
  }
4099
- }
4100
- const findStream = (path) => {
4101
- if (cfb) {
4102
- const entry = CFB.find(cfb, path);
4103
- return entry?.content ? Buffer.from(entry.content) : null;
4148
+ const findStream = (path) => {
4149
+ if (cfb) {
4150
+ const entry = CFB.find(cfb, path);
4151
+ return entry?.content ? Buffer.from(entry.content) : null;
4152
+ }
4153
+ return lenientCfb.findStream(path);
4154
+ };
4155
+ const headerData = findStream("/FileHeader");
4156
+ if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
4157
+ const header = parseFileHeader(headerData);
4158
+ if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
4159
+ if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
4160
+ if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
4161
+ const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
4162
+ const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
4163
+ const metadata = {
4164
+ version: `${header.versionMajor}.x`
4165
+ };
4166
+ if (cfb) extractHwp5Metadata(cfb, metadata);
4167
+ const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
4168
+ const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
4169
+ if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
4170
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uBAA9\uB85D \uD574\uC11D \uC644\uB8CC", meta: { sections: sections.length, distribution } });
4171
+ metadata.pageCount = sections.length;
4172
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
4173
+ const totalTarget = pageFilter ? pageFilter.size : sections.length;
4174
+ const blocks = [];
4175
+ let totalDecompressed = 0;
4176
+ let parsedSections = 0;
4177
+ for (let si = 0; si < sections.length; si++) {
4178
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
4179
+ try {
4180
+ const sectionData = sections[si];
4181
+ const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
4182
+ totalDecompressed += data.length;
4183
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
4184
+ const records = readRecords(data);
4185
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
4186
+ blocks.push(...sectionBlocks);
4187
+ parsedSections++;
4188
+ options?.onProgress?.(parsedSections, totalTarget);
4189
+ logger.log({
4190
+ level: "debug",
4191
+ stage: "convert",
4192
+ event: "progress",
4193
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
4194
+ meta: { section: si + 1, parsedSections, totalTarget }
4195
+ });
4196
+ lastParsedSection = si + 1;
4197
+ } catch (secErr) {
4198
+ if (secErr instanceof KordocError) throw secErr;
4199
+ warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
4200
+ logger.log({
4201
+ level: "warn",
4202
+ stage: "convert",
4203
+ event: "progress",
4204
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
4205
+ meta: { section: si + 1 },
4206
+ error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
4207
+ });
4208
+ }
4104
4209
  }
4105
- return lenientCfb.findStream(path);
4106
- };
4107
- const headerData = findStream("/FileHeader");
4108
- if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
4109
- const header = parseFileHeader(headerData);
4110
- if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
4111
- if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
4112
- if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
4113
- const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
4114
- const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
4115
- const metadata = {
4116
- version: `${header.versionMajor}.x`
4117
- };
4118
- if (cfb) extractHwp5Metadata(cfb, metadata);
4119
- const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
4120
- const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
4121
- if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
4122
- metadata.pageCount = sections.length;
4123
- const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
4124
- const totalTarget = pageFilter ? pageFilter.size : sections.length;
4125
- const blocks = [];
4126
- let totalDecompressed = 0;
4127
- let parsedSections = 0;
4128
- for (let si = 0; si < sections.length; si++) {
4129
- if (pageFilter && !pageFilter.has(si + 1)) continue;
4130
- try {
4131
- const sectionData = sections[si];
4132
- const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
4133
- totalDecompressed += data.length;
4134
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
4135
- const records = readRecords(data);
4136
- const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
4137
- blocks.push(...sectionBlocks);
4138
- parsedSections++;
4139
- options?.onProgress?.(parsedSections, totalTarget);
4140
- } catch (secErr) {
4141
- if (secErr instanceof KordocError) throw secErr;
4142
- warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
4143
- }
4144
- }
4145
- const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
4146
- if (docInfo) {
4147
- detectHwp5Headings(blocks, docInfo);
4148
- }
4149
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
4150
- const markdown = blocksToMarkdown(blocks);
4151
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
4210
+ const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
4211
+ if (docInfo) {
4212
+ detectHwp5Headings(blocks, docInfo);
4213
+ }
4214
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
4215
+ const markdown = blocksToMarkdown(blocks);
4216
+ logger.log({
4217
+ level: "info",
4218
+ stage: "finalize",
4219
+ event: "done",
4220
+ message: "HWP5 \uD30C\uC2F1 \uC644\uB8CC",
4221
+ meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
4222
+ });
4223
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
4224
+ } catch (err) {
4225
+ logger.log({
4226
+ level: "error",
4227
+ stage: "finalize",
4228
+ event: "error",
4229
+ message: "HWP5 \uD30C\uC2F1 \uC2E4\uD328",
4230
+ meta: { lastParsedSection },
4231
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
4232
+ });
4233
+ throw err;
4234
+ }
4152
4235
  }
4153
4236
  function parseDocInfoStream(cfb, compressed) {
4154
4237
  try {
@@ -4707,6 +4790,10 @@ function arrangeCells(rows, cols, cells) {
4707
4790
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
4708
4791
  }
4709
4792
 
4793
+ // src/pdf/parser.ts
4794
+ import { createRequire } from "module";
4795
+ import { dirname, join, resolve } from "path";
4796
+
4710
4797
  // src/pdf/line-detector.ts
4711
4798
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
4712
4799
  var ORIENTATION_TOL = 2;
@@ -4893,12 +4980,17 @@ function buildTableGrids(horizontals, verticals) {
4893
4980
  const rawXs = vLines.map((l) => l.x1);
4894
4981
  const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
4895
4982
  if (rowYs.length < 2 || colXs.length < 2) continue;
4983
+ const rowCount = rowYs.length - 1;
4984
+ const colCount = colXs.length - 1;
4985
+ if (rowCount <= 0 || colCount <= 0) continue;
4986
+ if (rowCount * colCount < 2) continue;
4896
4987
  const bbox = {
4897
4988
  x1: colXs[0],
4898
4989
  y1: rowYs[rowYs.length - 1],
4899
4990
  x2: colXs[colXs.length - 1],
4900
4991
  y2: rowYs[0]
4901
4992
  };
4993
+ if (!hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox)) continue;
4902
4994
  grids.push({ rowYs, colXs, bbox });
4903
4995
  }
4904
4996
  return mergeAdjacentGrids(grids);
@@ -4948,6 +5040,35 @@ function clusterCoordinates(values) {
4948
5040
  }
4949
5041
  return clusters.map((c) => c.sum / c.count);
4950
5042
  }
5043
+ function hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox) {
5044
+ const internalRows = rowYs.slice(1, -1);
5045
+ const internalCols = colXs.slice(1, -1);
5046
+ const width = Math.max(1, bbox.x2 - bbox.x1);
5047
+ const height = Math.max(1, bbox.y2 - bbox.y1);
5048
+ const coverageThreshold = 0.55;
5049
+ const coveredRows = internalRows.filter(
5050
+ (y) => hLines.some((h) => Math.abs(h.y1 - y) <= COORD_MERGE_TOL && lineOverlapRatio(h.x1, h.x2, bbox.x1, bbox.x2) >= coverageThreshold)
5051
+ ).length;
5052
+ const coveredCols = internalCols.filter(
5053
+ (x) => vLines.some((v) => Math.abs(v.x1 - x) <= COORD_MERGE_TOL && lineOverlapRatio(v.y1, v.y2, bbox.y1, bbox.y2) >= coverageThreshold)
5054
+ ).length;
5055
+ const rowCoverage = internalRows.length > 0 ? coveredRows / internalRows.length : 1;
5056
+ const colCoverage = internalCols.length > 0 ? coveredCols / internalCols.length : 1;
5057
+ const longHorizontal = hLines.filter((h) => Math.abs(h.x2 - h.x1) >= width * 0.7).length;
5058
+ const longVertical = vLines.filter((v) => Math.abs(v.y2 - v.y1) >= height * 0.7).length;
5059
+ const hasAxisSupport = longHorizontal >= 2 && longVertical >= 2;
5060
+ if (!hasAxisSupport) return false;
5061
+ if (internalRows.length > 0 && rowCoverage < 0.5) return false;
5062
+ if (internalCols.length > 0 && colCoverage < 0.5) return false;
5063
+ return true;
5064
+ }
5065
+ function lineOverlapRatio(a1, a2, b1, b2) {
5066
+ const left = Math.max(Math.min(a1, a2), Math.min(b1, b2));
5067
+ const right = Math.min(Math.max(a1, a2), Math.max(b1, b2));
5068
+ const overlap = Math.max(0, right - left);
5069
+ const target = Math.max(1, Math.abs(b2 - b1));
5070
+ return overlap / target;
5071
+ }
4951
5072
  function groupConnectedLines(lines) {
4952
5073
  const parent = lines.map((_, i) => i);
4953
5074
  function find2(x) {
@@ -5344,6 +5465,17 @@ g.pdfjsWorker = pdfjsWorker;
5344
5465
  // src/pdf/parser.ts
5345
5466
  import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
5346
5467
  GlobalWorkerOptions.workerSrc = "";
5468
+ var require2 = createRequire(
5469
+ typeof __filename !== "undefined" ? __filename : resolve(process.cwd(), "kordoc.require.cjs")
5470
+ );
5471
+ function resolvePdfjsWasmUrl() {
5472
+ try {
5473
+ const pdfjsPkg = require2.resolve("pdfjs-dist/package.json");
5474
+ return join(dirname(pdfjsPkg), "wasm/");
5475
+ } catch {
5476
+ return resolve(process.cwd(), "node_modules/pdfjs-dist/wasm/");
5477
+ }
5478
+ }
5347
5479
  var MAX_PAGES = 5e3;
5348
5480
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
5349
5481
  function calcPdfTimeout(bufferSize) {
@@ -5359,7 +5491,8 @@ async function loadPdfWithTimeout(buffer) {
5359
5491
  data: new Uint8Array(buffer),
5360
5492
  useSystemFonts: true,
5361
5493
  disableFontFace: true,
5362
- isEvalSupported: false
5494
+ isEvalSupported: false,
5495
+ wasmUrl: resolvePdfjsWasmUrl()
5363
5496
  });
5364
5497
  let timer;
5365
5498
  try {
@@ -5376,7 +5509,47 @@ async function loadPdfWithTimeout(buffer) {
5376
5509
  if (timer !== void 0) clearTimeout(timer);
5377
5510
  }
5378
5511
  }
5512
+ function estimateImageBasedPdf(metrics) {
5513
+ if (metrics.length === 0) {
5514
+ return { isImageBased: true, score: 1, reason: "\uC0D8\uD50C \uD1B5\uACC4 \uC5C6\uC74C" };
5515
+ }
5516
+ const totalPages = metrics.length;
5517
+ const totalChars = metrics.reduce((s, m) => s + m.nonWhitespaceChars, 0);
5518
+ const totalItems = metrics.reduce((s, m) => s + m.visibleItems, 0);
5519
+ const pagesWithText = metrics.filter((m) => m.nonWhitespaceChars >= 20 || m.visibleItems >= 15).length;
5520
+ const avgChars = totalChars / totalPages;
5521
+ const avgItems = totalItems / totalPages;
5522
+ const textPresenceRatio = pagesWithText / totalPages;
5523
+ let score = 0;
5524
+ if (avgChars < 10) score += 0.45;
5525
+ if (avgItems < 8) score += 0.35;
5526
+ if (textPresenceRatio < 0.35) score += 0.25;
5527
+ if (avgChars > 40) score -= 0.35;
5528
+ if (avgItems > 25) score -= 0.35;
5529
+ if (textPresenceRatio > 0.7) score -= 0.25;
5530
+ score = Math.max(0, Math.min(1, score));
5531
+ const isImageBased = score >= 0.5;
5532
+ const reason = `avgChars=${avgChars.toFixed(1)}, avgItems=${avgItems.toFixed(1)}, textPresence=${(textPresenceRatio * 100).toFixed(0)}%, score=${score.toFixed(2)}`;
5533
+ return { isImageBased, score, reason };
5534
+ }
5535
+ function summarizePartialFailures(failedPages, totalTarget) {
5536
+ if (failedPages.length === 0) return null;
5537
+ const sorted = [...failedPages].sort((a, b) => a - b);
5538
+ const preview = sorted.slice(0, 10).join(", ");
5539
+ const suffix = sorted.length > 10 ? ` \uC678 ${sorted.length - 10}\uD398\uC774\uC9C0` : "";
5540
+ return `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uC694\uC57D: ${sorted.length}/${totalTarget}\uD398\uC774\uC9C0 \uC2E4\uD328 (p${preview}${suffix})`;
5541
+ }
5542
+ function shouldAbortForPartialFailures(failedPages, totalTarget, maxPartialFailureRatio) {
5543
+ if (typeof maxPartialFailureRatio !== "number") {
5544
+ return { abort: false, ratio: 0, threshold: 0 };
5545
+ }
5546
+ const threshold = Math.max(0, Math.min(1, maxPartialFailureRatio));
5547
+ const ratio = totalTarget > 0 ? failedPages.length / totalTarget : 0;
5548
+ return { abort: ratio > threshold, ratio, threshold };
5549
+ }
5379
5550
  async function parsePdfDocument(buffer, options) {
5551
+ const logger = createLoggerFromEnv().child({ component: "pdf/parser.ts", stage: "detect" });
5552
+ logger.log({ level: "info", event: "start", message: "PDF \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
5380
5553
  const doc = await loadPdfWithTimeout(buffer);
5381
5554
  try {
5382
5555
  const pageCount = doc.numPages;
@@ -5385,9 +5558,13 @@ async function parsePdfDocument(buffer, options) {
5385
5558
  await extractPdfMetadata(doc, metadata);
5386
5559
  const blocks = [];
5387
5560
  const warnings = [];
5561
+ const failedPages = [];
5562
+ let lastParsedPage2 = 0;
5563
+ const sampleMetricsByPage = /* @__PURE__ */ new Map();
5388
5564
  let totalChars = 0;
5389
5565
  let totalTextBytes = 0;
5390
5566
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
5567
+ logger.log({ level: "debug", event: "progress", message: "PDF \uB85C\uB529 \uC644\uB8CC", meta: { pageCount, effectivePageCount } });
5391
5568
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
5392
5569
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
5393
5570
  const fontSizeFreq = /* @__PURE__ */ new Map();
@@ -5424,11 +5601,17 @@ async function parsePdfDocument(buffer, options) {
5424
5601
  totalChars += t.replace(/\s/g, "").length;
5425
5602
  totalTextBytes += t.length * 2;
5426
5603
  }
5604
+ sampleMetricsByPage.set(i, {
5605
+ nonWhitespaceChars: visible.reduce((sum, it) => sum + it.text.replace(/\s/g, "").length, 0),
5606
+ visibleItems: visible.length
5607
+ });
5608
+ lastParsedPage2 = i;
5427
5609
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
5428
5610
  parsedPages++;
5429
5611
  options?.onProgress?.(parsedPages, totalTarget);
5430
5612
  } catch (pageErr) {
5431
5613
  if (pageErr instanceof KordocError) throw pageErr;
5614
+ if (!failedPages.includes(i)) failedPages.push(i);
5432
5615
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5433
5616
  }
5434
5617
  };
@@ -5445,8 +5628,21 @@ async function parsePdfDocument(buffer, options) {
5445
5628
  for (const si of sampledIndices) {
5446
5629
  await parseSinglePage(targetPageNums[si]);
5447
5630
  }
5448
- const sampleParsed = parsedPages || sampledIndices.size;
5449
- const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
5631
+ const sampledMetrics = [];
5632
+ for (const si of sampledIndices) {
5633
+ const pageNum = targetPageNums[si];
5634
+ const m = sampleMetricsByPage.get(pageNum);
5635
+ if (m) sampledMetrics.push(m);
5636
+ }
5637
+ const imageBasedDecision = estimateImageBasedPdf(sampledMetrics);
5638
+ const isImageBased = imageBasedDecision.isImageBased;
5639
+ logger.log({
5640
+ level: "info",
5641
+ stage: "probe",
5642
+ event: "done",
5643
+ message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815",
5644
+ meta: { isImageBased, reason: imageBasedDecision.reason, sampledPages: sampledMetrics.length }
5645
+ });
5450
5646
  if (!isImageBased) {
5451
5647
  for (let si = 0; si < targetPageNums.length; si++) {
5452
5648
  if (!sampledIndices.has(si)) {
@@ -5454,20 +5650,52 @@ async function parsePdfDocument(buffer, options) {
5454
5650
  }
5455
5651
  }
5456
5652
  }
5653
+ const partialSummary = summarizePartialFailures(failedPages, totalTarget);
5654
+ if (partialSummary) {
5655
+ warnings.push({
5656
+ message: partialSummary,
5657
+ code: "PARTIAL_PARSE"
5658
+ });
5659
+ }
5660
+ if (isImageBased) {
5661
+ warnings.push({
5662
+ message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815: ${imageBasedDecision.reason}`,
5663
+ code: "OCR_FALLBACK"
5664
+ });
5665
+ }
5666
+ const partialPolicy = shouldAbortForPartialFailures(
5667
+ failedPages,
5668
+ totalTarget,
5669
+ options?.maxPartialFailureRatio
5670
+ );
5671
+ if (partialPolicy.abort) {
5672
+ throw new KordocError(
5673
+ `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uBE44\uC728 \uCD08\uACFC: ${(partialPolicy.ratio * 100).toFixed(1)}% (\uD5C8\uC6A9 ${(partialPolicy.threshold * 100).toFixed(1)}%)`
5674
+ );
5675
+ }
5457
5676
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
5458
5677
  if (isImageBased) {
5459
5678
  const ocrMode = options?.ocrMode ?? "auto";
5460
5679
  const concurrency = options?.ocrConcurrency ?? 1;
5461
5680
  const batchSize = options?.ocrBatchSize;
5681
+ logger.log({
5682
+ level: "info",
5683
+ stage: "ocr",
5684
+ event: "start",
5685
+ message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF OCR \uC2DC\uC791",
5686
+ meta: { ocrMode, concurrency, batchSize, totalTarget }
5687
+ });
5462
5688
  if (ocrMode === "off") {
5463
5689
  throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
5464
5690
  }
5465
- const { resolveOcrProvider } = await import("./resolve-4I65IGMM.js");
5466
- const { ocrPages } = await import("./provider-PYZL2VNN.js");
5691
+ const { resolveOcrProvider } = await import("./resolve-673XFZQ6.js");
5692
+ const { ocrPages } = await import("./provider-T2D5XRTI.js");
5467
5693
  const tryProvider = async (provider, filter) => {
5468
5694
  try {
5695
+ logger.log({ level: "debug", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589", meta: { filteredPages: filter?.size } });
5469
5696
  return await ocrPages(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
5470
5697
  } catch {
5698
+ logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589 \uC2E4\uD328(\uBE48 \uACB0\uACFC\uB85C \uCC98\uB9AC)" });
5471
5699
  return [];
5472
5700
  } finally {
5473
5701
  const terminable = provider;
@@ -5490,6 +5718,7 @@ async function parsePdfDocument(buffer, options) {
5490
5718
  for (const mode of getAutoFallbackChain()) {
5491
5719
  if (pendingPages.size === 0) break;
5492
5720
  try {
5721
+ logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uC2DC\uB3C4", meta: { mode, pendingPages: pendingPages.size } });
5493
5722
  const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
5494
5723
  const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
5495
5724
  const blocks2 = await tryProvider(provider, modeFilter);
@@ -5504,10 +5733,20 @@ async function parsePdfDocument(buffer, options) {
5504
5733
  code: "OCR_CLI_FALLBACK"
5505
5734
  });
5506
5735
  }
5736
+ logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uCC98\uB9AC \uC644\uB8CC", meta: { mode, blocks: blocks2.length, pendingPages: pendingPages.size } });
5507
5737
  } else {
5508
5738
  warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
5739
+ logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uACB0\uACFC \uC5C6\uC74C", meta: { mode } });
5509
5740
  }
5510
- } catch {
5741
+ } catch (engineErr) {
5742
+ logger.log({
5743
+ level: "warn",
5744
+ stage: "ocr",
5745
+ event: "progress",
5746
+ message: "OCR \uC5D4\uC9C4 \uCD08\uAE30\uD654/\uC2E4\uD589 \uC2E4\uD328",
5747
+ meta: { mode },
5748
+ error: { message: engineErr instanceof Error ? engineErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: engineErr instanceof Error ? engineErr.name : "Error" }
5749
+ });
5511
5750
  }
5512
5751
  }
5513
5752
  allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
@@ -5525,6 +5764,7 @@ async function parsePdfDocument(buffer, options) {
5525
5764
  }
5526
5765
  if (ocrBlocks.length > 0) {
5527
5766
  const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5767
+ logger.log({ level: "info", stage: "ocr", event: "done", message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 OCR \uC644\uB8CC", meta: { blocks: ocrBlocks.length } });
5528
5768
  return {
5529
5769
  markdown: ocrMarkdown,
5530
5770
  blocks: ocrBlocks,
@@ -5550,8 +5790,25 @@ async function parsePdfDocument(buffer, options) {
5550
5790
  }
5551
5791
  detectMarkerHeadings(blocks);
5552
5792
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
5553
- let markdown = cleanPdfText(blocksToMarkdown(blocks));
5793
+ let markdown = cleanPdfText(blocksToMarkdown(blocks), options?.pdfTextNormalization ?? "default");
5794
+ logger.log({
5795
+ level: "info",
5796
+ stage: "finalize",
5797
+ event: "done",
5798
+ message: "PDF \uD30C\uC2F1 \uC644\uB8CC",
5799
+ meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, isImageBased: false }
5800
+ });
5554
5801
  return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
5802
+ } catch (err) {
5803
+ logger.log({
5804
+ level: "error",
5805
+ stage: "finalize",
5806
+ event: "error",
5807
+ message: "PDF \uD30C\uC2F1 \uC2E4\uD328",
5808
+ meta: { lastParsedPage },
5809
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
5810
+ });
5811
+ throw err;
5555
5812
  } finally {
5556
5813
  await doc.destroy().catch(() => {
5557
5814
  });
@@ -5656,6 +5913,17 @@ function shouldDemoteTable(table) {
5656
5913
  const emptyCells = totalCells - allCells.length;
5657
5914
  if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
5658
5915
  if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
5916
+ if (table.cols >= 3 && table.rows <= 4) {
5917
+ const markerCells = allCells.filter((t) => /^[□■◆○●▶▷◇◆]/.test(t)).length;
5918
+ const numericCells = allCells.filter((t) => /\d/.test(t)).length;
5919
+ if (markerCells >= Math.max(1, Math.floor(allCells.length * 0.35)) && numericCells <= Math.floor(allCells.length * 0.15)) {
5920
+ return true;
5921
+ }
5922
+ }
5923
+ if (table.cols >= 3 && table.rows >= 2) {
5924
+ const sparseRows = table.cells.filter((row) => row.filter((c) => c.text.trim()).length <= 1).length;
5925
+ if (sparseRows >= Math.ceil(table.rows * 0.7)) return true;
5926
+ }
5659
5927
  return false;
5660
5928
  }
5661
5929
  function demoteTableToText(table) {
@@ -6211,10 +6479,15 @@ function mergeLineSimple(items) {
6211
6479
  }
6212
6480
  return result;
6213
6481
  }
6214
- function cleanPdfText(text) {
6215
- return mergeKoreanLines(
6216
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
6217
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
6482
+ function stripPdfPageNumberArtifacts(text) {
6483
+ return text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "");
6484
+ }
6485
+ function cleanPdfText(text, mode = "default") {
6486
+ const stripped = stripPdfPageNumberArtifacts(text);
6487
+ if (mode === "strict-preserve") {
6488
+ return stripped.replace(/\n{4,}/g, "\n\n\n").trim();
6489
+ }
6490
+ return mergeKoreanLines(stripped).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
6218
6491
  }
6219
6492
  function startsWithMarker(line) {
6220
6493
  const t = line.trimStart();
@@ -6610,100 +6883,139 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
6610
6883
  return blocks;
6611
6884
  }
6612
6885
  async function parseXlsxDocument(buffer, options, existingZip) {
6613
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
6614
- const zip = existingZip ?? await JSZip3.loadAsync(buffer);
6615
- const warnings = [];
6616
- const workbookFile = zip.file("xl/workbook.xml");
6617
- if (!workbookFile) {
6618
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
6619
- }
6620
- let sharedStrings = [];
6621
- const ssFile = zip.file("xl/sharedStrings.xml");
6622
- if (ssFile) {
6623
- sharedStrings = parseSharedStrings(await ssFile.async("text"));
6624
- }
6625
- const sheets = parseWorkbook(await workbookFile.async("text"));
6626
- if (sheets.length === 0) {
6627
- throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
6628
- }
6629
- let relsMap = /* @__PURE__ */ new Map();
6630
- const relsFile = zip.file("xl/_rels/workbook.xml.rels");
6631
- if (relsFile) {
6632
- relsMap = parseRels(await relsFile.async("text"));
6633
- }
6634
- let pageFilter = null;
6635
- if (options?.pages) {
6636
- const { parsePageRange: parsePageRange2 } = await import("./page-range-ALIRXAL5.js");
6637
- pageFilter = parsePageRange2(options.pages, sheets.length);
6638
- }
6639
- const blocks = [];
6640
- const processedSheets = Math.min(sheets.length, MAX_SHEETS);
6641
- let totalCells = 0;
6642
- for (let i = 0; i < processedSheets; i++) {
6643
- if (pageFilter && !pageFilter.has(i + 1)) continue;
6644
- const sheet = sheets[i];
6645
- options?.onProgress?.(i + 1, processedSheets);
6646
- let sheetPath = relsMap.get(sheet.rId);
6647
- if (sheetPath) {
6648
- if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
6649
- sheetPath = `xl/${sheetPath}`;
6650
- } else if (sheetPath.startsWith("/")) {
6651
- sheetPath = sheetPath.slice(1);
6652
- }
6653
- } else {
6654
- sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
6655
- }
6656
- const sheetFile = zip.file(sheetPath);
6657
- if (!sheetFile) {
6658
- warnings.push({
6659
- page: i + 1,
6660
- message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
6661
- code: "PARTIAL_PARSE"
6662
- });
6663
- continue;
6886
+ const logger = createLoggerFromEnv().child({ component: "xlsx/parser.ts", stage: "detect" });
6887
+ logger.log({ level: "info", event: "start", message: "XLSX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
6888
+ let lastProcessedSheet = 0;
6889
+ try {
6890
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
6891
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
6892
+ const warnings = [];
6893
+ const workbookFile = zip.file("xl/workbook.xml");
6894
+ if (!workbookFile) {
6895
+ throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
6896
+ }
6897
+ let sharedStrings = [];
6898
+ const ssFile = zip.file("xl/sharedStrings.xml");
6899
+ if (ssFile) {
6900
+ sharedStrings = parseSharedStrings(await ssFile.async("text"));
6901
+ }
6902
+ const sheets = parseWorkbook(await workbookFile.async("text"));
6903
+ if (sheets.length === 0) {
6904
+ throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
6905
+ }
6906
+ logger.log({ level: "debug", event: "progress", message: "\uC2DC\uD2B8 \uBAA9\uB85D \uB85C\uB4DC", meta: { sheets: sheets.length } });
6907
+ let relsMap = /* @__PURE__ */ new Map();
6908
+ const relsFile = zip.file("xl/_rels/workbook.xml.rels");
6909
+ if (relsFile) {
6910
+ relsMap = parseRels(await relsFile.async("text"));
6911
+ }
6912
+ let pageFilter = null;
6913
+ if (options?.pages) {
6914
+ const { parsePageRange: parsePageRange2 } = await import("./page-range-ALIRXAL5.js");
6915
+ pageFilter = parsePageRange2(options.pages, sheets.length);
6664
6916
  }
6665
- try {
6666
- const sheetXml = await sheetFile.async("text");
6667
- const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
6668
- totalCells += maxRow * maxCol;
6669
- if (totalCells > MAX_TOTAL_CELLS) {
6670
- warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
6671
- break;
6917
+ const blocks = [];
6918
+ const processedSheets = Math.min(sheets.length, MAX_SHEETS);
6919
+ let totalCells = 0;
6920
+ for (let i = 0; i < processedSheets; i++) {
6921
+ if (pageFilter && !pageFilter.has(i + 1)) continue;
6922
+ const sheet = sheets[i];
6923
+ options?.onProgress?.(i + 1, processedSheets);
6924
+ let sheetPath = relsMap.get(sheet.rId);
6925
+ if (sheetPath) {
6926
+ if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
6927
+ sheetPath = `xl/${sheetPath}`;
6928
+ } else if (sheetPath.startsWith("/")) {
6929
+ sheetPath = sheetPath.slice(1);
6930
+ }
6931
+ } else {
6932
+ sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
6933
+ }
6934
+ const sheetFile = zip.file(sheetPath);
6935
+ if (!sheetFile) {
6936
+ warnings.push({
6937
+ page: i + 1,
6938
+ message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
6939
+ code: "PARTIAL_PARSE"
6940
+ });
6941
+ continue;
6942
+ }
6943
+ try {
6944
+ const sheetXml = await sheetFile.async("text");
6945
+ const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
6946
+ totalCells += maxRow * maxCol;
6947
+ if (totalCells > MAX_TOTAL_CELLS) {
6948
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
6949
+ break;
6950
+ }
6951
+ const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
6952
+ blocks.push(...sheetBlocks);
6953
+ logger.log({
6954
+ level: "debug",
6955
+ stage: "convert",
6956
+ event: "progress",
6957
+ message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC644\uB8CC",
6958
+ meta: { sheet: sheet.name, index: i + 1, processedSheets }
6959
+ });
6960
+ lastProcessedSheet = i + 1;
6961
+ } catch (err) {
6962
+ warnings.push({
6963
+ page: i + 1,
6964
+ message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
6965
+ code: "PARTIAL_PARSE"
6966
+ });
6967
+ logger.log({
6968
+ level: "warn",
6969
+ stage: "convert",
6970
+ event: "progress",
6971
+ message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC2E4\uD328",
6972
+ meta: { sheet: sheet.name, index: i + 1 },
6973
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
6974
+ });
6672
6975
  }
6673
- const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
6674
- blocks.push(...sheetBlocks);
6675
- } catch (err) {
6676
- warnings.push({
6677
- page: i + 1,
6678
- message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
6679
- code: "PARTIAL_PARSE"
6680
- });
6681
6976
  }
6682
- }
6683
- const metadata = {
6684
- pageCount: processedSheets
6685
- };
6686
- const coreFile = zip.file("docProps/core.xml");
6687
- if (coreFile) {
6688
- try {
6689
- const coreXml = await coreFile.async("text");
6690
- const doc = parseXml(coreXml);
6691
- const getFirst = (tag) => {
6692
- const els = doc.getElementsByTagName(tag);
6693
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
6694
- };
6695
- metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
6696
- metadata.author = getFirst("dc:creator");
6697
- metadata.description = getFirst("dc:description");
6698
- const created = getFirst("dcterms:created");
6699
- if (created) metadata.createdAt = created;
6700
- const modified = getFirst("dcterms:modified");
6701
- if (modified) metadata.modifiedAt = modified;
6702
- } catch {
6977
+ const metadata = {
6978
+ pageCount: processedSheets
6979
+ };
6980
+ const coreFile = zip.file("docProps/core.xml");
6981
+ if (coreFile) {
6982
+ try {
6983
+ const coreXml = await coreFile.async("text");
6984
+ const doc = parseXml(coreXml);
6985
+ const getFirst = (tag) => {
6986
+ const els = doc.getElementsByTagName(tag);
6987
+ return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
6988
+ };
6989
+ metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
6990
+ metadata.author = getFirst("dc:creator");
6991
+ metadata.description = getFirst("dc:description");
6992
+ const created = getFirst("dcterms:created");
6993
+ if (created) metadata.createdAt = created;
6994
+ const modified = getFirst("dcterms:modified");
6995
+ if (modified) metadata.modifiedAt = modified;
6996
+ } catch {
6997
+ }
6703
6998
  }
6999
+ const markdown = blocksToMarkdown(blocks);
7000
+ logger.log({
7001
+ level: "info",
7002
+ stage: "finalize",
7003
+ event: "done",
7004
+ message: "XLSX \uD30C\uC2F1 \uC644\uB8CC",
7005
+ meta: { blocks: blocks.length, warnings: warnings.length, pageCount: processedSheets }
7006
+ });
7007
+ return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
7008
+ } catch (err) {
7009
+ logger.log({
7010
+ level: "error",
7011
+ stage: "finalize",
7012
+ event: "error",
7013
+ message: "XLSX \uD30C\uC2F1 \uC2E4\uD328",
7014
+ meta: { lastProcessedSheet },
7015
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
7016
+ });
7017
+ throw err;
6704
7018
  }
6705
- const markdown = blocksToMarkdown(blocks);
6706
- return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
6707
7019
  }
6708
7020
 
6709
7021
  // src/docx/parser.ts
@@ -7070,95 +7382,120 @@ async function extractImages(zip, rels, doc) {
7070
7382
  return { blocks, images };
7071
7383
  }
7072
7384
  async function parseDocxDocument(buffer, options, existingZip) {
7073
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7074
- const zip = existingZip ?? await JSZip4.loadAsync(buffer);
7075
- const warnings = [];
7076
- const docFile = zip.file("word/document.xml");
7077
- if (!docFile) {
7078
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
7079
- }
7080
- let rels = /* @__PURE__ */ new Map();
7081
- const relsFile = zip.file("word/_rels/document.xml.rels");
7082
- if (relsFile) {
7083
- rels = parseRels2(await relsFile.async("text"));
7084
- }
7085
- let styles = /* @__PURE__ */ new Map();
7086
- const stylesFile = zip.file("word/styles.xml");
7087
- if (stylesFile) {
7088
- try {
7089
- styles = parseStyles(await stylesFile.async("text"));
7090
- } catch {
7385
+ const logger = createLoggerFromEnv().child({ component: "docx/parser.ts", stage: "detect" });
7386
+ logger.log({ level: "info", event: "start", message: "DOCX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
7387
+ let lastProcessedNode = 0;
7388
+ try {
7389
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7390
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
7391
+ const warnings = [];
7392
+ const docFile = zip.file("word/document.xml");
7393
+ if (!docFile) {
7394
+ throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
7395
+ }
7396
+ let rels = /* @__PURE__ */ new Map();
7397
+ const relsFile = zip.file("word/_rels/document.xml.rels");
7398
+ if (relsFile) {
7399
+ rels = parseRels2(await relsFile.async("text"));
7400
+ }
7401
+ let styles = /* @__PURE__ */ new Map();
7402
+ const stylesFile = zip.file("word/styles.xml");
7403
+ if (stylesFile) {
7404
+ try {
7405
+ styles = parseStyles(await stylesFile.async("text"));
7406
+ } catch {
7407
+ }
7091
7408
  }
7092
- }
7093
- let numbering = /* @__PURE__ */ new Map();
7094
- const numFile = zip.file("word/numbering.xml");
7095
- if (numFile) {
7096
- try {
7097
- numbering = parseNumbering(await numFile.async("text"));
7098
- } catch {
7409
+ let numbering = /* @__PURE__ */ new Map();
7410
+ const numFile = zip.file("word/numbering.xml");
7411
+ if (numFile) {
7412
+ try {
7413
+ numbering = parseNumbering(await numFile.async("text"));
7414
+ } catch {
7415
+ }
7099
7416
  }
7100
- }
7101
- let footnotes = /* @__PURE__ */ new Map();
7102
- const fnFile = zip.file("word/footnotes.xml");
7103
- if (fnFile) {
7104
- try {
7105
- footnotes = parseFootnotes(await fnFile.async("text"));
7106
- } catch {
7417
+ let footnotes = /* @__PURE__ */ new Map();
7418
+ const fnFile = zip.file("word/footnotes.xml");
7419
+ if (fnFile) {
7420
+ try {
7421
+ footnotes = parseFootnotes(await fnFile.async("text"));
7422
+ } catch {
7423
+ }
7107
7424
  }
7108
- }
7109
- const docXml = await docFile.async("text");
7110
- const doc = parseXml2(docXml);
7111
- const body = findElements(doc, "body");
7112
- if (body.length === 0) {
7113
- throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
7114
- }
7115
- const blocks = [];
7116
- const bodyEl = body[0];
7117
- const children = bodyEl.childNodes;
7118
- for (let i = 0; i < children.length; i++) {
7119
- const node = children[i];
7120
- if (node.nodeType !== 1) continue;
7121
- const el = node;
7122
- const localName = el.localName ?? el.tagName?.split(":").pop();
7123
- if (localName === "p") {
7124
- const block = parseParagraph(el, styles, numbering, footnotes, rels);
7125
- if (block) blocks.push(block);
7126
- } else if (localName === "tbl") {
7127
- const block = parseTable(el, styles, numbering, footnotes, rels);
7128
- if (block) blocks.push(block);
7129
- }
7130
- }
7131
- const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
7132
- const metadata = {};
7133
- const coreFile = zip.file("docProps/core.xml");
7134
- if (coreFile) {
7135
- try {
7136
- const coreXml = await coreFile.async("text");
7137
- const coreDoc = parseXml2(coreXml);
7138
- const getFirst = (tag) => {
7139
- const els = coreDoc.getElementsByTagName(tag);
7140
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
7141
- };
7142
- metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
7143
- metadata.author = getFirst("dc:creator");
7144
- metadata.description = getFirst("dc:description");
7145
- const created = getFirst("dcterms:created");
7146
- if (created) metadata.createdAt = created;
7147
- const modified = getFirst("dcterms:modified");
7148
- if (modified) metadata.modifiedAt = modified;
7149
- } catch {
7425
+ const docXml = await docFile.async("text");
7426
+ const doc = parseXml2(docXml);
7427
+ const body = findElements(doc, "body");
7428
+ if (body.length === 0) {
7429
+ throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
7150
7430
  }
7431
+ const blocks = [];
7432
+ const bodyEl = body[0];
7433
+ const children = bodyEl.childNodes;
7434
+ for (let i = 0; i < children.length; i++) {
7435
+ const node = children[i];
7436
+ if (node.nodeType !== 1) continue;
7437
+ const el = node;
7438
+ const localName = el.localName ?? el.tagName?.split(":").pop();
7439
+ if (localName === "p") {
7440
+ const block = parseParagraph(el, styles, numbering, footnotes, rels);
7441
+ if (block) blocks.push(block);
7442
+ } else if (localName === "tbl") {
7443
+ const block = parseTable(el, styles, numbering, footnotes, rels);
7444
+ if (block) blocks.push(block);
7445
+ }
7446
+ lastProcessedNode = i + 1;
7447
+ }
7448
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uBCF8\uBB38 \uBE14\uB85D \uD30C\uC2F1 \uC644\uB8CC", meta: { blocks: blocks.length } });
7449
+ const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
7450
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC774\uBBF8\uC9C0 \uCD94\uCD9C \uC644\uB8CC", meta: { imageBlocks: imgBlocks.length, images: images.length } });
7451
+ const metadata = {};
7452
+ const coreFile = zip.file("docProps/core.xml");
7453
+ if (coreFile) {
7454
+ try {
7455
+ const coreXml = await coreFile.async("text");
7456
+ const coreDoc = parseXml2(coreXml);
7457
+ const getFirst = (tag) => {
7458
+ const els = coreDoc.getElementsByTagName(tag);
7459
+ return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
7460
+ };
7461
+ metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
7462
+ metadata.author = getFirst("dc:creator");
7463
+ metadata.description = getFirst("dc:description");
7464
+ const created = getFirst("dcterms:created");
7465
+ if (created) metadata.createdAt = created;
7466
+ const modified = getFirst("dcterms:modified");
7467
+ if (modified) metadata.modifiedAt = modified;
7468
+ } catch {
7469
+ }
7470
+ }
7471
+ const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
7472
+ const markdown = blocksToMarkdown(blocks);
7473
+ logger.log({
7474
+ level: "info",
7475
+ stage: "finalize",
7476
+ event: "done",
7477
+ message: "DOCX \uD30C\uC2F1 \uC644\uB8CC",
7478
+ meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, images: images.length }
7479
+ });
7480
+ return {
7481
+ markdown,
7482
+ blocks,
7483
+ metadata,
7484
+ outline: outline.length > 0 ? outline : void 0,
7485
+ warnings: warnings.length > 0 ? warnings : void 0,
7486
+ images: images.length > 0 ? images : void 0
7487
+ };
7488
+ } catch (err) {
7489
+ logger.log({
7490
+ level: "error",
7491
+ stage: "finalize",
7492
+ event: "error",
7493
+ message: "DOCX \uD30C\uC2F1 \uC2E4\uD328",
7494
+ meta: { lastProcessedNode },
7495
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
7496
+ });
7497
+ throw err;
7151
7498
  }
7152
- const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
7153
- const markdown = blocksToMarkdown(blocks);
7154
- return {
7155
- markdown,
7156
- blocks,
7157
- metadata,
7158
- outline: outline.length > 0 ? outline : void 0,
7159
- warnings: warnings.length > 0 ? warnings : void 0,
7160
- images: images.length > 0 ? images : void 0
7161
- };
7162
7499
  }
7163
7500
 
7164
7501
  // src/form/recognize.ts
@@ -9487,8 +9824,22 @@ async function markdownToXlsx(markdown, options) {
9487
9824
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
9488
9825
  }
9489
9826
 
9827
+ // src/pipeline/unified-ocr.ts
9828
+ import libre from "libreoffice-convert";
9829
+ var libreConvert = libre.convert;
9830
+ var PROOFREAD_PROMPT = [
9831
+ "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
9832
+ "\uADDC\uCE59:",
9833
+ "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
9834
+ "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
9835
+ "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
9836
+ "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
9837
+ ].join("\n");
9838
+
9490
9839
  // src/index.ts
9491
9840
  async function parse2(input, options) {
9841
+ const logger = createLoggerFromEnv().withRun(generateRunId("parse")).child({ component: "index.ts", stage: "detect" });
9842
+ logger.log({ level: "info", event: "start", message: "parse \uD638\uCD9C \uC2DC\uC791" });
9492
9843
  let buffer;
9493
9844
  if (typeof input === "string") {
9494
9845
  try {
@@ -9496,6 +9847,13 @@ async function parse2(input, options) {
9496
9847
  buffer = toArrayBuffer(buf);
9497
9848
  } catch (err) {
9498
9849
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
9850
+ logger.log({
9851
+ level: "error",
9852
+ stage: "detect",
9853
+ event: "error",
9854
+ message: msg,
9855
+ error: { code: "PARSE_ERROR", message: msg, name: err instanceof Error ? err.name : "Error" }
9856
+ });
9499
9857
  return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
9500
9858
  }
9501
9859
  } else if (Buffer.isBuffer(input)) {
@@ -9504,13 +9862,23 @@ async function parse2(input, options) {
9504
9862
  buffer = input;
9505
9863
  }
9506
9864
  if (!buffer || buffer.byteLength === 0) {
9865
+ logger.log({ level: "error", stage: "detect", event: "error", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", error: { code: "EMPTY_INPUT", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", name: "KordocError" } });
9507
9866
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
9508
9867
  }
9509
9868
  const MAX_FILE_SIZE = 500 * 1024 * 1024;
9510
9869
  if (buffer.byteLength > MAX_FILE_SIZE) {
9870
+ logger.log({
9871
+ level: "error",
9872
+ stage: "detect",
9873
+ event: "error",
9874
+ message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC",
9875
+ meta: { size: buffer.byteLength },
9876
+ error: { code: "FILE_TOO_LARGE", message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC", name: "KordocError" }
9877
+ });
9511
9878
  return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
9512
9879
  }
9513
9880
  const format = detectFormat(buffer);
9881
+ logger.log({ level: "info", event: "done", message: "\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC", meta: { format } });
9514
9882
  switch (format) {
9515
9883
  case "hwpx": {
9516
9884
  const { format: zipFormat, zip } = await detectZipFormat(buffer);
@@ -9588,7 +9956,8 @@ async function parseHwpx(buffer, options, zip) {
9588
9956
  const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
9589
9957
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9590
9958
  } catch (err) {
9591
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
9959
+ const normalized = normalizeKordocError(err, "HWPX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
9960
+ return { success: false, fileType: "hwpx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
9592
9961
  }
9593
9962
  }
9594
9963
  async function parseHwp(buffer, options) {
@@ -9596,7 +9965,8 @@ async function parseHwp(buffer, options) {
9596
9965
  const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
9597
9966
  return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9598
9967
  } catch (err) {
9599
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
9968
+ const normalized = normalizeKordocError(err, "HWP \uD30C\uC2F1 \uC2E4\uD328", "finalize");
9969
+ return { success: false, fileType: "hwp", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
9600
9970
  }
9601
9971
  }
9602
9972
  async function parsePdf(buffer, options) {
@@ -9604,8 +9974,15 @@ async function parsePdf(buffer, options) {
9604
9974
  const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
9605
9975
  return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
9606
9976
  } catch (err) {
9977
+ const normalized = normalizeKordocError(err, "PDF \uD30C\uC2F1 \uC2E4\uD328", "finalize");
9607
9978
  const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
9608
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
9979
+ return {
9980
+ success: false,
9981
+ fileType: "pdf",
9982
+ error: normalized.message,
9983
+ code: normalized.code ?? classifyError(normalized),
9984
+ isImageBased
9985
+ };
9609
9986
  }
9610
9987
  }
9611
9988
  async function parseXlsx(buffer, options, zip) {
@@ -9613,7 +9990,8 @@ async function parseXlsx(buffer, options, zip) {
9613
9990
  const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
9614
9991
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
9615
9992
  } catch (err) {
9616
- return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
9993
+ const normalized = normalizeKordocError(err, "XLSX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
9994
+ return { success: false, fileType: "xlsx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
9617
9995
  }
9618
9996
  }
9619
9997
  async function parseDocx(buffer, options, zip) {
@@ -9621,7 +9999,8 @@ async function parseDocx(buffer, options, zip) {
9621
9999
  const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
9622
10000
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9623
10001
  } catch (err) {
9624
- return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10002
+ const normalized = normalizeKordocError(err, "DOCX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
10003
+ return { success: false, fileType: "docx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
9625
10004
  }
9626
10005
  }
9627
10006
 
@@ -9813,4 +10192,4 @@ export {
9813
10192
  cfb/cfb.js:
9814
10193
  (*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
9815
10194
  */
9816
- //# sourceMappingURL=chunk-JGMLDBW5.js.map
10195
+ //# sourceMappingURL=chunk-KJEZPVEK.js.map