aurochs 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,5 +3,5 @@
3
3
  *
4
4
  * Exports types for PDF document structure.
5
5
  */
6
- export type { PdfElement, PdfPage, PdfDocument, PdfEmbeddedFont } from './types';
6
+ export type { PdfElement, PdfPage, PdfDocument, PdfEmbeddedFont, PdfFontToUnicode, PdfFontMetrics } from './types';
7
7
  export { PDF_UNITS, isPdfPath, isPdfText, isPdfImage } from './types';
@@ -1,6 +1,7 @@
1
1
  import { PdfImage } from '../image';
2
2
  import { PdfPath } from '../path';
3
3
  import { PdfText } from '../text';
4
+ import { CIDOrdering } from '../font';
4
5
  export type PdfElement = PdfPath | PdfText | PdfImage;
5
6
  /**
6
7
  * Represents a parsed PDF page.
@@ -45,6 +46,29 @@ export type PdfPage = {
45
46
  */
46
47
  readonly elements: readonly PdfElement[];
47
48
  };
49
+ /**
50
+ * ToUnicode mapping data for round-trip preservation.
51
+ * Used to reconstruct ToUnicode CMap when writing PDF.
52
+ */
53
+ export type PdfFontToUnicode = {
54
+ /** Source bytes (hex) → Unicode string mapping. Key is uppercase hex (e.g., "8140" → "ア"). */
55
+ readonly byteMapping: ReadonlyMap<string, string>;
56
+ /** Source code byte lengths from codespace ranges (descending order). */
57
+ readonly sourceCodeByteLengths: readonly number[];
58
+ };
59
+ /**
60
+ * Font metrics for PDF writing.
61
+ */
62
+ export type PdfFontMetrics = {
63
+ /** Ascender height in 1/1000 em units */
64
+ readonly ascender: number;
65
+ /** Descender depth in 1/1000 em units (negative) */
66
+ readonly descender: number;
67
+ /** Glyph widths: character code → width in 1/1000 em units */
68
+ readonly widths: ReadonlyMap<number, number>;
69
+ /** Default glyph width when not found in widths */
70
+ readonly defaultWidth: number;
71
+ };
48
72
  /**
49
73
  * Embedded font data extracted from PDF.
50
74
  *
@@ -59,6 +83,16 @@ export type PdfEmbeddedFont = {
59
83
  readonly data: Uint8Array;
60
84
  /** MIME type */
61
85
  readonly mimeType: string;
86
+ /** Original BaseFont name from PDF (e.g., "/ZRDQJE+Hiragino-Sans"). Includes subset prefix. */
87
+ readonly baseFontName?: string;
88
+ /** ToUnicode CMap information for round-trip preservation. */
89
+ readonly toUnicode?: PdfFontToUnicode;
90
+ /** Font metrics for accurate text layout. */
91
+ readonly metrics?: PdfFontMetrics;
92
+ /** CID ordering (Japan1, GB1, CNS1, Korea1, Identity). */
93
+ readonly ordering?: CIDOrdering;
94
+ /** Number of bytes per character code (1 for single-byte, 2 for CID fonts). */
95
+ readonly codeByteWidth?: 1 | 2;
62
96
  };
63
97
  export type PdfDocument = {
64
98
  readonly pages: readonly PdfPage[];
@@ -1,4 +1,5 @@
1
1
  import { PdfText } from '../../domain/text';
2
+ import { PdfEmbeddedFont } from '../../domain/document';
2
3
  /**
3
4
  * Context for text serialization.
4
5
  */
@@ -8,6 +9,11 @@ export type TextSerializationContext = {
8
9
  * e.g., "Helvetica" -> "F1"
9
10
  */
10
11
  readonly fontNameToResource: ReadonlyMap<string, string>;
12
+ /**
13
+ * Embedded fonts for CID font detection.
14
+ * When present, CID fonts will use hex string output.
15
+ */
16
+ readonly embeddedFonts?: readonly PdfEmbeddedFont[];
11
17
  };
12
18
  /**
13
19
  * Serialize a PdfText element to PDF content stream operators.
@@ -20,6 +20,22 @@ export declare function buildType1Font(fontName: string, tracker: PdfObjectTrack
20
20
  * @returns The font dictionary object number
21
21
  */
22
22
  export declare function buildEmbeddedFont(font: PdfEmbeddedFont, tracker: PdfObjectTracker): number;
23
+ /**
24
+ * Build a Type0 CID font with Identity-H encoding.
25
+ *
26
+ * Structure:
27
+ * - Type0 font dictionary (top-level)
28
+ * - CIDFont dictionary (descendant)
29
+ * - FontDescriptor
30
+ * - CIDToGIDMap (Identity)
31
+ * - ToUnicode CMap stream
32
+ * - Embedded font file
33
+ *
34
+ * @param font - The embedded font data with CID information
35
+ * @param tracker - Object tracker for allocation
36
+ * @returns The Type0 font dictionary object number
37
+ */
38
+ export declare function buildType0Font(font: PdfEmbeddedFont, tracker: PdfObjectTracker): number;
23
39
  /**
24
40
  * Build fonts for a document.
25
41
  * Returns a map of font name to object number.
@@ -3,6 +3,7 @@
3
3
  */
4
4
  export { PdfObjectTracker, type PdfObjectEntry, } from './object-tracker';
5
5
  export { buildResourceDict, buildEmptyResourceDict, type ResourceRefs, } from './resource-builder';
6
- export { buildType1Font, buildEmbeddedFont, buildFonts, } from './font-builder';
6
+ export { buildType1Font, buildEmbeddedFont, buildType0Font, buildFonts, } from './font-builder';
7
+ export { generateToUnicodeStream, } from './tounicode-writer';
7
8
  export { buildImageXObject, buildImages, } from './image-builder';
8
9
  export { buildPage, type PageBuildResult, type BuildPageOptions, } from './page-builder';
@@ -1,4 +1,4 @@
1
- import { PdfPage } from '../../domain/document';
1
+ import { PdfPage, PdfEmbeddedFont } from '../../domain/document';
2
2
  import { PdfImage } from '../../domain/image';
3
3
  import { ResourceRefs } from './resource-builder';
4
4
  import { PdfObjectTracker } from './object-tracker';
@@ -23,6 +23,8 @@ export type BuildPageOptions = {
23
23
  readonly fontObjMap: ReadonlyMap<string, number>;
24
24
  readonly imageObjMap: ReadonlyMap<number, number>;
25
25
  readonly tracker: PdfObjectTracker;
26
+ /** Embedded fonts for CID font text serialization. */
27
+ readonly embeddedFonts?: readonly PdfEmbeddedFont[];
26
28
  };
27
29
  /**
28
30
  * Build a page object.
@@ -0,0 +1,8 @@
1
+ import { PdfFontToUnicode } from '../../domain/document';
2
+ /**
3
+ * Generate a ToUnicode CMap stream.
4
+ *
5
+ * @param toUnicode - The ToUnicode mapping data
6
+ * @returns CMap stream content as UTF-8 encoded bytes
7
+ */
8
+ export declare function generateToUnicodeStream(toUnicode: PdfFontToUnicode): Uint8Array;
@@ -4062,8 +4062,51 @@ function extractToUnicodeMap(page, fontDict) {
4062
4062
  }
4063
4063
  const decoded = decodePdfStream(stream);
4064
4064
  const cmap = new TextDecoder("latin1").decode(decoded);
4065
- const parsed = parseToUnicodeCMap(cmap);
4066
- return parsed.mapping;
4065
+ return parseToUnicodeCMap(cmap);
4066
+ }
4067
+ function extractCIDOrdering(page, fontDict) {
4068
+ const subtypeObj = resolve$8(page, dictGet$e(fontDict, "Subtype"));
4069
+ const subtype = subtypeObj?.type === "name" ? subtypeObj.value : null;
4070
+ if (subtype !== "Type0") {
4071
+ return void 0;
4072
+ }
4073
+ const descendants = resolve$8(page, dictGet$e(fontDict, "DescendantFonts"));
4074
+ const arr = asArray$a(descendants);
4075
+ if (!arr || arr.items.length === 0) {
4076
+ return void 0;
4077
+ }
4078
+ const cidFont = asDict$a(resolve$8(page, arr.items[0]));
4079
+ if (!cidFont) {
4080
+ return void 0;
4081
+ }
4082
+ const cidSystemInfo = resolveDict$4(page, dictGet$e(cidFont, "CIDSystemInfo"));
4083
+ if (!cidSystemInfo) {
4084
+ return void 0;
4085
+ }
4086
+ const orderingObj = resolve$8(page, dictGet$e(cidSystemInfo, "Ordering"));
4087
+ if (!orderingObj) {
4088
+ return void 0;
4089
+ }
4090
+ const ordering = orderingObj.type === "string" ? orderingObj.text : orderingObj.type === "name" ? orderingObj.value : null;
4091
+ if (!ordering) {
4092
+ return void 0;
4093
+ }
4094
+ if (ordering.includes("Japan1")) {
4095
+ return "Japan1";
4096
+ }
4097
+ if (ordering.includes("GB1")) {
4098
+ return "GB1";
4099
+ }
4100
+ if (ordering.includes("CNS1")) {
4101
+ return "CNS1";
4102
+ }
4103
+ if (ordering.includes("Korea1")) {
4104
+ return "Korea1";
4105
+ }
4106
+ if (ordering.includes("Identity")) {
4107
+ return "Identity";
4108
+ }
4109
+ return void 0;
4067
4110
  }
4068
4111
  function getFontDescriptor(page, fontDict) {
4069
4112
  const subtypeObj = resolve$8(page, dictGet$e(fontDict, "Subtype"));
@@ -4103,11 +4146,20 @@ function normalizeEmbeddedFontData(args) {
4103
4146
  if (args.format !== "truetype") {
4104
4147
  return { data: args.rawData, metrics: void 0 };
4105
4148
  }
4106
- const data = repairFontForWeb(args.rawData, new Map(args.toUnicode ?? []), args.fontFamily);
4149
+ const data = repairFontForWeb(args.rawData, new Map(args.toUnicode?.mapping ?? []), args.fontFamily);
4107
4150
  const rawMetrics = extractTrueTypeMetrics(data);
4108
4151
  const metrics = rawMetrics ? normalizeMetricsTo1000(rawMetrics) : void 0;
4109
4152
  return { data, metrics };
4110
4153
  }
4154
+ function buildEmbeddedFontToUnicode(toUnicode) {
4155
+ if (!toUnicode || toUnicode.byteMapping.size === 0) {
4156
+ return void 0;
4157
+ }
4158
+ return {
4159
+ byteMapping: toUnicode.byteMapping,
4160
+ sourceCodeByteLengths: toUnicode.sourceCodeByteLengths
4161
+ };
4162
+ }
4111
4163
  function extractEmbeddedFontsFromNativePages(pages) {
4112
4164
  const fonts = [];
4113
4165
  const seen = /* @__PURE__ */ new Set();
@@ -4148,13 +4200,18 @@ function extractEmbeddedFontsFromNativePages(pages) {
4148
4200
  const fontFamily = normalizeFontFamily(baseFontRaw);
4149
4201
  const toUnicode = extractToUnicodeMap(page, fontDict);
4150
4202
  const { data, metrics } = normalizeEmbeddedFontData({ format, rawData, fontFamily, toUnicode });
4203
+ const ordering = extractCIDOrdering(page, fontDict);
4204
+ const codeByteWidth = toUnicode?.codeByteWidth ?? 1;
4151
4205
  fonts.push({
4152
4206
  baseFontName: baseFontRaw,
4153
4207
  fontFamily,
4154
4208
  format,
4155
4209
  data,
4156
4210
  mimeType,
4157
- metrics
4211
+ metrics,
4212
+ toUnicode: buildEmbeddedFontToUnicode(toUnicode),
4213
+ ordering,
4214
+ codeByteWidth
4158
4215
  });
4159
4216
  }
4160
4217
  }
@@ -5753,6 +5810,13 @@ const handleTextNextLine = (ctx, gfxOps) => {
5753
5810
  }
5754
5811
  };
5755
5812
  };
5813
+ function rawTextToBytes(rawText) {
5814
+ const bytes = new Uint8Array(rawText.length);
5815
+ for (let i2 = 0; i2 < rawText.length; i2++) {
5816
+ bytes[i2] = rawText.charCodeAt(i2) & 255;
5817
+ }
5818
+ return bytes;
5819
+ }
5756
5820
  function createTextRun(text, textState, gfxState) {
5757
5821
  const { ctm, textRise, charSpacing, wordSpacing, horizontalScaling, graphicsState } = gfxState;
5758
5822
  const { textMatrix, currentFont, currentBaseFont, currentFontInfo, currentFontSize, currentFontMetrics, currentCodeByteWidth } = textState;
@@ -5790,6 +5854,7 @@ function createTextRun(text, textState, gfxState) {
5790
5854
  const effectiveFontSize = calculateEffectiveFontSize(currentFontSize, textMatrix, ctm);
5791
5855
  const run = {
5792
5856
  text,
5857
+ rawBytes: rawTextToBytes(text),
5793
5858
  textMatrix,
5794
5859
  x: startPos.x,
5795
5860
  y: startPos.y,
@@ -21405,7 +21470,20 @@ function buildEmbeddedFonts(embeddedFontsRaw) {
21405
21470
  fontFamily: f2.fontFamily,
21406
21471
  format: f2.format,
21407
21472
  data: f2.data,
21408
- mimeType: f2.mimeType
21473
+ mimeType: f2.mimeType,
21474
+ baseFontName: f2.baseFontName,
21475
+ toUnicode: f2.toUnicode ? {
21476
+ byteMapping: f2.toUnicode.byteMapping,
21477
+ sourceCodeByteLengths: f2.toUnicode.sourceCodeByteLengths
21478
+ } : void 0,
21479
+ metrics: f2.metrics ? {
21480
+ ascender: f2.metrics.ascender,
21481
+ descender: f2.metrics.descender,
21482
+ widths: f2.metrics.widths ?? /* @__PURE__ */ new Map(),
21483
+ defaultWidth: f2.metrics.defaultWidth ?? 500
21484
+ } : void 0,
21485
+ ordering: f2.ordering,
21486
+ codeByteWidth: f2.codeByteWidth
21409
21487
  }));
21410
21488
  }
21411
21489
  async function parsePageSource({ page, pageNumber, parseOptions, embeddedFontMetrics }) {
@@ -22023,4 +22101,4 @@ export {
22023
22101
  calculateTextDisplacement as y,
22024
22102
  rasterizeSoftMaskedFillPath as z
22025
22103
  };
22026
- //# sourceMappingURL=pdf-parser-CKdfZw6N.js.map
22104
+ //# sourceMappingURL=pdf-parser-Ciztl2kx.js.map