aurochs 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/@aurochs/pdf/src/domain/document/index.d.ts +1 -1
- package/dist/@aurochs/pdf/src/domain/document/types.d.ts +34 -0
- package/dist/@aurochs/pdf/src/writer/content-stream/text-operators.d.ts +6 -0
- package/dist/@aurochs/pdf/src/writer/document/font-builder.d.ts +16 -0
- package/dist/@aurochs/pdf/src/writer/document/index.d.ts +2 -1
- package/dist/@aurochs/pdf/src/writer/document/page-builder.d.ts +3 -1
- package/dist/@aurochs/pdf/src/writer/document/tounicode-writer.d.ts +8 -0
- package/dist/_shared/{pdf-parser-CKdfZw6N.js → pdf-parser-Ciztl2kx.js} +84 -6
- package/dist/_shared/pdf-parser-Ciztl2kx.js.map +1 -0
- package/dist/cli.js +474 -15
- package/dist/pdf/builder/index.js +2 -1
- package/dist/pdf/builder/index.js.map +1 -1
- package/dist/pdf/parser/index.js +1 -1
- package/dist/pdf/parser/index.js.map +1 -1
- package/dist/pdf/writer/index.js +391 -10
- package/dist/pdf/writer/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/_shared/pdf-parser-CKdfZw6N.js.map +0 -1
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Exports types for PDF document structure.
|
|
5
5
|
*/
|
|
6
|
-
export type { PdfElement, PdfPage, PdfDocument, PdfEmbeddedFont } from './types';
|
|
6
|
+
export type { PdfElement, PdfPage, PdfDocument, PdfEmbeddedFont, PdfFontToUnicode, PdfFontMetrics } from './types';
|
|
7
7
|
export { PDF_UNITS, isPdfPath, isPdfText, isPdfImage } from './types';
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { PdfImage } from '../image';
|
|
2
2
|
import { PdfPath } from '../path';
|
|
3
3
|
import { PdfText } from '../text';
|
|
4
|
+
import { CIDOrdering } from '../font';
|
|
4
5
|
export type PdfElement = PdfPath | PdfText | PdfImage;
|
|
5
6
|
/**
|
|
6
7
|
* Represents a parsed PDF page.
|
|
@@ -45,6 +46,29 @@ export type PdfPage = {
|
|
|
45
46
|
*/
|
|
46
47
|
readonly elements: readonly PdfElement[];
|
|
47
48
|
};
|
|
49
|
+
/**
|
|
50
|
+
* ToUnicode mapping data for round-trip preservation.
|
|
51
|
+
* Used to reconstruct ToUnicode CMap when writing PDF.
|
|
52
|
+
*/
|
|
53
|
+
export type PdfFontToUnicode = {
|
|
54
|
+
/** Source bytes (hex) → Unicode string mapping. Key is uppercase hex (e.g., "8140" → "ア"). */
|
|
55
|
+
readonly byteMapping: ReadonlyMap<string, string>;
|
|
56
|
+
/** Source code byte lengths from codespace ranges (descending order). */
|
|
57
|
+
readonly sourceCodeByteLengths: readonly number[];
|
|
58
|
+
};
|
|
59
|
+
/**
|
|
60
|
+
* Font metrics for PDF writing.
|
|
61
|
+
*/
|
|
62
|
+
export type PdfFontMetrics = {
|
|
63
|
+
/** Ascender height in 1/1000 em units */
|
|
64
|
+
readonly ascender: number;
|
|
65
|
+
/** Descender depth in 1/1000 em units (negative) */
|
|
66
|
+
readonly descender: number;
|
|
67
|
+
/** Glyph widths: character code → width in 1/1000 em units */
|
|
68
|
+
readonly widths: ReadonlyMap<number, number>;
|
|
69
|
+
/** Default glyph width when not found in widths */
|
|
70
|
+
readonly defaultWidth: number;
|
|
71
|
+
};
|
|
48
72
|
/**
|
|
49
73
|
* Embedded font data extracted from PDF.
|
|
50
74
|
*
|
|
@@ -59,6 +83,16 @@ export type PdfEmbeddedFont = {
|
|
|
59
83
|
readonly data: Uint8Array;
|
|
60
84
|
/** MIME type */
|
|
61
85
|
readonly mimeType: string;
|
|
86
|
+
/** Original BaseFont name from PDF (e.g., "/ZRDQJE+Hiragino-Sans"). Includes subset prefix. */
|
|
87
|
+
readonly baseFontName?: string;
|
|
88
|
+
/** ToUnicode CMap information for round-trip preservation. */
|
|
89
|
+
readonly toUnicode?: PdfFontToUnicode;
|
|
90
|
+
/** Font metrics for accurate text layout. */
|
|
91
|
+
readonly metrics?: PdfFontMetrics;
|
|
92
|
+
/** CID ordering (Japan1, GB1, CNS1, Korea1, Identity). */
|
|
93
|
+
readonly ordering?: CIDOrdering;
|
|
94
|
+
/** Number of bytes per character code (1 for single-byte, 2 for CID fonts). */
|
|
95
|
+
readonly codeByteWidth?: 1 | 2;
|
|
62
96
|
};
|
|
63
97
|
export type PdfDocument = {
|
|
64
98
|
readonly pages: readonly PdfPage[];
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { PdfText } from '../../domain/text';
|
|
2
|
+
import { PdfEmbeddedFont } from '../../domain/document';
|
|
2
3
|
/**
|
|
3
4
|
* Context for text serialization.
|
|
4
5
|
*/
|
|
@@ -8,6 +9,11 @@ export type TextSerializationContext = {
|
|
|
8
9
|
* e.g., "Helvetica" -> "F1"
|
|
9
10
|
*/
|
|
10
11
|
readonly fontNameToResource: ReadonlyMap<string, string>;
|
|
12
|
+
/**
|
|
13
|
+
* Embedded fonts for CID font detection.
|
|
14
|
+
* When present, CID fonts will use hex string output.
|
|
15
|
+
*/
|
|
16
|
+
readonly embeddedFonts?: readonly PdfEmbeddedFont[];
|
|
11
17
|
};
|
|
12
18
|
/**
|
|
13
19
|
* Serialize a PdfText element to PDF content stream operators.
|
|
@@ -20,6 +20,22 @@ export declare function buildType1Font(fontName: string, tracker: PdfObjectTrack
|
|
|
20
20
|
* @returns The font dictionary object number
|
|
21
21
|
*/
|
|
22
22
|
export declare function buildEmbeddedFont(font: PdfEmbeddedFont, tracker: PdfObjectTracker): number;
|
|
23
|
+
/**
|
|
24
|
+
* Build a Type0 CID font with Identity-H encoding.
|
|
25
|
+
*
|
|
26
|
+
* Structure:
|
|
27
|
+
* - Type0 font dictionary (top-level)
|
|
28
|
+
* - CIDFont dictionary (descendant)
|
|
29
|
+
* - FontDescriptor
|
|
30
|
+
* - CIDToGIDMap (Identity)
|
|
31
|
+
* - ToUnicode CMap stream
|
|
32
|
+
* - Embedded font file
|
|
33
|
+
*
|
|
34
|
+
* @param font - The embedded font data with CID information
|
|
35
|
+
* @param tracker - Object tracker for allocation
|
|
36
|
+
* @returns The Type0 font dictionary object number
|
|
37
|
+
*/
|
|
38
|
+
export declare function buildType0Font(font: PdfEmbeddedFont, tracker: PdfObjectTracker): number;
|
|
23
39
|
/**
|
|
24
40
|
* Build fonts for a document.
|
|
25
41
|
* Returns a map of font name to object number.
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
*/
|
|
4
4
|
export { PdfObjectTracker, type PdfObjectEntry, } from './object-tracker';
|
|
5
5
|
export { buildResourceDict, buildEmptyResourceDict, type ResourceRefs, } from './resource-builder';
|
|
6
|
-
export { buildType1Font, buildEmbeddedFont, buildFonts, } from './font-builder';
|
|
6
|
+
export { buildType1Font, buildEmbeddedFont, buildType0Font, buildFonts, } from './font-builder';
|
|
7
|
+
export { generateToUnicodeStream, } from './tounicode-writer';
|
|
7
8
|
export { buildImageXObject, buildImages, } from './image-builder';
|
|
8
9
|
export { buildPage, type PageBuildResult, type BuildPageOptions, } from './page-builder';
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { PdfPage } from '../../domain/document';
|
|
1
|
+
import { PdfPage, PdfEmbeddedFont } from '../../domain/document';
|
|
2
2
|
import { PdfImage } from '../../domain/image';
|
|
3
3
|
import { ResourceRefs } from './resource-builder';
|
|
4
4
|
import { PdfObjectTracker } from './object-tracker';
|
|
@@ -23,6 +23,8 @@ export type BuildPageOptions = {
|
|
|
23
23
|
readonly fontObjMap: ReadonlyMap<string, number>;
|
|
24
24
|
readonly imageObjMap: ReadonlyMap<number, number>;
|
|
25
25
|
readonly tracker: PdfObjectTracker;
|
|
26
|
+
/** Embedded fonts for CID font text serialization. */
|
|
27
|
+
readonly embeddedFonts?: readonly PdfEmbeddedFont[];
|
|
26
28
|
};
|
|
27
29
|
/**
|
|
28
30
|
* Build a page object.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { PdfFontToUnicode } from '../../domain/document';
|
|
2
|
+
/**
|
|
3
|
+
* Generate a ToUnicode CMap stream.
|
|
4
|
+
*
|
|
5
|
+
* @param toUnicode - The ToUnicode mapping data
|
|
6
|
+
* @returns CMap stream content as UTF-8 encoded bytes
|
|
7
|
+
*/
|
|
8
|
+
export declare function generateToUnicodeStream(toUnicode: PdfFontToUnicode): Uint8Array;
|
|
@@ -4062,8 +4062,51 @@ function extractToUnicodeMap(page, fontDict) {
|
|
|
4062
4062
|
}
|
|
4063
4063
|
const decoded = decodePdfStream(stream);
|
|
4064
4064
|
const cmap = new TextDecoder("latin1").decode(decoded);
|
|
4065
|
-
|
|
4066
|
-
|
|
4065
|
+
return parseToUnicodeCMap(cmap);
|
|
4066
|
+
}
|
|
4067
|
+
function extractCIDOrdering(page, fontDict) {
|
|
4068
|
+
const subtypeObj = resolve$8(page, dictGet$e(fontDict, "Subtype"));
|
|
4069
|
+
const subtype = subtypeObj?.type === "name" ? subtypeObj.value : null;
|
|
4070
|
+
if (subtype !== "Type0") {
|
|
4071
|
+
return void 0;
|
|
4072
|
+
}
|
|
4073
|
+
const descendants = resolve$8(page, dictGet$e(fontDict, "DescendantFonts"));
|
|
4074
|
+
const arr = asArray$a(descendants);
|
|
4075
|
+
if (!arr || arr.items.length === 0) {
|
|
4076
|
+
return void 0;
|
|
4077
|
+
}
|
|
4078
|
+
const cidFont = asDict$a(resolve$8(page, arr.items[0]));
|
|
4079
|
+
if (!cidFont) {
|
|
4080
|
+
return void 0;
|
|
4081
|
+
}
|
|
4082
|
+
const cidSystemInfo = resolveDict$4(page, dictGet$e(cidFont, "CIDSystemInfo"));
|
|
4083
|
+
if (!cidSystemInfo) {
|
|
4084
|
+
return void 0;
|
|
4085
|
+
}
|
|
4086
|
+
const orderingObj = resolve$8(page, dictGet$e(cidSystemInfo, "Ordering"));
|
|
4087
|
+
if (!orderingObj) {
|
|
4088
|
+
return void 0;
|
|
4089
|
+
}
|
|
4090
|
+
const ordering = orderingObj.type === "string" ? orderingObj.text : orderingObj.type === "name" ? orderingObj.value : null;
|
|
4091
|
+
if (!ordering) {
|
|
4092
|
+
return void 0;
|
|
4093
|
+
}
|
|
4094
|
+
if (ordering.includes("Japan1")) {
|
|
4095
|
+
return "Japan1";
|
|
4096
|
+
}
|
|
4097
|
+
if (ordering.includes("GB1")) {
|
|
4098
|
+
return "GB1";
|
|
4099
|
+
}
|
|
4100
|
+
if (ordering.includes("CNS1")) {
|
|
4101
|
+
return "CNS1";
|
|
4102
|
+
}
|
|
4103
|
+
if (ordering.includes("Korea1")) {
|
|
4104
|
+
return "Korea1";
|
|
4105
|
+
}
|
|
4106
|
+
if (ordering.includes("Identity")) {
|
|
4107
|
+
return "Identity";
|
|
4108
|
+
}
|
|
4109
|
+
return void 0;
|
|
4067
4110
|
}
|
|
4068
4111
|
function getFontDescriptor(page, fontDict) {
|
|
4069
4112
|
const subtypeObj = resolve$8(page, dictGet$e(fontDict, "Subtype"));
|
|
@@ -4103,11 +4146,20 @@ function normalizeEmbeddedFontData(args) {
|
|
|
4103
4146
|
if (args.format !== "truetype") {
|
|
4104
4147
|
return { data: args.rawData, metrics: void 0 };
|
|
4105
4148
|
}
|
|
4106
|
-
const data = repairFontForWeb(args.rawData, new Map(args.toUnicode ?? []), args.fontFamily);
|
|
4149
|
+
const data = repairFontForWeb(args.rawData, new Map(args.toUnicode?.mapping ?? []), args.fontFamily);
|
|
4107
4150
|
const rawMetrics = extractTrueTypeMetrics(data);
|
|
4108
4151
|
const metrics = rawMetrics ? normalizeMetricsTo1000(rawMetrics) : void 0;
|
|
4109
4152
|
return { data, metrics };
|
|
4110
4153
|
}
|
|
4154
|
+
function buildEmbeddedFontToUnicode(toUnicode) {
|
|
4155
|
+
if (!toUnicode || toUnicode.byteMapping.size === 0) {
|
|
4156
|
+
return void 0;
|
|
4157
|
+
}
|
|
4158
|
+
return {
|
|
4159
|
+
byteMapping: toUnicode.byteMapping,
|
|
4160
|
+
sourceCodeByteLengths: toUnicode.sourceCodeByteLengths
|
|
4161
|
+
};
|
|
4162
|
+
}
|
|
4111
4163
|
function extractEmbeddedFontsFromNativePages(pages) {
|
|
4112
4164
|
const fonts = [];
|
|
4113
4165
|
const seen = /* @__PURE__ */ new Set();
|
|
@@ -4148,13 +4200,18 @@ function extractEmbeddedFontsFromNativePages(pages) {
|
|
|
4148
4200
|
const fontFamily = normalizeFontFamily(baseFontRaw);
|
|
4149
4201
|
const toUnicode = extractToUnicodeMap(page, fontDict);
|
|
4150
4202
|
const { data, metrics } = normalizeEmbeddedFontData({ format, rawData, fontFamily, toUnicode });
|
|
4203
|
+
const ordering = extractCIDOrdering(page, fontDict);
|
|
4204
|
+
const codeByteWidth = toUnicode?.codeByteWidth ?? 1;
|
|
4151
4205
|
fonts.push({
|
|
4152
4206
|
baseFontName: baseFontRaw,
|
|
4153
4207
|
fontFamily,
|
|
4154
4208
|
format,
|
|
4155
4209
|
data,
|
|
4156
4210
|
mimeType,
|
|
4157
|
-
metrics
|
|
4211
|
+
metrics,
|
|
4212
|
+
toUnicode: buildEmbeddedFontToUnicode(toUnicode),
|
|
4213
|
+
ordering,
|
|
4214
|
+
codeByteWidth
|
|
4158
4215
|
});
|
|
4159
4216
|
}
|
|
4160
4217
|
}
|
|
@@ -5753,6 +5810,13 @@ const handleTextNextLine = (ctx, gfxOps) => {
|
|
|
5753
5810
|
}
|
|
5754
5811
|
};
|
|
5755
5812
|
};
|
|
5813
|
+
function rawTextToBytes(rawText) {
|
|
5814
|
+
const bytes = new Uint8Array(rawText.length);
|
|
5815
|
+
for (let i2 = 0; i2 < rawText.length; i2++) {
|
|
5816
|
+
bytes[i2] = rawText.charCodeAt(i2) & 255;
|
|
5817
|
+
}
|
|
5818
|
+
return bytes;
|
|
5819
|
+
}
|
|
5756
5820
|
function createTextRun(text, textState, gfxState) {
|
|
5757
5821
|
const { ctm, textRise, charSpacing, wordSpacing, horizontalScaling, graphicsState } = gfxState;
|
|
5758
5822
|
const { textMatrix, currentFont, currentBaseFont, currentFontInfo, currentFontSize, currentFontMetrics, currentCodeByteWidth } = textState;
|
|
@@ -5790,6 +5854,7 @@ function createTextRun(text, textState, gfxState) {
|
|
|
5790
5854
|
const effectiveFontSize = calculateEffectiveFontSize(currentFontSize, textMatrix, ctm);
|
|
5791
5855
|
const run = {
|
|
5792
5856
|
text,
|
|
5857
|
+
rawBytes: rawTextToBytes(text),
|
|
5793
5858
|
textMatrix,
|
|
5794
5859
|
x: startPos.x,
|
|
5795
5860
|
y: startPos.y,
|
|
@@ -21405,7 +21470,20 @@ function buildEmbeddedFonts(embeddedFontsRaw) {
|
|
|
21405
21470
|
fontFamily: f2.fontFamily,
|
|
21406
21471
|
format: f2.format,
|
|
21407
21472
|
data: f2.data,
|
|
21408
|
-
mimeType: f2.mimeType
|
|
21473
|
+
mimeType: f2.mimeType,
|
|
21474
|
+
baseFontName: f2.baseFontName,
|
|
21475
|
+
toUnicode: f2.toUnicode ? {
|
|
21476
|
+
byteMapping: f2.toUnicode.byteMapping,
|
|
21477
|
+
sourceCodeByteLengths: f2.toUnicode.sourceCodeByteLengths
|
|
21478
|
+
} : void 0,
|
|
21479
|
+
metrics: f2.metrics ? {
|
|
21480
|
+
ascender: f2.metrics.ascender,
|
|
21481
|
+
descender: f2.metrics.descender,
|
|
21482
|
+
widths: f2.metrics.widths ?? /* @__PURE__ */ new Map(),
|
|
21483
|
+
defaultWidth: f2.metrics.defaultWidth ?? 500
|
|
21484
|
+
} : void 0,
|
|
21485
|
+
ordering: f2.ordering,
|
|
21486
|
+
codeByteWidth: f2.codeByteWidth
|
|
21409
21487
|
}));
|
|
21410
21488
|
}
|
|
21411
21489
|
async function parsePageSource({ page, pageNumber, parseOptions, embeddedFontMetrics }) {
|
|
@@ -22023,4 +22101,4 @@ export {
|
|
|
22023
22101
|
calculateTextDisplacement as y,
|
|
22024
22102
|
rasterizeSoftMaskedFillPath as z
|
|
22025
22103
|
};
|
|
22026
|
-
//# sourceMappingURL=pdf-parser-CKdfZw6N.js.map
|
|
22104
|
+
//# sourceMappingURL=pdf-parser-Ciztl2kx.js.map
|