@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Text reconstruction from positioned text fragments.
|
|
4
|
+
*
|
|
5
|
+
* Assembles raw text fragments extracted from PDF content streams into
|
|
6
|
+
* coherent, human-readable text with proper reading order, line breaks,
|
|
7
|
+
* and paragraph detection.
|
|
8
|
+
*
|
|
9
|
+
* Challenges addressed:
|
|
10
|
+
* - PDF text has no semantic structure (only "draw char at (x,y)")
|
|
11
|
+
* - Text fragments may be out of order
|
|
12
|
+
* - Word and line boundaries must be inferred from positions
|
|
13
|
+
* - Columns and tables need proper handling
|
|
14
|
+
* - Different fonts/sizes affect spacing thresholds
|
|
15
|
+
* - Multi-column layouts need column detection
|
|
16
|
+
* - RTL (Arabic, Hebrew) text needs right-to-left sorting
|
|
17
|
+
* - Vertical CJK text needs column-based grouping
|
|
18
|
+
*
|
|
19
|
+
* @see PDF Reference 1.7, Chapter 5 - Text
|
|
20
|
+
*/
|
|
21
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
22
|
+
exports.reconstructText = reconstructText;
|
|
23
|
+
exports.reconstructTextLines = reconstructTextLines;
|
|
24
|
+
// =============================================================================
|
|
25
|
+
// Public API
|
|
26
|
+
// =============================================================================
|
|
27
|
+
/**
 * Turn positioned text fragments into one readable string.
 *
 * Fragments are partitioned on their `isVertical` flag. Horizontal fragments
 * are reconstructed first, vertical fragments second; when both kinds are
 * present the two results are separated by a blank line.
 *
 * @param fragments - Raw text fragments with positions from content stream
 * @returns Reconstructed text, or "" when `fragments` is empty
 */
function reconstructText(fragments) {
    if (fragments.length === 0) {
        return "";
    }
    // Single pass partition; relative order inside each group is preserved.
    const horizontal = [];
    const vertical = [];
    for (const fragment of fragments) {
        if (fragment.isVertical) {
            vertical.push(fragment);
        }
        else {
            horizontal.push(fragment);
        }
    }
    const sections = [];
    if (horizontal.length > 0) {
        // Horizontal text may span several page columns.
        sections.push(reconstructHorizontalText(horizontal));
    }
    if (vertical.length > 0) {
        sections.push(reconstructVerticalText(vertical));
    }
    return sections.join("\n\n");
}
|
|
51
|
+
/**
 * Extract text as structured lines.
 *
 * Horizontal fragments are split into page columns, sorted, and grouped into
 * lines; vertical fragments are grouped into vertical columns, each of which
 * is reported as one "line". Horizontal lines come first in the result.
 *
 * @param fragments - Raw text fragments with positions from content stream
 * @returns Array of `{ text, y, x, fontSize }` records; position and font
 *          size are taken from the first fragment of each line
 */
function reconstructTextLines(fragments) {
    if (fragments.length === 0) {
        return [];
    }
    // Describe a group of fragments by its text and its leading fragment.
    const toRecord = (group, text) => ({
        text,
        y: group[0].y,
        x: group[0].x,
        fontSize: group[0].fontSize
    });
    const horizontal = fragments.filter(f => !f.isVertical);
    const vertical = fragments.filter(f => f.isVertical);
    const records = [];
    if (horizontal.length > 0) {
        detectColumns(horizontal).forEach(column => {
            const lineGroups = groupIntoLines(sortFragments(column));
            lineGroups.forEach(lineGroup => {
                records.push(toRecord(lineGroup, buildLineText(lineGroup)));
            });
        });
    }
    if (vertical.length > 0) {
        groupVerticalIntoColumns(vertical).forEach(verticalColumn => {
            records.push(toRecord(verticalColumn, buildVerticalColumnText(verticalColumn)));
        });
    }
    return records;
}
|
|
92
|
+
// =============================================================================
|
|
93
|
+
// Horizontal Text Reconstruction (with multi-column detection)
|
|
94
|
+
// =============================================================================
|
|
95
|
+
/**
 * Reconstruct horizontal text, honouring multi-column page layouts.
 *
 * Every detected column is sorted, grouped into lines and rendered on its
 * own. Columns that render to an empty string are dropped and the remaining
 * column texts are separated by a blank line. With a single column this
 * reduces to the text of that column.
 *
 * @param fragments - Horizontal text fragments
 * @returns Text of all columns, left to right
 */
function reconstructHorizontalText(fragments) {
    const renderedColumns = [];
    for (const columnFragments of detectColumns(fragments)) {
        const columnLines = groupIntoLines(sortFragments(columnFragments));
        const columnText = buildText(columnLines);
        if (columnText.length > 0) {
            renderedColumns.push(columnText);
        }
    }
    return renderedColumns.join("\n\n");
}
|
|
118
|
+
// =============================================================================
|
|
119
|
+
// Multi-Column Detection
|
|
120
|
+
// =============================================================================
|
|
121
|
+
/**
 * Split fragments into page columns based on gaps between their X midpoints.
 *
 * Steps:
 *  1. Sort the fragment midpoints (x + width / 2) and record every jump
 *     between neighbours that exceeds 4x the median font size.
 *  2. Merge jumps that lie closer than that threshold to each other and keep
 *     only those at least half as wide as the widest one.
 *  3. Reject the split when it looks like a table: fragments are clustered
 *     into rows by Y (tolerance 0.3x the median font size) and, for any
 *     candidate gap, more than half of the rows holding 2+ fragments have
 *     content on both sides of it.
 *  4. Otherwise cut at the centre of each remaining gap.
 *
 * @param fragments - Horizontal text fragments
 * @returns Array of fragment groups, one per detected column, sorted
 *          left-to-right; `[fragments]` when no column split applies
 */
function detectColumns(fragments) {
    // Too few fragments to reliably detect columns
    if (fragments.length < 4) {
        return [fragments];
    }
    const centerOf = f => f.x + f.width / 2;
    const ascending = (a, b) => a - b;
    const orderedCenters = fragments.map(centerOf).sort(ascending);
    // The median font size scales both the gap and the row thresholds.
    const orderedSizes = fragments.map(f => f.fontSize).sort(ascending);
    const medianFontSize = orderedSizes[Math.floor(orderedSizes.length / 2)];
    // A column separator must be far wider than a word space.
    const minColumnGap = medianFontSize * 4;
    const gaps = [];
    orderedCenters.forEach((center, index) => {
        if (index === 0) {
            return;
        }
        const previous = orderedCenters[index - 1];
        if (center - previous > minColumnGap) {
            gaps.push({ start: previous, end: center });
        }
    });
    if (gaps.length === 0) {
        return [fragments];
    }
    // Fuse gaps separated by less than minColumnGap into one wider gap.
    const mergedGaps = [];
    for (const gap of gaps) {
        const tail = mergedGaps[mergedGaps.length - 1];
        if (tail !== undefined && gap.start - tail.end < minColumnGap) {
            tail.end = gap.end;
        }
        else {
            mergedGaps.push({ start: gap.start, end: gap.end });
        }
    }
    // Keep only gaps at least 50% as wide as the widest gap.
    const widestGap = Math.max(...mergedGaps.map(g => g.end - g.start));
    const significantGaps = mergedGaps.filter(g => g.end - g.start >= widestGap * 0.5);
    if (significantGaps.length === 0) {
        return [fragments];
    }
    // Cluster fragments into rows; a fragment joins the first row whose
    // reference Y (that of the row's first fragment) is within tolerance.
    const lineThreshold = medianFontSize * 0.3;
    const rows = [];
    for (const f of fragments) {
        const row = rows.find(r => Math.abs(f.y - r.y) <= lineThreshold);
        if (row !== undefined) {
            row.centers.push(centerOf(f));
        }
        else {
            rows.push({ y: f.y, centers: [centerOf(f)] });
        }
    }
    // Only rows with two or more fragments can straddle a gap.
    const multiFragmentRows = rows.filter(r => r.centers.length >= 2);
    for (const gap of significantGaps) {
        const divider = (gap.start + gap.end) / 2;
        const spanning = multiFragmentRows.filter(r => r.centers.some(c => c < divider) && r.centers.some(c => c > divider)).length;
        // More than half of the rows cross the gap: tabular data, not columns.
        if (multiFragmentRows.length > 0 && spanning / multiFragmentRows.length > 0.5) {
            return [fragments];
        }
    }
    const dividers = significantGaps.map(g => (g.start + g.end) / 2).sort(ascending);
    const columns = Array.from({ length: dividers.length + 1 }, () => []);
    for (const f of fragments) {
        const center = centerOf(f);
        // Index of the column right of the last divider the midpoint exceeds.
        const slot = dividers.reduce((current, edge, d) => (center > edge ? d + 1 : current), 0);
        columns[slot].push(f);
    }
    return columns.filter(column => column.length > 0);
}
|
|
244
|
+
// =============================================================================
|
|
245
|
+
// Fragment Sorting
|
|
246
|
+
// =============================================================================
|
|
247
|
+
/**
 * Return a copy of the fragments sorted into coarse reading order.
 *
 * Fragments whose Y values differ by more than 1 unit are ordered top to
 * bottom (descending Y in PDF coords). Fragments closer than that are treated
 * as being on the same line and ordered by ascending X. Right-to-left
 * ordering is not applied here; sortLineFragments re-sorts each line later.
 *
 * @param fragments - Fragments to order; the input is not mutated
 * @returns A new, sorted array
 */
function sortFragments(fragments) {
    const byReadingOrder = (first, second) => {
        // Higher Y = earlier in reading order (PDF coords)
        const verticalDelta = second.y - first.y;
        return Math.abs(verticalDelta) > 1 ? verticalDelta : first.x - second.x;
    };
    return Array.from(fragments).sort(byReadingOrder);
}
|
|
263
|
+
// =============================================================================
|
|
264
|
+
// Line Grouping
|
|
265
|
+
// =============================================================================
|
|
266
|
+
/**
 * Group pre-sorted fragments into lines by Y position.
 *
 * Each fragment is compared with the FIRST fragment of the line currently
 * being built. It joins that line when the Y distance is within
 * max(0.4 x average font size of the two, 2); otherwise it starts a new line.
 * Every finished line is then ordered with sortLineFragments.
 *
 * @param fragments - Fragments in reading order (see sortFragments)
 * @returns Array of lines, each a non-empty array of fragments
 */
function groupIntoLines(fragments) {
    if (fragments.length === 0) {
        return [];
    }
    const lines = [[fragments[0]]];
    for (const fragment of fragments.slice(1)) {
        const openLine = lines[lines.length - 1];
        // The line's first fragment is the reference for the whole line.
        const anchor = openLine[0];
        const avgFontSize = (anchor.fontSize + fragment.fontSize) / 2;
        const lineThreshold = Math.max(avgFontSize * 0.4, 2);
        if (Math.abs(fragment.y - anchor.y) <= lineThreshold) {
            openLine.push(fragment);
        }
        else {
            lines.push([fragment]);
        }
    }
    lines.forEach(line => sortLineFragments(line));
    return lines;
}
|
|
301
|
+
/**
 * Order the fragments of one line in place, respecting text direction.
 *
 * The line is treated as RTL when strictly more than half of its fragments
 * have `isRtl` set. RTL lines are sorted by descending X (rightmost first),
 * all other lines by ascending X (leftmost first).
 *
 * @param line - Fragments of a single line; mutated in place
 */
function sortLineFragments(line) {
    let rtlFragments = 0;
    for (const fragment of line) {
        if (fragment.isRtl) {
            rtlFragments += 1;
        }
    }
    const rightToLeft = (a, b) => b.x - a.x;
    const leftToRight = (a, b) => a.x - b.x;
    line.sort(rtlFragments > line.length / 2 ? rightToLeft : leftToRight);
}
|
|
323
|
+
// =============================================================================
|
|
324
|
+
// Vertical Text Support
|
|
325
|
+
// =============================================================================
|
|
326
|
+
/**
 * Reconstruct vertical text (WMode=1, typically CJK).
 *
 * Fragments are grouped into vertical columns by groupVerticalIntoColumns;
 * each column is rendered as one line of text and the lines are joined with
 * a single newline, in the order the columns are returned.
 *
 * @param fragments - Vertical text fragments
 * @returns One output line per vertical column
 */
function reconstructVerticalText(fragments) {
    return groupVerticalIntoColumns(fragments)
        .map(column => buildVerticalColumnText(column))
        .join("\n");
}
|
|
340
|
+
/**
 * Group vertical text fragments into columns by X position.
 *
 * Fragments are first ordered by descending X (rightmost first); those
 * within 1 unit of each other in X are ordered by descending Y. A fragment
 * joins the current column when its X is within max(0.6 x average font size,
 * 2) of the column's FIRST fragment; otherwise it opens a new column. Each
 * column is finally sorted by descending Y (top to bottom in PDF coords).
 *
 * @param fragments - Vertical text fragments; the input is not mutated
 * @returns Columns from right to left, each ordered top to bottom
 */
function groupVerticalIntoColumns(fragments) {
    if (fragments.length === 0) {
        return [];
    }
    const topFirst = (a, b) => b.y - a.y;
    const ordered = Array.from(fragments).sort((a, b) => {
        const horizontalDelta = b.x - a.x;
        return Math.abs(horizontalDelta) > 1 ? horizontalDelta : topFirst(a, b);
    });
    const columns = [];
    let openColumn = null;
    for (const fragment of ordered) {
        if (openColumn !== null) {
            // The column's first fragment is the reference for the whole column.
            const anchor = openColumn[0];
            const avgFontSize = (anchor.fontSize + fragment.fontSize) / 2;
            const xThreshold = Math.max(avgFontSize * 0.6, 2);
            if (Math.abs(fragment.x - anchor.x) <= xThreshold) {
                openColumn.push(fragment);
                continue;
            }
        }
        openColumn = [fragment];
        columns.push(openColumn);
    }
    columns.forEach(column => column.sort(topFirst));
    return columns;
}
|
|
381
|
+
/**
 * Concatenate the text of a vertical column's fragments, in the order given,
 * without inserting any separator.
 *
 * @param fragments - Fragments of one vertical column
 * @returns The joined text
 */
function buildVerticalColumnText(fragments) {
    const pieces = fragments.map(({ text }) => text);
    return pieces.join("");
}
|
|
387
|
+
// =============================================================================
|
|
388
|
+
// Text Building
|
|
389
|
+
// =============================================================================
|
|
390
|
+
/**
 * Render grouped lines as text, one output line per input line.
 *
 * An extra empty line (paragraph break) is emitted after a line when the Y
 * distance from its first fragment to the next line's first fragment exceeds
 * 1.8x the average font size of those two fragments.
 *
 * @param lines - Lines as produced by groupIntoLines
 * @returns Lines joined with "\n"
 */
function buildText(lines) {
    const output = [];
    lines.forEach((line, index) => {
        output.push(buildLineText(line));
        if (index + 1 >= lines.length) {
            // Last line: nothing follows, so no paragraph check.
            return;
        }
        const following = lines[index + 1];
        const lineGap = line[0].y - following[0].y;
        const avgFontSize = (line[0].fontSize + following[0].fontSize) / 2;
        if (lineGap > avgFontSize * 1.8) {
            output.push("");
        }
    });
    return output.join("\n");
}
|
|
411
|
+
/**
 * Build text for a single line from fragments given in reading order.
 *
 * Between neighbouring fragments a separator is inferred from the horizontal
 * gap: a tab when the gap exceeds 2x the average font size, a space when it
 * exceeds 15% of it, nothing otherwise. The fragment text is always appended.
 *
 * Gaps are measured in the line's reading direction. A line counts as RTL
 * when more than half of its fragments have `isRtl` set — the same majority
 * rule sortLineFragments uses to order such lines by descending X. On an RTL
 * line the previous fragment sits to the RIGHT of the current one, so the
 * gap is taken from the end of the current fragment to the start of the
 * previous one. Measuring it the LTR way yields only negative gaps there and
 * would never insert a separator between RTL words.
 *
 * @param fragments - Fragments of one line, already in reading order
 * @returns The line text; "" for an empty line
 */
function buildLineText(fragments) {
    if (fragments.length === 0) {
        return "";
    }
    const rtlCount = fragments.filter(f => f.isRtl).length;
    const isRtlLine = rtlCount > fragments.length / 2;
    let text = fragments[0].text;
    for (let i = 1; i < fragments.length; i++) {
        const prev = fragments[i - 1];
        const curr = fragments[i];
        // Geometric neighbours: which fragment is on the left depends on direction.
        const left = isRtlLine ? curr : prev;
        const right = isRtlLine ? prev : curr;
        // Width-based gap between the end of the left fragment and the start of the right one.
        const gap = right.x - (left.x + left.width);
        // Raw distance between fragment start positions — independent of the width estimate,
        // so it stays usable when font widths are slightly off.
        const rawGap = right.x - left.x;
        const avgFontSize = (prev.fontSize + curr.fontSize) / 2;
        const spaceThreshold = avgFontSize * 0.15; // ~15% of font size
        const tabThreshold = avgFontSize * 2; // Large gap = tab/column
        // Estimate the left fragment's width from its character count so the raw
        // check does not depend on potentially inaccurate font widths.
        const expectedCharWidth = avgFontSize * 0.5; // approximate avg char width
        const expectedTextWidth = left.text.length * expectedCharWidth;
        const rawExcess = rawGap - expectedTextWidth;
        let separator = "";
        if (gap > tabThreshold || rawExcess > tabThreshold) {
            separator = "\t";
        }
        else if (gap > spaceThreshold || rawExcess > spaceThreshold) {
            separator = " ";
        }
        else if (gap < -spaceThreshold && rawGap > avgFontSize * 0.5) {
            // Width overestimate: the computed gap is negative, yet the start
            // positions are far enough apart to be separate fragments.
            separator = " ";
        }
        // Overlapping fragments (overprint) are still appended, unseparated.
        text += separator + curr.text;
    }
    return text;
}
|
|
@@ -19,6 +19,12 @@ import { createAbortError, toError } from "../shared/errors.js";
|
|
|
19
19
|
import { measureCentralDirectoryAndEocd, writeCentralDirectoryAndEocdInto } from "./writer-core.js";
|
|
20
20
|
import { buildDataDescriptor, buildDataDescriptorZip64, concatExtraFields, UINT16_MAX, UINT32_MAX, buildLocalFileHeader, VERSION_ZIP64, VERSION_NEEDED, FLAG_ENCRYPTED, FLAG_DATA_DESCRIPTOR, FLAG_UTF8, COMPRESSION_AES, getUnixModeFromExternalAttributes, isSymlinkMode } from "../zip-spec/zip-records.js";
|
|
21
21
|
const SMART_STORE_DECIDE_BYTES = 16 * 1024;
|
|
22
|
+
/** Input batching threshold for push(). Small chunks are accumulated in an
|
|
23
|
+
* internal buffer and flushed to the compression pipeline once this size is
|
|
24
|
+
* reached. 64 KB is twice the 32 KB maximum DEFLATE window and keeps the number
|
|
25
|
+
* of async push() calls — each of which creates a full Promise chain in the
|
|
26
|
+
* browser CompressionStream path — down to a manageable level. */
|
|
27
|
+
const INPUT_BATCH_BYTES = 65536;
|
|
22
28
|
/**
|
|
23
29
|
* True Streaming ZIP File - compresses chunk by chunk
|
|
24
30
|
*/
|
|
@@ -62,6 +68,12 @@ export class ZipDeflateFile {
|
|
|
62
68
|
this._finalQueued = false;
|
|
63
69
|
// Serialize push() calls so callers don't need to await to preserve ordering.
|
|
64
70
|
this._pushChain = Promise.resolve();
|
|
71
|
+
// Input batching: accumulate small chunks before feeding the compression
|
|
72
|
+
// pipeline. This collapses thousands of tiny push() calls (each creating a
|
|
73
|
+
// full async Promise chain on browsers) into a handful of large pushes.
|
|
74
|
+
// Threshold is 64 KB, twice the maximum DEFLATE window size (32 KB).
|
|
75
|
+
this._inputBuf = null;
|
|
76
|
+
this._inputPos = 0;
|
|
65
77
|
// Synchronous compression state for pushSync() path.
|
|
66
78
|
this._syncDeflater = null;
|
|
67
79
|
this._syncZlibReady = false;
|
|
@@ -639,6 +651,47 @@ export class ZipDeflateFile {
|
|
|
639
651
|
}
|
|
640
652
|
return Promise.resolve();
|
|
641
653
|
}
|
|
654
|
+
// --- Async path: batch small chunks to reduce Promise-chain overhead ---
|
|
655
|
+
// Each real push through the async pipeline creates a full Promise chain
|
|
656
|
+
// (push → _pushChain → _pushUnchained → AsyncStreamCodec.writeChain →
|
|
657
|
+
// CompressionStream.writer.write). By accumulating small chunks into a
|
|
658
|
+
// 64 KB buffer we reduce the number of async round-trips by ~100x for
|
|
659
|
+
// typical XML workloads without sacrificing streaming semantics.
|
|
660
|
+
if (!final && data.length > 0 && data.length < INPUT_BATCH_BYTES) {
|
|
661
|
+
// Lazy-allocate the batch buffer.
|
|
662
|
+
if (!this._inputBuf) {
|
|
663
|
+
this._inputBuf = new Uint8Array(INPUT_BATCH_BYTES);
|
|
664
|
+
this._inputPos = 0;
|
|
665
|
+
}
|
|
666
|
+
// If the chunk fits in the remaining space, just copy it in.
|
|
667
|
+
if (this._inputPos + data.length <= INPUT_BATCH_BYTES) {
|
|
668
|
+
this._inputBuf.set(data, this._inputPos);
|
|
669
|
+
this._inputPos += data.length;
|
|
670
|
+
// Not full yet — return resolved promise, no async work.
|
|
671
|
+
callback?.();
|
|
672
|
+
return Promise.resolve();
|
|
673
|
+
}
|
|
674
|
+
// Buffer would overflow — flush everything (buffered + new data) together.
|
|
675
|
+
const combined = new Uint8Array(this._inputPos + data.length);
|
|
676
|
+
combined.set(this._inputBuf.subarray(0, this._inputPos));
|
|
677
|
+
combined.set(data, this._inputPos);
|
|
678
|
+
this._inputPos = 0;
|
|
679
|
+
return this._pushAsync(combined, false, callback);
|
|
680
|
+
}
|
|
681
|
+
// Large chunk or final — flush any buffered data first, then push.
|
|
682
|
+
if (this._inputPos > 0) {
|
|
683
|
+
const flushData = this._inputBuf.slice(0, this._inputPos);
|
|
684
|
+
this._inputPos = 0;
|
|
685
|
+
// Chain: flush buffered → push current
|
|
686
|
+
const flushPromise = this._pushAsync(flushData, false);
|
|
687
|
+
const promise = (this._pushChain = flushPromise.then(() => this._pushUnchained(data, final, callback), () => this._pushUnchained(data, final, callback)));
|
|
688
|
+
promise.catch(() => { });
|
|
689
|
+
return promise;
|
|
690
|
+
}
|
|
691
|
+
return this._pushAsync(data, final, callback);
|
|
692
|
+
}
|
|
693
|
+
/** Enqueue an async push through the _pushChain serialization. */
|
|
694
|
+
_pushAsync(data, final, callback) {
|
|
642
695
|
// Chain the async push so calls are serialized. Use a recovery wrapper
|
|
643
696
|
// so that a single failed push does not break the chain for subsequent
|
|
644
697
|
// pushes — errors are surfaced via onerror/rejectComplete instead.
|