@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,463 @@
1
+ /**
2
+ * Text reconstruction from positioned text fragments.
3
+ *
4
+ * Assembles raw text fragments extracted from PDF content streams into
5
+ * coherent, human-readable text with proper reading order, line breaks,
6
+ * and paragraph detection.
7
+ *
8
+ * Challenges addressed:
9
+ * - PDF text has no semantic structure (only "draw char at (x,y)")
10
+ * - Text fragments may be out of order
11
+ * - Word and line boundaries must be inferred from positions
12
+ * - Columns and tables need proper handling
13
+ * - Different fonts/sizes affect spacing thresholds
14
+ * - Multi-column layouts need column detection
15
+ * - RTL (Arabic, Hebrew) text needs right-to-left sorting
16
+ * - Vertical CJK text needs column-based grouping
17
+ *
18
+ * @see PDF Reference 1.7, Chapter 5 - Text
19
+ */
20
+ // =============================================================================
21
+ // Public API
22
+ // =============================================================================
23
/**
 * Turn positioned text fragments into one readable string.
 *
 * Fragments are partitioned by their `isVertical` flag. The horizontal group
 * is rendered first (via `reconstructHorizontalText`), the vertical group
 * second (via `reconstructVerticalText`); non-empty groups are separated by a
 * blank line.
 *
 * @param fragments - Raw text fragments with positions from a content stream
 * @returns The reconstructed text, or "" when there are no fragments
 */
export function reconstructText(fragments) {
  if (fragments.length === 0) {
    return "";
  }
  // Single pass split instead of two filters; relative order is preserved.
  const horizontalGroup = [];
  const verticalGroup = [];
  fragments.forEach(fragment => {
    if (fragment.isVertical) {
      verticalGroup.push(fragment);
    } else {
      horizontalGroup.push(fragment);
    }
  });
  const sections = [];
  if (horizontalGroup.length > 0) {
    sections.push(reconstructHorizontalText(horizontalGroup));
  }
  if (verticalGroup.length > 0) {
    sections.push(reconstructVerticalText(verticalGroup));
  }
  return sections.join("\n\n");
}
47
/**
 * Extract text as structured line records.
 *
 * Horizontal fragments are split into columns, sorted and grouped into lines;
 * vertical fragments are grouped into vertical columns. Every group yields one
 * record whose `x`, `y` and `fontSize` are copied from the group's first
 * fragment (after the group has been ordered).
 *
 * @param fragments - Raw text fragments with positions from a content stream
 * @returns Array of `{ text, y, x, fontSize }` records; horizontal lines come
 *          first (column by column), followed by vertical columns
 */
export function reconstructTextLines(fragments) {
  if (fragments.length === 0) {
    return [];
  }
  // Build a record from a fragment group and its already-built text.
  const describe = (group, text) => ({
    text,
    y: group[0].y,
    x: group[0].x,
    fontSize: group[0].fontSize
  });
  const horizontalGroup = [];
  const verticalGroup = [];
  fragments.forEach(fragment => {
    (fragment.isVertical ? verticalGroup : horizontalGroup).push(fragment);
  });
  const records = [];
  if (horizontalGroup.length > 0) {
    for (const column of detectColumns(horizontalGroup)) {
      for (const line of groupIntoLines(sortFragments(column))) {
        records.push(describe(line, buildLineText(line)));
      }
    }
  }
  if (verticalGroup.length > 0) {
    for (const column of groupVerticalIntoColumns(verticalGroup)) {
      records.push(describe(column, buildVerticalColumnText(column)));
    }
  }
  return records;
}
88
+ // =============================================================================
89
+ // Horizontal Text Reconstruction (with multi-column detection)
90
+ // =============================================================================
91
/**
 * Reconstruct horizontal text, honouring multi-column layouts.
 *
 * When `detectColumns` reports at most one column, the whole fragment list is
 * sorted, grouped into lines and rendered as a single block. Otherwise each
 * column is rendered independently and the non-empty column texts are joined
 * with a blank line between them.
 *
 * @param fragments - Horizontal text fragments
 * @returns The reconstructed text
 */
function reconstructHorizontalText(fragments) {
  // sort -> group into lines -> render, for one block of fragments
  const renderBlock = block => buildText(groupIntoLines(sortFragments(block)));
  const columns = detectColumns(fragments);
  if (columns.length > 1) {
    return columns
      .map(column => renderBlock(column))
      .filter(columnText => columnText.length > 0)
      .join("\n\n");
  }
  return renderBlock(fragments);
}
114
+ // =============================================================================
115
+ // Multi-Column Detection
116
+ // =============================================================================
117
/**
 * Split fragments into page columns based on gaps between their X midpoints.
 *
 * Steps:
 *  1. Sort the fragment midpoints (`x + width / 2`) and record every gap
 *     between neighbours that exceeds 4x the median font size.
 *  2. Merge gaps that lie closer together than that minimum, then keep only
 *     gaps at least half as wide as the widest one.
 *  3. Table check: bucket fragments into rows by Y (tolerance 0.3x median font
 *     size). If, for any kept gap, more than half of the rows holding two or
 *     more fragments have midpoints on both sides of the gap centre, the
 *     layout is treated as tabular and is not split.
 *  4. Otherwise use the gap centres as dividers and assign each fragment to
 *     the column its midpoint falls in.
 *
 * @param fragments - Horizontal text fragments
 * @returns Array of fragment groups ordered left-to-right; `[fragments]` when
 *          no column split is detected (fewer than 4 fragments, no wide gap,
 *          or tabular data)
 */
function detectColumns(fragments) {
  if (fragments.length < 4) {
    // Too few fragments to reliably detect columns
    return [fragments];
  }
  const centerOf = fragment => fragment.x + fragment.width / 2;
  const ascending = (a, b) => a - b;
  const sortedCenters = fragments.map(fragment => centerOf(fragment)).sort(ascending);
  const sortedSizes = fragments.map(fragment => fragment.fontSize).sort(ascending);
  const medianFontSize = sortedSizes[Math.floor(sortedSizes.length / 2)];
  // A column separator must be much wider than a word space.
  const minColumnGap = medianFontSize * 4;

  // 1. Wide gaps between neighbouring midpoints
  const wideGaps = [];
  sortedCenters.forEach((center, index) => {
    if (index === 0) {
      return;
    }
    const previous = sortedCenters[index - 1];
    const size = center - previous;
    if (size > minColumnGap) {
      wideGaps.push({ start: previous, end: center, size });
    }
  });
  if (wideGaps.length === 0) {
    return [fragments];
  }

  // 2. Merge gaps that sit close to each other, then keep the dominant ones
  const mergedGaps = [];
  for (const candidate of wideGaps) {
    const tail = mergedGaps[mergedGaps.length - 1];
    if (tail !== undefined && candidate.start - tail.end < minColumnGap) {
      tail.end = candidate.end;
    } else {
      mergedGaps.push({ start: candidate.start, end: candidate.end });
    }
  }
  const widthOf = gap => gap.end - gap.start;
  const widestGap = mergedGaps.reduce((widest, gap) => Math.max(widest, widthOf(gap)), -Infinity);
  const significantGaps = mergedGaps.filter(gap => widthOf(gap) >= widestGap * 0.5);
  if (significantGaps.length === 0) {
    return [fragments];
  }

  // 3. Table vs. multi-column: bucket midpoints into rows by Y position
  const rowTolerance = medianFontSize * 0.3;
  const rowAnchors = [];
  const rowCenters = [];
  for (const fragment of fragments) {
    const center = centerOf(fragment);
    const rowIndex = rowAnchors.findIndex(anchorY => Math.abs(fragment.y - anchorY) <= rowTolerance);
    if (rowIndex === -1) {
      rowAnchors.push(fragment.y);
      rowCenters.push([center]);
    } else {
      rowCenters[rowIndex].push(center);
    }
  }
  const multiFragmentRows = rowCenters.filter(row => row.length >= 2);
  const looksTabular = significantGaps.some(gap => {
    const divider = (gap.start + gap.end) / 2;
    const spanningRows = multiFragmentRows.filter(
      row => row.some(x => x < divider) && row.some(x => x > divider)
    ).length;
    // More than 50% of multi-fragment rows crossing the gap means a table
    return multiFragmentRows.length > 0 && spanningRows / multiFragmentRows.length > 0.5;
  });
  if (looksTabular) {
    return [fragments]; // Not a true multi-column layout
  }

  // 4. Assign each fragment to the column its midpoint falls into
  const dividers = significantGaps.map(gap => (gap.start + gap.end) / 2).sort(ascending);
  const columns = Array.from({ length: dividers.length + 1 }, () => []);
  for (const fragment of fragments) {
    const center = centerOf(fragment);
    const columnIndex = dividers.reduce(
      (chosen, divider, position) => (center > divider ? position + 1 : chosen),
      0
    );
    columns[columnIndex].push(fragment);
  }
  return columns.filter(column => column.length > 0);
}
240
+ // =============================================================================
241
+ // Fragment Sorting
242
+ // =============================================================================
243
/**
 * Sort fragments into top-to-bottom reading order.
 *
 * Primary key: Y descending (higher Y is earlier in PDF coordinates).
 * Secondary key: X ascending, used only when two fragments share the exact
 * same Y.
 *
 * The comparator is deliberately a strict total order. A tolerance-based
 * comparison ("treat Y values within 1 unit as equal") is not transitive, and
 * `Array.prototype.sort` has implementation-defined results for inconsistent
 * comparators. Tolerance for baseline jitter and per-line left/right ordering
 * are applied afterwards by `groupIntoLines`, which groups by Y threshold and
 * re-sorts every line by X.
 *
 * @param fragments - Fragments to order; the input array is not modified
 * @returns A new array in reading order
 */
function sortFragments(fragments) {
  return [...fragments].sort((a, b) => {
    if (a.y !== b.y) {
      // Higher Y first (PDF coordinate system has its origin at the bottom)
      return b.y - a.y;
    }
    // Identical baseline: left-to-right
    return a.x - b.x;
  });
}
259
+ // =============================================================================
260
+ // Line Grouping
261
+ // =============================================================================
262
/**
 * Partition an ordered fragment list into lines by Y position.
 *
 * Fragments are consumed in the given order. Each one is compared with the
 * FIRST fragment of the line currently being built: if their Y values differ
 * by no more than `max(0.4 * average font size of the two, 2)` it joins that
 * line, otherwise it starts a new one. Afterwards every line is ordered in
 * place by `sortLineFragments`.
 *
 * @param fragments - Fragments, expected to be pre-sorted top-to-bottom
 * @returns Array of lines, each an array of fragments
 */
function groupIntoLines(fragments) {
  const lines = [];
  for (const fragment of fragments) {
    const openLine = lines[lines.length - 1];
    if (openLine !== undefined) {
      // The line's first fragment is the reference for the whole line.
      const anchor = openLine[0];
      const meanFontSize = (anchor.fontSize + fragment.fontSize) / 2;
      const tolerance = Math.max(meanFontSize * 0.4, 2);
      if (Math.abs(fragment.y - anchor.y) <= tolerance) {
        openLine.push(fragment);
        continue;
      }
    }
    // First fragment overall, or too far from the open line: start a new line
    lines.push([fragment]);
  }
  lines.forEach(line => {
    sortLineFragments(line);
  });
  return lines;
}
297
/**
 * Order the fragments of one line in place according to its text direction.
 *
 * A line counts as RTL when strictly more than half of its fragments have a
 * truthy `isRtl` flag. RTL lines are ordered by descending X (rightmost
 * first); every other line is ordered by ascending X (leftmost first).
 *
 * @param line - Fragments of a single line; mutated by the sort
 * @returns Nothing; the array is reordered in place
 */
function sortLineFragments(line) {
  let rtlFragments = 0;
  line.forEach(fragment => {
    if (fragment.isRtl) {
      rtlFragments += 1;
    }
  });
  const rightToLeft = rtlFragments > line.length / 2;
  const byPosition = rightToLeft
    ? (a, b) => b.x - a.x // rightmost first
    : (a, b) => a.x - b.x; // leftmost first
  line.sort(byPosition);
}
319
+ // =============================================================================
320
+ // Vertical Text Support
321
+ // =============================================================================
322
/**
 * Reconstruct vertical text (the file header associates this with WMode=1,
 * typically CJK).
 *
 * Fragments are grouped into vertical columns by `groupVerticalIntoColumns`;
 * each column is rendered with `buildVerticalColumnText` and the columns are
 * emitted one per output line.
 *
 * @param fragments - Vertical text fragments
 * @returns Column texts joined with "\n"
 */
function reconstructVerticalText(fragments) {
  return groupVerticalIntoColumns(fragments)
    .map(column => buildVerticalColumnText(column))
    .join("\n");
}
336
/**
 * Group vertical text fragments into columns by X position.
 *
 * Fragments are first ordered by X descending (rightmost column first), with
 * Y descending as a tie-break for identical X. They are then scanned in that
 * order: a fragment joins the current column when its X is within
 * `max(0.6 * average font size, 2)` of the column's first fragment, otherwise
 * it opens a new column. Finally each column is ordered top-to-bottom
 * (Y descending in PDF coordinates).
 *
 * The initial sort uses a strict total order. Comparing X values with a
 * tolerance inside the comparator is not transitive and leaves the sort
 * result implementation-defined; the tolerance is applied by the grouping
 * step instead.
 *
 * @param fragments - Vertical text fragments; the input array is not modified
 * @returns Array of columns (rightmost first), each an array of fragments
 *          ordered top-to-bottom
 */
function groupVerticalIntoColumns(fragments) {
  if (fragments.length === 0) {
    return [];
  }
  const sorted = [...fragments].sort((a, b) => {
    if (a.x !== b.x) {
      return b.x - a.x; // rightmost first
    }
    return b.y - a.y; // same X: top first
  });
  const columns = [];
  let currentCol = [sorted[0]];
  for (let i = 1; i < sorted.length; i++) {
    const fragment = sorted[i];
    // The column's first (rightmost) fragment anchors the whole column.
    const anchor = currentCol[0];
    const avgFontSize = (anchor.fontSize + fragment.fontSize) / 2;
    const xThreshold = Math.max(avgFontSize * 0.6, 2);
    if (Math.abs(fragment.x - anchor.x) <= xThreshold) {
      currentCol.push(fragment);
    } else {
      columns.push(currentCol);
      currentCol = [fragment];
    }
  }
  columns.push(currentCol);
  // Within each column, read top to bottom (Y descending in PDF coords)
  for (const col of columns) {
    col.sort((a, b) => b.y - a.y);
  }
  return columns;
}
377
/**
 * Build the text of one vertical column by concatenating the `text` of its
 * fragments in the given order, with no separator.
 *
 * @param fragments - Fragments of a single column, already in reading order
 * @returns The concatenated column text
 */
function buildVerticalColumnText(fragments) {
  const pieces = fragments.map(({ text }) => text);
  return pieces.join("");
}
383
+ // =============================================================================
384
+ // Text Building
385
+ // =============================================================================
386
/**
 * Render grouped lines as text, one output line per group.
 *
 * Between two consecutive lines an extra empty line (paragraph break) is
 * emitted when the Y distance between their first fragments exceeds 1.8x the
 * average font size of those two fragments.
 *
 * @param lines - Array of lines, each a non-empty array of fragments
 * @returns Line texts joined with "\n"
 */
function buildText(lines) {
  const output = [];
  for (const [index, line] of lines.entries()) {
    output.push(buildLineText(line));
    if (index + 1 >= lines.length) {
      continue; // last line: nothing to compare against
    }
    const head = line[0];
    const nextHead = lines[index + 1][0];
    const verticalGap = head.y - nextHead.y;
    const meanFontSize = (head.fontSize + nextHead.fontSize) / 2;
    // Gap clearly larger than a normal line advance -> paragraph break
    if (verticalGap > meanFontSize * 1.8) {
      output.push("");
    }
  }
  return output.join("\n");
}
407
/**
 * Build the text of a single line, inferring spaces and tabs from the
 * horizontal gaps between consecutive fragments.
 *
 * Fragments are concatenated in the given order. For each neighbouring pair
 * the gap is measured from the geometrically LEFT fragment to the RIGHT one,
 * so the same thresholds work for lines ordered by ascending X (LTR) and for
 * lines ordered by descending X (RTL lines as produced by
 * `sortLineFragments`). Measuring only `curr.x - (prev.x + prev.width)` would
 * always be negative on a descending-X line and never produce a separator.
 *
 * Separator rules (fontSize = average of the two fragments):
 *  - tab   when the width-based gap, or the start-to-start distance minus an
 *          estimated text width (0.5 * fontSize per character of the left
 *          fragment), exceeds 2 * fontSize
 *  - space when either of those exceeds 0.15 * fontSize
 *  - space when the width-based gap is clearly negative (width overestimate)
 *          but the fragment starts are more than 0.5 * fontSize apart
 *  - otherwise nothing; overlapping fragments are appended as-is
 *
 * @param fragments - Fragments of one line, already in reading order
 * @returns The line text, or "" for an empty line
 */
function buildLineText(fragments) {
  if (fragments.length === 0) {
    return "";
  }
  let text = fragments[0].text;
  for (let i = 1; i < fragments.length; i++) {
    const prev = fragments[i - 1];
    const curr = fragments[i];
    // On a descending-X (RTL) line the current fragment lies to the left of
    // the previous one; pick left/right geometrically so the gap stays positive.
    const runsRightToLeft = curr.x < prev.x;
    const left = runsRightToLeft ? curr : prev;
    const right = runsRightToLeft ? prev : curr;
    // Distance from the end of the left fragment to the start of the right one
    const gap = right.x - (left.x + left.width);
    // Start-to-start distance: independent of the (possibly inaccurate) width
    const rawGap = right.x - left.x;
    const avgFontSize = (prev.fontSize + curr.fontSize) / 2;
    const spaceThreshold = avgFontSize * 0.15; // ~15% of font size
    const tabThreshold = avgFontSize * 2; // Large gap = tab/column
    // Estimated width of the left fragment from its character count
    const expectedCharWidth = avgFontSize * 0.5;
    const rawExcess = rawGap - left.text.length * expectedCharWidth;
    if (gap > tabThreshold || rawExcess > tabThreshold) {
      text += "\t";
    } else if (gap > spaceThreshold || rawExcess > spaceThreshold) {
      text += " ";
    } else if (gap < -spaceThreshold && rawGap > 0 && rawGap > avgFontSize * 0.5) {
      // Width overestimate: the starts are clearly apart even though the
      // width-based gap is negative, so these are separate words.
      text += " ";
    }
    // Overlapping fragments (e.g. overprint) are appended unchanged.
    text += curr.text;
  }
  return text;
}