pretext-pdf 1.0.8 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +945 -869
  2. package/README.md +1 -1
  3. package/UPSTREAM.md +112 -0
  4. package/dist/cli.js +19 -19
  5. package/dist/measure-text.d.ts +1 -1
  6. package/dist/measure-text.d.ts.map +1 -1
  7. package/dist/measure-text.js +1 -1
  8. package/dist/measure-text.js.map +1 -1
  9. package/dist/rich-text.js +1 -1
  10. package/dist/rich-text.js.map +1 -1
  11. package/dist/vendor/pretext/analysis.d.ts +35 -0
  12. package/dist/vendor/pretext/analysis.d.ts.map +1 -0
  13. package/dist/vendor/pretext/analysis.js +1162 -0
  14. package/dist/vendor/pretext/analysis.js.map +1 -0
  15. package/dist/vendor/pretext/bidi.d.ts +2 -0
  16. package/dist/vendor/pretext/bidi.d.ts.map +1 -0
  17. package/dist/vendor/pretext/bidi.js +176 -0
  18. package/dist/vendor/pretext/bidi.js.map +1 -0
  19. package/dist/vendor/pretext/generated/bidi-data.d.ts +5 -0
  20. package/dist/vendor/pretext/generated/bidi-data.d.ts.map +1 -0
  21. package/dist/vendor/pretext/generated/bidi-data.js +980 -0
  22. package/dist/vendor/pretext/generated/bidi-data.js.map +1 -0
  23. package/dist/vendor/pretext/layout.d.ts +75 -0
  24. package/dist/vendor/pretext/layout.d.ts.map +1 -0
  25. package/dist/vendor/pretext/layout.js +448 -0
  26. package/dist/vendor/pretext/layout.js.map +1 -0
  27. package/dist/vendor/pretext/line-break.d.ts +43 -0
  28. package/dist/vendor/pretext/line-break.d.ts.map +1 -0
  29. package/dist/vendor/pretext/line-break.js +908 -0
  30. package/dist/vendor/pretext/line-break.js.map +1 -0
  31. package/dist/vendor/pretext/line-text.d.ts +5 -0
  32. package/dist/vendor/pretext/line-text.d.ts.map +1 -0
  33. package/dist/vendor/pretext/line-text.js +69 -0
  34. package/dist/vendor/pretext/line-text.js.map +1 -0
  35. package/dist/vendor/pretext/measurement.d.ts +31 -0
  36. package/dist/vendor/pretext/measurement.d.ts.map +1 -0
  37. package/dist/vendor/pretext/measurement.js +251 -0
  38. package/dist/vendor/pretext/measurement.js.map +1 -0
  39. package/dist/vendor/pretext/rich-inline.d.ts +53 -0
  40. package/dist/vendor/pretext/rich-inline.d.ts.map +1 -0
  41. package/dist/vendor/pretext/rich-inline.js +327 -0
  42. package/dist/vendor/pretext/rich-inline.js.map +1 -0
  43. package/package.json +212 -208
@@ -0,0 +1,1162 @@
1
+ const collapsibleWhitespaceRunRe = /[ \t\n\r\f]+/g;
2
+ const needsWhitespaceNormalizationRe = /[\t\n\r\f]| {2,}|^ | $/;
3
/**
 * Translate a `white-space` mode into the flags the analysis passes consult.
 *
 * @param {string | null | undefined} whiteSpace - Requested mode; nullish means 'normal'.
 * @returns {{ mode: string, preserveOrdinarySpaces: boolean, preserveHardBreaks: boolean }}
 *   Both preserve flags are true only for 'pre-wrap'.
 */
function getWhiteSpaceProfile(whiteSpace) {
    const mode = whiteSpace ?? 'normal';
    const preserves = mode === 'pre-wrap';
    return {
        mode,
        preserveOrdinarySpaces: preserves,
        preserveHardBreaks: preserves,
    };
}
9
/**
 * Collapse whitespace the way `white-space: normal` does: every run of
 * space/tab/newline/CR/FF becomes one space, and a single leading and a
 * single trailing space are dropped.
 *
 * @param {string} text - Raw input text.
 * @returns {string} The normalized text; the input itself when the quick
 *   pre-check finds nothing to normalize.
 */
export function normalizeWhitespaceNormal(text) {
    // Fast path: no collapsible run and no edge space means nothing to do.
    if (!needsWhitespaceNormalizationRe.test(text)) {
        return text;
    }
    let result = text.replace(collapsibleWhitespaceRunRe, ' ');
    if (result.startsWith(' ')) {
        result = result.slice(1);
    }
    if (result.endsWith(' ')) {
        result = result.slice(0, -1);
    }
    return result;
}
21
/**
 * Canonicalize line terminators for pre-wrap text: CRLF pairs, lone carriage
 * returns and form feeds all become a single '\n'. Other characters are kept.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Text whose only hard-break character is '\n'.
 */
function normalizeWhitespacePreWrap(text) {
    const hasLegacyBreak = /[\r\f]/.test(text);
    if (!hasLegacyBreak) {
        return text;
    }
    // Trying "\r\n" first keeps a CRLF pair from turning into two newlines.
    return text.replace(/\r\n|[\r\f]/g, '\n');
}
28
+ let sharedWordSegmenter = null;
29
+ let segmenterLocale;
30
/**
 * Return the module-wide word-granularity `Intl.Segmenter`, creating it on
 * first use with the currently configured `segmenterLocale`.
 *
 * @returns {Intl.Segmenter} The cached segmenter instance.
 */
function getSharedWordSegmenter() {
    if (sharedWordSegmenter !== null) {
        return sharedWordSegmenter;
    }
    sharedWordSegmenter = new Intl.Segmenter(segmenterLocale, { granularity: 'word' });
    return sharedWordSegmenter;
}
36
/**
 * Drop the cached word segmenter so the next `getSharedWordSegmenter()` call
 * builds a fresh one. The configured locale is left untouched.
 *
 * @returns {void}
 */
export function clearAnalysisCaches() {
    sharedWordSegmenter = null;
}
39
/**
 * Set the locale used when the shared word segmenter is (re)built.
 *
 * @param {string | null | undefined} locale - Locale tag; a falsy or empty
 *   value resets to `undefined`, which is what gets passed to `Intl.Segmenter`.
 * @returns {void}
 */
export function setAnalysisLocale(locale) {
    let requested;
    if (locale && locale.length > 0) {
        requested = locale;
    }
    if (requested !== segmenterLocale) {
        segmenterLocale = requested;
        // The cached segmenter was built for the previous locale.
        sharedWordSegmenter = null;
    }
}
46
+ const arabicScriptRe = /\p{Script=Arabic}/u;
47
+ const combiningMarkRe = /\p{M}/u;
48
+ const currencySymbolRe = /\p{Sc}/u;
49
+ const decimalDigitRe = /\p{Nd}/u;
50
/**
 * Report whether any code point in `text` has Unicode Script=Arabic.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when at least one Arabic-script code point is present.
 */
function containsArabicScript(text) {
    const hasArabic = arabicScriptRe.test(text);
    return hasArabic;
}
53
/**
 * Test whether a code point falls in one of the ranges this module treats as
 * CJK. The ranges are listed in ascending order, BMP first, then astral.
 *
 * @param {number} codePoint - Unicode code point.
 * @returns {boolean} True when the code point is inside any listed range.
 */
function isCJKCodePoint(codePoint) {
    if (codePoint <= 0xFFFF) {
        return ((codePoint >= 0x3000 && codePoint <= 0x303F) ||
            (codePoint >= 0x3040 && codePoint <= 0x309F) ||
            (codePoint >= 0x30A0 && codePoint <= 0x30FF) ||
            (codePoint >= 0x3130 && codePoint <= 0x318F) ||
            (codePoint >= 0x3400 && codePoint <= 0x4DBF) ||
            (codePoint >= 0x4E00 && codePoint <= 0x9FFF) ||
            (codePoint >= 0xAC00 && codePoint <= 0xD7AF) ||
            (codePoint >= 0xF900 && codePoint <= 0xFAFF) ||
            (codePoint >= 0xFF00 && codePoint <= 0xFFEF));
    }
    return ((codePoint >= 0x20000 && codePoint <= 0x2A6DF) ||
        (codePoint >= 0x2A700 && codePoint <= 0x2B73F) ||
        (codePoint >= 0x2B740 && codePoint <= 0x2B81F) ||
        (codePoint >= 0x2B820 && codePoint <= 0x2CEAF) ||
        (codePoint >= 0x2CEB0 && codePoint <= 0x2EBEF) ||
        (codePoint >= 0x2EBF0 && codePoint <= 0x2EE5D) ||
        (codePoint >= 0x2F800 && codePoint <= 0x2FA1F) ||
        (codePoint >= 0x30000 && codePoint <= 0x3134F) ||
        (codePoint >= 0x31350 && codePoint <= 0x323AF) ||
        (codePoint >= 0x323B0 && codePoint <= 0x33479));
}
74
/**
 * Report whether a string contains at least one CJK code point, as defined
 * by `isCJKCodePoint`. Valid surrogate pairs are examined as one code point;
 * unpaired surrogates are examined as their own code unit value.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} True on the first CJK code point found, otherwise false.
 */
export function isCJK(s) {
    let index = 0;
    while (index < s.length) {
        const codePoint = s.codePointAt(index);
        // Nothing below U+3000 is in any CJK range, so skip the range test.
        if (codePoint >= 0x3000 && isCJKCodePoint(codePoint)) {
            return true;
        }
        // An astral code point occupies two UTF-16 code units.
        index += codePoint > 0xFFFF ? 2 : 1;
    }
    return false;
}
94
/**
 * Check whether the last code point of `text` is a member of `kinsokuStart`
 * or `leftStickyPunctuation`.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} False for empty text.
 */
function endsWithLineStartProhibitedText(text) {
    const finalChar = getLastCodePoint(text);
    if (finalChar === null) {
        return false;
    }
    return kinsokuStart.has(finalChar) || leftStickyPunctuation.has(finalChar);
}
98
+ const keepAllGlueChars = new Set([
99
+ '\u00A0',
100
+ '\u202F',
101
+ '\u2060',
102
+ '\uFEFF',
103
+ ]);
104
+ const keepAllDashBreakChars = new Set([
105
+ '-',
106
+ '\u2010',
107
+ '\u2013',
108
+ '\u2014',
109
+ ]);
110
/**
 * Check whether the last code point of `text` is one of `keepAllGlueChars`.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} False for empty text.
 */
function endsWithKeepAllGlueText(text) {
    const finalChar = getLastCodePoint(text);
    if (finalChar === null) {
        return false;
    }
    return keepAllGlueChars.has(finalChar);
}
114
/**
 * Check whether the last code point of `text` is one of `keepAllDashBreakChars`.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} False for empty text.
 */
function endsWithKeepAllDashBreakText(text) {
    const finalChar = getLastCodePoint(text);
    if (finalChar === null) {
        return false;
    }
    return keepAllDashBreakChars.has(finalChar);
}
118
/**
 * Decide whether a keep-all text run may continue after `previousText`.
 *
 * @param {string} previousText - Text accumulated so far in the run.
 * @param {boolean} breakAfterPunctuation - When truthy, text ending in
 *   line-start-prohibited punctuation or a dash also ends the run.
 * @returns {boolean} False when `previousText` ends with a glue character,
 *   or (with `breakAfterPunctuation`) with prohibited punctuation or a dash.
 */
export function canContinueKeepAllTextRun(previousText, breakAfterPunctuation) {
    // Trailing glue stops the run regardless of the punctuation setting.
    if (endsWithKeepAllGlueText(previousText)) {
        return false;
    }
    if (!breakAfterPunctuation) {
        return true;
    }
    const endsAtBreakPoint = endsWithLineStartProhibitedText(previousText) ||
        endsWithKeepAllDashBreakText(previousText);
    return !endsAtBreakPoint;
}
129
+ export const kinsokuStart = new Set([
130
+ '\uFF0C',
131
+ '\uFF0E',
132
+ '\uFF01',
133
+ '\uFF1A',
134
+ '\uFF1B',
135
+ '\uFF1F',
136
+ '\u3001',
137
+ '\u3002',
138
+ '\u30FB',
139
+ '\uFF09',
140
+ '\u3015',
141
+ '\u3009',
142
+ '\u300B',
143
+ '\u300D',
144
+ '\u300F',
145
+ '\u3011',
146
+ '\u3017',
147
+ '\u3019',
148
+ '\u301B',
149
+ '\u30FC',
150
+ '\u3005',
151
+ '\u303B',
152
+ '\u309D',
153
+ '\u309E',
154
+ '\u30FD',
155
+ '\u30FE',
156
+ ]);
157
+ export const kinsokuEnd = new Set([
158
+ '"',
159
+ '(', '[', '{',
160
+ '“', '‘', '«', '‹', '‚', '„',
161
+ '\uFF08',
162
+ '\u3014',
163
+ '\u3008',
164
+ '\u300A',
165
+ '\u300C',
166
+ '\u300E',
167
+ '\u3010',
168
+ '\u3016',
169
+ '\u3018',
170
+ '\u301A',
171
+ ]);
172
+ const forwardStickyGlue = new Set([
173
+ "'", '’',
174
+ ]);
175
+ export const leftStickyPunctuation = new Set([
176
+ '.', ',', '!', '?', ':', ';',
177
+ '\u060C',
178
+ '\u061B',
179
+ '\u061F',
180
+ '\u0964',
181
+ '\u0965',
182
+ '\u104A',
183
+ '\u104B',
184
+ '\u104C',
185
+ '\u104D',
186
+ '\u104F',
187
+ ')', ']', '}',
188
+ '%',
189
+ '"',
190
+ '”', '’', '»', '›',
191
+ '…',
192
+ ]);
193
+ const arabicNoSpaceTrailingPunctuation = new Set([
194
+ ':',
195
+ '.',
196
+ '\u060C',
197
+ '\u061B',
198
+ ]);
199
+ const myanmarMedialGlue = new Set([
200
+ '\u104F',
201
+ ]);
202
+ const closingQuoteChars = new Set([
203
+ '”', '’', '»', '›',
204
+ '\u300D',
205
+ '\u300F',
206
+ '\u3011',
207
+ '\u300B',
208
+ '\u3009',
209
+ '\u3015',
210
+ '\uFF09',
211
+ ]);
212
/**
 * Decide whether a segment is left-sticky punctuation.
 *
 * True when the segment is an escaped-quote cluster, or when every code point
 * is left-sticky punctuation or a currency symbol, with combining marks
 * allowed only after the first such character.
 *
 * @param {string} segment - Candidate segment text.
 * @returns {boolean} False for an empty segment.
 */
function isLeftStickyPunctuationSegment(segment) {
    if (isEscapedQuoteClusterSegment(segment)) {
        return true;
    }
    let punctuationSeen = false;
    for (const ch of segment) {
        const isPunctuation = leftStickyPunctuation.has(ch) || currencySymbolRe.test(ch);
        if (isPunctuation) {
            punctuationSeen = true;
        }
        else if (!punctuationSeen || !combiningMarkRe.test(ch)) {
            // A mark before any punctuation, or any other character, disqualifies.
            return false;
        }
    }
    return punctuationSeen;
}
227
/**
 * Check that a non-empty segment consists solely of characters found in
 * `kinsokuStart` or `leftStickyPunctuation`.
 *
 * @param {string} segment - Candidate segment text.
 * @returns {boolean} False for an empty segment or on the first other character.
 */
function isCJKLineStartProhibitedSegment(segment) {
    if (segment.length === 0) {
        return false;
    }
    for (const ch of segment) {
        const prohibited = kinsokuStart.has(ch) || leftStickyPunctuation.has(ch);
        if (!prohibited) {
            return false;
        }
    }
    return true;
}
234
/**
 * Decide whether a segment is a forward-sticky cluster.
 *
 * True when the segment is an escaped-quote cluster, or when it is non-empty
 * and every code point is in `kinsokuEnd`, in `forwardStickyGlue`, a
 * combining mark, or a currency symbol.
 *
 * @param {string} segment - Candidate segment text.
 * @returns {boolean} Whether the segment qualifies.
 */
function isForwardStickyClusterSegment(segment) {
    if (isEscapedQuoteClusterSegment(segment)) {
        return true;
    }
    for (const ch of segment) {
        const sticky = kinsokuEnd.has(ch) ||
            forwardStickyGlue.has(ch) ||
            combiningMarkRe.test(ch) ||
            currencySymbolRe.test(ch);
        if (!sticky) {
            return false;
        }
    }
    return segment.length > 0;
}
243
/**
 * Decide whether a segment is a quote-like cluster, optionally mixed with
 * backslashes and combining marks.
 *
 * Backslashes and combining marks are ignored. Every other code point must
 * belong to `kinsokuEnd`, `leftStickyPunctuation` or `forwardStickyGlue`,
 * and at least one such character must be present.
 *
 * @param {string} segment - Candidate segment text.
 * @returns {boolean} Whether the segment qualifies.
 */
function isEscapedQuoteClusterSegment(segment) {
    let quoteSeen = false;
    for (const ch of segment) {
        const ignorable = ch === '\\' || combiningMarkRe.test(ch);
        if (ignorable) {
            continue;
        }
        const quoteLike = kinsokuEnd.has(ch) ||
            leftStickyPunctuation.has(ch) ||
            forwardStickyGlue.has(ch);
        if (!quoteLike) {
            return false;
        }
        quoteSeen = true;
    }
    return quoteSeen;
}
256
/**
 * Find the UTF-16 index at which the code point ending just before `end`
 * begins, stepping back two units over a valid surrogate pair.
 *
 * @param {string} text - Source text.
 * @param {number} end - Exclusive end index of the code point of interest.
 * @returns {number} Start index of that code point (never negative).
 */
function previousCodePointStart(text, end) {
    const lastIndex = end - 1;
    if (lastIndex <= 0) {
        return Math.max(lastIndex, 0);
    }
    const trailUnit = text.charCodeAt(lastIndex);
    const isNotLowSurrogate = trailUnit < 0xDC00 || trailUnit > 0xDFFF;
    if (isNotLowSurrogate) {
        return lastIndex;
    }
    const leadIndex = lastIndex - 1;
    if (leadIndex < 0) {
        return lastIndex;
    }
    const leadUnit = text.charCodeAt(leadIndex);
    const isHighSurrogate = leadUnit >= 0xD800 && leadUnit <= 0xDBFF;
    // A low surrogate without a preceding high surrogate stands alone.
    return isHighSurrogate ? leadIndex : lastIndex;
}
269
/**
 * Return the final code point of `text` as a string (one or two UTF-16 units).
 *
 * @param {string} text - Source text.
 * @returns {string | null} The last code point, or null for empty text.
 */
function getLastCodePoint(text) {
    return text.length === 0
        ? null
        : text.slice(previousCodePointStart(text, text.length));
}
275
/**
 * Split off the trailing run of forward-sticky characters (members of
 * `kinsokuEnd` or `forwardStickyGlue`, plus combining marks) from `text`.
 *
 * @param {string} text - Source text.
 * @returns {{ head: string, tail: string } | null} The text before the
 *   trailing cluster and the cluster itself; null when there is no trailing
 *   cluster or when the whole text is the cluster.
 */
function splitTrailingForwardStickyCluster(text) {
    const codePoints = Array.from(text);
    let boundary = codePoints.length;
    while (boundary > 0) {
        const ch = codePoints[boundary - 1];
        const sticky = combiningMarkRe.test(ch) ||
            kinsokuEnd.has(ch) ||
            forwardStickyGlue.has(ch);
        if (!sticky) {
            break;
        }
        boundary--;
    }
    const nothingToSplit = boundary <= 0 || boundary === codePoints.length;
    if (nothingToSplit) {
        return null;
    }
    const head = codePoints.slice(0, boundary).join('');
    const tail = codePoints.slice(boundary).join('');
    return { head, tail };
}
297
/**
 * Return `text` when it can seed a repeatable single-character run: a
 * non-word-like 'text' piece of exactly one UTF-16 unit that is neither '-'
 * nor '—'.
 *
 * @param {string} text - Piece text.
 * @param {boolean} isWordLike - Whether the piece is word-like.
 * @param {string} kind - Piece kind.
 * @returns {string | null} The character, or null when the piece does not qualify.
 */
function getRepeatableSingleCharRunChar(text, isWordLike, kind) {
    const repeatable = kind === 'text' &&
        !isWordLike &&
        text.length === 1 &&
        text !== '-' &&
        text !== '—';
    if (repeatable) {
        return text;
    }
    return null;
}
302
/**
 * Produce the full text of entry `index`, expanding a deferred
 * single-character run (`chars[index]` repeated `lengths[index]` times) and
 * writing the expanded string back into `texts[index]`.
 *
 * @param {string[]} texts - Entry texts; updated in place when expansion happens.
 * @param {(string | null | undefined)[]} chars - Run character per entry, nullish when the entry is not a run.
 * @param {number[]} lengths - Run length per entry.
 * @param {number} index - Entry to materialize.
 * @returns {string} The entry's text.
 */
function materializeDeferredSingleCharRun(texts, chars, lengths, index) {
    const runChar = chars[index];
    const stored = texts[index];
    if (runChar == null) {
        return stored;
    }
    const runLength = lengths[index];
    // The stored text is already the right length: nothing to expand.
    if (stored.length === runLength) {
        return stored;
    }
    const expanded = runChar.repeat(runLength);
    texts[index] = expanded;
    return expanded;
}
314
/**
 * Tell whether an Arabic-containing run ends in one of the
 * `arabicNoSpaceTrailingPunctuation` characters.
 *
 * @param {boolean} containsArabic - Whether the run contains Arabic script.
 * @param {string | null} lastCodePoint - Final code point of the run, or null.
 * @returns {boolean} The falsy `containsArabic` value is returned unchanged
 *   when the run has no Arabic script.
 */
function hasArabicNoSpacePunctuation(containsArabic, lastCodePoint) {
    if (!containsArabic) {
        return containsArabic;
    }
    if (lastCodePoint === null) {
        return false;
    }
    return arabicNoSpaceTrailingPunctuation.has(lastCodePoint);
}
317
/**
 * Check whether the last code point of `segment` is in `myanmarMedialGlue`.
 *
 * @param {string} segment - Segment text.
 * @returns {boolean} False for an empty segment.
 */
function endsWithMyanmarMedialGlue(segment) {
    const finalChar = getLastCodePoint(segment);
    if (finalChar === null) {
        return false;
    }
    return myanmarMedialGlue.has(finalChar);
}
321
/**
 * Split a segment made of one leading space followed only by combining marks.
 *
 * @param {string} segment - Segment text.
 * @returns {{ space: string, marks: string } | null} The space and the marks,
 *   or null when the segment does not have exactly that shape.
 */
function splitLeadingSpaceAndMarks(segment) {
    const hasSpaceAndMore = segment.length >= 2 && segment[0] === ' ';
    if (!hasSpaceAndMore) {
        return null;
    }
    const marks = segment.slice(1);
    return /^\p{M}+$/u.test(marks) ? { space: ' ', marks } : null;
}
330
/**
 * Report whether `text` ends with a closing quote, looking backwards past
 * any trailing left-sticky punctuation.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when a `closingQuoteChars` member is reached before
 *   any character that is not left-sticky punctuation.
 */
export function endsWithClosingQuote(text) {
    for (let end = text.length; end > 0;) {
        const start = previousCodePointStart(text, end);
        const ch = text.slice(start, end);
        if (closingQuoteChars.has(ch)) {
            return true;
        }
        if (!leftStickyPunctuation.has(ch)) {
            return false;
        }
        // Left-sticky punctuation: keep scanning toward the start.
        end = start;
    }
    return false;
}
343
/**
 * Classify a single character into the break kind used by segmentation.
 *
 * @param {string} ch - One character.
 * @param {{ preserveOrdinarySpaces: boolean, preserveHardBreaks: boolean }} whiteSpaceProfile
 *   Whitespace flags; when either is set, space and tab get their preserved
 *   kinds, and '\n' becomes a hard break if `preserveHardBreaks` is set.
 * @returns {string} One of 'preserved-space', 'tab', 'hard-break', 'space',
 *   'glue', 'zero-width-break', 'soft-hyphen' or 'text'.
 */
function classifySegmentBreakChar(ch, whiteSpaceProfile) {
    const { preserveOrdinarySpaces, preserveHardBreaks } = whiteSpaceProfile;
    if (preserveOrdinarySpaces || preserveHardBreaks) {
        if (ch === ' ') {
            return 'preserved-space';
        }
        if (ch === '\t') {
            return 'tab';
        }
        if (preserveHardBreaks && ch === '\n') {
            return 'hard-break';
        }
    }
    switch (ch) {
        case ' ':
            return 'space';
        case '\u00A0':
        case '\u202F':
        case '\u2060':
        case '\uFEFF':
            return 'glue';
        case '\u200B':
            return 'zero-width-break';
        case '\u00AD':
            return 'soft-hyphen';
        default:
            return 'text';
    }
}
363
+ // All characters that classifySegmentBreakChar maps to a non-'text' kind.
364
+ const breakCharRe = /[\x20\t\n\xA0\xAD\u200B\u202F\u2060\uFEFF]/;
365
/**
 * Concatenate text parts, returning the sole element directly when there is
 * exactly one so no new string is built.
 *
 * @param {string[]} parts - Pieces to join, in order.
 * @returns {string} The concatenation ('' for an empty list).
 */
function joinTextParts(parts) {
    if (parts.length === 1) {
        return parts[0];
    }
    return parts.join('');
}
368
/**
 * Join prefix parts that were collected back-to-front, followed by `tail`.
 *
 * @param {string[]} prefixParts - Prefix pieces in reverse reading order; not modified.
 * @param {string} tail - Text appended after the prefix.
 * @returns {string} The prefix pieces in reading order followed by `tail`.
 */
function joinReversedPrefixParts(prefixParts, tail) {
    // Copy before reversing so the caller's array keeps its order.
    const ordered = [...prefixParts].reverse();
    ordered.push(tail);
    return joinTextParts(ordered);
}
376
/**
 * Split one segmenter segment into pieces of uniform break kind.
 *
 * Consecutive characters are grouped while both their kind (from
 * `classifySegmentBreakChar`) and their word-likeness match. Only 'text'
 * characters inherit `isWordLike`.
 *
 * @param {string} segment - Segment text.
 * @param {boolean} isWordLike - Word-likeness reported for the whole segment.
 * @param {number} start - UTF-16 offset of the segment in the source text.
 * @param {object} whiteSpaceProfile - Whitespace flags passed to the classifier.
 * @returns {{ text: string, isWordLike: boolean, kind: string, start: number }[]}
 *   Pieces in source order; each `start` is the piece's own UTF-16 offset.
 */
function splitSegmentByBreakKind(segment, isWordLike, start, whiteSpaceProfile) {
    // Fast path: no break character at all, so the segment is one text piece.
    if (!breakCharRe.test(segment)) {
        return [{ text: segment, isWordLike, kind: 'text', start }];
    }
    const pieces = [];
    const closePiece = (open) => {
        pieces.push({
            text: joinTextParts(open.parts),
            isWordLike: open.wordLike,
            kind: open.kind,
            start: open.start,
        });
    };
    let open = null;
    let consumed = 0;
    for (const ch of segment) {
        const kind = classifySegmentBreakChar(ch, whiteSpaceProfile);
        const wordLike = kind === 'text' && isWordLike;
        const extendsOpenPiece = open !== null && open.kind === kind && open.wordLike === wordLike;
        if (extendsOpenPiece) {
            open.parts.push(ch);
        }
        else {
            if (open !== null) {
                closePiece(open);
            }
            open = { parts: [ch], wordLike, kind, start: start + consumed };
        }
        // `ch` may be two UTF-16 units, so advance by its length.
        consumed += ch.length;
    }
    if (open !== null) {
        closePiece(open);
    }
    return pieces;
}
418
/**
 * Tell whether a piece kind terminates a contiguous text run.
 *
 * @param {string} kind - Piece kind.
 * @returns {boolean} True for 'space', 'preserved-space', 'zero-width-break'
 *   and 'hard-break'.
 */
function isTextRunBoundary(kind) {
    switch (kind) {
        case 'space':
        case 'preserved-space':
        case 'zero-width-break':
        case 'hard-break':
            return true;
        default:
            return false;
    }
}
424
+ const urlSchemeSegmentRe = /^[A-Za-z][A-Za-z0-9+.-]*:$/;
425
/**
 * Decide whether the entry at `index` begins a URL-like run: either its text
 * starts with 'www.', or it is a scheme segment (e.g. 'https:') immediately
 * followed by a text entry that is exactly '//'.
 *
 * @param {{ len: number, texts: string[], kinds: string[] }} segmentation - Parallel-array segmentation.
 * @param {number} index - Entry to test.
 * @returns {boolean} Whether a URL-like run starts here.
 */
function isUrlLikeRunStart(segmentation, index) {
    const text = segmentation.texts[index];
    if (text.startsWith('www.')) {
        return true;
    }
    if (!urlSchemeSegmentRe.test(text)) {
        return false;
    }
    const next = index + 1;
    return (next < segmentation.len &&
        segmentation.kinds[next] === 'text' &&
        segmentation.texts[next] === '//');
}
434
/**
 * Tell whether `text` looks like a URL prefix that reaches a query marker:
 * it contains '?' and either contains '://' or starts with 'www.'.
 *
 * @param {string} text - Segment text.
 * @returns {boolean} Whether the segment is a URL query boundary.
 */
function isUrlQueryBoundarySegment(text) {
    if (!text.includes('?')) {
        return false;
    }
    return text.includes('://') || text.startsWith('www.');
}
437
/**
 * Merge each URL-like run into a single word-like text entry.
 *
 * Starting at an entry accepted by `isUrlLikeRunStart`, following entries are
 * absorbed until a text-run boundary is reached, or just after absorbing an
 * entry containing '?'. Absorbed entries are emptied and then compacted away.
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} The input itself when it has no URL-like start,
 *   otherwise a new segmentation with the runs merged.
 */
function mergeUrlLikeRuns(segmentation) {
    const { len } = segmentation;
    let sawUrlStart = false;
    for (let i = 0; i < len && !sawUrlStart; i++) {
        sawUrlStart = segmentation.kinds[i] === 'text' && isUrlLikeRunStart(segmentation, i);
    }
    if (!sawUrlStart) {
        return segmentation;
    }
    const texts = segmentation.texts.slice();
    const isWordLike = segmentation.isWordLike.slice();
    const kinds = segmentation.kinds.slice();
    const starts = segmentation.starts.slice();
    for (let runStart = 0; runStart < len; runStart++) {
        // Entries emptied by an earlier run are skipped; URL-start detection
        // reads the untouched input segmentation.
        const beginsRun = texts[runStart].length !== 0 &&
            kinds[runStart] === 'text' &&
            isUrlLikeRunStart(segmentation, runStart);
        if (!beginsRun) {
            continue;
        }
        const runParts = [texts[runStart]];
        for (let next = runStart + 1; next < len && !isTextRunBoundary(kinds[next]); next++) {
            const absorbed = texts[next];
            runParts.push(absorbed);
            isWordLike[runStart] = true;
            kinds[next] = 'text';
            texts[next] = '';
            // The entry carrying '?' is the last one absorbed into this run.
            if (absorbed.includes('?')) {
                break;
            }
        }
        texts[runStart] = joinTextParts(runParts);
    }
    // Compact in place, dropping every entry whose text is empty.
    let kept = 0;
    for (let read = 0; read < texts.length; read++) {
        const text = texts[read];
        if (text.length === 0) {
            continue;
        }
        if (kept !== read) {
            texts[kept] = text;
            isWordLike[kept] = isWordLike[read];
            kinds[kept] = kinds[read];
            starts[kept] = starts[read];
        }
        kept++;
    }
    texts.length = kept;
    isWordLike.length = kept;
    kinds.length = kept;
    starts.length = kept;
    return { len: kept, texts, isWordLike, kinds, starts };
}
493
/**
 * Merge everything that follows a URL query boundary, up to the next
 * text-run boundary, into one word-like text entry.
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} The input itself when no text entry is a URL query
 *   boundary, otherwise a new segmentation with query runs merged.
 */
function mergeUrlQueryRuns(segmentation) {
    const { len } = segmentation;
    // Conservative guard: without a query-boundary text entry nothing can change.
    let boundarySeen = false;
    for (let i = 0; i < len; i++) {
        if (segmentation.kinds[i] === 'text' && isUrlQueryBoundarySegment(segmentation.texts[i])) {
            boundarySeen = true;
            break;
        }
    }
    if (!boundarySeen) {
        return segmentation;
    }
    const texts = [];
    const isWordLike = [];
    const kinds = [];
    const starts = [];
    const emit = (text, wordLike, kind, start) => {
        texts.push(text);
        isWordLike.push(wordLike);
        kinds.push(kind);
        starts.push(start);
    };
    let cursor = 0;
    while (cursor < len) {
        const text = segmentation.texts[cursor];
        emit(text, segmentation.isWordLike[cursor], segmentation.kinds[cursor], segmentation.starts[cursor]);
        cursor++;
        if (!isUrlQueryBoundarySegment(text)) {
            continue;
        }
        if (cursor >= len || isTextRunBoundary(segmentation.kinds[cursor])) {
            continue;
        }
        // Gather the query: every entry up to the next text-run boundary.
        const queryStart = segmentation.starts[cursor];
        const queryParts = [];
        while (cursor < len && !isTextRunBoundary(segmentation.kinds[cursor])) {
            queryParts.push(segmentation.texts[cursor]);
            cursor++;
        }
        emit(joinTextParts(queryParts), true, 'text', queryStart);
    }
    return { len: texts.length, texts, isWordLike, kinds, starts };
}
545
+ const numericJoinerChars = new Set([
546
+ ':', '-', '/', '×', ',', '.', '+',
547
+ '\u2013',
548
+ '\u2014',
549
+ ]);
550
+ const asciiPunctuationChainSegmentRe = /^[A-Za-z0-9_]+[.,:;]*$/;
551
+ const asciiPunctuationChainTrailingJoinersRe = /[.,:;]+$/;
552
/**
 * Report whether `text` contains at least one Unicode decimal digit (\p{Nd}).
 *
 * @param {string} text - Segment text.
 * @returns {boolean} True when any code point is a decimal digit.
 */
function segmentContainsDecimalDigit(text) {
    // The regex is unanchored, so one test over the whole string finds any digit.
    return decimalDigitRe.test(text);
}
559
/**
 * Check that a non-empty segment consists only of decimal digits and
 * characters from `numericJoinerChars`.
 *
 * @param {string} text - Segment text.
 * @returns {boolean} False for empty text or on the first other character.
 */
export function isNumericRunSegment(text) {
    if (text.length === 0) {
        return false;
    }
    for (const ch of text) {
        const allowed = decimalDigitRe.test(ch) || numericJoinerChars.has(ch);
        if (!allowed) {
            return false;
        }
    }
    return true;
}
569
/**
 * Merge adjacent numeric text entries into single word-like entries.
 *
 * A run starts at a text entry that is a numeric-run segment containing a
 * digit, and absorbs every directly following text entry that is a
 * numeric-run segment (digits and/or joiners).
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} The input itself when no run can start, otherwise a new
 *   segmentation with numeric runs merged.
 */
function mergeNumericRuns(segmentation) {
    const { len } = segmentation;
    const startsNumericRun = (index) => {
        const candidate = segmentation.texts[index];
        return (segmentation.kinds[index] === 'text' &&
            isNumericRunSegment(candidate) &&
            segmentContainsDecimalDigit(candidate));
    };
    let runFound = false;
    for (let i = 0; i < len; i++) {
        if (startsNumericRun(i)) {
            runFound = true;
            break;
        }
    }
    if (!runFound) {
        return segmentation;
    }
    const texts = [];
    const isWordLike = [];
    const kinds = [];
    const starts = [];
    let cursor = 0;
    while (cursor < len) {
        if (!startsNumericRun(cursor)) {
            texts.push(segmentation.texts[cursor]);
            isWordLike.push(segmentation.isWordLike[cursor]);
            kinds.push(segmentation.kinds[cursor]);
            starts.push(segmentation.starts[cursor]);
            cursor++;
            continue;
        }
        const runParts = [segmentation.texts[cursor]];
        let end = cursor + 1;
        // Continuation entries need not contain a digit themselves.
        while (end < len &&
            segmentation.kinds[end] === 'text' &&
            isNumericRunSegment(segmentation.texts[end])) {
            runParts.push(segmentation.texts[end]);
            end++;
        }
        texts.push(joinTextParts(runParts));
        isWordLike.push(true);
        kinds.push('text');
        starts.push(segmentation.starts[cursor]);
        cursor = end;
    }
    return { len: texts.length, texts, isWordLike, kinds, starts };
}
616
/**
 * Merge chains of word-like ASCII text entries linked by trailing `.,:;`
 * joiners (for example 'a.' followed by 'b') into single entries.
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} The input itself when no adjacent pair could chain,
 *   otherwise a new segmentation with chains merged.
 */
function mergeAsciiPunctuationChains(segmentation) {
    const { len } = segmentation;
    const isWordLikeText = (index) => segmentation.kinds[index] === 'text' && segmentation.isWordLike[index];
    let chainFound = false;
    for (let i = 0; i < len - 1; i++) {
        if (isWordLikeText(i) &&
            asciiPunctuationChainTrailingJoinersRe.test(segmentation.texts[i]) &&
            isWordLikeText(i + 1)) {
            chainFound = true;
            break;
        }
    }
    if (!chainFound) {
        return segmentation;
    }
    const texts = [];
    const isWordLike = [];
    const kinds = [];
    const starts = [];
    let cursor = 0;
    while (cursor < len) {
        const text = segmentation.texts[cursor];
        const isChainHead = isWordLikeText(cursor) && asciiPunctuationChainSegmentRe.test(text);
        if (!isChainHead) {
            texts.push(text);
            isWordLike.push(segmentation.isWordLike[cursor]);
            kinds.push(segmentation.kinds[cursor]);
            starts.push(segmentation.starts[cursor]);
            cursor++;
            continue;
        }
        const chainParts = [text];
        // The chain only grows while the most recent link ends in joiners.
        let linkOpen = asciiPunctuationChainTrailingJoinersRe.test(text);
        let end = cursor + 1;
        while (linkOpen &&
            end < len &&
            isWordLikeText(end) &&
            asciiPunctuationChainSegmentRe.test(segmentation.texts[end])) {
            const link = segmentation.texts[end];
            chainParts.push(link);
            linkOpen = asciiPunctuationChainTrailingJoinersRe.test(link);
            end++;
        }
        texts.push(joinTextParts(chainParts));
        isWordLike.push(true);
        kinds.push('text');
        starts.push(segmentation.starts[cursor]);
        cursor = end;
    }
    return { len: texts.length, texts, isWordLike, kinds, starts };
}
672
/**
 * Split hyphen-joined numeric text entries (e.g. '12-34-56') into one
 * word-like entry per part, each part except the last keeping its hyphen.
 *
 * An entry is split only when every hyphen-separated part is non-empty,
 * contains a decimal digit, and is a numeric-run segment.
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} The input itself when no text entry has both a hyphen
 *   and a digit, otherwise a new segmentation.
 */
function splitHyphenatedNumericRuns(segmentation) {
    const { len } = segmentation;
    let candidateFound = false;
    for (let i = 0; i < len; i++) {
        const text = segmentation.texts[i];
        if (segmentation.kinds[i] === 'text' && text.includes('-') && segmentContainsDecimalDigit(text)) {
            candidateFound = true;
            break;
        }
    }
    if (!candidateFound) {
        return segmentation;
    }
    const isNumericPart = (part) => part.length !== 0 &&
        segmentContainsDecimalDigit(part) &&
        isNumericRunSegment(part);
    const texts = [];
    const isWordLike = [];
    const kinds = [];
    const starts = [];
    for (let i = 0; i < len; i++) {
        const text = segmentation.texts[i];
        const parts = segmentation.kinds[i] === 'text' && text.includes('-')
            ? text.split('-')
            : null;
        const splittable = parts !== null && parts.length > 1 && parts.every(isNumericPart);
        if (!splittable) {
            texts.push(text);
            isWordLike.push(segmentation.isWordLike[i]);
            kinds.push(segmentation.kinds[i]);
            starts.push(segmentation.starts[i]);
            continue;
        }
        let offset = 0;
        parts.forEach((part, partIndex) => {
            // Every part but the last keeps the hyphen that followed it.
            const piece = partIndex < parts.length - 1 ? `${part}-` : part;
            texts.push(piece);
            isWordLike.push(true);
            kinds.push('text');
            starts.push(segmentation.starts[i] + offset);
            offset += piece.length;
        });
    }
    return { len: texts.length, texts, isWordLike, kinds, starts };
}
729
/**
 * Fold glue entries into the text entries they touch.
 *
 * A glue run followed by text is prepended to that text. Glue runs after a
 * text entry are appended to it, together with the text that follows when
 * there is one. A glue run with no text after it, and no text entry before
 * it to absorb it, is emitted as one 'glue' entry.
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} A new segmentation with glue merged.
 */
function mergeGlueConnectedTextRuns(segmentation) {
    const { len } = segmentation;
    const texts = [];
    const isWordLike = [];
    const kinds = [];
    const starts = [];
    const emit = (text, wordLike, kind, start) => {
        texts.push(text);
        isWordLike.push(wordLike);
        kinds.push(kind);
        starts.push(start);
    };
    let cursor = 0;
    const atKind = (kind) => cursor < len && segmentation.kinds[cursor] === kind;
    // Consumes the consecutive glue entries at the cursor; callers make sure
    // at least one is present.
    const takeGlueRun = () => {
        const glueParts = [];
        while (atKind('glue')) {
            glueParts.push(segmentation.texts[cursor]);
            cursor++;
        }
        return joinTextParts(glueParts);
    };
    while (cursor < len) {
        const start = segmentation.starts[cursor];
        let kind = segmentation.kinds[cursor];
        let wordLike;
        let parts;
        if (kind === 'glue') {
            const leadingGlue = takeGlueRun();
            if (!atKind('text')) {
                emit(leadingGlue, false, 'glue', start);
                continue;
            }
            // Leading glue attaches to the text that follows it.
            parts = [leadingGlue, segmentation.texts[cursor]];
            wordLike = segmentation.isWordLike[cursor];
            kind = 'text';
            cursor++;
        }
        else {
            parts = [segmentation.texts[cursor]];
            wordLike = segmentation.isWordLike[cursor];
            cursor++;
        }
        if (kind === 'text') {
            while (atKind('glue')) {
                const glue = takeGlueRun();
                if (atKind('text')) {
                    parts.push(glue, segmentation.texts[cursor]);
                    wordLike = wordLike || segmentation.isWordLike[cursor];
                    cursor++;
                }
                else {
                    // Trailing glue with no text after it stays on this entry.
                    parts.push(glue);
                }
            }
        }
        emit(joinTextParts(parts), wordLike, kind, start);
    }
    return { len: texts.length, texts, isWordLike, kinds, starts };
}
798
/**
 * Between two adjacent CJK text entries, move a trailing forward-sticky
 * cluster from the end of the first entry to the front of the second, and
 * adjust the second entry's start offset to match.
 *
 * @param {{ len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[] }} segmentation
 *   Parallel-array segmentation; not modified.
 * @returns {object} The input itself when no two adjacent text entries both
 *   contain CJK, otherwise a new segmentation with the same entry count.
 */
function carryTrailingForwardStickyAcrossCJKBoundary(segmentation) {
    const isCjkTextPair = (kindList, textList, index) => kindList[index] === 'text' &&
        kindList[index + 1] === 'text' &&
        isCJK(textList[index]) &&
        isCJK(textList[index + 1]);
    let pairFound = false;
    for (let i = 0; i < segmentation.len - 1 && !pairFound; i++) {
        pairFound = isCjkTextPair(segmentation.kinds, segmentation.texts, i);
    }
    if (!pairFound) {
        return segmentation;
    }
    const texts = segmentation.texts.slice();
    const isWordLike = segmentation.isWordLike.slice();
    const kinds = segmentation.kinds.slice();
    const starts = segmentation.starts.slice();
    const lastPairIndex = texts.length - 1;
    for (let i = 0; i < lastPairIndex; i++) {
        // Pairs are tested against the working copies, so a cluster carried
        // into entry i + 1 is visible when that entry is examined next.
        if (!isCjkTextPair(kinds, texts, i)) {
            continue;
        }
        const split = splitTrailingForwardStickyCluster(texts[i]);
        if (split !== null) {
            texts[i] = split.head;
            texts[i + 1] = split.tail + texts[i + 1];
            starts[i + 1] = starts[i] + split.head.length;
        }
    }
    return { len: texts.length, texts, isWordLike, kinds, starts };
}
835
+ function buildMergedSegmentation(normalized, profile, whiteSpaceProfile) {
836
+ const wordSegmenter = getSharedWordSegmenter();
837
+ let mergedLen = 0;
838
+ const mergedTexts = [];
839
+ const mergedTextParts = [];
840
+ const mergedWordLike = [];
841
+ const mergedKinds = [];
842
+ const mergedStarts = [];
843
+ // Track repeatable single-char punctuation runs structurally so identical
844
+ // merges stay O(1) instead of re-scanning the accumulated segment each time.
845
+ const mergedSingleCharRunChars = [];
846
+ const mergedSingleCharRunLengths = [];
847
+ const mergedContainsCJK = [];
848
+ const mergedContainsArabicScript = [];
849
+ const mergedEndsWithClosingQuote = [];
850
+ const mergedEndsWithMyanmarMedialGlue = [];
851
+ const mergedHasArabicNoSpacePunctuation = [];
852
+ for (const s of wordSegmenter.segment(normalized)) {
853
+ for (const piece of splitSegmentByBreakKind(s.segment, s.isWordLike ?? false, s.index, whiteSpaceProfile)) {
854
+ const isText = piece.kind === 'text';
855
+ const repeatableSingleCharRunChar = getRepeatableSingleCharRunChar(piece.text, piece.isWordLike, piece.kind);
856
+ const pieceContainsCJK = isCJK(piece.text);
857
+ const pieceContainsArabicScript = containsArabicScript(piece.text);
858
+ const pieceLastCodePoint = getLastCodePoint(piece.text);
859
+ const pieceEndsWithClosingQuote = endsWithClosingQuote(piece.text);
860
+ const pieceEndsWithMyanmarMedialGlue = endsWithMyanmarMedialGlue(piece.text);
861
+ const prevIndex = mergedLen - 1;
862
+ function appendPieceToPrevious() {
863
+ if (mergedSingleCharRunChars[prevIndex] !== null) {
864
+ mergedTextParts[prevIndex] = [
865
+ materializeDeferredSingleCharRun(mergedTexts, mergedSingleCharRunChars, mergedSingleCharRunLengths, prevIndex),
866
+ ];
867
+ mergedSingleCharRunChars[prevIndex] = null;
868
+ }
869
+ mergedTextParts[prevIndex].push(piece.text);
870
+ mergedWordLike[prevIndex] = mergedWordLike[prevIndex] || piece.isWordLike;
871
+ mergedContainsCJK[prevIndex] = mergedContainsCJK[prevIndex] || pieceContainsCJK;
872
+ mergedContainsArabicScript[prevIndex] =
873
+ mergedContainsArabicScript[prevIndex] || pieceContainsArabicScript;
874
+ mergedEndsWithClosingQuote[prevIndex] = pieceEndsWithClosingQuote;
875
+ mergedEndsWithMyanmarMedialGlue[prevIndex] = pieceEndsWithMyanmarMedialGlue;
876
+ mergedHasArabicNoSpacePunctuation[prevIndex] = hasArabicNoSpacePunctuation(mergedContainsArabicScript[prevIndex], pieceLastCodePoint);
877
+ }
878
+ // First-pass keeps: no-space script-specific joins and punctuation glue
879
+ // that depend on the immediately preceding text run.
880
+ if (profile.carryCJKAfterClosingQuote &&
881
+ isText &&
882
+ mergedLen > 0 &&
883
+ mergedKinds[prevIndex] === 'text' &&
884
+ pieceContainsCJK &&
885
+ mergedContainsCJK[prevIndex] &&
886
+ mergedEndsWithClosingQuote[prevIndex]) {
887
+ appendPieceToPrevious();
888
+ }
889
+ else if (isText &&
890
+ mergedLen > 0 &&
891
+ mergedKinds[prevIndex] === 'text' &&
892
+ isCJKLineStartProhibitedSegment(piece.text) &&
893
+ mergedContainsCJK[prevIndex]) {
894
+ appendPieceToPrevious();
895
+ }
896
+ else if (isText &&
897
+ mergedLen > 0 &&
898
+ mergedKinds[prevIndex] === 'text' &&
899
+ mergedEndsWithMyanmarMedialGlue[prevIndex]) {
900
+ appendPieceToPrevious();
901
+ }
902
+ else if (isText &&
903
+ mergedLen > 0 &&
904
+ mergedKinds[prevIndex] === 'text' &&
905
+ piece.isWordLike &&
906
+ pieceContainsArabicScript &&
907
+ mergedHasArabicNoSpacePunctuation[prevIndex]) {
908
+ appendPieceToPrevious();
909
+ mergedWordLike[prevIndex] = true;
910
+ }
911
+ else if (repeatableSingleCharRunChar !== null &&
912
+ mergedLen > 0 &&
913
+ mergedKinds[prevIndex] === 'text' &&
914
+ mergedSingleCharRunChars[prevIndex] === repeatableSingleCharRunChar) {
915
+ mergedSingleCharRunLengths[prevIndex] = (mergedSingleCharRunLengths[prevIndex] ?? 1) + 1;
916
+ }
917
+ else if (isText &&
918
+ !piece.isWordLike &&
919
+ mergedLen > 0 &&
920
+ mergedKinds[prevIndex] === 'text' &&
921
+ !mergedContainsCJK[prevIndex] &&
922
+ (isLeftStickyPunctuationSegment(piece.text) ||
923
+ (piece.text === '-' && mergedWordLike[prevIndex]))) {
924
+ appendPieceToPrevious();
925
+ }
926
+ else {
927
+ mergedTexts[mergedLen] = piece.text;
928
+ mergedTextParts[mergedLen] = [piece.text];
929
+ mergedWordLike[mergedLen] = piece.isWordLike;
930
+ mergedKinds[mergedLen] = piece.kind;
931
+ mergedStarts[mergedLen] = piece.start;
932
+ mergedSingleCharRunChars[mergedLen] = repeatableSingleCharRunChar;
933
+ mergedSingleCharRunLengths[mergedLen] = repeatableSingleCharRunChar === null ? 0 : 1;
934
+ mergedContainsCJK[mergedLen] = pieceContainsCJK;
935
+ mergedContainsArabicScript[mergedLen] = pieceContainsArabicScript;
936
+ mergedEndsWithClosingQuote[mergedLen] = pieceEndsWithClosingQuote;
937
+ mergedEndsWithMyanmarMedialGlue[mergedLen] = pieceEndsWithMyanmarMedialGlue;
938
+ mergedHasArabicNoSpacePunctuation[mergedLen] = hasArabicNoSpacePunctuation(pieceContainsArabicScript, pieceLastCodePoint);
939
+ mergedLen++;
940
+ }
941
+ }
942
+ }
943
+ for (let i = 0; i < mergedLen; i++) {
944
+ if (mergedSingleCharRunChars[i] !== null) {
945
+ mergedTexts[i] = materializeDeferredSingleCharRun(mergedTexts, mergedSingleCharRunChars, mergedSingleCharRunLengths, i);
946
+ continue;
947
+ }
948
+ mergedTexts[i] = joinTextParts(mergedTextParts[i]);
949
+ }
950
+ // Later passes operate on the merged text stream itself: contextual escaped
951
+ // quote glue, forward-sticky carry, compaction, then the broader URL/numeric
952
+ // and Arabic-leading-mark fixes.
953
+ for (let i = 1; i < mergedLen; i++) {
954
+ if (mergedKinds[i] === 'text' &&
955
+ !mergedWordLike[i] &&
956
+ isEscapedQuoteClusterSegment(mergedTexts[i]) &&
957
+ mergedKinds[i - 1] === 'text' &&
958
+ !mergedContainsCJK[i - 1]) {
959
+ mergedTexts[i - 1] += mergedTexts[i];
960
+ mergedWordLike[i - 1] = mergedWordLike[i - 1] || mergedWordLike[i];
961
+ mergedTexts[i] = '';
962
+ }
963
+ }
964
+ const forwardStickyPrefixParts = Array.from({ length: mergedLen }, () => null);
965
+ let nextLiveIndex = -1;
966
+ for (let i = mergedLen - 1; i >= 0; i--) {
967
+ const text = mergedTexts[i];
968
+ if (text.length === 0)
969
+ continue;
970
+ if (mergedKinds[i] === 'text' &&
971
+ !mergedWordLike[i] &&
972
+ isForwardStickyClusterSegment(text) &&
973
+ nextLiveIndex >= 0 &&
974
+ mergedKinds[nextLiveIndex] === 'text') {
975
+ const prefixParts = forwardStickyPrefixParts[nextLiveIndex] ?? [];
976
+ prefixParts.push(text);
977
+ forwardStickyPrefixParts[nextLiveIndex] = prefixParts;
978
+ mergedStarts[nextLiveIndex] = mergedStarts[i];
979
+ mergedTexts[i] = '';
980
+ continue;
981
+ }
982
+ nextLiveIndex = i;
983
+ }
984
+ for (let i = 0; i < mergedLen; i++) {
985
+ const prefixParts = forwardStickyPrefixParts[i];
986
+ if (prefixParts == null)
987
+ continue;
988
+ mergedTexts[i] = joinReversedPrefixParts(prefixParts, mergedTexts[i]);
989
+ }
990
+ let compactLen = 0;
991
+ for (let read = 0; read < mergedLen; read++) {
992
+ const text = mergedTexts[read];
993
+ if (text.length === 0)
994
+ continue;
995
+ if (compactLen !== read) {
996
+ mergedTexts[compactLen] = text;
997
+ mergedWordLike[compactLen] = mergedWordLike[read];
998
+ mergedKinds[compactLen] = mergedKinds[read];
999
+ mergedStarts[compactLen] = mergedStarts[read];
1000
+ }
1001
+ compactLen++;
1002
+ }
1003
+ mergedTexts.length = compactLen;
1004
+ mergedWordLike.length = compactLen;
1005
+ mergedKinds.length = compactLen;
1006
+ mergedStarts.length = compactLen;
1007
+ const compacted = mergeGlueConnectedTextRuns({
1008
+ len: compactLen,
1009
+ texts: mergedTexts,
1010
+ isWordLike: mergedWordLike,
1011
+ kinds: mergedKinds,
1012
+ starts: mergedStarts,
1013
+ });
1014
+ const withMergedUrls = carryTrailingForwardStickyAcrossCJKBoundary(mergeAsciiPunctuationChains(splitHyphenatedNumericRuns(mergeNumericRuns(mergeUrlQueryRuns(mergeUrlLikeRuns(compacted))))));
1015
+ for (let i = 0; i < withMergedUrls.len - 1; i++) {
1016
+ const split = splitLeadingSpaceAndMarks(withMergedUrls.texts[i]);
1017
+ if (split === null)
1018
+ continue;
1019
+ if ((withMergedUrls.kinds[i] !== 'space' && withMergedUrls.kinds[i] !== 'preserved-space') ||
1020
+ withMergedUrls.kinds[i + 1] !== 'text' ||
1021
+ !containsArabicScript(withMergedUrls.texts[i + 1])) {
1022
+ continue;
1023
+ }
1024
+ withMergedUrls.texts[i] = split.space;
1025
+ withMergedUrls.isWordLike[i] = false;
1026
+ withMergedUrls.kinds[i] = withMergedUrls.kinds[i] === 'preserved-space' ? 'preserved-space' : 'space';
1027
+ withMergedUrls.texts[i + 1] = split.marks + withMergedUrls.texts[i + 1];
1028
+ withMergedUrls.starts[i + 1] = withMergedUrls.starts[i] + split.space.length;
1029
+ }
1030
+ return withMergedUrls;
1031
+ }
1032
/**
 * Partition a segmentation into chunks delimited by 'hard-break' segments.
 *
 * Each chunk is `{ startSegmentIndex, endSegmentIndex, consumedEndSegmentIndex }`:
 * `endSegmentIndex` is exclusive and stops before the hard break, while
 * `consumedEndSegmentIndex` also steps over the hard break that closed the chunk.
 *
 * @param {{len: number, kinds: string[]}} segmentation - Only `len` and `kinds` are read here.
 * @param {{preserveHardBreaks: boolean}} whiteSpaceProfile - Only `preserveHardBreaks` is read here.
 * @returns {Array<{startSegmentIndex: number, endSegmentIndex: number, consumedEndSegmentIndex: number}>}
 *   Empty for an empty segmentation; a single all-covering chunk when hard
 *   breaks are not preserved; otherwise one chunk per hard break plus a
 *   trailing chunk if any segments follow the last hard break.
 */
function compileAnalysisChunks(segmentation, whiteSpaceProfile) {
    const total = segmentation.len;
    if (total === 0) {
        return [];
    }
    const makeChunk = (from, to, consumedTo) => ({
        startSegmentIndex: from,
        endSegmentIndex: to,
        consumedEndSegmentIndex: consumedTo,
    });
    if (!whiteSpaceProfile.preserveHardBreaks) {
        return [makeChunk(0, total, total)];
    }
    const result = [];
    let chunkStart = 0;
    let cursor = 0;
    while (cursor < total) {
        if (segmentation.kinds[cursor] === 'hard-break') {
            // The break itself is consumed by the chunk but excluded from its content range.
            result.push(makeChunk(chunkStart, cursor, cursor + 1));
            chunkStart = cursor + 1;
        }
        cursor += 1;
    }
    // Segments after the final hard break form one last chunk; a trailing
    // hard break leaves nothing to emit.
    if (chunkStart < total) {
        result.push(makeChunk(chunkStart, total, total));
    }
    return result;
}
1063
/**
 * Rebuild a segmentation so that runs of adjacent 'text' segments containing
 * CJK are collapsed into a single 'text' segment.
 *
 * A run is a maximal sequence of consecutive 'text' segments for which
 * `canContinueKeepAllTextRun(previousText, breakAfterPunctuation)` keeps
 * allowing continuation. Runs in which no segment satisfies `isCJK` are
 * emitted unchanged, segment by segment. Non-'text' segments always end the
 * current run and are copied through as-is.
 *
 * @param {string} normalized - Source string that `segmentation.starts` index into;
 *   merged text is re-sliced from it.
 * @param {{len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[]}} segmentation
 * @param {*} breakAfterPunctuation - Passed straight through to `canContinueKeepAllTextRun`.
 * @returns {{len: number, texts: string[], isWordLike: boolean[], kinds: string[], starts: number[]}}
 *   The input object itself when it has at most one segment, otherwise a
 *   freshly built segmentation.
 */
function mergeKeepAllTextSegments(normalized, segmentation, breakAfterPunctuation) {
    const total = segmentation.len;
    if (total <= 1) {
        return segmentation;
    }
    const outTexts = [];
    const outWordLike = [];
    const outKinds = [];
    const outStarts = [];
    const emit = (text, wordLike, kind, start) => {
        outTexts.push(text);
        outWordLike.push(wordLike);
        outKinds.push(kind);
        outStarts.push(start);
    };
    // Copy one source segment through unchanged, as a 'text' segment.
    const emitSourceText = (index) => {
        emit(segmentation.texts[index], segmentation.isWordLike[index], 'text', segmentation.starts[index]);
    };
    // Emit segments [from, to) as one 'text' segment sliced out of `normalized`.
    const emitJoinedRun = (from, to) => {
        let anyWordLike = false;
        for (let k = from; k < to; k += 1) {
            if (!anyWordLike) {
                anyWordLike = segmentation.isWordLike[k];
            }
        }
        const sliceFrom = segmentation.starts[from];
        // The run ends where the next segment starts, or at the end of the string.
        const sliceTo = to < segmentation.len ? segmentation.starts[to] : normalized.length;
        emit(normalized.slice(sliceFrom, sliceTo), anyWordLike, 'text', sliceFrom);
    };
    let runStart = -1;
    let runHasCJK = false;
    // Flush the open run (if any) covering segments [runStart, end).
    const closeRun = (end) => {
        if (runStart < 0) {
            return;
        }
        if (runHasCJK && runStart + 1 !== end) {
            emitJoinedRun(runStart, end);
        }
        else {
            // Non-CJK runs, and single-segment CJK runs, keep their original segments.
            for (let k = runStart; k < end; k += 1) {
                emitSourceText(k);
            }
        }
        runStart = -1;
        runHasCJK = false;
    };
    for (let index = 0; index < total; index += 1) {
        const text = segmentation.texts[index];
        const kind = segmentation.kinds[index];
        if (kind !== 'text') {
            closeRun(index);
            emit(text, segmentation.isWordLike[index], kind, segmentation.starts[index]);
            continue;
        }
        // Inside an open run the previous segment is necessarily 'text'; ask
        // whether the run may extend past it.
        if (runStart >= 0 &&
            !canContinueKeepAllTextRun(segmentation.texts[index - 1], breakAfterPunctuation)) {
            closeRun(index);
        }
        if (runStart < 0) {
            runStart = index;
        }
        if (!runHasCJK) {
            runHasCJK = isCJK(text);
        }
    }
    closeRun(total);
    return {
        len: outTexts.length,
        texts: outTexts,
        isWordLike: outWordLike,
        kinds: outKinds,
        starts: outStarts,
    };
}
1136
/**
 * Normalize `text` under the given white-space mode, segment it, and attach
 * the chunk table computed from the final segmentation.
 *
 * @param {string} text - Raw input text.
 * @param {object} profile - Forwarded to `buildMergedSegmentation`; its
 *   `breakKeepAllAfterPunctuation` field is read when `wordBreak` is 'keep-all'.
 * @param {string} [whiteSpace='normal'] - Resolved through `getWhiteSpaceProfile`;
 *   a resulting mode of 'pre-wrap' selects the pre-wrap normalizer, anything
 *   else the normal one.
 * @param {string} [wordBreak='normal'] - 'keep-all' additionally runs
 *   `mergeKeepAllTextSegments` over the merged segmentation.
 * @returns {object} `{ normalized, chunks, ...segmentation }`; when the
 *   normalized text is empty, every array is empty and `len` is 0.
 */
export function analyzeText(text, profile, whiteSpace = 'normal', wordBreak = 'normal') {
    const whiteSpaceProfile = getWhiteSpaceProfile(whiteSpace);
    let normalized;
    if (whiteSpaceProfile.mode === 'pre-wrap') {
        normalized = normalizeWhitespacePreWrap(text);
    }
    else {
        normalized = normalizeWhitespaceNormal(text);
    }
    // Nothing left after normalization: skip segmentation entirely.
    if (normalized.length === 0) {
        return {
            normalized,
            chunks: [],
            len: 0,
            texts: [],
            isWordLike: [],
            kinds: [],
            starts: [],
        };
    }
    let segmentation = buildMergedSegmentation(normalized, profile, whiteSpaceProfile);
    if (wordBreak === 'keep-all') {
        segmentation = mergeKeepAllTextSegments(normalized, segmentation, profile.breakKeepAllAfterPunctuation);
    }
    // Chunks are computed against the final segmentation so their indices line up.
    const chunks = compileAnalysisChunks(segmentation, whiteSpaceProfile);
    return {
        normalized,
        chunks,
        ...segmentation,
    };
}
1162
+ //# sourceMappingURL=analysis.js.map