expo-pretext 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1208 @@
1
+ // Ported from chenglou/pretext src/analysis.ts
2
+ // Modified for expo-pretext: accepts pre-segmented data from native module
3
+ // instead of using Intl.Segmenter internally.
4
+
5
/** White-space handling mode; `getWhiteSpaceProfile` treats a missing value as 'normal'. */
export type WhiteSpaceMode = 'normal' | 'pre-wrap'
/** Word-break handling mode. */
export type WordBreakMode = 'normal' | 'keep-all'

/** Break classification carried by every analyzed piece of text. */
export type SegmentBreakKind =
  | 'text'
  | 'space' | 'preserved-space' | 'tab'
  | 'glue'
  | 'zero-width-break' | 'soft-hyphen'
  | 'hard-break'

/** One classified slice of an input segment; `start` is its UTF-16 offset. */
type SegmentationPiece = {
  text: string
  isWordLike: boolean
  kind: SegmentBreakKind
  start: number
}

/** Column-oriented segmentation: index `i` of each array describes segment `i`; `len` is the segment count. */
export type MergedSegmentation = {
  len: number
  texts: string[]
  isWordLike: boolean[]
  kinds: SegmentBreakKind[]
  starts: number[]
}

/**
 * A range of segment indices. `compileAnalysisChunks` sets
 * `consumedEndSegmentIndex` one past a terminating hard break, or equal to
 * `endSegmentIndex` when there is none.
 */
export type AnalysisChunk = {
  startSegmentIndex: number
  endSegmentIndex: number
  consumedEndSegmentIndex: number
}

/** Merged segmentation plus the normalized source text and its chunks. */
export type TextAnalysis = MergedSegmentation & { normalized: string, chunks: AnalysisChunk[] }

/** Switches that tune merging; see `buildMergedSegmentation` for how the flag is consumed. */
export type AnalysisProfile = {
  carryCJKAfterClosingQuote: boolean
}
44
+
45
// Runs of ASCII whitespace that collapse to a single space in 'normal' mode.
const collapsibleWhitespaceRunRe = /[ \t\n\r\f]+/g
// Cheap pre-check: matches only when the text actually needs normalizing.
const needsWhitespaceNormalizationRe = /[\t\n\r\f]| {2,}|^ | $/

/** Flags derived from a WhiteSpaceMode. */
type WhiteSpaceProfile = {
  mode: WhiteSpaceMode
  preserveOrdinarySpaces: boolean
  preserveHardBreaks: boolean
}

/**
 * Expands a white-space mode into its flags. A missing mode means 'normal'.
 * Both preserve flags are on for 'pre-wrap' and off otherwise.
 */
function getWhiteSpaceProfile(whiteSpace?: WhiteSpaceMode): WhiteSpaceProfile {
  const mode: WhiteSpaceMode = whiteSpace ?? 'normal'
  const preserves = mode === 'pre-wrap'
  return { mode, preserveOrdinarySpaces: preserves, preserveHardBreaks: preserves }
}
60
+
61
/**
 * Normalizes text for 'normal' white-space mode: every run of space, tab,
 * newline, carriage return or form feed becomes one space, and a single
 * leading / trailing space is dropped. Text that needs no change is returned
 * as-is.
 */
export function normalizeWhitespaceNormal(text: string): string {
  if (!needsWhitespaceNormalizationRe.test(text)) return text

  let result = text.replace(collapsibleWhitespaceRunRe, ' ')
  // After collapsing there is at most one space at either edge.
  if (result.startsWith(' ')) result = result.substring(1)
  if (result.endsWith(' ')) result = result.substring(0, result.length - 1)
  return result
}
73
+
74
/**
 * Normalizes line terminators for 'pre-wrap' mode: CRLF, a lone CR and a
 * form feed each become a single '\n'. Everything else is left untouched.
 */
function normalizeWhitespacePreWrap(text: string): string {
  // `\r\n` is listed first so a CRLF pair yields one newline, not two.
  return text.replace(/\r\n|[\r\f]/g, '\n')
}
80
+
81
+ // In expo-pretext, segmentation happens on the native side.
82
+ // These functions are kept as no-ops for API compatibility.
83
+
84
/**
 * Intentionally does nothing. Kept so existing callers keep working; the
 * segmenter lifecycle is handled by the native module.
 */
export function clearAnalysisCaches(): void {}
87
+
88
/**
 * Intentionally does nothing. Kept so existing callers keep working; the
 * locale is supplied on the native side when segmenting.
 */
export function setAnalysisLocale(_locale?: string): void {}
91
+
92
// Unicode property tests shared by the helpers in this file.
const arabicScriptRe = /\p{Script=Arabic}/u // any Arabic-script code point
const combiningMarkRe = /\p{M}/u // any combining mark
const decimalDigitRe = /\p{Nd}/u // any decimal digit

/** Returns true when `text` holds at least one Arabic-script code point. */
function containsArabicScript(text: string): boolean {
  return text.search(arabicScriptRe) !== -1
}
99
+
100
// Inclusive [low, high] code point ranges accepted by isCJKCodePoint,
// stored as a flat list of pairs.
const cjkCodePointRanges: readonly number[] = [
  0x4E00, 0x9FFF,
  0x3400, 0x4DBF,
  0x20000, 0x2A6DF,
  0x2A700, 0x2B73F,
  0x2B740, 0x2B81F,
  0x2B820, 0x2CEAF,
  0x2CEB0, 0x2EBEF,
  0x2EBF0, 0x2EE5D,
  0x2F800, 0x2FA1F,
  0x30000, 0x3134F,
  0x31350, 0x323AF,
  0x323B0, 0x33479,
  0xF900, 0xFAFF,
  0x3000, 0x303F,
  0x3040, 0x309F,
  0x30A0, 0x30FF,
  0xAC00, 0xD7AF,
  0xFF00, 0xFFEF,
]

/** Returns true when `codePoint` lies inside one of the ranges listed above. */
function isCJKCodePoint(codePoint: number): boolean {
  for (let k = 0; k < cjkCodePointRanges.length; k += 2) {
    if (codePoint >= cjkCodePointRanges[k]! && codePoint <= cjkCodePointRanges[k + 1]!) {
      return true
    }
  }
  return false
}
122
+
123
/**
 * Returns true when any code point of `s` satisfies `isCJKCodePoint`.
 * Surrogate pairs are decoded to one code point; an unpaired surrogate is
 * tested as its raw code unit.
 */
export function isCJK(s: string): boolean {
  let index = 0
  while (index < s.length) {
    const codePoint = s.codePointAt(index)!
    // Astral code points occupy two UTF-16 units.
    index += codePoint > 0xFFFF ? 2 : 1
    // Nothing below U+3000 is in the accepted ranges; skip the range scan.
    if (codePoint < 0x3000) continue
    if (isCJKCodePoint(codePoint)) return true
  }
  return false
}
142
+
143
/**
 * Returns true when the final code point of `text` is a member of
 * `kinsokuStart` or `leftStickyPunctuation`. Empty text yields false.
 */
function endsWithLineStartProhibitedText(text: string): boolean {
  const codePoints = Array.from(text)
  if (codePoints.length === 0) return false
  const tail = codePoints[codePoints.length - 1]!
  return kinsokuStart.has(tail) || leftStickyPunctuation.has(tail)
}
148
+
149
// Glue characters checked by endsWithKeepAllGlueText:
// U+00A0 no-break space, U+202F narrow no-break space,
// U+2060 word joiner, U+FEFF zero width no-break space.
const keepAllGlueChars = new Set<string>(['\u00A0', '\u202F', '\u2060', '\uFEFF'])
155
+
156
/** Thin alias for `isCJK`: true when `text` contains a CJK code point. */
function containsCJKText(text: string): boolean {
  const hasCJK = isCJK(text)
  return hasCJK
}
159
+
160
/**
 * Returns true when the final code point of `text` is one of
 * `keepAllGlueChars`. Empty text yields false.
 */
function endsWithKeepAllGlueText(text: string): boolean {
  const codePoints = Array.from(text)
  if (codePoints.length === 0) return false
  return keepAllGlueChars.has(codePoints[codePoints.length - 1]!)
}
165
+
166
/**
 * Returns true unless `previousText` ends with a line-start-prohibited
 * character or with a keep-all glue character.
 */
export function canContinueKeepAllTextRun(previousText: string): boolean {
  const blocked =
    endsWithLineStartProhibitedText(previousText) ||
    endsWithKeepAllGlueText(previousText)
  return !blocked
}
172
+
173
// Characters the line-start-prohibited checks in this file look for.
export const kinsokuStart = new Set<string>([
  // Fullwidth , . ! : ; ?
  '\uFF0C', '\uFF0E', '\uFF01', '\uFF1A', '\uFF1B', '\uFF1F',
  // Ideographic comma and full stop, katakana middle dot
  '\u3001', '\u3002', '\u30FB',
  // Closing brackets
  '\uFF09', '\u3015', '\u3009', '\u300B', '\u300D', '\u300F',
  '\u3011', '\u3017', '\u3019', '\u301B',
  // Prolonged sound mark and iteration marks
  '\u30FC', '\u3005', '\u303B', '\u309D', '\u309E', '\u30FD', '\u30FE',
])

// Opening quotes and brackets; consulted by the forward-sticky helpers.
export const kinsokuEnd = new Set<string>([
  '"', '(', '[', '{',
  '\u201C', '\u2018', '\u00AB', '\u2039',
  '\uFF08', '\u3014', '\u3008', '\u300A', '\u300C',
  '\u300E', '\u3010', '\u3016', '\u3018', '\u301A',
])

// Apostrophe and right single quotation mark.
const forwardStickyGlue = new Set<string>(["'", '\u2019'])

// Punctuation that the merge passes attach to the preceding text.
export const leftStickyPunctuation = new Set<string>([
  '.', ',', '!', '?', ':', ';',
  // Arabic comma, semicolon, question mark
  '\u060C', '\u061B', '\u061F',
  // Devanagari danda and double danda
  '\u0964', '\u0965',
  // Myanmar punctuation
  '\u104A', '\u104B', '\u104C', '\u104D', '\u104F',
  ')', ']', '}',
  '%',
  '"',
  '\u201D', '\u2019', '\u00BB', '\u203A',
  // Horizontal ellipsis
  '\u2026',
])

// Final characters accepted by endsWithArabicNoSpacePunctuation.
const arabicNoSpaceTrailingPunctuation = new Set<string>([':', '.', '\u060C', '\u061B'])

// Final characters accepted by endsWithMyanmarMedialGlue.
const myanmarMedialGlue = new Set<string>(['\u104F'])

// Closing quotes and brackets recognised by endsWithClosingQuote.
const closingQuoteChars = new Set<string>([
  '\u201D', '\u2019', '\u00BB', '\u203A',
  '\u300D', '\u300F', '\u3011', '\u300B', '\u3009', '\u3015', '\uFF09',
])
262
+
263
/**
 * Returns true when `segment` is an escaped-quote cluster, or when it is
 * made only of left-sticky punctuation, optionally followed by combining
 * marks once some punctuation has been seen.
 */
function isLeftStickyPunctuationSegment(segment: string): boolean {
  if (isEscapedQuoteClusterSegment(segment)) return true

  let punctuationSeen = false
  for (const codePoint of segment) {
    if (leftStickyPunctuation.has(codePoint)) {
      punctuationSeen = true
    } else if (!punctuationSeen || !combiningMarkRe.test(codePoint)) {
      // A combining mark is tolerated only after punctuation.
      return false
    }
  }
  return punctuationSeen
}
276
+
277
/**
 * Returns true when `segment` is non-empty and every code point belongs to
 * `kinsokuStart` or `leftStickyPunctuation`.
 */
function isCJKLineStartProhibitedSegment(segment: string): boolean {
  if (segment.length === 0) return false
  for (const codePoint of segment) {
    const prohibited = kinsokuStart.has(codePoint) || leftStickyPunctuation.has(codePoint)
    if (!prohibited) return false
  }
  return true
}
283
+
284
/**
 * Returns true when `segment` is an escaped-quote cluster, or is non-empty
 * and made only of `kinsokuEnd` characters, `forwardStickyGlue` characters
 * and combining marks.
 */
function isForwardStickyClusterSegment(segment: string): boolean {
  if (isEscapedQuoteClusterSegment(segment)) return true
  if (segment.length === 0) return false
  for (const codePoint of segment) {
    const allowed =
      kinsokuEnd.has(codePoint) ||
      forwardStickyGlue.has(codePoint) ||
      combiningMarkRe.test(codePoint)
    if (!allowed) return false
  }
  return true
}
291
+
292
/**
 * Returns true when `segment` holds at least one character from
 * `kinsokuEnd`, `leftStickyPunctuation` or `forwardStickyGlue`, and
 * otherwise only backslashes and combining marks.
 */
function isEscapedQuoteClusterSegment(segment: string): boolean {
  let quoteSeen = false
  for (const codePoint of segment) {
    const ignorable = codePoint === '\\' || combiningMarkRe.test(codePoint)
    if (ignorable) continue

    const quoteLike =
      kinsokuEnd.has(codePoint) ||
      leftStickyPunctuation.has(codePoint) ||
      forwardStickyGlue.has(codePoint)
    if (!quoteLike) return false
    quoteSeen = true
  }
  return quoteSeen
}
304
+
305
/**
 * Returns the UTF-16 index where the code point ending just before `end`
 * begins. A low surrogate preceded by a high surrogate counts as one code
 * point; the result is never negative.
 */
function previousCodePointStart(text: string, end: number): number {
  const lastIndex = end - 1
  if (lastIndex <= 0) return Math.max(lastIndex, 0)

  const unit = text.charCodeAt(lastIndex)
  const outsideLowSurrogates = unit < 0xDC00 || unit > 0xDFFF
  if (outsideLowSurrogates) return lastIndex

  const candidate = lastIndex - 1
  if (candidate < 0) return lastIndex

  const before = text.charCodeAt(candidate)
  if (before >= 0xD800 && before <= 0xDBFF) return candidate
  return lastIndex
}
318
+
319
/** Returns the final code point of `text` as a string, or null when `text` is empty. */
function getLastCodePoint(text: string): string | null {
  return text.length === 0
    ? null
    : text.slice(previousCodePointStart(text, text.length))
}
324
+
325
/**
 * Splits `text` into a head and a trailing run made of `kinsokuEnd`
 * characters, `forwardStickyGlue` characters and combining marks. Returns
 * null when there is no such trailing run or when it spans the whole text.
 */
function splitTrailingForwardStickyCluster(text: string): { head: string, tail: string } | null {
  const codePoints = [...text]

  // Walk backwards while the character in front of the cut is sticky.
  let cut = codePoints.length
  for (; cut > 0; cut--) {
    const ch = codePoints[cut - 1]!
    const sticky = combiningMarkRe.test(ch) || kinsokuEnd.has(ch) || forwardStickyGlue.has(ch)
    if (!sticky) break
  }

  const nothingToSplit = cut <= 0 || cut === codePoints.length
  if (nothingToSplit) return null

  const head = codePoints.slice(0, cut).join('')
  const tail = codePoints.slice(cut).join('')
  return { head, tail }
}
348
+
349
/** Returns true when `segment` is non-empty and each of its code points equals `ch`. */
function isRepeatedSingleCharRun(segment: string, ch: string): boolean {
  return segment.length > 0 && [...segment].every(codePoint => codePoint === ch)
}
356
+
357
/**
 * Returns true when `segment` contains Arabic script and its final code
 * point is one of `arabicNoSpaceTrailingPunctuation`.
 */
function endsWithArabicNoSpacePunctuation(segment: string): boolean {
  if (segment.length === 0 || !containsArabicScript(segment)) return false
  const tail = getLastCodePoint(segment)
  return tail === null ? false : arabicNoSpaceTrailingPunctuation.has(tail)
}
362
+
363
/** Returns true when the final code point of `segment` is in `myanmarMedialGlue`. */
function endsWithMyanmarMedialGlue(segment: string): boolean {
  const tail = getLastCodePoint(segment)
  if (tail === null) return false
  return myanmarMedialGlue.has(tail)
}
367
+
368
/**
 * Recognises a segment made of one ASCII space followed only by combining
 * marks and returns the two parts; any other segment yields null.
 */
function splitLeadingSpaceAndMarks(segment: string): { space: string, marks: string } | null {
  if (segment.length < 2 || !segment.startsWith(' ')) return null
  const marks = segment.substring(1)
  return /^\p{M}+$/u.test(marks) ? { space: ' ', marks } : null
}
376
+
377
/**
 * Scans `text` backwards one code point at a time. Returns true on reaching
 * a member of `closingQuoteChars`; left-sticky punctuation is skipped over,
 * and any other character (or running out of text) yields false.
 */
export function endsWithClosingQuote(text: string): boolean {
  for (let end = text.length; end > 0;) {
    const start = previousCodePointStart(text, end)
    const codePoint = text.slice(start, end)
    if (closingQuoteChars.has(codePoint)) return true
    if (!leftStickyPunctuation.has(codePoint)) return false
    end = start
  }
  return false
}
388
+
389
/**
 * Maps one character to its break kind. When either preserve flag of the
 * profile is set, a space becomes 'preserved-space' and a tab becomes 'tab';
 * a newline becomes 'hard-break' only when hard breaks are preserved.
 * Anything not listed is 'text'.
 */
function classifySegmentBreakChar(ch: string, whiteSpaceProfile: WhiteSpaceProfile): SegmentBreakKind {
  const preserving = whiteSpaceProfile.preserveOrdinarySpaces || whiteSpaceProfile.preserveHardBreaks
  switch (ch) {
    case ' ':
      return preserving ? 'preserved-space' : 'space'
    case '\t':
      return preserving ? 'tab' : 'text'
    case '\n':
      return whiteSpaceProfile.preserveHardBreaks ? 'hard-break' : 'text'
    case '\u00A0':
    case '\u202F':
    case '\u2060':
    case '\uFEFF':
      return 'glue'
    case '\u200B':
      return 'zero-width-break'
    case '\u00AD':
      return 'soft-hyphen'
    default:
      return 'text'
  }
}
403
+
404
// Matches every character that classifySegmentBreakChar can map to a kind
// other than 'text'.
const breakCharRe = /[\x20\t\n\xA0\xAD\u200B\u202F\u2060\uFEFF]/

/** Concatenates `parts`; a single-element array returns that element directly. */
function joinTextParts(parts: string[]): string {
  if (parts.length === 1) return parts[0]!
  return parts.join('')
}
410
+
411
/**
 * Cuts one segment into pieces so that each piece has a single break kind
 * and a single word-likeness. A segment with no break character is
 * returned whole as one 'text' piece. Only 'text' pieces can be word-like.
 *
 * @param segment text of the segment
 * @param isWordLike word-likeness reported for the whole segment
 * @param start UTF-16 offset of the segment; piece offsets are derived from it
 * @param whiteSpaceProfile profile used to classify each character
 */
function splitSegmentByBreakKind(
  segment: string,
  isWordLike: boolean,
  start: number,
  whiteSpaceProfile: WhiteSpaceProfile,
): SegmentationPiece[] {
  if (!breakCharRe.test(segment)) {
    return [{ text: segment, isWordLike, kind: 'text', start }]
  }

  const pieces: SegmentationPiece[] = []
  let runKind: SegmentBreakKind | null = null
  let runWordLike = false
  let runStart = start
  let runChars: string[] = []
  let cursor = start

  // Emits the run collected so far, if any.
  const flushRun = (): void => {
    if (runKind === null) return
    pieces.push({
      text: joinTextParts(runChars),
      isWordLike: runWordLike,
      kind: runKind,
      start: runStart,
    })
  }

  for (const ch of segment) {
    const kind = classifySegmentBreakChar(ch, whiteSpaceProfile)
    const wordLike = kind === 'text' && isWordLike

    const startsNewRun = runKind === null || kind !== runKind || wordLike !== runWordLike
    if (startsNewRun) {
      flushRun()
      runKind = kind
      runWordLike = wordLike
      runStart = cursor
      runChars = []
    }

    runChars.push(ch)
    // `ch` may be a surrogate pair, so advance by its UTF-16 length.
    cursor += ch.length
  }
  flushRun()

  return pieces
}
465
+
466
/**
 * Returns true for the kinds that end a contiguous text run: ordinary and
 * preserved spaces, zero-width breaks and hard breaks.
 */
function isTextRunBoundary(kind: SegmentBreakKind): boolean {
  return (
    kind === 'space' ||
    kind === 'preserved-space' ||
    kind === 'zero-width-break' ||
    kind === 'hard-break'
  )
}

// A whole segment that looks like a URL scheme followed by a colon, e.g. "https:".
const urlSchemeSegmentRe = /^[A-Za-z][A-Za-z0-9+.-]*:$/

/**
 * Returns true when the segment at `index` opens a URL-like run: it starts
 * with "www.", or it is a scheme segment immediately followed by a 'text'
 * segment equal to "//".
 */
function isUrlLikeRunStart(segmentation: MergedSegmentation, index: number): boolean {
  const text = segmentation.texts[index]!
  if (text.startsWith('www.')) return true
  return (
    urlSchemeSegmentRe.test(text) &&
    index + 1 < segmentation.len &&
    segmentation.kinds[index + 1] === 'text' &&
    segmentation.texts[index + 1] === '//'
  )
}

/** Returns true when `text` contains '?' and also contains "://" or starts with "www.". */
function isUrlQueryBoundarySegment(text: string): boolean {
  return text.includes('?') && (text.includes('://') || text.startsWith('www.'))
}

/**
 * Merges each URL-like run into its first segment. Starting at a URL-like
 * segment, following segments are appended until a text-run boundary is
 * reached or a segment containing '?' has been appended. Segments that were
 * merged away are removed from the result.
 *
 * Segments consumed by a merge are skipped by the outer scan. Without the
 * skip, an emptied "www." slot inside an already merged "scheme://www.host?"
 * run was treated as a new URL start and absorbed the following query
 * pieces, which then carried the host's start offset instead of their own.
 */
function mergeUrlLikeRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts = segmentation.texts.slice()
  const isWordLike = segmentation.isWordLike.slice()
  const kinds = segmentation.kinds.slice()
  const starts = segmentation.starts.slice()

  for (let i = 0; i < segmentation.len; i++) {
    if (kinds[i] !== 'text' || !isUrlLikeRunStart(segmentation, i)) continue

    const mergedParts = [texts[i]!]
    let j = i + 1
    while (j < segmentation.len && !isTextRunBoundary(kinds[j]!)) {
      const part = texts[j]!
      mergedParts.push(part)
      isWordLike[i] = true
      // Mark the slot as consumed; the compaction pass below drops empty texts.
      kinds[j] = 'text'
      texts[j] = ''
      j++
      // The query itself is handled separately, so stop after the '?' segment.
      if (part.includes('?')) break
    }
    texts[i] = mergedParts.join('')

    // Resume after the consumed segments so none of them is re-examined.
    i = j - 1
  }

  // Compact in place, dropping the emptied slots.
  let compactLen = 0
  for (let read = 0; read < texts.length; read++) {
    const text = texts[read]!
    if (text.length === 0) continue
    if (compactLen !== read) {
      texts[compactLen] = text
      isWordLike[compactLen] = isWordLike[read]!
      kinds[compactLen] = kinds[read]!
      starts[compactLen] = starts[read]!
    }
    compactLen++
  }

  texts.length = compactLen
  isWordLike.length = compactLen
  kinds.length = compactLen
  starts.length = compactLen

  return {
    len: compactLen,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
541
+
542
/**
 * After a URL segment that ends its query prefix (see
 * `isUrlQueryBoundarySegment`), joins every following segment up to the
 * next text-run boundary into one word-like 'text' segment. All other
 * segments are copied unchanged.
 */
function mergeUrlQueryRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  const emit = (text: string, wordLike: boolean, kind: SegmentBreakKind, start: number): void => {
    texts.push(text)
    isWordLike.push(wordLike)
    kinds.push(kind)
    starts.push(start)
  }

  let cursor = 0
  while (cursor < segmentation.len) {
    const text = segmentation.texts[cursor]!
    emit(text, segmentation.isWordLike[cursor]!, segmentation.kinds[cursor]!, segmentation.starts[cursor]!)
    cursor++

    if (!isUrlQueryBoundarySegment(text)) continue

    // Find the end of the run that follows the boundary segment.
    let runEnd = cursor
    while (runEnd < segmentation.len && !isTextRunBoundary(segmentation.kinds[runEnd]!)) {
      runEnd++
    }
    if (runEnd === cursor) continue

    emit(
      joinTextParts(segmentation.texts.slice(cursor, runEnd)),
      true,
      'text',
      segmentation.starts[cursor]!,
    )
    cursor = runEnd
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
590
+
591
// Characters that may appear alongside digits inside a numeric run:
// : - / multiplication sign , . + en dash and em dash.
const numericJoinerChars = new Set<string>([
  ':', '-', '/', '\u00D7', ',', '.', '+', '\u2013', '\u2014',
])

// ASCII word characters optionally followed by trailing , : ; joiners.
const asciiPunctuationChainSegmentRe = /^[A-Za-z0-9_]+[,:;]*$/
// The trailing , : ; joiners themselves.
const asciiPunctuationChainTrailingJoinersRe = /[,:;]+$/
599
+
600
/** Returns true when `text` contains at least one Unicode decimal digit. */
function segmentContainsDecimalDigit(text: string): boolean {
  // The regex is not global, so a whole-string test carries no state.
  return decimalDigitRe.test(text)
}
606
+
607
/**
 * Returns true when `text` is non-empty and every code point is either a
 * Unicode decimal digit or one of `numericJoinerChars`.
 */
export function isNumericRunSegment(text: string): boolean {
  if (text.length === 0) return false
  return [...text].every(
    codePoint => decimalDigitRe.test(codePoint) || numericJoinerChars.has(codePoint),
  )
}
615
+
616
/**
 * Joins numeric runs into single word-like 'text' segments. A run starts at
 * a 'text' segment that is numeric and contains a digit, and extends over
 * the following 'text' segments that are numeric (digits or joiners).
 * Everything else is copied unchanged.
 */
function mergeNumericRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  let cursor = 0
  while (cursor < segmentation.len) {
    const text = segmentation.texts[cursor]!
    const kind = segmentation.kinds[cursor]!
    const opensRun =
      kind === 'text' && isNumericRunSegment(text) && segmentContainsDecimalDigit(text)

    if (!opensRun) {
      texts.push(text)
      isWordLike.push(segmentation.isWordLike[cursor]!)
      kinds.push(kind)
      starts.push(segmentation.starts[cursor]!)
      cursor++
      continue
    }

    let runEnd = cursor + 1
    while (
      runEnd < segmentation.len &&
      segmentation.kinds[runEnd] === 'text' &&
      isNumericRunSegment(segmentation.texts[runEnd]!)
    ) {
      runEnd++
    }

    texts.push(joinTextParts(segmentation.texts.slice(cursor, runEnd)))
    isWordLike.push(true)
    kinds.push('text')
    starts.push(segmentation.starts[cursor]!)
    cursor = runEnd
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
660
+
661
/**
 * Joins chains of word-like ASCII segments linked by trailing , : ;
 * joiners. A chain keeps growing only while the most recently added
 * segment ends with such joiners and the next segment is a word-like
 * 'text' segment matching the chain pattern. A matching segment is emitted
 * as word-like 'text' even when nothing is appended to it.
 */
function mergeAsciiPunctuationChains(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  let cursor = 0
  while (cursor < segmentation.len) {
    const text = segmentation.texts[cursor]!
    const kind = segmentation.kinds[cursor]!
    const wordLike = segmentation.isWordLike[cursor]!
    const opensChain = kind === 'text' && wordLike && asciiPunctuationChainSegmentRe.test(text)

    if (!opensChain) {
      texts.push(text)
      isWordLike.push(wordLike)
      kinds.push(kind)
      starts.push(segmentation.starts[cursor]!)
      cursor++
      continue
    }

    let chainEnd = cursor + 1
    let tailHasJoiners = asciiPunctuationChainTrailingJoinersRe.test(text)
    while (
      tailHasJoiners &&
      chainEnd < segmentation.len &&
      segmentation.kinds[chainEnd] === 'text' &&
      segmentation.isWordLike[chainEnd] &&
      asciiPunctuationChainSegmentRe.test(segmentation.texts[chainEnd]!)
    ) {
      tailHasJoiners = asciiPunctuationChainTrailingJoinersRe.test(segmentation.texts[chainEnd]!)
      chainEnd++
    }

    texts.push(joinTextParts(segmentation.texts.slice(cursor, chainEnd)))
    isWordLike.push(true)
    kinds.push('text')
    starts.push(segmentation.starts[cursor]!)
    cursor = chainEnd
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
712
+
713
/**
 * Splits a 'text' segment at its '-' characters when every part between
 * hyphens is non-empty, numeric and contains a digit. Each part except the
 * last keeps its trailing hyphen, and every resulting piece is a word-like
 * 'text' segment with its own start offset. Other segments are copied
 * unchanged.
 */
function splitHyphenatedNumericRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  for (let i = 0; i < segmentation.len; i++) {
    const text = segmentation.texts[i]!
    const kind = segmentation.kinds[i]!

    const parts = kind === 'text' && text.includes('-') ? text.split('-') : null
    const splittable =
      parts !== null &&
      parts.length > 1 &&
      parts.every(
        part => part.length > 0 && segmentContainsDecimalDigit(part) && isNumericRunSegment(part),
      )

    if (parts === null || !splittable) {
      texts.push(text)
      isWordLike.push(segmentation.isWordLike[i]!)
      kinds.push(kind)
      starts.push(segmentation.starts[i]!)
      continue
    }

    let offset = 0
    parts.forEach((part, partIndex) => {
      const pieceText = partIndex < parts.length - 1 ? `${part}-` : part
      texts.push(pieceText)
      isWordLike.push(true)
      kinds.push('text')
      starts.push(segmentation.starts[i]! + offset)
      offset += pieceText.length
    })
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
765
+
766
/**
 * Fuses 'glue' segments with the 'text' segments around them:
 *  - leading glue followed by text becomes part of that text segment;
 *  - glue after a text segment is appended to it, together with the text
 *    segment that follows the glue, if any;
 *  - a glue run with no text on either side is emitted as one non-word-like
 *    'glue' segment.
 * Segments of other kinds are copied unchanged.
 */
function mergeGlueConnectedTextRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  const total = segmentation.len
  let cursor = 0

  // Consumes the glue run at `cursor` and returns its joined text.
  const takeGlueRun = (): string => {
    const from = cursor
    while (cursor < total && segmentation.kinds[cursor] === 'glue') cursor++
    return joinTextParts(segmentation.texts.slice(from, cursor))
  }
  const atText = (): boolean => cursor < total && segmentation.kinds[cursor] === 'text'
  const emit = (text: string, wordLike: boolean, kind: SegmentBreakKind, start: number): void => {
    texts.push(text)
    isWordLike.push(wordLike)
    kinds.push(kind)
    starts.push(start)
  }

  while (cursor < total) {
    let kind = segmentation.kinds[cursor]!
    let wordLike = segmentation.isWordLike[cursor]!
    const start = segmentation.starts[cursor]!
    const parts: string[] = []

    if (kind === 'glue') {
      const leadingGlue = takeGlueRun()
      if (!atText()) {
        // Nothing to attach to: keep the glue as its own segment.
        emit(leadingGlue, false, 'glue', start)
        continue
      }
      // The fused segment starts at the glue and takes the text's word-likeness.
      parts.push(leadingGlue, segmentation.texts[cursor]!)
      wordLike = segmentation.isWordLike[cursor]!
      kind = 'text'
      cursor++
    } else {
      parts.push(segmentation.texts[cursor]!)
      cursor++
    }

    if (kind === 'text') {
      while (cursor < total && segmentation.kinds[cursor] === 'glue') {
        parts.push(takeGlueRun())
        if (atText()) {
          parts.push(segmentation.texts[cursor]!)
          wordLike = wordLike || segmentation.isWordLike[cursor]!
          cursor++
        }
      }
    }

    emit(joinTextParts(parts), wordLike, kind, start)
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
841
+
842
/**
 * For each pair of adjacent 'text' segments that both contain CJK, moves a
 * trailing forward-sticky cluster (see `splitTrailingForwardStickyCluster`)
 * from the end of the left segment to the front of the right one and moves
 * the right segment's start offset back accordingly. Works on copies of
 * the input arrays.
 */
function carryTrailingForwardStickyAcrossCJKBoundary(segmentation: MergedSegmentation): MergedSegmentation {
  const texts = segmentation.texts.slice()
  const isWordLike = segmentation.isWordLike.slice()
  const kinds = segmentation.kinds.slice()
  const starts = segmentation.starts.slice()

  for (let left = 0, right = 1; right < texts.length; left++, right++) {
    if (kinds[left] !== 'text' || kinds[right] !== 'text') continue

    const leftText = texts[left]!
    const rightText = texts[right]!
    if (!(isCJK(leftText) && isCJK(rightText))) continue

    const split = splitTrailingForwardStickyCluster(leftText)
    if (split === null) continue

    texts[left] = split.head
    texts[right] = split.tail + rightText
    starts[right] = starts[left]! + split.head.length
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
868
+
869
+
870
// Modified for expo-pretext: the segments and their word-likeness arrive
// pre-computed (from the native module) instead of being produced by
// Intl.Segmenter inside this function.
/**
 * Builds the merged segmentation for a text.
 *
 * 1. Each incoming segment is split by break kind, and every 'text' piece
 *    is either attached to the previous 'text' segment (CJK closing-quote
 *    carry, line-start-prohibited punctuation, Myanmar medial glue, Arabic
 *    no-space punctuation, repeated single characters, left-sticky
 *    punctuation or a hyphen after a word) or stored as a new segment.
 * 2. Non-word-like escaped-quote clusters are folded into the preceding
 *    text, and non-word-like forward-sticky clusters into the following
 *    text.
 * 3. Emptied slots are compacted away and the glue, URL, numeric, ASCII
 *    chain and CJK carry passes run in sequence.
 * 4. Combining marks stuck to a space in front of Arabic text are moved
 *    onto that text.
 *
 * `normalized` is accepted for signature compatibility and is not read.
 */
function buildMergedSegmentation(
  normalized: string,
  segments: string[],
  segmentIsWordLike: boolean[],
  profile: AnalysisProfile,
  whiteSpaceProfile: WhiteSpaceProfile,
): MergedSegmentation {
  const texts: string[] = []
  const wordLikes: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []
  let count = 0

  // The native module supplies segment texts only, so offsets are rebuilt
  // by accumulating segment lengths.
  let runningOffset = 0
  for (const [segmentIndex, segment] of segments.entries()) {
    const segmentWordLike = segmentIsWordLike[segmentIndex] ?? false
    const segmentStart = runningOffset
    runningOffset += segment.length

    for (const piece of splitSegmentByBreakKind(segment, segmentWordLike, segmentStart, whiteSpaceProfile)) {
      const prev = count - 1
      const followsText = piece.kind === 'text' && prev >= 0 && kinds[prev] === 'text'

      if (followsText) {
        const prevText = texts[prev]!

        // CJK text directly after a closing quote stays with the quote.
        if (
          profile.carryCJKAfterClosingQuote &&
          isCJK(piece.text) &&
          isCJK(prevText) &&
          endsWithClosingQuote(prevText)
        ) {
          texts[prev] = prevText + piece.text
          wordLikes[prev] = wordLikes[prev]! || piece.isWordLike
          continue
        }

        // Line-start-prohibited punctuation stays with the CJK text before it.
        if (isCJKLineStartProhibitedSegment(piece.text) && isCJK(prevText)) {
          texts[prev] = prevText + piece.text
          wordLikes[prev] = wordLikes[prev]! || piece.isWordLike
          continue
        }

        if (endsWithMyanmarMedialGlue(prevText)) {
          texts[prev] = prevText + piece.text
          wordLikes[prev] = wordLikes[prev]! || piece.isWordLike
          continue
        }

        // An Arabic word right after Arabic text ending in no-space punctuation.
        if (
          piece.isWordLike &&
          containsArabicScript(piece.text) &&
          endsWithArabicNoSpacePunctuation(prevText)
        ) {
          texts[prev] = prevText + piece.text
          wordLikes[prev] = true
          continue
        }

        if (!piece.isWordLike) {
          // Extends a run of one repeated character (hyphen and em dash excluded).
          const repeatsPrevious =
            piece.text.length === 1 &&
            piece.text !== '-' &&
            piece.text !== '\u2014' &&
            isRepeatedSingleCharRun(prevText, piece.text)
          if (repeatsPrevious) {
            texts[prev] = prevText + piece.text
            continue
          }

          const sticksLeft =
            isLeftStickyPunctuationSegment(piece.text) ||
            (piece.text === '-' && wordLikes[prev]!)
          if (sticksLeft) {
            texts[prev] = prevText + piece.text
            continue
          }
        }
      }

      texts[count] = piece.text
      wordLikes[count] = piece.isWordLike
      kinds[count] = piece.kind
      starts[count] = piece.start
      count++
    }
  }

  // Fold non-word-like escaped-quote clusters into the preceding text slot.
  for (let at = 1; at < count; at++) {
    const foldsBack =
      kinds[at] === 'text' &&
      !wordLikes[at]! &&
      isEscapedQuoteClusterSegment(texts[at]!) &&
      kinds[at - 1] === 'text'
    if (!foldsBack) continue

    texts[at - 1] += texts[at]!
    wordLikes[at - 1] = wordLikes[at - 1]! || wordLikes[at]!
    texts[at] = ''
  }

  // Fold non-word-like forward-sticky clusters into the next surviving text
  // slot, scanning right to left.
  for (let at = count - 2; at >= 0; at--) {
    const sticksForward =
      kinds[at] === 'text' && !wordLikes[at]! && isForwardStickyClusterSegment(texts[at]!)
    if (!sticksForward) continue

    let target = at + 1
    while (target < count && texts[target] === '') target++
    if (target < count && kinds[target] === 'text') {
      texts[target] = texts[at]! + texts[target]!
      starts[target] = starts[at]!
      texts[at] = ''
    }
  }

  // Drop the slots emptied by the two passes above.
  let write = 0
  for (let read = 0; read < count; read++) {
    const text = texts[read]!
    if (text.length === 0) continue
    if (write !== read) {
      texts[write] = text
      wordLikes[write] = wordLikes[read]!
      kinds[write] = kinds[read]!
      starts[write] = starts[read]!
    }
    write++
  }
  texts.length = write
  wordLikes.length = write
  kinds.length = write
  starts.length = write

  // Run the merge passes in their fixed order.
  const glued = mergeGlueConnectedTextRuns({
    len: write,
    texts,
    isWordLike: wordLikes,
    kinds,
    starts,
  })
  const urlsMerged = mergeUrlLikeRuns(glued)
  const queriesMerged = mergeUrlQueryRuns(urlsMerged)
  const numbersMerged = mergeNumericRuns(queriesMerged)
  const hyphensSplit = splitHyphenatedNumericRuns(numbersMerged)
  const chainsMerged = mergeAsciiPunctuationChains(hyphensSplit)
  const result = carryTrailingForwardStickyAcrossCJKBoundary(chainsMerged)

  // Move combining marks that follow a space onto the Arabic text after it.
  for (let at = 0; at < result.len - 1; at++) {
    const split = splitLeadingSpaceAndMarks(result.texts[at]!)
    if (split === null) continue

    const isSpaceKind = result.kinds[at] === 'space' || result.kinds[at] === 'preserved-space'
    const nextIsArabicText =
      result.kinds[at + 1] === 'text' && containsArabicScript(result.texts[at + 1]!)
    if (!isSpaceKind || !nextIsArabicText) continue

    // The kind is already one of the two space kinds here, so it is left as-is.
    result.texts[at] = split.space
    result.isWordLike[at] = false
    result.texts[at + 1] = split.marks + result.texts[at + 1]!
    result.starts[at + 1] = result.starts[at]! + split.space.length
  }

  return result
}
1041
+
1042
/**
 * Splits the segmentation into chunks. Without hard-break preservation the
 * whole segmentation is one chunk. Otherwise each 'hard-break' segment
 * closes a chunk whose `endSegmentIndex` is the break's index and whose
 * `consumedEndSegmentIndex` is one past it; segments after the last break
 * form a final chunk. An empty segmentation yields no chunks.
 */
function compileAnalysisChunks(segmentation: MergedSegmentation, whiteSpaceProfile: WhiteSpaceProfile): AnalysisChunk[] {
  const total = segmentation.len
  if (total === 0) return []

  // Chunk running from `from` to the end of the segmentation.
  const chunkToEnd = (from: number): AnalysisChunk => ({
    startSegmentIndex: from,
    endSegmentIndex: total,
    consumedEndSegmentIndex: total,
  })

  if (!whiteSpaceProfile.preserveHardBreaks) return [chunkToEnd(0)]

  const chunks: AnalysisChunk[] = []
  let chunkStart = 0
  for (let index = 0; index < total; index++) {
    if (segmentation.kinds[index] === 'hard-break') {
      chunks.push({
        startSegmentIndex: chunkStart,
        endSegmentIndex: index,
        consumedEndSegmentIndex: index + 1,
      })
      chunkStart = index + 1
    }
  }

  if (chunkStart < total) chunks.push(chunkToEnd(chunkStart))
  return chunks
}
1076
+
1077
/**
 * Coalesces adjacent `'text'` segments for `word-break: keep-all`.
 *
 * A `'text'` segment is appended to the previously emitted segment when that
 * previous segment is also `'text'`, `canContinueKeepAllTextRun` accepts it,
 * and `containsCJKText` reports CJK content in it. The merged segment keeps
 * the start offset and kind of the first piece and is word-like if any piece
 * was. Segmentations with at most one segment are returned as-is (same
 * object); otherwise a new segmentation with fresh arrays is returned.
 */
function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegmentation {
  const count = segmentation.len
  if (count <= 1) return segmentation

  const outTexts: string[] = []
  const outWordLike: boolean[] = []
  const outKinds: SegmentBreakKind[] = []
  const outStarts: number[] = []

  for (let index = 0; index < count; index++) {
    const currentText = segmentation.texts[index]!
    const currentKind = segmentation.kinds[index]!
    const currentWordLike = segmentation.isWordLike[index]!
    const last = outTexts.length - 1

    // Evaluated left to right so the helper predicates only run when there
    // really is a preceding text segment to extend.
    const extendsPreviousRun =
      currentKind === 'text' &&
      last >= 0 &&
      outKinds[last] === 'text' &&
      canContinueKeepAllTextRun(outTexts[last]!) &&
      containsCJKText(outTexts[last]!)

    if (!extendsPreviousRun) {
      outTexts.push(currentText)
      outWordLike.push(currentWordLike)
      outKinds.push(currentKind)
      outStarts.push(segmentation.starts[index]!)
      continue
    }

    outTexts[last] = outTexts[last]! + currentText
    outWordLike[last] = outWordLike[last]! || currentWordLike
  }

  return {
    len: outTexts.length,
    texts: outTexts,
    isWordLike: outWordLike,
    kinds: outKinds,
    starts: outStarts,
  }
}
1118
+
1119
/**
 * True for the UTF-16 code units in the character class of
 * `collapsibleWhitespaceRunRe`: space, tab, line feed, form feed and
 * carriage return.
 */
function isNormalizableWhitespaceCode(code: number): boolean {
  return code === 0x20 || code === 0x09 || code === 0x0a || code === 0x0c || code === 0x0d
}

/**
 * Analyzes pre-segmented text into merged, break-classified segments and
 * layout chunks.
 *
 * Modified for expo-pretext: accepts pre-segmented data from the native
 * module. The native module performs Intl.Segmenter-equivalent word
 * segmentation and provides the `segments` and `isWordLike` arrays.
 *
 * The raw text is rebuilt by joining `segments` and then whitespace-normalized
 * for the requested mode. If normalization changed the text, the native
 * segments are re-aligned to the normalized text so that their concatenation
 * is the normalized string again; segments that normalize to nothing are
 * dropped.
 *
 * @param segments - Native word segments; their concatenation is the raw text.
 * @param isWordLike - Word-likeness flag per entry of `segments`. A missing
 *   flag is treated as `false` when segments have to be re-derived.
 * @param profile - Analysis profile forwarded to `buildMergedSegmentation`.
 * @param whiteSpace - White-space mode, `'normal'` by default.
 * @param wordBreak - `'keep-all'` additionally applies
 *   `mergeKeepAllTextSegments` to the merged segmentation.
 * @returns The normalized text, the merged segmentation fields and the chunks
 *   computed by `compileAnalysisChunks`. Empty normalized text yields an
 *   all-empty analysis.
 */
export function analyzeText(
  segments: string[],
  isWordLike: boolean[],
  profile: AnalysisProfile,
  whiteSpace: WhiteSpaceMode = 'normal',
  wordBreak: WordBreakMode = 'normal',
): TextAnalysis {
  const whiteSpaceProfile = getWhiteSpaceProfile(whiteSpace)

  // Reconstruct the full text from segments, then normalize.
  const rawText = segments.join('')
  const normalized = whiteSpaceProfile.mode === 'pre-wrap'
    ? normalizeWhitespacePreWrap(rawText)
    : normalizeWhitespaceNormal(rawText)

  if (normalized.length === 0) {
    return {
      normalized,
      chunks: [],
      len: 0,
      texts: [],
      isWordLike: [],
      kinds: [],
      starts: [],
    }
  }

  // When normalization left the text untouched the native segments are used
  // directly; otherwise they are re-derived below.
  let normalizedSegments = segments
  let normalizedIsWordLike = isWordLike

  if (normalized !== rawText) {
    // Walk the raw segments and the normalized text in parallel.
    // NOTE(review): this assumes normalization only rewrites or removes
    // characters of the [ \t\n\r\f] class and keeps every other character in
    // order — confirm against normalizeWhitespaceNormal/PreWrap.
    normalizedSegments = []
    normalizedIsWordLike = []
    let normOffset = 0

    for (let si = 0; si < segments.length && normOffset < normalized.length; si++) {
      const seg = segments[si]!
      let segNormText = ''

      for (let ci = 0; ci < seg.length && normOffset < normalized.length; ci++) {
        const origCode = seg.charCodeAt(ci)
        const normCode = normalized.charCodeAt(normOffset)

        // A raw character survives either verbatim or, for whitespace, as the
        // whitespace character normalization substituted for it (e.g. "\t" or
        // "\n" -> " ", "\r" -> "\n"). Requiring exact equality here would
        // desynchronise the walk on every substitution and silently drop the
        // text that follows it.
        const survives =
          origCode === normCode ||
          (isNormalizableWhitespaceCode(origCode) && isNormalizableWhitespaceCode(normCode))

        if (survives) {
          segNormText += normalized.charAt(normOffset)
          normOffset++
        }
        // Otherwise the raw character was collapsed/removed by normalization.
      }

      if (segNormText.length > 0) {
        normalizedSegments.push(segNormText)
        normalizedIsWordLike.push(isWordLike[si] ?? false)
      }
    }
  }

  const merged = buildMergedSegmentation(normalized, normalizedSegments, normalizedIsWordLike, profile, whiteSpaceProfile)
  const segmentation = wordBreak === 'keep-all' ? mergeKeepAllTextSegments(merged) : merged

  return {
    normalized,
    chunks: compileAnalysisChunks(segmentation, whiteSpaceProfile),
    ...segmentation,
  }
}