expo-pretext 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/LICENSE +21 -0
- package/README.md +250 -0
- package/android/src/main/java/expo/modules/pretext/ExpoPretextModule.kt +354 -0
- package/expo-module.config.json +9 -0
- package/ios/ExpoPretext.podspec +20 -0
- package/ios/ExpoPretext.swift +444 -0
- package/package.json +59 -0
- package/src/ExpoPretext.ts +70 -0
- package/src/__tests__/cache.test.ts +71 -0
- package/src/__tests__/font-utils.test.ts +69 -0
- package/src/__tests__/layout.test.ts +300 -0
- package/src/__tests__/obstacle-layout.test.ts +127 -0
- package/src/__tests__/setup-mocks.ts +14 -0
- package/src/analysis.ts +1208 -0
- package/src/bidi.ts +175 -0
- package/src/build.ts +503 -0
- package/src/cache.ts +59 -0
- package/src/engine-profile.ts +38 -0
- package/src/font-utils.ts +50 -0
- package/src/generated/bidi-data.ts +998 -0
- package/src/hooks/useFlashListHeights.ts +88 -0
- package/src/hooks/usePreparedText.ts +16 -0
- package/src/hooks/useTextHeight.ts +45 -0
- package/src/index.ts +56 -0
- package/src/layout.ts +353 -0
- package/src/line-break.ts +1113 -0
- package/src/obstacle-layout.ts +193 -0
- package/src/prepare.ts +246 -0
- package/src/rich-inline.ts +647 -0
- package/src/streaming.ts +61 -0
- package/src/types.ts +104 -0
package/src/analysis.ts
ADDED
|
@@ -0,0 +1,1208 @@
|
|
|
1
|
+
// Ported from chenglou/pretext src/analysis.ts
|
|
2
|
+
// Modified for expo-pretext: accepts pre-segmented data from native module
|
|
3
|
+
// instead of using Intl.Segmenter internally.
|
|
4
|
+
|
|
5
|
+
// White-space handling mode. 'pre-wrap' makes getWhiteSpaceProfile() preserve
// ordinary spaces and hard breaks; 'normal' preserves neither.
export type WhiteSpaceMode = 'normal' | 'pre-wrap'

// Word-break mode accepted by the public API.
export type WordBreakMode = 'normal' | 'keep-all'

// Break classification attached to every analysed segment
// (assigned per character by classifySegmentBreakChar).
export type SegmentBreakKind =
  // Anything that is not one of the special characters below.
  | 'text'
  // U+0020 when spaces are not preserved / when they are preserved.
  | 'space'
  | 'preserved-space'
  // U+0009 while a preserving white-space profile is active.
  | 'tab'
  // U+00A0, U+202F, U+2060, U+FEFF.
  | 'glue'
  // U+200B.
  | 'zero-width-break'
  // U+00AD.
  | 'soft-hyphen'
  // U+000A while hard breaks are preserved.
  | 'hard-break'
|
|
17
|
+
|
|
18
|
+
// One homogeneous slice of a native segment: every character in `text` shares
// the same break kind. `start` is the UTF-16 offset of the slice.
type SegmentationPiece = {
  text: string
  isWordLike: boolean
  kind: SegmentBreakKind
  start: number
}

// Struct-of-arrays view of a segmentation. The merge passes in this file push
// one entry to each of the four arrays per segment and report the shared
// length in `len`.
export type MergedSegmentation = {
  len: number
  texts: string[]
  isWordLike: boolean[]
  kinds: SegmentBreakKind[]
  starts: number[]
}
|
|
32
|
+
|
|
33
|
+
// Segment-index range describing one chunk of an analysis.
// NOTE(review): how `endSegmentIndex` and `consumedEndSegmentIndex` differ is
// decided by the chunking code, which is not visible here - confirm there.
export type AnalysisChunk = {
  startSegmentIndex: number
  endSegmentIndex: number
  consumedEndSegmentIndex: number
}

// Full analysis result: the merged segmentation plus the normalized text and
// the chunk list.
export type TextAnalysis = MergedSegmentation & {
  normalized: string
  chunks: AnalysisChunk[]
}

// Engine-specific switches for the analysis pass.
export type AnalysisProfile = {
  // Gates the CJK-after-closing-quote handling in buildMergedSegmentation.
  carryCJKAfterClosingQuote: boolean
}
|
|
44
|
+
|
|
45
|
+
// Any run of space / tab / LF / CR / FF; global so that replace() collapses every run.
const collapsibleWhitespaceRunRe = /[ \t\n\r\f]+/g

// Cheap probe: true when the text contains a tab/LF/CR/FF, a run of two or
// more spaces, or a leading/trailing space.
const needsWhitespaceNormalizationRe = /[\t\n\r\f]| {2,}|^ | $/
|
|
47
|
+
|
|
48
|
+
// Resolved white-space behaviour derived from a WhiteSpaceMode.
type WhiteSpaceProfile = {
  mode: WhiteSpaceMode
  preserveOrdinarySpaces: boolean
  preserveHardBreaks: boolean
}

// Expands an optional mode (default 'normal') into its profile. Only
// 'pre-wrap' preserves ordinary spaces and hard breaks.
function getWhiteSpaceProfile(whiteSpace?: WhiteSpaceMode): WhiteSpaceProfile {
  const mode = whiteSpace ?? 'normal'
  const preserving = mode === 'pre-wrap'
  return {
    mode,
    preserveOrdinarySpaces: preserving,
    preserveHardBreaks: preserving,
  }
}
|
|
60
|
+
|
|
61
|
+
// Collapses every whitespace run to a single space and drops one leading and
// one trailing space. Text that needs no work is returned untouched.
export function normalizeWhitespaceNormal(text: string): string {
  if (!needsWhitespaceNormalizationRe.test(text)) return text

  const collapsed = text.replace(collapsibleWhitespaceRunRe, ' ')
  // After collapsing, at most one space can sit at either end.
  const withoutLead = collapsed.startsWith(' ') ? collapsed.slice(1) : collapsed
  return withoutLead.endsWith(' ') ? withoutLead.slice(0, -1) : withoutLead
}
|
|
73
|
+
|
|
74
|
+
// Normalizes line terminators for 'pre-wrap' text: CRLF pairs, lone CR and
// form feed each become a single '\n'. Everything else is left untouched.
//
// A single alternation handles all three cases in one pass: `\r\n` is tried
// first at every CR, so a CRLF pair never produces two newlines. Text without
// CR/FF is returned unchanged by replace().
function normalizeWhitespacePreWrap(text: string): string {
  return text.replace(/\r\n|[\r\f]/g, '\n')
}
|
|
80
|
+
|
|
81
|
+
// In expo-pretext, segmentation happens on the native side.
|
|
82
|
+
// These functions are kept as no-ops for API compatibility.
|
|
83
|
+
|
|
84
|
+
// Kept for API compatibility; does nothing.
export function clearAnalysisCaches(): void {
  // No-op: native module handles segmenter lifecycle
  return
}
|
|
87
|
+
|
|
88
|
+
// Kept for API compatibility; the argument is ignored.
export function setAnalysisLocale(_locale?: string): void {
  // No-op: locale is set on the native side when calling segment
  return
}
|
|
91
|
+
|
|
92
|
+
// Unicode property probes. None is global, so test() carries no state.
const arabicScriptRe = /\p{Script=Arabic}/u // any Arabic-script code point
const combiningMarkRe = /\p{M}/u // any combining mark
const decimalDigitRe = /\p{Nd}/u // any decimal digit, ASCII or not

// True when at least one code point of `text` belongs to the Arabic script.
function containsArabicScript(text: string): boolean {
  return text.search(arabicScriptRe) !== -1
}
|
|
99
|
+
|
|
100
|
+
// True when `codePoint` lies in one of the ranges this module treats as CJK.
// The ranges are split by plane so BMP input never evaluates the
// supplementary checks.
function isCJKCodePoint(codePoint: number): boolean {
  const within = (low: number, high: number): boolean => codePoint >= low && codePoint <= high

  if (codePoint <= 0xFFFF) {
    return (
      within(0x3000, 0x303F) || // CJK Symbols and Punctuation
      within(0x3040, 0x309F) || // Hiragana
      within(0x30A0, 0x30FF) || // Katakana
      within(0x3400, 0x4DBF) || // CJK Unified Ideographs Extension A
      within(0x4E00, 0x9FFF) || // CJK Unified Ideographs
      within(0xAC00, 0xD7AF) || // Hangul Syllables
      within(0xF900, 0xFAFF) || // CJK Compatibility Ideographs
      within(0xFF00, 0xFFEF) // Halfwidth and Fullwidth Forms
    )
  }

  // Supplementary-plane ideograph ranges.
  return (
    within(0x20000, 0x2A6DF) ||
    within(0x2A700, 0x2B73F) ||
    within(0x2B740, 0x2B81F) ||
    within(0x2B820, 0x2CEAF) ||
    within(0x2CEB0, 0x2EBEF) ||
    within(0x2EBF0, 0x2EE5D) ||
    within(0x2F800, 0x2FA1F) ||
    within(0x30000, 0x3134F) ||
    within(0x31350, 0x323AF) ||
    within(0x323B0, 0x33479)
  )
}
|
|
122
|
+
|
|
123
|
+
// True when any code point of `s` is CJK according to isCJKCodePoint.
// codePointAt() decodes valid surrogate pairs and hands back lone surrogates
// as-is, which isCJKCodePoint rejects.
export function isCJK(s: string): boolean {
  let index = 0
  while (index < s.length) {
    const codePoint = s.codePointAt(index)!
    // Supplementary code points occupy two UTF-16 units.
    index += codePoint > 0xFFFF ? 2 : 1
    // Nothing below U+3000 is in any CJK range; skip the range checks.
    if (codePoint >= 0x3000 && isCJKCodePoint(codePoint)) return true
  }
  return false
}
|
|
142
|
+
|
|
143
|
+
// True when the last code point of `text` may not start a line, i.e. it is in
// kinsokuStart or leftStickyPunctuation. Empty text yields false.
//
// The last code point is read directly with getLastCodePoint() rather than by
// iterating the whole string, matching the other "endsWith..." predicates in
// this file.
function endsWithLineStartProhibitedText(text: string): boolean {
  const last = getLastCodePoint(text)
  return last !== null && (kinsokuStart.has(last) || leftStickyPunctuation.has(last))
}
|
|
148
|
+
|
|
149
|
+
// Characters checked by endsWithKeepAllGlueText().
const keepAllGlueChars = new Set([
  '\u00A0', // no-break space
  '\u202F', // narrow no-break space
  '\u2060', // word joiner
  '\uFEFF', // zero width no-break space
])
|
|
155
|
+
|
|
156
|
+
// Alias of isCJK(): true when `text` contains at least one CJK code point.
function containsCJKText(text: string): boolean {
  const hasCJK = isCJK(text)
  return hasCJK
}
|
|
159
|
+
|
|
160
|
+
// True when the last code point of `text` is one of keepAllGlueChars.
// Empty text yields false.
//
// Uses getLastCodePoint() so the check does not iterate the whole string.
function endsWithKeepAllGlueText(text: string): boolean {
  const last = getLastCodePoint(text)
  return last !== null && keepAllGlueChars.has(last)
}
|
|
165
|
+
|
|
166
|
+
// A keep-all text run may continue after `previousText` unless that text ends
// with a line-start-prohibited character or with a keep-all glue character.
export function canContinueKeepAllTextRun(previousText: string): boolean {
  if (endsWithLineStartProhibitedText(previousText)) return false
  return !endsWithKeepAllGlueText(previousText)
}
|
|
172
|
+
|
|
173
|
+
// Characters that must not begin a line (line-start kinsoku set).
export const kinsokuStart = new Set([
  // Fullwidth , . ! : ; ?
  '\uFF0C', '\uFF0E', '\uFF01', '\uFF1A', '\uFF1B', '\uFF1F',
  // Ideographic comma, ideographic full stop, katakana middle dot
  '\u3001', '\u3002', '\u30FB',
  // Closing brackets
  '\uFF09', '\u3015', '\u3009', '\u300B', '\u300D',
  '\u300F', '\u3011', '\u3017', '\u3019', '\u301B',
  // Prolonged sound mark and iteration marks
  '\u30FC', '\u3005', '\u303B', '\u309D', '\u309E', '\u30FD', '\u30FE',
])
|
|
201
|
+
|
|
202
|
+
// Opening quotes and brackets (line-end kinsoku set); the forward-sticky
// helpers in this file treat them as attaching to what follows.
export const kinsokuEnd = new Set([
  // ASCII
  '"', '(', '[', '{',
  // Opening typographic quotes
  '\u201C', '\u2018', '\u00AB', '\u2039',
  // Fullwidth and CJK opening brackets
  '\uFF08', '\u3014', '\u3008', '\u300A', '\u300C',
  '\u300E', '\u3010', '\u3016', '\u3018', '\u301A',
])
|
|
217
|
+
|
|
218
|
+
// Apostrophe-like characters accepted by the forward-sticky cluster checks.
const forwardStickyGlue = new Set([
  "'", // ASCII apostrophe
  '\u2019', // right single quotation mark
])
|
|
221
|
+
|
|
222
|
+
// Punctuation that sticks to the text on its left.
export const leftStickyPunctuation = new Set([
  // ASCII sentence punctuation
  '.', ',', '!', '?', ':', ';',
  // Arabic comma, semicolon, question mark
  '\u060C', '\u061B', '\u061F',
  // Devanagari danda and double danda
  '\u0964', '\u0965',
  // Myanmar signs
  '\u104A', '\u104B', '\u104C', '\u104D', '\u104F',
  // ASCII closers, percent and straight double quote
  ')', ']', '}', '%', '"',
  // Closing typographic quotes
  '\u201D', '\u2019', '\u00BB', '\u203A',
  // Horizontal ellipsis
  '\u2026',
])
|
|
240
|
+
|
|
241
|
+
// Trailing punctuation checked by endsWithArabicNoSpacePunctuation().
const arabicNoSpaceTrailingPunctuation = new Set([
  ':', '.',
  '\u060C', // Arabic comma
  '\u061B', // Arabic semicolon
])

// Character checked by endsWithMyanmarMedialGlue().
const myanmarMedialGlue = new Set([
  '\u104F', // Myanmar symbol genitive
])
|
|
251
|
+
|
|
252
|
+
// Closing quotes and brackets recognised by endsWithClosingQuote().
const closingQuoteChars = new Set([
  // Closing typographic quotes
  '\u201D', '\u2019', '\u00BB', '\u203A',
  // CJK closing brackets
  '\u300D', '\u300F', '\u3011', '\u300B', '\u3009', '\u3015',
  // Fullwidth right parenthesis
  '\uFF09',
])
|
|
262
|
+
|
|
263
|
+
// True when the segment is an escaped-quote cluster, or consists only of
// left-sticky punctuation where combining marks are tolerated once at least
// one punctuation character has been seen.
function isLeftStickyPunctuationSegment(segment: string): boolean {
  if (isEscapedQuoteClusterSegment(segment)) return true

  let punctuationSeen = false
  for (const char of segment) {
    if (leftStickyPunctuation.has(char)) {
      punctuationSeen = true
    } else if (!(punctuationSeen && combiningMarkRe.test(char))) {
      // Neither punctuation nor a mark decorating earlier punctuation.
      return false
    }
  }
  return punctuationSeen
}
|
|
276
|
+
|
|
277
|
+
// True for a non-empty segment whose every code point is in kinsokuStart or
// leftStickyPunctuation.
function isCJKLineStartProhibitedSegment(segment: string): boolean {
  if (segment.length === 0) return false
  for (const char of segment) {
    const prohibited = kinsokuStart.has(char) || leftStickyPunctuation.has(char)
    if (!prohibited) return false
  }
  return true
}
|
|
283
|
+
|
|
284
|
+
// True when the segment is an escaped-quote cluster, or is non-empty and made
// only of kinsokuEnd characters, forward-sticky glue and combining marks.
function isForwardStickyClusterSegment(segment: string): boolean {
  if (isEscapedQuoteClusterSegment(segment)) return true
  if (segment.length === 0) return false

  for (const char of segment) {
    const sticky = kinsokuEnd.has(char) || forwardStickyGlue.has(char) || combiningMarkRe.test(char)
    if (!sticky) return false
  }
  return true
}
|
|
291
|
+
|
|
292
|
+
// True when the segment holds at least one character from kinsokuEnd,
// leftStickyPunctuation or forwardStickyGlue and nothing else apart from
// backslashes and combining marks.
function isEscapedQuoteClusterSegment(segment: string): boolean {
  let quoteSeen = false
  for (const char of segment) {
    // Backslashes and combining marks are transparent.
    if (char === '\\' || combiningMarkRe.test(char)) continue

    const quoteLike =
      kinsokuEnd.has(char) || leftStickyPunctuation.has(char) || forwardStickyGlue.has(char)
    if (!quoteLike) return false
    quoteSeen = true
  }
  return quoteSeen
}
|
|
304
|
+
|
|
305
|
+
// Returns the UTF-16 index at which the code point ending just before `end`
// starts. A low surrogate preceded by a high surrogate is treated as one code
// point; anything else is a single code unit. The result is clamped to 0 for
// `end <= 1`.
function previousCodePointStart(text: string, end: number): number {
  const last = end - 1
  if (last <= 0) return Math.max(last, 0)

  const lastCodeUnit = text.charCodeAt(last)
  // Not a low surrogate: the code point is this single unit.
  if (lastCodeUnit < 0xDC00 || lastCodeUnit > 0xDFFF) return last

  // `last >= 1` here, so `last - 1` is always a valid index.
  const highCodeUnit = text.charCodeAt(last - 1)
  return highCodeUnit >= 0xD800 && highCodeUnit <= 0xDBFF ? last - 1 : last
}
|
|
318
|
+
|
|
319
|
+
// Returns the final code point of `text` as a string, or null for empty text.
function getLastCodePoint(text: string): string | null {
  return text.length === 0 ? null : text.slice(previousCodePointStart(text, text.length))
}
|
|
324
|
+
|
|
325
|
+
// Splits off the trailing run of forward-sticky characters (kinsokuEnd,
// forwardStickyGlue and combining marks). Returns null when there is no such
// run or when the run covers the whole text.
function splitTrailingForwardStickyCluster(text: string): { head: string, tail: string } | null {
  const codePoints = Array.from(text)
  let boundary = codePoints.length

  // Walk backwards while the preceding code point still belongs to the cluster.
  while (boundary > 0) {
    const char = codePoints[boundary - 1]!
    const inCluster =
      combiningMarkRe.test(char) || kinsokuEnd.has(char) || forwardStickyGlue.has(char)
    if (!inCluster) break
    boundary--
  }

  if (boundary <= 0 || boundary === codePoints.length) return null
  const head = codePoints.slice(0, boundary).join('')
  const tail = codePoints.slice(boundary).join('')
  return { head, tail }
}
|
|
348
|
+
|
|
349
|
+
// True for a non-empty segment in which every code point equals `ch`.
function isRepeatedSingleCharRun(segment: string, ch: string): boolean {
  if (segment.length === 0) return false
  return Array.from(segment).every(part => part === ch)
}
|
|
356
|
+
|
|
357
|
+
// True when the segment contains Arabic script and its last code point is in
// arabicNoSpaceTrailingPunctuation.
function endsWithArabicNoSpacePunctuation(segment: string): boolean {
  if (segment.length === 0 || !containsArabicScript(segment)) return false
  const tail = getLastCodePoint(segment)
  return tail !== null && arabicNoSpaceTrailingPunctuation.has(tail)
}
|
|
362
|
+
|
|
363
|
+
// True when the last code point of the segment is in myanmarMedialGlue.
function endsWithMyanmarMedialGlue(segment: string): boolean {
  const tail = getLastCodePoint(segment)
  if (tail === null) return false
  return myanmarMedialGlue.has(tail)
}
|
|
367
|
+
|
|
368
|
+
// Recognises a segment made of one ordinary space followed only by combining
// marks and returns the two parts; any other segment yields null.
function splitLeadingSpaceAndMarks(segment: string): { space: string, marks: string } | null {
  if (segment.length < 2 || !segment.startsWith(' ')) return null
  const marks = segment.slice(1)
  return /^\p{M}+$/u.test(marks) ? { space: ' ', marks } : null
}
|
|
376
|
+
|
|
377
|
+
// Scans backwards over trailing left-sticky punctuation and reports whether a
// closing quote/bracket is reached before any other character.
export function endsWithClosingQuote(text: string): boolean {
  for (let end = text.length; end > 0;) {
    const start = previousCodePointStart(text, end)
    const char = text.slice(start, end)
    if (closingQuoteChars.has(char)) return true
    if (!leftStickyPunctuation.has(char)) return false
    end = start
  }
  return false
}
|
|
388
|
+
|
|
389
|
+
// Maps a single character to its break kind under the given white-space
// profile. Space and tab are affected by either "preserve" flag; newline only
// becomes a hard break when hard breaks are preserved.
function classifySegmentBreakChar(ch: string, whiteSpaceProfile: WhiteSpaceProfile): SegmentBreakKind {
  const { preserveOrdinarySpaces, preserveHardBreaks } = whiteSpaceProfile
  const preserving = preserveOrdinarySpaces || preserveHardBreaks

  switch (ch) {
    case ' ':
      return preserving ? 'preserved-space' : 'space'
    case '\t':
      return preserving ? 'tab' : 'text'
    case '\n':
      return preserveHardBreaks ? 'hard-break' : 'text'
    case '\u00A0':
    case '\u202F':
    case '\u2060':
    case '\uFEFF':
      return 'glue'
    case '\u200B':
      return 'zero-width-break'
    case '\u00AD':
      return 'soft-hyphen'
    default:
      return 'text'
  }
}

// Every character classifySegmentBreakChar can map to a non-'text' kind;
// segments without any of them skip the per-character split.
const breakCharRe = /[\x20\t\n\xA0\xAD\u200B\u202F\u2060\uFEFF]/
|
|
406
|
+
|
|
407
|
+
// Concatenates the parts; a single part is returned as-is without a join.
function joinTextParts(parts: string[]): string {
  if (parts.length !== 1) return parts.join('')
  return parts[0]!
}
|
|
410
|
+
|
|
411
|
+
// Splits one native segment into pieces so that each piece has a single break
// kind. Consecutive characters with the same kind (and the same word-like
// flag) share a piece. `start` is the UTF-16 offset of the segment; every
// piece records its own offset. Only 'text' pieces inherit `isWordLike`.
function splitSegmentByBreakKind(
  segment: string,
  isWordLike: boolean,
  start: number,
  whiteSpaceProfile: WhiteSpaceProfile,
): SegmentationPiece[] {
  // Fast path: no special character, so the segment is a single text piece.
  if (!breakCharRe.test(segment)) {
    return [{ text: segment, isWordLike, kind: 'text', start }]
  }

  const pieces: SegmentationPiece[] = []
  let runKind: SegmentBreakKind | null = null
  let runParts: string[] = []
  let runStart = start
  let runWordLike = false
  let cursor = start

  // Emits the run collected so far, if any.
  const flushRun = (): void => {
    if (runKind === null) return
    pieces.push({
      text: joinTextParts(runParts),
      isWordLike: runWordLike,
      kind: runKind,
      start: runStart,
    })
  }

  for (const ch of segment) {
    const kind = classifySegmentBreakChar(ch, whiteSpaceProfile)
    const wordLike = kind === 'text' && isWordLike
    const continuesRun = runKind !== null && kind === runKind && wordLike === runWordLike

    if (continuesRun) {
      runParts.push(ch)
    } else {
      flushRun()
      runKind = kind
      runParts = [ch]
      runStart = cursor
      runWordLike = wordLike
    }
    // `ch` is a whole code point, so it may span two UTF-16 units.
    cursor += ch.length
  }

  flushRun()
  return pieces
}
|
|
465
|
+
|
|
466
|
+
// Kinds that terminate a run of mergeable text: spaces (collapsed or
// preserved), zero-width breaks and hard breaks.
function isTextRunBoundary(kind: SegmentBreakKind): boolean {
  switch (kind) {
    case 'space':
    case 'preserved-space':
    case 'zero-width-break':
    case 'hard-break':
      return true
    default:
      return false
  }
}
|
|
474
|
+
|
|
475
|
+
// A whole segment shaped like a URL scheme followed by a colon, e.g. "https:".
const urlSchemeSegmentRe = /^[A-Za-z][A-Za-z0-9+.-]*:$/

// True when the segment at `index` starts a URL-like run: either it begins
// with "www.", or it is a scheme segment immediately followed by a '//' text
// segment.
function isUrlLikeRunStart(segmentation: MergedSegmentation, index: number): boolean {
  const text = segmentation.texts[index]!
  if (text.startsWith('www.')) return true
  if (!urlSchemeSegmentRe.test(text)) return false

  const next = index + 1
  if (next >= segmentation.len) return false
  return segmentation.kinds[next] === 'text' && segmentation.texts[next] === '//'
}
|
|
487
|
+
|
|
488
|
+
// True when the text contains a '?' and also looks like a URL
// (contains "://" or starts with "www.").
function isUrlQueryBoundarySegment(text: string): boolean {
  if (!text.includes('?')) return false
  return text.startsWith('www.') || text.includes('://')
}
|
|
491
|
+
|
|
492
|
+
// Merges each URL-like run into a single word-like text segment. A run starts
// where isUrlLikeRunStart() says so and extends until the next text-run
// boundary, or until (and including) the first segment that contains '?'.
// Absorbed slots are blanked and removed by the compaction pass at the end.
function mergeUrlLikeRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts = segmentation.texts.slice()
  const isWordLike = segmentation.isWordLike.slice()
  const kinds = segmentation.kinds.slice()
  const starts = segmentation.starts.slice()

  for (let i = 0; i < segmentation.len; i++) {
    if (kinds[i] !== 'text' || !isUrlLikeRunStart(segmentation, i)) continue

    const mergedParts = [texts[i]!]
    // isUrlLikeRunStart() inspects the ORIGINAL texts, so slot `i` may already
    // have been absorbed (blanked) by an earlier run, e.g. the "www." segment
    // inside "https://www.site/a?b". In that case the text collected below
    // really begins at the first non-empty absorbed slot, and `starts[i]` has
    // to follow it instead of keeping the stale offset of the blanked slot.
    let awaitingRealStart = texts[i]!.length === 0
    let j = i + 1
    while (j < segmentation.len && !isTextRunBoundary(kinds[j]!)) {
      const part = texts[j]!
      if (awaitingRealStart && part.length > 0) {
        starts[i] = starts[j]!
        awaitingRealStart = false
      }
      mergedParts.push(part)
      isWordLike[i] = true
      const endsQueryPrefix = part.includes('?')
      kinds[j] = 'text'
      texts[j] = ''
      j++
      // The query string after '?' is left for a later pass.
      if (endsQueryPrefix) break
    }
    texts[i] = joinTextParts(mergedParts)
  }

  // Compact in place: drop every slot whose text was blanked above.
  let compactLen = 0
  for (let read = 0; read < texts.length; read++) {
    const text = texts[read]!
    if (text.length === 0) continue
    if (compactLen !== read) {
      texts[compactLen] = text
      isWordLike[compactLen] = isWordLike[read]!
      kinds[compactLen] = kinds[read]!
      starts[compactLen] = starts[read]!
    }
    compactLen++
  }

  texts.length = compactLen
  isWordLike.length = compactLen
  kinds.length = compactLen
  starts.length = compactLen

  return {
    len: compactLen,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
|
|
541
|
+
|
|
542
|
+
// After a URL segment that ends its path with '?' (see
// isUrlQueryBoundarySegment), fuses everything up to the next text-run
// boundary into one word-like text segment holding the query string.
function mergeUrlQueryRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  for (let i = 0; i < segmentation.len; i++) {
    const text = segmentation.texts[i]!
    texts.push(text)
    isWordLike.push(segmentation.isWordLike[i]!)
    kinds.push(segmentation.kinds[i]!)
    starts.push(segmentation.starts[i]!)

    if (!isUrlQueryBoundarySegment(text)) continue

    // Find the end (exclusive) of the run that follows the boundary segment.
    const queryBegin = i + 1
    let queryEnd = queryBegin
    while (queryEnd < segmentation.len && !isTextRunBoundary(segmentation.kinds[queryEnd]!)) {
      queryEnd++
    }
    if (queryEnd === queryBegin) continue

    texts.push(joinTextParts(segmentation.texts.slice(queryBegin, queryEnd)))
    isWordLike.push(true)
    kinds.push('text')
    starts.push(segmentation.starts[queryBegin]!)
    // Resume after the absorbed run (the loop increment adds one).
    i = queryEnd - 1
  }

  return { len: texts.length, texts, isWordLike, kinds, starts }
}
|
|
590
|
+
|
|
591
|
+
// Non-digit characters allowed inside a numeric run.
const numericJoinerChars = new Set([
  ':', '-', '/',
  '\u00D7', // multiplication sign
  ',', '.', '+',
  '\u2013', // en dash
  '\u2014', // em dash
])

// ASCII word characters optionally followed by ',', ':' or ';'.
const asciiPunctuationChainSegmentRe = /^[A-Za-z0-9_]+[,:;]*$/
// One or more of ',', ':' or ';' at the end of the segment.
const asciiPunctuationChainTrailingJoinersRe = /[,:;]+$/
|
|
599
|
+
|
|
600
|
+
// True when the text contains at least one Unicode decimal digit.
function segmentContainsDecimalDigit(text: string): boolean {
  // decimalDigitRe is not global, so test() is stateless and scans the whole text.
  return decimalDigitRe.test(text)
}
|
|
606
|
+
|
|
607
|
+
// True for non-empty text made only of decimal digits and numeric joiner
// characters. Text made of joiners alone also qualifies.
export function isNumericRunSegment(text: string): boolean {
  if (text.length === 0) return false
  for (const ch of text) {
    const allowed = decimalDigitRe.test(ch) || numericJoinerChars.has(ch)
    if (!allowed) return false
  }
  return true
}
|
|
615
|
+
|
|
616
|
+
// Fuses consecutive numeric text segments into one word-like segment. A run
// must begin with a segment that contains a digit; the segments that follow
// only need to be numeric-run segments (digits and/or joiners).
function mergeNumericRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  let index = 0
  while (index < segmentation.len) {
    const text = segmentation.texts[index]!
    const kind = segmentation.kinds[index]!
    const startsRun =
      kind === 'text' && isNumericRunSegment(text) && segmentContainsDecimalDigit(text)

    if (!startsRun) {
      texts.push(text)
      isWordLike.push(segmentation.isWordLike[index]!)
      kinds.push(kind)
      starts.push(segmentation.starts[index]!)
      index++
      continue
    }

    // Extend the run over every directly following numeric text segment.
    let runEnd = index + 1
    while (
      runEnd < segmentation.len &&
      segmentation.kinds[runEnd] === 'text' &&
      isNumericRunSegment(segmentation.texts[runEnd]!)
    ) {
      runEnd++
    }

    texts.push(joinTextParts(segmentation.texts.slice(index, runEnd)))
    isWordLike.push(true)
    kinds.push('text')
    starts.push(segmentation.starts[index]!)
    index = runEnd
  }

  return { len: texts.length, texts, isWordLike, kinds, starts }
}
|
|
660
|
+
|
|
661
|
+
// Fuses chains of word-like ASCII segments that are linked by trailing ',',
// ':' or ';' (for example "a:" + "b," + "c") into a single word-like segment.
// The chain continues only while the last absorbed segment ends with joiners.
function mergeAsciiPunctuationChains(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  let index = 0
  while (index < segmentation.len) {
    const text = segmentation.texts[index]!
    const kind = segmentation.kinds[index]!
    const wordLike = segmentation.isWordLike[index]!
    const startsChain = kind === 'text' && wordLike && asciiPunctuationChainSegmentRe.test(text)

    if (!startsChain) {
      texts.push(text)
      isWordLike.push(wordLike)
      kinds.push(kind)
      starts.push(segmentation.starts[index]!)
      index++
      continue
    }

    let chainEnd = index + 1
    let lastLink = text
    while (
      asciiPunctuationChainTrailingJoinersRe.test(lastLink) &&
      chainEnd < segmentation.len &&
      segmentation.kinds[chainEnd] === 'text' &&
      segmentation.isWordLike[chainEnd] &&
      asciiPunctuationChainSegmentRe.test(segmentation.texts[chainEnd]!)
    ) {
      lastLink = segmentation.texts[chainEnd]!
      chainEnd++
    }

    texts.push(joinTextParts(segmentation.texts.slice(index, chainEnd)))
    isWordLike.push(true)
    kinds.push('text')
    starts.push(segmentation.starts[index]!)
    index = chainEnd
  }

  return { len: texts.length, texts, isWordLike, kinds, starts }
}
|
|
712
|
+
|
|
713
|
+
// Splits a text segment at every '-' when each hyphen-separated part is a
// non-empty numeric run containing a digit (e.g. "2020-01-15"). Each part
// except the last keeps its trailing hyphen, and every part becomes a
// word-like text segment with its own start offset.
function splitHyphenatedNumericRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  const isSplittablePart = (part: string): boolean =>
    part.length > 0 && segmentContainsDecimalDigit(part) && isNumericRunSegment(part)

  for (let i = 0; i < segmentation.len; i++) {
    const text = segmentation.texts[i]!
    const candidateParts =
      segmentation.kinds[i] === 'text' && text.includes('-') ? text.split('-') : null

    if (candidateParts === null || !candidateParts.every(isSplittablePart)) {
      texts.push(text)
      isWordLike.push(segmentation.isWordLike[i]!)
      kinds.push(segmentation.kinds[i]!)
      starts.push(segmentation.starts[i]!)
      continue
    }

    let offset = 0
    const lastPartIndex = candidateParts.length - 1
    candidateParts.forEach((part, partIndex) => {
      const splitText = partIndex < lastPartIndex ? `${part}-` : part
      texts.push(splitText)
      isWordLike.push(true)
      kinds.push('text')
      starts.push(segmentation.starts[i]! + offset)
      offset += splitText.length
    })
  }

  return { len: texts.length, texts, isWordLike, kinds, starts }
}
|
|
765
|
+
|
|
766
|
+
// Folds 'glue' segments into the neighbouring text so that glue never
// separates two segments:
//   - a leading glue run directly followed by text is prepended to that text
//     (the merged segment starts at the glue and takes the text's word-like flag);
//   - glue runs after a text segment are appended to it, together with any
//     text segment that directly follows each run;
//   - a glue run with no text after it and no text before it stays a
//     non-word-like 'glue' segment.
function mergeGlueConnectedTextRuns(segmentation: MergedSegmentation): MergedSegmentation {
  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  const total = segmentation.len
  let cursor = 0

  const isKindAt = (index: number, kind: SegmentBreakKind): boolean =>
    index < total && segmentation.kinds[index] === kind

  // Consumes the glue run beginning at `cursor` and returns its joined text.
  const consumeGlueRun = (): string => {
    const glueParts: string[] = []
    while (isKindAt(cursor, 'glue')) {
      glueParts.push(segmentation.texts[cursor]!)
      cursor++
    }
    return joinTextParts(glueParts)
  }

  const emit = (text: string, wordLike: boolean, kind: SegmentBreakKind, start: number): void => {
    texts.push(text)
    isWordLike.push(wordLike)
    kinds.push(kind)
    starts.push(start)
  }

  while (cursor < total) {
    const start = segmentation.starts[cursor]!
    let kind = segmentation.kinds[cursor]!
    let wordLike = segmentation.isWordLike[cursor]!
    const textParts: string[] = []

    if (kind === 'glue') {
      const leadingGlue = consumeGlueRun()
      if (!isKindAt(cursor, 'text')) {
        // Glue with nothing to attach to stays a segment of its own.
        emit(leadingGlue, false, 'glue', start)
        continue
      }
      textParts.push(leadingGlue, segmentation.texts[cursor]!)
      wordLike = segmentation.isWordLike[cursor]!
      kind = 'text'
      cursor++
    } else {
      textParts.push(segmentation.texts[cursor]!)
      cursor++
    }

    if (kind === 'text') {
      while (isKindAt(cursor, 'glue')) {
        textParts.push(consumeGlueRun())
        if (isKindAt(cursor, 'text')) {
          textParts.push(segmentation.texts[cursor]!)
          wordLike = wordLike || segmentation.isWordLike[cursor]!
          cursor++
        }
      }
    }

    emit(joinTextParts(textParts), wordLike, kind, start)
  }

  return { len: texts.length, texts, isWordLike, kinds, starts }
}
|
|
841
|
+
|
|
842
|
+
// For each pair of adjacent CJK text segments, moves a trailing forward-sticky
// cluster (opening quotes/brackets, apostrophes and their combining marks)
// from the end of the left segment to the front of the right one and shifts
// the right segment's start offset accordingly.
function carryTrailingForwardStickyAcrossCJKBoundary(segmentation: MergedSegmentation): MergedSegmentation {
  const texts = segmentation.texts.slice()
  const isWordLike = segmentation.isWordLike.slice()
  const kinds = segmentation.kinds.slice()
  const starts = segmentation.starts.slice()

  for (let left = 0, right = 1; right < texts.length; left++, right++) {
    if (kinds[left] !== 'text' || kinds[right] !== 'text') continue

    const leftText = texts[left]!
    const rightText = texts[right]!
    if (!isCJK(leftText) || !isCJK(rightText)) continue

    const split = splitTrailingForwardStickyCluster(leftText)
    if (split === null) continue

    const { head, tail } = split
    texts[left] = head
    texts[right] = tail + rightText
    // The right segment now begins where the carried cluster began.
    starts[right] = starts[left]! + head.length
  }

  return { len: texts.length, texts, isWordLike, kinds, starts }
}
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
// Modified for expo-pretext: consumes pre-segmented data supplied by the
// native module rather than running Intl.Segmenter here.
/**
 * Turns pre-segmented text into the merged segmentation used for line breaking.
 *
 * Steps, in order:
 *  1. Split every incoming segment by break kind and either attach each piece
 *     to the previous merged `text` entry (CJK / Myanmar / Arabic / punctuation
 *     rules below) or append it as a new entry.
 *  2. Fold escaped-quote clusters into the preceding `text` entry.
 *  3. Fold forward-sticky clusters into the following `text` entry.
 *  4. Drop the entries emptied by steps 2-3.
 *  5. Run the glue / URL / numeric / ASCII-punctuation / CJK carry passes.
 *  6. Move leading marks from a space entry onto a following Arabic `text` entry.
 *
 * @param normalized Normalized full text. Not read by this function; kept for
 *   signature compatibility.
 * @param segments Segment strings; start offsets are derived by accumulating
 *   their lengths.
 * @param segmentIsWordLike Word-like flag per segment; a missing entry is
 *   treated as `false`.
 * @param profile Analysis profile; only `carryCJKAfterClosingQuote` is read here.
 * @param whiteSpaceProfile Forwarded to `splitSegmentByBreakKind`.
 * @returns The merged segmentation.
 */
function buildMergedSegmentation(
  normalized: string,
  segments: string[],
  segmentIsWordLike: boolean[],
  profile: AnalysisProfile,
  whiteSpaceProfile: WhiteSpaceProfile,
): MergedSegmentation {
  const mergedTexts: string[] = []
  const mergedWordLike: boolean[] = []
  const mergedKinds: SegmentBreakKind[] = []
  const mergedStarts: number[] = []
  let mergedLen = 0

  // The native module hands over segment strings without offsets, so the
  // start of each segment is the running total of the preceding lengths.
  let runningOffset = 0
  for (let si = 0; si < segments.length; si++) {
    const seg = segments[si]!
    const segWordLike = segmentIsWordLike[si] ?? false
    const segStart = runningOffset
    runningOffset = segStart + seg.length

    for (const piece of splitSegmentByBreakKind(seg, segWordLike, segStart, whiteSpaceProfile)) {
      const last = mergedLen - 1
      const canAttach = piece.kind === 'text' && mergedLen > 0 && mergedKinds[last] === 'text'

      if (canAttach) {
        const previousText = mergedTexts[last]!

        // Rules that glue the piece on and propagate its word-like flag:
        // CJK after a closing quote (profile-gated), a CJK line-start-prohibited
        // piece after CJK, or a previous entry ending in Myanmar medial glue.
        if (
          (
            profile.carryCJKAfterClosingQuote &&
            isCJK(piece.text) &&
            isCJK(previousText) &&
            endsWithClosingQuote(previousText)
          ) ||
          (isCJKLineStartProhibitedSegment(piece.text) && isCJK(previousText)) ||
          endsWithMyanmarMedialGlue(previousText)
        ) {
          mergedTexts[last] = previousText + piece.text
          mergedWordLike[last] = mergedWordLike[last]! || piece.isWordLike
          continue
        }

        // Arabic word directly after Arabic no-space punctuation: the merged
        // entry is always word-like.
        if (
          piece.isWordLike &&
          containsArabicScript(piece.text) &&
          endsWithArabicNoSpacePunctuation(previousText)
        ) {
          mergedTexts[last] = previousText + piece.text
          mergedWordLike[last] = true
          continue
        }

        // Non-word pieces that stick to the left. The word-like flag of the
        // previous entry is left untouched.
        if (!piece.isWordLike) {
          const extendsRepeatedRun =
            piece.text.length === 1 &&
            piece.text !== '-' &&
            piece.text !== '\u2014' &&
            isRepeatedSingleCharRun(previousText, piece.text)

          if (
            extendsRepeatedRun ||
            isLeftStickyPunctuationSegment(piece.text) ||
            (piece.text === '-' && mergedWordLike[last]!)
          ) {
            mergedTexts[last] = previousText + piece.text
            continue
          }
        }
      }

      // No attach rule applied: the piece becomes its own entry.
      mergedTexts[mergedLen] = piece.text
      mergedWordLike[mergedLen] = piece.isWordLike
      mergedKinds[mergedLen] = piece.kind
      mergedStarts[mergedLen] = piece.start
      mergedLen++
    }
  }

  // Escaped-quote clusters join the text entry before them. The emptied slot
  // is marked with '' and removed by the compaction pass below.
  for (let i = 1; i < mergedLen; i++) {
    const isEscapedQuoteCluster =
      mergedKinds[i] === 'text' &&
      !mergedWordLike[i]! &&
      isEscapedQuoteClusterSegment(mergedTexts[i]!)
    if (!isEscapedQuoteCluster || mergedKinds[i - 1] !== 'text') continue

    mergedTexts[i - 1] += mergedTexts[i]!
    mergedWordLike[i - 1] = mergedWordLike[i - 1]! || mergedWordLike[i]!
    mergedTexts[i] = ''
  }

  // Forward-sticky clusters join the next non-empty entry when it is text.
  // Walking backwards lets a chain of clusters accumulate onto one target.
  for (let i = mergedLen - 2; i >= 0; i--) {
    if (mergedKinds[i] !== 'text') continue
    if (mergedWordLike[i]!) continue
    if (!isForwardStickyClusterSegment(mergedTexts[i]!)) continue

    let target = i + 1
    while (target < mergedLen && mergedTexts[target] === '') target++
    if (target >= mergedLen || mergedKinds[target] !== 'text') continue

    mergedTexts[target] = mergedTexts[i]! + mergedTexts[target]!
    mergedStarts[target] = mergedStarts[i]!
    mergedTexts[i] = ''
  }

  // Compact in place, dropping the entries emptied above.
  let compactLen = 0
  for (let read = 0; read < mergedLen; read++) {
    const text = mergedTexts[read]!
    if (text.length === 0) continue
    if (compactLen < read) {
      mergedTexts[compactLen] = text
      mergedWordLike[compactLen] = mergedWordLike[read]!
      mergedKinds[compactLen] = mergedKinds[read]!
      mergedStarts[compactLen] = mergedStarts[read]!
    }
    compactLen++
  }
  mergedTexts.length = compactLen
  mergedWordLike.length = compactLen
  mergedKinds.length = compactLen
  mergedStarts.length = compactLen

  // Remaining merge passes, innermost first.
  const glueMerged = mergeGlueConnectedTextRuns({
    len: compactLen,
    texts: mergedTexts,
    isWordLike: mergedWordLike,
    kinds: mergedKinds,
    starts: mergedStarts,
  })
  const urlMerged = mergeUrlQueryRuns(mergeUrlLikeRuns(glueMerged))
  const numericMerged = splitHyphenatedNumericRuns(mergeNumericRuns(urlMerged))
  const withMergedUrls = carryTrailingForwardStickyAcrossCJKBoundary(
    mergeAsciiPunctuationChains(numericMerged),
  )

  // A space entry that carries marks and is followed by Arabic text keeps only
  // its space part; the marks move to the front of the Arabic entry.
  const { texts, isWordLike, kinds, starts } = withMergedUrls
  for (let i = 0; i + 1 < withMergedUrls.len; i++) {
    const split = splitLeadingSpaceAndMarks(texts[i]!)
    if (split === null) continue

    const kind = kinds[i]
    const isSpaceKind = kind === 'space' || kind === 'preserved-space'
    if (!isSpaceKind || kinds[i + 1] !== 'text' || !containsArabicScript(texts[i + 1]!)) {
      continue
    }

    texts[i] = split.space
    isWordLike[i] = false
    kinds[i] = kind === 'preserved-space' ? 'preserved-space' : 'space'
    texts[i + 1] = split.marks + texts[i + 1]!
    starts[i + 1] = starts[i]! + split.space.length
  }

  return withMergedUrls
}
|
|
1041
|
+
|
|
1042
|
+
/**
 * Groups segment indices into chunks.
 *
 * When `whiteSpaceProfile.preserveHardBreaks` is off, the whole segmentation is
 * one chunk. When it is on, every `hard-break` segment closes the current
 * chunk: `endSegmentIndex` is the index of the break and
 * `consumedEndSegmentIndex` is one past it. Segments after the last hard break
 * form a final chunk; nothing is emitted after a trailing hard break.
 *
 * @param segmentation Segmentation to split.
 * @param whiteSpaceProfile Only `preserveHardBreaks` is read.
 * @returns The chunks in order; empty when there are no segments.
 */
function compileAnalysisChunks(segmentation: MergedSegmentation, whiteSpaceProfile: WhiteSpaceProfile): AnalysisChunk[] {
  const total = segmentation.len
  if (total === 0) return []

  const chunks: AnalysisChunk[] = []
  let chunkStart = 0

  if (whiteSpaceProfile.preserveHardBreaks) {
    for (let i = 0; i < total; i++) {
      if (segmentation.kinds[i] === 'hard-break') {
        chunks.push({
          startSegmentIndex: chunkStart,
          endSegmentIndex: i,
          consumedEndSegmentIndex: i + 1,
        })
        chunkStart = i + 1
      }
    }
  }

  // Either the only chunk (hard breaks ignored) or the remainder after the
  // last hard break.
  if (chunkStart < total) {
    chunks.push({
      startSegmentIndex: chunkStart,
      endSegmentIndex: total,
      consumedEndSegmentIndex: total,
    })
  }

  return chunks
}
|
|
1076
|
+
|
|
1077
|
+
/**
 * Merge pass used for `wordBreak: 'keep-all'`: joins a `text` segment onto the
 * previous output entry when that entry is also `text`, passes
 * `canContinueKeepAllTextRun`, and contains CJK text.
 *
 * A merged entry keeps the start offset of its first segment and is word-like
 * if any of its parts is. Segmentations with at most one entry are returned
 * unchanged (same object).
 *
 * @param segmentation Segmentation to merge.
 * @returns A new segmentation, or the input itself when `len <= 1`.
 */
function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegmentation {
  if (segmentation.len <= 1) return segmentation

  const texts: string[] = []
  const isWordLike: boolean[] = []
  const kinds: SegmentBreakKind[] = []
  const starts: number[] = []

  for (let i = 0; i < segmentation.len; i++) {
    const kind = segmentation.kinds[i]!
    const text = segmentation.texts[i]!
    const wordLike = segmentation.isWordLike[i]!
    const start = segmentation.starts[i]!
    const last = texts.length - 1

    // The decision is based on the already-accumulated previous entry, so a
    // run keeps growing for as long as the grown text still qualifies.
    const extendsPrevious =
      kind === 'text' &&
      last >= 0 &&
      kinds[last] === 'text' &&
      canContinueKeepAllTextRun(texts[last]!) &&
      containsCJKText(texts[last]!)

    if (extendsPrevious) {
      texts[last] += text
      isWordLike[last] = isWordLike[last]! || wordLike
    } else {
      texts.push(text)
      isWordLike.push(wordLike)
      kinds.push(kind)
      starts.push(start)
    }
  }

  return {
    len: texts.length,
    texts,
    isWordLike,
    kinds,
    starts,
  }
}
|
|
1118
|
+
|
|
1119
|
+
/**
 * Re-derives segment strings for `normalized` from segments that were cut on
 * the un-normalized `rawText`, walking both strings in parallel.
 *
 * Each original character is classified as one of:
 *  - kept:     identical to the current normalized character;
 *  - replaced: a whitespace character (space, tab, LF, CR, FF) whose normalized
 *              counterpart is `' '` or `'\n'`;
 *  - removed:  anything else; only the original cursor advances.
 *
 * NOTE(review): this assumes the normalizers only delete whitespace or replace
 * it with `' '` / `'\n'` — confirm against `normalizeWhitespaceNormal` and
 * `normalizeWhitespacePreWrap`.
 *
 * Segments that end up empty are dropped. Any normalized text that could not
 * be aligned is appended as one final non-word-like segment so that the
 * returned segments always concatenate to `normalized`.
 */
function realignSegmentsAfterNormalization(
  segments: string[],
  isWordLike: boolean[],
  rawText: string,
  normalized: string,
): { segments: string[]; isWordLike: boolean[] } {
  const alignedSegments: string[] = []
  const alignedIsWordLike: boolean[] = []
  let origOffset = 0
  let normOffset = 0

  for (let si = 0; si < segments.length && normOffset < normalized.length; si++) {
    const segEnd = origOffset + segments[si]!.length
    let segNormText = ''

    for (let oi = origOffset; oi < segEnd && normOffset < normalized.length; oi++) {
      const origCh = rawText[oi]!
      const normCh = normalized[normOffset]!

      let consumed = origCh === normCh
      if (!consumed) {
        const origIsWhitespace =
          origCh === ' ' || origCh === '\t' || origCh === '\n' || origCh === '\r' || origCh === '\f'
        // A whitespace character facing a normalized ' ' or '\n' was replaced,
        // not removed — unless the very next original character is the exact
        // match (e.g. the CR of CRLF, or a tab directly before a space), in
        // which case this one was removed and the next one is the match.
        consumed =
          origIsWhitespace &&
          (normCh === ' ' || normCh === '\n') &&
          rawText[oi + 1] !== normCh
      }

      if (consumed) {
        segNormText += normCh
        normOffset++
      }
      // Otherwise the character was collapsed/removed by normalization.
    }

    origOffset = segEnd

    if (segNormText.length > 0) {
      alignedSegments.push(segNormText)
      alignedIsWordLike.push(isWordLike[si] ?? false)
    }
  }

  // Never drop text silently: keep whatever could not be aligned.
  if (normOffset < normalized.length) {
    alignedSegments.push(normalized.slice(normOffset))
    alignedIsWordLike.push(false)
  }

  return { segments: alignedSegments, isWordLike: alignedIsWordLike }
}

// Modified for expo-pretext: accepts pre-segmented data from the native module.
// The native module performs Intl.Segmenter-equivalent word segmentation and
// provides the segments[] and isWordLike[] arrays.
/**
 * Analyzes pre-segmented text for layout.
 *
 * The full text is rebuilt by joining `segments`, whitespace-normalized
 * according to `whiteSpace`, and — when normalization changed it — the segments
 * are realigned to the normalized text. The result is then merged into
 * break-aware segments and grouped into chunks.
 *
 * @param segments Segment strings whose concatenation is the source text.
 * @param isWordLike Word-like flag per segment; a missing entry counts as `false`.
 * @param profile Analysis profile forwarded to `buildMergedSegmentation`.
 * @param whiteSpace White-space mode; defaults to `'normal'`.
 * @param wordBreak Word-break mode; `'keep-all'` additionally applies
 *   `mergeKeepAllTextSegments`. Defaults to `'normal'`.
 * @returns The normalized text, its chunks, and the merged segmentation
 *   arrays. All are empty when the normalized text is empty.
 */
export function analyzeText(
  segments: string[],
  isWordLike: boolean[],
  profile: AnalysisProfile,
  whiteSpace: WhiteSpaceMode = 'normal',
  wordBreak: WordBreakMode = 'normal',
): TextAnalysis {
  const whiteSpaceProfile = getWhiteSpaceProfile(whiteSpace)

  // Reconstruct the full text from the segments, then normalize it.
  const rawText = segments.join('')
  const normalized = whiteSpaceProfile.mode === 'pre-wrap'
    ? normalizeWhitespacePreWrap(rawText)
    : normalizeWhitespaceNormal(rawText)

  if (normalized.length === 0) {
    return {
      normalized,
      chunks: [],
      len: 0,
      texts: [],
      isWordLike: [],
      kinds: [],
      starts: [],
    }
  }

  // If normalization left the text untouched the native segments can be used
  // as they are. Otherwise their boundaries must be mapped onto the normalized
  // text, because removed or replaced whitespace shifts every later offset.
  let normalizedSegments = segments
  let normalizedIsWordLike = isWordLike
  if (normalized !== rawText) {
    const realigned = realignSegmentsAfterNormalization(segments, isWordLike, rawText, normalized)
    normalizedSegments = realigned.segments
    normalizedIsWordLike = realigned.isWordLike
  }

  const merged = buildMergedSegmentation(
    normalized,
    normalizedSegments,
    normalizedIsWordLike,
    profile,
    whiteSpaceProfile,
  )
  const segmentation = wordBreak === 'keep-all' ? mergeKeepAllTextSegments(merged) : merged

  return {
    normalized,
    chunks: compileAnalysisChunks(segmentation, whiteSpaceProfile),
    ...segmentation,
  }
}
|