@formatjs/intl-segmenter 12.0.6 → 12.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/polyfill.iife.js +44 -160
- package/src/segmenter.js +77 -1
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@formatjs/intl-segmenter",
|
|
3
3
|
"description": "Polyfill for Intl.Segmenter",
|
|
4
|
-
"version": "12.0.6",
|
|
4
|
+
"version": "12.0.7",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Matija Gaspar <matijagaspar@gmail.com>",
|
|
7
7
|
"type": "module",
|
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
},
|
|
13
13
|
"dependencies": {
|
|
14
14
|
"tslib": "^2.8.0",
|
|
15
|
-
"@formatjs/ecma402-abstract": "3.0.
|
|
16
|
-
"@formatjs/intl-localematcher": "0.7.
|
|
15
|
+
"@formatjs/ecma402-abstract": "3.0.7",
|
|
16
|
+
"@formatjs/intl-localematcher": "0.7.4"
|
|
17
17
|
},
|
|
18
18
|
"bugs": "https://github.com/formatjs/formatjs/issues",
|
|
19
19
|
"homepage": "https://github.com/formatjs/formatjs",
|
package/polyfill.iife.js
CHANGED
|
@@ -5817,17 +5817,7 @@
|
|
|
5817
5817
|
"SN",
|
|
5818
5818
|
"TG"
|
|
5819
5819
|
],
|
|
5820
|
-
"013": [
|
|
5821
|
-
"013",
|
|
5822
|
-
"BZ",
|
|
5823
|
-
"CR",
|
|
5824
|
-
"GT",
|
|
5825
|
-
"HN",
|
|
5826
|
-
"MX",
|
|
5827
|
-
"NI",
|
|
5828
|
-
"PA",
|
|
5829
|
-
"SV"
|
|
5830
|
-
],
|
|
5820
|
+
"013": ["013", "BZ", "CR", "GT", "HN", "MX", "NI", "PA", "SV"],
|
|
5831
5821
|
"014": [
|
|
5832
5822
|
"014",
|
|
5833
5823
|
"BI",
|
|
@@ -5853,38 +5843,9 @@
|
|
|
5853
5843
|
"ZM",
|
|
5854
5844
|
"ZW"
|
|
5855
5845
|
],
|
|
5856
|
-
"015": [
|
|
5857
|
-
"015",
|
|
5858
|
-
"DZ",
|
|
5859
|
-
"EA",
|
|
5860
|
-
"EG",
|
|
5861
|
-
"EH",
|
|
5862
|
-
"IC",
|
|
5863
|
-
"LY",
|
|
5864
|
-
"MA",
|
|
5865
|
-
"SD",
|
|
5866
|
-
"TN"
|
|
5867
|
-
],
|
|
5868
|
-
"017": [
|
|
5869
|
-
"017",
|
|
5870
|
-
"AO",
|
|
5871
|
-
"CD",
|
|
5872
|
-
"CF",
|
|
5873
|
-
"CG",
|
|
5874
|
-
"CM",
|
|
5875
|
-
"GA",
|
|
5876
|
-
"GQ",
|
|
5877
|
-
"ST",
|
|
5878
|
-
"TD"
|
|
5879
|
-
],
|
|
5880
|
-
"018": [
|
|
5881
|
-
"018",
|
|
5882
|
-
"BW",
|
|
5883
|
-
"LS",
|
|
5884
|
-
"NA",
|
|
5885
|
-
"SZ",
|
|
5886
|
-
"ZA"
|
|
5887
|
-
],
|
|
5846
|
+
"015": ["015", "DZ", "EA", "EG", "EH", "IC", "LY", "MA", "SD", "TN"],
|
|
5847
|
+
"017": ["017", "AO", "CD", "CF", "CG", "CM", "GA", "GQ", "ST", "TD"],
|
|
5848
|
+
"018": ["018", "BW", "LS", "NA", "SZ", "ZA"],
|
|
5888
5849
|
"019": [
|
|
5889
5850
|
"003",
|
|
5890
5851
|
"005",
|
|
@@ -5952,14 +5913,7 @@
|
|
|
5952
5913
|
"VG",
|
|
5953
5914
|
"VI"
|
|
5954
5915
|
],
|
|
5955
|
-
"021": [
|
|
5956
|
-
"021",
|
|
5957
|
-
"BM",
|
|
5958
|
-
"CA",
|
|
5959
|
-
"GL",
|
|
5960
|
-
"PM",
|
|
5961
|
-
"US"
|
|
5962
|
-
],
|
|
5916
|
+
"021": ["021", "BM", "CA", "GL", "PM", "US"],
|
|
5963
5917
|
"029": [
|
|
5964
5918
|
"029",
|
|
5965
5919
|
"AG",
|
|
@@ -5991,29 +5945,8 @@
|
|
|
5991
5945
|
"VG",
|
|
5992
5946
|
"VI"
|
|
5993
5947
|
],
|
|
5994
|
-
"030": [
|
|
5995
|
-
"030",
|
|
5996
|
-
"CN",
|
|
5997
|
-
"HK",
|
|
5998
|
-
"JP",
|
|
5999
|
-
"KP",
|
|
6000
|
-
"KR",
|
|
6001
|
-
"MN",
|
|
6002
|
-
"MO",
|
|
6003
|
-
"TW"
|
|
6004
|
-
],
|
|
6005
|
-
"034": [
|
|
6006
|
-
"034",
|
|
6007
|
-
"AF",
|
|
6008
|
-
"BD",
|
|
6009
|
-
"BT",
|
|
6010
|
-
"IN",
|
|
6011
|
-
"IR",
|
|
6012
|
-
"LK",
|
|
6013
|
-
"MV",
|
|
6014
|
-
"NP",
|
|
6015
|
-
"PK"
|
|
6016
|
-
],
|
|
5948
|
+
"030": ["030", "CN", "HK", "JP", "KP", "KR", "MN", "MO", "TW"],
|
|
5949
|
+
"034": ["034", "AF", "BD", "BT", "IN", "IR", "LK", "MV", "NP", "PK"],
|
|
6017
5950
|
"035": [
|
|
6018
5951
|
"035",
|
|
6019
5952
|
"BN",
|
|
@@ -6048,47 +5981,10 @@
|
|
|
6048
5981
|
"VA",
|
|
6049
5982
|
"XK"
|
|
6050
5983
|
],
|
|
6051
|
-
"053": [
|
|
6052
|
-
"053",
|
|
6053
|
-
"AU",
|
|
6054
|
-
"CC",
|
|
6055
|
-
"CX",
|
|
6056
|
-
"HM",
|
|
6057
|
-
"NF",
|
|
6058
|
-
"NZ"
|
|
6059
|
-
],
|
|
6060
|
-
"054": [
|
|
6061
|
-
"054",
|
|
6062
|
-
"FJ",
|
|
6063
|
-
"NC",
|
|
6064
|
-
"PG",
|
|
6065
|
-
"SB",
|
|
6066
|
-
"VU"
|
|
6067
|
-
],
|
|
6068
|
-
"057": [
|
|
6069
|
-
"057",
|
|
6070
|
-
"FM",
|
|
6071
|
-
"GU",
|
|
6072
|
-
"KI",
|
|
6073
|
-
"MH",
|
|
6074
|
-
"MP",
|
|
6075
|
-
"NR",
|
|
6076
|
-
"PW",
|
|
6077
|
-
"UM"
|
|
6078
|
-
],
|
|
6079
|
-
"061": [
|
|
6080
|
-
"061",
|
|
6081
|
-
"AS",
|
|
6082
|
-
"CK",
|
|
6083
|
-
"NU",
|
|
6084
|
-
"PF",
|
|
6085
|
-
"PN",
|
|
6086
|
-
"TK",
|
|
6087
|
-
"TO",
|
|
6088
|
-
"TV",
|
|
6089
|
-
"WF",
|
|
6090
|
-
"WS"
|
|
6091
|
-
],
|
|
5984
|
+
"053": ["053", "AU", "CC", "CX", "HM", "NF", "NZ"],
|
|
5985
|
+
"054": ["054", "FJ", "NC", "PG", "SB", "VU"],
|
|
5986
|
+
"057": ["057", "FM", "GU", "KI", "MH", "MP", "NR", "PW", "UM"],
|
|
5987
|
+
"061": ["061", "AS", "CK", "NU", "PF", "PN", "TK", "TO", "TV", "WF", "WS"],
|
|
6092
5988
|
"142": [
|
|
6093
5989
|
"030",
|
|
6094
5990
|
"034",
|
|
@@ -6148,14 +6044,7 @@
|
|
|
6148
6044
|
"VN",
|
|
6149
6045
|
"YE"
|
|
6150
6046
|
],
|
|
6151
|
-
"143": [
|
|
6152
|
-
"143",
|
|
6153
|
-
"KG",
|
|
6154
|
-
"KZ",
|
|
6155
|
-
"TJ",
|
|
6156
|
-
"TM",
|
|
6157
|
-
"UZ"
|
|
6158
|
-
],
|
|
6047
|
+
"143": ["143", "KG", "KZ", "TJ", "TM", "UZ"],
|
|
6159
6048
|
"145": [
|
|
6160
6049
|
"145",
|
|
6161
6050
|
"AE",
|
|
@@ -6237,19 +6126,7 @@
|
|
|
6237
6126
|
"VA",
|
|
6238
6127
|
"XK"
|
|
6239
6128
|
],
|
|
6240
|
-
"151": [
|
|
6241
|
-
"151",
|
|
6242
|
-
"BG",
|
|
6243
|
-
"BY",
|
|
6244
|
-
"CZ",
|
|
6245
|
-
"HU",
|
|
6246
|
-
"MD",
|
|
6247
|
-
"PL",
|
|
6248
|
-
"RO",
|
|
6249
|
-
"RU",
|
|
6250
|
-
"SK",
|
|
6251
|
-
"UA"
|
|
6252
|
-
],
|
|
6129
|
+
"151": ["151", "BG", "BY", "CZ", "HU", "MD", "PL", "RO", "RU", "SK", "UA"],
|
|
6253
6130
|
"154": [
|
|
6254
6131
|
"154",
|
|
6255
6132
|
"AX",
|
|
@@ -6270,18 +6147,7 @@
|
|
|
6270
6147
|
"SE",
|
|
6271
6148
|
"SJ"
|
|
6272
6149
|
],
|
|
6273
|
-
"155": [
|
|
6274
|
-
"155",
|
|
6275
|
-
"AT",
|
|
6276
|
-
"BE",
|
|
6277
|
-
"CH",
|
|
6278
|
-
"DE",
|
|
6279
|
-
"FR",
|
|
6280
|
-
"LI",
|
|
6281
|
-
"LU",
|
|
6282
|
-
"MC",
|
|
6283
|
-
"NL"
|
|
6284
|
-
],
|
|
6150
|
+
"155": ["155", "AT", "BE", "CH", "DE", "FR", "LI", "LU", "MC", "NL"],
|
|
6285
6151
|
"202": [
|
|
6286
6152
|
"011",
|
|
6287
6153
|
"014",
|
|
@@ -6400,7 +6266,7 @@
|
|
|
6400
6266
|
"VG",
|
|
6401
6267
|
"VI"
|
|
6402
6268
|
],
|
|
6403
|
-
"EU": [
|
|
6269
|
+
EU: [
|
|
6404
6270
|
"AT",
|
|
6405
6271
|
"BE",
|
|
6406
6272
|
"BG",
|
|
@@ -6430,7 +6296,7 @@
|
|
|
6430
6296
|
"SI",
|
|
6431
6297
|
"SK"
|
|
6432
6298
|
],
|
|
6433
|
-
"EZ": [
|
|
6299
|
+
EZ: [
|
|
6434
6300
|
"AT",
|
|
6435
6301
|
"BE",
|
|
6436
6302
|
"CY",
|
|
@@ -6452,15 +6318,8 @@
|
|
|
6452
6318
|
"SI",
|
|
6453
6319
|
"SK"
|
|
6454
6320
|
],
|
|
6455
|
-
"QO": [
|
|
6456
|
-
"AC",
|
|
6457
|
-
"AQ",
|
|
6458
|
-
"CP",
|
|
6459
|
-
"DG",
|
|
6460
|
-
"QO",
|
|
6461
|
-
"TA"
|
|
6462
|
-
],
|
|
6463
|
-
"UN": [
|
|
6321
|
+
QO: ["AC", "AQ", "CP", "DG", "QO", "TA"],
|
|
6322
|
+
UN: [
|
|
6464
6323
|
"AD",
|
|
6465
6324
|
"AE",
|
|
6466
6325
|
"AF",
|
|
@@ -8467,6 +8326,8 @@
|
|
|
8467
8326
|
};
|
|
8468
8327
|
|
|
8469
8328
|
// packages/intl-segmenter/src/segmenter.ts
|
|
8329
|
+
var WORD_CHARACTERS_BASIC_REGEX = /\w/;
|
|
8330
|
+
var WORD_CHARACTERS_UNICODE_REGEX = void 0;
|
|
8470
8331
|
var generateRuleRegex = (rule, variables, after) => {
|
|
8471
8332
|
return new RegExp(
|
|
8472
8333
|
`${after ? "^" : ""}${replaceVariables(variables, rule)}${after ? "" : "$"}`
|
|
@@ -8624,6 +8485,29 @@
|
|
|
8624
8485
|
));
|
|
8625
8486
|
__publicField(_Segmenter, "polyfilled", true);
|
|
8626
8487
|
var Segmenter = _Segmenter;
|
|
8488
|
+
function isSegmentWordLike(segment, matchingRule) {
|
|
8489
|
+
if (WORD_CHARACTERS_UNICODE_REGEX === void 0) {
|
|
8490
|
+
try {
|
|
8491
|
+
WORD_CHARACTERS_UNICODE_REGEX = new RegExp("[\\p{L}\\p{N}\\p{M}]", "u");
|
|
8492
|
+
} catch (e) {
|
|
8493
|
+
WORD_CHARACTERS_UNICODE_REGEX = null;
|
|
8494
|
+
}
|
|
8495
|
+
}
|
|
8496
|
+
let hasWordCharacters;
|
|
8497
|
+
if (WORD_CHARACTERS_UNICODE_REGEX) {
|
|
8498
|
+
hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment);
|
|
8499
|
+
} else {
|
|
8500
|
+
hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment);
|
|
8501
|
+
}
|
|
8502
|
+
if (hasWordCharacters) {
|
|
8503
|
+
return true;
|
|
8504
|
+
}
|
|
8505
|
+
const definitelyNotWordLikeRules = ["3.1", "3.2", "3.4"];
|
|
8506
|
+
if (definitelyNotWordLikeRules.includes(matchingRule)) {
|
|
8507
|
+
return false;
|
|
8508
|
+
}
|
|
8509
|
+
return false;
|
|
8510
|
+
}
|
|
8627
8511
|
var createSegmentDataObject = (segmenter, segment, index, input, matchingRule) => {
|
|
8628
8512
|
const returnValue = {
|
|
8629
8513
|
segment,
|
|
@@ -8631,7 +8515,7 @@
|
|
|
8631
8515
|
input
|
|
8632
8516
|
};
|
|
8633
8517
|
if (getSlot(segmenter, "granularity") === "word") {
|
|
8634
|
-
returnValue.isWordLike =
|
|
8518
|
+
returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
|
|
8635
8519
|
}
|
|
8636
8520
|
return returnValue;
|
|
8637
8521
|
};
|
package/src/segmenter.js
CHANGED
|
@@ -3,6 +3,12 @@ import { CanonicalizeLocaleList, GetOption, GetOptionsObject, SupportedLocales,
|
|
|
3
3
|
import { ResolveLocale } from '@formatjs/intl-localematcher';
|
|
4
4
|
import { SegmentationRules } from './cldr-segmentation-rules.generated.js';
|
|
5
5
|
import { isSurrogate, replaceVariables } from './segmentation-utils.js';
|
|
6
|
+
// Cached regex patterns for word character detection
|
|
7
|
+
// Note: Unicode property escape regex is created at runtime in try-catch
|
|
8
|
+
// to avoid compile-time errors when targeting ES5
|
|
9
|
+
var WORD_CHARACTERS_BASIC_REGEX = /\w/;
|
|
10
|
+
// Lazy-initialized Unicode word character regex (null if not supported)
|
|
11
|
+
var WORD_CHARACTERS_UNICODE_REGEX = undefined;
|
|
6
12
|
/**
|
|
7
13
|
* Adds $ to before rules and ^ to after rules for strictness
|
|
8
14
|
* Replaces variables
|
|
@@ -143,6 +149,76 @@ var Segmenter = /** @class */ (function () {
|
|
|
143
149
|
return Segmenter;
|
|
144
150
|
}());
|
|
145
151
|
export { Segmenter };
|
|
152
|
+
/**
|
|
153
|
+
* Determines if a segment is word-like according to Unicode Word Break rules.
|
|
154
|
+
*
|
|
155
|
+
* A segment is considered word-like if it contains alphabetic characters,
|
|
156
|
+
* numbers, or ideographs. Segments containing only whitespace, punctuation,
|
|
157
|
+
* or symbols are not word-like.
|
|
158
|
+
*
|
|
159
|
+
* Per Unicode Word Break (UAX #29) and native Intl.Segmenter implementations,
|
|
160
|
+
* this matches segments that contain characters from word character classes:
|
|
161
|
+
* ALetter, Hebrew_Letter, Numeric, Katakana, Hiragana, and Ideographic.
|
|
162
|
+
*
|
|
163
|
+
* @param segment - The text segment to check
|
|
164
|
+
* @param matchingRule - The word break rule that created this segment
|
|
165
|
+
* @returns true if the segment is word-like
|
|
166
|
+
*/
|
|
167
|
+
function isSegmentWordLike(segment, matchingRule) {
|
|
168
|
+
// Primary check: Does the segment contain word characters?
|
|
169
|
+
// Word-like segments contain letters (including ideographs), numbers,
|
|
170
|
+
// or connecting characters like apostrophes within words
|
|
171
|
+
//
|
|
172
|
+
// Regex matches:
|
|
173
|
+
// - Letters: \p{L} (all Unicode letters)
|
|
174
|
+
// - Numbers: \p{N} (all Unicode numbers)
|
|
175
|
+
// - Marks: \p{M} (combining marks, typically part of letters)
|
|
176
|
+
//
|
|
177
|
+
// Note: Using Unicode property escapes which work in modern JS engines
|
|
178
|
+
// and are necessary for proper internationalization
|
|
179
|
+
// Lazy-initialize Unicode regex on first use
|
|
180
|
+
if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
|
|
181
|
+
try {
|
|
182
|
+
// Create Unicode property escape regex at runtime to avoid compile-time TS1501 error
|
|
183
|
+
WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u');
|
|
184
|
+
}
|
|
185
|
+
catch (_a) {
|
|
186
|
+
// Environment doesn't support Unicode property escapes
|
|
187
|
+
WORD_CHARACTERS_UNICODE_REGEX = null;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
var hasWordCharacters;
|
|
191
|
+
if (WORD_CHARACTERS_UNICODE_REGEX) {
|
|
192
|
+
// Check if segment contains word characters using Unicode property escapes
|
|
193
|
+
// This matches the behavior of native Intl.Segmenter in Chrome/Firefox
|
|
194
|
+
hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment);
|
|
195
|
+
}
|
|
196
|
+
else {
|
|
197
|
+
// Fallback for environments without Unicode property escapes
|
|
198
|
+
// Match basic word characters: letters, numbers, underscores
|
|
199
|
+
hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment);
|
|
200
|
+
}
|
|
201
|
+
// If segment contains word characters, it's word-like
|
|
202
|
+
if (hasWordCharacters) {
|
|
203
|
+
return true;
|
|
204
|
+
}
|
|
205
|
+
// If no word characters, check if it's definitely not word-like via rules
|
|
206
|
+
// Non-word-like rules per Unicode Word Break specification (UAX #29):
|
|
207
|
+
// https://unicode.org/reports/tr29/#Word_Boundaries
|
|
208
|
+
//
|
|
209
|
+
// WB3a (3.1): Break before newlines (sot ÷ (Newline | CR | LF))
|
|
210
|
+
// WB3b (3.2): Break after newlines ((Newline | CR | LF) ÷ eot)
|
|
211
|
+
// WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace)
|
|
212
|
+
//
|
|
213
|
+
// These rules specifically identify non-word segments like line breaks and whitespace
|
|
214
|
+
var definitelyNotWordLikeRules = ['3.1', '3.2', '3.4'];
|
|
215
|
+
if (definitelyNotWordLikeRules.includes(matchingRule)) {
|
|
216
|
+
return false;
|
|
217
|
+
}
|
|
218
|
+
// For segments without word characters and not matching specific non-word rules,
|
|
219
|
+
// return false (e.g., punctuation, symbols, whitespace via rule 999)
|
|
220
|
+
return false;
|
|
221
|
+
}
|
|
146
222
|
var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) {
|
|
147
223
|
var returnValue = {
|
|
148
224
|
segment: segment,
|
|
@@ -150,7 +226,7 @@ var createSegmentDataObject = function (segmenter, segment, index, input, matchi
|
|
|
150
226
|
input: input,
|
|
151
227
|
};
|
|
152
228
|
if (getSlot(segmenter, 'granularity') === 'word') {
|
|
153
|
-
returnValue.isWordLike =
|
|
229
|
+
returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
|
|
154
230
|
}
|
|
155
231
|
return returnValue;
|
|
156
232
|
};
|