@formatjs/intl-segmenter 12.0.6 → 12.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@formatjs/intl-segmenter",
3
3
  "description": "Polyfill for Intl.Segmenter",
4
- "version": "12.0.6",
4
+ "version": "12.0.7",
5
5
  "license": "MIT",
6
6
  "author": "Matija Gaspar <matijagaspar@gmail.com>",
7
7
  "type": "module",
@@ -12,8 +12,8 @@
12
12
  },
13
13
  "dependencies": {
14
14
  "tslib": "^2.8.0",
15
- "@formatjs/ecma402-abstract": "3.0.6",
16
- "@formatjs/intl-localematcher": "0.7.3"
15
+ "@formatjs/ecma402-abstract": "3.0.7",
16
+ "@formatjs/intl-localematcher": "0.7.4"
17
17
  },
18
18
  "bugs": "https://github.com/formatjs/formatjs/issues",
19
19
  "homepage": "https://github.com/formatjs/formatjs",
package/polyfill.iife.js CHANGED
@@ -5817,17 +5817,7 @@
5817
5817
  "SN",
5818
5818
  "TG"
5819
5819
  ],
5820
- "013": [
5821
- "013",
5822
- "BZ",
5823
- "CR",
5824
- "GT",
5825
- "HN",
5826
- "MX",
5827
- "NI",
5828
- "PA",
5829
- "SV"
5830
- ],
5820
+ "013": ["013", "BZ", "CR", "GT", "HN", "MX", "NI", "PA", "SV"],
5831
5821
  "014": [
5832
5822
  "014",
5833
5823
  "BI",
@@ -5853,38 +5843,9 @@
5853
5843
  "ZM",
5854
5844
  "ZW"
5855
5845
  ],
5856
- "015": [
5857
- "015",
5858
- "DZ",
5859
- "EA",
5860
- "EG",
5861
- "EH",
5862
- "IC",
5863
- "LY",
5864
- "MA",
5865
- "SD",
5866
- "TN"
5867
- ],
5868
- "017": [
5869
- "017",
5870
- "AO",
5871
- "CD",
5872
- "CF",
5873
- "CG",
5874
- "CM",
5875
- "GA",
5876
- "GQ",
5877
- "ST",
5878
- "TD"
5879
- ],
5880
- "018": [
5881
- "018",
5882
- "BW",
5883
- "LS",
5884
- "NA",
5885
- "SZ",
5886
- "ZA"
5887
- ],
5846
+ "015": ["015", "DZ", "EA", "EG", "EH", "IC", "LY", "MA", "SD", "TN"],
5847
+ "017": ["017", "AO", "CD", "CF", "CG", "CM", "GA", "GQ", "ST", "TD"],
5848
+ "018": ["018", "BW", "LS", "NA", "SZ", "ZA"],
5888
5849
  "019": [
5889
5850
  "003",
5890
5851
  "005",
@@ -5952,14 +5913,7 @@
5952
5913
  "VG",
5953
5914
  "VI"
5954
5915
  ],
5955
- "021": [
5956
- "021",
5957
- "BM",
5958
- "CA",
5959
- "GL",
5960
- "PM",
5961
- "US"
5962
- ],
5916
+ "021": ["021", "BM", "CA", "GL", "PM", "US"],
5963
5917
  "029": [
5964
5918
  "029",
5965
5919
  "AG",
@@ -5991,29 +5945,8 @@
5991
5945
  "VG",
5992
5946
  "VI"
5993
5947
  ],
5994
- "030": [
5995
- "030",
5996
- "CN",
5997
- "HK",
5998
- "JP",
5999
- "KP",
6000
- "KR",
6001
- "MN",
6002
- "MO",
6003
- "TW"
6004
- ],
6005
- "034": [
6006
- "034",
6007
- "AF",
6008
- "BD",
6009
- "BT",
6010
- "IN",
6011
- "IR",
6012
- "LK",
6013
- "MV",
6014
- "NP",
6015
- "PK"
6016
- ],
5948
+ "030": ["030", "CN", "HK", "JP", "KP", "KR", "MN", "MO", "TW"],
5949
+ "034": ["034", "AF", "BD", "BT", "IN", "IR", "LK", "MV", "NP", "PK"],
6017
5950
  "035": [
6018
5951
  "035",
6019
5952
  "BN",
@@ -6048,47 +5981,10 @@
6048
5981
  "VA",
6049
5982
  "XK"
6050
5983
  ],
6051
- "053": [
6052
- "053",
6053
- "AU",
6054
- "CC",
6055
- "CX",
6056
- "HM",
6057
- "NF",
6058
- "NZ"
6059
- ],
6060
- "054": [
6061
- "054",
6062
- "FJ",
6063
- "NC",
6064
- "PG",
6065
- "SB",
6066
- "VU"
6067
- ],
6068
- "057": [
6069
- "057",
6070
- "FM",
6071
- "GU",
6072
- "KI",
6073
- "MH",
6074
- "MP",
6075
- "NR",
6076
- "PW",
6077
- "UM"
6078
- ],
6079
- "061": [
6080
- "061",
6081
- "AS",
6082
- "CK",
6083
- "NU",
6084
- "PF",
6085
- "PN",
6086
- "TK",
6087
- "TO",
6088
- "TV",
6089
- "WF",
6090
- "WS"
6091
- ],
5984
+ "053": ["053", "AU", "CC", "CX", "HM", "NF", "NZ"],
5985
+ "054": ["054", "FJ", "NC", "PG", "SB", "VU"],
5986
+ "057": ["057", "FM", "GU", "KI", "MH", "MP", "NR", "PW", "UM"],
5987
+ "061": ["061", "AS", "CK", "NU", "PF", "PN", "TK", "TO", "TV", "WF", "WS"],
6092
5988
  "142": [
6093
5989
  "030",
6094
5990
  "034",
@@ -6148,14 +6044,7 @@
6148
6044
  "VN",
6149
6045
  "YE"
6150
6046
  ],
6151
- "143": [
6152
- "143",
6153
- "KG",
6154
- "KZ",
6155
- "TJ",
6156
- "TM",
6157
- "UZ"
6158
- ],
6047
+ "143": ["143", "KG", "KZ", "TJ", "TM", "UZ"],
6159
6048
  "145": [
6160
6049
  "145",
6161
6050
  "AE",
@@ -6237,19 +6126,7 @@
6237
6126
  "VA",
6238
6127
  "XK"
6239
6128
  ],
6240
- "151": [
6241
- "151",
6242
- "BG",
6243
- "BY",
6244
- "CZ",
6245
- "HU",
6246
- "MD",
6247
- "PL",
6248
- "RO",
6249
- "RU",
6250
- "SK",
6251
- "UA"
6252
- ],
6129
+ "151": ["151", "BG", "BY", "CZ", "HU", "MD", "PL", "RO", "RU", "SK", "UA"],
6253
6130
  "154": [
6254
6131
  "154",
6255
6132
  "AX",
@@ -6270,18 +6147,7 @@
6270
6147
  "SE",
6271
6148
  "SJ"
6272
6149
  ],
6273
- "155": [
6274
- "155",
6275
- "AT",
6276
- "BE",
6277
- "CH",
6278
- "DE",
6279
- "FR",
6280
- "LI",
6281
- "LU",
6282
- "MC",
6283
- "NL"
6284
- ],
6150
+ "155": ["155", "AT", "BE", "CH", "DE", "FR", "LI", "LU", "MC", "NL"],
6285
6151
  "202": [
6286
6152
  "011",
6287
6153
  "014",
@@ -6400,7 +6266,7 @@
6400
6266
  "VG",
6401
6267
  "VI"
6402
6268
  ],
6403
- "EU": [
6269
+ EU: [
6404
6270
  "AT",
6405
6271
  "BE",
6406
6272
  "BG",
@@ -6430,7 +6296,7 @@
6430
6296
  "SI",
6431
6297
  "SK"
6432
6298
  ],
6433
- "EZ": [
6299
+ EZ: [
6434
6300
  "AT",
6435
6301
  "BE",
6436
6302
  "CY",
@@ -6452,15 +6318,8 @@
6452
6318
  "SI",
6453
6319
  "SK"
6454
6320
  ],
6455
- "QO": [
6456
- "AC",
6457
- "AQ",
6458
- "CP",
6459
- "DG",
6460
- "QO",
6461
- "TA"
6462
- ],
6463
- "UN": [
6321
+ QO: ["AC", "AQ", "CP", "DG", "QO", "TA"],
6322
+ UN: [
6464
6323
  "AD",
6465
6324
  "AE",
6466
6325
  "AF",
@@ -8467,6 +8326,8 @@
8467
8326
  };
8468
8327
 
8469
8328
  // packages/intl-segmenter/src/segmenter.ts
8329
+ var WORD_CHARACTERS_BASIC_REGEX = /\w/;
8330
+ var WORD_CHARACTERS_UNICODE_REGEX = void 0;
8470
8331
  var generateRuleRegex = (rule, variables, after) => {
8471
8332
  return new RegExp(
8472
8333
  `${after ? "^" : ""}${replaceVariables(variables, rule)}${after ? "" : "$"}`
@@ -8624,6 +8485,29 @@
8624
8485
  ));
8625
8486
  __publicField(_Segmenter, "polyfilled", true);
8626
8487
  var Segmenter = _Segmenter;
8488
/**
 * Reports whether a word-granularity segment is "word-like".
 *
 * A segment is word-like when it contains at least one Unicode letter,
 * number or combining mark. When the engine cannot compile Unicode property
 * escapes, WORD_CHARACTERS_BASIC_REGEX is used instead.
 *
 * @param segment - The text segment to check
 * @param matchingRule - Id of the break rule that produced the segment.
 *   Kept so existing call sites stay valid; it does not influence the result.
 * @returns true if the segment contains a word character, false otherwise
 */
function isSegmentWordLike(segment, matchingRule) {
  // Compile the Unicode-aware pattern once, on first use. `null` records that
  // the engine rejected it so the attempt is not repeated on every call.
  if (WORD_CHARACTERS_UNICODE_REGEX === void 0) {
    try {
      WORD_CHARACTERS_UNICODE_REGEX = new RegExp("[\\p{L}\\p{N}\\p{M}]", "u");
    } catch (e) {
      // Deliberate best-effort: fall back to the basic pattern below.
      WORD_CHARACTERS_UNICODE_REGEX = null;
    }
  }
  // The previous rule-id lookup returned false on both of its paths, so the
  // character test alone decides the result. Dropping it also removes a call
  // to Array.prototype.includes, which engines old enough to need the
  // fallback pattern may not provide.
  const wordCharacters = WORD_CHARACTERS_UNICODE_REGEX || WORD_CHARACTERS_BASIC_REGEX;
  return wordCharacters.test(segment);
}
8627
8511
  var createSegmentDataObject = (segmenter, segment, index, input, matchingRule) => {
8628
8512
  const returnValue = {
8629
8513
  segment,
@@ -8631,7 +8515,7 @@
8631
8515
  input
8632
8516
  };
8633
8517
  if (getSlot(segmenter, "granularity") === "word") {
8634
- returnValue.isWordLike = matchingRule !== "3.1" && matchingRule !== "3.2";
8518
+ returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
8635
8519
  }
8636
8520
  return returnValue;
8637
8521
  };
package/src/segmenter.js CHANGED
@@ -3,6 +3,12 @@ import { CanonicalizeLocaleList, GetOption, GetOptionsObject, SupportedLocales,
3
3
  import { ResolveLocale } from '@formatjs/intl-localematcher';
4
4
  import { SegmentationRules } from './cldr-segmentation-rules.generated.js';
5
5
  import { isSurrogate, replaceVariables } from './segmentation-utils.js';
6
+ // Cached regex patterns for word character detection
7
+ // Note: Unicode property escape regex is created at runtime in try-catch
8
+ // to avoid compile-time errors when targeting ES5
9
+ var WORD_CHARACTERS_BASIC_REGEX = /\w/;
10
+ // Lazy-initialized Unicode word character regex (null if not supported)
11
+ var WORD_CHARACTERS_UNICODE_REGEX = undefined;
6
12
  /**
7
13
  * Adds $ to before rules and ^ to after rules for strictness
8
14
  * Replaces variables
@@ -143,6 +149,76 @@ var Segmenter = /** @class */ (function () {
143
149
  return Segmenter;
144
150
  }());
145
151
  export { Segmenter };
152
/**
 * Determines whether a word-granularity segment is "word-like".
 *
 * A segment is word-like when it contains at least one character matched by
 * the word-character pattern:
 *   - \p{L}: any Unicode letter (ideographs included)
 *   - \p{N}: any Unicode number
 *   - \p{M}: combining marks
 * Segments made up only of other characters (whitespace, line breaks,
 * punctuation, symbols) are not word-like. Apostrophes and other punctuation
 * are not matched on their own.
 *
 * If the engine cannot compile Unicode property escapes, the check falls back
 * to WORD_CHARACTERS_BASIC_REGEX, so in that case only the characters that
 * pattern matches make a segment word-like.
 *
 * @param segment - The text segment to check
 * @param matchingRule - Id of the word break rule that created this segment.
 *   Retained so existing callers keep working; the result does not depend on it.
 * @returns true if the segment contains a word character, false otherwise
 */
function isSegmentWordLike(segment, matchingRule) {
    // Lazily build the Unicode pattern at runtime (not as a literal) so that
    // engines or compile targets without property-escape support do not fail
    // when this module is loaded. `null` marks "tried and unsupported".
    if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
        try {
            WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u');
        }
        catch (_a) {
            // Deliberate best-effort: use the basic pattern from now on.
            WORD_CHARACTERS_UNICODE_REGEX = null;
        }
    }
    // The character test alone decides the outcome. An earlier rule-id lookup
    // ('3.1', '3.2', '3.4') returned false whether or not it matched, so it
    // has been dropped; that also avoids Array.prototype.includes, which is
    // not available in ES5-level engines that take the fallback path.
    var wordCharacters = WORD_CHARACTERS_UNICODE_REGEX || WORD_CHARACTERS_BASIC_REGEX;
    return wordCharacters.test(segment);
}
146
222
  var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) {
147
223
  var returnValue = {
148
224
  segment: segment,
@@ -150,7 +226,7 @@ var createSegmentDataObject = function (segmenter, segment, index, input, matchi
150
226
  input: input,
151
227
  };
152
228
  if (getSlot(segmenter, 'granularity') === 'word') {
153
- returnValue.isWordLike = matchingRule !== '3.1' && matchingRule !== '3.2';
229
+ returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
154
230
  }
155
231
  return returnValue;
156
232
  };