npm - @formatjs/intl-segmenter - Versions diffs - 12.0.6 → 12.0.7 - Mend

@formatjs/intl-segmenter 12.0.6 → 12.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@formatjs/intl-segmenter",
   "description": "Polyfill for Intl.Segmenter",
-  "version": "12.0.6",
+  "version": "12.0.7",
   "license": "MIT",
   "author": "Matija Gaspar <matijagaspar@gmail.com>",
   "type": "module",
@@ -12,8 +12,8 @@
   },
   "dependencies": {
     "tslib": "^2.8.0",
-    "@formatjs/ecma402-abstract": "3.0.6",
-    "@formatjs/intl-localematcher": "0.7.3"
+    "@formatjs/ecma402-abstract": "3.0.7",
+    "@formatjs/intl-localematcher": "0.7.4"
   },
   "bugs": "https://github.com/formatjs/formatjs/issues",
   "homepage": "https://github.com/formatjs/formatjs",

package/polyfill.iife.js CHANGED Viewed

@@ -5817,17 +5817,7 @@
       "SN",
       "TG"
     ],
-    "013": [
-      "013",
-      "BZ",
-      "CR",
-      "GT",
-      "HN",
-      "MX",
-      "NI",
-      "PA",
-      "SV"
-    ],
+    "013": ["013", "BZ", "CR", "GT", "HN", "MX", "NI", "PA", "SV"],
     "014": [
       "014",
       "BI",
@@ -5853,38 +5843,9 @@
       "ZM",
       "ZW"
     ],
-    "015": [
-      "015",
-      "DZ",
-      "EA",
-      "EG",
-      "EH",
-      "IC",
-      "LY",
-      "MA",
-      "SD",
-      "TN"
-    ],
-    "017": [
-      "017",
-      "AO",
-      "CD",
-      "CF",
-      "CG",
-      "CM",
-      "GA",
-      "GQ",
-      "ST",
-      "TD"
-    ],
-    "018": [
-      "018",
-      "BW",
-      "LS",
-      "NA",
-      "SZ",
-      "ZA"
-    ],
+    "015": ["015", "DZ", "EA", "EG", "EH", "IC", "LY", "MA", "SD", "TN"],
+    "017": ["017", "AO", "CD", "CF", "CG", "CM", "GA", "GQ", "ST", "TD"],
+    "018": ["018", "BW", "LS", "NA", "SZ", "ZA"],
     "019": [
       "003",
       "005",
@@ -5952,14 +5913,7 @@
       "VG",
       "VI"
     ],
-    "021": [
-      "021",
-      "BM",
-      "CA",
-      "GL",
-      "PM",
-      "US"
-    ],
+    "021": ["021", "BM", "CA", "GL", "PM", "US"],
     "029": [
       "029",
       "AG",
@@ -5991,29 +5945,8 @@
       "VG",
       "VI"
     ],
-    "030": [
-      "030",
-      "CN",
-      "HK",
-      "JP",
-      "KP",
-      "KR",
-      "MN",
-      "MO",
-      "TW"
-    ],
-    "034": [
-      "034",
-      "AF",
-      "BD",
-      "BT",
-      "IN",
-      "IR",
-      "LK",
-      "MV",
-      "NP",
-      "PK"
-    ],
+    "030": ["030", "CN", "HK", "JP", "KP", "KR", "MN", "MO", "TW"],
+    "034": ["034", "AF", "BD", "BT", "IN", "IR", "LK", "MV", "NP", "PK"],
     "035": [
       "035",
       "BN",
@@ -6048,47 +5981,10 @@
       "VA",
       "XK"
     ],
-    "053": [
-      "053",
-      "AU",
-      "CC",
-      "CX",
-      "HM",
-      "NF",
-      "NZ"
-    ],
-    "054": [
-      "054",
-      "FJ",
-      "NC",
-      "PG",
-      "SB",
-      "VU"
-    ],
-    "057": [
-      "057",
-      "FM",
-      "GU",
-      "KI",
-      "MH",
-      "MP",
-      "NR",
-      "PW",
-      "UM"
-    ],
-    "061": [
-      "061",
-      "AS",
-      "CK",
-      "NU",
-      "PF",
-      "PN",
-      "TK",
-      "TO",
-      "TV",
-      "WF",
-      "WS"
-    ],
+    "053": ["053", "AU", "CC", "CX", "HM", "NF", "NZ"],
+    "054": ["054", "FJ", "NC", "PG", "SB", "VU"],
+    "057": ["057", "FM", "GU", "KI", "MH", "MP", "NR", "PW", "UM"],
+    "061": ["061", "AS", "CK", "NU", "PF", "PN", "TK", "TO", "TV", "WF", "WS"],
     "142": [
       "030",
       "034",
@@ -6148,14 +6044,7 @@
       "VN",
       "YE"
     ],
-    "143": [
-      "143",
-      "KG",
-      "KZ",
-      "TJ",
-      "TM",
-      "UZ"
-    ],
+    "143": ["143", "KG", "KZ", "TJ", "TM", "UZ"],
     "145": [
       "145",
       "AE",
@@ -6237,19 +6126,7 @@
       "VA",
       "XK"
     ],
-    "151": [
-      "151",
-      "BG",
-      "BY",
-      "CZ",
-      "HU",
-      "MD",
-      "PL",
-      "RO",
-      "RU",
-      "SK",
-      "UA"
-    ],
+    "151": ["151", "BG", "BY", "CZ", "HU", "MD", "PL", "RO", "RU", "SK", "UA"],
     "154": [
       "154",
       "AX",
@@ -6270,18 +6147,7 @@
       "SE",
       "SJ"
     ],
-    "155": [
-      "155",
-      "AT",
-      "BE",
-      "CH",
-      "DE",
-      "FR",
-      "LI",
-      "LU",
-      "MC",
-      "NL"
-    ],
+    "155": ["155", "AT", "BE", "CH", "DE", "FR", "LI", "LU", "MC", "NL"],
     "202": [
       "011",
       "014",
@@ -6400,7 +6266,7 @@
       "VG",
       "VI"
     ],
-    "EU": [
+    EU: [
       "AT",
       "BE",
       "BG",
@@ -6430,7 +6296,7 @@
       "SI",
       "SK"
     ],
-    "EZ": [
+    EZ: [
       "AT",
       "BE",
       "CY",
@@ -6452,15 +6318,8 @@
       "SI",
       "SK"
     ],
-    "QO": [
-      "AC",
-      "AQ",
-      "CP",
-      "DG",
-      "QO",
-      "TA"
-    ],
-    "UN": [
+    QO: ["AC", "AQ", "CP", "DG", "QO", "TA"],
+    UN: [
       "AD",
       "AE",
       "AF",
@@ -8467,6 +8326,8 @@
   };
   // packages/intl-segmenter/src/segmenter.ts
+  var WORD_CHARACTERS_BASIC_REGEX = /\w/; // ASCII-only fallback: letters, digits, underscore
+  var WORD_CHARACTERS_UNICODE_REGEX = void 0; // undefined until first use; afterwards a RegExp, or null if unsupported
   var generateRuleRegex = (rule, variables, after) => {
     return new RegExp(
       `${after ? "^" : ""}${replaceVariables(variables, rule)}${after ? "" : "$"}`
@@ -8624,6 +8485,29 @@
   ));
   __publicField(_Segmenter, "polyfilled", true);
   var Segmenter = _Segmenter;
+  function isSegmentWordLike(segment, matchingRule) { // true when segment contains a letter, number, or combining mark
+    if (WORD_CHARACTERS_UNICODE_REGEX === void 0) { // undefined = support not probed yet; probe once
+      try {
+        WORD_CHARACTERS_UNICODE_REGEX = new RegExp("[\\p{L}\\p{N}\\p{M}]", "u"); // built at runtime so an unsupported engine throws here
+      } catch (e) {
+        WORD_CHARACTERS_UNICODE_REGEX = null; // null = property escapes unsupported; later calls use the fallback
+      }
+    }
+    let hasWordCharacters;
+    if (WORD_CHARACTERS_UNICODE_REGEX) {
+      hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment); // one match anywhere in the segment is enough
+    } else {
+      hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment); // ASCII-only; unlike the Unicode pattern this also accepts "_"
+    }
+    if (hasWordCharacters) {
+      return true;
+    }
+    const definitelyNotWordLikeRules = ["3.1", "3.2", "3.4"]; // rule ids for newline and whitespace breaks
+    if (definitelyNotWordLikeRules.includes(matchingRule)) {
+      return false; // NOTE(review): redundant today, the fall-through below returns false as well
+    }
+    return false; // no word characters: never word-like, whatever the rule
+  }
   var createSegmentDataObject = (segmenter, segment, index, input, matchingRule) => {
     const returnValue = {
       segment,
@@ -8631,7 +8515,7 @@
       input
     };
     if (getSlot(segmenter, "granularity") === "word") {
-      returnValue.isWordLike = matchingRule !== "3.1" && matchingRule !== "3.2";
+      returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
     }
     return returnValue;
   };

package/src/segmenter.js CHANGED Viewed

@@ -3,6 +3,12 @@ import { CanonicalizeLocaleList, GetOption, GetOptionsObject, SupportedLocales,
 import { ResolveLocale } from '@formatjs/intl-localematcher';
 import { SegmentationRules } from './cldr-segmentation-rules.generated.js';
 import { isSurrogate, replaceVariables } from './segmentation-utils.js';
+// Cached regex patterns for word character detection (used by isSegmentWordLike)
+// Note: Unicode property escape regex is created at runtime in try-catch
+// to avoid compile-time errors when targeting ES5; /\w/ below is the ASCII-only fallback
+var WORD_CHARACTERS_BASIC_REGEX = /\w/;
+// Lazily initialized: undefined = not probed yet, null = not supported, otherwise the RegExp
+var WORD_CHARACTERS_UNICODE_REGEX = undefined;
 /**
  * Adds $ to before rules and ^ to after rules for strictness
  * Replaces variables
@@ -143,6 +149,76 @@ var Segmenter = /** @class */ (function () {
     return Segmenter;
 }());
 export { Segmenter };
+/**
+ * Decides whether a word-granularity segment should be reported as word-like.
+ *
+ * The decision is a character-class test on the segment text: the segment is
+ * word-like when it contains at least one letter, number, or combining mark.
+ * Segments made up only of whitespace, punctuation, or symbols are not.
+ *
+ * This approximates the UAX #29 word character classes (ALetter,
+ * Hebrew_Letter, Numeric, Katakana, Hiragana, Ideographic) with the Unicode
+ * general categories L, N and M. When the engine rejects Unicode property
+ * escapes, the ASCII-only /\w/ is used instead; unlike the Unicode pattern,
+ * /\w/ also accepts "_".
+ *
+ * @param segment - The text segment to check
+ * @param matchingRule - Id of the word break rule that produced the segment.
+ *   It is consulted only after the character test fails, and every path from
+ *   there returns false, so it does not currently change the result.
+ * @returns true if the segment contains a word character, false otherwise
+ */
+function isSegmentWordLike(segment, matchingRule) {
+    // Primary check: does the segment contain at least one word character?
+    // A single match anywhere in the segment is enough, so punctuation inside
+    // a segment does not matter as long as a letter, number, or mark is present.
+    //
+    // The character class matches:
+    // - Letters: \p{L} (all Unicode letters, including ideographs)
+    // - Numbers: \p{N} (all Unicode numbers)
+    // - Marks: \p{M} (combining marks, typically part of letters)
+    //
+    // Unicode property escapes need the "u" flag and an engine that supports
+    // them; the try/catch below detects engines that do not.
+    // undefined means "not probed yet"; the probe runs once, on first use.
+    if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
+        try {
+            // Built from a string at runtime; a regex literal would raise the compile-time TS1501 error
+            WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u');
+        }
+        catch (_a) {
+            // Constructor threw: no Unicode property escape support. Cache null so the probe is not retried
+            WORD_CHARACTERS_UNICODE_REGEX = null;
+        }
+    }
+    var hasWordCharacters;
+    if (WORD_CHARACTERS_UNICODE_REGEX) {
+        // Check if segment contains word characters using Unicode property escapes
+        // NOTE(review): intended to mirror native Intl.Segmenter in Chrome/Firefox; not verified here
+        hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment);
+    }
+    else {
+        // Fallback for environments without Unicode property escapes
+        // ASCII only: /\w/ matches [A-Za-z0-9_], so non-ASCII letters are not detected on this path
+        hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment);
+    }
+    // If segment contains word characters, it's word-like
+    if (hasWordCharacters) {
+        return true;
+    }
+    // No word characters: the segment is not word-like regardless of the rule.
+    // The rule ids below follow the Unicode Word Break specification (UAX #29):
+    // https://unicode.org/reports/tr29/#Word_Boundaries
+    //
+    // WB3a (3.1): Break after newlines ((Newline | CR | LF) ÷)
+    // WB3b (3.2): Break before newlines (÷ (Newline | CR | LF))
+    // WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace)
+    //
+    // NOTE(review): this check is redundant today, since the fall-through also returns false
+    var definitelyNotWordLikeRules = ['3.1', '3.2', '3.4'];
+    if (definitelyNotWordLikeRules.includes(matchingRule)) {
+        return false;
+    }
+    // Everything else without word characters (e.g., punctuation, symbols,
+    // whitespace via rule 999) is likewise not word-like
+    return false;
+}
 var createSegmentDataObject = function (segmenter, segment, index, input, matchingRule) {
     var returnValue = {
         segment: segment,
@@ -150,7 +226,7 @@ var createSegmentDataObject = function (segmenter, segment, index, input, matchi
         input: input,
     };
     if (getSlot(segmenter, 'granularity') === 'word') {
-        returnValue.isWordLike = matchingRule !== '3.1' && matchingRule !== '3.2';
+        returnValue.isWordLike = isSegmentWordLike(segment, matchingRule);
     }
     return returnValue;
 };