mojijs 4.0.0 → 5.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/HISTORY.md CHANGED
@@ -1,5 +1,31 @@
1
1
  # History
2
2
 
3
+ ## v5.0.1
4
+
5
+ ### 不具合修正
6
+
7
+ - v5.0.0 にて異体字セレクタを結合文字として判定してしまう問題を修正
8
+
9
+ ## v5.0.0
10
+
11
+ ### 機能改善
12
+
13
+ - 異体字セレクタの判定に、注釈機能を追加
14
+ - 絵文字の判定を強化
15
+ - 記号の判定を追加
16
+ - Unicodeの制御文字を追加
17
+ - Unicodeブロックに CJK Unified Ideographs Extension I (U+2EBF0–U+2EE5F) を追加
18
+ - Unicodeブロックに CJK Unified Ideographs Extension J (U+323B0–U+3347F) を追加
19
+
20
+ ### 変更
21
+
22
+ - travisが動作しないので除去
23
+
24
+ ### 不具合修正
25
+
26
+ - 結合していない文字も結合文字と判定する場合があるのを修正
27
+ - getVariationSelectorsNumberFromCodePoint が戻り値として意図しない文字列を返す問題を修正
28
+
3
29
  ## v4.0.0
4
30
 
5
31
  ### 機能改善
package/README.md CHANGED
@@ -1,28 +1,32 @@
1
- # MojiJS #
2
- [![Build Status](https://travis-ci.org/natade-jp/MojiJS.svg?branch=master)](https://travis-ci.org/natade-jp/MojiJS)
1
+ # MojiJS
2
+
3
3
  [![ESDoc coverage badge](https://natade-jp.github.io/MojiJS/docs/badge.svg)](https://natade-jp.github.io/MojiJS/docs/)
4
4
  ![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)
5
5
 
6
- ## What ##
7
- - 日本語の文字データを解析及び、変換するライブラリです。
8
- - [詳細なAPIを公開しています。](https://natade-jp.github.io/MojiJS/docs/)
9
- - [動作例](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/) (コンソール及び[ソースコード](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/main.mjs)を確認してみてください。)
10
- - [npm](https://www.npmjs.com/package/mojijs)
6
+ ## What
7
+
8
+ - 日本語の文字データを解析及び、変換するライブラリです。
9
+ - [詳細な API を公開しています。](https://natade-jp.github.io/MojiJS/docs/)
10
+ - [動作例](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/) (コンソール及び[ソースコード](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/main.mjs)を確認してみてください。)
11
+ - [npm](https://www.npmjs.com/package/mojijs)
11
12
 
12
13
  以下のことが行えます
13
- - エンコード(UTF-8 / UTF-16 / UTF-32 / Shift_JIS / Shift_JIS-2004 / EUC-JP / EUC-JIS-2004 )
14
- - 日本語の変換 (ひらがな, カタカナ, 半角, 全角, ローマ字 など))
15
- - 漢字の判定 (常用漢字, 人名用漢字, 面区点, 漢字水準 など)
16
- - 自然順ソート
17
14
 
18
- ## Install ##
15
+ - エンコード (UTF-8 / UTF-16 / UTF-32 / Shift_JIS / Shift_JIS-2004 / EUC-JP / EUC-JIS-2004)
16
+ - 日本語の変換 (ひらがな, カタカナ, 半角, 全角, ローマ字 など)
17
+ - 漢字の判定 (常用漢字, 人名用漢字, 面区点, 漢字水準 など)
18
+ - 自然順ソート
19
+
20
+ ## Install
21
+
19
22
  ```
20
23
  npm install --save-dev mojijs
21
24
  ```
22
25
 
23
- ## Sample ##
26
+ ## Sample
24
27
 
25
28
  ### エンコード
29
+
26
30
  ```javascript
27
31
  const MojiJS = require("mojijs");
28
32
 
@@ -34,6 +38,7 @@ console.log(MojiJS.decode([0x61, 0xE3, 0x81, 0x82], "utf-8"));
34
38
  ```
35
39
 
36
40
  ### 日本語の変換
41
+
37
42
  ```javascript
38
43
  const MojiJS = require("mojijs");
39
44
 
@@ -42,6 +47,7 @@ console.log(MojiJS.toHiragana("カキクケコ"));
42
47
  ```
43
48
 
44
49
  ### 面区点
50
+
45
51
  ```javascript
46
52
  const MojiJS = require("mojijs");
47
53
 
@@ -61,6 +67,7 @@ console.log("面区点:" + data3.encode.menkuten.text + ", 漢字水準:" +
61
67
  ```
62
68
 
63
69
  ### 自然順ソート
70
+
64
71
  ```javascript
65
72
  const MojiJS = require("mojijs");
66
73
 
@@ -68,5 +75,6 @@ console.log(["3", "02", "あ", "イ", "う", "1"].sort(MojiJS.compareToForNatu
68
75
  -> [ '1', '02', '3', 'あ', 'イ', 'う' ]
69
76
  ```
70
77
 
71
- ## Author ##
72
- - [natade-jp](https://github.com/natade-jp/)
78
+ ## Author
79
+
80
+ - [natade-jp](https://github.com/natade-jp/)
@@ -591,20 +591,32 @@ class Unicode {
591
591
  * @returns {boolean} 確認結果
592
592
  */
593
593
  static isCombiningMarkFromCodePoint(codepoint) {
594
- return (
595
- // Combining Diacritical Marks
596
- ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
597
- // Combining Diacritical Marks Extended
598
- ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
599
- // Combining Diacritical Marks Supplement
600
- ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
601
- // Combining Diacritical Marks for Symbols
602
- ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
603
- // Hiragana 含まれる4種類の文字
604
- ((0x3099 <= codepoint) && (codepoint <= 0x309C)) ||
605
- // Combining Half Marks
606
- ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
607
- );
594
+ // 異体字セレクタは除外
595
+ if (Unicode.isVariationSelectorFromCodePoint(codepoint)) {
596
+ return false;
597
+ }
598
+ try {
599
+ new RegExp("\\p{Mark}", "u");
600
+ return /\p{Mark}/u.test(String.fromCodePoint(codepoint));
601
+ } catch (e) {
602
+ // フォールバック処理
603
+ return (
604
+ // Combining Diacritical Marks
605
+ ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
606
+ // Combining Diacritical Marks Extended
607
+ ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
608
+ // Combining Diacritical Marks Supplement
609
+ ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
610
+ // Combining Diacritical Marks for Symbols
611
+ ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
612
+ // 日本語に含まれる2種類の文字
613
+ // COMBINING VOICED SOUND MARK
614
+ // COMBINING SEMI-VOICED SOUND MARK
615
+ ((0x3099 <= codepoint) && (codepoint <= 0x309A)) ||
616
+ // Combining Half Marks
617
+ ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
618
+ );
619
+ }
608
620
  }
609
621
 
610
622
 
@@ -3997,15 +4009,111 @@ class MOJI_CHAR_MAP {
3997
4009
  // 制御文字、VSは多いため含めていない
3998
4010
 
3999
4011
  control_charcter_map = {
4000
- 0: "NUL", 1: "SOH", 2: "STX", 3: "ETX", 4: "EOT", 5: "ENQ", 6: "ACK", 7: "BEL",
4001
- 8: "BS", 9: "HT", 10: "LF", 11: "VT", 12: "FF", 13: "CR", 14: "SO", 15: "SI",
4002
- 16: "DLE", 17: "DC1", 18: "DC2", 19: "DC3", 20: "DC4", 21: "NAK", 22: "SYN", 23: "ETB",
4003
- 24: "CAN", 25: "EM", 26: "SUB", 27: "ESC", 28: "FS", 29: "GS", 30: "RS", 31: "US",
4004
- 127: "DEL", 128: "PAD", 129: "HOP", 130: "BPH", 131: "NBH", 132: "IND", 133: "NEL", 134: "SSA",
4005
- 135: "ESA", 136: "HTS", 137: "HTJ", 138: "VTS", 139: "PLD", 140: "PLU", 141: "RI", 142: "SS2",
4006
- 143: "SS3", 144: "DCS", 145: "PU1", 146: "PU2", 147: "STS", 148: "CCH", 149: "MW", 150: "SPA",
4007
- 151: "EPA", 152: "SOS", 153: "SGCI", 154: "SCI", 155: "CSI", 156: "ST", 157: "OSC", 158: "PM",
4008
- 159: "APC", 160: "NBSP", 173: "SHY", 65529: "IAA", 65530: "IAS", 65531: "IAT"
4012
+ // --- C0 control characters (ASCII 0x00–0x1F) ---
4013
+ 0: "NUL", // Null
4014
+ 1: "SOH", // Start of Heading
4015
+ 2: "STX", // Start of Text
4016
+ 3: "ETX", // End of Text
4017
+ 4: "EOT", // End of Transmission
4018
+ 5: "ENQ", // Enquiry
4019
+ 6: "ACK", // Acknowledge
4020
+ 7: "BEL", // Bell (beep)
4021
+
4022
+ 8: "BS", // Backspace
4023
+ 9: "HT", // Horizontal Tab
4024
+ 10: "LF", // Line Feed
4025
+ 11: "VT", // Vertical Tab
4026
+ 12: "FF", // Form Feed
4027
+ 13: "CR", // Carriage Return
4028
+ 14: "SO", // Shift Out
4029
+ 15: "SI", // Shift In
4030
+
4031
+ 16: "DLE", // Data Link Escape
4032
+ 17: "DC1", // Device Control 1 (XON)
4033
+ 18: "DC2", // Device Control 2
4034
+ 19: "DC3", // Device Control 3 (XOFF)
4035
+ 20: "DC4", // Device Control 4
4036
+ 21: "NAK", // Negative Acknowledge
4037
+ 22: "SYN", // Synchronous Idle
4038
+ 23: "ETB", // End of Transmission Block
4039
+
4040
+ 24: "CAN", // Cancel
4041
+ 25: "EM", // End of Medium
4042
+ 26: "SUB", // Substitute
4043
+ 27: "ESC", // Escape
4044
+ 28: "FS", // File Separator
4045
+ 29: "GS", // Group Separator
4046
+ 30: "RS", // Record Separator
4047
+ 31: "US", // Unit Separator
4048
+
4049
+ // --- DEL ---
4050
+ 127: "DEL", // Delete
4051
+
4052
+ // --- C1 control characters (ISO/IEC 6429, 0x80–0x9F) ---
4053
+ 128: "PAD", // Padding Character
4054
+ 129: "HOP", // High Octet Preset
4055
+ 130: "BPH", // Break Permitted Here
4056
+ 131: "NBH", // No Break Here
4057
+ 132: "IND", // Index
4058
+ 133: "NEL", // Next Line
4059
+ 134: "SSA", // Start of Selected Area
4060
+ 135: "ESA", // End of Selected Area
4061
+ 136: "HTS", // Horizontal Tab Set
4062
+ 137: "HTJ", // Horizontal Tab with Justification
4063
+ 138: "VTS", // Vertical Tab Set
4064
+ 139: "PLD", // Partial Line Down
4065
+ 140: "PLU", // Partial Line Up
4066
+ 141: "RI", // Reverse Index
4067
+ 142: "SS2", // Single Shift 2
4068
+ 143: "SS3", // Single Shift 3
4069
+ 144: "DCS", // Device Control String
4070
+ 145: "PU1", // Private Use 1
4071
+ 146: "PU2", // Private Use 2
4072
+ 147: "STS", // Set Transmit State
4073
+ 148: "CCH", // Cancel Character
4074
+ 149: "MW", // Message Waiting
4075
+ 150: "SPA", // Start of Protected Area
4076
+ 151: "EPA", // End of Protected Area
4077
+ 152: "SOS", // Start of String
4078
+ 153: "SGCI",// Single Graphic Character Introducer
4079
+ 154: "SCI", // Single Character Introducer
4080
+ 155: "CSI", // Control Sequence Introducer
4081
+ 156: "ST", // String Terminator
4082
+ 157: "OSC", // Operating System Command
4083
+ 158: "PM", // Privacy Message
4084
+ 159: "APC", // Application Program Command
4085
+
4086
+ // --- 制御文字(Cc)ではないが、制御的に扱われる文字 ---
4087
+ 160: "NBSP", // No-Break Space(表示は空白だが改行不可)
4088
+ 173: "SHY", // Soft Hyphen(通常は表示されない)
4089
+
4090
+ // --- Unicode Interlinear Annotation ---
4091
+ 65529: "IAA", // Interlinear Annotation Anchor
4092
+ 65530: "IAS", // Interlinear Annotation Separator
4093
+ 65531: "IAT", // Interlinear Annotation Terminator
4094
+
4095
+ // Zero Width / Joiner 系(Cf)
4096
+ 0x200B: "ZWSP", // ZERO WIDTH SPACE
4097
+ 0x200C: "ZWNJ", // ZERO WIDTH NON-JOINER
4098
+ 0x200D: "ZWJ", // ZERO WIDTH JOINER
4099
+ 0x2060: "WJ", // WORD JOINER
4100
+ 0xFEFF: "BOM", // BYTE ORDER MARK / ZERO WIDTH NO-BREAK SPACE
4101
+
4102
+ // 双方向(BiDi)制御文字
4103
+ 0x202A: "LRE", // LEFT-TO-RIGHT EMBEDDING
4104
+ 0x202B: "RLE", // RIGHT-TO-LEFT EMBEDDING
4105
+ 0x202C: "PDF", // POP DIRECTIONAL FORMATTING
4106
+ 0x202D: "LRO", // LEFT-TO-RIGHT OVERRIDE
4107
+ 0x202E: "RLO", // RIGHT-TO-LEFT OVERRIDE
4108
+
4109
+ 0x2066: "LRI", // LEFT-TO-RIGHT ISOLATE
4110
+ 0x2067: "RLI", // RIGHT-TO-LEFT ISOLATE
4111
+ 0x2068: "FSI", // FIRST STRONG ISOLATE
4112
+ 0x2069: "PDI" , // POP DIRECTIONAL ISOLATE
4113
+
4114
+ // Unicode Noncharacter(検証・防御用途)
4115
+ 0xFFFE: "NONCHAR_FFFE",
4116
+ 0xFFFF: "NONCHAR_FFFF"
4009
4117
  };
4010
4118
 
4011
4119
  const unicode_blockname_array = [
@@ -4048,8 +4156,8 @@ class MOJI_CHAR_MAP {
4048
4156
  "Cyrillic Extended-D", "Nyiakeng Puachue Hmong", "Toto", "Wancho", "Nag Mundari", "Ethiopic Extended-B", "Mende Kikakui", "Adlam",
4049
4157
  "Indic Siyaq Numbers", "Ottoman Siyaq Numbers", "Arabic Mathematical Alphabetic Symbols", "Mahjong Tiles", "Domino Tiles", "Playing Cards", "Enclosed Alphanumeric Supplement", "Enclosed Ideographic Supplement",
4050
4158
  "Miscellaneous Symbols and Pictographs", "Emoticons", "Ornamental Dingbats", "Transport and Map Symbols", "Alchemical Symbols", "Geometric Shapes Extended", "Supplemental Arrows-C", "Supplemental Symbols and Pictographs",
4051
- "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F",
4052
- "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4159
+ "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F", "CJK Unified Ideographs Extension I",
4160
+ "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "CJK Unified Ideographs Extension J", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4053
4161
  ];
4054
4162
 
4055
4163
  const unicode_blockaddress_array = [
@@ -4072,8 +4180,8 @@ class MOJI_CHAR_MAP {
4072
4180
  0x1467F, 0x16A3F, 0x16A6F, 0x16ACF, 0x16AFF, 0x16B8F, 0x16E9F, 0x16F9F, 0x16FFF, 0x187FF, 0x18AFF, 0x18CFF, 0x18D7F, 0x1AFFF, 0x1B0FF, 0x1B12F,
4073
4181
  0x1B16F, 0x1B2FF, 0x1BC9F, 0x1BCAF, 0x1CFCF, 0x1D0FF, 0x1D1FF, 0x1D24F, 0x1D2DF, 0x1D2FF, 0x1D35F, 0x1D37F, 0x1D7FF, 0x1DAAF, 0x1DFFF, 0x1E02F,
4074
4182
  0x1E08F, 0x1E14F, 0x1E2BF, 0x1E2FF, 0x1E4FF, 0x1E7FF, 0x1E8DF, 0x1E95F, 0x1ECBF, 0x1ED4F, 0x1EEFF, 0x1F02F, 0x1F09F, 0x1F0FF, 0x1F1FF, 0x1F2FF,
4075
- 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF,
4076
- 0x2FA1F, 0x3134F, 0x323AF, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4183
+ 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF, 0x2EE5F,
4184
+ 0x2FA1F, 0x3134F, 0x323AF, 0x3347F, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4077
4185
  ];
4078
4186
 
4079
4187
  to_block_name_from_unicode = function(unicode_codepoint) {
@@ -4177,20 +4285,25 @@ class MojiAnalizerTools {
4177
4285
  /**
4178
4286
  * コードポイントから異体字セレクタの判定
4179
4287
  * @param {Number} codepoint - コードポイント
4288
+ * @param {boolean} [annotate = false] - 注釈をつけるか否か
4180
4289
  * @returns {string|null} 確認結果(異体字セレクタではない場合はNULLを返す)
4181
4290
  */
4182
- static getVariationSelectorsNumberFromCodePoint(codepoint) {
4291
+ static getVariationSelectorsNumberFromCodePoint(codepoint, annotate) {
4183
4292
  // モンゴル自由字形選択子 U+180B〜U+180D (3個)
4184
4293
  if((0x180B <= codepoint) && (codepoint <= 0x180D)) {
4185
4294
  return "FVS" + ((codepoint - 0x180B) + 1);
4186
4295
  }
4187
4296
  // SVSで利用される異体字セレクタ U+FE00〜U+FE0F (VS1~VS16) (16個)
4188
4297
  if((0xFE00 <= codepoint) && (codepoint <= 0xFE0F)) {
4189
- return "VS" + (codepoint - 0xFE00) + 1;
4298
+ const n = (codepoint - 0xFE00) + 1;
4299
+ if (!annotate) return "VS" + n;
4300
+ if (codepoint === 0xFE0E) return "VS15 (text)";
4301
+ if (codepoint === 0xFE0F) return "VS16 (emoji)";
4302
+ return "VS" + n;
4190
4303
  }
4191
4304
  // IVSで利用される異体字セレクタ U+E0100〜U+E01EF (VS17~VS256) (240個)
4192
4305
  else if((0xE0100 <= codepoint) && (codepoint <= 0xE01EF)) {
4193
- return "VS" + (codepoint - 0xE0100) + 17;
4306
+ return "VS" + ((codepoint - 0xE0100) + 17);
4194
4307
  }
4195
4308
  return null;
4196
4309
  }
@@ -4342,6 +4455,7 @@ class MojiAnalizerTools {
4342
4455
  * @property {boolean} is_halfwidth_katakana 半角カタカナ
4343
4456
  * @property {boolean} is_emoji 絵文字
4344
4457
  * @property {boolean} is_emoticons 顔文字
4458
+ * @property {boolean} is_symbol_base 記号(VS16 が付くと絵文字化)
4345
4459
  * @property {boolean} is_gaiji 外字
4346
4460
  * @property {boolean} is_combining_mark 結合文字
4347
4461
  * @property {boolean} is_variation_selector 異体字セレクタ
@@ -4412,6 +4526,7 @@ class MojiAnalyzer {
4412
4526
  is_halfwidth_katakana : false,
4413
4527
  is_emoji : false,
4414
4528
  is_emoticons : false,
4529
+ is_symbol_base : false,
4415
4530
  is_gaiji : false,
4416
4531
  is_combining_mark : false,
4417
4532
  is_variation_selector : false
@@ -4536,9 +4651,11 @@ class MojiAnalyzer {
4536
4651
  type.is_fullwidth_ascii = /[\u3000\uFF01-\uFF5E]/.test(data.character);
4537
4652
  type.is_halfwidth_katakana = /[\uFF61-\uFF9F]/.test(data.character);
4538
4653
  // 絵文字
4539
- type.is_emoji = /Pictographs/.test(type.blockname);
4654
+ type.is_emoji = /Pictographs|Transport and Map Symbols/.test(type.blockname);
4540
4655
  // 顔文字
4541
4656
  type.is_emoticons = /Emoticons/.test(type.blockname);
4657
+ // 記号(VS16 が付くと絵文字化)
4658
+ type.is_symbol_base = /Dingbats|Miscellaneous Symbols/.test(type.blockname);
4542
4659
  // 外字
4543
4660
  type.is_gaiji = /Private Use Area/.test(type.blockname);
4544
4661
  // 結合文字
@@ -5270,4 +5387,4 @@ class MojiJS {
5270
5387
 
5271
5388
  }
5272
5389
 
5273
- module.exports = MojiJS;
5390
+ export default MojiJS;
@@ -591,20 +591,32 @@ class Unicode {
591
591
  * @returns {boolean} 確認結果
592
592
  */
593
593
  static isCombiningMarkFromCodePoint(codepoint) {
594
- return (
595
- // Combining Diacritical Marks
596
- ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
597
- // Combining Diacritical Marks Extended
598
- ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
599
- // Combining Diacritical Marks Supplement
600
- ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
601
- // Combining Diacritical Marks for Symbols
602
- ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
603
- // Hiragana 含まれる4種類の文字
604
- ((0x3099 <= codepoint) && (codepoint <= 0x309C)) ||
605
- // Combining Half Marks
606
- ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
607
- );
594
+ // 異体字セレクタは除外
595
+ if (Unicode.isVariationSelectorFromCodePoint(codepoint)) {
596
+ return false;
597
+ }
598
+ try {
599
+ new RegExp("\\p{Mark}", "u");
600
+ return /\p{Mark}/u.test(String.fromCodePoint(codepoint));
601
+ } catch (e) {
602
+ // フォールバック処理
603
+ return (
604
+ // Combining Diacritical Marks
605
+ ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
606
+ // Combining Diacritical Marks Extended
607
+ ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
608
+ // Combining Diacritical Marks Supplement
609
+ ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
610
+ // Combining Diacritical Marks for Symbols
611
+ ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
612
+ // 日本語に含まれる2種類の文字
613
+ // COMBINING VOICED SOUND MARK
614
+ // COMBINING SEMI-VOICED SOUND MARK
615
+ ((0x3099 <= codepoint) && (codepoint <= 0x309A)) ||
616
+ // Combining Half Marks
617
+ ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
618
+ );
619
+ }
608
620
  }
609
621
 
610
622
 
@@ -3997,15 +4009,111 @@ class MOJI_CHAR_MAP {
3997
4009
  // 制御文字、VSは多いため含めていない
3998
4010
 
3999
4011
  control_charcter_map = {
4000
- 0: "NUL", 1: "SOH", 2: "STX", 3: "ETX", 4: "EOT", 5: "ENQ", 6: "ACK", 7: "BEL",
4001
- 8: "BS", 9: "HT", 10: "LF", 11: "VT", 12: "FF", 13: "CR", 14: "SO", 15: "SI",
4002
- 16: "DLE", 17: "DC1", 18: "DC2", 19: "DC3", 20: "DC4", 21: "NAK", 22: "SYN", 23: "ETB",
4003
- 24: "CAN", 25: "EM", 26: "SUB", 27: "ESC", 28: "FS", 29: "GS", 30: "RS", 31: "US",
4004
- 127: "DEL", 128: "PAD", 129: "HOP", 130: "BPH", 131: "NBH", 132: "IND", 133: "NEL", 134: "SSA",
4005
- 135: "ESA", 136: "HTS", 137: "HTJ", 138: "VTS", 139: "PLD", 140: "PLU", 141: "RI", 142: "SS2",
4006
- 143: "SS3", 144: "DCS", 145: "PU1", 146: "PU2", 147: "STS", 148: "CCH", 149: "MW", 150: "SPA",
4007
- 151: "EPA", 152: "SOS", 153: "SGCI", 154: "SCI", 155: "CSI", 156: "ST", 157: "OSC", 158: "PM",
4008
- 159: "APC", 160: "NBSP", 173: "SHY", 65529: "IAA", 65530: "IAS", 65531: "IAT"
4012
+ // --- C0 control characters (ASCII 0x00–0x1F) ---
4013
+ 0: "NUL", // Null
4014
+ 1: "SOH", // Start of Heading
4015
+ 2: "STX", // Start of Text
4016
+ 3: "ETX", // End of Text
4017
+ 4: "EOT", // End of Transmission
4018
+ 5: "ENQ", // Enquiry
4019
+ 6: "ACK", // Acknowledge
4020
+ 7: "BEL", // Bell (beep)
4021
+
4022
+ 8: "BS", // Backspace
4023
+ 9: "HT", // Horizontal Tab
4024
+ 10: "LF", // Line Feed
4025
+ 11: "VT", // Vertical Tab
4026
+ 12: "FF", // Form Feed
4027
+ 13: "CR", // Carriage Return
4028
+ 14: "SO", // Shift Out
4029
+ 15: "SI", // Shift In
4030
+
4031
+ 16: "DLE", // Data Link Escape
4032
+ 17: "DC1", // Device Control 1 (XON)
4033
+ 18: "DC2", // Device Control 2
4034
+ 19: "DC3", // Device Control 3 (XOFF)
4035
+ 20: "DC4", // Device Control 4
4036
+ 21: "NAK", // Negative Acknowledge
4037
+ 22: "SYN", // Synchronous Idle
4038
+ 23: "ETB", // End of Transmission Block
4039
+
4040
+ 24: "CAN", // Cancel
4041
+ 25: "EM", // End of Medium
4042
+ 26: "SUB", // Substitute
4043
+ 27: "ESC", // Escape
4044
+ 28: "FS", // File Separator
4045
+ 29: "GS", // Group Separator
4046
+ 30: "RS", // Record Separator
4047
+ 31: "US", // Unit Separator
4048
+
4049
+ // --- DEL ---
4050
+ 127: "DEL", // Delete
4051
+
4052
+ // --- C1 control characters (ISO/IEC 6429, 0x80–0x9F) ---
4053
+ 128: "PAD", // Padding Character
4054
+ 129: "HOP", // High Octet Preset
4055
+ 130: "BPH", // Break Permitted Here
4056
+ 131: "NBH", // No Break Here
4057
+ 132: "IND", // Index
4058
+ 133: "NEL", // Next Line
4059
+ 134: "SSA", // Start of Selected Area
4060
+ 135: "ESA", // End of Selected Area
4061
+ 136: "HTS", // Horizontal Tab Set
4062
+ 137: "HTJ", // Horizontal Tab with Justification
4063
+ 138: "VTS", // Vertical Tab Set
4064
+ 139: "PLD", // Partial Line Down
4065
+ 140: "PLU", // Partial Line Up
4066
+ 141: "RI", // Reverse Index
4067
+ 142: "SS2", // Single Shift 2
4068
+ 143: "SS3", // Single Shift 3
4069
+ 144: "DCS", // Device Control String
4070
+ 145: "PU1", // Private Use 1
4071
+ 146: "PU2", // Private Use 2
4072
+ 147: "STS", // Set Transmit State
4073
+ 148: "CCH", // Cancel Character
4074
+ 149: "MW", // Message Waiting
4075
+ 150: "SPA", // Start of Protected Area
4076
+ 151: "EPA", // End of Protected Area
4077
+ 152: "SOS", // Start of String
4078
+ 153: "SGCI",// Single Graphic Character Introducer
4079
+ 154: "SCI", // Single Character Introducer
4080
+ 155: "CSI", // Control Sequence Introducer
4081
+ 156: "ST", // String Terminator
4082
+ 157: "OSC", // Operating System Command
4083
+ 158: "PM", // Privacy Message
4084
+ 159: "APC", // Application Program Command
4085
+
4086
+ // --- 制御文字(Cc)ではないが、制御的に扱われる文字 ---
4087
+ 160: "NBSP", // No-Break Space(表示は空白だが改行不可)
4088
+ 173: "SHY", // Soft Hyphen(通常は表示されない)
4089
+
4090
+ // --- Unicode Interlinear Annotation ---
4091
+ 65529: "IAA", // Interlinear Annotation Anchor
4092
+ 65530: "IAS", // Interlinear Annotation Separator
4093
+ 65531: "IAT", // Interlinear Annotation Terminator
4094
+
4095
+ // Zero Width / Joiner 系(Cf)
4096
+ 0x200B: "ZWSP", // ZERO WIDTH SPACE
4097
+ 0x200C: "ZWNJ", // ZERO WIDTH NON-JOINER
4098
+ 0x200D: "ZWJ", // ZERO WIDTH JOINER
4099
+ 0x2060: "WJ", // WORD JOINER
4100
+ 0xFEFF: "BOM", // BYTE ORDER MARK / ZERO WIDTH NO-BREAK SPACE
4101
+
4102
+ // 双方向(BiDi)制御文字
4103
+ 0x202A: "LRE", // LEFT-TO-RIGHT EMBEDDING
4104
+ 0x202B: "RLE", // RIGHT-TO-LEFT EMBEDDING
4105
+ 0x202C: "PDF", // POP DIRECTIONAL FORMATTING
4106
+ 0x202D: "LRO", // LEFT-TO-RIGHT OVERRIDE
4107
+ 0x202E: "RLO", // RIGHT-TO-LEFT OVERRIDE
4108
+
4109
+ 0x2066: "LRI", // LEFT-TO-RIGHT ISOLATE
4110
+ 0x2067: "RLI", // RIGHT-TO-LEFT ISOLATE
4111
+ 0x2068: "FSI", // FIRST STRONG ISOLATE
4112
+ 0x2069: "PDI" , // POP DIRECTIONAL ISOLATE
4113
+
4114
+ // Unicode Noncharacter(検証・防御用途)
4115
+ 0xFFFE: "NONCHAR_FFFE",
4116
+ 0xFFFF: "NONCHAR_FFFF"
4009
4117
  };
4010
4118
 
4011
4119
  const unicode_blockname_array = [
@@ -4048,8 +4156,8 @@ class MOJI_CHAR_MAP {
4048
4156
  "Cyrillic Extended-D", "Nyiakeng Puachue Hmong", "Toto", "Wancho", "Nag Mundari", "Ethiopic Extended-B", "Mende Kikakui", "Adlam",
4049
4157
  "Indic Siyaq Numbers", "Ottoman Siyaq Numbers", "Arabic Mathematical Alphabetic Symbols", "Mahjong Tiles", "Domino Tiles", "Playing Cards", "Enclosed Alphanumeric Supplement", "Enclosed Ideographic Supplement",
4050
4158
  "Miscellaneous Symbols and Pictographs", "Emoticons", "Ornamental Dingbats", "Transport and Map Symbols", "Alchemical Symbols", "Geometric Shapes Extended", "Supplemental Arrows-C", "Supplemental Symbols and Pictographs",
4051
- "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F",
4052
- "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4159
+ "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F", "CJK Unified Ideographs Extension I",
4160
+ "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "CJK Unified Ideographs Extension J", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4053
4161
  ];
4054
4162
 
4055
4163
  const unicode_blockaddress_array = [
@@ -4072,8 +4180,8 @@ class MOJI_CHAR_MAP {
4072
4180
  0x1467F, 0x16A3F, 0x16A6F, 0x16ACF, 0x16AFF, 0x16B8F, 0x16E9F, 0x16F9F, 0x16FFF, 0x187FF, 0x18AFF, 0x18CFF, 0x18D7F, 0x1AFFF, 0x1B0FF, 0x1B12F,
4073
4181
  0x1B16F, 0x1B2FF, 0x1BC9F, 0x1BCAF, 0x1CFCF, 0x1D0FF, 0x1D1FF, 0x1D24F, 0x1D2DF, 0x1D2FF, 0x1D35F, 0x1D37F, 0x1D7FF, 0x1DAAF, 0x1DFFF, 0x1E02F,
4074
4182
  0x1E08F, 0x1E14F, 0x1E2BF, 0x1E2FF, 0x1E4FF, 0x1E7FF, 0x1E8DF, 0x1E95F, 0x1ECBF, 0x1ED4F, 0x1EEFF, 0x1F02F, 0x1F09F, 0x1F0FF, 0x1F1FF, 0x1F2FF,
4075
- 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF,
4076
- 0x2FA1F, 0x3134F, 0x323AF, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4183
+ 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF, 0x2EE5F,
4184
+ 0x2FA1F, 0x3134F, 0x323AF, 0x3347F, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4077
4185
  ];
4078
4186
 
4079
4187
  to_block_name_from_unicode = function(unicode_codepoint) {
@@ -4177,20 +4285,25 @@ class MojiAnalizerTools {
4177
4285
  /**
4178
4286
  * コードポイントから異体字セレクタの判定
4179
4287
  * @param {Number} codepoint - コードポイント
4288
+ * @param {boolean} [annotate = false] - 注釈をつけるか否か
4180
4289
  * @returns {string|null} 確認結果(異体字セレクタではない場合はNULLを返す)
4181
4290
  */
4182
- static getVariationSelectorsNumberFromCodePoint(codepoint) {
4291
+ static getVariationSelectorsNumberFromCodePoint(codepoint, annotate) {
4183
4292
  // モンゴル自由字形選択子 U+180B〜U+180D (3個)
4184
4293
  if((0x180B <= codepoint) && (codepoint <= 0x180D)) {
4185
4294
  return "FVS" + ((codepoint - 0x180B) + 1);
4186
4295
  }
4187
4296
  // SVSで利用される異体字セレクタ U+FE00〜U+FE0F (VS1~VS16) (16個)
4188
4297
  if((0xFE00 <= codepoint) && (codepoint <= 0xFE0F)) {
4189
- return "VS" + (codepoint - 0xFE00) + 1;
4298
+ const n = (codepoint - 0xFE00) + 1;
4299
+ if (!annotate) return "VS" + n;
4300
+ if (codepoint === 0xFE0E) return "VS15 (text)";
4301
+ if (codepoint === 0xFE0F) return "VS16 (emoji)";
4302
+ return "VS" + n;
4190
4303
  }
4191
4304
  // IVSで利用される異体字セレクタ U+E0100〜U+E01EF (VS17~VS256) (240個)
4192
4305
  else if((0xE0100 <= codepoint) && (codepoint <= 0xE01EF)) {
4193
- return "VS" + (codepoint - 0xE0100) + 17;
4306
+ return "VS" + ((codepoint - 0xE0100) + 17);
4194
4307
  }
4195
4308
  return null;
4196
4309
  }
@@ -4342,6 +4455,7 @@ class MojiAnalizerTools {
4342
4455
  * @property {boolean} is_halfwidth_katakana 半角カタカナ
4343
4456
  * @property {boolean} is_emoji 絵文字
4344
4457
  * @property {boolean} is_emoticons 顔文字
4458
+ * @property {boolean} is_symbol_base 記号(VS16 が付くと絵文字化)
4345
4459
  * @property {boolean} is_gaiji 外字
4346
4460
  * @property {boolean} is_combining_mark 結合文字
4347
4461
  * @property {boolean} is_variation_selector 異体字セレクタ
@@ -4412,6 +4526,7 @@ class MojiAnalyzer {
4412
4526
  is_halfwidth_katakana : false,
4413
4527
  is_emoji : false,
4414
4528
  is_emoticons : false,
4529
+ is_symbol_base : false,
4415
4530
  is_gaiji : false,
4416
4531
  is_combining_mark : false,
4417
4532
  is_variation_selector : false
@@ -4536,9 +4651,11 @@ class MojiAnalyzer {
4536
4651
  type.is_fullwidth_ascii = /[\u3000\uFF01-\uFF5E]/.test(data.character);
4537
4652
  type.is_halfwidth_katakana = /[\uFF61-\uFF9F]/.test(data.character);
4538
4653
  // 絵文字
4539
- type.is_emoji = /Pictographs/.test(type.blockname);
4654
+ type.is_emoji = /Pictographs|Transport and Map Symbols/.test(type.blockname);
4540
4655
  // 顔文字
4541
4656
  type.is_emoticons = /Emoticons/.test(type.blockname);
4657
+ // 記号(VS16 が付くと絵文字化)
4658
+ type.is_symbol_base = /Dingbats|Miscellaneous Symbols/.test(type.blockname);
4542
4659
  // 外字
4543
4660
  type.is_gaiji = /Private Use Area/.test(type.blockname);
4544
4661
  // 結合文字
package/build/index.d.ts CHANGED
@@ -346,6 +346,7 @@ declare type _MojiEncodeData_ = {
346
346
  * @property {boolean} is_halfwidth_katakana 半角カタカナ
347
347
  * @property {boolean} is_emoji 絵文字
348
348
  * @property {boolean} is_emoticons 顔文字
349
+ * @property {boolean} is_symbol_base 記号(VS16 が付くと絵文字化)
349
350
  * @property {boolean} is_gaiji 外字
350
351
  * @property {boolean} is_combining_mark 結合文字
351
352
  * @property {boolean} is_variation_selector 異体字セレクタ
@@ -371,6 +372,7 @@ declare type _MojiTypeData_ = {
371
372
  is_halfwidth_katakana: boolean;
372
373
  is_emoji: boolean;
373
374
  is_emoticons: boolean;
375
+ is_symbol_base: boolean;
374
376
  is_gaiji: boolean;
375
377
  is_combining_mark: boolean;
376
378
  is_variation_selector: boolean;