mojijs 4.0.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/HISTORY.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # History
2
2
 
3
+ ## v5.0.0
4
+
5
+ ### 機能改善
6
+
7
+ - 異体字セレクタの判定に、注釈機能を追加
8
+ - 絵文字の判定を強化
9
+ - 記号の判定を追加
10
+ - Unicodeの制御文字を追加
11
+ - CJK Unified Ideographs Extension I (2EBF0–2EE5F)
12
+ - CJK Unified Ideographs Extension J (323B0–3347F)
13
+
14
+ ### 変更
15
+
16
+ - travisが動作しないので除去
17
+
18
+ ### 不具合修正
19
+
20
+ - 結合していない文字も結合文字と判定する場合があるのを修正
21
+ - getVariationSelectorsNumberFromCodePoint の戻り値が意図しない文字列になる問題を修正
22
+
3
23
  ## v4.0.0
4
24
 
5
25
  ### 機能改善
package/README.md CHANGED
@@ -1,28 +1,32 @@
1
- # MojiJS #
2
- [![Build Status](https://travis-ci.org/natade-jp/MojiJS.svg?branch=master)](https://travis-ci.org/natade-jp/MojiJS)
1
+ # MojiJS
2
+
3
3
  [![ESDoc coverage badge](https://natade-jp.github.io/MojiJS/docs/badge.svg)](https://natade-jp.github.io/MojiJS/docs/)
4
4
  ![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)
5
5
 
6
- ## What ##
7
- - 日本語の文字データを解析及び、変換するライブラリです。
8
- - [詳細なAPIを公開しています。](https://natade-jp.github.io/MojiJS/docs/)
9
- - [動作例](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/) (コンソール及び[ソースコード](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/main.mjs)を確認してみてください。)
10
- - [npm](https://www.npmjs.com/package/mojijs)
6
+ ## What
7
+
8
+ - 日本語の文字データを解析及び、変換するライブラリです。
9
+ - [詳細な API を公開しています。](https://natade-jp.github.io/MojiJS/docs/)
10
+ - [動作例](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/) (コンソール及び[ソースコード](https://natade-jp.github.io/MojiJS/html/examples/demos/Text/main.mjs)を確認してみてください。)
11
+ - [npm](https://www.npmjs.com/package/mojijs)
11
12
 
12
13
  以下のことが行えます
13
- - エンコード(UTF-8 / UTF-16 / UTF-32 / Shift_JIS / Shift_JIS-2004 / EUC-JP / EUC-JIS-2004 )
14
- - 日本語の変換 (ひらがな, カタカナ, 半角, 全角, ローマ字 など))
15
- - 漢字の判定 (常用漢字, 人名用漢字, 面区点, 漢字水準 など)
16
- - 自然順ソート
17
14
 
18
- ## Install ##
15
+ - エンコード (UTF-8 / UTF-16 / UTF-32 / Shift_JIS / Shift_JIS-2004 / EUC-JP / EUC-JIS-2004)
16
+ - 日本語の変換 (ひらがな, カタカナ, 半角, 全角, ローマ字 など)
17
+ - 漢字の判定 (常用漢字, 人名用漢字, 面区点, 漢字水準 など)
18
+ - 自然順ソート
19
+
20
+ ## Install
21
+
19
22
  ```
20
23
  npm install --save-dev mojijs
21
24
  ```
22
25
 
23
- ## Sample ##
26
+ ## Sample
24
27
 
25
28
  ### エンコード
29
+
26
30
  ```javascript
27
31
  const MojiJS = require("mojijs");
28
32
 
@@ -34,6 +38,7 @@ console.log(MojiJS.decode([0x61, 0xE3, 0x81, 0x82], "utf-8"));
34
38
  ```
35
39
 
36
40
  ### 日本語の変換
41
+
37
42
  ```javascript
38
43
  const MojiJS = require("mojijs");
39
44
 
@@ -42,6 +47,7 @@ console.log(MojiJS.toHiragana("カキクケコ"));
42
47
  ```
43
48
 
44
49
  ### 面区点
50
+
45
51
  ```javascript
46
52
  const MojiJS = require("mojijs");
47
53
 
@@ -61,6 +67,7 @@ console.log("面区点:" + data3.encode.menkuten.text + ", 漢字水準:" +
61
67
  ```
62
68
 
63
69
  ### 自然順ソート
70
+
64
71
  ```javascript
65
72
  const MojiJS = require("mojijs");
66
73
 
@@ -68,5 +75,6 @@ console.log(["3", "02", "あ", "イ", "う", "1"].sort(MojiJS.compareToForNatu
68
75
  -> [ '1', '02', '3', 'あ', 'イ', 'う' ]
69
76
  ```
70
77
 
71
- ## Author ##
72
- - [natade-jp](https://github.com/natade-jp/)
78
+ ## Author
79
+
80
+ - [natade-jp](https://github.com/natade-jp/)
@@ -591,20 +591,28 @@ class Unicode {
591
591
  * @returns {boolean} 確認結果
592
592
  */
593
593
  static isCombiningMarkFromCodePoint(codepoint) {
594
- return (
595
- // Combining Diacritical Marks
596
- ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
597
- // Combining Diacritical Marks Extended
598
- ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
599
- // Combining Diacritical Marks Supplement
600
- ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
601
- // Combining Diacritical Marks for Symbols
602
- ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
603
- // Hiragana 含まれる4種類の文字
604
- ((0x3099 <= codepoint) && (codepoint <= 0x309C)) ||
605
- // Combining Half Marks
606
- ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
607
- );
594
+ try {
595
+ new RegExp("\\p{Mark}", "u");
596
+ return /\p{Mark}/u.test(String.fromCodePoint(codepoint));
597
+ } catch (e) {
598
+ // フォールバック処理
599
+ return (
600
+ // Combining Diacritical Marks
601
+ ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
602
+ // Combining Diacritical Marks Extended
603
+ ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
604
+ // Combining Diacritical Marks Supplement
605
+ ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
606
+ // Combining Diacritical Marks for Symbols
607
+ ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
608
+ // 日本語に含まれる2種類の文字
609
+ // COMBINING VOICED SOUND MARK
610
+ // COMBINING SEMI-VOICED SOUND MARK
611
+ ((0x3099 <= codepoint) && (codepoint <= 0x309A)) ||
612
+ // Combining Half Marks
613
+ ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
614
+ );
615
+ }
608
616
  }
609
617
 
610
618
 
@@ -3997,15 +4005,111 @@ class MOJI_CHAR_MAP {
3997
4005
  // 制御文字、VSは多いため含めていない
3998
4006
 
3999
4007
  control_charcter_map = {
4000
- 0: "NUL", 1: "SOH", 2: "STX", 3: "ETX", 4: "EOT", 5: "ENQ", 6: "ACK", 7: "BEL",
4001
- 8: "BS", 9: "HT", 10: "LF", 11: "VT", 12: "FF", 13: "CR", 14: "SO", 15: "SI",
4002
- 16: "DLE", 17: "DC1", 18: "DC2", 19: "DC3", 20: "DC4", 21: "NAK", 22: "SYN", 23: "ETB",
4003
- 24: "CAN", 25: "EM", 26: "SUB", 27: "ESC", 28: "FS", 29: "GS", 30: "RS", 31: "US",
4004
- 127: "DEL", 128: "PAD", 129: "HOP", 130: "BPH", 131: "NBH", 132: "IND", 133: "NEL", 134: "SSA",
4005
- 135: "ESA", 136: "HTS", 137: "HTJ", 138: "VTS", 139: "PLD", 140: "PLU", 141: "RI", 142: "SS2",
4006
- 143: "SS3", 144: "DCS", 145: "PU1", 146: "PU2", 147: "STS", 148: "CCH", 149: "MW", 150: "SPA",
4007
- 151: "EPA", 152: "SOS", 153: "SGCI", 154: "SCI", 155: "CSI", 156: "ST", 157: "OSC", 158: "PM",
4008
- 159: "APC", 160: "NBSP", 173: "SHY", 65529: "IAA", 65530: "IAS", 65531: "IAT"
4008
+ // --- C0 control characters (ASCII 0x00–0x1F) ---
4009
+ 0: "NUL", // Null
4010
+ 1: "SOH", // Start of Heading
4011
+ 2: "STX", // Start of Text
4012
+ 3: "ETX", // End of Text
4013
+ 4: "EOT", // End of Transmission
4014
+ 5: "ENQ", // Enquiry
4015
+ 6: "ACK", // Acknowledge
4016
+ 7: "BEL", // Bell (beep)
4017
+
4018
+ 8: "BS", // Backspace
4019
+ 9: "HT", // Horizontal Tab
4020
+ 10: "LF", // Line Feed
4021
+ 11: "VT", // Vertical Tab
4022
+ 12: "FF", // Form Feed
4023
+ 13: "CR", // Carriage Return
4024
+ 14: "SO", // Shift Out
4025
+ 15: "SI", // Shift In
4026
+
4027
+ 16: "DLE", // Data Link Escape
4028
+ 17: "DC1", // Device Control 1 (XON)
4029
+ 18: "DC2", // Device Control 2
4030
+ 19: "DC3", // Device Control 3 (XOFF)
4031
+ 20: "DC4", // Device Control 4
4032
+ 21: "NAK", // Negative Acknowledge
4033
+ 22: "SYN", // Synchronous Idle
4034
+ 23: "ETB", // End of Transmission Block
4035
+
4036
+ 24: "CAN", // Cancel
4037
+ 25: "EM", // End of Medium
4038
+ 26: "SUB", // Substitute
4039
+ 27: "ESC", // Escape
4040
+ 28: "FS", // File Separator
4041
+ 29: "GS", // Group Separator
4042
+ 30: "RS", // Record Separator
4043
+ 31: "US", // Unit Separator
4044
+
4045
+ // --- DEL ---
4046
+ 127: "DEL", // Delete
4047
+
4048
+ // --- C1 control characters (ISO/IEC 6429, 0x80–0x9F) ---
4049
+ 128: "PAD", // Padding Character
4050
+ 129: "HOP", // High Octet Preset
4051
+ 130: "BPH", // Break Permitted Here
4052
+ 131: "NBH", // No Break Here
4053
+ 132: "IND", // Index
4054
+ 133: "NEL", // Next Line
4055
+ 134: "SSA", // Start of Selected Area
4056
+ 135: "ESA", // End of Selected Area
4057
+ 136: "HTS", // Horizontal Tab Set
4058
+ 137: "HTJ", // Horizontal Tab with Justification
4059
+ 138: "VTS", // Vertical Tab Set
4060
+ 139: "PLD", // Partial Line Down
4061
+ 140: "PLU", // Partial Line Up
4062
+ 141: "RI", // Reverse Index
4063
+ 142: "SS2", // Single Shift 2
4064
+ 143: "SS3", // Single Shift 3
4065
+ 144: "DCS", // Device Control String
4066
+ 145: "PU1", // Private Use 1
4067
+ 146: "PU2", // Private Use 2
4068
+ 147: "STS", // Set Transmit State
4069
+ 148: "CCH", // Cancel Character
4070
+ 149: "MW", // Message Waiting
4071
+ 150: "SPA", // Start of Protected Area
4072
+ 151: "EPA", // End of Protected Area
4073
+ 152: "SOS", // Start of String
4074
+ 153: "SGCI",// Single Graphic Character Introducer
4075
+ 154: "SCI", // Single Character Introducer
4076
+ 155: "CSI", // Control Sequence Introducer
4077
+ 156: "ST", // String Terminator
4078
+ 157: "OSC", // Operating System Command
4079
+ 158: "PM", // Privacy Message
4080
+ 159: "APC", // Application Program Command
4081
+
4082
+ // --- Unicode で制御的に扱われる文字 ---
4083
+ 160: "NBSP", // No-Break Space(表示は空白だが改行不可)
4084
+ 173: "SHY", // Soft Hyphen(通常は表示されない)
4085
+
4086
+ // --- Unicode Interlinear Annotation ---
4087
+ 65529: "IAA", // Interlinear Annotation Anchor
4088
+ 65530: "IAS", // Interlinear Annotation Separator
4089
+ 65531: "IAT", // Interlinear Annotation Terminator
4090
+
4091
+ // Zero Width / Joiner 系(Cf)
4092
+ 0x200B: "ZWSP", // ZERO WIDTH SPACE
4093
+ 0x200C: "ZWNJ", // ZERO WIDTH NON-JOINER
4094
+ 0x200D: "ZWJ", // ZERO WIDTH JOINER
4095
+ 0x2060: "WJ", // WORD JOINER
4096
+ 0xFEFF: "BOM", // BYTE ORDER MARK / ZERO WIDTH NO-BREAK SPACE
4097
+
4098
+ // 双方向(BiDi)制御文字
4099
+ 0x202A: "LRE", // LEFT-TO-RIGHT EMBEDDING
4100
+ 0x202B: "RLE", // RIGHT-TO-LEFT EMBEDDING
4101
+ 0x202C: "PDF", // POP DIRECTIONAL FORMATTING
4102
+ 0x202D: "LRO", // LEFT-TO-RIGHT OVERRIDE
4103
+ 0x202E: "RLO", // RIGHT-TO-LEFT OVERRIDE
4104
+
4105
+ 0x2066: "LRI", // LEFT-TO-RIGHT ISOLATE
4106
+ 0x2067: "RLI", // RIGHT-TO-LEFT ISOLATE
4107
+ 0x2068: "FSI", // FIRST STRONG ISOLATE
4108
+ 0x2069: "PDI" , // POP DIRECTIONAL ISOLATE
4109
+
4110
+ // Unicode Noncharacter(検証・防御用途)
4111
+ 0xFFFE: "NONCHAR_FFFE",
4112
+ 0xFFFF: "NONCHAR_FFFF"
4009
4113
  };
4010
4114
 
4011
4115
  const unicode_blockname_array = [
@@ -4048,8 +4152,8 @@ class MOJI_CHAR_MAP {
4048
4152
  "Cyrillic Extended-D", "Nyiakeng Puachue Hmong", "Toto", "Wancho", "Nag Mundari", "Ethiopic Extended-B", "Mende Kikakui", "Adlam",
4049
4153
  "Indic Siyaq Numbers", "Ottoman Siyaq Numbers", "Arabic Mathematical Alphabetic Symbols", "Mahjong Tiles", "Domino Tiles", "Playing Cards", "Enclosed Alphanumeric Supplement", "Enclosed Ideographic Supplement",
4050
4154
  "Miscellaneous Symbols and Pictographs", "Emoticons", "Ornamental Dingbats", "Transport and Map Symbols", "Alchemical Symbols", "Geometric Shapes Extended", "Supplemental Arrows-C", "Supplemental Symbols and Pictographs",
4051
- "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F",
4052
- "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4155
+ "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F", "CJK Unified Ideographs Extension I",
4156
+ "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "CJK Unified Ideographs Extension J", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4053
4157
  ];
4054
4158
 
4055
4159
  const unicode_blockaddress_array = [
@@ -4072,8 +4176,8 @@ class MOJI_CHAR_MAP {
4072
4176
  0x1467F, 0x16A3F, 0x16A6F, 0x16ACF, 0x16AFF, 0x16B8F, 0x16E9F, 0x16F9F, 0x16FFF, 0x187FF, 0x18AFF, 0x18CFF, 0x18D7F, 0x1AFFF, 0x1B0FF, 0x1B12F,
4073
4177
  0x1B16F, 0x1B2FF, 0x1BC9F, 0x1BCAF, 0x1CFCF, 0x1D0FF, 0x1D1FF, 0x1D24F, 0x1D2DF, 0x1D2FF, 0x1D35F, 0x1D37F, 0x1D7FF, 0x1DAAF, 0x1DFFF, 0x1E02F,
4074
4178
  0x1E08F, 0x1E14F, 0x1E2BF, 0x1E2FF, 0x1E4FF, 0x1E7FF, 0x1E8DF, 0x1E95F, 0x1ECBF, 0x1ED4F, 0x1EEFF, 0x1F02F, 0x1F09F, 0x1F0FF, 0x1F1FF, 0x1F2FF,
4075
- 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF,
4076
- 0x2FA1F, 0x3134F, 0x323AF, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4179
+ 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF, 0x2EE5F,
4180
+ 0x2FA1F, 0x3134F, 0x323AF, 0x3347F, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4077
4181
  ];
4078
4182
 
4079
4183
  to_block_name_from_unicode = function(unicode_codepoint) {
@@ -4177,20 +4281,25 @@ class MojiAnalizerTools {
4177
4281
  /**
4178
4282
  * コードポイントから異体字セレクタの判定
4179
4283
  * @param {Number} codepoint - コードポイント
4284
+ * @param {boolean} [annotate = false] - 注釈をつけるか否か
4180
4285
  * @returns {string|null} 確認結果(異体字セレクタではない場合はNULLを返す)
4181
4286
  */
4182
- static getVariationSelectorsNumberFromCodePoint(codepoint) {
4287
+ static getVariationSelectorsNumberFromCodePoint(codepoint, annotate) {
4183
4288
  // モンゴル自由字形選択子 U+180B〜U+180D (3個)
4184
4289
  if((0x180B <= codepoint) && (codepoint <= 0x180D)) {
4185
4290
  return "FVS" + ((codepoint - 0x180B) + 1);
4186
4291
  }
4187
4292
  // SVSで利用される異体字セレクタ U+FE00〜U+FE0F (VS1~VS16) (16個)
4188
4293
  if((0xFE00 <= codepoint) && (codepoint <= 0xFE0F)) {
4189
- return "VS" + (codepoint - 0xFE00) + 1;
4294
+ const n = (codepoint - 0xFE00) + 1;
4295
+ if (!annotate) return "VS" + n;
4296
+ if (codepoint === 0xFE0E) return "VS15 (text)";
4297
+ if (codepoint === 0xFE0F) return "VS16 (emoji)";
4298
+ return "VS" + n;
4190
4299
  }
4191
4300
  // IVSで利用される異体字セレクタ U+E0100〜U+E01EF (VS17~VS256) (240個)
4192
4301
  else if((0xE0100 <= codepoint) && (codepoint <= 0xE01EF)) {
4193
- return "VS" + (codepoint - 0xE0100) + 17;
4302
+ return "VS" + ((codepoint - 0xE0100) + 17);
4194
4303
  }
4195
4304
  return null;
4196
4305
  }
@@ -4342,6 +4451,7 @@ class MojiAnalizerTools {
4342
4451
  * @property {boolean} is_halfwidth_katakana 半角カタカナ
4343
4452
  * @property {boolean} is_emoji 絵文字
4344
4453
  * @property {boolean} is_emoticons 顔文字
4454
+ * @property {boolean} is_symbol_base 記号(VS16 が付くと絵文字化)
4345
4455
  * @property {boolean} is_gaiji 外字
4346
4456
  * @property {boolean} is_combining_mark 結合文字
4347
4457
  * @property {boolean} is_variation_selector 異体字セレクタ
@@ -4412,6 +4522,7 @@ class MojiAnalyzer {
4412
4522
  is_halfwidth_katakana : false,
4413
4523
  is_emoji : false,
4414
4524
  is_emoticons : false,
4525
+ is_symbol_base : false,
4415
4526
  is_gaiji : false,
4416
4527
  is_combining_mark : false,
4417
4528
  is_variation_selector : false
@@ -4536,9 +4647,11 @@ class MojiAnalyzer {
4536
4647
  type.is_fullwidth_ascii = /[\u3000\uFF01-\uFF5E]/.test(data.character);
4537
4648
  type.is_halfwidth_katakana = /[\uFF61-\uFF9F]/.test(data.character);
4538
4649
  // 絵文字
4539
- type.is_emoji = /Pictographs/.test(type.blockname);
4650
+ type.is_emoji = /Pictographs|Transport and Map Symbols/.test(type.blockname);
4540
4651
  // 顔文字
4541
4652
  type.is_emoticons = /Emoticons/.test(type.blockname);
4653
+ // 記号(VS16 が付くと絵文字化)
4654
+ type.is_symbol_base = /Dingbats|Miscellaneous Symbols/.test(type.blockname);
4542
4655
  // 外字
4543
4656
  type.is_gaiji = /Private Use Area/.test(type.blockname);
4544
4657
  // 結合文字
@@ -5270,4 +5383,4 @@ class MojiJS {
5270
5383
 
5271
5384
  }
5272
5385
 
5273
- module.exports = MojiJS;
5386
+ export default MojiJS;
@@ -591,20 +591,28 @@ class Unicode {
591
591
  * @returns {boolean} 確認結果
592
592
  */
593
593
  static isCombiningMarkFromCodePoint(codepoint) {
594
- return (
595
- // Combining Diacritical Marks
596
- ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
597
- // Combining Diacritical Marks Extended
598
- ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
599
- // Combining Diacritical Marks Supplement
600
- ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
601
- // Combining Diacritical Marks for Symbols
602
- ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
603
- // Hiragana 含まれる4種類の文字
604
- ((0x3099 <= codepoint) && (codepoint <= 0x309C)) ||
605
- // Combining Half Marks
606
- ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
607
- );
594
+ try {
595
+ new RegExp("\\p{Mark}", "u");
596
+ return /\p{Mark}/u.test(String.fromCodePoint(codepoint));
597
+ } catch (e) {
598
+ // フォールバック処理
599
+ return (
600
+ // Combining Diacritical Marks
601
+ ((0x0300 <= codepoint) && (codepoint <= 0x036F)) ||
602
+ // Combining Diacritical Marks Extended
603
+ ((0x1AB0 <= codepoint) && (codepoint <= 0x1AFF)) ||
604
+ // Combining Diacritical Marks Supplement
605
+ ((0x1DC0 <= codepoint) && (codepoint <= 0x1DFF)) ||
606
+ // Combining Diacritical Marks for Symbols
607
+ ((0x20D0 <= codepoint) && (codepoint <= 0x20FF)) ||
608
+ // 日本語に含まれる2種類の文字
609
+ // COMBINING VOICED SOUND MARK
610
+ // COMBINING SEMI-VOICED SOUND MARK
611
+ ((0x3099 <= codepoint) && (codepoint <= 0x309A)) ||
612
+ // Combining Half Marks
613
+ ((0xFE20 <= codepoint) && (codepoint <= 0xFE2F))
614
+ );
615
+ }
608
616
  }
609
617
 
610
618
 
@@ -3997,15 +4005,111 @@ class MOJI_CHAR_MAP {
3997
4005
  // 制御文字、VSは多いため含めていない
3998
4006
 
3999
4007
  control_charcter_map = {
4000
- 0: "NUL", 1: "SOH", 2: "STX", 3: "ETX", 4: "EOT", 5: "ENQ", 6: "ACK", 7: "BEL",
4001
- 8: "BS", 9: "HT", 10: "LF", 11: "VT", 12: "FF", 13: "CR", 14: "SO", 15: "SI",
4002
- 16: "DLE", 17: "DC1", 18: "DC2", 19: "DC3", 20: "DC4", 21: "NAK", 22: "SYN", 23: "ETB",
4003
- 24: "CAN", 25: "EM", 26: "SUB", 27: "ESC", 28: "FS", 29: "GS", 30: "RS", 31: "US",
4004
- 127: "DEL", 128: "PAD", 129: "HOP", 130: "BPH", 131: "NBH", 132: "IND", 133: "NEL", 134: "SSA",
4005
- 135: "ESA", 136: "HTS", 137: "HTJ", 138: "VTS", 139: "PLD", 140: "PLU", 141: "RI", 142: "SS2",
4006
- 143: "SS3", 144: "DCS", 145: "PU1", 146: "PU2", 147: "STS", 148: "CCH", 149: "MW", 150: "SPA",
4007
- 151: "EPA", 152: "SOS", 153: "SGCI", 154: "SCI", 155: "CSI", 156: "ST", 157: "OSC", 158: "PM",
4008
- 159: "APC", 160: "NBSP", 173: "SHY", 65529: "IAA", 65530: "IAS", 65531: "IAT"
4008
+ // --- C0 control characters (ASCII 0x00–0x1F) ---
4009
+ 0: "NUL", // Null
4010
+ 1: "SOH", // Start of Heading
4011
+ 2: "STX", // Start of Text
4012
+ 3: "ETX", // End of Text
4013
+ 4: "EOT", // End of Transmission
4014
+ 5: "ENQ", // Enquiry
4015
+ 6: "ACK", // Acknowledge
4016
+ 7: "BEL", // Bell (beep)
4017
+
4018
+ 8: "BS", // Backspace
4019
+ 9: "HT", // Horizontal Tab
4020
+ 10: "LF", // Line Feed
4021
+ 11: "VT", // Vertical Tab
4022
+ 12: "FF", // Form Feed
4023
+ 13: "CR", // Carriage Return
4024
+ 14: "SO", // Shift Out
4025
+ 15: "SI", // Shift In
4026
+
4027
+ 16: "DLE", // Data Link Escape
4028
+ 17: "DC1", // Device Control 1 (XON)
4029
+ 18: "DC2", // Device Control 2
4030
+ 19: "DC3", // Device Control 3 (XOFF)
4031
+ 20: "DC4", // Device Control 4
4032
+ 21: "NAK", // Negative Acknowledge
4033
+ 22: "SYN", // Synchronous Idle
4034
+ 23: "ETB", // End of Transmission Block
4035
+
4036
+ 24: "CAN", // Cancel
4037
+ 25: "EM", // End of Medium
4038
+ 26: "SUB", // Substitute
4039
+ 27: "ESC", // Escape
4040
+ 28: "FS", // File Separator
4041
+ 29: "GS", // Group Separator
4042
+ 30: "RS", // Record Separator
4043
+ 31: "US", // Unit Separator
4044
+
4045
+ // --- DEL ---
4046
+ 127: "DEL", // Delete
4047
+
4048
+ // --- C1 control characters (ISO/IEC 6429, 0x80–0x9F) ---
4049
+ 128: "PAD", // Padding Character
4050
+ 129: "HOP", // High Octet Preset
4051
+ 130: "BPH", // Break Permitted Here
4052
+ 131: "NBH", // No Break Here
4053
+ 132: "IND", // Index
4054
+ 133: "NEL", // Next Line
4055
+ 134: "SSA", // Start of Selected Area
4056
+ 135: "ESA", // End of Selected Area
4057
+ 136: "HTS", // Horizontal Tab Set
4058
+ 137: "HTJ", // Horizontal Tab with Justification
4059
+ 138: "VTS", // Vertical Tab Set
4060
+ 139: "PLD", // Partial Line Down
4061
+ 140: "PLU", // Partial Line Up
4062
+ 141: "RI", // Reverse Index
4063
+ 142: "SS2", // Single Shift 2
4064
+ 143: "SS3", // Single Shift 3
4065
+ 144: "DCS", // Device Control String
4066
+ 145: "PU1", // Private Use 1
4067
+ 146: "PU2", // Private Use 2
4068
+ 147: "STS", // Set Transmit State
4069
+ 148: "CCH", // Cancel Character
4070
+ 149: "MW", // Message Waiting
4071
+ 150: "SPA", // Start of Protected Area
4072
+ 151: "EPA", // End of Protected Area
4073
+ 152: "SOS", // Start of String
4074
+ 153: "SGCI",// Single Graphic Character Introducer
4075
+ 154: "SCI", // Single Character Introducer
4076
+ 155: "CSI", // Control Sequence Introducer
4077
+ 156: "ST", // String Terminator
4078
+ 157: "OSC", // Operating System Command
4079
+ 158: "PM", // Privacy Message
4080
+ 159: "APC", // Application Program Command
4081
+
4082
+ // --- Unicode で制御的に扱われる文字 ---
4083
+ 160: "NBSP", // No-Break Space(表示は空白だが改行不可)
4084
+ 173: "SHY", // Soft Hyphen(通常は表示されない)
4085
+
4086
+ // --- Unicode Interlinear Annotation ---
4087
+ 65529: "IAA", // Interlinear Annotation Anchor
4088
+ 65530: "IAS", // Interlinear Annotation Separator
4089
+ 65531: "IAT", // Interlinear Annotation Terminator
4090
+
4091
+ // Zero Width / Joiner 系(Cf)
4092
+ 0x200B: "ZWSP", // ZERO WIDTH SPACE
4093
+ 0x200C: "ZWNJ", // ZERO WIDTH NON-JOINER
4094
+ 0x200D: "ZWJ", // ZERO WIDTH JOINER
4095
+ 0x2060: "WJ", // WORD JOINER
4096
+ 0xFEFF: "BOM", // BYTE ORDER MARK / ZERO WIDTH NO-BREAK SPACE
4097
+
4098
+ // 双方向(BiDi)制御文字
4099
+ 0x202A: "LRE", // LEFT-TO-RIGHT EMBEDDING
4100
+ 0x202B: "RLE", // RIGHT-TO-LEFT EMBEDDING
4101
+ 0x202C: "PDF", // POP DIRECTIONAL FORMATTING
4102
+ 0x202D: "LRO", // LEFT-TO-RIGHT OVERRIDE
4103
+ 0x202E: "RLO", // RIGHT-TO-LEFT OVERRIDE
4104
+
4105
+ 0x2066: "LRI", // LEFT-TO-RIGHT ISOLATE
4106
+ 0x2067: "RLI", // RIGHT-TO-LEFT ISOLATE
4107
+ 0x2068: "FSI", // FIRST STRONG ISOLATE
4108
+ 0x2069: "PDI" , // POP DIRECTIONAL ISOLATE
4109
+
4110
+ // Unicode Noncharacter(検証・防御用途)
4111
+ 0xFFFE: "NONCHAR_FFFE",
4112
+ 0xFFFF: "NONCHAR_FFFF"
4009
4113
  };
4010
4114
 
4011
4115
  const unicode_blockname_array = [
@@ -4048,8 +4152,8 @@ class MOJI_CHAR_MAP {
4048
4152
  "Cyrillic Extended-D", "Nyiakeng Puachue Hmong", "Toto", "Wancho", "Nag Mundari", "Ethiopic Extended-B", "Mende Kikakui", "Adlam",
4049
4153
  "Indic Siyaq Numbers", "Ottoman Siyaq Numbers", "Arabic Mathematical Alphabetic Symbols", "Mahjong Tiles", "Domino Tiles", "Playing Cards", "Enclosed Alphanumeric Supplement", "Enclosed Ideographic Supplement",
4050
4154
  "Miscellaneous Symbols and Pictographs", "Emoticons", "Ornamental Dingbats", "Transport and Map Symbols", "Alchemical Symbols", "Geometric Shapes Extended", "Supplemental Arrows-C", "Supplemental Symbols and Pictographs",
4051
- "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F",
4052
- "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4155
+ "Chess Symbols", "Symbols and Pictographs Extended-A", "Symbols for Legacy Computing", "CJK Unified Ideographs Extension B", "CJK Unified Ideographs Extension C", "CJK Unified Ideographs Extension D", "CJK Unified Ideographs Extension E", "CJK Unified Ideographs Extension F", "CJK Unified Ideographs Extension I",
4156
+ "CJK Compatibility Ideographs Supplement", "CJK Unified Ideographs Extension G", "CJK Unified Ideographs Extension H", "CJK Unified Ideographs Extension J", "Tags", "Variation Selectors Supplement", "Supplementary Private Use Area-A", "Supplementary Private Use Area-B"
4053
4157
  ];
4054
4158
 
4055
4159
  const unicode_blockaddress_array = [
@@ -4072,8 +4176,8 @@ class MOJI_CHAR_MAP {
4072
4176
  0x1467F, 0x16A3F, 0x16A6F, 0x16ACF, 0x16AFF, 0x16B8F, 0x16E9F, 0x16F9F, 0x16FFF, 0x187FF, 0x18AFF, 0x18CFF, 0x18D7F, 0x1AFFF, 0x1B0FF, 0x1B12F,
4073
4177
  0x1B16F, 0x1B2FF, 0x1BC9F, 0x1BCAF, 0x1CFCF, 0x1D0FF, 0x1D1FF, 0x1D24F, 0x1D2DF, 0x1D2FF, 0x1D35F, 0x1D37F, 0x1D7FF, 0x1DAAF, 0x1DFFF, 0x1E02F,
4074
4178
  0x1E08F, 0x1E14F, 0x1E2BF, 0x1E2FF, 0x1E4FF, 0x1E7FF, 0x1E8DF, 0x1E95F, 0x1ECBF, 0x1ED4F, 0x1EEFF, 0x1F02F, 0x1F09F, 0x1F0FF, 0x1F1FF, 0x1F2FF,
4075
- 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF,
4076
- 0x2FA1F, 0x3134F, 0x323AF, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4179
+ 0x1F5FF, 0x1F64F, 0x1F67F, 0x1F6FF, 0x1F77F, 0x1F7FF, 0x1F8FF, 0x1F9FF, 0x1FA6F, 0x1FAFF, 0x1FBFF, 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF, 0x2EE5F,
4180
+ 0x2FA1F, 0x3134F, 0x323AF, 0x3347F, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
4077
4181
  ];
4078
4182
 
4079
4183
  to_block_name_from_unicode = function(unicode_codepoint) {
@@ -4177,20 +4281,25 @@ class MojiAnalizerTools {
4177
4281
  /**
4178
4282
  * コードポイントから異体字セレクタの判定
4179
4283
  * @param {Number} codepoint - コードポイント
4284
+ * @param {boolean} [annotate = false] - 注釈をつけるか否か
4180
4285
  * @returns {string|null} 確認結果(異体字セレクタではない場合はNULLを返す)
4181
4286
  */
4182
- static getVariationSelectorsNumberFromCodePoint(codepoint) {
4287
+ static getVariationSelectorsNumberFromCodePoint(codepoint, annotate) {
4183
4288
  // モンゴル自由字形選択子 U+180B〜U+180D (3個)
4184
4289
  if((0x180B <= codepoint) && (codepoint <= 0x180D)) {
4185
4290
  return "FVS" + ((codepoint - 0x180B) + 1);
4186
4291
  }
4187
4292
  // SVSで利用される異体字セレクタ U+FE00〜U+FE0F (VS1~VS16) (16個)
4188
4293
  if((0xFE00 <= codepoint) && (codepoint <= 0xFE0F)) {
4189
- return "VS" + (codepoint - 0xFE00) + 1;
4294
+ const n = (codepoint - 0xFE00) + 1;
4295
+ if (!annotate) return "VS" + n;
4296
+ if (codepoint === 0xFE0E) return "VS15 (text)";
4297
+ if (codepoint === 0xFE0F) return "VS16 (emoji)";
4298
+ return "VS" + n;
4190
4299
  }
4191
4300
  // IVSで利用される異体字セレクタ U+E0100〜U+E01EF (VS17~VS256) (240個)
4192
4301
  else if((0xE0100 <= codepoint) && (codepoint <= 0xE01EF)) {
4193
- return "VS" + (codepoint - 0xE0100) + 17;
4302
+ return "VS" + ((codepoint - 0xE0100) + 17);
4194
4303
  }
4195
4304
  return null;
4196
4305
  }
@@ -4342,6 +4451,7 @@ class MojiAnalizerTools {
4342
4451
  * @property {boolean} is_halfwidth_katakana 半角カタカナ
4343
4452
  * @property {boolean} is_emoji 絵文字
4344
4453
  * @property {boolean} is_emoticons 顔文字
4454
+ * @property {boolean} is_symbol_base 記号(VS16 が付くと絵文字化)
4345
4455
  * @property {boolean} is_gaiji 外字
4346
4456
  * @property {boolean} is_combining_mark 結合文字
4347
4457
  * @property {boolean} is_variation_selector 異体字セレクタ
@@ -4412,6 +4522,7 @@ class MojiAnalyzer {
4412
4522
  is_halfwidth_katakana : false,
4413
4523
  is_emoji : false,
4414
4524
  is_emoticons : false,
4525
+ is_symbol_base : false,
4415
4526
  is_gaiji : false,
4416
4527
  is_combining_mark : false,
4417
4528
  is_variation_selector : false
@@ -4536,9 +4647,11 @@ class MojiAnalyzer {
4536
4647
  type.is_fullwidth_ascii = /[\u3000\uFF01-\uFF5E]/.test(data.character);
4537
4648
  type.is_halfwidth_katakana = /[\uFF61-\uFF9F]/.test(data.character);
4538
4649
  // 絵文字
4539
- type.is_emoji = /Pictographs/.test(type.blockname);
4650
+ type.is_emoji = /Pictographs|Transport and Map Symbols/.test(type.blockname);
4540
4651
  // 顔文字
4541
4652
  type.is_emoticons = /Emoticons/.test(type.blockname);
4653
+ // 記号(VS16 が付くと絵文字化)
4654
+ type.is_symbol_base = /Dingbats|Miscellaneous Symbols/.test(type.blockname);
4542
4655
  // 外字
4543
4656
  type.is_gaiji = /Private Use Area/.test(type.blockname);
4544
4657
  // 結合文字
package/build/index.d.ts CHANGED
@@ -346,6 +346,7 @@ declare type _MojiEncodeData_ = {
346
346
  * @property {boolean} is_halfwidth_katakana 半角カタカナ
347
347
  * @property {boolean} is_emoji 絵文字
348
348
  * @property {boolean} is_emoticons 顔文字
349
+ * @property {boolean} is_symbol_base 記号(VS16 が付くと絵文字化)
349
350
  * @property {boolean} is_gaiji 外字
350
351
  * @property {boolean} is_combining_mark 結合文字
351
352
  * @property {boolean} is_variation_selector 異体字セレクタ
@@ -371,6 +372,7 @@ declare type _MojiTypeData_ = {
371
372
  is_halfwidth_katakana: boolean;
372
373
  is_emoji: boolean;
373
374
  is_emoticons: boolean;
375
+ is_symbol_base: boolean;
374
376
  is_gaiji: boolean;
375
377
  is_combining_mark: boolean;
376
378
  is_variation_selector: boolean;