unicode-emoji 3.8.0 → 4.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.rake_tasks +1 -0
  3. data/CHANGELOG.md +59 -33
  4. data/Gemfile.lock +2 -4
  5. data/README.md +32 -21
  6. data/data/generate_constants.rb +32 -8
  7. data/lib/unicode/emoji/constants.rb +6 -2
  8. data/lib/unicode/emoji/generated/regex.rb +1 -1
  9. data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
  10. data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
  11. data/lib/unicode/emoji/generated/regex_include_mqe.rb +1 -1
  12. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  14. data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
  15. data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
  16. data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
  17. data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
  18. data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
  19. data/lib/unicode/emoji/generated/regex_text_presentation.rb +8 -0
  20. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  21. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  22. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  23. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  24. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  25. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  26. data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
  27. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  30. data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
  31. data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
  32. data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
  33. data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
  34. data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
  35. data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
  36. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  37. data/lib/unicode/emoji/generated_native/regex_text_presentation.rb +8 -0
  38. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  39. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  40. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  41. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  42. data/lib/unicode/emoji.rb +10 -5
  43. data/spec/unicode_emoji_spec.rb +97 -16
  44. data/unicode-emoji.gemspec +1 -3
  45. metadata +17 -21
  46. data/lib/unicode/emoji/generated/regex_any.rb +0 -8
  47. data/spec/data/emoji-test.txt +0 -5331
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
4
- data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
3
+ metadata.gz: 3b08d6adaddfcbca3e754c9a52a0c0d5c772da86ca708affc9799ad113c5a005
4
+ data.tar.gz: e9f3817a215ef38b7933d69b4f0563d848a03f6a6b8728ecd06a74417fb5f8a7
5
5
  SHA512:
6
- metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
7
- data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
6
+ metadata.gz: cedad0ceb5f1039be614bbca170cccc3f29e8f05bd7fc74714ed586ddf20edb5da25a6c8fd840acfb7dfbeb918ec6e2218cf427e5e510eb2406235253e32ad74
7
+ data.tar.gz: 4680b526737abd7491351ff87c5d323f3a6acf5996dffa4c2f4737e10bc8083da16928dbab34362b6014ae5a17863266eb456ab2d382f25350339eaf71a175bf
data/.rake_tasks CHANGED
@@ -1,3 +1,4 @@
1
+ dependencies...
1
2
  gem
2
3
  generate_constants
3
4
  irb
data/CHANGELOG.md CHANGED
@@ -1,6 +1,32 @@
1
1
  # CHANGELOG
2
2
 
3
- ### 3.8.0
3
+ ## 4.0.4
4
+
5
+ - Add `REGEX_TEXT_PRESENTATION` to be able to match for raw default-text Emoji codepoints
6
+
7
+ ## 4.0.3
8
+
9
+ - Remove emoji-test.txt from Rubygems package
10
+
11
+ ## 4.0.2
12
+
13
+ - Directly use `RbConfig::CONFIG["UNICODE_EMOJI_VERSION"]` to detect Ruby's Emoji version,
14
+ drop unicode-version dependency
15
+
16
+ ## 4.0.0
17
+
18
+ - **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
19
+ They were previously considered to be invalid partial Emoji, however since they are supposed to be
20
+ displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
21
+ - **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
22
+ - Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
23
+ directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
24
+ For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
25
+ Also see README for a table listing the regexes that match Emoji properties.
26
+ - Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
27
+ - Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
28
+
29
+ ## 3.8.0
4
30
 
5
31
  - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
6
32
  for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
@@ -10,7 +36,7 @@
10
36
  - Update CLDR to v46 (valid subdivisions)
11
37
  - Further improvements (see commit log)
12
38
 
13
- ### 3.7.0
39
+ ## 3.7.0
14
40
 
15
41
  - Bump required Ruby slightly to 2.5
16
42
  - Introduce new `REGEX_POSSIBLE` which contains the regex described in
@@ -23,46 +49,46 @@
23
49
  - Separately autoload emoji list, so it can be loaded when other indexes
24
50
  are not needed
25
51
 
26
- ### 3.6.0
52
+ ## 3.6.0
27
53
 
28
54
  - `Unicode::Emoji::REGEX_TEXT` now matches non-emoji keycaps like "3⃣" (U+0033 U+20E3)
29
55
  - Minor refactorings
30
56
 
31
- ### 3.5.0
57
+ ## 3.5.0
32
58
 
33
59
  - Emoji 16.0
34
60
 
35
- ### 3.4.0
61
+ ## 3.4.0
36
62
 
37
63
  - Emoji 15.1
38
64
 
39
- ### 3.3.2
65
+ ## 3.3.2
40
66
 
41
67
  - Update valid subdivisions to CLDR 43 (no changes)
42
68
  -> there won't be any new RGI subdivision flags in Emoji
43
69
 
44
- ### 3.3.1
70
+ ## 3.3.1
45
71
 
46
72
  - Update valid subdivisions to CLDR 42 (no changes)
47
73
 
48
- ### 3.3.0
74
+ ## 3.3.0
49
75
 
50
76
  - Emoji 15.0
51
77
 
52
- ### 3.2.0
78
+ ## 3.2.0
53
79
 
54
80
  - Update valid subdivisions to CLDR 41
55
81
 
56
- ### 3.1.1
82
+ ## 3.1.1
57
83
 
58
84
  - Fix `REGEX` to be able to match complete family emoji, instead of
59
85
  sub-matching partial families, thanks @matt17r
60
86
 
61
- ### 3.1.0
87
+ ## 3.1.0
62
88
 
63
89
  - Update valid subdivisions to CLDR 40
64
90
 
65
- ### 3.0.0
91
+ ## 3.0.0
66
92
 
67
93
  - Vastly improve memory usage, patch by @radarek
68
94
  - Emoji regexes are now pre-generated and bundled with the release
@@ -70,54 +96,54 @@
70
96
  - Most constants (e.g. regexes) now get autoloaded
71
97
  - See https://github.com/janlelis/unicode-emoji/pull/9 for more details
72
98
 
73
- ### 2.9.0
99
+ ## 2.9.0
74
100
 
75
101
  - Emoji 14.0
76
102
 
77
- ### 2.8.0
103
+ ## 2.8.0
78
104
 
79
105
  - Update valid subdivisions to CLDR 39
80
106
 
81
- ### 2.7.1
107
+ ## 2.7.1
82
108
 
83
109
  - Update valid subdivisions to CLDR 38.1
84
110
 
85
- ### 2.7.0
111
+ ## 2.7.0
86
112
 
87
113
  - Update valid subdivisions to CLDR 38
88
114
  - Loosen Ruby dependency to allow Ruby 3.0
89
115
 
90
- ### 2.6.0
116
+ ## 2.6.0
91
117
 
92
118
  - Emoji 13.1
93
119
 
94
- ### 2.5.0
120
+ ## 2.5.0
95
121
 
96
122
  - Use native Emoji regex properties when current Ruby's Emoji support is the same as our current Emoji version
97
123
  - Update valid subdivisions to CLDR 37
98
124
 
99
- ### 2.4.0
125
+ ## 2.4.0
100
126
 
101
127
  - Emoji 13.0
102
128
 
103
- ### 2.3.1
129
+ ## 2.3.1
104
130
 
105
131
  - Fix index to actually include Emoji 12.1
106
132
 
107
- ### 2.3.0
133
+ ## 2.3.0
108
134
 
109
135
  - Emoji 12.1
110
136
 
111
- ### 2.2.0
137
+ ## 2.2.0
112
138
 
113
139
  - Update subdivisions to CLDR 36
114
140
 
115
- ### 2.1.0
141
+ ## 2.1.0
116
142
 
117
143
  - Add `REGEX_PICTO` which matches codepoints with the **Extended_Pictographic** property
118
144
  - Add `REGEX_PICTO_NO_EMOJI` which matches codepoints with the **Extended_Pictographic** property, but no **Emoji** property
119
145
 
120
- ### 2.0.0
146
+ ## 2.0.0
121
147
 
122
148
  - Emoji 12.0 data (including valid subdivisions)
123
149
  - Introduce new `REGEX_WELL_FORMED` to be able to match for invalid tag and region sequences
@@ -126,40 +152,40 @@
126
152
  - Issue warning when using `#list` method to retrieve outdated category
127
153
  - Change matching for ZWJ sequences: Do not limit sequence to a maximum of 3 ZWJs
128
154
 
129
- ### 1.1.0
155
+ ## 1.1.0
130
156
 
131
157
  - Emoji 11.0
132
158
  - Do not depend on rubygems (only use zlib stdlib for unzipping)
133
159
 
134
- ### 1.0.3
160
+ ## 1.0.3
135
161
 
136
162
  - Explicitly load rubygems/util, fixes regression in 1.2.1
137
163
 
138
- ### 1.0.2
164
+ ## 1.0.2
139
165
 
140
166
  - Use `Gem::Util` for `gunzip`, removes deprecation warning
141
167
 
142
- ### 1.0.1
168
+ ## 1.0.1
143
169
 
144
170
  - Actually set required Ruby version to 2.3 in gemspec
145
171
 
146
- ### 1.0.0
172
+ ## 1.0.0
147
173
 
148
174
  - Drop support for Ruby below 2.3, use 0.9 if you need to
149
175
  - Internal refactorings, no API change
150
176
 
151
- ### 0.9.3
177
+ ## 0.9.3
152
178
 
153
179
  - Implement native Emoji regex matchers, but do not activate or document, yet
154
180
 
155
- ### 0.9.2
181
+ ## 0.9.2
156
182
 
157
183
  - REGEX_TEXT: Do not match if the text emoji is followed by an emoji modifier
158
184
 
159
- ### 0.9.1
185
+ ## 0.9.1
160
186
 
161
187
  - Include a categorized list of recommended Emoji
162
188
 
163
- ### 0.9.0
189
+ ## 0.9.0
164
190
 
165
191
  - Initial release (Emoji version 5.0)
data/Gemfile.lock CHANGED
@@ -1,8 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-emoji (3.5.0)
5
- unicode-version (~> 1.0)
4
+ unicode-emoji (4.0.4)
6
5
 
7
6
  GEM
8
7
  remote: https://rubygems.org/
@@ -20,7 +19,6 @@ GEM
20
19
  reline (0.3.8)
21
20
  io-console (~> 0.5)
22
21
  stringio (3.0.8)
23
- unicode-version (1.3.0)
24
22
 
25
23
  PLATFORMS
26
24
  ruby
@@ -32,4 +30,4 @@ DEPENDENCIES
32
30
  unicode-emoji!
33
31
 
34
32
  BUNDLED WITH
35
- 2.2.22
33
+ 2.5.21
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
3
+ Provides various sophisticated regular expressions to work with Emoji in strings,
4
+ incorporating the latest Unicode / Emoji standards.
4
5
 
5
6
  Additional features:
6
7
 
@@ -26,16 +27,17 @@ require "unicode/emoji"
26
27
 
27
28
  string = "String which contains all types of Emoji sequences:
28
29
 
29
- - Singleton Emoji: 😴
30
- - Textual singleton Emoji with Emoji variation: ▶️
30
+ - Basic Emoji: 😴
31
+ - Textual Emoji with Emoji variation (VS16): ▶️
31
32
  - Emoji with skin tone modifier: 🛌🏽
32
33
  - Region flag: 🇵🇹
33
34
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
34
35
  - Keycap sequence: 2️⃣
36
+ - Skin tone modifier: 🏻
35
37
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
36
38
  "
37
39
 
38
- string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
40
+ string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🏻", "🤾🏽‍♀️"]
39
41
  ```
40
42
 
41
43
  Depending on your exact use case, you can choose between multiple levels of Emoji detection:
@@ -44,10 +46,10 @@ Depending on your exact usecase, you can choose between multiple levels of Emoji
44
46
 
45
47
  Regex | Description | Example Matches | Example Non-Matches
46
48
  ------------------------------|-------------|-----------------|--------------------
47
- `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
48
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
49
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
50
- `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
49
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🏻` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
51
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
52
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
51
53
 
52
54
  #### Include Text Emoji
53
55
 
@@ -55,16 +57,16 @@ By default, textual Emoji (emoji characters with text variation selector or thos
55
57
 
56
58
  Regex | Description | Example Matches | Example Non-Matches
57
59
  ------------------------------|-------------|-----------------|--------------------
58
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽‍♀`, `🏌‍♂️`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
59
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
60
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
60
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽‍♀`, `🏌‍♂️`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
61
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
62
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
61
63
 
62
64
  #### Minimally-qualified and Unqualified Sequences
63
65
 
64
66
  Regex | Description | Example Matches | Example Non-Matches
65
67
  ------------------------------|-------------|-----------------|--------------------
66
- `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` | `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
67
- `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
68
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏻` | `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
69
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
68
70
 
69
71
  [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
70
72
 
@@ -74,10 +76,10 @@ Matches only simple one-codepoint (+ optional variation selector) Emoji:
74
76
 
75
77
  Regex | Description | Example Matches | Example Non-Matches
76
78
  ------------------------------|-------------|-----------------|--------------------
77
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
78
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
80
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
81
 
80
- Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
82
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches [visual Emoji components](https://character.construction/emoji-components) (skin tone modifiers and hair components).
81
83
 
82
84
  While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
83
85
 
@@ -140,7 +142,20 @@ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for
140
142
 
141
143
  More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
142
144
 
143
- ### Extended Pictographic Regex
145
+ ### Emoji Property Regexes
146
+
147
+ Ruby includes native regex Emoji properties, as listed in the following table. You can also opt in to use the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
148
+
149
+ Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
150
+ ---------------------------------------------------|------------------------------------------
151
+ `Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
152
+ `Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
153
+ `Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
154
+ `Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
155
+ `Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
156
+ `Unicode::Emoji::REGEX_TEXT_PRESENTATION` | `/[\p{Emoji}&&\P{EPres}]/`
157
+
158
+ #### Extended Pictographic Regex
144
159
 
145
160
  `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
146
161
 
@@ -148,10 +163,6 @@ More info about valid vs. recommended Emoji can also be found in this [blog arti
148
163
 
149
164
  See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
150
165
 
151
- ### Partial Regexes
152
-
153
- `Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
154
-
155
166
  ## Usage – List
156
167
 
157
168
  Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
@@ -69,6 +69,8 @@ def pack_and_join(ords)
69
69
  end
70
70
 
71
71
  def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
+ visual_component = pack_and_join(VISUAL_COMPONENT)
73
+
72
74
  emoji_presentation_sequence = \
73
75
  join(
74
76
  text_presentation + pack(EMOJI_VARIATION_SELECTOR),
@@ -78,6 +80,12 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
78
80
  non_component_emoji_presentation_sequence = \
79
81
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
82
 
83
+ basic_emoji = \
84
+ join(
85
+ non_component_emoji_presentation_sequence,
86
+ visual_component,
87
+ )
88
+
81
89
  text_keycap_sequence = \
82
90
  pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
91
 
@@ -169,6 +177,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
169
177
  emoji_rgi_tag_sequence,
170
178
  emoji_valid_flag_sequence,
171
179
  emoji_core_sequence,
180
+ visual_component,
172
181
  )
173
182
 
174
183
  emoji_rgi_sequence_include_text = \
@@ -177,6 +186,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
177
186
  emoji_rgi_tag_sequence,
178
187
  emoji_valid_flag_sequence,
179
188
  emoji_core_sequence,
189
+ visual_component,
180
190
  text_emoji,
181
191
  )
182
192
 
@@ -186,6 +196,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
186
196
  emoji_rgi_tag_sequence,
187
197
  emoji_valid_flag_sequence,
188
198
  emoji_core_sequence,
199
+ visual_component,
189
200
  )
190
201
 
191
202
  emoji_rgi_include_mqe_uqe_sequence = \
@@ -195,6 +206,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
195
206
  emoji_rgi_tag_sequence,
196
207
  emoji_valid_flag_sequence,
197
208
  emoji_core_sequence,
209
+ visual_component,
198
210
  )
199
211
 
200
212
  emoji_valid_sequence = \
@@ -203,6 +215,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
203
215
  emoji_valid_tag_sequence,
204
216
  emoji_valid_flag_sequence,
205
217
  emoji_core_sequence,
218
+ visual_component,
206
219
  )
207
220
 
208
221
  emoji_valid_sequence_include_text = \
@@ -211,6 +224,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
211
224
  emoji_valid_tag_sequence,
212
225
  emoji_valid_flag_sequence,
213
226
  emoji_core_sequence,
227
+ visual_component,
214
228
  text_emoji,
215
229
  )
216
230
 
@@ -220,6 +234,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
220
234
  emoji_well_formed_tag_sequence,
221
235
  emoji_well_formed_flag_sequence,
222
236
  emoji_core_sequence,
237
+ visual_component,
223
238
  )
224
239
 
225
240
  emoji_well_formed_sequence_include_text = \
@@ -228,6 +243,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
228
243
  emoji_well_formed_tag_sequence,
229
244
  emoji_well_formed_flag_sequence,
230
245
  emoji_core_sequence,
246
+ visual_component,
231
247
  text_emoji,
232
248
  )
233
249
 
@@ -279,19 +295,27 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
279
295
  # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
280
296
  regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
281
297
 
282
- # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
283
- regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
298
+ # Matches only basic single, non-textual emoji, ignores some components like simple digits
299
+ regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
284
300
 
285
- # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
301
+ # Matches only basic single, textual emoji, ignores components like modifiers or simple digits
286
302
  regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
303
+ regexes[:REGEX_TEXT_PRESENTATION] = Regexp.compile(text_presentation)
287
304
 
288
- # Same as \p{Emoji} - to be removed or renamed
289
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
305
+ # Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
306
+ regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
307
+ regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
308
+ regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
309
+ regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
310
+ regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
290
311
 
312
+ # Same goes for ExtendedPictographic
291
313
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
292
-
293
314
  regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
294
315
 
316
+ # Emoji keycaps
317
+ regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
318
+
295
319
  regexes
296
320
  end
297
321
 
@@ -313,8 +337,8 @@ native_regexes = compile(
313
337
  emoji_modifier_base: "\\p{EBase}",
314
338
  emoji_component: "\\p{EComp}",
315
339
  emoji_presentation: "\\p{EPres}",
316
- text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
340
+ text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
317
341
  picto: "\\p{ExtPict}",
318
- picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
342
+ picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
319
343
  )
320
344
  write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))
@@ -2,9 +2,9 @@
2
2
 
3
3
  module Unicode
4
4
  module Emoji
5
- VERSION = "3.8.0"
5
+ VERSION = "4.0.4"
6
6
  EMOJI_VERSION = "16.0"
7
- CLDR_VERSION = "45"
7
+ CLDR_VERSION = "46"
8
8
  DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
9
9
  INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
10
10
 
@@ -41,5 +41,9 @@ module Unicode
41
41
 
42
42
  # Two regional indicators make up a region
43
43
  REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
44
+
45
+ # The current list of Emoji components that should have a visual representation
46
+ # Currently skin tone modifiers + hair components
47
+ VISUAL_COMPONENT = [*0x1F3FB..0x1F3FF, *0x1F9B0..0x1F9B3].freeze
44
48
  end
45
49
  end