unicode-emoji 3.8.0 → 4.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.rake_tasks +1 -0
  3. data/CHANGELOG.md +59 -33
  4. data/Gemfile.lock +2 -4
  5. data/README.md +32 -21
  6. data/data/generate_constants.rb +32 -8
  7. data/lib/unicode/emoji/constants.rb +6 -2
  8. data/lib/unicode/emoji/generated/regex.rb +1 -1
  9. data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
  10. data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
  11. data/lib/unicode/emoji/generated/regex_include_mqe.rb +1 -1
  12. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  14. data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
  15. data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
  16. data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
  17. data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
  18. data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
  19. data/lib/unicode/emoji/generated/regex_text_presentation.rb +8 -0
  20. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  21. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  22. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  23. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  24. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  25. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  26. data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
  27. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  30. data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
  31. data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
  32. data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
  33. data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
  34. data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
  35. data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
  36. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  37. data/lib/unicode/emoji/generated_native/regex_text_presentation.rb +8 -0
  38. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  39. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  40. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  41. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  42. data/lib/unicode/emoji.rb +10 -5
  43. data/spec/unicode_emoji_spec.rb +97 -16
  44. data/unicode-emoji.gemspec +1 -3
  45. metadata +17 -21
  46. data/lib/unicode/emoji/generated/regex_any.rb +0 -8
  47. data/spec/data/emoji-test.txt +0 -5331
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
4
- data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
3
+ metadata.gz: 3b08d6adaddfcbca3e754c9a52a0c0d5c772da86ca708affc9799ad113c5a005
4
+ data.tar.gz: e9f3817a215ef38b7933d69b4f0563d848a03f6a6b8728ecd06a74417fb5f8a7
5
5
  SHA512:
6
- metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
7
- data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
6
+ metadata.gz: cedad0ceb5f1039be614bbca170cccc3f29e8f05bd7fc74714ed586ddf20edb5da25a6c8fd840acfb7dfbeb918ec6e2218cf427e5e510eb2406235253e32ad74
7
+ data.tar.gz: 4680b526737abd7491351ff87c5d323f3a6acf5996dffa4c2f4737e10bc8083da16928dbab34362b6014ae5a17863266eb456ab2d382f25350339eaf71a175bf
data/.rake_tasks CHANGED
@@ -1,3 +1,4 @@
1
+ dependencies...
1
2
  gem
2
3
  generate_constants
3
4
  irb
data/CHANGELOG.md CHANGED
@@ -1,6 +1,32 @@
1
1
  # CHANGELOG
2
2
 
3
- ### 3.8.0
3
+ ## 4.0.4
4
+
5
+ - Add `REGEX_TEXT_PRESENTATION` to be able to match for raw default-text Emoji codepoints
6
+
7
+ ## 4.0.3
8
+
9
+ - Remove emoji-test.txt from Rubygems package
10
+
11
+ ## 4.0.2
12
+
13
+ - Directly use `RbConfig::CONFIG["UNICODE_EMOJI_VERSION"]` to detect Ruby's Emoji version,
14
+ drop unicode-version dependency
15
+
16
+ ## 4.0.0
17
+
18
+ - **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
19
+ They were previously considered to be invalid partial Emoji, however since they are supposed to be
20
+ displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
21
+ - **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
22
+ - Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
23
+ directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
24
+ For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
25
+ Also see README for a table listing the regexes that match Emoji properties.
26
+ - Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
27
+ - Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
28
+
29
+ ## 3.8.0
4
30
 
5
31
  - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE`, which add matching support
6
32
  for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
@@ -10,7 +36,7 @@
10
36
  - Update CLDR to v46 (valid subdivisions)
11
37
  - Further improvements (see commit log)
12
38
 
13
- ### 3.7.0
39
+ ## 3.7.0
14
40
 
15
41
  - Bump required Ruby slightly to 2.5
16
42
  - Introduce new `REGEX_POSSIBLE` which contains the regex described in
@@ -23,46 +49,46 @@
23
49
  - Separately autoload emoji list, so it can be loaded when other indexes
24
50
  are not needed
25
51
 
26
- ### 3.6.0
52
+ ## 3.6.0
27
53
 
28
54
  - `Unicode::Emoji::REGEX_TEXT` now matches non-emoji keycaps like "3⃣" (U+0033 U+20E3)
29
55
  - Minor refactorings
30
56
 
31
- ### 3.5.0
57
+ ## 3.5.0
32
58
 
33
59
  - Emoji 16.0
34
60
 
35
- ### 3.4.0
61
+ ## 3.4.0
36
62
 
37
63
  - Emoji 15.1
38
64
 
39
- ### 3.3.2
65
+ ## 3.3.2
40
66
 
41
67
  - Update valid subdivisions to CLDR 43 (no changes)
42
68
  -> there won't be any new RGI subdivision flags in Emoji
43
69
 
44
- ### 3.3.1
70
+ ## 3.3.1
45
71
 
46
72
  - Update valid subdivisions to CLDR 42 (no changes)
47
73
 
48
- ### 3.3.0
74
+ ## 3.3.0
49
75
 
50
76
  - Emoji 15.0
51
77
 
52
- ### 3.2.0
78
+ ## 3.2.0
53
79
 
54
80
  - Update valid subdivisions to CLDR 41
55
81
 
56
- ### 3.1.1
82
+ ## 3.1.1
57
83
 
58
84
  - Fix `REGEX` to be able to match complete family emoji, instead of
59
85
  sub-matching partial families, thanks @matt17r
60
86
 
61
- ### 3.1.0
87
+ ## 3.1.0
62
88
 
63
89
  - Update valid subdivisions to CLDR 40
64
90
 
65
- ### 3.0.0
91
+ ## 3.0.0
66
92
 
67
93
  - Vastly improve memory usage, patch by @radarek
68
94
  - Emoji regexes are now pre-generated and bundled with the release
@@ -70,54 +96,54 @@
70
96
  - Most constants (e.g. regexes) now get autoloaded
71
97
  - See https://github.com/janlelis/unicode-emoji/pull/9 for more details
72
98
 
73
- ### 2.9.0
99
+ ## 2.9.0
74
100
 
75
101
  - Emoji 14.0
76
102
 
77
- ### 2.8.0
103
+ ## 2.8.0
78
104
 
79
105
  - Update valid subdivisions to CLDR 39
80
106
 
81
- ### 2.7.1
107
+ ## 2.7.1
82
108
 
83
109
  - Update valid subdivisions to CLDR 38.1
84
110
 
85
- ### 2.7.0
111
+ ## 2.7.0
86
112
 
87
113
  - Update valid subdivisions to CLDR 38
88
114
  - Loosen Ruby dependency to allow Ruby 3.0
89
115
 
90
- ### 2.6.0
116
+ ## 2.6.0
91
117
 
92
118
  - Emoji 13.1
93
119
 
94
- ### 2.5.0
120
+ ## 2.5.0
95
121
 
96
122
  - Use native Emoji regex properties when current Ruby's Emoji support is the same as our current Emoji version
97
123
  - Update valid subdivisions to CLDR 37
98
124
 
99
- ### 2.4.0
125
+ ## 2.4.0
100
126
 
101
127
  - Emoji 13.0
102
128
 
103
- ### 2.3.1
129
+ ## 2.3.1
104
130
 
105
131
  - Fix index to actually include Emoji 12.1
106
132
 
107
- ### 2.3.0
133
+ ## 2.3.0
108
134
 
109
135
  - Emoji 12.1
110
136
 
111
- ### 2.2.0
137
+ ## 2.2.0
112
138
 
113
139
  - Update subdivisions to CLDR 36
114
140
 
115
- ### 2.1.0
141
+ ## 2.1.0
116
142
 
117
143
  - Add `REGEX_PICTO` which matches codepoints with the **Extended_Pictographic** property
118
144
  - Add `REGEX_PICTO_NO_EMOJI` which matches codepoints with the **Extended_Pictographic** property, but no **Emoji** property
119
145
 
120
- ### 2.0.0
146
+ ## 2.0.0
121
147
 
122
148
  - Emoji 12.0 data (including valid subdivisions)
123
149
  - Introduce new `REGEX_WELL_FORMED` to be able to match for invalid tag and region sequences
@@ -126,40 +152,40 @@
126
152
  - Issue warning when using `#list` method to retrieve outdated category
127
153
  - Change matching for ZWJ sequences: Do not limit sequence to a maximum of 3 ZWJs
128
154
 
129
- ### 1.1.0
155
+ ## 1.1.0
130
156
 
131
157
  - Emoji 11.0
132
158
  - Do not depend on rubygems (only use zlib stdlib for unzipping)
133
159
 
134
- ### 1.0.3
160
+ ## 1.0.3
135
161
 
136
162
  - Explicitly load rubygems/util, fixes regression in 1.0.2
137
163
 
138
- ### 1.0.2
164
+ ## 1.0.2
139
165
 
140
166
  - Use `Gem::Util` for `gunzip`, removes deprecation warning
141
167
 
142
- ### 1.0.1
168
+ ## 1.0.1
143
169
 
144
170
  - Actually set required Ruby version to 2.3 in gemspec
145
171
 
146
- ### 1.0.0
172
+ ## 1.0.0
147
173
 
148
174
  - Drop support for Ruby below 2.3, use 0.9 if you need to
149
175
  - Internal refactorings, no API change
150
176
 
151
- ### 0.9.3
177
+ ## 0.9.3
152
178
 
153
179
  - Implement native Emoji regex matchers, but do not activate or document, yet
154
180
 
155
- ### 0.9.2
181
+ ## 0.9.2
156
182
 
157
183
  - REGEX_TEXT: Do not match if the text emoji is followed by an emoji modifier
158
184
 
159
- ### 0.9.1
185
+ ## 0.9.1
160
186
 
161
187
  - Include a categorized list of recommended Emoji
162
188
 
163
- ### 0.9.0
189
+ ## 0.9.0
164
190
 
165
191
  - Initial release (Emoji version 5.0)
data/Gemfile.lock CHANGED
@@ -1,8 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-emoji (3.5.0)
5
- unicode-version (~> 1.0)
4
+ unicode-emoji (4.0.4)
6
5
 
7
6
  GEM
8
7
  remote: https://rubygems.org/
@@ -20,7 +19,6 @@ GEM
20
19
  reline (0.3.8)
21
20
  io-console (~> 0.5)
22
21
  stringio (3.0.8)
23
- unicode-version (1.3.0)
24
22
 
25
23
  PLATFORMS
26
24
  ruby
@@ -32,4 +30,4 @@ DEPENDENCIES
32
30
  unicode-emoji!
33
31
 
34
32
  BUNDLED WITH
35
- 2.2.22
33
+ 2.5.21
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
3
+ Provides various sophisticated regular expressions to work with Emoji in strings,
4
+ incorporating the latest Unicode / Emoji standards.
4
5
 
5
6
  Additional features:
6
7
 
@@ -26,16 +27,17 @@ require "unicode/emoji"
26
27
 
27
28
  string = "String which contains all types of Emoji sequences:
28
29
 
29
- - Singleton Emoji: 😴
30
- - Textual singleton Emoji with Emoji variation: ▶️
30
+ - Basic Emoji: 😴
31
+ - Textual Emoji with Emoji variation (VS16): ▶️
31
32
  - Emoji with skin tone modifier: 🛌🏽
32
33
  - Region flag: 🇵🇹
33
34
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
34
35
  - Keycap sequence: 2️⃣
36
+ - Skin tone modifier: 🏻
35
37
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
36
38
  "
37
39
 
38
- string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
40
+ string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🏻", "🤾🏽‍♀️"]
39
41
  ```
40
42
 
41
43
  Depending on your exact use case, you can choose between multiple levels of Emoji detection:
@@ -44,10 +46,10 @@ Depending on your exact usecase, you can choose between multiple levels of Emoji
44
46
 
45
47
  Regex | Description | Example Matches | Example Non-Matches
46
48
  ------------------------------|-------------|-----------------|--------------------
47
- `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
48
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
49
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
50
- `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
49
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🏻` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
51
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
52
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
51
53
 
52
54
  #### Include Text Emoji
53
55
 
@@ -55,16 +57,16 @@ By default, textual Emoji (emoji characters with text variation selector or thos
55
57
 
56
58
  Regex | Description | Example Matches | Example Non-Matches
57
59
  ------------------------------|-------------|-----------------|--------------------
58
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽‍♀`, `🏌‍♂️`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
59
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
60
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
60
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽‍♀`, `🏌‍♂️`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
61
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
62
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
61
63
 
62
64
  #### Minimally-qualified and Unqualified Sequences
63
65
 
64
66
  Regex | Description | Example Matches | Example Non-Matches
65
67
  ------------------------------|-------------|-----------------|--------------------
66
- `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` | `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
67
- `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
68
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏻` | `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
69
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
68
70
 
69
71
  [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
70
72
 
@@ -74,10 +76,10 @@ Matches only simple one-codepoint (+ optional variation selector) Emoji:
74
76
 
75
77
  Regex | Description | Example Matches | Example Non-Matches
76
78
  ------------------------------|-------------|-----------------|--------------------
77
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
78
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
80
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
81
 
80
- Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
82
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches [visual Emoji components](https://character.construction/emoji-components) (skin tone modifiers and hair components).
81
83
 
82
84
  While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
83
85
 
@@ -140,7 +142,20 @@ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for
140
142
 
141
143
  More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
142
144
 
143
- ### Extended Pictographic Regex
145
+ ### Emoji Property Regexes
146
+
147
+ Ruby includes native regex Emoji properties, as listed in the following table. You can also opt in to using the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
148
+
149
+ Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
150
+ ---------------------------------------------------|------------------------------------------
151
+ `Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
152
+ `Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
153
+ `Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
154
+ `Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
155
+ `Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
156
+ `Unicode::Emoji::REGEX_TEXT_PRESENTATION` | `/[\p{Emoji}&&\P{EPres}]/`
157
+
158
+ #### Extended Pictographic Regex
144
159
 
145
160
  `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
146
161
 
@@ -148,10 +163,6 @@ More info about valid vs. recommended Emoji can also be found in this [blog arti
148
163
 
149
164
  See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
150
165
 
151
- ### Partial Regexes
152
-
153
- `Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
154
-
155
166
  ## Usage – List
156
167
 
157
168
  Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
@@ -69,6 +69,8 @@ def pack_and_join(ords)
69
69
  end
70
70
 
71
71
  def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
+ visual_component = pack_and_join(VISUAL_COMPONENT)
73
+
72
74
  emoji_presentation_sequence = \
73
75
  join(
74
76
  text_presentation + pack(EMOJI_VARIATION_SELECTOR),
@@ -78,6 +80,12 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
78
80
  non_component_emoji_presentation_sequence = \
79
81
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
82
 
83
+ basic_emoji = \
84
+ join(
85
+ non_component_emoji_presentation_sequence,
86
+ visual_component,
87
+ )
88
+
81
89
  text_keycap_sequence = \
82
90
  pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
91
 
@@ -169,6 +177,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
169
177
  emoji_rgi_tag_sequence,
170
178
  emoji_valid_flag_sequence,
171
179
  emoji_core_sequence,
180
+ visual_component,
172
181
  )
173
182
 
174
183
  emoji_rgi_sequence_include_text = \
@@ -177,6 +186,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
177
186
  emoji_rgi_tag_sequence,
178
187
  emoji_valid_flag_sequence,
179
188
  emoji_core_sequence,
189
+ visual_component,
180
190
  text_emoji,
181
191
  )
182
192
 
@@ -186,6 +196,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
186
196
  emoji_rgi_tag_sequence,
187
197
  emoji_valid_flag_sequence,
188
198
  emoji_core_sequence,
199
+ visual_component,
189
200
  )
190
201
 
191
202
  emoji_rgi_include_mqe_uqe_sequence = \
@@ -195,6 +206,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
195
206
  emoji_rgi_tag_sequence,
196
207
  emoji_valid_flag_sequence,
197
208
  emoji_core_sequence,
209
+ visual_component,
198
210
  )
199
211
 
200
212
  emoji_valid_sequence = \
@@ -203,6 +215,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
203
215
  emoji_valid_tag_sequence,
204
216
  emoji_valid_flag_sequence,
205
217
  emoji_core_sequence,
218
+ visual_component,
206
219
  )
207
220
 
208
221
  emoji_valid_sequence_include_text = \
@@ -211,6 +224,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
211
224
  emoji_valid_tag_sequence,
212
225
  emoji_valid_flag_sequence,
213
226
  emoji_core_sequence,
227
+ visual_component,
214
228
  text_emoji,
215
229
  )
216
230
 
@@ -220,6 +234,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
220
234
  emoji_well_formed_tag_sequence,
221
235
  emoji_well_formed_flag_sequence,
222
236
  emoji_core_sequence,
237
+ visual_component,
223
238
  )
224
239
 
225
240
  emoji_well_formed_sequence_include_text = \
@@ -228,6 +243,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
228
243
  emoji_well_formed_tag_sequence,
229
244
  emoji_well_formed_flag_sequence,
230
245
  emoji_core_sequence,
246
+ visual_component,
231
247
  text_emoji,
232
248
  )
233
249
 
@@ -279,19 +295,27 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
279
295
  # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
280
296
  regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
281
297
 
282
- # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
283
- regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
298
+ # Matches only basic single, non-textual emoji, ignores some components like simple digits
299
+ regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
284
300
 
285
- # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
301
+ # Matches only basic single, textual emoji, ignores components like modifiers or simple digits
286
302
  regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
303
+ regexes[:REGEX_TEXT_PRESENTATION] = Regexp.compile(text_presentation)
287
304
 
288
- # Same as \p{Emoji} - to be removed or renamed
289
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
305
+ # Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
306
+ regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
307
+ regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
308
+ regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
309
+ regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
310
+ regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
290
311
 
312
+ # Same goes for ExtendedPictographic
291
313
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
292
-
293
314
  regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
294
315
 
316
+ # Emoji keycaps
317
+ regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
318
+
295
319
  regexes
296
320
  end
297
321
 
@@ -313,8 +337,8 @@ native_regexes = compile(
313
337
  emoji_modifier_base: "\\p{EBase}",
314
338
  emoji_component: "\\p{EComp}",
315
339
  emoji_presentation: "\\p{EPres}",
316
- text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
340
+ text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
317
341
  picto: "\\p{ExtPict}",
318
- picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
342
+ picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
319
343
  )
320
344
  write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))
@@ -2,9 +2,9 @@
2
2
 
3
3
  module Unicode
4
4
  module Emoji
5
- VERSION = "3.8.0"
5
+ VERSION = "4.0.4"
6
6
  EMOJI_VERSION = "16.0"
7
- CLDR_VERSION = "45"
7
+ CLDR_VERSION = "46"
8
8
  DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
9
9
  INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
10
10
 
@@ -41,5 +41,9 @@ module Unicode
41
41
 
42
42
  # Two regional indicators make up a region
43
43
  REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
44
+
45
+ # The current list of Emoji components that should have a visual representation
46
+ # Currently skin tone modifiers + hair components
47
+ VISUAL_COMPONENT = [*0x1F3FB..0x1F3FF, *0x1F9B0..0x1F9B3].freeze
44
48
  end
45
49
  end