unicode-emoji 3.8.0 → 4.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.rake_tasks +1 -0
  3. data/CHANGELOG.md +55 -33
  4. data/Gemfile.lock +2 -4
  5. data/README.md +31 -21
  6. data/data/generate_constants.rb +31 -8
  7. data/lib/unicode/emoji/constants.rb +6 -2
  8. data/lib/unicode/emoji/generated/regex.rb +1 -1
  9. data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
  10. data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
  11. data/lib/unicode/emoji/generated/regex_include_mqe.rb +1 -1
  12. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  14. data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
  15. data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
  16. data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
  17. data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
  18. data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
  19. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  20. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  21. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  22. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  23. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  24. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  25. data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
  26. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +1 -1
  27. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
  30. data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
  31. data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
  32. data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
  33. data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
  34. data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
  35. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  36. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  37. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  38. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  39. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  40. data/lib/unicode/emoji.rb +9 -5
  41. data/spec/unicode_emoji_spec.rb +97 -16
  42. data/unicode-emoji.gemspec +1 -3
  43. metadata +15 -21
  44. data/lib/unicode/emoji/generated/regex_any.rb +0 -8
  45. data/spec/data/emoji-test.txt +0 -5331
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
4
- data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
3
+ metadata.gz: 6380c3c0ad44728037efba8733373e0c2b923b959239544cb0efda1ac595815d
4
+ data.tar.gz: 9c45c4d5ba6134933472459e8223657f23aa779fb0e2d0bc68822da3a8b726e3
5
5
  SHA512:
6
- metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
7
- data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
6
+ metadata.gz: d5ec122b5aad377bf3e4f17b11d9428a4eaaeaaa3245b2302cfed5befdb40a53e7b549c08e215c9eb43c445f5dd1cb69ac000343ecc7fc88473cb3a1475aa6e3
7
+ data.tar.gz: 36427662e0a3d911884d45d7d7d0ab3313449598deda482a22dd64c99c3720ef12a16aecb7f3f446913e591cea9517b7a312bcc8731a74f06ff99498a973d070
data/.rake_tasks CHANGED
@@ -1,3 +1,4 @@
1
+ dependencies...
1
2
  gem
2
3
  generate_constants
3
4
  irb
data/CHANGELOG.md CHANGED
@@ -1,6 +1,28 @@
1
1
  # CHANGELOG
2
2
 
3
- ### 3.8.0
3
+ ## 4.0.3
4
+
5
+ - Remove emoji-test.txt from Rubygems package
6
+
7
+ ## 4.0.2
8
+
9
+ - Directly use `RbConfig::CONFIG["UNICODE_EMOJI_VERSION"]` to detect Ruby's Emoji version,
10
+ drop unicode-version dependency
11
+
12
+ ## 4.0.0
13
+
14
+ - **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
15
+ They were previously considered to be invalid partial Emoji, however since they are supposed to be
16
+ displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
17
+ - **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
18
+ - Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
19
+ directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
20
+ For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
21
+ Also see README for a table listing the regexes that match Emoji properties.
22
+ - Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
23
+ - Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
24
+
25
+ ## 3.8.0
4
26
 
5
27
  - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
6
28
  for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
@@ -10,7 +32,7 @@
10
32
  - Update CLDR to v46 (valid subdivisions)
11
33
  - Further improvements (see commit log)
12
34
 
13
- ### 3.7.0
35
+ ## 3.7.0
14
36
 
15
37
  - Bump required Ruby slightly to 2.5
16
38
  - Introduce new `REGEX_POSSIBLE` which contains the regex described in
@@ -23,46 +45,46 @@
23
45
  - Separately autoload emoji list, so it can be loaded when other indexes
24
46
  are not needed
25
47
 
26
- ### 3.6.0
48
+ ## 3.6.0
27
49
 
28
50
  - `Unicode::Emoji::REGEX_TEXT` now matches non-emoji keycaps like "3⃣" (U+0033 U+20E3)
29
51
  - Minor refactorings
30
52
 
31
- ### 3.5.0
53
+ ## 3.5.0
32
54
 
33
55
  - Emoji 16.0
34
56
 
35
- ### 3.4.0
57
+ ## 3.4.0
36
58
 
37
59
  - Emoji 15.1
38
60
 
39
- ### 3.3.2
61
+ ## 3.3.2
40
62
 
41
63
  - Update valid subdivisions to CLDR 43 (no changes)
42
64
  -> there won't be any new RGI subdivision flags in Emoji
43
65
 
44
- ### 3.3.1
66
+ ## 3.3.1
45
67
 
46
68
  - Update valid subdivisions to CLDR 42 (no changes)
47
69
 
48
- ### 3.3.0
70
+ ## 3.3.0
49
71
 
50
72
  - Emoji 15.0
51
73
 
52
- ### 3.2.0
74
+ ## 3.2.0
53
75
 
54
76
  - Update valid subdivisions to CLDR 41
55
77
 
56
- ### 3.1.1
78
+ ## 3.1.1
57
79
 
58
80
  - Fix `REGEX` to be able to match complete family emoji, instead of
59
81
  sub-matching partial families, thanks @matt17r
60
82
 
61
- ### 3.1.0
83
+ ## 3.1.0
62
84
 
63
85
  - Update valid subdivisions to CLDR 40
64
86
 
65
- ### 3.0.0
87
+ ## 3.0.0
66
88
 
67
89
  - Vastly improve memory usage, patch by @radarek
68
90
  - Emoji regexes are now pre-generated and bundled with the release
@@ -70,54 +92,54 @@
70
92
  - Most constants (e.g. regexes) now get autoloaded
71
93
  - See https://github.com/janlelis/unicode-emoji/pull/9 for more details
72
94
 
73
- ### 2.9.0
95
+ ## 2.9.0
74
96
 
75
97
  - Emoji 14.0
76
98
 
77
- ### 2.8.0
99
+ ## 2.8.0
78
100
 
79
101
  - Update valid subdivisions to CLDR 39
80
102
 
81
- ### 2.7.1
103
+ ## 2.7.1
82
104
 
83
105
  - Update valid subdivisions to CLDR 38.1
84
106
 
85
- ### 2.7.0
107
+ ## 2.7.0
86
108
 
87
109
  - Update valid subdivisions to CLDR 38
88
110
  - Loosen Ruby dependency to allow Ruby 3.0
89
111
 
90
- ### 2.6.0
112
+ ## 2.6.0
91
113
 
92
114
  - Emoji 13.1
93
115
 
94
- ### 2.5.0
116
+ ## 2.5.0
95
117
 
96
118
  - Use native Emoji regex properties when current Ruby's Emoji support is the same as our current Emoji version
97
119
  - Update valid subdivisions to CLDR 37
98
120
 
99
- ### 2.4.0
121
+ ## 2.4.0
100
122
 
101
123
  - Emoji 13.0
102
124
 
103
- ### 2.3.1
125
+ ## 2.3.1
104
126
 
105
127
  - Fix index to actually include Emoji 12.1
106
128
 
107
- ### 2.3.0
129
+ ## 2.3.0
108
130
 
109
131
  - Emoji 12.1
110
132
 
111
- ### 2.2.0
133
+ ## 2.2.0
112
134
 
113
135
  - Update subdivisions to CLDR 36
114
136
 
115
- ### 2.1.0
137
+ ## 2.1.0
116
138
 
117
139
  - Add `REGEX_PICTO` which matches codepoints with the **Extended_Pictographic** property
118
140
  - Add `REGEX_PICTO_NO_EMOJI` which matches codepoints with the **Extended_Pictographic** property, but no **Emoji** property
119
141
 
120
- ### 2.0.0
142
+ ## 2.0.0
121
143
 
122
144
  - Emoji 12.0 data (including valid subdivisions)
123
145
  - Introduce new `REGEX_WELL_FORMED` to be able to match for invalid tag and region sequences
@@ -126,40 +148,40 @@
126
148
  - Issue warning when using `#list` method to retrieve outdated category
127
149
  - Change matching for ZWJ sequences: Do not limit sequence to a maximum of 3 ZWJs
128
150
 
129
- ### 1.1.0
151
+ ## 1.1.0
130
152
 
131
153
  - Emoji 11.0
132
154
  - Do not depend on rubygems (only use zlib stdlib for unzipping)
133
155
 
134
- ### 1.0.3
156
+ ## 1.0.3
135
157
 
136
158
  - Explicitly load rubygems/util, fixes regression in 1.2.1
137
159
 
138
- ### 1.0.2
160
+ ## 1.0.2
139
161
 
140
162
  - Use `Gem::Util` for `gunzip`, removes deprecation warning
141
163
 
142
- ### 1.0.1
164
+ ## 1.0.1
143
165
 
144
166
  - Actually set required Ruby version to 2.3 in gemspec
145
167
 
146
- ### 1.0.0
168
+ ## 1.0.0
147
169
 
148
170
  - Drop support for Ruby below 2.3, use 0.9 if you need to
149
171
  - Internal refactorings, no API change
150
172
 
151
- ### 0.9.3
173
+ ## 0.9.3
152
174
 
153
175
  - Implement native Emoji regex matchers, but do not activate or document, yet
154
176
 
155
- ### 0.9.2
177
+ ## 0.9.2
156
178
 
157
179
  - REGEX_TEXT: Do not match if the text emoji is followed by an emoji modifier
158
180
 
159
- ### 0.9.1
181
+ ## 0.9.1
160
182
 
161
183
  - Include a categorized list of recommended Emoji
162
184
 
163
- ### 0.9.0
185
+ ## 0.9.0
164
186
 
165
187
  - Initial release (Emoji version 5.0)
data/Gemfile.lock CHANGED
@@ -1,8 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-emoji (3.5.0)
5
- unicode-version (~> 1.0)
4
+ unicode-emoji (4.0.3)
6
5
 
7
6
  GEM
8
7
  remote: https://rubygems.org/
@@ -20,7 +19,6 @@ GEM
20
19
  reline (0.3.8)
21
20
  io-console (~> 0.5)
22
21
  stringio (3.0.8)
23
- unicode-version (1.3.0)
24
22
 
25
23
  PLATFORMS
26
24
  ruby
@@ -32,4 +30,4 @@ DEPENDENCIES
32
30
  unicode-emoji!
33
31
 
34
32
  BUNDLED WITH
35
- 2.2.22
33
+ 2.5.21
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
3
+ Provides various sophisticated regular expressions to work with Emoji in strings,
4
+ incorporating the latest Unicode / Emoji standards.
4
5
 
5
6
  Additional features:
6
7
 
@@ -26,16 +27,17 @@ require "unicode/emoji"
26
27
 
27
28
  string = "String which contains all types of Emoji sequences:
28
29
 
29
- - Singleton Emoji: 😴
30
- - Textual singleton Emoji with Emoji variation: ▶️
30
+ - Basic Emoji: 😴
31
+ - Textual Emoji with Emoji variation (VS16): ▶️
31
32
  - Emoji with skin tone modifier: 🛌🏽
32
33
  - Region flag: 🇵🇹
33
34
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
34
35
  - Keycap sequence: 2️⃣
36
+ - Skin tone modifier: 🏻
35
37
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
36
38
  "
37
39
 
38
- string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
40
+ string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🏻", "🤾🏽‍♀️"]
39
41
  ```
40
42
 
41
43
  Depending on your exact use case, you can choose between multiple levels of Emoji detection:
@@ -44,10 +46,10 @@ Depending on your exact usecase, you can choose between multiple levels of Emoji
44
46
 
45
47
  Regex | Description | Example Matches | Example Non-Matches
46
48
  ------------------------------|-------------|-----------------|--------------------
47
- `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
48
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
49
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
50
- `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
49
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🏻` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
51
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
52
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
51
53
 
52
54
  #### Include Text Emoji
53
55
 
@@ -55,16 +57,16 @@ By default, textual Emoji (emoji characters with text variation selector or thos
55
57
 
56
58
  Regex | Description | Example Matches | Example Non-Matches
57
59
  ------------------------------|-------------|-----------------|--------------------
58
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽‍♀`, `🏌‍♂️`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
59
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
60
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
60
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽‍♀`, `🏌‍♂️`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
61
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
62
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
61
63
 
62
64
  #### Minimally-qualified and Unqualified Sequences
63
65
 
64
66
  Regex | Description | Example Matches | Example Non-Matches
65
67
  ------------------------------|-------------|-----------------|--------------------
66
- `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` | `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
67
- `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
68
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏻` | `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
69
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
68
70
 
69
71
  [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
70
72
 
@@ -74,10 +76,10 @@ Matches only simple one-codepoint (+ optional variation selector) Emoji:
74
76
 
75
77
  Regex | Description | Example Matches | Example Non-Matches
76
78
  ------------------------------|-------------|-----------------|--------------------
77
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
78
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
80
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
81
 
80
- Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
82
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches [visual Emoji components](https://character.construction/emoji-components) (skin tone modifiers and hair components).
81
83
 
82
84
  While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
83
85
 
@@ -140,7 +142,19 @@ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for
140
142
 
141
143
  More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
142
144
 
143
- ### Extended Pictographic Regex
145
+ ### Emoji Property Regexes
146
+
147
+ Ruby includes native regex Emoji properties, as listed in the following table. You can also opt in to use the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
148
+
149
+ Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
150
+ ---------------------------------------------------|------------------------------------------
151
+ `Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
152
+ `Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
153
+ `Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
154
+ `Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
155
+ `Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
156
+
157
+ #### Extended Pictographic Regex
144
158
 
145
159
  `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
146
160
 
@@ -148,10 +162,6 @@ More info about valid vs. recommended Emoji can also be found in this [blog arti
148
162
 
149
163
  See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
150
164
 
151
- ### Partial Regexes
152
-
153
- `Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
154
-
155
165
  ## Usage – List
156
166
 
157
167
  Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
@@ -69,6 +69,8 @@ def pack_and_join(ords)
69
69
  end
70
70
 
71
71
  def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
+ visual_component = pack_and_join(VISUAL_COMPONENT)
73
+
72
74
  emoji_presentation_sequence = \
73
75
  join(
74
76
  text_presentation + pack(EMOJI_VARIATION_SELECTOR),
@@ -78,6 +80,12 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
78
80
  non_component_emoji_presentation_sequence = \
79
81
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
82
 
83
+ basic_emoji = \
84
+ join(
85
+ non_component_emoji_presentation_sequence,
86
+ visual_component,
87
+ )
88
+
81
89
  text_keycap_sequence = \
82
90
  pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
91
 
@@ -169,6 +177,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
169
177
  emoji_rgi_tag_sequence,
170
178
  emoji_valid_flag_sequence,
171
179
  emoji_core_sequence,
180
+ visual_component,
172
181
  )
173
182
 
174
183
  emoji_rgi_sequence_include_text = \
@@ -177,6 +186,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
177
186
  emoji_rgi_tag_sequence,
178
187
  emoji_valid_flag_sequence,
179
188
  emoji_core_sequence,
189
+ visual_component,
180
190
  text_emoji,
181
191
  )
182
192
 
@@ -186,6 +196,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
186
196
  emoji_rgi_tag_sequence,
187
197
  emoji_valid_flag_sequence,
188
198
  emoji_core_sequence,
199
+ visual_component,
189
200
  )
190
201
 
191
202
  emoji_rgi_include_mqe_uqe_sequence = \
@@ -195,6 +206,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
195
206
  emoji_rgi_tag_sequence,
196
207
  emoji_valid_flag_sequence,
197
208
  emoji_core_sequence,
209
+ visual_component,
198
210
  )
199
211
 
200
212
  emoji_valid_sequence = \
@@ -203,6 +215,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
203
215
  emoji_valid_tag_sequence,
204
216
  emoji_valid_flag_sequence,
205
217
  emoji_core_sequence,
218
+ visual_component,
206
219
  )
207
220
 
208
221
  emoji_valid_sequence_include_text = \
@@ -211,6 +224,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
211
224
  emoji_valid_tag_sequence,
212
225
  emoji_valid_flag_sequence,
213
226
  emoji_core_sequence,
227
+ visual_component,
214
228
  text_emoji,
215
229
  )
216
230
 
@@ -220,6 +234,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
220
234
  emoji_well_formed_tag_sequence,
221
235
  emoji_well_formed_flag_sequence,
222
236
  emoji_core_sequence,
237
+ visual_component,
223
238
  )
224
239
 
225
240
  emoji_well_formed_sequence_include_text = \
@@ -228,6 +243,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
228
243
  emoji_well_formed_tag_sequence,
229
244
  emoji_well_formed_flag_sequence,
230
245
  emoji_core_sequence,
246
+ visual_component,
231
247
  text_emoji,
232
248
  )
233
249
 
@@ -279,19 +295,26 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
279
295
  # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
280
296
  regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
281
297
 
282
- # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
283
- regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
298
+ # Matches only basic single, non-textual emoji, ignores some components like simple digits
299
+ regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
284
300
 
285
- # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
301
+ # Matches only basic single, textual emoji, ignores components like modifiers or simple digits
286
302
  regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
287
303
 
288
- # Same as \p{Emoji} - to be removed or renamed
289
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
304
+ # Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
305
+ regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
306
+ regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
307
+ regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
308
+ regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
309
+ regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
290
310
 
311
+ # Same goes for ExtendedPictographic
291
312
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
292
-
293
313
  regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
294
314
 
315
+ # Emoji keycaps
316
+ regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
317
+
295
318
  regexes
296
319
  end
297
320
 
@@ -313,8 +336,8 @@ native_regexes = compile(
313
336
  emoji_modifier_base: "\\p{EBase}",
314
337
  emoji_component: "\\p{EComp}",
315
338
  emoji_presentation: "\\p{EPres}",
316
- text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
339
+ text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
317
340
  picto: "\\p{ExtPict}",
318
- picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
341
+ picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
319
342
  )
320
343
  write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))
@@ -2,9 +2,9 @@
2
2
 
3
3
  module Unicode
4
4
  module Emoji
5
- VERSION = "3.8.0"
5
+ VERSION = "4.0.3"
6
6
  EMOJI_VERSION = "16.0"
7
- CLDR_VERSION = "45"
7
+ CLDR_VERSION = "46"
8
8
  DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
9
9
  INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
10
10
 
@@ -41,5 +41,9 @@ module Unicode
41
41
 
42
42
  # Two regional indicators make up a region
43
43
  REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
44
+
45
+ # The current list of Emoji components that should have a visual representation
46
+ # Currently skin tone modifiers + hair components
47
+ VISUAL_COMPONENT = [*0x1F3FB..0x1F3FF, *0x1F9B0..0x1F9B3].freeze
44
48
  end
45
49
  end