unicode-emoji 3.7.0 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rake_tasks +1 -0
- data/CHANGELOG.md +24 -1
- data/Gemfile.lock +3 -3
- data/README.md +111 -58
- data/Rakefile +6 -2
- data/data/emoji.marshal.gz +0 -0
- data/data/generate_constants.rb +123 -43
- data/lib/unicode/emoji/constants.rb +22 -2
- data/lib/unicode/emoji/generated/regex.rb +1 -1
- data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
- data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
- data/lib/unicode/emoji/generated/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
- data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
- data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/lazy_constants.rb +36 -0
- data/lib/unicode/emoji/list.rb +3 -0
- data/lib/unicode/emoji.rb +39 -6
- data/spec/data/.keep +0 -0
- data/spec/data/emoji-test.txt +5331 -0
- data/spec/emoji_test_txt_spec.rb +181 -0
- data/spec/unicode_emoji_spec.rb +127 -14
- metadata +24 -4
- data/lib/unicode/emoji/generated/regex_any.rb +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5cc77126aeb986e5645ebce97fc3e1be5835b9e5adb9764af980e0175d7d9284
|
4
|
+
data.tar.gz: ffd47af7c556ee26951f6c5f30122ecd44520f1a3aa6856c526c4e4dec050e77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 810730ae1a796edc6c80c87d0d4e7c0a5bfdbda45c43f57dfde3a05afee7ee5df8d43f378c5c0de34d9e9bca8e9edfaee67261d16057a0aacc79f50c39d703d2
|
7
|
+
data.tar.gz: f300c259b7389d60130b9f910b1301f31bf60d47c32fd16f86935cdf63c740bb44c28ab8fd38bc643f35cd6b4d7e9a76583d3be156683404439069ccf58d8250
|
data/.gitignore
CHANGED
data/.rake_tasks
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,28 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
### 4.0.0
|
4
|
+
|
5
|
+
- **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
|
6
|
+
They were previously considered to be invalid partial Emoji, however since they are supposed to be
|
7
|
+
displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
|
8
|
+
- **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
|
9
|
+
- Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
|
10
|
+
directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
|
11
|
+
For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
|
12
|
+
Also see README for a table listing the regexes that match Emoji properties.
|
13
|
+
- Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
|
14
|
+
- Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
|
15
|
+
|
16
|
+
### 3.8.0
|
17
|
+
|
18
|
+
- Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
|
19
|
+
for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
|
20
|
+
- Add specs running through `emoji-test.txt` and classify qualification statuses per regex
|
21
|
+
- Improve documentation and add detailed table about which regex has which features
|
22
|
+
- Native regexes: Use native Emoji props for Emoji text presentation
|
23
|
+
- Update CLDR to v46 (valid subdivisions)
|
24
|
+
- Further improvements (see commit log)
|
25
|
+
|
3
26
|
### 3.7.0
|
4
27
|
|
5
28
|
- Bump required Ruby slightly to 2.5
|
@@ -29,7 +52,7 @@
|
|
29
52
|
### 3.3.2
|
30
53
|
|
31
54
|
- Update valid subdivisions to CLDR 43 (no changes)
|
32
|
-
-> there won't be any new subdivision flags in Emoji
|
55
|
+
-> there won't be any new RGI subdivision flags in Emoji
|
33
56
|
|
34
57
|
### 3.3.1
|
35
58
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
unicode-emoji (
|
4
|
+
unicode-emoji (4.0.0)
|
5
5
|
unicode-version (~> 1.0)
|
6
6
|
|
7
7
|
GEM
|
@@ -20,7 +20,7 @@ GEM
|
|
20
20
|
reline (0.3.8)
|
21
21
|
io-console (~> 0.5)
|
22
22
|
stringio (3.0.8)
|
23
|
-
unicode-version (1.
|
23
|
+
unicode-version (1.4.0)
|
24
24
|
|
25
25
|
PLATFORMS
|
26
26
|
ruby
|
@@ -32,4 +32,4 @@ DEPENDENCIES
|
|
32
32
|
unicode-emoji!
|
33
33
|
|
34
34
|
BUNDLED WITH
|
35
|
-
2.
|
35
|
+
2.5.21
|
data/README.md
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
# Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
|
2
2
|
|
3
|
-
Provides regular expressions to
|
3
|
+
Provides various sophisticated regular expressions to work with Emoji in strings,
|
4
|
+
incorporating the latest Unicode / Emoji standards.
|
4
5
|
|
5
6
|
Additional features:
|
6
7
|
|
7
|
-
- A categorized list of
|
8
|
+
- A categorized list of Emoji (RGI: Recommended for General Interchange)
|
8
9
|
- Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
|
9
10
|
|
10
11
|
Emoji version: **16.0** (September 2024)
|
11
12
|
|
12
|
-
CLDR version (used for sub-region flags): **
|
13
|
+
CLDR version (used for sub-region flags): **46** (October 2024)
|
13
14
|
|
14
15
|
## Gemfile
|
15
16
|
|
@@ -17,101 +18,153 @@ CLDR version (used for sub-region flags): **45** (April 2024)
|
|
17
18
|
gem "unicode-emoji"
|
18
19
|
```
|
19
20
|
|
20
|
-
## Usage
|
21
|
-
|
22
|
-
### Regex
|
21
|
+
## Usage – Regex Matching
|
23
22
|
|
24
23
|
The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
|
25
24
|
|
26
25
|
```ruby
|
27
26
|
require "unicode/emoji"
|
28
27
|
|
29
|
-
string = "String which contains all
|
28
|
+
string = "String which contains all types of Emoji sequences:
|
30
29
|
|
31
|
-
-
|
32
|
-
- Textual
|
30
|
+
- Basic Emoji: 😴
|
31
|
+
- Textual Emoji with Emoji variation (VS16): ▶️
|
33
32
|
- Emoji with skin tone modifier: 🛌🏽
|
34
33
|
- Region flag: 🇵🇹
|
35
34
|
- Sub-Region flag: 🏴
|
36
35
|
- Keycap sequence: 2️⃣
|
36
|
+
- Skin tone modifier: 🏻
|
37
37
|
- Sequence using ZWJ (zero width joiner): 🤾🏽♀️
|
38
|
-
|
39
38
|
"
|
40
39
|
|
41
40
|
string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴", "2️⃣", "🤾🏽♀️"]
|
42
41
|
```
|
43
42
|
|
44
|
-
|
43
|
+
Depending on your exact use case, you can choose between multiple levels of Emoji detection:
|
45
44
|
|
46
|
-
|
45
|
+
### Main Regexes
|
47
46
|
|
48
47
|
Regex | Description | Example Matches | Example Non-Matches
|
49
48
|
------------------------------|-------------|-----------------|--------------------
|
50
|
-
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual)
|
51
|
-
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual)
|
52
|
-
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual)
|
53
|
-
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji,
|
54
|
-
|
55
|
-
##### Picking the Right Emoji Regex
|
56
|
-
|
57
|
-
- Usually you just want `REGEX` (RGI set)
|
58
|
-
- If you want broader matching (any ZJW sequences, more sub-region flags), choose `REGEX_VALID`
|
59
|
-
- Even brolader is `REGEX_WELL_FORMED`, which will also match any region flag and any tag sequence
|
60
|
-
- And then there is `REGEX_POSSIBLE` , which is a quick check for possible Emoji, which might contain false positives, [suggested in the Unicode Standard](https://www.unicode.org/reports/tr51/#EBNF_and_Regex)
|
61
|
-
|
62
|
-
Property | Escaped | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed) | `REGEX_POSSIBLE`
|
63
|
-
---------|---------|-----------------------------|-----------------------|-----------------------------------|-----------------
|
64
|
-
Region "🇵🇹" | `\u{1F1F5 1F1F9}` | Yes | Yes | Yes | Yes
|
65
|
-
Region "🇵🇵" | `\u{1F1F5 1F1F5}` | No | No | Yes | Yes
|
66
|
-
Tag Sequence "🏴" | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | Yes | Yes | Yes | Yes
|
67
|
-
Tag Sequence "🏴" | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | No | Yes | Yes | Yes
|
68
|
-
Tag Sequence "😴" | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | No | No | Yes | Yes
|
69
|
-
ZWJ Sequence "🤾🏽♀️" | `\u{1F93E 1F3FD 200D 2640 FE0F}` | Yes | Yes | Yes | Yes
|
70
|
-
ZWJ Sequence "🤠🤢" | `\u{1F920 200D 1F922}` | No | Yes | Yes | Yes
|
49
|
+
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🏻` | `🤾🏽♀`, `🏌♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
50
|
+
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀` ,`🏌♂️`, `🤠🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
|
51
|
+
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`,`🏌♂️` , `🤠🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
|
52
|
+
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
|
71
53
|
|
72
|
-
|
54
|
+
#### Include Text Emoji
|
73
55
|
|
74
|
-
|
56
|
+
By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
|
75
57
|
|
76
|
-
|
58
|
+
Regex | Description | Example Matches | Example Non-Matches
|
59
|
+
------------------------------|-------------|-----------------|--------------------
|
60
|
+
`Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽♀`, `🏌♂️`, `🇵🇵`, `🏴`, `🤠🤢`, `1`
|
61
|
+
`Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
|
62
|
+
`Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
|
77
63
|
|
78
|
-
|
64
|
+
#### Minimally-qualified and Unqualified Sequences
|
79
65
|
|
80
66
|
Regex | Description | Example Matches | Example Non-Matches
|
81
67
|
------------------------------|-------------|-----------------|--------------------
|
82
|
-
`Unicode::Emoji::
|
83
|
-
`Unicode::Emoji::
|
68
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏻` | `🏌♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
69
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
84
70
|
|
85
|
-
|
71
|
+
[List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
|
86
72
|
|
87
|
-
|
73
|
+
#### Singleton Regexes
|
74
|
+
|
75
|
+
Matches only simple one-codepoint (+ optional variation selector) Emoji:
|
88
76
|
|
89
77
|
Regex | Description | Example Matches | Example Non-Matches
|
90
78
|
------------------------------|-------------|-----------------|--------------------
|
91
|
-
`Unicode::Emoji::
|
92
|
-
`Unicode::Emoji::
|
93
|
-
|
79
|
+
`Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
80
|
+
`Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
81
|
+
|
82
|
+
Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches visual Emoji components (skin tone modifiers and hair components).
|
83
|
+
|
84
|
+
While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
|
85
|
+
|
86
|
+
### Comparison
|
87
|
+
|
88
|
+
1) Fully-qualified RGI Emoji ZWJ sequence
|
89
|
+
2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
|
90
|
+
3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
|
91
|
+
4) Non-RGI Emoji ZWJ sequence
|
92
|
+
5) Valid Region made from a pair of Regional Indicators
|
93
|
+
6) Any Region made from a pair of Regional Indicators
|
94
|
+
7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
|
95
|
+
8) Valid Flag Emoji Tag Sequences (any known subdivision)
|
96
|
+
9) Any Emoji Tag Sequences (any tag sequence with any base)
|
97
|
+
10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
|
98
|
+
11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
|
99
|
+
12) Non-Emoji (unqualified) keycap
|
100
|
+
|
101
|
+
Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Region | 6 Any Region | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Keycap
|
102
|
+
-|-|-|-|-|-|-|-|-|-|-|-|-
|
103
|
+
REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
104
|
+
REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
105
|
+
REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
106
|
+
REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
107
|
+
REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
|
108
|
+
REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
|
109
|
+
REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
|
110
|
+
REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
|
111
|
+
REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
|
112
|
+
REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
|
113
|
+
REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
|
114
|
+
|
115
|
+
¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
|
116
|
+
|
117
|
+
See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
|
118
|
+
|
119
|
+
### Picking the Right Emoji Regex
|
120
|
+
|
121
|
+
- Usually you just want `REGEX` (recommended Emoji set, RGI)
|
122
|
+
- Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
|
123
|
+
- If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
|
124
|
+
- If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
|
125
|
+
- Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
|
126
|
+
- And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might contain false positives, however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
|
127
|
+
|
128
|
+
### Examples
|
129
|
+
|
130
|
+
Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
|
131
|
+
-----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
|
132
|
+
RGI ZWJ Sequence | 🤾🏽♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
|
133
|
+
RGI ZWJ Sequence MQE | 🤾🏽♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
|
134
|
+
Valid ZWJ Sequence, Non-RGI | 🤠🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
|
135
|
+
Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
|
136
|
+
Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
|
137
|
+
RGI Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
|
138
|
+
Valid Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
|
139
|
+
Well-formed Tag Sequence | 😴 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
|
94
140
|
|
95
|
-
|
141
|
+
Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
|
96
142
|
|
97
|
-
|
143
|
+
More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
|
98
144
|
|
99
|
-
|
145
|
+
### Emoji Property Regexes
|
100
146
|
|
101
|
-
|
147
|
+
Ruby includes native regex Emoji properties, as listed in the following table. You can also opt-in to use the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
|
102
148
|
|
103
|
-
|
149
|
+
Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
|
150
|
+
---------------------------------------------------|------------------------------------------
|
151
|
+
`Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
|
152
|
+
`Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
|
153
|
+
`Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
|
154
|
+
`Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
|
155
|
+
`Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
|
104
156
|
|
105
|
-
|
157
|
+
#### Extended Pictographic Regex
|
106
158
|
|
107
|
-
|
108
|
-
------------------------------|-------------|-----------------|--------------------
|
109
|
-
`Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
|
159
|
+
`Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
|
110
160
|
|
161
|
+
`Unicode::Emoji::REGEX_PICTO_NO_EMOJI` matches single codepoints with the **Extended_Pictographic** property, but excludes Emoji characters.
|
162
|
+
|
163
|
+
See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
|
111
164
|
|
112
|
-
|
165
|
+
## Usage – List
|
113
166
|
|
114
|
-
Use `Unicode::Emoji::LIST` or the list method to get a
|
167
|
+
Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
|
115
168
|
|
116
169
|
```ruby
|
117
170
|
Unicode::Emoji.list.keys
|
@@ -124,13 +177,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
|
|
124
177
|
=> ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
|
125
178
|
```
|
126
179
|
|
127
|
-
Please note that categories might change with future versions of the Emoji standard
|
180
|
+
Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
|
128
181
|
|
129
182
|
A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
|
130
183
|
|
131
|
-
|
184
|
+
## Usage – Properties Data
|
132
185
|
|
133
|
-
Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
186
|
+
Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
134
187
|
|
135
188
|
```ruby
|
136
189
|
require "unicode/emoji"
|
data/Rakefile
CHANGED
@@ -28,14 +28,18 @@ task :irb do
|
|
28
28
|
end
|
29
29
|
|
30
30
|
# # #
|
31
|
-
# Run
|
31
|
+
# Run specs
|
32
32
|
|
33
33
|
desc "#{gemspec.name} | Spec"
|
34
34
|
task :spec do
|
35
|
-
ruby "spec
|
35
|
+
ruby File.join("spec", "*_spec.rb")
|
36
36
|
end
|
37
37
|
task default: :spec
|
38
38
|
|
39
|
+
|
40
|
+
# # #
|
41
|
+
# Generate regex
|
42
|
+
|
39
43
|
desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
|
40
44
|
task :generate_constants do
|
41
45
|
load "data/generate_constants.rb", true
|
data/data/emoji.marshal.gz
CHANGED
Binary file
|
data/data/generate_constants.rb
CHANGED
@@ -68,25 +68,39 @@ def pack_and_join(ords)
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
|
71
|
+
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
|
72
|
+
visual_component = pack_and_join(VISUAL_COMPONENT)
|
73
|
+
|
72
74
|
emoji_presentation_sequence = \
|
73
75
|
join(
|
74
|
-
|
76
|
+
text_presentation + pack(EMOJI_VARIATION_SELECTOR),
|
75
77
|
emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
|
76
78
|
)
|
77
79
|
|
78
80
|
non_component_emoji_presentation_sequence = \
|
79
81
|
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
80
82
|
|
83
|
+
basic_emoji = \
|
84
|
+
join(
|
85
|
+
non_component_emoji_presentation_sequence,
|
86
|
+
visual_component,
|
87
|
+
)
|
88
|
+
|
81
89
|
text_keycap_sequence = \
|
82
|
-
|
90
|
+
pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
|
83
91
|
|
84
92
|
text_presentation_sequence = \
|
85
93
|
join(
|
86
|
-
|
94
|
+
text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
|
87
95
|
emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
|
88
96
|
)
|
89
97
|
|
98
|
+
text_emoji = \
|
99
|
+
join(
|
100
|
+
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
101
|
+
text_keycap_sequence,
|
102
|
+
)
|
103
|
+
|
90
104
|
emoji_modifier_sequence = \
|
91
105
|
emoji_modifier_base + emoji_modifier
|
92
106
|
|
@@ -99,22 +113,11 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
99
113
|
emoji_well_formed_flag_sequence = \
|
100
114
|
'\p{RI}{2}'
|
101
115
|
|
102
|
-
|
103
|
-
join(
|
104
|
-
# emoji_character,
|
105
|
-
emoji_keycap_sequence,
|
106
|
-
emoji_modifier_sequence,
|
107
|
-
non_component_emoji_presentation_sequence,
|
108
|
-
emoji_valid_flag_sequence,
|
109
|
-
)
|
110
|
-
|
111
|
-
emoji_well_formed_core_sequence = \
|
116
|
+
emoji_core_sequence = \
|
112
117
|
join(
|
113
|
-
# emoji_character,
|
114
118
|
emoji_keycap_sequence,
|
115
119
|
emoji_modifier_sequence,
|
116
120
|
non_component_emoji_presentation_sequence,
|
117
|
-
emoji_well_formed_flag_sequence,
|
118
121
|
)
|
119
122
|
|
120
123
|
# Sort to make sure complex sequences match first
|
@@ -144,6 +147,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
144
147
|
emoji_rgi_zwj_sequence = \
|
145
148
|
pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
|
146
149
|
|
150
|
+
# FQE+MQE: Make VS16 optional after ZWJ has appeared
|
151
|
+
emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
152
|
+
/#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
|
153
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?"
|
154
|
+
)
|
155
|
+
|
156
|
+
# FQE+MQE+UQE: Make all VS16 optional
|
157
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
158
|
+
pack(EMOJI_VARIATION_SELECTOR),
|
159
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?",
|
160
|
+
)
|
161
|
+
|
147
162
|
emoji_valid_zwj_element = \
|
148
163
|
join(
|
149
164
|
emoji_modifier_sequence,
|
@@ -160,21 +175,76 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
160
175
|
join(
|
161
176
|
emoji_rgi_zwj_sequence,
|
162
177
|
emoji_rgi_tag_sequence,
|
163
|
-
|
178
|
+
emoji_valid_flag_sequence,
|
179
|
+
emoji_core_sequence,
|
180
|
+
visual_component,
|
181
|
+
)
|
182
|
+
|
183
|
+
emoji_rgi_sequence_include_text = \
|
184
|
+
join(
|
185
|
+
emoji_rgi_zwj_sequence,
|
186
|
+
emoji_rgi_tag_sequence,
|
187
|
+
emoji_valid_flag_sequence,
|
188
|
+
emoji_core_sequence,
|
189
|
+
visual_component,
|
190
|
+
text_emoji,
|
191
|
+
)
|
192
|
+
|
193
|
+
emoji_rgi_include_mqe_sequence = \
|
194
|
+
join(
|
195
|
+
emoji_rgi_include_mqe_zwj_sequence,
|
196
|
+
emoji_rgi_tag_sequence,
|
197
|
+
emoji_valid_flag_sequence,
|
198
|
+
emoji_core_sequence,
|
199
|
+
visual_component,
|
200
|
+
)
|
201
|
+
|
202
|
+
emoji_rgi_include_mqe_uqe_sequence = \
|
203
|
+
join(
|
204
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence,
|
205
|
+
text_emoji, # also uqe
|
206
|
+
emoji_rgi_tag_sequence,
|
207
|
+
emoji_valid_flag_sequence,
|
208
|
+
emoji_core_sequence,
|
209
|
+
visual_component,
|
164
210
|
)
|
165
211
|
|
166
212
|
emoji_valid_sequence = \
|
167
213
|
join(
|
168
214
|
emoji_valid_zwj_sequence,
|
169
215
|
emoji_valid_tag_sequence,
|
170
|
-
|
216
|
+
emoji_valid_flag_sequence,
|
217
|
+
emoji_core_sequence,
|
218
|
+
visual_component,
|
219
|
+
)
|
220
|
+
|
221
|
+
emoji_valid_sequence_include_text = \
|
222
|
+
join(
|
223
|
+
emoji_valid_zwj_sequence,
|
224
|
+
emoji_valid_tag_sequence,
|
225
|
+
emoji_valid_flag_sequence,
|
226
|
+
emoji_core_sequence,
|
227
|
+
visual_component,
|
228
|
+
text_emoji,
|
171
229
|
)
|
172
230
|
|
173
231
|
emoji_well_formed_sequence = \
|
174
232
|
join(
|
175
233
|
emoji_valid_zwj_sequence,
|
176
234
|
emoji_well_formed_tag_sequence,
|
177
|
-
|
235
|
+
emoji_well_formed_flag_sequence,
|
236
|
+
emoji_core_sequence,
|
237
|
+
visual_component,
|
238
|
+
)
|
239
|
+
|
240
|
+
emoji_well_formed_sequence_include_text = \
|
241
|
+
join(
|
242
|
+
emoji_valid_zwj_sequence,
|
243
|
+
emoji_well_formed_tag_sequence,
|
244
|
+
emoji_well_formed_flag_sequence,
|
245
|
+
emoji_core_sequence,
|
246
|
+
visual_component,
|
247
|
+
text_emoji,
|
178
248
|
)
|
179
249
|
|
180
250
|
emoji_possible_modification = \
|
@@ -198,45 +268,53 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
198
268
|
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
199
269
|
regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
|
200
270
|
|
271
|
+
# rgi + singleton text
|
272
|
+
regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
|
273
|
+
|
274
|
+
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
275
|
+
# Also make VS16 optional if not at first emoji character
|
276
|
+
regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
|
277
|
+
|
278
|
+
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
279
|
+
# Also make VS16 optional even at first emoji character
|
280
|
+
regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
|
281
|
+
|
201
282
|
# Matches basic singleton emoji and all kind of valid sequences
|
202
283
|
regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
|
203
284
|
|
285
|
+
# valid + singleton text
|
286
|
+
regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
|
287
|
+
|
204
288
|
# Matches basic singleton emoji and all kind of sequences
|
205
289
|
regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
|
290
|
+
|
291
|
+
# well-formed + singleton text
|
292
|
+
regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
|
206
293
|
|
207
294
|
# Quick test which might lead to false positives
|
208
295
|
# See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
|
209
296
|
regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
|
210
297
|
|
211
|
-
# Matches only basic single, non-textual emoji
|
212
|
-
|
213
|
-
regexes[:REGEX_BASIC] = Regexp.compile(
|
214
|
-
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
215
|
-
)
|
298
|
+
# Matches only basic single, non-textual emoji, ignores some components like simple digits
|
299
|
+
regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
|
216
300
|
|
217
|
-
# Matches only basic single, textual emoji
|
218
|
-
|
219
|
-
regexes[:REGEX_TEXT] = Regexp.compile(
|
220
|
-
join(
|
221
|
-
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
222
|
-
text_keycap_sequence,
|
223
|
-
)
|
224
|
-
)
|
225
|
-
|
226
|
-
# Matches any emoji-related codepoint - Use with caution (returns partial matches)
|
227
|
-
regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
|
301
|
+
# Matches only basic single, textual emoji, ignores components like modifiers or simple digits
|
302
|
+
regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
|
228
303
|
|
229
|
-
#
|
230
|
-
regexes[:
|
231
|
-
|
232
|
-
regexes[:
|
233
|
-
|
234
|
-
regexes[:
|
304
|
+
# Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
|
305
|
+
regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
|
306
|
+
regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
|
307
|
+
regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
|
308
|
+
regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
|
309
|
+
regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
|
235
310
|
|
311
|
+
# Same goes for ExtendedPictographic
|
236
312
|
regexes[:REGEX_PICTO] = Regexp.compile(picto)
|
237
|
-
|
238
313
|
regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
|
239
314
|
|
315
|
+
# Emoji keycaps
|
316
|
+
regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
|
317
|
+
|
240
318
|
regexes
|
241
319
|
end
|
242
320
|
|
@@ -246,6 +324,7 @@ regexes = compile(
|
|
246
324
|
emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
|
247
325
|
emoji_component: pack_and_join(EMOJI_COMPONENT),
|
248
326
|
emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
|
327
|
+
text_presentation: pack_and_join(TEXT_PRESENTATION),
|
249
328
|
picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
|
250
329
|
picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
|
251
330
|
)
|
@@ -257,7 +336,8 @@ native_regexes = compile(
|
|
257
336
|
emoji_modifier_base: "\\p{EBase}",
|
258
337
|
emoji_component: "\\p{EComp}",
|
259
338
|
emoji_presentation: "\\p{EPres}",
|
339
|
+
text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
|
260
340
|
picto: "\\p{ExtPict}",
|
261
|
-
picto_no_emoji: "\\p{ExtPict}
|
341
|
+
picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
|
262
342
|
)
|
263
343
|
write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))
|