unicode-emoji 3.7.0 → 3.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +11 -1
- data/README.md +98 -55
- data/Rakefile +6 -2
- data/data/emoji.marshal.gz +0 -0
- data/data/generate_constants.rb +97 -40
- data/lib/unicode/emoji/constants.rb +17 -1
- data/lib/unicode/emoji/generated/regex.rb +1 -1
- data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/lazy_constants.rb +36 -0
- data/lib/unicode/emoji/list.rb +3 -0
- data/lib/unicode/emoji.rb +33 -6
- data/spec/data/.keep +0 -0
- data/spec/data/emoji-test.txt +5331 -0
- data/spec/emoji_test_txt_spec.rb +181 -0
- data/spec/unicode_emoji_spec.rb +36 -4
- metadata +12 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
|
4
|
+
data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
|
7
|
+
data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
### 3.8.0
|
4
|
+
|
5
|
+
- Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE`, which also match
|
6
|
+
minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
|
7
|
+
- Add specs that run through `emoji-test.txt` and classify qualification statuses per regex
|
8
|
+
- Improve documentation and add detailed table about which regex has which features
|
9
|
+
- Native regexes: Use native Emoji props for Emoji text presentation
|
10
|
+
- Update CLDR to v46 (valid subdivisions)
|
11
|
+
- Further improvements (see commit log)
|
12
|
+
|
3
13
|
### 3.7.0
|
4
14
|
|
5
15
|
- Bump required Ruby slightly to 2.5
|
@@ -29,7 +39,7 @@
|
|
29
39
|
### 3.3.2
|
30
40
|
|
31
41
|
- Update valid subdivisions to CLDR 43 (no changes)
|
32
|
-
-> there won't be any new subdivision flags in Emoji
|
42
|
+
-> there won't be any new RGI subdivision flags in Emoji
|
33
43
|
|
34
44
|
### 3.3.1
|
35
45
|
|
data/README.md
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
# Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
|
2
2
|
|
3
|
-
Provides regular expressions to find Emoji in strings, incorporating the latest Unicode
|
3
|
+
Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
|
4
4
|
|
5
5
|
Additional features:
|
6
6
|
|
7
|
-
- A categorized list of
|
7
|
+
- A categorized list of Emoji (RGI: Recommended for General Interchange)
|
8
8
|
- Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
|
9
9
|
|
10
10
|
Emoji version: **16.0** (September 2024)
|
11
11
|
|
12
|
-
CLDR version (used for sub-region flags): **
|
12
|
+
CLDR version (used for sub-region flags): **46** (October 2024)
|
13
13
|
|
14
14
|
## Gemfile
|
15
15
|
|
@@ -17,16 +17,14 @@ CLDR version (used for sub-region flags): **45** (April 2024)
|
|
17
17
|
gem "unicode-emoji"
|
18
18
|
```
|
19
19
|
|
20
|
-
## Usage
|
21
|
-
|
22
|
-
### Regex
|
20
|
+
## Usage – Regex Matching
|
23
21
|
|
24
22
|
The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
|
25
23
|
|
26
24
|
```ruby
|
27
25
|
require "unicode/emoji"
|
28
26
|
|
29
|
-
string = "String which contains all
|
27
|
+
string = "String which contains all types of Emoji sequences:
|
30
28
|
|
31
29
|
- Singleton Emoji: 😴
|
32
30
|
- Textual singleton Emoji with Emoji variation: ▶️
|
@@ -35,64 +33,114 @@ string = "String which contains all kinds of emoji:
|
|
35
33
|
- Sub-Region flag: 🏴
|
36
34
|
- Keycap sequence: 2️⃣
|
37
35
|
- Sequence using ZWJ (zero width joiner): 🤾🏽♀️
|
38
|
-
|
39
36
|
"
|
40
37
|
|
41
38
|
string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴", "2️⃣", "🤾🏽♀️"]
|
42
39
|
```
|
43
40
|
|
44
|
-
|
41
|
+
Depending on your exact use case, you can choose between multiple levels of Emoji detection:
|
45
42
|
|
46
|
-
|
43
|
+
### Main Regexes
|
47
44
|
|
48
45
|
Regex | Description | Example Matches | Example Non-Matches
|
49
46
|
------------------------------|-------------|-----------------|--------------------
|
50
|
-
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`
|
51
|
-
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤠🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`
|
52
|
-
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤠🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`
|
53
|
-
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` |
|
54
|
-
|
55
|
-
##### Picking the Right Emoji Regex
|
56
|
-
|
57
|
-
- Usually you just want `REGEX` (RGI set)
|
58
|
-
- If you want broader matching (any ZJW sequences, more sub-region flags), choose `REGEX_VALID`
|
59
|
-
- Even brolader is `REGEX_WELL_FORMED`, which will also match any region flag and any tag sequence
|
60
|
-
- And then there is `REGEX_POSSIBLE` , which is a quick check for possible Emoji, which might contain false positives, [suggested in the Unicode Standard](https://www.unicode.org/reports/tr51/#EBNF_and_Regex)
|
61
|
-
|
62
|
-
Property | Escaped | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed) | `REGEX_POSSIBLE`
|
63
|
-
---------|---------|-----------------------------|-----------------------|-----------------------------------|-----------------
|
64
|
-
Region "🇵🇹" | `\u{1F1F5 1F1F9}` | Yes | Yes | Yes | Yes
|
65
|
-
Region "🇵🇵" | `\u{1F1F5 1F1F5}` | No | No | Yes | Yes
|
66
|
-
Tag Sequence "🏴" | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | Yes | Yes | Yes | Yes
|
67
|
-
Tag Sequence "🏴" | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | No | Yes | Yes | Yes
|
68
|
-
Tag Sequence "😴" | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | No | No | Yes | Yes
|
69
|
-
ZWJ Sequence "🤾🏽♀️" | `\u{1F93E 1F3FD 200D 2640 FE0F}` | Yes | Yes | Yes | Yes
|
70
|
-
ZWJ Sequence "🤠🤢" | `\u{1F920 200D 1F922}` | No | Yes | Yes | Yes
|
47
|
+
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️` | `🤾🏽♀`, `🏌♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
48
|
+
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀` ,`🏌♂️`, `🤠🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
|
49
|
+
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`,`🏌♂️` , `🤠🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
|
50
|
+
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
|
71
51
|
|
72
|
-
|
52
|
+
#### Include Text Emoji
|
73
53
|
|
74
|
-
|
54
|
+
By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
|
75
55
|
|
76
|
-
|
56
|
+
Regex | Description | Example Matches | Example Non-Matches
|
57
|
+
------------------------------|-------------|-----------------|--------------------
|
58
|
+
`Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽♀`, `🏌♂️`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`
|
59
|
+
`Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
|
60
|
+
`Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
|
77
61
|
|
78
|
-
|
62
|
+
#### Minimally-qualified and Unqualified Sequences
|
79
63
|
|
80
64
|
Regex | Description | Example Matches | Example Non-Matches
|
81
65
|
------------------------------|-------------|-----------------|--------------------
|
82
|
-
`Unicode::Emoji::
|
83
|
-
`Unicode::Emoji::
|
66
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀` | `🏌♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
67
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
84
68
|
|
85
|
-
|
69
|
+
[List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
|
86
70
|
|
87
|
-
|
71
|
+
#### Singleton Regexes
|
72
|
+
|
73
|
+
Matches only simple one-codepoint (+ optional variation selector) Emoji:
|
88
74
|
|
89
75
|
Regex | Description | Example Matches | Example Non-Matches
|
90
76
|
------------------------------|-------------|-----------------|--------------------
|
91
|
-
`Unicode::Emoji::
|
92
|
-
`Unicode::Emoji::
|
93
|
-
|
77
|
+
`Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
78
|
+
`Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
79
|
+
|
80
|
+
Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
|
81
|
+
|
82
|
+
While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
|
83
|
+
|
84
|
+
### Comparison
|
85
|
+
|
86
|
+
1) Fully-qualified RGI Emoji ZWJ sequence
|
87
|
+
2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
|
88
|
+
3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
|
89
|
+
4) Non-RGI Emoji ZWJ sequence
|
90
|
+
5) Valid Region made from a pair of Regional Indicators
|
91
|
+
6) Any Region made from a pair of Regional Indicators
|
92
|
+
7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
|
93
|
+
8) Valid Flag Emoji Tag Sequences (any known subdivision)
|
94
|
+
9) Any Emoji Tag Sequences (any tag sequence with any base)
|
95
|
+
10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
|
96
|
+
11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
|
97
|
+
12) Non-Emoji (unqualified) keycap
|
98
|
+
|
99
|
+
Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Region | 6 Any Region | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Keycap
|
100
|
+
-|-|-|-|-|-|-|-|-|-|-|-|-
|
101
|
+
REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
102
|
+
REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
103
|
+
REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
104
|
+
REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
105
|
+
REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
|
106
|
+
REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
|
107
|
+
REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
|
108
|
+
REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
|
109
|
+
REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
|
110
|
+
REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
|
111
|
+
REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
|
112
|
+
|
113
|
+
¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
|
114
|
+
|
115
|
+
See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
|
116
|
+
|
117
|
+
### Picking the Right Emoji Regex
|
118
|
+
|
119
|
+
- Usually you just want `REGEX` (recommended Emoji set, RGI)
|
120
|
+
- Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
|
121
|
+
- If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
|
122
|
+
- If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
|
123
|
+
- Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
|
124
|
+
- And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might contain false positives; however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
|
125
|
+
|
126
|
+
### Examples
|
127
|
+
|
128
|
+
Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
|
129
|
+
-----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
|
130
|
+
RGI ZWJ Sequence | 🤾🏽♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
|
131
|
+
RGI ZWJ Sequence MQE | 🤾🏽♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
|
132
|
+
Valid ZWJ Sequence, Non-RGI | 🤠🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
|
133
|
+
Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
|
134
|
+
Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
|
135
|
+
RGI Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
|
136
|
+
Valid Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
|
137
|
+
Well-formed Tag Sequence | 😴 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
|
138
|
+
|
139
|
+
Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, and explanations.
|
140
|
+
|
141
|
+
More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
|
94
142
|
|
95
|
-
|
143
|
+
### Extended Pictographic Regex
|
96
144
|
|
97
145
|
`Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
|
98
146
|
|
@@ -100,18 +148,13 @@ Regex | Description | Example Matches | Example Non-Matc
|
|
100
148
|
|
101
149
|
See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
|
102
150
|
|
103
|
-
|
104
|
-
|
105
|
-
Matches potential Emoji parts (often, this is not what you want):
|
106
|
-
|
107
|
-
Regex | Description | Example Matches | Example Non-Matches
|
108
|
-
------------------------------|-------------|-----------------|--------------------
|
109
|
-
`Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
|
151
|
+
### Partial Regexes
|
110
152
|
|
153
|
+
`Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
|
111
154
|
|
112
|
-
|
155
|
+
## Usage – List
|
113
156
|
|
114
|
-
Use `Unicode::Emoji::LIST` or the list method to get a
|
157
|
+
Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
|
115
158
|
|
116
159
|
```ruby
|
117
160
|
Unicode::Emoji.list.keys
|
@@ -124,13 +167,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
|
|
124
167
|
=> ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
|
125
168
|
```
|
126
169
|
|
127
|
-
Please note that categories might change with future versions of the Emoji standard
|
170
|
+
Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
|
128
171
|
|
129
172
|
A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
|
130
173
|
|
131
|
-
|
174
|
+
## Usage – Properties Data
|
132
175
|
|
133
|
-
Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
176
|
+
Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
134
177
|
|
135
178
|
```ruby
|
136
179
|
require "unicode/emoji"
|
data/Rakefile
CHANGED
@@ -28,14 +28,18 @@ task :irb do
|
|
28
28
|
end
|
29
29
|
|
30
30
|
# # #
|
31
|
-
# Run
|
31
|
+
# Run specs
|
32
32
|
|
33
33
|
desc "#{gemspec.name} | Spec"
|
34
34
|
task :spec do
|
35
|
-
ruby "spec
|
35
|
+
ruby File.join("spec", "*_spec.rb")
|
36
36
|
end
|
37
37
|
task default: :spec
|
38
38
|
|
39
|
+
|
40
|
+
# # #
|
41
|
+
# Generate regex
|
42
|
+
|
39
43
|
desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
|
40
44
|
task :generate_constants do
|
41
45
|
load "data/generate_constants.rb", true
|
data/data/emoji.marshal.gz
CHANGED
Binary file
|
data/data/generate_constants.rb
CHANGED
@@ -68,10 +68,10 @@ def pack_and_join(ords)
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
|
71
|
+
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
|
72
72
|
emoji_presentation_sequence = \
|
73
73
|
join(
|
74
|
-
|
74
|
+
text_presentation + pack(EMOJI_VARIATION_SELECTOR),
|
75
75
|
emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
|
76
76
|
)
|
77
77
|
|
@@ -79,14 +79,20 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
79
79
|
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
80
80
|
|
81
81
|
text_keycap_sequence = \
|
82
|
-
|
82
|
+
pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
|
83
83
|
|
84
84
|
text_presentation_sequence = \
|
85
85
|
join(
|
86
|
-
|
86
|
+
text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
|
87
87
|
emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
|
88
88
|
)
|
89
89
|
|
90
|
+
text_emoji = \
|
91
|
+
join(
|
92
|
+
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
93
|
+
text_keycap_sequence,
|
94
|
+
)
|
95
|
+
|
90
96
|
emoji_modifier_sequence = \
|
91
97
|
emoji_modifier_base + emoji_modifier
|
92
98
|
|
@@ -99,22 +105,11 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
99
105
|
emoji_well_formed_flag_sequence = \
|
100
106
|
'\p{RI}{2}'
|
101
107
|
|
102
|
-
|
103
|
-
join(
|
104
|
-
# emoji_character,
|
105
|
-
emoji_keycap_sequence,
|
106
|
-
emoji_modifier_sequence,
|
107
|
-
non_component_emoji_presentation_sequence,
|
108
|
-
emoji_valid_flag_sequence,
|
109
|
-
)
|
110
|
-
|
111
|
-
emoji_well_formed_core_sequence = \
|
108
|
+
emoji_core_sequence = \
|
112
109
|
join(
|
113
|
-
# emoji_character,
|
114
110
|
emoji_keycap_sequence,
|
115
111
|
emoji_modifier_sequence,
|
116
112
|
non_component_emoji_presentation_sequence,
|
117
|
-
emoji_well_formed_flag_sequence,
|
118
113
|
)
|
119
114
|
|
120
115
|
# Sort to make sure complex sequences match first
|
@@ -144,6 +139,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
144
139
|
emoji_rgi_zwj_sequence = \
|
145
140
|
pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
|
146
141
|
|
142
|
+
# FQE+MQE: Make VS16 optional after ZWJ has appeared
|
143
|
+
emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
144
|
+
/#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
|
145
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?"
|
146
|
+
)
|
147
|
+
|
148
|
+
# FQE+MQE+UQE: Make all VS16 optional
|
149
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
150
|
+
pack(EMOJI_VARIATION_SELECTOR),
|
151
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?",
|
152
|
+
)
|
153
|
+
|
147
154
|
emoji_valid_zwj_element = \
|
148
155
|
join(
|
149
156
|
emoji_modifier_sequence,
|
@@ -160,21 +167,68 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
160
167
|
join(
|
161
168
|
emoji_rgi_zwj_sequence,
|
162
169
|
emoji_rgi_tag_sequence,
|
163
|
-
|
170
|
+
emoji_valid_flag_sequence,
|
171
|
+
emoji_core_sequence,
|
172
|
+
)
|
173
|
+
|
174
|
+
emoji_rgi_sequence_include_text = \
|
175
|
+
join(
|
176
|
+
emoji_rgi_zwj_sequence,
|
177
|
+
emoji_rgi_tag_sequence,
|
178
|
+
emoji_valid_flag_sequence,
|
179
|
+
emoji_core_sequence,
|
180
|
+
text_emoji,
|
181
|
+
)
|
182
|
+
|
183
|
+
emoji_rgi_include_mqe_sequence = \
|
184
|
+
join(
|
185
|
+
emoji_rgi_include_mqe_zwj_sequence,
|
186
|
+
emoji_rgi_tag_sequence,
|
187
|
+
emoji_valid_flag_sequence,
|
188
|
+
emoji_core_sequence,
|
189
|
+
)
|
190
|
+
|
191
|
+
emoji_rgi_include_mqe_uqe_sequence = \
|
192
|
+
join(
|
193
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence,
|
194
|
+
text_emoji, # also uqe
|
195
|
+
emoji_rgi_tag_sequence,
|
196
|
+
emoji_valid_flag_sequence,
|
197
|
+
emoji_core_sequence,
|
164
198
|
)
|
165
199
|
|
166
200
|
emoji_valid_sequence = \
|
167
201
|
join(
|
168
202
|
emoji_valid_zwj_sequence,
|
169
203
|
emoji_valid_tag_sequence,
|
170
|
-
|
204
|
+
emoji_valid_flag_sequence,
|
205
|
+
emoji_core_sequence,
|
206
|
+
)
|
207
|
+
|
208
|
+
emoji_valid_sequence_include_text = \
|
209
|
+
join(
|
210
|
+
emoji_valid_zwj_sequence,
|
211
|
+
emoji_valid_tag_sequence,
|
212
|
+
emoji_valid_flag_sequence,
|
213
|
+
emoji_core_sequence,
|
214
|
+
text_emoji,
|
171
215
|
)
|
172
216
|
|
173
217
|
emoji_well_formed_sequence = \
|
174
218
|
join(
|
175
219
|
emoji_valid_zwj_sequence,
|
176
220
|
emoji_well_formed_tag_sequence,
|
177
|
-
|
221
|
+
emoji_well_formed_flag_sequence,
|
222
|
+
emoji_core_sequence,
|
223
|
+
)
|
224
|
+
|
225
|
+
emoji_well_formed_sequence_include_text = \
|
226
|
+
join(
|
227
|
+
emoji_valid_zwj_sequence,
|
228
|
+
emoji_well_formed_tag_sequence,
|
229
|
+
emoji_well_formed_flag_sequence,
|
230
|
+
emoji_core_sequence,
|
231
|
+
text_emoji,
|
178
232
|
)
|
179
233
|
|
180
234
|
emoji_possible_modification = \
|
@@ -198,41 +252,42 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
198
252
|
# Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
199
253
|
regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
|
200
254
|
|
255
|
+
# rgi + singleton text
|
256
|
+
regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
|
257
|
+
|
258
|
+
# Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
259
|
+
# Also make VS16 optional if not at first emoji character
|
260
|
+
regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
|
261
|
+
|
262
|
+
# Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
263
|
+
# Also make VS16 optional even at first emoji character
|
264
|
+
regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
|
265
|
+
|
201
266
|
# Matches basic singleton emoji and all kinds of valid sequences
|
202
267
|
regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
|
203
268
|
|
269
|
+
# valid + singleton text
|
270
|
+
regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
|
271
|
+
|
204
272
|
# Matches basic singleton emoji and all kinds of sequences
|
205
273
|
regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
|
274
|
+
|
275
|
+
# well-formed + singleton text
|
276
|
+
regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
|
206
277
|
|
207
278
|
# Quick test which might lead to false positives
|
208
279
|
# See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
|
209
280
|
regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
|
210
281
|
|
211
|
-
# Matches only basic single, non-textual emoji
|
212
|
-
|
213
|
-
regexes[:REGEX_BASIC] = Regexp.compile(
|
214
|
-
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
215
|
-
)
|
282
|
+
# Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
|
283
|
+
regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
|
216
284
|
|
217
|
-
# Matches only basic single, textual emoji
|
218
|
-
|
219
|
-
regexes[:REGEX_TEXT] = Regexp.compile(
|
220
|
-
join(
|
221
|
-
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
222
|
-
text_keycap_sequence,
|
223
|
-
)
|
224
|
-
)
|
285
|
+
# Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
|
286
|
+
regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
|
225
287
|
|
226
|
-
#
|
288
|
+
# Same as \p{Emoji} - to be removed or renamed
|
227
289
|
regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
|
228
290
|
|
229
|
-
# Combined REGEXes which also match for TEXTUAL emoji
|
230
|
-
regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
|
231
|
-
|
232
|
-
regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
|
233
|
-
|
234
|
-
regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
|
235
|
-
|
236
291
|
regexes[:REGEX_PICTO] = Regexp.compile(picto)
|
237
292
|
|
238
293
|
regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
|
@@ -246,6 +301,7 @@ regexes = compile(
|
|
246
301
|
emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
|
247
302
|
emoji_component: pack_and_join(EMOJI_COMPONENT),
|
248
303
|
emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
|
304
|
+
text_presentation: pack_and_join(TEXT_PRESENTATION),
|
249
305
|
picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
|
250
306
|
picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
|
251
307
|
)
|
@@ -257,6 +313,7 @@ native_regexes = compile(
|
|
257
313
|
emoji_modifier_base: "\\p{EBase}",
|
258
314
|
emoji_component: "\\p{EComp}",
|
259
315
|
emoji_presentation: "\\p{EPres}",
|
316
|
+
text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
|
260
317
|
picto: "\\p{ExtPict}",
|
261
318
|
picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
|
262
319
|
)
|
@@ -2,12 +2,13 @@
|
|
2
2
|
|
3
3
|
module Unicode
|
4
4
|
module Emoji
|
5
|
-
VERSION = "3.
|
5
|
+
VERSION = "3.8.0"
|
6
6
|
EMOJI_VERSION = "16.0"
|
7
7
|
CLDR_VERSION = "45"
|
8
8
|
DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
|
9
9
|
INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
|
10
10
|
|
11
|
+
# Unicode properties, see https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt
|
11
12
|
PROPERTY_NAMES = {
|
12
13
|
E: "Emoji",
|
13
14
|
B: "Emoji_Modifier_Base",
|
@@ -17,13 +18,28 @@ module Unicode
|
|
17
18
|
X: "Extended_Pictographic",
|
18
19
|
}.freeze
|
19
20
|
|
21
|
+
# Variation Selector 16 (VS16), enables emoji presentation mode for preceding codepoint
|
20
22
|
EMOJI_VARIATION_SELECTOR = 0xFE0F
|
23
|
+
|
24
|
+
# Variation Selector 15 (VS15), enables text presentation mode for preceding codepoint
|
21
25
|
TEXT_VARIATION_SELECTOR = 0xFE0E
|
26
|
+
|
27
|
+
# First codepoint of tag-based subdivision flags
|
22
28
|
EMOJI_TAG_BASE_FLAG = 0x1F3F4
|
29
|
+
|
30
|
+
# Last codepoint of tag-based subdivision flags
|
23
31
|
CANCEL_TAG = 0xE007F
|
32
|
+
|
33
|
+
# Tag characters allowed in tag-based subdivision flags
|
24
34
|
SPEC_TAGS = [*0xE0030..0xE0039, *0xE0061..0xE007A].freeze
|
35
|
+
|
36
|
+
# Combining Enclosing Keycap character
|
25
37
|
EMOJI_KEYCAP_SUFFIX = 0x20E3
|
38
|
+
|
39
|
+
# Zero-width-joiner to enable combination of multiple Emoji in a sequence
|
26
40
|
ZWJ = 0x200D
|
41
|
+
|
42
|
+
# Two regional indicators make up a region
|
27
43
|
REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
|
28
44
|
end
|
29
45
|
end
|