unicode-emoji 3.6.0 → 3.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +24 -1
- data/README.md +105 -63
- data/Rakefile +6 -2
- data/data/emoji.marshal.gz +0 -0
- data/data/generate_constants.rb +120 -46
- data/lib/unicode/emoji/constants.rb +18 -2
- data/lib/unicode/emoji/generated/regex.rb +1 -1
- data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_possible.rb +8 -0
- data/lib/unicode/emoji/generated/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_possible.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/lazy_constants.rb +37 -6
- data/lib/unicode/emoji/list.rb +13 -0
- data/lib/unicode/emoji.rb +38 -5
- data/spec/data/.keep +0 -0
- data/spec/data/emoji-test.txt +5331 -0
- data/spec/emoji_test_txt_spec.rb +181 -0
- data/spec/unicode_emoji_spec.rb +152 -5
- data/unicode-emoji.gemspec +3 -3
- metadata +20 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
|
4
|
+
data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
|
7
|
+
data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,28 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
### 3.8.0
|
4
|
+
|
5
|
+
- Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
|
6
|
+
for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
|
7
|
+
- Add specs running through `emoji-test.txt` and classify qualification statuses per regex
|
8
|
+
- Improve documentation and add detailed table about which regex has which features
|
9
|
+
- Native regexes: Use native Emoji props for Emoji text presentation
|
10
|
+
- Update CLDR to v46 (valid subdivisions)
|
11
|
+
- Further improvements (see commit log)
|
12
|
+
|
13
|
+
### 3.7.0
|
14
|
+
|
15
|
+
- Bump required Ruby slightly to 2.5
|
16
|
+
- Introduce new `REGEX_POSSIBLE` which contains the regex described in
|
17
|
+
https://www.unicode.org/reports/tr51/#EBNF_and_Regex
|
18
|
+
- Fix that some valid subdivisions were not decompressed (`REGEX_VALID`)
|
19
|
+
- Be stricter about selection of tag characters in `REGEX_WELL_FORMED`
|
20
|
+
- Only U+E0030..U+E0039, U+E0061..U+E007A allowed
|
21
|
+
- Max tag sequence length
|
22
|
+
- Use native `/\p{RI}/` regex for regional indicators
|
23
|
+
- Separately autoload emoji list, so it can be loaded when other indexes
|
24
|
+
are not needed
|
25
|
+
|
3
26
|
### 3.6.0
|
4
27
|
|
5
28
|
- `Unicode::Emoji::REGEX_TEXT` now matches non-emoji keycaps like "3⃣" (U+0033 U+20E3)
|
@@ -16,7 +39,7 @@
|
|
16
39
|
### 3.3.2
|
17
40
|
|
18
41
|
- Update valid subdivisions to CLDR 43 (no changes)
|
19
|
-
-> there won't be any new subdivision flags in Emoji
|
42
|
+
-> there won't be any new RGI subdivision flags in Emoji
|
20
43
|
|
21
44
|
### 3.3.1
|
22
45
|
|
data/README.md
CHANGED
@@ -1,18 +1,15 @@
|
|
1
1
|
# Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
|
2
2
|
|
3
|
-
Provides
|
3
|
+
Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
|
4
4
|
|
5
|
-
|
5
|
+
Additional features:
|
6
6
|
|
7
|
-
Emoji
|
8
|
-
|
9
|
-
CLDR version (used for sub-region flags): **45** (April 2024)
|
10
|
-
|
11
|
-
Supported Rubies: **3.x**
|
7
|
+
- A categorized list of Emoji (RGI: Recommended for General Interchange)
|
8
|
+
- Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
|
12
9
|
|
13
|
-
|
10
|
+
Emoji version: **16.0** (September 2024)
|
14
11
|
|
15
|
-
|
12
|
+
CLDR version (used for sub-region flags): **46** (October 2024)
|
16
13
|
|
17
14
|
## Gemfile
|
18
15
|
|
@@ -20,16 +17,14 @@ If you are stuck on an older Ruby version, checkout the latest [0.9 version](htt
|
|
20
17
|
gem "unicode-emoji"
|
21
18
|
```
|
22
19
|
|
23
|
-
## Usage
|
24
|
-
|
25
|
-
### Regex
|
20
|
+
## Usage – Regex Matching
|
26
21
|
|
27
|
-
The gem includes
|
22
|
+
The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
|
28
23
|
|
29
24
|
```ruby
|
30
25
|
require "unicode/emoji"
|
31
26
|
|
32
|
-
string = "String which contains all
|
27
|
+
string = "String which contains all types of Emoji sequences:
|
33
28
|
|
34
29
|
- Singleton Emoji: 😴
|
35
30
|
- Textual singleton Emoji with Emoji variation: ▶️
|
@@ -38,62 +33,114 @@ string = "String which contains all kinds of emoji:
|
|
38
33
|
- Sub-Region flag: 🏴
|
39
34
|
- Keycap sequence: 2️⃣
|
40
35
|
- Sequence using ZWJ (zero width joiner): 🤾🏽♀️
|
41
|
-
|
42
36
|
"
|
43
37
|
|
44
38
|
string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴", "2️⃣", "🤾🏽♀️"]
|
45
39
|
```
|
46
40
|
|
47
|
-
|
41
|
+
Depending on your exact use case, you can choose between multiple levels of Emoji detection:
|
48
42
|
|
49
|
-
|
43
|
+
### Main Regexes
|
50
44
|
|
51
45
|
Regex | Description | Example Matches | Example Non-Matches
|
52
46
|
------------------------------|-------------|-----------------|--------------------
|
53
|
-
`Unicode::Emoji::REGEX` | **Use this if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all
|
54
|
-
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all
|
55
|
-
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all
|
47
|
+
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️` | `🤾🏽♀`, `🏌♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
48
|
+
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀` ,`🏌♂️`, `🤠🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
|
49
|
+
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`,`🏌♂️` , `🤠🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
|
50
|
+
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
|
56
51
|
|
57
|
-
|
52
|
+
#### Include Text Emoji
|
58
53
|
|
59
|
-
|
60
|
-
- If you want broader matching (e.g. more sub-regions), choose `REGEX_VALID`
|
61
|
-
- If you even want to match for invalid sequences, too, use `REGEX_WELL_FORMED`
|
54
|
+
By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
|
62
55
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
Region "🇵🇵" | No | No | Yes
|
69
|
-
Tag Sequence "🏴" | Yes | Yes | Yes
|
70
|
-
Tag Sequence "🏴" | No | Yes | Yes
|
71
|
-
Tag Sequence "😴" | No | No | Yes
|
72
|
-
ZWJ Sequence "🤾🏽♀️" | Yes | Yes | Yes
|
73
|
-
ZWJ Sequence "🤠🤢" | No | Yes | Yes
|
74
|
-
|
75
|
-
More info about valid vs. recommended Emoji in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
|
76
|
-
|
77
|
-
#### Singleton Regexes
|
56
|
+
Regex | Description | Example Matches | Example Non-Matches
|
57
|
+
------------------------------|-------------|-----------------|--------------------
|
58
|
+
`Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽♀`, `🏌♂️`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`
|
59
|
+
`Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
|
60
|
+
`Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
|
78
61
|
|
79
|
-
|
62
|
+
#### Minimally-qualified and Unqualified Sequences
|
80
63
|
|
81
64
|
Regex | Description | Example Matches | Example Non-Matches
|
82
65
|
------------------------------|-------------|-----------------|--------------------
|
83
|
-
`Unicode::Emoji::
|
84
|
-
`Unicode::Emoji::
|
66
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀` | `🏌♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
67
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
85
68
|
|
86
|
-
|
69
|
+
[List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
|
87
70
|
|
88
|
-
|
71
|
+
#### Singleton Regexes
|
72
|
+
|
73
|
+
Matches only simple one-codepoint (+ optional variation selector) Emoji:
|
89
74
|
|
90
75
|
Regex | Description | Example Matches | Example Non-Matches
|
91
76
|
------------------------------|-------------|-----------------|--------------------
|
92
|
-
`Unicode::Emoji::
|
93
|
-
`Unicode::Emoji::
|
94
|
-
|
95
|
-
|
96
|
-
|
77
|
+
`Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
78
|
+
`Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
79
|
+
|
80
|
+
Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
|
81
|
+
|
82
|
+
While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
|
83
|
+
|
84
|
+
### Comparison
|
85
|
+
|
86
|
+
1) Fully-qualified RGI Emoji ZWJ sequence
|
87
|
+
2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
|
88
|
+
3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
|
89
|
+
4) Non-RGI Emoji ZWJ sequence
|
90
|
+
5) Valid Region made from a pair of Regional Indicators
|
91
|
+
6) Any Region made from a pair of Regional Indicators
|
92
|
+
7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
|
93
|
+
8) Valid Flag Emoji Tag Sequences (any known subdivision)
|
94
|
+
9) Any Emoji Tag Sequences (any tag sequence with any base)
|
95
|
+
10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
|
96
|
+
11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
|
97
|
+
12) Non-Emoji (unqualified) keycap
|
98
|
+
|
99
|
+
Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Region | 6 Any Region | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Keycap
|
100
|
+
-|-|-|-|-|-|-|-|-|-|-|-|-
|
101
|
+
REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
102
|
+
REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
103
|
+
REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
104
|
+
REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
105
|
+
REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
|
106
|
+
REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
|
107
|
+
REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
|
108
|
+
REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
|
109
|
+
REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
|
110
|
+
REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
|
111
|
+
REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
|
112
|
+
|
113
|
+
¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
|
114
|
+
|
115
|
+
See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
|
116
|
+
|
117
|
+
### Picking the Right Emoji Regex
|
118
|
+
|
119
|
+
- Usually you just want `REGEX` (recommended Emoji set, RGI)
|
120
|
+
- Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
|
121
|
+
- If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
|
122
|
+
- If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
|
123
|
+
- Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
|
124
|
+
- And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might contain false positives; however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
|
125
|
+
|
126
|
+
### Examples
|
127
|
+
|
128
|
+
Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
|
129
|
+
-----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
|
130
|
+
RGI ZWJ Sequence | 🤾🏽♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
|
131
|
+
RGI ZWJ Sequence MQE | 🤾🏽♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
|
132
|
+
Valid ZWJ Sequence, Non-RGI | 🤠🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
|
133
|
+
Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
|
134
|
+
Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
|
135
|
+
RGI Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
|
136
|
+
Valid Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
|
137
|
+
Well-formed Tag Sequence | 😴 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
|
138
|
+
|
139
|
+
Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
|
140
|
+
|
141
|
+
More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
|
142
|
+
|
143
|
+
### Extended Pictographic Regex
|
97
144
|
|
98
145
|
`Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
|
99
146
|
|
@@ -101,18 +148,13 @@ Regex | Description | Example Matches | Example Non-Matc
|
|
101
148
|
|
102
149
|
See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
|
103
150
|
|
104
|
-
|
105
|
-
|
106
|
-
Matches potential Emoji parts (often, this is not what you want):
|
107
|
-
|
108
|
-
Regex | Description | Example Matches | Example Non-Matches
|
109
|
-
------------------------------|-------------|-----------------|--------------------
|
110
|
-
`Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
|
151
|
+
### Partial Regexes
|
111
152
|
|
153
|
+
`Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
|
112
154
|
|
113
|
-
|
155
|
+
## Usage – List
|
114
156
|
|
115
|
-
Use `Unicode::Emoji::LIST` or the list method to get a
|
157
|
+
Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
|
116
158
|
|
117
159
|
```ruby
|
118
160
|
Unicode::Emoji.list.keys
|
@@ -125,13 +167,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
|
|
125
167
|
=> ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
|
126
168
|
```
|
127
169
|
|
128
|
-
Please note that categories might change with future versions of the Emoji standard
|
170
|
+
Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
|
129
171
|
|
130
|
-
A list of all Emoji can be found at [character.construction](https://character.construction).
|
172
|
+
A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
|
131
173
|
|
132
|
-
|
174
|
+
## Usage – Properties Data
|
133
175
|
|
134
|
-
Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/
|
176
|
+
Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
135
177
|
|
136
178
|
```ruby
|
137
179
|
require "unicode/emoji"
|
@@ -143,7 +185,7 @@ Unicode::Emoji.properties "☝" # => ["Emoji", "Emoji_Modifier_Base"]
|
|
143
185
|
|
144
186
|
- [Unicode® Technical Standard #51](https://www.unicode.org/reports/tr51/)
|
145
187
|
- [Emoji categories](https://unicode.org/emoji/charts/emoji-ordering.html)
|
146
|
-
- Ruby gem which displays [Emoji sequence names](https://github.com/janlelis/unicode-sequence_name) (
|
188
|
+
- Ruby gem which displays [Emoji sequence names](https://github.com/janlelis/unicode-sequence_name) ([as website](https://character.construction/name))
|
147
189
|
- Part of [unicode-x](https://github.com/janlelis/unicode-x)
|
148
190
|
|
149
191
|
## MIT
|
data/Rakefile
CHANGED
@@ -28,14 +28,18 @@ task :irb do
|
|
28
28
|
end
|
29
29
|
|
30
30
|
# # #
|
31
|
-
# Run
|
31
|
+
# Run specs
|
32
32
|
|
33
33
|
desc "#{gemspec.name} | Spec"
|
34
34
|
task :spec do
|
35
|
-
ruby "spec
|
35
|
+
ruby File.join("spec", "*_spec.rb")
|
36
36
|
end
|
37
37
|
task default: :spec
|
38
38
|
|
39
|
+
|
40
|
+
# # #
|
41
|
+
# Generate regex
|
42
|
+
|
39
43
|
desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
|
40
44
|
task :generate_constants do
|
41
45
|
load "data/generate_constants.rb", true
|
data/data/emoji.marshal.gz
CHANGED
Binary file
|
data/data/generate_constants.rb
CHANGED
@@ -68,10 +68,10 @@ def pack_and_join(ords)
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
|
71
|
+
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
|
72
72
|
emoji_presentation_sequence = \
|
73
73
|
join(
|
74
|
-
|
74
|
+
text_presentation + pack(EMOJI_VARIATION_SELECTOR),
|
75
75
|
emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
|
76
76
|
)
|
77
77
|
|
@@ -79,14 +79,20 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
79
79
|
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
80
80
|
|
81
81
|
text_keycap_sequence = \
|
82
|
-
|
82
|
+
pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
|
83
83
|
|
84
84
|
text_presentation_sequence = \
|
85
85
|
join(
|
86
|
-
|
86
|
+
text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
|
87
87
|
emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
|
88
88
|
)
|
89
89
|
|
90
|
+
text_emoji = \
|
91
|
+
join(
|
92
|
+
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
93
|
+
text_keycap_sequence,
|
94
|
+
)
|
95
|
+
|
90
96
|
emoji_modifier_sequence = \
|
91
97
|
emoji_modifier_base + emoji_modifier
|
92
98
|
|
@@ -97,27 +103,13 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
97
103
|
pack_and_join(VALID_REGION_FLAGS)
|
98
104
|
|
99
105
|
emoji_well_formed_flag_sequence = \
|
100
|
-
|
101
|
-
pack_and_join(REGIONAL_INDICATORS) +
|
102
|
-
pack_and_join(REGIONAL_INDICATORS) +
|
103
|
-
")"
|
104
|
-
|
105
|
-
emoji_valid_core_sequence = \
|
106
|
-
join(
|
107
|
-
# emoji_character,
|
108
|
-
emoji_keycap_sequence,
|
109
|
-
emoji_modifier_sequence,
|
110
|
-
non_component_emoji_presentation_sequence,
|
111
|
-
emoji_valid_flag_sequence,
|
112
|
-
)
|
106
|
+
'\p{RI}{2}'
|
113
107
|
|
114
|
-
|
108
|
+
emoji_core_sequence = \
|
115
109
|
join(
|
116
|
-
# emoji_character,
|
117
110
|
emoji_keycap_sequence,
|
118
111
|
emoji_modifier_sequence,
|
119
112
|
non_component_emoji_presentation_sequence,
|
120
|
-
emoji_well_formed_flag_sequence,
|
121
113
|
)
|
122
114
|
|
123
115
|
# Sort to make sure complex sequences match first
|
@@ -128,7 +120,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
128
120
|
"(?:" +
|
129
121
|
pack(EMOJI_TAG_BASE_FLAG) +
|
130
122
|
"(?:" + VALID_SUBDIVISIONS.sort_by(&:length).reverse.map{ |sd|
|
131
|
-
|
123
|
+
sd.tr("\u{30}-\u{39}\u{61}-\u{7A}", "\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}")
|
132
124
|
}.join("|") + ")" +
|
133
125
|
pack(CANCEL_TAG) +
|
134
126
|
")"
|
@@ -139,7 +131,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
139
131
|
non_component_emoji_presentation_sequence,
|
140
132
|
emoji_modifier_sequence,
|
141
133
|
) +
|
142
|
-
pack_and_join(
|
134
|
+
pack_and_join(SPEC_TAGS) + "{1,30}" +
|
143
135
|
pack(CANCEL_TAG) +
|
144
136
|
")"
|
145
137
|
|
@@ -147,6 +139,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
147
139
|
emoji_rgi_zwj_sequence = \
|
148
140
|
pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
|
149
141
|
|
142
|
+
# FQE+MQE: Make VS16 optional after ZWJ has appeared
|
143
|
+
emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
144
|
+
/#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
|
145
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?"
|
146
|
+
)
|
147
|
+
|
148
|
+
# FQE+MQE+UQE: Make all VS16 optional
|
149
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
150
|
+
pack(EMOJI_VARIATION_SELECTOR),
|
151
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?",
|
152
|
+
)
|
153
|
+
|
150
154
|
emoji_valid_zwj_element = \
|
151
155
|
join(
|
152
156
|
emoji_modifier_sequence,
|
@@ -163,58 +167,126 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
163
167
|
join(
|
164
168
|
emoji_rgi_zwj_sequence,
|
165
169
|
emoji_rgi_tag_sequence,
|
166
|
-
|
170
|
+
emoji_valid_flag_sequence,
|
171
|
+
emoji_core_sequence,
|
172
|
+
)
|
173
|
+
|
174
|
+
emoji_rgi_sequence_include_text = \
|
175
|
+
join(
|
176
|
+
emoji_rgi_zwj_sequence,
|
177
|
+
emoji_rgi_tag_sequence,
|
178
|
+
emoji_valid_flag_sequence,
|
179
|
+
emoji_core_sequence,
|
180
|
+
text_emoji,
|
181
|
+
)
|
182
|
+
|
183
|
+
emoji_rgi_include_mqe_sequence = \
|
184
|
+
join(
|
185
|
+
emoji_rgi_include_mqe_zwj_sequence,
|
186
|
+
emoji_rgi_tag_sequence,
|
187
|
+
emoji_valid_flag_sequence,
|
188
|
+
emoji_core_sequence,
|
189
|
+
)
|
190
|
+
|
191
|
+
emoji_rgi_include_mqe_uqe_sequence = \
|
192
|
+
join(
|
193
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence,
|
194
|
+
text_emoji, # also uqe
|
195
|
+
emoji_rgi_tag_sequence,
|
196
|
+
emoji_valid_flag_sequence,
|
197
|
+
emoji_core_sequence,
|
167
198
|
)
|
168
199
|
|
169
200
|
emoji_valid_sequence = \
|
170
201
|
join(
|
171
202
|
emoji_valid_zwj_sequence,
|
172
203
|
emoji_valid_tag_sequence,
|
173
|
-
|
204
|
+
emoji_valid_flag_sequence,
|
205
|
+
emoji_core_sequence,
|
206
|
+
)
|
207
|
+
|
208
|
+
emoji_valid_sequence_include_text = \
|
209
|
+
join(
|
210
|
+
emoji_valid_zwj_sequence,
|
211
|
+
emoji_valid_tag_sequence,
|
212
|
+
emoji_valid_flag_sequence,
|
213
|
+
emoji_core_sequence,
|
214
|
+
text_emoji,
|
174
215
|
)
|
175
216
|
|
176
217
|
emoji_well_formed_sequence = \
|
177
218
|
join(
|
178
219
|
emoji_valid_zwj_sequence,
|
179
220
|
emoji_well_formed_tag_sequence,
|
180
|
-
|
221
|
+
emoji_well_formed_flag_sequence,
|
222
|
+
emoji_core_sequence,
|
223
|
+
)
|
224
|
+
|
225
|
+
emoji_well_formed_sequence_include_text = \
|
226
|
+
join(
|
227
|
+
emoji_valid_zwj_sequence,
|
228
|
+
emoji_well_formed_tag_sequence,
|
229
|
+
emoji_well_formed_flag_sequence,
|
230
|
+
emoji_core_sequence,
|
231
|
+
text_emoji,
|
232
|
+
)
|
233
|
+
|
234
|
+
emoji_possible_modification = \
|
235
|
+
join(
|
236
|
+
emoji_modifier,
|
237
|
+
pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?",
|
238
|
+
"[-]+" # raw tags
|
239
|
+
)
|
240
|
+
|
241
|
+
emoji_possible_zwj_element = \
|
242
|
+
join(
|
243
|
+
emoji_well_formed_flag_sequence,
|
244
|
+
emoji_character + emoji_possible_modification + "?"
|
181
245
|
)
|
182
246
|
|
247
|
+
emoji_possible = \
|
248
|
+
emoji_possible_zwj_element + "(?:" + pack(ZWJ) + emoji_possible_zwj_element + ")*"
|
249
|
+
|
183
250
|
regexes = {}
|
184
251
|
|
185
252
|
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
186
253
|
regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
|
187
254
|
|
255
|
+
# rgi + singleton text
|
256
|
+
regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
|
257
|
+
|
258
|
+
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
259
|
+
# Also make VS16 optional if not at first emoji character
|
260
|
+
regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
|
261
|
+
|
262
|
+
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
263
|
+
# Also make VS16 optional even at first emoji character
|
264
|
+
regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
|
265
|
+
|
188
266
|
# Matches basic singleton emoji and all kind of valid sequences
|
189
267
|
regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
|
190
268
|
|
269
|
+
# valid + singleton text
|
270
|
+
regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
|
271
|
+
|
191
272
|
# Matches basic singleton emoji and all kind of sequences
|
192
273
|
regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
|
274
|
+
|
275
|
+
# well-formed + singleton text
|
276
|
+
regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
|
193
277
|
|
194
|
-
#
|
195
|
-
#
|
196
|
-
regexes[:
|
197
|
-
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
198
|
-
)
|
278
|
+
# Quick test which might lead to false positives
|
279
|
+
# See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
|
280
|
+
regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
|
199
281
|
|
200
|
-
# Matches only basic single, textual emoji
|
201
|
-
|
202
|
-
regexes[:REGEX_TEXT] = Regexp.compile(
|
203
|
-
join(
|
204
|
-
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
205
|
-
text_keycap_sequence,
|
206
|
-
)
|
207
|
-
)
|
282
|
+
# Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
|
283
|
+
regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
|
208
284
|
|
209
|
-
# Matches
|
210
|
-
regexes[:
|
211
|
-
|
212
|
-
# Combined REGEXes which also match for TEXTUAL emoji
|
213
|
-
regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
|
285
|
+
# Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
|
286
|
+
regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
|
214
287
|
|
215
|
-
|
216
|
-
|
217
|
-
regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
|
288
|
+
# Same as \p{Emoji} - to be removed or renamed
|
289
|
+
regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
|
218
290
|
|
219
291
|
regexes[:REGEX_PICTO] = Regexp.compile(picto)
|
220
292
|
|
@@ -229,6 +301,7 @@ regexes = compile(
|
|
229
301
|
emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
|
230
302
|
emoji_component: pack_and_join(EMOJI_COMPONENT),
|
231
303
|
emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
|
304
|
+
text_presentation: pack_and_join(TEXT_PRESENTATION),
|
232
305
|
picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
|
233
306
|
picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
|
234
307
|
)
|
@@ -240,6 +313,7 @@ native_regexes = compile(
|
|
240
313
|
emoji_modifier_base: "\\p{EBase}",
|
241
314
|
emoji_component: "\\p{EComp}",
|
242
315
|
emoji_presentation: "\\p{EPres}",
|
316
|
+
text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
|
243
317
|
picto: "\\p{ExtPict}",
|
244
318
|
picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
|
245
319
|
)
|
@@ -2,12 +2,13 @@
|
|
2
2
|
|
3
3
|
module Unicode
|
4
4
|
module Emoji
|
5
|
-
VERSION = "3.
|
5
|
+
VERSION = "3.8.0"
|
6
6
|
EMOJI_VERSION = "16.0"
|
7
7
|
CLDR_VERSION = "45"
|
8
8
|
DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
|
9
9
|
INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
|
10
10
|
|
11
|
+
# Unicode properties, see https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt
|
11
12
|
PROPERTY_NAMES = {
|
12
13
|
E: "Emoji",
|
13
14
|
B: "Emoji_Modifier_Base",
|
@@ -17,13 +18,28 @@ module Unicode
|
|
17
18
|
X: "Extended_Pictographic",
|
18
19
|
}.freeze
|
19
20
|
|
21
|
+
# Variation Selector 16 (VS16), enables emoji presentation mode for preceding codepoint
|
20
22
|
EMOJI_VARIATION_SELECTOR = 0xFE0F
|
23
|
+
|
24
|
+
# Variation Selector 15 (VS15), enables text presentation mode for preceding codepoint
|
21
25
|
TEXT_VARIATION_SELECTOR = 0xFE0E
|
26
|
+
|
27
|
+
# First codepoint of tag-based subdivision flags
|
22
28
|
EMOJI_TAG_BASE_FLAG = 0x1F3F4
|
29
|
+
|
30
|
+
# Last codepoint of tag-based subdivision flags
|
23
31
|
CANCEL_TAG = 0xE007F
|
24
|
-
|
32
|
+
|
33
|
+
# Tags characters allowed in tag-based subdivision flags
|
34
|
+
SPEC_TAGS = [*0xE0030..0xE0039, *0xE0061..0xE007A].freeze
|
35
|
+
|
36
|
+
# Combining Enclosing Keycap character
|
25
37
|
EMOJI_KEYCAP_SUFFIX = 0x20E3
|
38
|
+
|
39
|
+
# Zero-width-joiner to enable combination of multiple Emoji in a sequence
|
26
40
|
ZWJ = 0x200D
|
41
|
+
|
42
|
+
# Two regional indicators make up a region
|
27
43
|
REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
|
28
44
|
end
|
29
45
|
end
|