unicode-emoji 3.7.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rake_tasks +1 -0
- data/CHANGELOG.md +24 -1
- data/Gemfile.lock +3 -3
- data/README.md +111 -58
- data/Rakefile +6 -2
- data/data/emoji.marshal.gz +0 -0
- data/data/generate_constants.rb +123 -43
- data/lib/unicode/emoji/constants.rb +22 -2
- data/lib/unicode/emoji/generated/regex.rb +1 -1
- data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
- data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
- data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
- data/lib/unicode/emoji/generated/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
- data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
- data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
- data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
- data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
- data/lib/unicode/emoji/lazy_constants.rb +36 -0
- data/lib/unicode/emoji/list.rb +3 -0
- data/lib/unicode/emoji.rb +39 -6
- data/spec/data/.keep +0 -0
- data/spec/data/emoji-test.txt +5331 -0
- data/spec/emoji_test_txt_spec.rb +181 -0
- data/spec/unicode_emoji_spec.rb +127 -14
- metadata +24 -4
- data/lib/unicode/emoji/generated/regex_any.rb +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5cc77126aeb986e5645ebce97fc3e1be5835b9e5adb9764af980e0175d7d9284
|
4
|
+
data.tar.gz: ffd47af7c556ee26951f6c5f30122ecd44520f1a3aa6856c526c4e4dec050e77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 810730ae1a796edc6c80c87d0d4e7c0a5bfdbda45c43f57dfde3a05afee7ee5df8d43f378c5c0de34d9e9bca8e9edfaee67261d16057a0aacc79f50c39d703d2
|
7
|
+
data.tar.gz: f300c259b7389d60130b9f910b1301f31bf60d47c32fd16f86935cdf63c740bb44c28ab8fd38bc643f35cd6b4d7e9a76583d3be156683404439069ccf58d8250
|
data/.gitignore
CHANGED
data/.rake_tasks
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,28 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
### 4.0.0
|
4
|
+
|
5
|
+
- **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
|
6
|
+
They were previously considered to be invalid partial Emoji, however since they are supposed to be
|
7
|
+
displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
|
8
|
+
- **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
|
9
|
+
- Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
|
10
|
+
directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
|
11
|
+
For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
|
12
|
+
Also see README for a table listing the regexes that match Emoji properties.
|
13
|
+
- Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
|
14
|
+
- Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
|
15
|
+
|
16
|
+
### 3.8.0
|
17
|
+
|
18
|
+
- Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
|
19
|
+
for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
|
20
|
+
- Add specs running through `emoji-test.txt` and classify qualification statuses per regex
|
21
|
+
- Improve documentation and add detailed table about which regex has which features
|
22
|
+
- Native regexes: Use native Emoji props for Emoji text presentation
|
23
|
+
- Update CLDR to v46 (valid subdivisions)
|
24
|
+
- Further improvements (see commit log)
|
25
|
+
|
3
26
|
### 3.7.0
|
4
27
|
|
5
28
|
- Bump required Ruby slightly to 2.5
|
@@ -29,7 +52,7 @@
|
|
29
52
|
### 3.3.2
|
30
53
|
|
31
54
|
- Update valid subdivisions to CLDR 43 (no changes)
|
32
|
-
-> there won't be any new subdivision flags in Emoji
|
55
|
+
-> there won't be any new RGI subdivision flags in Emoji
|
33
56
|
|
34
57
|
### 3.3.1
|
35
58
|
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
unicode-emoji (3.7.0)
|
4
|
+
unicode-emoji (4.0.0)
|
5
5
|
unicode-version (~> 1.0)
|
6
6
|
|
7
7
|
GEM
|
@@ -20,7 +20,7 @@ GEM
|
|
20
20
|
reline (0.3.8)
|
21
21
|
io-console (~> 0.5)
|
22
22
|
stringio (3.0.8)
|
23
|
-
unicode-version (1.
|
23
|
+
unicode-version (1.4.0)
|
24
24
|
|
25
25
|
PLATFORMS
|
26
26
|
ruby
|
@@ -32,4 +32,4 @@ DEPENDENCIES
|
|
32
32
|
unicode-emoji!
|
33
33
|
|
34
34
|
BUNDLED WITH
|
35
|
-
2.
|
35
|
+
2.5.21
|
data/README.md
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
# Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
|
2
2
|
|
3
|
-
Provides regular expressions to
|
3
|
+
Provides various sophisticated regular expressions to work with Emoji in strings,
|
4
|
+
incorporating the latest Unicode / Emoji standards.
|
4
5
|
|
5
6
|
Additional features:
|
6
7
|
|
7
|
-
- A categorized list of
|
8
|
+
- A categorized list of Emoji (RGI: Recommended for General Interchange)
|
8
9
|
- Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
|
9
10
|
|
10
11
|
Emoji version: **16.0** (September 2024)
|
11
12
|
|
12
|
-
CLDR version (used for sub-region flags): **45** (April 2024)
|
13
|
+
CLDR version (used for sub-region flags): **46** (October 2024)
|
13
14
|
|
14
15
|
## Gemfile
|
15
16
|
|
@@ -17,101 +18,153 @@ CLDR version (used for sub-region flags): **45** (April 2024)
|
|
17
18
|
gem "unicode-emoji"
|
18
19
|
```
|
19
20
|
|
20
|
-
## Usage
|
21
|
-
|
22
|
-
### Regex
|
21
|
+
## Usage – Regex Matching
|
23
22
|
|
24
23
|
The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
|
25
24
|
|
26
25
|
```ruby
|
27
26
|
require "unicode/emoji"
|
28
27
|
|
29
|
-
string = "String which contains all
|
28
|
+
string = "String which contains all types of Emoji sequences:
|
30
29
|
|
31
|
-
-
|
32
|
-
- Textual
|
30
|
+
- Basic Emoji: 😴
|
31
|
+
- Textual Emoji with Emoji variation (VS16): ▶️
|
33
32
|
- Emoji with skin tone modifier: 🛌🏽
|
34
33
|
- Region flag: 🇵🇹
|
35
34
|
- Sub-Region flag: 🏴
|
36
35
|
- Keycap sequence: 2️⃣
|
36
|
+
- Skin tone modifier: 🏻
|
37
37
|
- Sequence using ZWJ (zero width joiner): 🤾🏽♀️
|
38
|
-
|
39
38
|
"
|
40
39
|
|
41
40
|
string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴", "2️⃣", "🤾🏽♀️"]
|
42
41
|
```
|
43
42
|
|
44
|
-
|
43
|
+
Depending on your exact use case, you can choose between multiple levels of Emoji detection:
|
45
44
|
|
46
|
-
|
45
|
+
### Main Regexes
|
47
46
|
|
48
47
|
Regex | Description | Example Matches | Example Non-Matches
|
49
48
|
------------------------------|-------------|-----------------|--------------------
|
50
|
-
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual)
|
51
|
-
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual)
|
52
|
-
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual)
|
53
|
-
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji,
|
54
|
-
|
55
|
-
##### Picking the Right Emoji Regex
|
56
|
-
|
57
|
-
- Usually you just want `REGEX` (RGI set)
|
58
|
-
- If you want broader matching (any ZJW sequences, more sub-region flags), choose `REGEX_VALID`
|
59
|
-
- Even brolader is `REGEX_WELL_FORMED`, which will also match any region flag and any tag sequence
|
60
|
-
- And then there is `REGEX_POSSIBLE` , which is a quick check for possible Emoji, which might contain false positives, [suggested in the Unicode Standard](https://www.unicode.org/reports/tr51/#EBNF_and_Regex)
|
61
|
-
|
62
|
-
Property | Escaped | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed) | `REGEX_POSSIBLE`
|
63
|
-
---------|---------|-----------------------------|-----------------------|-----------------------------------|-----------------
|
64
|
-
Region "🇵🇹" | `\u{1F1F5 1F1F9}` | Yes | Yes | Yes | Yes
|
65
|
-
Region "🇵🇵" | `\u{1F1F5 1F1F5}` | No | No | Yes | Yes
|
66
|
-
Tag Sequence "🏴" | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | Yes | Yes | Yes | Yes
|
67
|
-
Tag Sequence "🏴" | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | No | Yes | Yes | Yes
|
68
|
-
Tag Sequence "😴" | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | No | No | Yes | Yes
|
69
|
-
ZWJ Sequence "🤾🏽♀️" | `\u{1F93E 1F3FD 200D 2640 FE0F}` | Yes | Yes | Yes | Yes
|
70
|
-
ZWJ Sequence "🤠🤢" | `\u{1F920 200D 1F922}` | No | Yes | Yes | Yes
|
49
|
+
`Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🏻` | `🤾🏽♀`, `🏌♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
50
|
+
`Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀` ,`🏌♂️`, `🤠🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
|
51
|
+
`Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`,`🏌♂️` , `🤠🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
|
52
|
+
`Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
|
71
53
|
|
72
|
-
|
54
|
+
#### Include Text Emoji
|
73
55
|
|
74
|
-
|
56
|
+
By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
|
75
57
|
|
76
|
-
|
58
|
+
Regex | Description | Example Matches | Example Non-Matches
|
59
|
+
------------------------------|-------------|-----------------|--------------------
|
60
|
+
`Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽♀`, `🏌♂️`, `🇵🇵`, `🏴`, `🤠🤢`, `1`
|
61
|
+
`Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
|
62
|
+
`Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
|
77
63
|
|
78
|
-
|
64
|
+
#### Minimally-qualified and Unqualified Sequences
|
79
65
|
|
80
66
|
Regex | Description | Example Matches | Example Non-Matches
|
81
67
|
------------------------------|-------------|-----------------|--------------------
|
82
|
-
`Unicode::Emoji::
|
83
|
-
`Unicode::Emoji::
|
68
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏻` | `🏌♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
69
|
+
`Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴`, `🤠🤢`, `1`, `1⃣`
|
84
70
|
|
85
|
-
|
71
|
+
[List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
|
86
72
|
|
87
|
-
|
73
|
+
#### Singleton Regexes
|
74
|
+
|
75
|
+
Matches only simple one-codepoint (+ optional variation selector) Emoji:
|
88
76
|
|
89
77
|
Regex | Description | Example Matches | Example Non-Matches
|
90
78
|
------------------------------|-------------|-----------------|--------------------
|
91
|
-
`Unicode::Emoji::
|
92
|
-
`Unicode::Emoji::
|
93
|
-
|
79
|
+
`Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
80
|
+
`Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴`, `🏴`, `🤾🏽♀️`, `🤾🏽♀`, `🏌♂️`, `🤠🤢`, `1`
|
81
|
+
|
82
|
+
Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches visual Emoji components (skin tone modifiers and hair components).
|
83
|
+
|
84
|
+
While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
|
85
|
+
|
86
|
+
### Comparison
|
87
|
+
|
88
|
+
1) Fully-qualified RGI Emoji ZWJ sequence
|
89
|
+
2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
|
90
|
+
3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
|
91
|
+
4) Non-RGI Emoji ZWJ sequence
|
92
|
+
5) Valid Region made from a pair of Regional Indicators
|
93
|
+
6) Any Region made from a pair of Regional Indicators
|
94
|
+
7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
|
95
|
+
8) Valid Flag Emoji Tag Sequences (any known subdivision)
|
96
|
+
9) Any Emoji Tag Sequences (any tag sequence with any base)
|
97
|
+
10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
|
98
|
+
11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
|
99
|
+
12) Non-Emoji (unqualified) keycap
|
100
|
+
|
101
|
+
Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Region | 6 Any Region | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Keycap
|
102
|
+
-|-|-|-|-|-|-|-|-|-|-|-|-
|
103
|
+
REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
104
|
+
REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
105
|
+
REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
|
106
|
+
REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
|
107
|
+
REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
|
108
|
+
REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
|
109
|
+
REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
|
110
|
+
REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
|
111
|
+
REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
|
112
|
+
REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
|
113
|
+
REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
|
114
|
+
|
115
|
+
¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
|
116
|
+
|
117
|
+
See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
|
118
|
+
|
119
|
+
### Picking the Right Emoji Regex
|
120
|
+
|
121
|
+
- Usually you just want `REGEX` (recommended Emoji set, RGI)
|
122
|
+
- Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
|
123
|
+
- If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
|
124
|
+
- If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
|
125
|
+
- Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
|
126
|
+
- And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might contain false positives, however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
|
127
|
+
|
128
|
+
### Examples
|
129
|
+
|
130
|
+
Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
|
131
|
+
-----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
|
132
|
+
RGI ZWJ Sequence | 🤾🏽♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
|
133
|
+
RGI ZWJ Sequence MQE | 🤾🏽♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
|
134
|
+
Valid ZWJ Sequence, Non-RGI | 🤠🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
|
135
|
+
Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
|
136
|
+
Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
|
137
|
+
RGI Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
|
138
|
+
Valid Tag Sequence | 🏴 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
|
139
|
+
Well-formed Tag Sequence | 😴 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
|
94
140
|
|
95
|
-
|
141
|
+
Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
|
96
142
|
|
97
|
-
|
143
|
+
More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
|
98
144
|
|
99
|
-
|
145
|
+
### Emoji Property Regexes
|
100
146
|
|
101
|
-
|
147
|
+
Ruby includes native regex Emoji properties, as listed in the following table. You can also opt in to use the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
|
102
148
|
|
103
|
-
|
149
|
+
Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
|
150
|
+
---------------------------------------------------|------------------------------------------
|
151
|
+
`Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
|
152
|
+
`Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
|
153
|
+
`Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
|
154
|
+
`Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
|
155
|
+
`Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
|
104
156
|
|
105
|
-
|
157
|
+
#### Extended Pictographic Regex
|
106
158
|
|
107
|
-
|
108
|
-
------------------------------|-------------|-----------------|--------------------
|
109
|
-
`Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
|
159
|
+
`Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
|
110
160
|
|
161
|
+
`Unicode::Emoji::REGEX_PICTO_NO_EMOJI` matches single codepoints with the **Extended_Pictographic** property, but excludes Emoji characters.
|
162
|
+
|
163
|
+
See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
|
111
164
|
|
112
|
-
|
165
|
+
## Usage – List
|
113
166
|
|
114
|
-
Use `Unicode::Emoji::LIST` or the list method to get a
|
167
|
+
Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
|
115
168
|
|
116
169
|
```ruby
|
117
170
|
Unicode::Emoji.list.keys
|
@@ -124,13 +177,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
|
|
124
177
|
=> ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
|
125
178
|
```
|
126
179
|
|
127
|
-
Please note that categories might change with future versions of the Emoji standard
|
180
|
+
Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
|
128
181
|
|
129
182
|
A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
|
130
183
|
|
131
|
-
|
184
|
+
## Usage – Properties Data
|
132
185
|
|
133
|
-
Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
186
|
+
Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
|
134
187
|
|
135
188
|
```ruby
|
136
189
|
require "unicode/emoji"
|
data/Rakefile
CHANGED
@@ -28,14 +28,18 @@ task :irb do
|
|
28
28
|
end
|
29
29
|
|
30
30
|
# # #
|
31
|
-
# Run
|
31
|
+
# Run specs
|
32
32
|
|
33
33
|
desc "#{gemspec.name} | Spec"
|
34
34
|
task :spec do
|
35
|
-
ruby "spec
|
35
|
+
ruby File.join("spec", "*_spec.rb")
|
36
36
|
end
|
37
37
|
task default: :spec
|
38
38
|
|
39
|
+
|
40
|
+
# # #
|
41
|
+
# Generate regex
|
42
|
+
|
39
43
|
desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
|
40
44
|
task :generate_constants do
|
41
45
|
load "data/generate_constants.rb", true
|
data/data/emoji.marshal.gz
CHANGED
Binary file
|
data/data/generate_constants.rb
CHANGED
@@ -68,25 +68,39 @@ def pack_and_join(ords)
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
|
71
|
+
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
|
72
|
+
visual_component = pack_and_join(VISUAL_COMPONENT)
|
73
|
+
|
72
74
|
emoji_presentation_sequence = \
|
73
75
|
join(
|
74
|
-
|
76
|
+
text_presentation + pack(EMOJI_VARIATION_SELECTOR),
|
75
77
|
emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
|
76
78
|
)
|
77
79
|
|
78
80
|
non_component_emoji_presentation_sequence = \
|
79
81
|
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
80
82
|
|
83
|
+
basic_emoji = \
|
84
|
+
join(
|
85
|
+
non_component_emoji_presentation_sequence,
|
86
|
+
visual_component,
|
87
|
+
)
|
88
|
+
|
81
89
|
text_keycap_sequence = \
|
82
|
-
|
90
|
+
pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
|
83
91
|
|
84
92
|
text_presentation_sequence = \
|
85
93
|
join(
|
86
|
-
|
94
|
+
text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
|
87
95
|
emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
|
88
96
|
)
|
89
97
|
|
98
|
+
text_emoji = \
|
99
|
+
join(
|
100
|
+
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
101
|
+
text_keycap_sequence,
|
102
|
+
)
|
103
|
+
|
90
104
|
emoji_modifier_sequence = \
|
91
105
|
emoji_modifier_base + emoji_modifier
|
92
106
|
|
@@ -99,22 +113,11 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
99
113
|
emoji_well_formed_flag_sequence = \
|
100
114
|
'\p{RI}{2}'
|
101
115
|
|
102
|
-
|
103
|
-
join(
|
104
|
-
# emoji_character,
|
105
|
-
emoji_keycap_sequence,
|
106
|
-
emoji_modifier_sequence,
|
107
|
-
non_component_emoji_presentation_sequence,
|
108
|
-
emoji_valid_flag_sequence,
|
109
|
-
)
|
110
|
-
|
111
|
-
emoji_well_formed_core_sequence = \
|
116
|
+
emoji_core_sequence = \
|
112
117
|
join(
|
113
|
-
# emoji_character,
|
114
118
|
emoji_keycap_sequence,
|
115
119
|
emoji_modifier_sequence,
|
116
120
|
non_component_emoji_presentation_sequence,
|
117
|
-
emoji_well_formed_flag_sequence,
|
118
121
|
)
|
119
122
|
|
120
123
|
# Sort to make sure complex sequences match first
|
@@ -144,6 +147,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
144
147
|
emoji_rgi_zwj_sequence = \
|
145
148
|
pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
|
146
149
|
|
150
|
+
# FQE+MQE: Make VS16 optional after ZWJ has appeared
|
151
|
+
emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
152
|
+
/#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
|
153
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?"
|
154
|
+
)
|
155
|
+
|
156
|
+
# FQE+MQE+UQE: Make all VS16 optional
|
157
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
|
158
|
+
pack(EMOJI_VARIATION_SELECTOR),
|
159
|
+
pack(EMOJI_VARIATION_SELECTOR) + "?",
|
160
|
+
)
|
161
|
+
|
147
162
|
emoji_valid_zwj_element = \
|
148
163
|
join(
|
149
164
|
emoji_modifier_sequence,
|
@@ -160,21 +175,76 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
160
175
|
join(
|
161
176
|
emoji_rgi_zwj_sequence,
|
162
177
|
emoji_rgi_tag_sequence,
|
163
|
-
|
178
|
+
emoji_valid_flag_sequence,
|
179
|
+
emoji_core_sequence,
|
180
|
+
visual_component,
|
181
|
+
)
|
182
|
+
|
183
|
+
emoji_rgi_sequence_include_text = \
|
184
|
+
join(
|
185
|
+
emoji_rgi_zwj_sequence,
|
186
|
+
emoji_rgi_tag_sequence,
|
187
|
+
emoji_valid_flag_sequence,
|
188
|
+
emoji_core_sequence,
|
189
|
+
visual_component,
|
190
|
+
text_emoji,
|
191
|
+
)
|
192
|
+
|
193
|
+
emoji_rgi_include_mqe_sequence = \
|
194
|
+
join(
|
195
|
+
emoji_rgi_include_mqe_zwj_sequence,
|
196
|
+
emoji_rgi_tag_sequence,
|
197
|
+
emoji_valid_flag_sequence,
|
198
|
+
emoji_core_sequence,
|
199
|
+
visual_component,
|
200
|
+
)
|
201
|
+
|
202
|
+
emoji_rgi_include_mqe_uqe_sequence = \
|
203
|
+
join(
|
204
|
+
emoji_rgi_include_mqe_uqe_zwj_sequence,
|
205
|
+
text_emoji, # also uqe
|
206
|
+
emoji_rgi_tag_sequence,
|
207
|
+
emoji_valid_flag_sequence,
|
208
|
+
emoji_core_sequence,
|
209
|
+
visual_component,
|
164
210
|
)
|
165
211
|
|
166
212
|
emoji_valid_sequence = \
|
167
213
|
join(
|
168
214
|
emoji_valid_zwj_sequence,
|
169
215
|
emoji_valid_tag_sequence,
|
170
|
-
|
216
|
+
emoji_valid_flag_sequence,
|
217
|
+
emoji_core_sequence,
|
218
|
+
visual_component,
|
219
|
+
)
|
220
|
+
|
221
|
+
emoji_valid_sequence_include_text = \
|
222
|
+
join(
|
223
|
+
emoji_valid_zwj_sequence,
|
224
|
+
emoji_valid_tag_sequence,
|
225
|
+
emoji_valid_flag_sequence,
|
226
|
+
emoji_core_sequence,
|
227
|
+
visual_component,
|
228
|
+
text_emoji,
|
171
229
|
)
|
172
230
|
|
173
231
|
emoji_well_formed_sequence = \
|
174
232
|
join(
|
175
233
|
emoji_valid_zwj_sequence,
|
176
234
|
emoji_well_formed_tag_sequence,
|
177
|
-
|
235
|
+
emoji_well_formed_flag_sequence,
|
236
|
+
emoji_core_sequence,
|
237
|
+
visual_component,
|
238
|
+
)
|
239
|
+
|
240
|
+
emoji_well_formed_sequence_include_text = \
|
241
|
+
join(
|
242
|
+
emoji_valid_zwj_sequence,
|
243
|
+
emoji_well_formed_tag_sequence,
|
244
|
+
emoji_well_formed_flag_sequence,
|
245
|
+
emoji_core_sequence,
|
246
|
+
visual_component,
|
247
|
+
text_emoji,
|
178
248
|
)
|
179
249
|
|
180
250
|
emoji_possible_modification = \
|
@@ -198,45 +268,53 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
|
|
198
268
|
# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
199
269
|
regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
|
200
270
|
|
271
|
+
# rgi + singleton text
|
272
|
+
regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
|
273
|
+
|
274
|
+
# Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
275
|
+
# Also make VS16 optional if not at first emoji character
|
276
|
+
regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
|
277
|
+
|
278
|
+
# Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
|
279
|
+
# Also make VS16 optional even at first emoji character
|
280
|
+
regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
|
281
|
+
|
201
282
|
# Matches basic singleton emoji and all kind of valid sequences
|
202
283
|
regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
|
203
284
|
|
285
|
+
# valid + singleton text
|
286
|
+
regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
|
287
|
+
|
204
288
|
# Matches basic singleton emoji and all kind of sequences
|
205
289
|
regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
|
290
|
+
|
291
|
+
# well-formed + singleton text
|
292
|
+
regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
|
206
293
|
|
207
294
|
# Quick test which might lead to false positives
|
208
295
|
# See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
|
209
296
|
regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
|
210
297
|
|
211
|
-
# Matches only basic single, non-textual emoji
|
212
|
-
|
213
|
-
regexes[:REGEX_BASIC] = Regexp.compile(
|
214
|
-
"(?!" + emoji_component + ")" + emoji_presentation_sequence
|
215
|
-
)
|
298
|
+
# Matches only basic single, non-textual emoji, ignores some components like simple digits
|
299
|
+
regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
|
216
300
|
|
217
|
-
# Matches only basic single, textual emoji
|
218
|
-
|
219
|
-
regexes[:REGEX_TEXT] = Regexp.compile(
|
220
|
-
join(
|
221
|
-
"(?!" + emoji_component + ")" + text_presentation_sequence,
|
222
|
-
text_keycap_sequence,
|
223
|
-
)
|
224
|
-
)
|
225
|
-
|
226
|
-
# Matches any emoji-related codepoint - Use with caution (returns partial matches)
|
227
|
-
regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
|
301
|
+
# Matches only basic single, textual emoji, ignores components like modifiers or simple digits
|
302
|
+
regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
|
228
303
|
|
229
|
-
#
|
230
|
-
regexes[:
|
231
|
-
|
232
|
-
regexes[:
|
233
|
-
|
234
|
-
regexes[:
|
304
|
+
# Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
|
305
|
+
regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
|
306
|
+
regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
|
307
|
+
regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
|
308
|
+
regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
|
309
|
+
regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
|
235
310
|
|
311
|
+
# Same goes for ExtendedPictographic
|
236
312
|
regexes[:REGEX_PICTO] = Regexp.compile(picto)
|
237
|
-
|
238
313
|
regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
|
239
314
|
|
315
|
+
# Emoji keycaps
|
316
|
+
regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
|
317
|
+
|
240
318
|
regexes
|
241
319
|
end
|
242
320
|
|
@@ -246,6 +324,7 @@ regexes = compile(
|
|
246
324
|
emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
|
247
325
|
emoji_component: pack_and_join(EMOJI_COMPONENT),
|
248
326
|
emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
|
327
|
+
text_presentation: pack_and_join(TEXT_PRESENTATION),
|
249
328
|
picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
|
250
329
|
picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
|
251
330
|
)
|
@@ -257,7 +336,8 @@ native_regexes = compile(
|
|
257
336
|
emoji_modifier_base: "\\p{EBase}",
|
258
337
|
emoji_component: "\\p{EComp}",
|
259
338
|
emoji_presentation: "\\p{EPres}",
|
339
|
+
text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
|
260
340
|
picto: "\\p{ExtPict}",
|
261
|
-
picto_no_emoji: "\\p{ExtPict}
|
341
|
+
picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
|
262
342
|
)
|
263
343
|
write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))
|