unicode-emoji 3.7.0 → 3.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +11 -1
  4. data/README.md +98 -55
  5. data/Rakefile +6 -2
  6. data/data/emoji.marshal.gz +0 -0
  7. data/data/generate_constants.rb +97 -40
  8. data/lib/unicode/emoji/constants.rb +17 -1
  9. data/lib/unicode/emoji/generated/regex.rb +1 -1
  10. data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
  11. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
  12. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_text.rb +1 -1
  14. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  15. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  16. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  17. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  18. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  19. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  20. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
  21. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
  22. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  23. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  24. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  25. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  26. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  27. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  28. data/lib/unicode/emoji/lazy_constants.rb +36 -0
  29. data/lib/unicode/emoji/list.rb +3 -0
  30. data/lib/unicode/emoji.rb +33 -6
  31. data/spec/data/.keep +0 -0
  32. data/spec/data/emoji-test.txt +5331 -0
  33. data/spec/emoji_test_txt_spec.rb +181 -0
  34. data/spec/unicode_emoji_spec.rb +36 -4
  35. metadata +12 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1e5b069f3f3d2de97f31a0aa83568454b751330b8e86392d28baa7906a6d6302
4
- data.tar.gz: afaed5c343b5cdb5f235bf1e40fd78d221b0efa3097517f015db3c5a58e96775
3
+ metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
4
+ data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
5
5
  SHA512:
6
- metadata.gz: e9c6726eee2f48cd0c51937bb0fcd6f0009b7fe0eb9e973bef713ac063d668c4d520f490a46bdf944892dcac2854b17ca72779593257f3edc8f0dfd445f3a729
7
- data.tar.gz: 7c1747073a4ec43ea4ea24ff304eb2cf3deb58918f2757cd4b4daee922b15d5478bd979585b60984366d21787b3d5183bcd5fc8cd3cdea6aa5ceedb9860e19d6
6
+ metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
7
+ data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
+ /spec/data/emoji-test.txt
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### 3.8.0
4
+
5
+ - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE`, which additionally match
6
+ minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
7
+ - Add specs running through `emoji-test.txt` and classifying qualification statuses per regex
8
+ - Improve documentation and add detailed table about which regex has which features
9
+ - Native regexes: Use native Emoji props for Emoji text presentation
10
+ - Update CLDR to v46 (valid subdivisions)
11
+ - Further improvements (see commit log)
12
+
3
13
  ### 3.7.0
4
14
 
5
15
  - Bump required Ruby slightly to 2.5
@@ -29,7 +39,7 @@
29
39
  ### 3.3.2
30
40
 
31
41
  - Update valid subdivisions to CLDR 43 (no changes)
32
- -> there won't be any new subdivision flags in Emoji
42
+ -> there won't be any new RGI subdivision flags in Emoji
33
43
 
34
44
  ### 3.3.1
35
45
 
data/README.md CHANGED
@@ -1,15 +1,15 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides regular expressions to find Emoji in strings, incorporating the latest Unicode and Emoji standards.
3
+ Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
4
4
 
5
5
  Additional features:
6
6
 
7
- - A categorized list of recommended Emoji
7
+ - A categorized list of Emoji (RGI: Recommended for General Interchange)
8
8
  - Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
9
9
 
10
10
  Emoji version: **16.0** (September 2024)
11
11
 
12
- CLDR version (used for sub-region flags): **45** (April 2024)
12
+ CLDR version (used for sub-region flags): **46** (October 2024)
13
13
 
14
14
  ## Gemfile
15
15
 
@@ -17,16 +17,14 @@ CLDR version (used for sub-region flags): **45** (April 2024)
17
17
  gem "unicode-emoji"
18
18
  ```
19
19
 
20
- ## Usage
21
-
22
- ### Regex
20
+ ## Usage – Regex Matching
23
21
 
24
22
  The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
25
23
 
26
24
  ```ruby
27
25
  require "unicode/emoji"
28
26
 
29
- string = "String which contains all kinds of emoji:
27
+ string = "String which contains all types of Emoji sequences:
30
28
 
31
29
  - Singleton Emoji: 😴
32
30
  - Textual singleton Emoji with Emoji variation: ▶️
@@ -35,64 +33,114 @@ string = "String which contains all kinds of emoji:
35
33
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
36
34
  - Keycap sequence: 2️⃣
37
35
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
38
-
39
36
  "
40
37
 
41
38
  string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
42
39
  ```
43
40
 
44
- #### Main Regexes
41
+ Depending on your exact use case, you can choose between multiple levels of Emoji detection:
45
42
 
46
- There are multiple levels of Emoji detection:
43
+ ### Main Regexes
47
44
 
48
45
  Regex | Description | Example Matches | Example Non-Matches
49
46
  ------------------------------|-------------|-----------------|--------------------
50
- `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
51
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`
52
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`
53
- `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` |
54
-
55
- ##### Picking the Right Emoji Regex
56
-
57
- - Usually you just want `REGEX` (RGI set)
58
- - If you want broader matching (any ZJW sequences, more sub-region flags), choose `REGEX_VALID`
59
- - Even brolader is `REGEX_WELL_FORMED`, which will also match any region flag and any tag sequence
60
- - And then there is `REGEX_POSSIBLE` , which is a quick check for possible Emoji, which might contain false positives, [suggested in the Unicode Standard](https://www.unicode.org/reports/tr51/#EBNF_and_Regex)
61
-
62
- Property | Escaped | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed) | `REGEX_POSSIBLE`
63
- ---------|---------|-----------------------------|-----------------------|-----------------------------------|-----------------
64
- Region "🇵🇹" | `\u{1F1F5 1F1F9}` | Yes | Yes | Yes | Yes
65
- Region "🇵🇵" | `\u{1F1F5 1F1F5}` | No | No | Yes | Yes
66
- Tag Sequence "🏴󠁧󠁢󠁳󠁣󠁴󠁿" | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | Yes | Yes | Yes | Yes
67
- Tag Sequence "🏴󠁧󠁢󠁡󠁧󠁢󠁿" | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | No | Yes | Yes | Yes
68
- Tag Sequence "😴󠁧󠁢󠁡󠁡󠁡󠁿" | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | No | No | Yes | Yes
69
- ZWJ Sequence "🤾🏽‍♀️" | `\u{1F93E 1F3FD 200D 2640 FE0F}` | Yes | Yes | Yes | Yes
70
- ZWJ Sequence "🤠‍🤢" | `\u{1F920 200D 1F922}` | No | Yes | Yes | Yes
47
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
48
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
49
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
71
51
 
72
- Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
52
+ #### Include Text Emoji
73
53
 
74
- More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
54
+ By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
75
55
 
76
- #### Singleton Regexes
56
+ Regex | Description | Example Matches | Example Non-Matches
57
+ ------------------------------|-------------|-----------------|--------------------
58
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽‍♀`, `🏌‍♂️`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
59
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
60
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
77
61
 
78
- Matches only simple one-codepoint (+ optional variation selector) Emoji:
62
+ #### Minimally-qualified and Unqualified Sequences
79
63
 
80
64
  Regex | Description | Example Matches | Example Non-Matches
81
65
  ------------------------------|-------------|-----------------|--------------------
82
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `1`
83
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `1`
66
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` | `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
67
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
84
68
 
85
- #### Include Textual Emoji
69
+ [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
86
70
 
87
- By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
71
+ #### Singleton Regexes
72
+
73
+ Matches only simple one-codepoint (+ optional variation selector) Emoji:
88
74
 
89
75
  Regex | Description | Example Matches | Example Non-Matches
90
76
  ------------------------------|-------------|-----------------|--------------------
91
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶` | `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
92
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `😴︎`, `▶` | `🏻`, `🇵🇵`
93
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶` | `🏻`
77
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
78
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
+
80
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
81
+
82
+ While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
83
+
84
+ ### Comparison
85
+
86
+ 1) Fully-qualified RGI Emoji ZWJ sequence
87
+ 2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
88
+ 3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
89
+ 4) Non-RGI Emoji ZWJ sequence
90
+ 5) Valid Region made from a pair of Regional Indicators
91
+ 6) Any Region made from a pair of Regional Indicators
92
+ 7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
93
+ 8) Valid Flag Emoji Tag Sequences (any known subdivision)
94
+ 9) Any Emoji Tag Sequences (any tag sequence with any base)
95
+ 10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
96
+ 11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
97
+ 12) Non-Emoji (unqualified) keycap
98
+
99
+ Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Re­gion | 6 Any Re­gion | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Key­cap
100
+ -|-|-|-|-|-|-|-|-|-|-|-|-
101
+ REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
102
+ REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
103
+ REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
104
+ REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
105
+ REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
106
+ REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
107
+ REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
108
+ REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
109
+ REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
110
+ REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
111
+ REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
112
+
113
+ ¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
114
+
115
+ See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
116
+
117
+ ### Picking the Right Emoji Regex
118
+
119
+ - Usually you just want `REGEX` (recommended Emoji set, RGI)
120
+ - Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
121
+ - If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
122
+ - If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
123
+ - Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
124
+ - And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might produce false positives; however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
125
+
126
+ ### Examples
127
+
128
+ Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
129
+ -----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
130
+ RGI ZWJ Sequence | 🤾🏽‍♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
131
+ RGI ZWJ Sequence MQE | 🤾🏽‍♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
132
+ Valid ZWJ Sequence, Non-RGI | 🤠‍🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
133
+ Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
134
+ Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
135
+ RGI Tag Sequence | 🏴󠁧󠁢󠁳󠁣󠁴󠁿 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
136
+ Valid Tag Sequence | 🏴󠁧󠁢󠁡󠁧󠁢󠁿 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
137
+ Well-formed Tag Sequence | 😴󠁧󠁢󠁡󠁡󠁡󠁿 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
138
+
139
+ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
140
+
141
+ More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
94
142
 
95
- #### Extended Pictographic Regex
143
+ ### Extended Pictographic Regex
96
144
 
97
145
  `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
98
146
 
@@ -100,18 +148,13 @@ Regex | Description | Example Matches | Example Non-Matc
100
148
 
101
149
  See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
102
150
 
103
- #### Partial Regexes
104
-
105
- Matches potential Emoji parts (often, this is not what you want):
106
-
107
- Regex | Description | Example Matches | Example Non-Matches
108
- ------------------------------|-------------|-----------------|--------------------
109
- `Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
151
+ ### Partial Regexes
110
152
 
153
+ `Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
111
154
 
112
- ### List
155
+ ## Usage – List
113
156
 
114
- Use `Unicode::Emoji::LIST` or the list method to get a grouped (and ordered) list of Emoji:
157
+ Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
115
158
 
116
159
  ```ruby
117
160
  Unicode::Emoji.list.keys
@@ -124,13 +167,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
124
167
  => ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
125
168
  ```
126
169
 
127
- Please note that categories might change with future versions of the Emoji standard. This gem will issue warnings when attempting to retrieve old categories using the `#list` method.
170
+ Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
128
171
 
129
172
  A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
130
173
 
131
- ### Properties
174
+ ## Usage – Properties Data
132
175
 
133
- Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
176
+ Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
134
177
 
135
178
  ```ruby
136
179
  require "unicode/emoji"
data/Rakefile CHANGED
@@ -28,14 +28,18 @@ task :irb do
28
28
  end
29
29
 
30
30
  # # #
31
- # Run Specs
31
+ # Run specs
32
32
 
33
33
  desc "#{gemspec.name} | Spec"
34
34
  task :spec do
35
- ruby "spec/unicode_emoji_spec.rb"
35
+ ruby File.join("spec", "*_spec.rb")
36
36
  end
37
37
  task default: :spec
38
38
 
39
+
40
+ # # #
41
+ # Generate regex
42
+
39
43
  desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
40
44
  task :generate_constants do
41
45
  load "data/generate_constants.rb", true
Binary file
@@ -68,10 +68,10 @@ def pack_and_join(ords)
68
68
  end
69
69
  end
70
70
 
71
- def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
71
+ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
72
  emoji_presentation_sequence = \
73
73
  join(
74
- pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
74
+ text_presentation + pack(EMOJI_VARIATION_SELECTOR),
75
75
  emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
76
76
  )
77
77
 
@@ -79,14 +79,20 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
79
79
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
80
 
81
81
  text_keycap_sequence = \
82
- join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) })
82
+ pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
83
 
84
84
  text_presentation_sequence = \
85
85
  join(
86
- pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
86
+ text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
87
87
  emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
88
88
  )
89
89
 
90
+ text_emoji = \
91
+ join(
92
+ "(?!" + emoji_component + ")" + text_presentation_sequence,
93
+ text_keycap_sequence,
94
+ )
95
+
90
96
  emoji_modifier_sequence = \
91
97
  emoji_modifier_base + emoji_modifier
92
98
 
@@ -99,22 +105,11 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
99
105
  emoji_well_formed_flag_sequence = \
100
106
  '\p{RI}{2}'
101
107
 
102
- emoji_valid_core_sequence = \
103
- join(
104
- # emoji_character,
105
- emoji_keycap_sequence,
106
- emoji_modifier_sequence,
107
- non_component_emoji_presentation_sequence,
108
- emoji_valid_flag_sequence,
109
- )
110
-
111
- emoji_well_formed_core_sequence = \
108
+ emoji_core_sequence = \
112
109
  join(
113
- # emoji_character,
114
110
  emoji_keycap_sequence,
115
111
  emoji_modifier_sequence,
116
112
  non_component_emoji_presentation_sequence,
117
- emoji_well_formed_flag_sequence,
118
113
  )
119
114
 
120
115
  # Sort to make sure complex sequences match first
@@ -144,6 +139,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
144
139
  emoji_rgi_zwj_sequence = \
145
140
  pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
146
141
 
142
+ # FQE+MQE: Make VS16 optional after ZWJ has appeared
143
+ emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
144
+ /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
145
+ pack(EMOJI_VARIATION_SELECTOR) + "?"
146
+ )
147
+
148
+ # FQE+MQE+UQE: Make all VS16 optional
149
+ emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
150
+ pack(EMOJI_VARIATION_SELECTOR),
151
+ pack(EMOJI_VARIATION_SELECTOR) + "?",
152
+ )
153
+
147
154
  emoji_valid_zwj_element = \
148
155
  join(
149
156
  emoji_modifier_sequence,
@@ -160,21 +167,68 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
160
167
  join(
161
168
  emoji_rgi_zwj_sequence,
162
169
  emoji_rgi_tag_sequence,
163
- emoji_valid_core_sequence,
170
+ emoji_valid_flag_sequence,
171
+ emoji_core_sequence,
172
+ )
173
+
174
+ emoji_rgi_sequence_include_text = \
175
+ join(
176
+ emoji_rgi_zwj_sequence,
177
+ emoji_rgi_tag_sequence,
178
+ emoji_valid_flag_sequence,
179
+ emoji_core_sequence,
180
+ text_emoji,
181
+ )
182
+
183
+ emoji_rgi_include_mqe_sequence = \
184
+ join(
185
+ emoji_rgi_include_mqe_zwj_sequence,
186
+ emoji_rgi_tag_sequence,
187
+ emoji_valid_flag_sequence,
188
+ emoji_core_sequence,
189
+ )
190
+
191
+ emoji_rgi_include_mqe_uqe_sequence = \
192
+ join(
193
+ emoji_rgi_include_mqe_uqe_zwj_sequence,
194
+ text_emoji, # also uqe
195
+ emoji_rgi_tag_sequence,
196
+ emoji_valid_flag_sequence,
197
+ emoji_core_sequence,
164
198
  )
165
199
 
166
200
  emoji_valid_sequence = \
167
201
  join(
168
202
  emoji_valid_zwj_sequence,
169
203
  emoji_valid_tag_sequence,
170
- emoji_valid_core_sequence,
204
+ emoji_valid_flag_sequence,
205
+ emoji_core_sequence,
206
+ )
207
+
208
+ emoji_valid_sequence_include_text = \
209
+ join(
210
+ emoji_valid_zwj_sequence,
211
+ emoji_valid_tag_sequence,
212
+ emoji_valid_flag_sequence,
213
+ emoji_core_sequence,
214
+ text_emoji,
171
215
  )
172
216
 
173
217
  emoji_well_formed_sequence = \
174
218
  join(
175
219
  emoji_valid_zwj_sequence,
176
220
  emoji_well_formed_tag_sequence,
177
- emoji_well_formed_core_sequence,
221
+ emoji_well_formed_flag_sequence,
222
+ emoji_core_sequence,
223
+ )
224
+
225
+ emoji_well_formed_sequence_include_text = \
226
+ join(
227
+ emoji_valid_zwj_sequence,
228
+ emoji_well_formed_tag_sequence,
229
+ emoji_well_formed_flag_sequence,
230
+ emoji_core_sequence,
231
+ text_emoji,
178
232
  )
179
233
 
180
234
  emoji_possible_modification = \
@@ -198,41 +252,42 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
198
252
  # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
199
253
  regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
200
254
 
255
+ # rgi + singleton text
256
+ regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
257
+
258
+ # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
259
+ # Also make VS16 optional if not at first emoji character
260
+ regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
261
+
262
+ # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
263
+ # Also make VS16 optional even at first emoji character
264
+ regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
265
+
201
266
  # Matches basic singleton emoji and all kind of valid sequences
202
267
  regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
203
268
 
269
+ # valid + singleton text
270
+ regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
271
+
204
272
  # Matches basic singleton emoji and all kind of sequences
205
273
  regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
274
+
275
+ # well-formed + singleton text
276
+ regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
206
277
 
207
278
  # Quick test which might lead to false positives
208
279
  # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
209
280
  regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
210
281
 
211
- # Matches only basic single, non-textual emoji
212
- # Ignores "components" like modifiers or simple digits
213
- regexes[:REGEX_BASIC] = Regexp.compile(
214
- "(?!" + emoji_component + ")" + emoji_presentation_sequence
215
- )
282
+ # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
283
+ regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
216
284
 
217
- # Matches only basic single, textual emoji
218
- # Ignores "components" like modifiers or simple digits
219
- regexes[:REGEX_TEXT] = Regexp.compile(
220
- join(
221
- "(?!" + emoji_component + ")" + text_presentation_sequence,
222
- text_keycap_sequence,
223
- )
224
- )
285
+ # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
286
+ regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
225
287
 
226
- # Matches any emoji-related codepoint - Use with caution (returns partial matches)
288
+ # Same as \p{Emoji} - to be removed or renamed
227
289
  regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
228
290
 
229
- # Combined REGEXes which also match for TEXTUAL emoji
230
- regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
231
-
232
- regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
233
-
234
- regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
235
-
236
291
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
237
292
 
238
293
  regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
@@ -246,6 +301,7 @@ regexes = compile(
246
301
  emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
247
302
  emoji_component: pack_and_join(EMOJI_COMPONENT),
248
303
  emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
304
+ text_presentation: pack_and_join(TEXT_PRESENTATION),
249
305
  picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
250
306
  picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
251
307
  )
@@ -257,6 +313,7 @@ native_regexes = compile(
257
313
  emoji_modifier_base: "\\p{EBase}",
258
314
  emoji_component: "\\p{EComp}",
259
315
  emoji_presentation: "\\p{EPres}",
316
+ text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
260
317
  picto: "\\p{ExtPict}",
261
318
  picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
262
319
  )
@@ -2,12 +2,13 @@
2
2
 
3
3
  module Unicode
4
4
  module Emoji
5
- VERSION = "3.7.0"
5
+ VERSION = "3.8.0"
6
6
  EMOJI_VERSION = "16.0"
7
7
  CLDR_VERSION = "45"
8
8
  DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
9
9
  INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
10
10
 
11
+ # Unicode properties, see https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt
11
12
  PROPERTY_NAMES = {
12
13
  E: "Emoji",
13
14
  B: "Emoji_Modifier_Base",
@@ -17,13 +18,28 @@ module Unicode
17
18
  X: "Extended_Pictographic",
18
19
  }.freeze
19
20
 
21
+ # Variation Selector 16 (VS16), enables emoji presentation mode for preceding codepoint
20
22
  EMOJI_VARIATION_SELECTOR = 0xFE0F
23
+
24
+ # Variation Selector 15 (VS15), enables text presentation mode for preceding codepoint
21
25
  TEXT_VARIATION_SELECTOR = 0xFE0E
26
+
27
+ # First codepoint of tag-based subdivision flags
22
28
  EMOJI_TAG_BASE_FLAG = 0x1F3F4
29
+
30
+ # Last codepoint of tag-based subdivision flags
23
31
  CANCEL_TAG = 0xE007F
32
+
33
+ # Tags characters allowed in tag-based subdivision flags
24
34
  SPEC_TAGS = [*0xE0030..0xE0039, *0xE0061..0xE007A].freeze
35
+
36
+ # Combining Enclosing Keycap character
25
37
  EMOJI_KEYCAP_SUFFIX = 0x20E3
38
+
39
+ # Zero-width-joiner to enable combination of multiple Emoji in a sequence
26
40
  ZWJ = 0x200D
41
+
42
+ # Two regional indicators make up a region
27
43
  REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
28
44
  end
29
45
  end