unicode-emoji 3.6.0 → 3.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +24 -1
  4. data/README.md +105 -63
  5. data/Rakefile +6 -2
  6. data/data/emoji.marshal.gz +0 -0
  7. data/data/generate_constants.rb +120 -46
  8. data/lib/unicode/emoji/constants.rb +18 -2
  9. data/lib/unicode/emoji/generated/regex.rb +1 -1
  10. data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
  11. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
  12. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_possible.rb +8 -0
  14. data/lib/unicode/emoji/generated/regex_text.rb +1 -1
  15. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  16. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  17. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  18. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  19. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  20. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  21. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
  22. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
  23. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  24. data/lib/unicode/emoji/generated_native/regex_possible.rb +8 -0
  25. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  26. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  27. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  30. data/lib/unicode/emoji/lazy_constants.rb +37 -6
  31. data/lib/unicode/emoji/list.rb +13 -0
  32. data/lib/unicode/emoji.rb +38 -5
  33. data/spec/data/.keep +0 -0
  34. data/spec/data/emoji-test.txt +5331 -0
  35. data/spec/emoji_test_txt_spec.rb +181 -0
  36. data/spec/unicode_emoji_spec.rb +152 -5
  37. data/unicode-emoji.gemspec +3 -3
  38. metadata +20 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc9d3cd1548a2cc0861e05836cc4a5dcc0b4d4d631568c6bc89de094951fcb70
4
- data.tar.gz: 2172d2a31051731d234aeae48ba8c6b9cba15181fb8b1886d3a90252048937ab
3
+ metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
4
+ data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
5
5
  SHA512:
6
- metadata.gz: ae1d2b65292537a89748d9ba1cc78c08723b37f6ccf5debb9fbeeb73720bd6e1c232f4f1be23d24699e3ebd2f1092646e2877b96366c0d4aa05e0f080aeadf99
7
- data.tar.gz: 0d422eeccd45a9430e5df8a5bac8964dca3bd5616eb51d6f6999c3372f8697316e210b04e586ab3a734b5e74b3881a309a2c4a6b0e78964de7513550dc3380ec
6
+ metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
7
+ data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
+ /spec/data/emoji-test.txt
data/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### 3.8.0
4
+
5
+ - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
6
+ for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
7
+ - Add specs running through `emoji-test.txt` and classify qualification statuses per regex
8
+ - Improve documentation and add detailed table about which regex has which features
9
+ - Native regexes: Use native Emoji props for Emoji text presentation
10
+ - Update CLDR to v46 (valid subdivisions)
11
+ - Further improvements (see commit log)
12
+
13
+ ### 3.7.0
14
+
15
+ - Bump required Ruby slightly to 2.5
16
+ - Introduce new `REGEX_POSSIBLE` which contains the regex described in
17
+ https://www.unicode.org/reports/tr51/#EBNF_and_Regex
18
+ - Fix that some valid subdivisions were not decompressed (`REGEX_VALID`)
19
+ - Be stricter about selection of tag characters in `REGEX_WELL_FORMED`
20
+ - Only U+E0030..U+E0039, U+E0061..U+E007A allowed
21
+ - Max tag sequence length
22
+ - Use native `/\p{RI}/` regex for regional indicators
23
+ - Separately autoload emoji list, so it can be loaded when other indexes
24
+ are not needed
25
+
3
26
  ### 3.6.0
4
27
 
5
28
  - `Unicode::Emoji::REGEX_TEXT` now matches non-emoji keycaps like "3⃣" (U+0033 U+20E3)
@@ -16,7 +39,7 @@
16
39
  ### 3.3.2
17
40
 
18
41
  - Update valid subdivisions to CLDR 43 (no changes)
19
- -> there won't be any new subdivision flags in Emoji
42
+ -> there won't be any new RGI subdivision flags in Emoji
20
43
 
21
44
  ### 3.3.1
22
45
 
data/README.md CHANGED
@@ -1,18 +1,15 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides Unicode Emoji data and regexes, incorporating the latest Unicode and Emoji standards.
3
+ Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
4
4
 
5
- Also includes a categorized list of recommended Emoji.
5
+ Additional features:
6
6
 
7
- Emoji version: **16.0** (September 2024)
8
-
9
- CLDR version (used for sub-region flags): **45** (April 2024)
10
-
11
- Supported Rubies: **3.x**
7
+ - A categorized list of Emoji (RGI: Recommended for General Interchange)
8
+ - Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
12
9
 
13
- No longer supported Rubies, but might still work: **2.7**, **2.6**, **2.5**, **2.4**, **2.3**
10
+ Emoji version: **16.0** (September 2024)
14
11
 
15
- If you are stuck on an older Ruby version, checkout the latest [0.9 version](https://rubygems.org/gems/unicode-emoji/versions/0.9.3) of this gem.
12
+ CLDR version (used for sub-region flags): **46** (October 2024)
16
13
 
17
14
  ## Gemfile
18
15
 
@@ -20,16 +17,14 @@ If you are stuck on an older Ruby version, checkout the latest [0.9 version](htt
20
17
  gem "unicode-emoji"
21
18
  ```
22
19
 
23
- ## Usage
24
-
25
- ### Regex
20
+ ## Usage – Regex Matching
26
21
 
27
- The gem includes a bunch of Emoji regexes, which are compiled out of various Emoji Unicode data sources.
22
+ The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
28
23
 
29
24
  ```ruby
30
25
  require "unicode/emoji"
31
26
 
32
- string = "String which contains all kinds of emoji:
27
+ string = "String which contains all types of Emoji sequences:
33
28
 
34
29
  - Singleton Emoji: 😴
35
30
  - Textual singleton Emoji with Emoji variation: ▶️
@@ -38,62 +33,114 @@ string = "String which contains all kinds of emoji:
38
33
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
39
34
  - Keycap sequence: 2️⃣
40
35
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
41
-
42
36
  "
43
37
 
44
38
  string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
45
39
  ```
46
40
 
47
- #### Main Regexes
41
+ Depending on your exact use case, you can choose between multiple levels of Emoji detection:
48
42
 
49
- Matches (non-textual) Emoji of all kinds:
43
+ ### Main Regexes
50
44
 
51
45
  Regex | Description | Example Matches | Example Non-Matches
52
46
  ------------------------------|-------------|-----------------|--------------------
53
- `Unicode::Emoji::REGEX` | **Use this if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kind of *recommended* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
54
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kind of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`
55
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kind of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`
47
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
48
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
49
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
56
51
 
57
- ##### Picking the Right Emoji Regex
52
+ #### Include Text Emoji
58
53
 
59
- - Usually you just want `REGEX` (RGI set)
60
- - If you want broader matching (e.g. more sub-regions), choose `REGEX_VALID`
61
- - If you even want to match for invalid sequences, too, use `REGEX_WELL_FORMED`
54
+ By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
62
55
 
63
- Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for details.
64
-
65
- Property | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed)
66
- ---------|-----------------------------|-----------------------|----------------------------------
67
- Region "🇵🇹" | Yes | Yes | Yes
68
- Region "🇵🇵" | No | No | Yes
69
- Tag Sequence "🏴󠁧󠁢󠁳󠁣󠁴󠁿" | Yes | Yes | Yes
70
- Tag Sequence "🏴󠁧󠁢󠁡󠁧󠁢󠁿" | No | Yes | Yes
71
- Tag Sequence "😴󠁧󠁢󠁡󠁡󠁡󠁿" | No | No | Yes
72
- ZWJ Sequence "🤾🏽‍♀️" | Yes | Yes | Yes
73
- ZWJ Sequence "🤠‍🤢" | No | Yes | Yes
74
-
75
- More info about valid vs. recommended Emoji in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
76
-
77
- #### Singleton Regexes
56
+ Regex | Description | Example Matches | Example Non-Matches
57
+ ------------------------------|-------------|-----------------|--------------------
58
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽‍♀`, `🏌‍♂️`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
59
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
60
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
78
61
 
79
- Matches only simple one-codepoint (+ optional variation selector) Emoji:
62
+ #### Minimally-qualified and Unqualified Sequences
80
63
 
81
64
  Regex | Description | Example Matches | Example Non-Matches
82
65
  ------------------------------|-------------|-----------------|--------------------
83
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`
84
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digit 1) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`
66
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` | `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
67
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
85
68
 
86
- #### Include Textual Emoji
69
+ [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
87
70
 
88
- By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes. However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
71
+ #### Singleton Regexes
72
+
73
+ Matches only simple one-codepoint (+ optional variation selector) Emoji:
89
74
 
90
75
  Regex | Description | Example Matches | Example Non-Matches
91
76
  ------------------------------|-------------|-----------------|--------------------
92
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶` | `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
93
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `😴︎`, `▶` | `🏻`, `🇵🇵`
94
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶` | `🏻`
95
-
96
- #### Extended Pictographic Regex
77
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
78
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
+
80
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
81
+
82
+ While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
83
+
84
+ ### Comparison
85
+
86
+ 1) Fully-qualified RGI Emoji ZWJ sequence
87
+ 2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
88
+ 3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
89
+ 4) Non-RGI Emoji ZWJ sequence
90
+ 5) Valid Region made from a pair of Regional Indicators
91
+ 6) Any Region made from a pair of Regional Indicators
92
+ 7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
93
+ 8) Valid Flag Emoji Tag Sequences (any known subdivision)
94
+ 9) Any Emoji Tag Sequences (any tag sequence with any base)
95
+ 10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
96
+ 11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
97
+ 12) Non-Emoji (unqualified) keycap
98
+
99
+ Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Re­gion | 6 Any Re­gion | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Key­cap
100
+ -|-|-|-|-|-|-|-|-|-|-|-|-
101
+ REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
102
+ REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
103
+ REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
104
+ REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
105
+ REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
106
+ REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
107
+ REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
108
+ REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
109
+ REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
110
+ REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
111
+ REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
112
+
113
+ ¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
114
+
115
+ See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
116
+
117
+ ### Picking the Right Emoji Regex
118
+
119
+ - Usually you just want `REGEX` (recommended Emoji set, RGI)
120
+ - Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
121
+ - If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
122
+ - If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
123
+ - Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
124
+ - And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might produce false positives; however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
125
+
126
+ ### Examples
127
+
128
+ Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
129
+ -----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
130
+ RGI ZWJ Sequence | 🤾🏽‍♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
131
+ RGI ZWJ Sequence MQE | 🤾🏽‍♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
132
+ Valid ZWJ Sequence, Non-RGI | 🤠‍🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
133
+ Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
134
+ Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
135
+ RGI Tag Sequence | 🏴󠁧󠁢󠁳󠁣󠁴󠁿 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
136
+ Valid Tag Sequence | 🏴󠁧󠁢󠁡󠁧󠁢󠁿 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
137
+ Well-formed Tag Sequence | 😴󠁧󠁢󠁡󠁡󠁡󠁿 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
138
+
139
+ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, and explanations.
140
+
141
+ More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
142
+
143
+ ### Extended Pictographic Regex
97
144
 
98
145
  `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
99
146
 
@@ -101,18 +148,13 @@ Regex | Description | Example Matches | Example Non-Matc
101
148
 
102
149
  See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
103
150
 
104
- #### Partial Regexes
105
-
106
- Matches potential Emoji parts (often, this is not what you want):
107
-
108
- Regex | Description | Example Matches | Example Non-Matches
109
- ------------------------------|-------------|-----------------|--------------------
110
- `Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
151
+ ### Partial Regexes
111
152
 
153
+ `Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
112
154
 
113
- ### List
155
+ ## Usage – List
114
156
 
115
- Use `Unicode::Emoji::LIST` or the list method to get a grouped (and ordered) list of Emoji:
157
+ Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
116
158
 
117
159
  ```ruby
118
160
  Unicode::Emoji.list.keys
@@ -125,13 +167,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
125
167
  => ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
126
168
  ```
127
169
 
128
- Please note that categories might change with future versions of the Emoji standard. This gem will issue warnings when attempting to retrieve old categories using the `#list` method.
170
+ Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
129
171
 
130
- A list of all Emoji can be found at [character.construction](https://character.construction).
172
+ A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
131
173
 
132
- ### Properties
174
+ ## Usage – Properties Data
133
175
 
134
- Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt) file:
176
+ Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
135
177
 
136
178
  ```ruby
137
179
  require "unicode/emoji"
@@ -143,7 +185,7 @@ Unicode::Emoji.properties "☝" # => ["Emoji", "Emoji_Modifier_Base"]
143
185
 
144
186
  - [Unicode® Technical Standard #51](https://www.unicode.org/reports/tr51/)
145
187
  - [Emoji categories](https://unicode.org/emoji/charts/emoji-ordering.html)
146
- - Ruby gem which displays [Emoji sequence names](https://github.com/janlelis/unicode-sequence_name) (here [as website](https://character.construction/name))
188
+ - Ruby gem which displays [Emoji sequence names](https://github.com/janlelis/unicode-sequence_name) ([as website](https://character.construction/name))
147
189
  - Part of [unicode-x](https://github.com/janlelis/unicode-x)
148
190
 
149
191
  ## MIT
data/Rakefile CHANGED
@@ -28,14 +28,18 @@ task :irb do
28
28
  end
29
29
 
30
30
  # # #
31
- # Run Specs
31
+ # Run specs
32
32
 
33
33
  desc "#{gemspec.name} | Spec"
34
34
  task :spec do
35
- ruby "spec/unicode_emoji_spec.rb"
35
+ ruby File.join("spec", "*_spec.rb")
36
36
  end
37
37
  task default: :spec
38
38
 
39
+
40
+ # # #
41
+ # Generate regex
42
+
39
43
  desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
40
44
  task :generate_constants do
41
45
  load "data/generate_constants.rb", true
Binary file
@@ -68,10 +68,10 @@ def pack_and_join(ords)
68
68
  end
69
69
  end
70
70
 
71
- def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
71
+ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
72
  emoji_presentation_sequence = \
73
73
  join(
74
- pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
74
+ text_presentation + pack(EMOJI_VARIATION_SELECTOR),
75
75
  emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
76
76
  )
77
77
 
@@ -79,14 +79,20 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
79
79
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
80
 
81
81
  text_keycap_sequence = \
82
- join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) })
82
+ pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
83
 
84
84
  text_presentation_sequence = \
85
85
  join(
86
- pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
86
+ text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
87
87
  emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
88
88
  )
89
89
 
90
+ text_emoji = \
91
+ join(
92
+ "(?!" + emoji_component + ")" + text_presentation_sequence,
93
+ text_keycap_sequence,
94
+ )
95
+
90
96
  emoji_modifier_sequence = \
91
97
  emoji_modifier_base + emoji_modifier
92
98
 
@@ -97,27 +103,13 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
97
103
  pack_and_join(VALID_REGION_FLAGS)
98
104
 
99
105
  emoji_well_formed_flag_sequence = \
100
- "(?:" +
101
- pack_and_join(REGIONAL_INDICATORS) +
102
- pack_and_join(REGIONAL_INDICATORS) +
103
- ")"
104
-
105
- emoji_valid_core_sequence = \
106
- join(
107
- # emoji_character,
108
- emoji_keycap_sequence,
109
- emoji_modifier_sequence,
110
- non_component_emoji_presentation_sequence,
111
- emoji_valid_flag_sequence,
112
- )
106
+ '\p{RI}{2}'
113
107
 
114
- emoji_well_formed_core_sequence = \
108
+ emoji_core_sequence = \
115
109
  join(
116
- # emoji_character,
117
110
  emoji_keycap_sequence,
118
111
  emoji_modifier_sequence,
119
112
  non_component_emoji_presentation_sequence,
120
- emoji_well_formed_flag_sequence,
121
113
  )
122
114
 
123
115
  # Sort to make sure complex sequences match first
@@ -128,7 +120,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
128
120
  "(?:" +
129
121
  pack(EMOJI_TAG_BASE_FLAG) +
130
122
  "(?:" + VALID_SUBDIVISIONS.sort_by(&:length).reverse.map{ |sd|
131
- Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))
123
+ sd.tr("\u{30}-\u{39}\u{61}-\u{7A}", "\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}")
132
124
  }.join("|") + ")" +
133
125
  pack(CANCEL_TAG) +
134
126
  ")"
@@ -139,7 +131,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
139
131
  non_component_emoji_presentation_sequence,
140
132
  emoji_modifier_sequence,
141
133
  ) +
142
- pack_and_join(TAGS) + "+" +
134
+ pack_and_join(SPEC_TAGS) + "{1,30}" +
143
135
  pack(CANCEL_TAG) +
144
136
  ")"
145
137
 
@@ -147,6 +139,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
147
139
  emoji_rgi_zwj_sequence = \
148
140
  pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
149
141
 
142
+ # FQE+MQE: Make VS16 optional after ZWJ has appeared
143
+ emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
144
+ /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
145
+ pack(EMOJI_VARIATION_SELECTOR) + "?"
146
+ )
147
+
148
+ # FQE+MQE+UQE: Make all VS16 optional
149
+ emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
150
+ pack(EMOJI_VARIATION_SELECTOR),
151
+ pack(EMOJI_VARIATION_SELECTOR) + "?",
152
+ )
153
+
150
154
  emoji_valid_zwj_element = \
151
155
  join(
152
156
  emoji_modifier_sequence,
@@ -163,58 +167,126 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
163
167
  join(
164
168
  emoji_rgi_zwj_sequence,
165
169
  emoji_rgi_tag_sequence,
166
- emoji_valid_core_sequence,
170
+ emoji_valid_flag_sequence,
171
+ emoji_core_sequence,
172
+ )
173
+
174
+ emoji_rgi_sequence_include_text = \
175
+ join(
176
+ emoji_rgi_zwj_sequence,
177
+ emoji_rgi_tag_sequence,
178
+ emoji_valid_flag_sequence,
179
+ emoji_core_sequence,
180
+ text_emoji,
181
+ )
182
+
183
+ emoji_rgi_include_mqe_sequence = \
184
+ join(
185
+ emoji_rgi_include_mqe_zwj_sequence,
186
+ emoji_rgi_tag_sequence,
187
+ emoji_valid_flag_sequence,
188
+ emoji_core_sequence,
189
+ )
190
+
191
+ emoji_rgi_include_mqe_uqe_sequence = \
192
+ join(
193
+ emoji_rgi_include_mqe_uqe_zwj_sequence,
194
+ text_emoji, # also uqe
195
+ emoji_rgi_tag_sequence,
196
+ emoji_valid_flag_sequence,
197
+ emoji_core_sequence,
167
198
  )
168
199
 
169
200
  emoji_valid_sequence = \
170
201
  join(
171
202
  emoji_valid_zwj_sequence,
172
203
  emoji_valid_tag_sequence,
173
- emoji_valid_core_sequence,
204
+ emoji_valid_flag_sequence,
205
+ emoji_core_sequence,
206
+ )
207
+
208
+ emoji_valid_sequence_include_text = \
209
+ join(
210
+ emoji_valid_zwj_sequence,
211
+ emoji_valid_tag_sequence,
212
+ emoji_valid_flag_sequence,
213
+ emoji_core_sequence,
214
+ text_emoji,
174
215
  )
175
216
 
176
217
  emoji_well_formed_sequence = \
177
218
  join(
178
219
  emoji_valid_zwj_sequence,
179
220
  emoji_well_formed_tag_sequence,
180
- emoji_well_formed_core_sequence,
221
+ emoji_well_formed_flag_sequence,
222
+ emoji_core_sequence,
223
+ )
224
+
225
+ emoji_well_formed_sequence_include_text = \
226
+ join(
227
+ emoji_valid_zwj_sequence,
228
+ emoji_well_formed_tag_sequence,
229
+ emoji_well_formed_flag_sequence,
230
+ emoji_core_sequence,
231
+ text_emoji,
232
+ )
233
+
234
+ emoji_possible_modification = \
235
+ join(
236
+ emoji_modifier,
237
+ pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?",
238
+ "[󠀠-󠁾]+󠁿" # raw tags
239
+ )
240
+
241
+ emoji_possible_zwj_element = \
242
+ join(
243
+ emoji_well_formed_flag_sequence,
244
+ emoji_character + emoji_possible_modification + "?"
181
245
  )
182
246
 
247
+ emoji_possible = \
248
+ emoji_possible_zwj_element + "(?:" + pack(ZWJ) + emoji_possible_zwj_element + ")*"
249
+
183
250
  regexes = {}
184
251
 
185
252
  # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
186
253
  regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
187
254
 
255
+ # rgi + singleton text
256
+ regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
257
+
258
+ # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
259
+ # Also make VS16 optional if not at first emoji character
260
+ regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
261
+
262
+ # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
263
+ # Also make VS16 optional even at first emoji character
264
+ regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
265
+
188
266
  # Matches basic singleton emoji and all kind of valid sequences
189
267
  regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
190
268
 
269
+ # valid + singleton text
270
+ regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
271
+
191
272
  # Matches basic singleton emoji and all kind of sequences
192
273
  regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
274
+
275
+ # well-formed + singleton text
276
+ regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
193
277
 
194
- # Matches only basic single, non-textual emoji
195
- # Ignores "components" like modifiers or simple digits
196
- regexes[:REGEX_BASIC] = Regexp.compile(
197
- "(?!" + emoji_component + ")" + emoji_presentation_sequence
198
- )
278
+ # Quick test which might lead to false positives
279
+ # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
280
+ regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
199
281
 
200
- # Matches only basic single, textual emoji
201
- # Ignores "components" like modifiers or simple digits
202
- regexes[:REGEX_TEXT] = Regexp.compile(
203
- join(
204
- "(?!" + emoji_component + ")" + text_presentation_sequence,
205
- text_keycap_sequence,
206
- )
207
- )
282
+ # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
283
+ regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
208
284
 
209
- # Matches any emoji-related codepoint - Use with caution (returns partial matches)
210
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
211
-
212
- # Combined REGEXes which also match for TEXTUAL emoji
213
- regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
285
+ # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
286
+ regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
214
287
 
215
- regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
216
-
217
- regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
288
+ # Same as \p{Emoji} - to be removed or renamed
289
+ regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
218
290
 
219
291
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
220
292
 
@@ -229,6 +301,7 @@ regexes = compile(
229
301
  emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
230
302
  emoji_component: pack_and_join(EMOJI_COMPONENT),
231
303
  emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
304
+ text_presentation: pack_and_join(TEXT_PRESENTATION),
232
305
  picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
233
306
  picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
234
307
  )
@@ -240,6 +313,7 @@ native_regexes = compile(
240
313
  emoji_modifier_base: "\\p{EBase}",
241
314
  emoji_component: "\\p{EComp}",
242
315
  emoji_presentation: "\\p{EPres}",
316
+ text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
243
317
  picto: "\\p{ExtPict}",
244
318
  picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
245
319
  )
@@ -2,12 +2,13 @@
2
2
 
3
3
  module Unicode
4
4
  module Emoji
5
- VERSION = "3.6.0"
5
+ VERSION = "3.8.0"
6
6
  EMOJI_VERSION = "16.0"
7
7
  CLDR_VERSION = "45"
8
8
  DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
9
9
  INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
10
10
 
11
+ # Unicode properties, see https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt
11
12
  PROPERTY_NAMES = {
12
13
  E: "Emoji",
13
14
  B: "Emoji_Modifier_Base",
@@ -17,13 +18,28 @@ module Unicode
17
18
  X: "Extended_Pictographic",
18
19
  }.freeze
19
20
 
21
+ # Variation Selector 16 (VS16), enables emoji presentation mode for preceding codepoint
20
22
  EMOJI_VARIATION_SELECTOR = 0xFE0F
23
+
24
+ # Variation Selector 15 (VS15), enables text presentation mode for preceding codepoint
21
25
  TEXT_VARIATION_SELECTOR = 0xFE0E
26
+
27
+ # First codepoint of tag-based subdivision flags
22
28
  EMOJI_TAG_BASE_FLAG = 0x1F3F4
29
+
30
+ # Last codepoint of tag-based subdivision flags
23
31
  CANCEL_TAG = 0xE007F
24
- TAGS = [*0xE0020..0xE007E].freeze
32
+
33
+ # Tag characters allowed in tag-based subdivision flags
34
+ SPEC_TAGS = [*0xE0030..0xE0039, *0xE0061..0xE007A].freeze
35
+
36
+ # Combining Enclosing Keycap character
25
37
  EMOJI_KEYCAP_SUFFIX = 0x20E3
38
+
39
+ # Zero-width-joiner to enable combination of multiple Emoji in a sequence
26
40
  ZWJ = 0x200D
41
+
42
+ # Two regional indicators make up a region
27
43
  REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
28
44
  end
29
45
  end