unicode-emoji 3.6.0 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGELOG.md +24 -1
  4. data/README.md +105 -63
  5. data/Rakefile +6 -2
  6. data/data/emoji.marshal.gz +0 -0
  7. data/data/generate_constants.rb +120 -46
  8. data/lib/unicode/emoji/constants.rb +18 -2
  9. data/lib/unicode/emoji/generated/regex.rb +1 -1
  10. data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
  11. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
  12. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_possible.rb +8 -0
  14. data/lib/unicode/emoji/generated/regex_text.rb +1 -1
  15. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  16. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  17. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  18. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  19. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  20. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  21. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
  22. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
  23. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  24. data/lib/unicode/emoji/generated_native/regex_possible.rb +8 -0
  25. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  26. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  27. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  30. data/lib/unicode/emoji/lazy_constants.rb +37 -6
  31. data/lib/unicode/emoji/list.rb +13 -0
  32. data/lib/unicode/emoji.rb +38 -5
  33. data/spec/data/.keep +0 -0
  34. data/spec/data/emoji-test.txt +5331 -0
  35. data/spec/emoji_test_txt_spec.rb +181 -0
  36. data/spec/unicode_emoji_spec.rb +152 -5
  37. data/unicode-emoji.gemspec +3 -3
  38. metadata +20 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc9d3cd1548a2cc0861e05836cc4a5dcc0b4d4d631568c6bc89de094951fcb70
4
- data.tar.gz: 2172d2a31051731d234aeae48ba8c6b9cba15181fb8b1886d3a90252048937ab
3
+ metadata.gz: e3c7cc2671d256d8208b72d719384e7c13aaace4fec6b4919b92640e5336d87f
4
+ data.tar.gz: 9420777da4805787467c7f4eac1580f7179a6abf56e54b011c11640a50502b88
5
5
  SHA512:
6
- metadata.gz: ae1d2b65292537a89748d9ba1cc78c08723b37f6ccf5debb9fbeeb73720bd6e1c232f4f1be23d24699e3ebd2f1092646e2877b96366c0d4aa05e0f080aeadf99
7
- data.tar.gz: 0d422eeccd45a9430e5df8a5bac8964dca3bd5616eb51d6f6999c3372f8697316e210b04e586ab3a734b5e74b3881a309a2c4a6b0e78964de7513550dc3380ec
6
+ metadata.gz: f31a83c8a492affe4ec34f2f6e43fa20a44f98dae6c0855322d5c12bc924edc154f7fa4793ca28f299f5436c8981651a5777c2e7944db6dee9c3b99f5ec997ce
7
+ data.tar.gz: 54c1beecbcb673274bdf98169f11fa51bc746282418c3b9ca30385194ae384e3532b04bbca3ef196316c11d3344258b9059da1ea18630d8d90b51029c608565d
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
+ /spec/data/emoji-test.txt
data/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### 3.8.0
4
+
5
+ - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
6
+ for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
7
+ - Add specs running through `emoji-test.txt` and classify qualification statuses per regex
8
+ - Improve documentation and add detailed table about which regex has which features
9
+ - Native regexes: Use native Emoji props for Emoji text presentation
10
+ - Update CLDR to v46 (valid subdivisions)
11
+ - Further improvements (see commit log)
12
+
13
+ ### 3.7.0
14
+
15
+ - Bump required Ruby slightly to 2.5
16
+ - Introduce new `REGEX_POSSIBLE` which contains the regex described in
17
+ https://www.unicode.org/reports/tr51/#EBNF_and_Regex
18
+ - Fix that some valid subdivisions were not decompressed (`REGEX_VALID`)
19
+ - Be stricter about selection of tag characters in `REGEX_WELL_FORMED`
20
+ - Only U+E0030..U+E0039, U+E0061..U+E007A allowed
21
+ - Max tag sequence length
22
+ - Use native `/\p{RI}/` regex for regional indicators
23
+ - Separately autoload emoji list, so it can be loaded when other indexes
24
+ are not needed
25
+
3
26
  ### 3.6.0
4
27
 
5
28
  - `Unicode::Emoji::REGEX_TEXT` now matches non-emoji keycaps like "3⃣" (U+0033 U+20E3)
@@ -16,7 +39,7 @@
16
39
  ### 3.3.2
17
40
 
18
41
  - Update valid subdivisions to CLDR 43 (no changes)
19
- -> there won't be any new subdivision flags in Emoji
42
+ -> there won't be any new RGI subdivision flags in Emoji
20
43
 
21
44
  ### 3.3.1
22
45
 
data/README.md CHANGED
@@ -1,18 +1,15 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides Unicode Emoji data and regexes, incorporating the latest Unicode and Emoji standards.
3
+ Provides regular expressions to find Emoji in strings, incorporating the latest Unicode / Emoji standards.
4
4
 
5
- Also includes a categorized list of recommended Emoji.
5
+ Additional features:
6
6
 
7
- Emoji version: **16.0** (September 2024)
8
-
9
- CLDR version (used for sub-region flags): **45** (April 2024)
10
-
11
- Supported Rubies: **3.x**
7
+ - A categorized list of Emoji (RGI: Recommended for General Interchange)
8
+ - Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
12
9
 
13
- No longer supported Rubies, but might still work: **2.7**, **2.6**, **2.5**, **2.4**, **2.3**
10
+ Emoji version: **16.0** (September 2024)
14
11
 
15
- If you are stuck on an older Ruby version, checkout the latest [0.9 version](https://rubygems.org/gems/unicode-emoji/versions/0.9.3) of this gem.
12
+ CLDR version (used for sub-region flags): **46** (October 2024)
16
13
 
17
14
  ## Gemfile
18
15
 
@@ -20,16 +17,14 @@ If you are stuck on an older Ruby version, checkout the latest [0.9 version](htt
20
17
  gem "unicode-emoji"
21
18
  ```
22
19
 
23
- ## Usage
24
-
25
- ### Regex
20
+ ## Usage – Regex Matching
26
21
 
27
- The gem includes a bunch of Emoji regexes, which are compiled out of various Emoji Unicode data sources.
22
+ The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
28
23
 
29
24
  ```ruby
30
25
  require "unicode/emoji"
31
26
 
32
- string = "String which contains all kinds of emoji:
27
+ string = "String which contains all types of Emoji sequences:
33
28
 
34
29
  - Singleton Emoji: 😴
35
30
  - Textual singleton Emoji with Emoji variation: ▶️
@@ -38,62 +33,114 @@ string = "String which contains all kinds of emoji:
38
33
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
39
34
  - Keycap sequence: 2️⃣
40
35
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
41
-
42
36
  "
43
37
 
44
38
  string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
45
39
  ```
46
40
 
47
- #### Main Regexes
41
+ Depending on your exact use case, you can choose between multiple levels of Emoji detection:
48
42
 
49
- Matches (non-textual) Emoji of all kinds:
43
+ ### Main Regexes
50
44
 
51
45
  Regex | Description | Example Matches | Example Non-Matches
52
46
  ------------------------------|-------------|-----------------|--------------------
53
- `Unicode::Emoji::REGEX` | **Use this if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kind of *recommended* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
54
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kind of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`
55
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kind of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`
47
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
48
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`, `1⃣`
49
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits (except for: unqualified keycap sequences) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
56
51
 
57
- ##### Picking the Right Emoji Regex
52
+ #### Include Text Emoji
58
53
 
59
- - Usually you just want `REGEX` (RGI set)
60
- - If you want broader matching (e.g. more sub-regions), choose `REGEX_VALID`
61
- - If you even want to match for invalid sequences, too, use `REGEX_WELL_FORMED`
54
+ By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
62
55
 
63
- Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for details.
64
-
65
- Property | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed)
66
- ---------|-----------------------------|-----------------------|----------------------------------
67
- Region "🇵🇹" | Yes | Yes | Yes
68
- Region "🇵🇵" | No | No | Yes
69
- Tag Sequence "🏴󠁧󠁢󠁳󠁣󠁴󠁿" | Yes | Yes | Yes
70
- Tag Sequence "🏴󠁧󠁢󠁡󠁧󠁢󠁿" | No | Yes | Yes
71
- Tag Sequence "😴󠁧󠁢󠁡󠁡󠁡󠁿" | No | No | Yes
72
- ZWJ Sequence "🤾🏽‍♀️" | Yes | Yes | Yes
73
- ZWJ Sequence "🤠‍🤢" | No | Yes | Yes
74
-
75
- More info about valid vs. recommended Emoji in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
76
-
77
- #### Singleton Regexes
56
+ Regex | Description | Example Matches | Example Non-Matches
57
+ ------------------------------|-------------|-----------------|--------------------
58
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` | `🤾🏽‍♀`, `🏌‍♂️`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
59
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` | `🏻`, `🇵🇵`, `1`
60
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` | `🏻`, `1`
78
61
 
79
- Matches only simple one-codepoint (+ optional variation selector) Emoji:
62
+ #### Minimally-qualified and Unqualified Sequences
80
63
 
81
64
  Regex | Description | Example Matches | Example Non-Matches
82
65
  ------------------------------|-------------|-----------------|--------------------
83
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`
84
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digit 1) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`
66
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` | `🏌‍♂️`, `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
67
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
85
68
 
86
- #### Include Textual Emoji
69
+ [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
87
70
 
88
- By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes. However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
71
+ #### Singleton Regexes
72
+
73
+ Matches only simple one-codepoint (+ optional variation selector) Emoji:
89
74
 
90
75
  Regex | Description | Example Matches | Example Non-Matches
91
76
  ------------------------------|-------------|-----------------|--------------------
92
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶` | `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
93
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `😴︎`, `▶` | `🏻`, `🇵🇵`
94
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶` | `🏻`
95
-
96
- #### Extended Pictographic Regex
77
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
78
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
79
+
80
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text)
81
+
82
+ While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
83
+
84
+ ### Comparison
85
+
86
+ 1) Fully-qualified RGI Emoji ZWJ sequence
87
+ 2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
88
+ 3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
89
+ 4) Non-RGI Emoji ZWJ sequence
90
+ 5) Valid Region made from a pair of Regional Indicators
91
+ 6) Any Region made from a pair of Regional Indicators
92
+ 7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
93
+ 8) Valid Flag Emoji Tag Sequences (any known subdivision)
94
+ 9) Any Emoji Tag Sequences (any tag sequence with any base)
95
+ 10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
96
+ 11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
97
+ 12) Non-Emoji (unqualified) keycap
98
+
99
+ Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Re­gion | 6 Any Re­gion | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Key­cap
100
+ -|-|-|-|-|-|-|-|-|-|-|-|-
101
+ REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
102
+ REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
103
+ REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
104
+ REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
105
+ REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
106
+ REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
107
+ REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
108
+ REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
109
+ REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
110
+ REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
111
+ REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
112
+
113
+ ¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
114
+
115
+ See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
116
+
117
+ ### Picking the Right Emoji Regex
118
+
119
+ - Usually you just want `REGEX` (recommended Emoji set, RGI)
120
+ - Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
121
+ - If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
122
+ - If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
123
+ - Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
124
+ - And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might contain false positives; however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
125
+
126
+ ### Examples
127
+
128
+ Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
129
+ -----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
130
+ RGI ZWJ Sequence | 🤾🏽‍♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
131
+ RGI ZWJ Sequence MQE | 🤾🏽‍♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
132
+ Valid ZWJ Sequence, Non-RGI | 🤠‍🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
133
+ Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
134
+ Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
135
+ RGI Tag Sequence | 🏴󠁧󠁢󠁳󠁣󠁴󠁿 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
136
+ Valid Tag Sequence | 🏴󠁧󠁢󠁡󠁧󠁢󠁿 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
137
+ Well-formed Tag Sequence | 😴󠁧󠁢󠁡󠁡󠁡󠁿 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
138
+
139
+ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
140
+
141
+ More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
142
+
143
+ ### Extended Pictographic Regex
97
144
 
98
145
  `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
99
146
 
@@ -101,18 +148,13 @@ Regex | Description | Example Matches | Example Non-Matc
101
148
 
102
149
  See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
103
150
 
104
- #### Partial Regexes
105
-
106
- Matches potential Emoji parts (often, this is not what you want):
107
-
108
- Regex | Description | Example Matches | Example Non-Matches
109
- ------------------------------|-------------|-----------------|--------------------
110
- `Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
151
+ ### Partial Regexes
111
152
 
153
+ `Unicode::Emoji::REGEX_ANY`, same as `\p{Emoji}`. Deprecated: Will be removed or renamed in the future.
112
154
 
113
- ### List
155
+ ## Usage – List
114
156
 
115
- Use `Unicode::Emoji::LIST` or the list method to get a grouped (and ordered) list of Emoji:
157
+ Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
116
158
 
117
159
  ```ruby
118
160
  Unicode::Emoji.list.keys
@@ -125,13 +167,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
125
167
  => ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
126
168
  ```
127
169
 
128
- Please note that categories might change with future versions of the Emoji standard. This gem will issue warnings when attempting to retrieve old categories using the `#list` method.
170
+ Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
129
171
 
130
- A list of all Emoji can be found at [character.construction](https://character.construction).
172
+ A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
131
173
 
132
- ### Properties
174
+ ## Usage – Properties Data
133
175
 
134
- Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt) file:
176
+ Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
135
177
 
136
178
  ```ruby
137
179
  require "unicode/emoji"
@@ -143,7 +185,7 @@ Unicode::Emoji.properties "☝" # => ["Emoji", "Emoji_Modifier_Base"]
143
185
 
144
186
  - [Unicode® Technical Standard #51](https://www.unicode.org/reports/tr51/)
145
187
  - [Emoji categories](https://unicode.org/emoji/charts/emoji-ordering.html)
146
- - Ruby gem which displays [Emoji sequence names](https://github.com/janlelis/unicode-sequence_name) (here [as website](https://character.construction/name))
188
+ - Ruby gem which displays [Emoji sequence names](https://github.com/janlelis/unicode-sequence_name) ([as website](https://character.construction/name))
147
189
  - Part of [unicode-x](https://github.com/janlelis/unicode-x)
148
190
 
149
191
  ## MIT
data/Rakefile CHANGED
@@ -28,14 +28,18 @@ task :irb do
28
28
  end
29
29
 
30
30
  # # #
31
- # Run Specs
31
+ # Run specs
32
32
 
33
33
  desc "#{gemspec.name} | Spec"
34
34
  task :spec do
35
- ruby "spec/unicode_emoji_spec.rb"
35
+ ruby File.join("spec", "*_spec.rb")
36
36
  end
37
37
  task default: :spec
38
38
 
39
+
40
+ # # #
41
+ # Generate regex
42
+
39
43
  desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
40
44
  task :generate_constants do
41
45
  load "data/generate_constants.rb", true
Binary file
@@ -68,10 +68,10 @@ def pack_and_join(ords)
68
68
  end
69
69
  end
70
70
 
71
- def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
71
+ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
72
  emoji_presentation_sequence = \
73
73
  join(
74
- pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
74
+ text_presentation + pack(EMOJI_VARIATION_SELECTOR),
75
75
  emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
76
76
  )
77
77
 
@@ -79,14 +79,20 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
79
79
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
80
 
81
81
  text_keycap_sequence = \
82
- join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) })
82
+ pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
83
 
84
84
  text_presentation_sequence = \
85
85
  join(
86
- pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
86
+ text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
87
87
  emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
88
88
  )
89
89
 
90
+ text_emoji = \
91
+ join(
92
+ "(?!" + emoji_component + ")" + text_presentation_sequence,
93
+ text_keycap_sequence,
94
+ )
95
+
90
96
  emoji_modifier_sequence = \
91
97
  emoji_modifier_base + emoji_modifier
92
98
 
@@ -97,27 +103,13 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
97
103
  pack_and_join(VALID_REGION_FLAGS)
98
104
 
99
105
  emoji_well_formed_flag_sequence = \
100
- "(?:" +
101
- pack_and_join(REGIONAL_INDICATORS) +
102
- pack_and_join(REGIONAL_INDICATORS) +
103
- ")"
104
-
105
- emoji_valid_core_sequence = \
106
- join(
107
- # emoji_character,
108
- emoji_keycap_sequence,
109
- emoji_modifier_sequence,
110
- non_component_emoji_presentation_sequence,
111
- emoji_valid_flag_sequence,
112
- )
106
+ '\p{RI}{2}'
113
107
 
114
- emoji_well_formed_core_sequence = \
108
+ emoji_core_sequence = \
115
109
  join(
116
- # emoji_character,
117
110
  emoji_keycap_sequence,
118
111
  emoji_modifier_sequence,
119
112
  non_component_emoji_presentation_sequence,
120
- emoji_well_formed_flag_sequence,
121
113
  )
122
114
 
123
115
  # Sort to make sure complex sequences match first
@@ -128,7 +120,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
128
120
  "(?:" +
129
121
  pack(EMOJI_TAG_BASE_FLAG) +
130
122
  "(?:" + VALID_SUBDIVISIONS.sort_by(&:length).reverse.map{ |sd|
131
- Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))
123
+ sd.tr("\u{30}-\u{39}\u{61}-\u{7A}", "\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}")
132
124
  }.join("|") + ")" +
133
125
  pack(CANCEL_TAG) +
134
126
  ")"
@@ -139,7 +131,7 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
139
131
  non_component_emoji_presentation_sequence,
140
132
  emoji_modifier_sequence,
141
133
  ) +
142
- pack_and_join(TAGS) + "+" +
134
+ pack_and_join(SPEC_TAGS) + "{1,30}" +
143
135
  pack(CANCEL_TAG) +
144
136
  ")"
145
137
 
@@ -147,6 +139,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
147
139
  emoji_rgi_zwj_sequence = \
148
140
  pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
149
141
 
142
+ # FQE+MQE: Make VS16 optional after ZWJ has appeared
143
+ emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
144
+ /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
145
+ pack(EMOJI_VARIATION_SELECTOR) + "?"
146
+ )
147
+
148
+ # FQE+MQE+UQE: Make all VS16 optional
149
+ emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
150
+ pack(EMOJI_VARIATION_SELECTOR),
151
+ pack(EMOJI_VARIATION_SELECTOR) + "?",
152
+ )
153
+
150
154
  emoji_valid_zwj_element = \
151
155
  join(
152
156
  emoji_modifier_sequence,
@@ -163,58 +167,126 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
163
167
  join(
164
168
  emoji_rgi_zwj_sequence,
165
169
  emoji_rgi_tag_sequence,
166
- emoji_valid_core_sequence,
170
+ emoji_valid_flag_sequence,
171
+ emoji_core_sequence,
172
+ )
173
+
174
+ emoji_rgi_sequence_include_text = \
175
+ join(
176
+ emoji_rgi_zwj_sequence,
177
+ emoji_rgi_tag_sequence,
178
+ emoji_valid_flag_sequence,
179
+ emoji_core_sequence,
180
+ text_emoji,
181
+ )
182
+
183
+ emoji_rgi_include_mqe_sequence = \
184
+ join(
185
+ emoji_rgi_include_mqe_zwj_sequence,
186
+ emoji_rgi_tag_sequence,
187
+ emoji_valid_flag_sequence,
188
+ emoji_core_sequence,
189
+ )
190
+
191
+ emoji_rgi_include_mqe_uqe_sequence = \
192
+ join(
193
+ emoji_rgi_include_mqe_uqe_zwj_sequence,
194
+ text_emoji, # also uqe
195
+ emoji_rgi_tag_sequence,
196
+ emoji_valid_flag_sequence,
197
+ emoji_core_sequence,
167
198
  )
168
199
 
169
200
  emoji_valid_sequence = \
170
201
  join(
171
202
  emoji_valid_zwj_sequence,
172
203
  emoji_valid_tag_sequence,
173
- emoji_valid_core_sequence,
204
+ emoji_valid_flag_sequence,
205
+ emoji_core_sequence,
206
+ )
207
+
208
+ emoji_valid_sequence_include_text = \
209
+ join(
210
+ emoji_valid_zwj_sequence,
211
+ emoji_valid_tag_sequence,
212
+ emoji_valid_flag_sequence,
213
+ emoji_core_sequence,
214
+ text_emoji,
174
215
  )
175
216
 
176
217
  emoji_well_formed_sequence = \
177
218
  join(
178
219
  emoji_valid_zwj_sequence,
179
220
  emoji_well_formed_tag_sequence,
180
- emoji_well_formed_core_sequence,
221
+ emoji_well_formed_flag_sequence,
222
+ emoji_core_sequence,
223
+ )
224
+
225
+ emoji_well_formed_sequence_include_text = \
226
+ join(
227
+ emoji_valid_zwj_sequence,
228
+ emoji_well_formed_tag_sequence,
229
+ emoji_well_formed_flag_sequence,
230
+ emoji_core_sequence,
231
+ text_emoji,
232
+ )
233
+
234
+ emoji_possible_modification = \
235
+ join(
236
+ emoji_modifier,
237
+ pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?",
238
+ "[󠀠-󠁾]+󠁿" # raw tags
239
+ )
240
+
241
+ emoji_possible_zwj_element = \
242
+ join(
243
+ emoji_well_formed_flag_sequence,
244
+ emoji_character + emoji_possible_modification + "?"
181
245
  )
182
246
 
247
+ emoji_possible = \
248
+ emoji_possible_zwj_element + "(?:" + pack(ZWJ) + emoji_possible_zwj_element + ")*"
249
+
183
250
  regexes = {}
184
251
 
185
252
  # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
186
253
  regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
187
254
 
255
+ # rgi + singleton text
256
+ regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
257
+
258
+ # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
259
+ # Also make VS16 optional if not at first emoji character
260
+ regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
261
+
262
+ # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
263
+ # Also make VS16 optional even at first emoji character
264
+ regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
265
+
188
266
  # Matches basic singleton emoji and all kind of valid sequences
189
267
  regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
190
268
 
269
+ # valid + singleton text
270
+ regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
271
+
191
272
  # Matches basic singleton emoji and all kind of sequences
192
273
  regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
274
+
275
+ # well-formed + singleton text
276
+ regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
193
277
 
194
- # Matches only basic single, non-textual emoji
195
- # Ignores "components" like modifiers or simple digits
196
- regexes[:REGEX_BASIC] = Regexp.compile(
197
- "(?!" + emoji_component + ")" + emoji_presentation_sequence
198
- )
278
+ # Quick test which might lead to false positives
279
+ # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
280
+ regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
199
281
 
200
- # Matches only basic single, textual emoji
201
- # Ignores "components" like modifiers or simple digits
202
- regexes[:REGEX_TEXT] = Regexp.compile(
203
- join(
204
- "(?!" + emoji_component + ")" + text_presentation_sequence,
205
- text_keycap_sequence,
206
- )
207
- )
282
+ # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
283
+ regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
208
284
 
209
- # Matches any emoji-related codepoint - Use with caution (returns partial matches)
210
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
211
-
212
- # Combined REGEXes which also match for TEXTUAL emoji
213
- regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
285
+ # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
286
+ regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
214
287
 
215
- regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
216
-
217
- regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
288
+ # Same as \p{Emoji} - to be removed or renamed
289
+ regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
218
290
 
219
291
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
220
292
 
@@ -229,6 +301,7 @@ regexes = compile(
229
301
  emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
230
302
  emoji_component: pack_and_join(EMOJI_COMPONENT),
231
303
  emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
304
+ text_presentation: pack_and_join(TEXT_PRESENTATION),
232
305
  picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
233
306
  picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
234
307
  )
@@ -240,6 +313,7 @@ native_regexes = compile(
240
313
  emoji_modifier_base: "\\p{EBase}",
241
314
  emoji_component: "\\p{EComp}",
242
315
  emoji_presentation: "\\p{EPres}",
316
+ text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
243
317
  picto: "\\p{ExtPict}",
244
318
  picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
245
319
  )
@@ -2,12 +2,13 @@
2
2
 
3
3
  module Unicode
4
4
  module Emoji
5
- VERSION = "3.6.0"
5
+ VERSION = "3.8.0"
6
6
  EMOJI_VERSION = "16.0"
7
7
  CLDR_VERSION = "45"
8
8
  DATA_DIRECTORY = File.expand_path('../../../data', __dir__).freeze
9
9
  INDEX_FILENAME = (DATA_DIRECTORY + "/emoji.marshal.gz").freeze
10
10
 
11
+ # Unicode properties, see https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt
11
12
  PROPERTY_NAMES = {
12
13
  E: "Emoji",
13
14
  B: "Emoji_Modifier_Base",
@@ -17,13 +18,28 @@ module Unicode
17
18
  X: "Extended_Pictographic",
18
19
  }.freeze
19
20
 
21
+ # Variation Selector 16 (VS16), enables emoji presentation mode for preceding codepoint
20
22
  EMOJI_VARIATION_SELECTOR = 0xFE0F
23
+
24
+ # Variation Selector 15 (VS15), enables text presentation mode for preceding codepoint
21
25
  TEXT_VARIATION_SELECTOR = 0xFE0E
26
+
27
+ # First codepoint of tag-based subdivision flags
22
28
  EMOJI_TAG_BASE_FLAG = 0x1F3F4
29
+
30
+ # Last codepoint of tag-based subdivision flags
23
31
  CANCEL_TAG = 0xE007F
24
- TAGS = [*0xE0020..0xE007E].freeze
32
+
33
+ # Tag characters allowed in tag-based subdivision flags
34
+ SPEC_TAGS = [*0xE0030..0xE0039, *0xE0061..0xE007A].freeze
35
+
36
+ # Combining Enclosing Keycap character
25
37
  EMOJI_KEYCAP_SUFFIX = 0x20E3
38
+
39
+ # Zero-width-joiner to enable combination of multiple Emoji in a sequence
26
40
  ZWJ = 0x200D
41
+
42
+ # Two regional indicators make up a region
27
43
  REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF].freeze
28
44
  end
29
45
  end