unicode-emoji 3.7.0 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rake_tasks +1 -0
  4. data/CHANGELOG.md +24 -1
  5. data/Gemfile.lock +3 -3
  6. data/README.md +111 -58
  7. data/Rakefile +6 -2
  8. data/data/emoji.marshal.gz +0 -0
  9. data/data/generate_constants.rb +123 -43
  10. data/lib/unicode/emoji/constants.rb +22 -2
  11. data/lib/unicode/emoji/generated/regex.rb +1 -1
  12. data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
  14. data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
  15. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
  16. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  17. data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
  18. data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
  19. data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
  20. data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
  21. data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
  22. data/lib/unicode/emoji/generated/regex_text.rb +1 -1
  23. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  24. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  25. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  26. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  27. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
  30. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
  31. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
  32. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  33. data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
  34. data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
  35. data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
  36. data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
  37. data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
  38. data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
  39. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  40. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  41. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  42. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  43. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  44. data/lib/unicode/emoji/lazy_constants.rb +36 -0
  45. data/lib/unicode/emoji/list.rb +3 -0
  46. data/lib/unicode/emoji.rb +39 -6
  47. data/spec/data/.keep +0 -0
  48. data/spec/data/emoji-test.txt +5331 -0
  49. data/spec/emoji_test_txt_spec.rb +181 -0
  50. data/spec/unicode_emoji_spec.rb +127 -14
  51. metadata +24 -4
  52. data/lib/unicode/emoji/generated/regex_any.rb +0 -8
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1e5b069f3f3d2de97f31a0aa83568454b751330b8e86392d28baa7906a6d6302
4
- data.tar.gz: afaed5c343b5cdb5f235bf1e40fd78d221b0efa3097517f015db3c5a58e96775
3
+ metadata.gz: 5cc77126aeb986e5645ebce97fc3e1be5835b9e5adb9764af980e0175d7d9284
4
+ data.tar.gz: ffd47af7c556ee26951f6c5f30122ecd44520f1a3aa6856c526c4e4dec050e77
5
5
  SHA512:
6
- metadata.gz: e9c6726eee2f48cd0c51937bb0fcd6f0009b7fe0eb9e973bef713ac063d668c4d520f490a46bdf944892dcac2854b17ca72779593257f3edc8f0dfd445f3a729
7
- data.tar.gz: 7c1747073a4ec43ea4ea24ff304eb2cf3deb58918f2757cd4b4daee922b15d5478bd979585b60984366d21787b3d5183bcd5fc8cd3cdea6aa5ceedb9860e19d6
6
+ metadata.gz: 810730ae1a796edc6c80c87d0d4e7c0a5bfdbda45c43f57dfde3a05afee7ee5df8d43f378c5c0de34d9e9bca8e9edfaee67261d16057a0aacc79f50c39d703d2
7
+ data.tar.gz: f300c259b7389d60130b9f910b1301f31bf60d47c32fd16f86935cdf63c740bb44c28ab8fd38bc643f35cd6b4d7e9a76583d3be156683404439069ccf58d8250
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
+ /spec/data/emoji-test.txt
data/.rake_tasks CHANGED
@@ -1,3 +1,4 @@
1
+ dependencies...
1
2
  gem
2
3
  generate_constants
3
4
  irb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### 4.0.0
4
+
5
+ - **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
6
+ They were previously considered to be invalid partial Emoji; however, since they are supposed to be
7
+ displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
8
+ - **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
9
+ - Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
10
+ directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
11
+ For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
12
+ Also see README for a table listing the regexes that match Emoji properties.
13
+ - Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
14
+ - Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
15
+
16
+ ### 3.8.0
17
+
18
+ - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE`, which allow matching
19
+ minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
20
+ - Add specs running through `emoji-test.txt` and classify qualification statuses per regex
21
+ - Improve documentation and add detailed table about which regex has which features
22
+ - Native regexes: Use native Emoji props for Emoji text presentation
23
+ - Update CLDR to v46 (valid subdivisions)
24
+ - Further improvements (see commit log)
25
+
3
26
  ### 3.7.0
4
27
 
5
28
  - Bump required Ruby slightly to 2.5
@@ -29,7 +52,7 @@
29
52
  ### 3.3.2
30
53
 
31
54
  - Update valid subdivisions to CLDR 43 (no changes)
32
- -> there won't be any new subdivision flags in Emoji
55
+ -> there won't be any new RGI subdivision flags in Emoji
33
56
 
34
57
  ### 3.3.1
35
58
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-emoji (3.5.0)
4
+ unicode-emoji (4.0.0)
5
5
  unicode-version (~> 1.0)
6
6
 
7
7
  GEM
@@ -20,7 +20,7 @@ GEM
20
20
  reline (0.3.8)
21
21
  io-console (~> 0.5)
22
22
  stringio (3.0.8)
23
- unicode-version (1.3.0)
23
+ unicode-version (1.4.0)
24
24
 
25
25
  PLATFORMS
26
26
  ruby
@@ -32,4 +32,4 @@ DEPENDENCIES
32
32
  unicode-emoji!
33
33
 
34
34
  BUNDLED WITH
35
- 2.2.22
35
+ 2.5.21
data/README.md CHANGED
@@ -1,15 +1,16 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides regular expressions to find Emoji in strings, incorporating the latest Unicode and Emoji standards.
3
+ Provides various sophisticated regular expressions to work with Emoji in strings,
4
+ incorporating the latest Unicode / Emoji standards.
4
5
 
5
6
  Additional features:
6
7
 
7
- - A categorized list of recommended Emoji
8
+ - A categorized list of Emoji (RGI: Recommended for General Interchange)
8
9
  - Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
9
10
 
10
11
  Emoji version: **16.0** (September 2024)
11
12
 
12
- CLDR version (used for sub-region flags): **45** (April 2024)
13
+ CLDR version (used for sub-region flags): **46** (October 2024)
13
14
 
14
15
  ## Gemfile
15
16
 
@@ -17,101 +18,153 @@ CLDR version (used for sub-region flags): **45** (April 2024)
17
18
  gem "unicode-emoji"
18
19
  ```
19
20
 
20
- ## Usage
21
-
22
- ### Regex
21
+ ## Usage – Regex Matching
23
22
 
24
23
  The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
25
24
 
26
25
  ```ruby
27
26
  require "unicode/emoji"
28
27
 
29
- string = "String which contains all kinds of emoji:
28
+ string = "String which contains all types of Emoji sequences:
30
29
 
31
- - Singleton Emoji: 😴
32
- - Textual singleton Emoji with Emoji variation: ▶️
30
+ - Basic Emoji: 😴
31
+ - Textual Emoji with Emoji variation (VS16): ▶️
33
32
  - Emoji with skin tone modifier: 🛌🏽
34
33
  - Region flag: 🇵🇹
35
34
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
36
35
  - Keycap sequence: 2️⃣
36
+ - Skin tone modifier: 🏻
37
37
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
38
-
39
38
  "
40
39
 
41
40
  string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
42
41
  ```
43
42
 
44
- #### Main Regexes
43
+ Depending on your exact use case, you can choose between multiple levels of Emoji detection:
45
44
 
46
- There are multiple levels of Emoji detection:
45
+ ### Main Regexes
47
46
 
48
47
  Regex | Description | Example Matches | Example Non-Matches
49
48
  ------------------------------|-------------|-----------------|--------------------
50
- `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
51
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`
52
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`
53
- `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` |
54
-
55
- ##### Picking the Right Emoji Regex
56
-
57
- - Usually you just want `REGEX` (RGI set)
58
- - If you want broader matching (any ZJW sequences, more sub-region flags), choose `REGEX_VALID`
59
- - Even brolader is `REGEX_WELL_FORMED`, which will also match any region flag and any tag sequence
60
- - And then there is `REGEX_POSSIBLE` , which is a quick check for possible Emoji, which might contain false positives, [suggested in the Unicode Standard](https://www.unicode.org/reports/tr51/#EBNF_and_Regex)
61
-
62
- Property | Escaped | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed) | `REGEX_POSSIBLE`
63
- ---------|---------|-----------------------------|-----------------------|-----------------------------------|-----------------
64
- Region "🇵🇹" | `\u{1F1F5 1F1F9}` | Yes | Yes | Yes | Yes
65
- Region "🇵🇵" | `\u{1F1F5 1F1F5}` | No | No | Yes | Yes
66
- Tag Sequence "🏴󠁧󠁢󠁳󠁣󠁴󠁿" | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | Yes | Yes | Yes | Yes
67
- Tag Sequence "🏴󠁧󠁢󠁡󠁧󠁢󠁿" | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | No | Yes | Yes | Yes
68
- Tag Sequence "😴󠁧󠁢󠁡󠁡󠁡󠁿" | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | No | No | Yes | Yes
69
- ZWJ Sequence "🤾🏽‍♀️" | `\u{1F93E 1F3FD 200D 2640 FE0F}` | Yes | Yes | Yes | Yes
70
- ZWJ Sequence "🤠‍🤢" | `\u{1F920 200D 1F922}` | No | Yes | Yes | Yes
49
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🏻` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
51
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
52
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
71
53
 
72
- Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
54
+ #### Include Text Emoji
73
55
 
74
- More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
56
+ By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
75
57
 
76
- #### Singleton Regexes
58
+ Regex | Description | Example Matches | Example Non-Matches
59
+ ------------------------------|-------------|-----------------|--------------------
60
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽‍♀`, `🏌‍♂️`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
61
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
62
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
77
63
 
78
- Matches only simple one-codepoint (+ optional variation selector) Emoji:
64
+ #### Minimally-qualified and Unqualified Sequences
79
65
 
80
66
  Regex | Description | Example Matches | Example Non-Matches
81
67
  ------------------------------|-------------|-----------------|--------------------
82
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `1`
83
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `1`
68
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏻` | `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
69
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
84
70
 
85
- #### Include Textual Emoji
71
+ [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
86
72
 
87
- By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
73
+ #### Singleton Regexes
74
+
75
+ Matches only simple one-codepoint (+ optional variation selector) Emoji:
88
76
 
89
77
  Regex | Description | Example Matches | Example Non-Matches
90
78
  ------------------------------|-------------|-----------------|--------------------
91
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶` | `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
92
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `😴︎`, `▶` | `🏻`, `🇵🇵`
93
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶` | `🏻`
79
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
80
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
81
+
82
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches visual Emoji components (skin tone modifiers and hair components).
83
+
84
+ While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
85
+
86
+ ### Comparison
87
+
88
+ 1) Fully-qualified RGI Emoji ZWJ sequence
89
+ 2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
90
+ 3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
91
+ 4) Non-RGI Emoji ZWJ sequence
92
+ 5) Valid Region made from a pair of Regional Indicators
93
+ 6) Any Region made from a pair of Regional Indicators
94
+ 7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
95
+ 8) Valid Flag Emoji Tag Sequences (any known subdivision)
96
+ 9) Any Emoji Tag Sequences (any tag sequence with any base)
97
+ 10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
98
+ 11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
99
+ 12) Non-Emoji (unqualified) keycap
100
+
101
+ Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Re­gion | 6 Any Re­gion | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Key­cap
102
+ -|-|-|-|-|-|-|-|-|-|-|-|-
103
+ REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
104
+ REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
105
+ REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
106
+ REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
107
+ REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
108
+ REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
109
+ REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
110
+ REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
111
+ REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
112
+ REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
113
+ REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
114
+
115
+ ¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
116
+
117
+ See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
118
+
119
+ ### Picking the Right Emoji Regex
120
+
121
+ - Usually you just want `REGEX` (recommended Emoji set, RGI)
122
+ - Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
123
+ - If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
124
+ - If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
125
+ - Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
126
+ - And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might contain false positives; however, the regex is less complex and [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
127
+
128
+ ### Examples
129
+
130
+ Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
131
+ -----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
132
+ RGI ZWJ Sequence | 🤾🏽‍♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
133
+ RGI ZWJ Sequence MQE | 🤾🏽‍♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
134
+ Valid ZWJ Sequence, Non-RGI | 🤠‍🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
135
+ Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
136
+ Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
137
+ RGI Tag Sequence | 🏴󠁧󠁢󠁳󠁣󠁴󠁿 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
138
+ Valid Tag Sequence | 🏴󠁧󠁢󠁡󠁧󠁢󠁿 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
139
+ Well-formed Tag Sequence | 😴󠁧󠁢󠁡󠁡󠁡󠁿 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
94
140
 
95
- #### Extended Pictographic Regex
141
+ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
96
142
 
97
- `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
143
+ More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
98
144
 
99
- `Unicode::Emoji::REGEX_PICTO_NO_EMOJI` matches single codepoints with the **Extended_Pictographic** property, but excludes Emoji characters.
145
+ ### Emoji Property Regexes
100
146
 
101
- See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
147
+ Ruby includes native regex Emoji properties, as listed in the following table. You can also opt in to use the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
102
148
 
103
- #### Partial Regexes
149
+ Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
150
+ ---------------------------------------------------|------------------------------------------
151
+ `Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
152
+ `Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
153
+ `Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
154
+ `Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
155
+ `Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
104
156
 
105
- Matches potential Emoji parts (often, this is not what you want):
157
+ #### Extended Pictographic Regex
106
158
 
107
- Regex | Description | Example Matches | Example Non-Matches
108
- ------------------------------|-------------|-----------------|--------------------
109
- `Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
159
+ `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
110
160
 
161
+ `Unicode::Emoji::REGEX_PICTO_NO_EMOJI` matches single codepoints with the **Extended_Pictographic** property, but excludes Emoji characters.
162
+
163
+ See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
111
164
 
112
- ### List
165
+ ## Usage – List
113
166
 
114
- Use `Unicode::Emoji::LIST` or the list method to get a grouped (and ordered) list of Emoji:
167
+ Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
115
168
 
116
169
  ```ruby
117
170
  Unicode::Emoji.list.keys
@@ -124,13 +177,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
124
177
  => ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
125
178
  ```
126
179
 
127
- Please note that categories might change with future versions of the Emoji standard. This gem will issue warnings when attempting to retrieve old categories using the `#list` method.
180
+ Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
128
181
 
129
182
  A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
130
183
 
131
- ### Properties
184
+ ## Usage – Properties Data
132
185
 
133
- Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
186
+ Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
134
187
 
135
188
  ```ruby
136
189
  require "unicode/emoji"
data/Rakefile CHANGED
@@ -28,14 +28,18 @@ task :irb do
28
28
  end
29
29
 
30
30
  # # #
31
- # Run Specs
31
+ # Run specs
32
32
 
33
33
  desc "#{gemspec.name} | Spec"
34
34
  task :spec do
35
- ruby "spec/unicode_emoji_spec.rb"
35
+ ruby File.join("spec", "*_spec.rb")
36
36
  end
37
37
  task default: :spec
38
38
 
39
+
40
+ # # #
41
+ # Generate regex
42
+
39
43
  desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
40
44
  task :generate_constants do
41
45
  load "data/generate_constants.rb", true
Binary file
@@ -68,25 +68,39 @@ def pack_and_join(ords)
68
68
  end
69
69
  end
70
70
 
71
- def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
71
+ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
+ visual_component = pack_and_join(VISUAL_COMPONENT)
73
+
72
74
  emoji_presentation_sequence = \
73
75
  join(
74
- pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
76
+ text_presentation + pack(EMOJI_VARIATION_SELECTOR),
75
77
  emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
76
78
  )
77
79
 
78
80
  non_component_emoji_presentation_sequence = \
79
81
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
82
 
83
+ basic_emoji = \
84
+ join(
85
+ non_component_emoji_presentation_sequence,
86
+ visual_component,
87
+ )
88
+
81
89
  text_keycap_sequence = \
82
- join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) })
90
+ pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
91
 
84
92
  text_presentation_sequence = \
85
93
  join(
86
- pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
94
+ text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
87
95
  emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
88
96
  )
89
97
 
98
+ text_emoji = \
99
+ join(
100
+ "(?!" + emoji_component + ")" + text_presentation_sequence,
101
+ text_keycap_sequence,
102
+ )
103
+
90
104
  emoji_modifier_sequence = \
91
105
  emoji_modifier_base + emoji_modifier
92
106
 
@@ -99,22 +113,11 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
99
113
  emoji_well_formed_flag_sequence = \
100
114
  '\p{RI}{2}'
101
115
 
102
- emoji_valid_core_sequence = \
103
- join(
104
- # emoji_character,
105
- emoji_keycap_sequence,
106
- emoji_modifier_sequence,
107
- non_component_emoji_presentation_sequence,
108
- emoji_valid_flag_sequence,
109
- )
110
-
111
- emoji_well_formed_core_sequence = \
116
+ emoji_core_sequence = \
112
117
  join(
113
- # emoji_character,
114
118
  emoji_keycap_sequence,
115
119
  emoji_modifier_sequence,
116
120
  non_component_emoji_presentation_sequence,
117
- emoji_well_formed_flag_sequence,
118
121
  )
119
122
 
120
123
  # Sort to make sure complex sequences match first
@@ -144,6 +147,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
144
147
  emoji_rgi_zwj_sequence = \
145
148
  pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
146
149
 
150
+ # FQE+MQE: Make VS16 optional after ZWJ has appeared
151
+ emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
152
+ /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
153
+ pack(EMOJI_VARIATION_SELECTOR) + "?"
154
+ )
155
+
156
+ # FQE+MQE+UQE: Make all VS16 optional
157
+ emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
158
+ pack(EMOJI_VARIATION_SELECTOR),
159
+ pack(EMOJI_VARIATION_SELECTOR) + "?",
160
+ )
161
+
147
162
  emoji_valid_zwj_element = \
148
163
  join(
149
164
  emoji_modifier_sequence,
@@ -160,21 +175,76 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
160
175
  join(
161
176
  emoji_rgi_zwj_sequence,
162
177
  emoji_rgi_tag_sequence,
163
- emoji_valid_core_sequence,
178
+ emoji_valid_flag_sequence,
179
+ emoji_core_sequence,
180
+ visual_component,
181
+ )
182
+
183
+ emoji_rgi_sequence_include_text = \
184
+ join(
185
+ emoji_rgi_zwj_sequence,
186
+ emoji_rgi_tag_sequence,
187
+ emoji_valid_flag_sequence,
188
+ emoji_core_sequence,
189
+ visual_component,
190
+ text_emoji,
191
+ )
192
+
193
+ emoji_rgi_include_mqe_sequence = \
194
+ join(
195
+ emoji_rgi_include_mqe_zwj_sequence,
196
+ emoji_rgi_tag_sequence,
197
+ emoji_valid_flag_sequence,
198
+ emoji_core_sequence,
199
+ visual_component,
200
+ )
201
+
202
+ emoji_rgi_include_mqe_uqe_sequence = \
203
+ join(
204
+ emoji_rgi_include_mqe_uqe_zwj_sequence,
205
+ text_emoji, # also uqe
206
+ emoji_rgi_tag_sequence,
207
+ emoji_valid_flag_sequence,
208
+ emoji_core_sequence,
209
+ visual_component,
164
210
  )
165
211
 
166
212
  emoji_valid_sequence = \
167
213
  join(
168
214
  emoji_valid_zwj_sequence,
169
215
  emoji_valid_tag_sequence,
170
- emoji_valid_core_sequence,
216
+ emoji_valid_flag_sequence,
217
+ emoji_core_sequence,
218
+ visual_component,
219
+ )
220
+
221
+ emoji_valid_sequence_include_text = \
222
+ join(
223
+ emoji_valid_zwj_sequence,
224
+ emoji_valid_tag_sequence,
225
+ emoji_valid_flag_sequence,
226
+ emoji_core_sequence,
227
+ visual_component,
228
+ text_emoji,
171
229
  )
172
230
 
173
231
  emoji_well_formed_sequence = \
174
232
  join(
175
233
  emoji_valid_zwj_sequence,
176
234
  emoji_well_formed_tag_sequence,
177
- emoji_well_formed_core_sequence,
235
+ emoji_well_formed_flag_sequence,
236
+ emoji_core_sequence,
237
+ visual_component,
238
+ )
239
+
240
+ emoji_well_formed_sequence_include_text = \
241
+ join(
242
+ emoji_valid_zwj_sequence,
243
+ emoji_well_formed_tag_sequence,
244
+ emoji_well_formed_flag_sequence,
245
+ emoji_core_sequence,
246
+ visual_component,
247
+ text_emoji,
178
248
  )
179
249
 
180
250
  emoji_possible_modification = \
@@ -198,45 +268,53 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
198
268
  # Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
199
269
  regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
200
270
 
271
+ # rgi + singleton text
272
+ regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
273
+
274
+ # Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
275
+ # Also make VS16 optional if not at first emoji character
276
+ regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
277
+
278
+ # Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
279
+ # Also make VS16 optional even at first emoji character
280
+ regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
281
+
201
282
  # Matches basic singleton emoji and all kinds of valid sequences
202
283
  regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
203
284
 
285
+ # valid + singleton text
286
+ regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
287
+
204
288
  # Matches basic singleton emoji and all kinds of sequences
205
289
  regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
290
+
291
+ # well-formed + singleton text
292
+ regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
206
293
 
207
294
  # Quick test which might lead to false positives
208
295
  # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
209
296
  regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
210
297
 
211
- # Matches only basic single, non-textual emoji
212
- # Ignores "components" like modifiers or simple digits
213
- regexes[:REGEX_BASIC] = Regexp.compile(
214
- "(?!" + emoji_component + ")" + emoji_presentation_sequence
215
- )
298
+ # Matches only basic single, non-textual emoji, ignores some components like simple digits
299
+ regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
216
300
 
217
- # Matches only basic single, textual emoji
218
- # Ignores "components" like modifiers or simple digits
219
- regexes[:REGEX_TEXT] = Regexp.compile(
220
- join(
221
- "(?!" + emoji_component + ")" + text_presentation_sequence,
222
- text_keycap_sequence,
223
- )
224
- )
225
-
226
- # Matches any emoji-related codepoint - Use with caution (returns partial matches)
227
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
301
+ # Matches only basic single, textual emoji, ignores components like modifiers or simple digits
302
+ regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
228
303
 
229
- # Combined REGEXes which also match for TEXTUAL emoji
230
- regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
231
-
232
- regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
233
-
234
- regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
304
+ # Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
305
+ regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
306
+ regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
307
+ regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
308
+ regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
309
+ regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
235
310
 
311
+ # Same goes for ExtendedPictographic
236
312
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
237
-
238
313
  regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
239
314
 
315
+ # Emoji keycaps
316
+ regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
317
+
240
318
  regexes
241
319
  end
242
320
 
@@ -246,6 +324,7 @@ regexes = compile(
246
324
  emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
247
325
  emoji_component: pack_and_join(EMOJI_COMPONENT),
248
326
  emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
327
+ text_presentation: pack_and_join(TEXT_PRESENTATION),
249
328
  picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
250
329
  picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
251
330
  )
@@ -257,7 +336,8 @@ native_regexes = compile(
257
336
  emoji_modifier_base: "\\p{EBase}",
258
337
  emoji_component: "\\p{EComp}",
259
338
  emoji_presentation: "\\p{EPres}",
339
+ text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
260
340
  picto: "\\p{ExtPict}",
261
- picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
341
+ picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
262
342
  )
263
343
  write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))