unicode-emoji 3.7.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rake_tasks +1 -0
  4. data/CHANGELOG.md +24 -1
  5. data/Gemfile.lock +3 -3
  6. data/README.md +111 -58
  7. data/Rakefile +6 -2
  8. data/data/emoji.marshal.gz +0 -0
  9. data/data/generate_constants.rb +123 -43
  10. data/lib/unicode/emoji/constants.rb +22 -2
  11. data/lib/unicode/emoji/generated/regex.rb +1 -1
  12. data/lib/unicode/emoji/generated/regex_basic.rb +1 -1
  13. data/lib/unicode/emoji/generated/regex_emoji_keycap.rb +8 -0
  14. data/lib/unicode/emoji/generated/regex_include_mqe.rb +8 -0
  15. data/lib/unicode/emoji/generated/regex_include_mqe_uqe.rb +8 -0
  16. data/lib/unicode/emoji/generated/regex_include_text.rb +1 -1
  17. data/lib/unicode/emoji/generated/regex_prop_component.rb +8 -0
  18. data/lib/unicode/emoji/generated/regex_prop_emoji.rb +8 -0
  19. data/lib/unicode/emoji/generated/regex_prop_modifier.rb +8 -0
  20. data/lib/unicode/emoji/generated/regex_prop_modifier_base.rb +8 -0
  21. data/lib/unicode/emoji/generated/regex_prop_presentation.rb +8 -0
  22. data/lib/unicode/emoji/generated/regex_text.rb +1 -1
  23. data/lib/unicode/emoji/generated/regex_valid.rb +1 -1
  24. data/lib/unicode/emoji/generated/regex_valid_include_text.rb +1 -1
  25. data/lib/unicode/emoji/generated/regex_well_formed.rb +1 -1
  26. data/lib/unicode/emoji/generated/regex_well_formed_include_text.rb +1 -1
  27. data/lib/unicode/emoji/generated_native/regex.rb +1 -1
  28. data/lib/unicode/emoji/generated_native/regex_basic.rb +1 -1
  29. data/lib/unicode/emoji/generated_native/regex_emoji_keycap.rb +8 -0
  30. data/lib/unicode/emoji/generated_native/regex_include_mqe.rb +8 -0
  31. data/lib/unicode/emoji/generated_native/regex_include_mqe_uqe.rb +8 -0
  32. data/lib/unicode/emoji/generated_native/regex_include_text.rb +1 -1
  33. data/lib/unicode/emoji/generated_native/regex_picto_no_emoji.rb +1 -1
  34. data/lib/unicode/emoji/generated_native/regex_prop_component.rb +8 -0
  35. data/lib/unicode/emoji/generated_native/{regex_any.rb → regex_prop_emoji.rb} +1 -1
  36. data/lib/unicode/emoji/generated_native/regex_prop_modifier.rb +8 -0
  37. data/lib/unicode/emoji/generated_native/regex_prop_modifier_base.rb +8 -0
  38. data/lib/unicode/emoji/generated_native/regex_prop_presentation.rb +8 -0
  39. data/lib/unicode/emoji/generated_native/regex_text.rb +1 -1
  40. data/lib/unicode/emoji/generated_native/regex_valid.rb +1 -1
  41. data/lib/unicode/emoji/generated_native/regex_valid_include_text.rb +1 -1
  42. data/lib/unicode/emoji/generated_native/regex_well_formed.rb +1 -1
  43. data/lib/unicode/emoji/generated_native/regex_well_formed_include_text.rb +1 -1
  44. data/lib/unicode/emoji/lazy_constants.rb +36 -0
  45. data/lib/unicode/emoji/list.rb +3 -0
  46. data/lib/unicode/emoji.rb +39 -6
  47. data/spec/data/.keep +0 -0
  48. data/spec/data/emoji-test.txt +5331 -0
  49. data/spec/emoji_test_txt_spec.rb +181 -0
  50. data/spec/unicode_emoji_spec.rb +127 -14
  51. metadata +24 -4
  52. data/lib/unicode/emoji/generated/regex_any.rb +0 -8
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1e5b069f3f3d2de97f31a0aa83568454b751330b8e86392d28baa7906a6d6302
4
- data.tar.gz: afaed5c343b5cdb5f235bf1e40fd78d221b0efa3097517f015db3c5a58e96775
3
+ metadata.gz: 5cc77126aeb986e5645ebce97fc3e1be5835b9e5adb9764af980e0175d7d9284
4
+ data.tar.gz: ffd47af7c556ee26951f6c5f30122ecd44520f1a3aa6856c526c4e4dec050e77
5
5
  SHA512:
6
- metadata.gz: e9c6726eee2f48cd0c51937bb0fcd6f0009b7fe0eb9e973bef713ac063d668c4d520f490a46bdf944892dcac2854b17ca72779593257f3edc8f0dfd445f3a729
7
- data.tar.gz: 7c1747073a4ec43ea4ea24ff304eb2cf3deb58918f2757cd4b4daee922b15d5478bd979585b60984366d21787b3d5183bcd5fc8cd3cdea6aa5ceedb9860e19d6
6
+ metadata.gz: 810730ae1a796edc6c80c87d0d4e7c0a5bfdbda45c43f57dfde3a05afee7ee5df8d43f378c5c0de34d9e9bca8e9edfaee67261d16057a0aacc79f50c39d703d2
7
+ data.tar.gz: f300c259b7389d60130b9f910b1301f31bf60d47c32fd16f86935cdf63c740bb44c28ab8fd38bc643f35cd6b4d7e9a76583d3be156683404439069ccf58d8250
data/.gitignore CHANGED
@@ -1,2 +1,3 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
+ /spec/data/emoji-test.txt
data/.rake_tasks CHANGED
@@ -1,3 +1,4 @@
1
+ dependencies...
1
2
  gem
2
3
  generate_constants
3
4
  irb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # CHANGELOG
2
2
 
3
+ ### 4.0.0
4
+
5
+ - **Breaking change:** Regexes now include single skin tone modifiers (`🏻`) and hair components (`🦰`).
6
+ They were previously considered to be invalid partial Emoji, however since they are supposed to be
7
+ displayed as Emoji in isolation, they are now part of the regexes (see *ED-20* in UTS51).
8
+ - **Breaking change:** Drop `REGEX_ANY` in favor of `REGEX_PROP_EMOJI`
9
+ - Expose regexes for Emoji props (`REGEX_PROP_*`). The advantage over using the native regex properties
10
+ directly is that you will be able to use the Emoji support level of this gem instead of Ruby's.
11
+ For example, as of releasing this, the current Emoji version is 16.0, while Ruby is at 15.0.
12
+ Also see README for a table listing the regexes that match Emoji properties.
13
+ - Add `REGEX_EMOJI_KEYCAP` for matching specifically Emoji keycaps
14
+ - Use character class instead of lookbehind for native text emoji and non-emoji pictographic regexes
15
+
16
+ ### 3.8.0
17
+
18
+ - Add new RGI-based regexes `REGEX_INCLUDE_MQE` and `REGEX_INCLUDE_MQE_UQE` which allow you to match
19
+ for minimally-qualified and unqualified RGI sequences (Emoji that lack some VS16)
20
+ - Add specs running through `emoji-test.txt` and classify qualification statuses per regex
21
+ - Improve documentation and add detailed table about which regex has which features
22
+ - Native regexes: Use native Emoji props for Emoji text presentation
23
+ - Update CLDR to v46 (valid subdivisions)
24
+ - Further improvements (see commit log)
25
+
3
26
  ### 3.7.0
4
27
 
5
28
  - Bump required Ruby slightly to 2.5
@@ -29,7 +52,7 @@
29
52
  ### 3.3.2
30
53
 
31
54
  - Update valid subdivisions to CLDR 43 (no changes)
32
- -> there won't be any new subdivision flags in Emoji
55
+ -> there won't be any new RGI subdivision flags in Emoji
33
56
 
34
57
  ### 3.3.1
35
58
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicode-emoji (3.5.0)
4
+ unicode-emoji (4.0.0)
5
5
  unicode-version (~> 1.0)
6
6
 
7
7
  GEM
@@ -20,7 +20,7 @@ GEM
20
20
  reline (0.3.8)
21
21
  io-console (~> 0.5)
22
22
  stringio (3.0.8)
23
- unicode-version (1.3.0)
23
+ unicode-version (1.4.0)
24
24
 
25
25
  PLATFORMS
26
26
  ruby
@@ -32,4 +32,4 @@ DEPENDENCIES
32
32
  unicode-emoji!
33
33
 
34
34
  BUNDLED WITH
35
- 2.2.22
35
+ 2.5.21
data/README.md CHANGED
@@ -1,15 +1,16 @@
1
1
  # Unicode::Emoji [![[version]](https://badge.fury.io/rb/unicode-emoji.svg)](https://badge.fury.io/rb/unicode-emoji) [![[ci]](https://github.com/janlelis/unicode-emoji/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-emoji/actions?query=workflow%3ATest)
2
2
 
3
- Provides regular expressions to find Emoji in strings, incorporating the latest Unicode and Emoji standards.
3
+ Provides various sophisticated regular expressions to work with Emoji in strings,
4
+ incorporating the latest Unicode / Emoji standards.
4
5
 
5
6
  Additional features:
6
7
 
7
- - A categorized list of recommended Emoji
8
+ - A categorized list of Emoji (RGI: Recommended for General Interchange)
8
9
  - Retrieve Emoji properties info about specific codepoints (Emoji_Modifier, Emoji_Presentation, etc.)
9
10
 
10
11
  Emoji version: **16.0** (September 2024)
11
12
 
12
- CLDR version (used for sub-region flags): **45** (April 2024)
13
+ CLDR version (used for sub-region flags): **46** (October 2024)
13
14
 
14
15
  ## Gemfile
15
16
 
@@ -17,101 +18,153 @@ CLDR version (used for sub-region flags): **45** (April 2024)
17
18
  gem "unicode-emoji"
18
19
  ```
19
20
 
20
- ## Usage
21
-
22
- ### Regex
21
+ ## Usage – Regex Matching
23
22
 
24
23
  The gem includes multiple Emoji regexes, which are compiled out of various Emoji Unicode data sources.
25
24
 
26
25
  ```ruby
27
26
  require "unicode/emoji"
28
27
 
29
- string = "String which contains all kinds of emoji:
28
+ string = "String which contains all types of Emoji sequences:
30
29
 
31
- - Singleton Emoji: 😴
32
- - Textual singleton Emoji with Emoji variation: ▶️
30
+ - Basic Emoji: 😴
31
+ - Textual Emoji with Emoji variation (VS16): ▶️
33
32
  - Emoji with skin tone modifier: 🛌🏽
34
33
  - Region flag: 🇵🇹
35
34
  - Sub-Region flag: 🏴󠁧󠁢󠁳󠁣󠁴󠁿
36
35
  - Keycap sequence: 2️⃣
36
+ - Skin tone modifier: 🏻
37
37
  - Sequence using ZWJ (zero width joiner): 🤾🏽‍♀️
38
-
39
38
  "
40
39
 
41
40
  string.scan(Unicode::Emoji::REGEX) # => ["😴", "▶️", "🛌🏽", "🇵🇹", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "2️⃣", "🤾🏽‍♀️"]
42
41
  ```
43
42
 
44
- #### Main Regexes
43
+ Depending on your exact use case, you can choose between multiple levels of Emoji detection:
45
44
 
46
- There are multiple levels of Emoji detection:
45
+ ### Main Regexes
47
46
 
48
47
  Regex | Description | Example Matches | Example Non-Matches
49
48
  ------------------------------|-------------|-----------------|--------------------
50
- `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *recommended* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
51
- `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢` | `😴︎`, `▶`, `🏻`, `🇵🇵`, `1`
52
- `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji) and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵` | `😴︎`, `▶`, `🏻`, `1`
53
- `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, singleton components, all kinds of Emoji sequences, and even single digits | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` |
54
-
55
- ##### Picking the Right Emoji Regex
56
-
57
- - Usually you just want `REGEX` (RGI set)
58
- - If you want broader matching (any ZJW sequences, more sub-region flags), choose `REGEX_VALID`
59
- - Even brolader is `REGEX_WELL_FORMED`, which will also match any region flag and any tag sequence
60
- - And then there is `REGEX_POSSIBLE` , which is a quick check for possible Emoji, which might contain false positives, [suggested in the Unicode Standard](https://www.unicode.org/reports/tr51/#EBNF_and_Regex)
61
-
62
- Property | Escaped | `REGEX` (RGI / Recommended) | `REGEX_VALID` (Valid) | `REGEX_WELL_FORMED` (Well-formed) | `REGEX_POSSIBLE`
63
- ---------|---------|-----------------------------|-----------------------|-----------------------------------|-----------------
64
- Region "🇵🇹" | `\u{1F1F5 1F1F9}` | Yes | Yes | Yes | Yes
65
- Region "🇵🇵" | `\u{1F1F5 1F1F5}` | No | No | Yes | Yes
66
- Tag Sequence "🏴󠁧󠁢󠁳󠁣󠁴󠁿" | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | Yes | Yes | Yes | Yes
67
- Tag Sequence "🏴󠁧󠁢󠁡󠁧󠁢󠁿" | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | No | Yes | Yes | Yes
68
- Tag Sequence "😴󠁧󠁢󠁡󠁡󠁡󠁿" | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | No | No | Yes | Yes
69
- ZWJ Sequence "🤾🏽‍♀️" | `\u{1F93E 1F3FD 200D 2640 FE0F}` | Yes | Yes | Yes | Yes
70
- ZWJ Sequence "🤠‍🤢" | `\u{1F920 200D 1F922}` | No | Yes | Yes | Yes
49
+ `Unicode::Emoji::REGEX` | **Use this one if unsure!** Matches (non-textual) Basic Emoji and all kinds of *recommended* Emoji sequences (RGI/FQE) | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🏻` | `🤾🏽‍♀`, `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
50
+ `Unicode::Emoji::REGEX_VALID` | Matches (non-textual) Basic Emoji and all kinds of *valid* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀` ,`🏌‍♂️`, `🤠‍🤢`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `1`, `1⃣`
51
+ `Unicode::Emoji::REGEX_WELL_FORMED` | Matches (non-textual) Basic Emoji and all kinds of *well-formed* Emoji sequences | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`,`🏌‍♂️` , `🤠‍🤢`, `🇵🇵`, `🏻` | `😴︎`, `▶`, `1`, `1⃣`
52
+ `Unicode::Emoji::REGEX_POSSIBLE` | Matches all singleton Emoji, all kinds of Emoji sequences, and even non-Emoji singleton components like digits. Only exception: Unqualified keycap sequences are not matched | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `🏻`, `1` | `1⃣`
71
53
 
72
- Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
54
+ #### Include Text Emoji
73
55
 
74
- More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
56
+ By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
75
57
 
76
- #### Singleton Regexes
58
+ Regex | Description | Example Matches | Example Non-Matches
59
+ ------------------------------|-------------|-----------------|--------------------
60
+ `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶`, `1⃣` , `🏻`| `🤾🏽‍♀`, `🏌‍♂️`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`
61
+ `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `😴︎`, `▶`, `1⃣` , `🏻` | `🇵🇵`, `1`
62
+ `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶`, `1⃣` , `🏻` | `1`
77
63
 
78
- Matches only simple one-codepoint (+ optional variation selector) Emoji:
64
+ #### Minimally-qualified and Unqualified Sequences
79
65
 
80
66
  Regex | Description | Example Matches | Example Non-Matches
81
67
  ------------------------------|-------------|-----------------|--------------------
82
- `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) singleton Emoji (except for singleton components, like a skin tone modifier without base Emoji), but no sequences at all | `😴`, `▶️` | `😴︎`, `▶`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `1`
83
- `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji (except for singleton components, like digits) | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `1`
68
+ `Unicode::Emoji::REGEX_INCLUDE_MQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors, where the first partial Emoji has all required Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏻` | `🏌‍♂️`, `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
69
+ `Unicode::Emoji::REGEX_INCLUDE_MQE_UQE` | Like `REGEX`, but additionally includes Emoji with missing Emoji Presentation Variation Selectors | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🏻` | `😴︎`, `▶`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`, `1`, `1⃣`
84
70
 
85
- #### Include Textual Emoji
71
+ [List of MQE and UQE Emoji sequences](https://character.construction/unqualified-emoji)
86
72
 
87
- By default, textual Emoji (emoji characters with text variation selector or those that have a default text presentation) will not be included in the default regexes (except in `REGEX_POSSIBLE`). However, if you wish to match for them too, you can include them in your regex by appending the `_INCLUDE_TEXT` suffix:
73
+ #### Singleton Regexes
74
+
75
+ Matches only simple one-codepoint (+ optional variation selector) Emoji:
88
76
 
89
77
  Regex | Description | Example Matches | Example Non-Matches
90
78
  ------------------------------|-------------|-----------------|--------------------
91
- `Unicode::Emoji::REGEX_INCLUDE_TEXT` | `REGEX` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🤾🏽‍♀️`, `😴︎`, `▶` | `🏻`, `🇵🇵`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤠‍🤢`
92
- `Unicode::Emoji::REGEX_VALID_INCLUDE_TEXT` | `REGEX_VALID` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `😴︎`, `▶` | `🏻`, `🇵🇵`
93
- `Unicode::Emoji::REGEX_WELL_FORMED_INCLUDE_TEXT` | `REGEX_WELL_FORMED` + `REGEX_TEXT` | `😴`, `▶️`, `🛌🏽`, `🇵🇹`, `2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤠‍🤢`, `🇵🇵`, `😴︎`, `▶` | `🏻`
79
+ `Unicode::Emoji::REGEX_BASIC` | Matches (non-textual) Basic Emoji, but no sequences at all | `😴`, `▶️`, `🏻` | `😴︎`, `▶`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
80
+ `Unicode::Emoji::REGEX_TEXT` | Matches only textual singleton Emoji | `😴︎`, `▶` | `😴`, `▶️`, `🏻`, `🛌🏽`, `🇵🇹`, `🇵🇵`,`2️⃣`, `🏴󠁧󠁢󠁳󠁣󠁴󠁿`, `🏴󠁧󠁢󠁡󠁧󠁢󠁿`, `🤾🏽‍♀️`, `🤾🏽‍♀`, `🏌‍♂️`, `🤠‍🤢`, `1`
81
+
82
+ Here is a list of all Emoji that can be matched using the two regexes: [character.construction/emoji-vs-text](https://character.construction/emoji-vs-text). The `REGEX_BASIC` regex also matches visual Emoji components (skin tone modifiers and hair components).
83
+
84
+ While `REGEX_BASIC` is part of the above regexes, `REGEX_TEXT` is only included in the `*_INCLUDE_TEXT` or `*_UQE` variants.
85
+
86
+ ### Comparison
87
+
88
+ 1) Fully-qualified RGI Emoji ZWJ sequence
89
+ 2) Minimally-qualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selectors, but not in the first Emoji character)
90
+ 3) Unqualified RGI Emoji ZWJ sequence (lacks Emoji Presentation Selector, including in the first Emoji character). Unqualified Emoji include all basic Emoji in Text Presentation (see column 11/12).
91
+ 4) Non-RGI Emoji ZWJ sequence
92
+ 5) Valid Region made from a pair of Regional Indicators
93
+ 6) Any Region made from a pair of Regional Indicators
94
+ 7) RGI Flag Emoji Tag Sequences (England, Scotland, Wales)
95
+ 8) Valid Flag Emoji Tag Sequences (any known subdivision)
96
+ 9) Any Emoji Tag Sequences (any tag sequence with any base)
97
+ 10) Basic Default Emoji Presentation Characters or Text characters with Emoji Presentation Selector
98
+ 11) Basic Default Text Presentation Characters or Basic Emoji with Text Presentation Selector
99
+ 12) Non-Emoji (unqualified) keycap
100
+
101
+ Regex | 1 RGI/FQE | 2 RGI/MQE | 3 RGI/UQE | 4 Non-RGI | 5 Valid Re­gion | 6 Any Re­gion | 7 RGI Tag | 8 Valid Tag | 9 Any Tag | 10 Basic Emoji | 11 Basic Text | 12 Text Key­cap
102
+ -|-|-|-|-|-|-|-|-|-|-|-|-
103
+ REGEX | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
104
+ REGEX INCLUDE TEXT | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
105
+ REGEX INCLUDE MQE | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌
106
+ REGEX INCLUDE MQE UQE | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅
107
+ REGEX VALID | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌
108
+ REGEX VALID INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅
109
+ REGEX WELL FORMED | ✅ | ✅ | (✅)¹ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌
110
+ REGEX WELL FORMED INCLUDE TEXT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅
111
+ REGEX POSSIBLE | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌
112
+ REGEX BASIC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌
113
+ REGEX TEXT | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅
114
+
115
+ ¹ Matches all unqualified Emoji, except for textual singleton Emoji (see columns 11, 12)
116
+
117
+ See [spec files](/spec) for detailed examples about which regex matches which kind of Emoji.
118
+
119
+ ### Picking the Right Emoji Regex
120
+
121
+ - Usually you just want `REGEX` (recommended Emoji set, RGI)
122
+ - Use `REGEX_INCLUDE_MQE` or `REGEX_INCLUDE_MQE_UQE` if you want to catch Emoji sequences with missing Variation Selectors.
123
+ - If you want broader matching (any ZWJ sequences, more sub-region flags), choose `REGEX_VALID`
124
+ - If you need to match any region flag and any tag sequence, choose `REGEX_WELL_FORMED`
125
+ - Use the `_INCLUDE_TEXT` suffix with any of the above base regexes, if you want to also match basic textual Emoji
126
+ - And finally, there is also the option to use `REGEX_POSSIBLE`, which is a simplified test for possible Emoji, comparable to `REGEX_WELL_FORMED*`. It might produce false positives; however, the regex is less complex and is [suggested in the Unicode standard itself](https://www.unicode.org/reports/tr51/#EBNF_and_Regex) as a first check.
127
+
128
+ ### Examples
129
+
130
+ Desc | Emoji | Escaped | `REGEX` (RGI/FQE) | `REGEX_INCLUDE_MQE` (RGI/MQE) | `REGEX_VALID` | `REGEX_WELL_FORMED` / `REGEX_POSSIBLE`
131
+ -----|-------|---------|---------------|-----------------------|-----------------------------------|-----------------
132
+ RGI ZWJ Sequence | 🤾🏽‍♀️ | `\u{1F93E 1F3FD 200D 2640 FE0F}` | ✅ | ✅ | ✅ | ✅
133
+ RGI ZWJ Sequence MQE | 🤾🏽‍♀ | `\u{1F93E 1F3FD 200D 2640}` | ❌ | ✅ | ✅ | ✅
134
+ Valid ZWJ Sequence, Non-RGI | 🤠‍🤢 | `\u{1F920 200D 1F922}` | ❌ | ❌ | ✅ | ✅
135
+ Known Region | 🇵🇹 | `\u{1F1F5 1F1F9}` | ✅ | ✅ | ✅ | ✅
136
+ Unknown Region | 🇵🇵 | `\u{1F1F5 1F1F5}` | ❌ | ❌ | ❌ | ✅
137
+ RGI Tag Sequence | 🏴󠁧󠁢󠁳󠁣󠁴󠁿 | `\u{1F3F4 E0067 E0062 E0073 E0063 E0074 E007F}` | ✅ | ✅ | ✅ | ✅
138
+ Valid Tag Sequence | 🏴󠁧󠁢󠁡󠁧󠁢󠁿 | `\u{1F3F4 E0067 E0062 E0061 E0067 E0062 E007F}` | ❌ | ❌ | ✅ | ✅
139
+ Well-formed Tag Sequence | 😴󠁧󠁢󠁡󠁡󠁡󠁿 | `\u{1F634 E0067 E0062 E0061 E0061 E0061 E007F}` | ❌ | ❌ | ❌ | ✅
94
140
 
95
- #### Extended Pictographic Regex
141
+ Please see [the standard](https://www.unicode.org/reports/tr51/#Emoji_Sets) for more details, examples, explanations.
96
142
 
97
- `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
143
+ More info about valid vs. recommended Emoji can also be found in this [blog article on Emojipedia](https://blog.emojipedia.org/unicode-behind-the-curtain/).
98
144
 
99
- `Unicode::Emoji::REGEX_PICTO_NO_EMOJI` matches single codepoints with the **Extended_Pictographic** property, but excludes Emoji characters.
145
+ ### Emoji Property Regexes
100
146
 
101
- See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
147
+ Ruby includes native regex Emoji properties, as listed in the following table. You can also opt-in to use the `*_PROP_*` regexes to get the Emoji support level of this gem (instead of Ruby's).
102
148
 
103
- #### Partial Regexes
149
+ Gem Regex (`Unicode::Emoji`'s Emoji support level) | Native Regex (Ruby's Emoji support level)
150
+ ---------------------------------------------------|------------------------------------------
151
+ `Unicode::Emoji::REGEX_PROP_EMOJI` | `/\p{Emoji}/`
152
+ `Unicode::Emoji::REGEX_PROP_MODIFIER` | `/\p{EMod}/`
153
+ `Unicode::Emoji::REGEX_PROP_MODIFIER_BASE` | `/\p{EBase}/`
154
+ `Unicode::Emoji::REGEX_PROP_COMPONENT` | `/\p{EComp}/`
155
+ `Unicode::Emoji::REGEX_PROP_PRESENTATION` | `/\p{EPres}/`
104
156
 
105
- Matches potential Emoji parts (often, this is not what you want):
157
+ #### Extended Pictographic Regex
106
158
 
107
- Regex | Description | Example Matches | Example Non-Matches
108
- ------------------------------|-------------|-----------------|--------------------
109
- `Unicode::Emoji::REGEX_ANY` | Matches any Emoji-related codepoint (but no variation selectors, tags, or zero-width joiners). Please not that this will match Emoji-parts rather than complete Emoji, for example, single digits! | `😴`, `▶`, `🏻`, `🛌`, `🏽`, `🇵`, `🇹`, `2`, `🏴`, `🤾`, `♀`, `🤠`, `🤢` | -
159
+ `Unicode::Emoji::REGEX_PICTO` matches single codepoints with the **Extended_Pictographic** property. For example, it will match `✀` BLACK SAFETY SCISSORS.
110
160
 
161
+ `Unicode::Emoji::REGEX_PICTO_NO_EMOJI` matches single codepoints with the **Extended_Pictographic** property, but excludes Emoji characters.
162
+
163
+ See [character.construction/picto](https://character.construction/picto) for a list of all non-Emoji pictographic characters.
111
164
 
112
- ### List
165
+ ## Usage – List
113
166
 
114
- Use `Unicode::Emoji::LIST` or the list method to get a grouped (and ordered) list of Emoji:
167
+ Use `Unicode::Emoji::LIST` or the **list** method to get an ordered and categorized list of Emoji:
115
168
 
116
169
  ```ruby
117
170
  Unicode::Emoji.list.keys
@@ -124,13 +177,13 @@ Unicode::Emoji.list("Food & Drink", "food-asian")
124
177
  => ["🍱", "🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🥮", "🍡", "🥟", "🥠", "🥡"]
125
178
  ```
126
179
 
127
- Please note that categories might change with future versions of the Emoji standard. This gem will issue warnings when attempting to retrieve old categories using the `#list` method.
180
+ Please note that categories might change with future versions of the Emoji standard, although this has not happened often.
128
181
 
129
182
  A list of all Emoji (generated from this gem) can be found at [character.construction/emoji](https://character.construction/emoji).
130
183
 
131
- ### Properties
184
+ ## Usage – Properties Data
132
185
 
133
- Allows you to access the codepoint data form Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
186
+ Allows you to access the codepoint data for a single character from Unicode's [emoji-data.txt](https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt) file:
134
187
 
135
188
  ```ruby
136
189
  require "unicode/emoji"
data/Rakefile CHANGED
@@ -28,14 +28,18 @@ task :irb do
28
28
  end
29
29
 
30
30
  # # #
31
- # Run Specs
31
+ # Run specs
32
32
 
33
33
  desc "#{gemspec.name} | Spec"
34
34
  task :spec do
35
- ruby "spec/unicode_emoji_spec.rb"
35
+ ruby File.join("spec", "*_spec.rb")
36
36
  end
37
37
  task default: :spec
38
38
 
39
+
40
+ # # #
41
+ # Generate regex
42
+
39
43
  desc "#{gemspec.name} | Generates all regex constants and saves them to lib/unicode/emoji/{generated,generated_native} directories"
40
44
  task :generate_constants do
41
45
  load "data/generate_constants.rb", true
Binary file
@@ -68,25 +68,39 @@ def pack_and_join(ords)
68
68
  end
69
69
  end
70
70
 
71
- def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
71
+ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
72
+ visual_component = pack_and_join(VISUAL_COMPONENT)
73
+
72
74
  emoji_presentation_sequence = \
73
75
  join(
74
- pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
76
+ text_presentation + pack(EMOJI_VARIATION_SELECTOR),
75
77
  emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
76
78
  )
77
79
 
78
80
  non_component_emoji_presentation_sequence = \
79
81
  "(?!" + emoji_component + ")" + emoji_presentation_sequence
80
82
 
83
+ basic_emoji = \
84
+ join(
85
+ non_component_emoji_presentation_sequence,
86
+ visual_component,
87
+ )
88
+
81
89
  text_keycap_sequence = \
82
- join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) })
90
+ pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
83
91
 
84
92
  text_presentation_sequence = \
85
93
  join(
86
- pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
94
+ text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
87
95
  emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
88
96
  )
89
97
 
98
+ text_emoji = \
99
+ join(
100
+ "(?!" + emoji_component + ")" + text_presentation_sequence,
101
+ text_keycap_sequence,
102
+ )
103
+
90
104
  emoji_modifier_sequence = \
91
105
  emoji_modifier_base + emoji_modifier
92
106
 
@@ -99,22 +113,11 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
99
113
  emoji_well_formed_flag_sequence = \
100
114
  '\p{RI}{2}'
101
115
 
102
- emoji_valid_core_sequence = \
103
- join(
104
- # emoji_character,
105
- emoji_keycap_sequence,
106
- emoji_modifier_sequence,
107
- non_component_emoji_presentation_sequence,
108
- emoji_valid_flag_sequence,
109
- )
110
-
111
- emoji_well_formed_core_sequence = \
116
+ emoji_core_sequence = \
112
117
  join(
113
- # emoji_character,
114
118
  emoji_keycap_sequence,
115
119
  emoji_modifier_sequence,
116
120
  non_component_emoji_presentation_sequence,
117
- emoji_well_formed_flag_sequence,
118
121
  )
119
122
 
120
123
  # Sort to make sure complex sequences match first
@@ -144,6 +147,18 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
144
147
  emoji_rgi_zwj_sequence = \
145
148
  pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
146
149
 
150
+ # FQE+MQE: Make VS16 optional after ZWJ has appeared
151
+ emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
152
+ /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
153
+ pack(EMOJI_VARIATION_SELECTOR) + "?"
154
+ )
155
+
156
+ # FQE+MQE+UQE: Make all VS16 optional
157
+ emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
158
+ pack(EMOJI_VARIATION_SELECTOR),
159
+ pack(EMOJI_VARIATION_SELECTOR) + "?",
160
+ )
161
+
147
162
  emoji_valid_zwj_element = \
148
163
  join(
149
164
  emoji_modifier_sequence,
@@ -160,21 +175,76 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
160
175
  join(
161
176
  emoji_rgi_zwj_sequence,
162
177
  emoji_rgi_tag_sequence,
163
- emoji_valid_core_sequence,
178
+ emoji_valid_flag_sequence,
179
+ emoji_core_sequence,
180
+ visual_component,
181
+ )
182
+
183
+ emoji_rgi_sequence_include_text = \
184
+ join(
185
+ emoji_rgi_zwj_sequence,
186
+ emoji_rgi_tag_sequence,
187
+ emoji_valid_flag_sequence,
188
+ emoji_core_sequence,
189
+ visual_component,
190
+ text_emoji,
191
+ )
192
+
193
+ emoji_rgi_include_mqe_sequence = \
194
+ join(
195
+ emoji_rgi_include_mqe_zwj_sequence,
196
+ emoji_rgi_tag_sequence,
197
+ emoji_valid_flag_sequence,
198
+ emoji_core_sequence,
199
+ visual_component,
200
+ )
201
+
202
+ emoji_rgi_include_mqe_uqe_sequence = \
203
+ join(
204
+ emoji_rgi_include_mqe_uqe_zwj_sequence,
205
+ text_emoji, # also uqe
206
+ emoji_rgi_tag_sequence,
207
+ emoji_valid_flag_sequence,
208
+ emoji_core_sequence,
209
+ visual_component,
164
210
  )
165
211
 
166
212
  emoji_valid_sequence = \
167
213
  join(
168
214
  emoji_valid_zwj_sequence,
169
215
  emoji_valid_tag_sequence,
170
- emoji_valid_core_sequence,
216
+ emoji_valid_flag_sequence,
217
+ emoji_core_sequence,
218
+ visual_component,
219
+ )
220
+
221
+ emoji_valid_sequence_include_text = \
222
+ join(
223
+ emoji_valid_zwj_sequence,
224
+ emoji_valid_tag_sequence,
225
+ emoji_valid_flag_sequence,
226
+ emoji_core_sequence,
227
+ visual_component,
228
+ text_emoji,
171
229
  )
172
230
 
173
231
  emoji_well_formed_sequence = \
174
232
  join(
175
233
  emoji_valid_zwj_sequence,
176
234
  emoji_well_formed_tag_sequence,
177
- emoji_well_formed_core_sequence,
235
+ emoji_well_formed_flag_sequence,
236
+ emoji_core_sequence,
237
+ visual_component,
238
+ )
239
+
240
+ emoji_well_formed_sequence_include_text = \
241
+ join(
242
+ emoji_valid_zwj_sequence,
243
+ emoji_well_formed_tag_sequence,
244
+ emoji_well_formed_flag_sequence,
245
+ emoji_core_sequence,
246
+ visual_component,
247
+ text_emoji,
178
248
  )
179
249
 
180
250
  emoji_possible_modification = \
@@ -198,45 +268,53 @@ def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_compo
198
268
  # Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
199
269
  regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
200
270
 
271
+ # rgi + singleton text
272
+ regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
273
+
274
+ # Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
275
+ # Also make VS16 optional if not at first emoji character
276
+ regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
277
+
278
+ # Matches basic singleton emoji and all kinds of sequences, but restrict zwj and tag sequences to known sequences (rgi)
279
+ # Also make VS16 optional even at first emoji character
280
+ regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
281
+
201
282
  # Matches basic singleton emoji and all kinds of valid sequences
202
283
  regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
203
284
 
285
+ # valid + singleton text
286
+ regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
287
+
204
288
  # Matches basic singleton emoji and all kinds of sequences
205
289
  regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
290
+
291
+ # well-formed + singleton text
292
+ regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
206
293
 
207
294
  # Quick test which might lead to false positives
208
295
  # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
209
296
  regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
210
297
 
211
- # Matches only basic single, non-textual emoji
212
- # Ignores "components" like modifiers or simple digits
213
- regexes[:REGEX_BASIC] = Regexp.compile(
214
- "(?!" + emoji_component + ")" + emoji_presentation_sequence
215
- )
298
+ # Matches only basic single, non-textual emoji, ignores some components like simple digits
299
+ regexes[:REGEX_BASIC] = Regexp.compile(basic_emoji)
216
300
 
217
- # Matches only basic single, textual emoji
218
- # Ignores "components" like modifiers or simple digits
219
- regexes[:REGEX_TEXT] = Regexp.compile(
220
- join(
221
- "(?!" + emoji_component + ")" + text_presentation_sequence,
222
- text_keycap_sequence,
223
- )
224
- )
225
-
226
- # Matches any emoji-related codepoint - Use with caution (returns partial matches)
227
- regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
301
+ # Matches only basic single, textual emoji, ignores components like modifiers or simple digits
302
+ regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
228
303
 
229
- # Combined REGEXes which also match for TEXTUAL emoji
230
- regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
231
-
232
- regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
233
-
234
- regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
304
+ # Export regexes for Emoji properties so they can be used with newer Unicode than Ruby's
305
+ regexes[:REGEX_PROP_EMOJI] = Regexp.compile(emoji_character)
306
+ regexes[:REGEX_PROP_MODIFIER] = Regexp.compile(emoji_modifier)
307
+ regexes[:REGEX_PROP_MODIFIER_BASE] = Regexp.compile(emoji_modifier_base)
308
+ regexes[:REGEX_PROP_COMPONENT] = Regexp.compile(emoji_component)
309
+ regexes[:REGEX_PROP_PRESENTATION] = Regexp.compile(emoji_presentation)
235
310
 
311
+ # Same goes for ExtendedPictographic
236
312
  regexes[:REGEX_PICTO] = Regexp.compile(picto)
237
-
238
313
  regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
239
314
 
315
+ # Emoji keycaps
316
+ regexes[:REGEX_EMOJI_KEYCAP] = Regexp.compile(emoji_keycap_sequence)
317
+
240
318
  regexes
241
319
  end
242
320
 
@@ -246,6 +324,7 @@ regexes = compile(
246
324
  emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
247
325
  emoji_component: pack_and_join(EMOJI_COMPONENT),
248
326
  emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
327
+ text_presentation: pack_and_join(TEXT_PRESENTATION),
249
328
  picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
250
329
  picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
251
330
  )
@@ -257,7 +336,8 @@ native_regexes = compile(
257
336
  emoji_modifier_base: "\\p{EBase}",
258
337
  emoji_component: "\\p{EComp}",
259
338
  emoji_presentation: "\\p{EPres}",
339
+ text_presentation: "[\\p{Emoji}&&\\P{EPres}]",
260
340
  picto: "\\p{ExtPict}",
261
- picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
341
+ picto_no_emoji: "[\\p{ExtPict}&&\\P{Emoji}]"
262
342
  )
263
343
  write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))