regexp_property_values 0.3.4-java → 1.2.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +2 -0
- data/CHANGELOG.md +30 -0
- data/Gemfile +6 -0
- data/README.md +11 -14
- data/Rakefile +2 -0
- data/ext/regexp_property_values/regexp_property_values.c +22 -12
- data/lib/aliases +241 -0
- data/lib/regexp_property_values/updater.rb +147 -0
- data/lib/regexp_property_values/value/ext_adapter.rb +19 -0
- data/lib/regexp_property_values/value/ruby_fallback.rb +21 -0
- data/lib/regexp_property_values/value/shared_methods.rb +63 -0
- data/lib/regexp_property_values/value.rb +14 -0
- data/lib/regexp_property_values/version.rb +1 -1
- data/lib/regexp_property_values.rb +17 -59
- data/lib/values +603 -0
- data/regexp_property_values.gemspec +1 -7
- metadata +18 -69
- data/.travis.yml +0 -11
- data/lib/UnicodeProps.txt +0 -802
- data/lib/regexp_property_values/extension.rb +0 -65
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: dacf307a51b5b959716893297f01121614b829741f3d3f9032a95a9a7c514551
|
4
|
+
data.tar.gz: cc90c8688ee4465e316c00e676af47cbd1c8083c9f5d00b575940a6c1a61bf31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 923198aae109bc79d5b0632e021b2dbdc83e83351394e7abfd5cf68d337113a55b2f7e81cf0f6d367fd133f4b7a75875ba65f8d8459d98a6694422c2f820eb77
|
7
|
+
data.tar.gz: aff9b6e0d4822175cef337dfcded3be3d8016edd64d03d038c4ecb56ff7eac1ff07ef74242a74db55c2ca96293c41b674479d481b97b988c92da429b5442fd10
|
@@ -0,0 +1,22 @@
|
|
1
|
+
name: tests
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
|
9
|
+
strategy:
|
10
|
+
matrix:
|
11
|
+
ruby: [ '2.3', '3.1', 'jruby-head' ] # TODO: 'ruby-head' after https://github.com/knu/sorted_set/issues/11
|
12
|
+
|
13
|
+
steps:
|
14
|
+
- uses: actions/checkout@v2
|
15
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
16
|
+
uses: ruby/setup-ruby@v1
|
17
|
+
with:
|
18
|
+
ruby-version: ${{ matrix.ruby }}
|
19
|
+
- name: Install dependencies
|
20
|
+
run: bundle install --jobs 4
|
21
|
+
- name: Test with Rake
|
22
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Changelog
|
2
|
+
All notable changes to this project will be documented in this file.
|
3
|
+
|
4
|
+
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
|
+
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
|
+
|
7
|
+
## [1.2.0] - 2021-12-31
|
8
|
+
|
9
|
+
### Added
|
10
|
+
- support for usage in Ractors
|
11
|
+
|
12
|
+
## [1.1.0] - 2021-12-05
|
13
|
+
|
14
|
+
### Added
|
15
|
+
- added new properties from Ruby `3.1.0` to output of `::all`, `::all_for_current_ruby`
|
16
|
+
- added options to run `::update` with custom ucd/emoji source paths
|
17
|
+
|
18
|
+
## [1.0.0] - 2019-06-16
|
19
|
+
|
20
|
+
### Changed
|
21
|
+
- removed `::by_category`, `::by_matched_codepoints`, `::short_and_long_names`
|
22
|
+
- return values are now always of a custom `Value` class, no longer extended `Strings`
|
23
|
+
- unknown properties now raise `RegexpPropertyValues::Error`, no longer an `ArgumentError`
|
24
|
+
|
25
|
+
### Added
|
26
|
+
- `Value#identifier`
|
27
|
+
- `Value#full_name`
|
28
|
+
|
29
|
+
### Fixed
|
30
|
+
- better codepoint determination speed for non-C Rubies (still slow)
|
data/Gemfile
CHANGED
@@ -4,3 +4,9 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
|
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in regexp_property_values.gemspec
|
6
6
|
gemspec
|
7
|
+
|
8
|
+
gem 'character_set', '~> 1.4.0'
|
9
|
+
gem 'rake', '~> 13.0'
|
10
|
+
gem 'rake-compiler', '~> 1.0'
|
11
|
+
gem 'range_compressor', '~> 1.0'
|
12
|
+
gem 'rspec', '~> 3.0'
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# RegexpPropertyValues
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/regexp_property_values.svg)](http://badge.fury.io/rb/regexp_property_values)
|
4
|
-
[![Build Status](https://
|
4
|
+
[![Build Status](https://github.com/jaynetics/regexp_property_values/workflows/tests/badge.svg)](https://github.com/jaynetics/regexp_property_values/actions)
|
5
5
|
|
6
6
|
This small library lets you see which property values are supported by the regular expression engine of the Ruby version you are running and directly reads out their codepoint ranges from there.
|
7
7
|
|
@@ -16,22 +16,16 @@ require 'regexp_property_values'
|
|
16
16
|
|
17
17
|
PV = RegexpPropertyValues
|
18
18
|
|
19
|
-
PV.all # => [
|
20
|
-
PV.by_category # => {"POSIX brackets" => ["Alpha", ...], "Special" => ...}
|
21
|
-
PV.short_and_long_names # => [["M", "Grek", ...], ["Mark", "Greek", ...]]
|
19
|
+
PV.all # => [<Value name='Alpha'>, <Value name='Blank'>, ...]
|
22
20
|
```
|
23
21
|
|
24
22
|
##### Browse property values supported by the Ruby you are running
|
25
23
|
|
26
24
|
```ruby
|
27
|
-
PV.all_for_current_ruby # => [
|
28
|
-
|
29
|
-
PV.by_category.map { |k, v| [k, v.select(&:supported_by_current_ruby?] }
|
30
|
-
|
31
|
-
# etc.
|
25
|
+
PV.all_for_current_ruby # => [<Value name='Alpha'>, <Value name='Blank'>, ...]
|
32
26
|
```
|
33
27
|
|
34
|
-
##### Inspect
|
28
|
+
##### Inspect property values
|
35
29
|
|
36
30
|
```ruby
|
37
31
|
PV['alpha'].supported_by_current_ruby? # => true
|
@@ -40,9 +34,11 @@ PV['foobar'].supported_by_current_ruby? # => false
|
|
40
34
|
PV['AHex'].matched_characters # => %w[0 1 2 3 4 5 6 7 8 9 A B C ...]
|
41
35
|
PV['AHex'].matched_codepoints # => [48, 49, 50, ...]
|
42
36
|
PV['AHex'].matched_ranges # => [48..57, 65..70, 97..102]
|
37
|
+
|
38
|
+
PV['foobar'].matched_ranges # => RegexpPropertyValues::Error
|
43
39
|
```
|
44
40
|
|
45
|
-
If [`character_set`](https://github.com/
|
41
|
+
If [`character_set`](https://github.com/jaynetics/character_set) is installed, you can also do this:
|
46
42
|
|
47
43
|
```ruby
|
48
44
|
PV['AHex'].character_set # => #<CharacterSet: {48, 49...} (size: 22)>
|
@@ -51,9 +47,10 @@ PV['AHex'].character_set # => #<CharacterSet: {48, 49...} (size: 22)>
|
|
51
47
|
##### Utility methods
|
52
48
|
|
53
49
|
```ruby
|
54
|
-
#
|
55
|
-
PV.alias_hash # => {
|
50
|
+
# get a Hash of aliases for property names
|
51
|
+
PV.alias_hash # => { <Value name='M'> => <Value name='Mark'>, ... }
|
56
52
|
|
57
|
-
# download
|
53
|
+
# download a list of possible properties for the running Ruby version
|
54
|
+
# (only used for .all and .alias_hash, not needed for prop lookup via .[])
|
58
55
|
PV.update
|
59
56
|
```
|
data/Rakefile
CHANGED
@@ -2,25 +2,29 @@
|
|
2
2
|
#include "ruby/encoding.h"
|
3
3
|
#include "ruby/oniguruma.h" // still in recent rubies f. backwards compatibility
|
4
4
|
|
5
|
-
static int prop_name_to_ctype(char*
|
5
|
+
static int prop_name_to_ctype(char *name, rb_encoding *enc)
|
6
|
+
{
|
6
7
|
UChar *uname;
|
7
8
|
int ctype;
|
8
9
|
|
9
|
-
uname = (UChar*)name;
|
10
|
+
uname = (UChar *)name;
|
10
11
|
ctype = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, uname, uname + strlen(name));
|
11
|
-
if (ctype < 0)
|
12
|
+
if (ctype < 0)
|
13
|
+
rb_raise(rb_eArgError, "Unknown property name `%s`", name);
|
12
14
|
|
13
15
|
return ctype;
|
14
16
|
}
|
15
17
|
|
16
|
-
VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges)
|
18
|
+
VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges)
|
19
|
+
{
|
17
20
|
unsigned int range_count, i;
|
18
21
|
VALUE result, sub_range;
|
19
22
|
|
20
23
|
range_count = onig_ranges[0];
|
21
24
|
result = rb_ary_new2(range_count); // rb_ary_new_capa not avail. in Ruby 2.0
|
22
25
|
|
23
|
-
for (i = 0; i < range_count; i++)
|
26
|
+
for (i = 0; i < range_count; i++)
|
27
|
+
{
|
24
28
|
sub_range = rb_range_new(INT2FIX(onig_ranges[(i * 2) + 1]),
|
25
29
|
INT2FIX(onig_ranges[(i * 2) + 2]),
|
26
30
|
0);
|
@@ -30,26 +34,32 @@ VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges) {
|
|
30
34
|
return result;
|
31
35
|
}
|
32
36
|
|
33
|
-
VALUE rb_prop_ranges(char*
|
37
|
+
VALUE rb_prop_ranges(char *name)
|
38
|
+
{
|
34
39
|
int ctype;
|
35
40
|
const OnigCodePoint *onig_ranges;
|
36
41
|
OnigCodePoint sb_out;
|
42
|
+
rb_encoding *enc;
|
43
|
+
enc = rb_utf8_encoding();
|
37
44
|
|
38
45
|
ctype = prop_name_to_ctype(name, enc);
|
39
46
|
ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &onig_ranges);
|
40
47
|
return onig_ranges_to_rb(onig_ranges);
|
41
48
|
}
|
42
49
|
|
43
|
-
VALUE method_matched_ranges(VALUE self, VALUE arg)
|
50
|
+
VALUE method_matched_ranges(VALUE self, VALUE arg)
|
51
|
+
{
|
44
52
|
char *prop_name;
|
45
|
-
rb_encoding *enc;
|
46
|
-
|
47
53
|
prop_name = StringValueCStr(arg);
|
48
|
-
|
49
|
-
return rb_prop_ranges(prop_name, enc);
|
54
|
+
return rb_prop_ranges(prop_name);
|
50
55
|
}
|
51
56
|
|
52
|
-
void Init_regexp_property_values()
|
57
|
+
void Init_regexp_property_values()
|
58
|
+
{
|
59
|
+
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
60
|
+
rb_ext_ractor_safe(true);
|
61
|
+
#endif
|
62
|
+
|
53
63
|
VALUE module;
|
54
64
|
module = rb_define_module("OnigRegexpPropertyHelper");
|
55
65
|
rb_define_singleton_method(module, "matched_ranges", method_matched_ranges, 1);
|
data/lib/aliases
ADDED
@@ -0,0 +1,241 @@
|
|
1
|
+
AHex;ASCII_Hex_Digit
|
2
|
+
Adlm;Adlam
|
3
|
+
Aghb;Caucasian_Albanian
|
4
|
+
Arab;Arabic
|
5
|
+
Armi;Imperial_Aramaic
|
6
|
+
Armn;Armenian
|
7
|
+
Avst;Avestan
|
8
|
+
Bali;Balinese
|
9
|
+
Bamu;Bamum
|
10
|
+
Bass;Bassa_Vah
|
11
|
+
Batk;Batak
|
12
|
+
Beng;Bengali
|
13
|
+
Bhks;Bhaiksuki
|
14
|
+
Bidi_C;Bidi_Control
|
15
|
+
Bopo;Bopomofo
|
16
|
+
Brah;Brahmi
|
17
|
+
Brai;Braille
|
18
|
+
Bugi;Buginese
|
19
|
+
Buhd;Buhid
|
20
|
+
C;Other
|
21
|
+
CI;Case_Ignorable
|
22
|
+
CWCF;Changes_When_Casefolded
|
23
|
+
CWCM;Changes_When_Casemapped
|
24
|
+
CWL;Changes_When_Lowercased
|
25
|
+
CWT;Changes_When_Titlecased
|
26
|
+
CWU;Changes_When_Uppercased
|
27
|
+
Cakm;Chakma
|
28
|
+
Cans;Canadian_Aboriginal
|
29
|
+
Cari;Carian
|
30
|
+
Cc;Control
|
31
|
+
Cf;Format
|
32
|
+
Cher;Cherokee
|
33
|
+
Chrs;Chorasmian
|
34
|
+
Cn;Unassigned
|
35
|
+
Co;Private_Use
|
36
|
+
Combining_Mark;Mark
|
37
|
+
Copt;Coptic
|
38
|
+
Cprt;Cypriot
|
39
|
+
Cs;Surrogate
|
40
|
+
Cyrl;Cyrillic
|
41
|
+
DI;Default_Ignorable_Code_Point
|
42
|
+
Dep;Deprecated
|
43
|
+
Deva;Devanagari
|
44
|
+
Dia;Diacritic
|
45
|
+
Diak;Dives_Akuru
|
46
|
+
Dogr;Dogra
|
47
|
+
Dsrt;Deseret
|
48
|
+
Dupl;Duployan
|
49
|
+
EBase;Emoji_Modifier_Base
|
50
|
+
EComp;Emoji_Component
|
51
|
+
EMod;Emoji_Modifier
|
52
|
+
EPres;Emoji_Presentation
|
53
|
+
Egyp;Egyptian_Hieroglyphs
|
54
|
+
Elba;Elbasan
|
55
|
+
Elym;Elymaic
|
56
|
+
Ethi;Ethiopic
|
57
|
+
Ext;Extender
|
58
|
+
Geor;Georgian
|
59
|
+
Glag;Glagolitic
|
60
|
+
Gong;Gunjala_Gondi
|
61
|
+
Gonm;Masaram_Gondi
|
62
|
+
Goth;Gothic
|
63
|
+
Gr_Base;Grapheme_Base
|
64
|
+
Gr_Ext;Grapheme_Extend
|
65
|
+
Gr_Link;Grapheme_Link
|
66
|
+
Gran;Grantha
|
67
|
+
Grek;Greek
|
68
|
+
Gujr;Gujarati
|
69
|
+
Guru;Gurmukhi
|
70
|
+
Hang;Hangul
|
71
|
+
Hani;Han
|
72
|
+
Hano;Hanunoo
|
73
|
+
Hatr;Hatran
|
74
|
+
Hebr;Hebrew
|
75
|
+
Hex;Hex_Digit
|
76
|
+
Hira;Hiragana
|
77
|
+
Hluw;Anatolian_Hieroglyphs
|
78
|
+
Hmng;Pahawh_Hmong
|
79
|
+
Hmnp;Nyiakeng_Puachue_Hmong
|
80
|
+
Hung;Old_Hungarian
|
81
|
+
IDC;ID_Continue
|
82
|
+
IDS;ID_Start
|
83
|
+
IDSB;IDS_Binary_Operator
|
84
|
+
IDST;IDS_Trinary_Operator
|
85
|
+
Ideo;Ideographic
|
86
|
+
Ital;Old_Italic
|
87
|
+
Java;Javanese
|
88
|
+
Join_C;Join_Control
|
89
|
+
Kali;Kayah_Li
|
90
|
+
Kana;Katakana
|
91
|
+
Khar;Kharoshthi
|
92
|
+
Khmr;Khmer
|
93
|
+
Khoj;Khojki
|
94
|
+
Kits;Khitan_Small_Script
|
95
|
+
Knda;Kannada
|
96
|
+
Kthi;Kaithi
|
97
|
+
L;Letter
|
98
|
+
LC;Cased_Letter
|
99
|
+
LOE;Logical_Order_Exception
|
100
|
+
Lana;Tai_Tham
|
101
|
+
Laoo;Lao
|
102
|
+
Latn;Latin
|
103
|
+
Lepc;Lepcha
|
104
|
+
Limb;Limbu
|
105
|
+
Lina;Linear_A
|
106
|
+
Linb;Linear_B
|
107
|
+
Ll;Lowercase_Letter
|
108
|
+
Lm;Modifier_Letter
|
109
|
+
Lo;Other_Letter
|
110
|
+
Lt;Titlecase_Letter
|
111
|
+
Lu;Uppercase_Letter
|
112
|
+
Lyci;Lycian
|
113
|
+
Lydi;Lydian
|
114
|
+
M;Mark
|
115
|
+
Mahj;Mahajani
|
116
|
+
Maka;Makasar
|
117
|
+
Mand;Mandaic
|
118
|
+
Mani;Manichaean
|
119
|
+
Marc;Marchen
|
120
|
+
Mc;Spacing_Mark
|
121
|
+
Me;Enclosing_Mark
|
122
|
+
Medf;Medefaidrin
|
123
|
+
Mend;Mende_Kikakui
|
124
|
+
Merc;Meroitic_Cursive
|
125
|
+
Mero;Meroitic_Hieroglyphs
|
126
|
+
Mlym;Malayalam
|
127
|
+
Mn;Nonspacing_Mark
|
128
|
+
Mong;Mongolian
|
129
|
+
Mroo;Mro
|
130
|
+
Mtei;Meetei_Mayek
|
131
|
+
Mult;Multani
|
132
|
+
Mymr;Myanmar
|
133
|
+
N;Number
|
134
|
+
NChar;Noncharacter_Code_Point
|
135
|
+
Nand;Nandinagari
|
136
|
+
Narb;Old_North_Arabian
|
137
|
+
Nbat;Nabataean
|
138
|
+
Nd;Decimal_Number
|
139
|
+
Nkoo;Nko
|
140
|
+
Nl;Letter_Number
|
141
|
+
No;Other_Number
|
142
|
+
Nshu;Nushu
|
143
|
+
OAlpha;Other_Alphabetic
|
144
|
+
ODI;Other_Default_Ignorable_Code_Point
|
145
|
+
OGr_Ext;Other_Grapheme_Extend
|
146
|
+
OIDC;Other_ID_Continue
|
147
|
+
OIDS;Other_ID_Start
|
148
|
+
OLower;Other_Lowercase
|
149
|
+
OMath;Other_Math
|
150
|
+
OUpper;Other_Uppercase
|
151
|
+
Ogam;Ogham
|
152
|
+
Olck;Ol_Chiki
|
153
|
+
Orkh;Old_Turkic
|
154
|
+
Orya;Oriya
|
155
|
+
Osge;Osage
|
156
|
+
Osma;Osmanya
|
157
|
+
P;Punctuation
|
158
|
+
PCM;Prepended_Concatenation_Mark
|
159
|
+
Palm;Palmyrene
|
160
|
+
Pat_Syn;Pattern_Syntax
|
161
|
+
Pat_WS;Pattern_White_Space
|
162
|
+
Pauc;Pau_Cin_Hau
|
163
|
+
Pc;Connector_Punctuation
|
164
|
+
Pd;Dash_Punctuation
|
165
|
+
Pe;Close_Punctuation
|
166
|
+
Perm;Old_Permic
|
167
|
+
Pf;Final_Punctuation
|
168
|
+
Phag;Phags_Pa
|
169
|
+
Phli;Inscriptional_Pahlavi
|
170
|
+
Phlp;Psalter_Pahlavi
|
171
|
+
Phnx;Phoenician
|
172
|
+
Pi;Initial_Punctuation
|
173
|
+
Plrd;Miao
|
174
|
+
Po;Other_Punctuation
|
175
|
+
Prti;Inscriptional_Parthian
|
176
|
+
Ps;Open_Punctuation
|
177
|
+
QMark;Quotation_Mark
|
178
|
+
Qaac;Coptic
|
179
|
+
Qaai;Inherited
|
180
|
+
RI;Regional_Indicator
|
181
|
+
Rjng;Rejang
|
182
|
+
Rohg;Hanifi_Rohingya
|
183
|
+
Runr;Runic
|
184
|
+
S;Symbol
|
185
|
+
SD;Soft_Dotted
|
186
|
+
STerm;Sentence_Terminal
|
187
|
+
Samr;Samaritan
|
188
|
+
Sarb;Old_South_Arabian
|
189
|
+
Saur;Saurashtra
|
190
|
+
Sc;Currency_Symbol
|
191
|
+
Sgnw;SignWriting
|
192
|
+
Shaw;Shavian
|
193
|
+
Shrd;Sharada
|
194
|
+
Sidd;Siddham
|
195
|
+
Sind;Khudawadi
|
196
|
+
Sinh;Sinhala
|
197
|
+
Sk;Modifier_Symbol
|
198
|
+
Sm;Math_Symbol
|
199
|
+
So;Other_Symbol
|
200
|
+
Sogd;Sogdian
|
201
|
+
Sogo;Old_Sogdian
|
202
|
+
Sora;Sora_Sompeng
|
203
|
+
Soyo;Soyombo
|
204
|
+
Sund;Sundanese
|
205
|
+
Sylo;Syloti_Nagri
|
206
|
+
Syrc;Syriac
|
207
|
+
Tagb;Tagbanwa
|
208
|
+
Takr;Takri
|
209
|
+
Tale;Tai_Le
|
210
|
+
Talu;New_Tai_Lue
|
211
|
+
Taml;Tamil
|
212
|
+
Tang;Tangut
|
213
|
+
Tavt;Tai_Viet
|
214
|
+
Telu;Telugu
|
215
|
+
Term;Terminal_Punctuation
|
216
|
+
Tfng;Tifinagh
|
217
|
+
Tglg;Tagalog
|
218
|
+
Thaa;Thaana
|
219
|
+
Tibt;Tibetan
|
220
|
+
Tirh;Tirhuta
|
221
|
+
UIdeo;Unified_Ideograph
|
222
|
+
Ugar;Ugaritic
|
223
|
+
VS;Variation_Selector
|
224
|
+
Vaii;Vai
|
225
|
+
WSpace;White_Space
|
226
|
+
Wara;Warang_Citi
|
227
|
+
Wcho;Wancho
|
228
|
+
XIDC;XID_Continue
|
229
|
+
XIDS;XID_Start
|
230
|
+
Xpeo;Old_Persian
|
231
|
+
Xsux;Cuneiform
|
232
|
+
Yezi;Yezidi
|
233
|
+
Yiii;Yi
|
234
|
+
Z;Separator
|
235
|
+
Zanb;Zanabazar_Square
|
236
|
+
Zinh;Inherited
|
237
|
+
Zl;Line_Separator
|
238
|
+
Zp;Paragraph_Separator
|
239
|
+
Zs;Space_Separator
|
240
|
+
Zyyy;Common
|
241
|
+
Zzzz;Unknown
|
@@ -0,0 +1,147 @@
|
|
1
|
+
module RegexpPropertyValues
  # Rebuilds the bundled lists of property values and aliases.
  #
  # Unicode data files are downloaded with `wget` into a temporary directory,
  # scanned for property names, and the results are written to
  # `RegexpPropertyValues::VALUES_PATH` and `RegexpPropertyValues::ALIASES_PATH`.
  module Updater
    module_function

    require 'fileutils'
    require 'rbconfig'
    require 'set'

    BASE_URL = 'https://www.unicode.org/Public'.freeze

    UCD_FILES = %w[
      Blocks.txt
      DerivedAge.txt
      DerivedCoreProperties.txt
      PropertyAliases.txt
      PropertyValueAliases.txt
      PropList.txt
      Scripts.txt
    ].freeze

    EMOJI_FILES = %w[
      emoji-data.txt
    ].freeze

    TMP_DIR = File.join(__dir__, 'tmp_ucd').freeze

    # Runs a complete update.
    #
    # @param ucd_path [String, nil] base location of the UCD files
    # @param emoji_path [String, nil] base location of the emoji files
    # @return [nil]
    #
    # Stops without writing anything when the download was declined. The
    # temporary directory is removed in every case, including on errors.
    def call(ucd_path: nil, emoji_path: nil)
      prepare_tmp_dir
      return unless download_ucd_files(ucd_path: ucd_path, emoji_path: emoji_path)

      write_values
      write_aliases
      print_stats
    ensure
      remove_tmp_dir
    end

    # Creates an empty TMP_DIR, discarding leftovers of an earlier run.
    def prepare_tmp_dir
      FileUtils.rm_rf(TMP_DIR)
      FileUtils.mkdir(TMP_DIR)
    end

    # Downloads all UCD_FILES and EMOJI_FILES into TMP_DIR.
    #
    # Paths default to the `RPV_UCD_PATH` / `RPV_EMOJI_PATH` ENV vars. If
    # neither a keyword arg nor an ENV var is given, the user is asked to
    # confirm before paths are derived from the running Ruby's Unicode versions.
    #
    # @return [Boolean] false if the user declined, true after downloading
    # @raise [RuntimeError] if a file could not be fetched
    def download_ucd_files(ucd_path: nil, emoji_path: nil)
      unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION')
      emoji_version   = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION')

      ucd_path   ||= ENV['RPV_UCD_PATH']
      emoji_path ||= ENV['RPV_EMOJI_PATH']

      if ucd_path.nil? && emoji_path.nil?
        puts <<~EOS.gsub(/\s*\n\s*/, ' ').strip
          This will try to load ucd and emoji data for the CURRENT RUBY
          (#{RUBY_VERSION} - ucd #{unicode_version}, emoji #{emoji_version}).
          Run this on the latest Ruby version you want to support.
          Unicode directory structure changes sometimes, so you might need to
          pass the right path(s) as keyword args or ENV vars. Continue? [y/n]
        EOS

        unless $stdin.gets =~ /^y/i
          puts 'download skipped.'
          return false
        end
      end

      ucd_path   ||= "#{BASE_URL}/#{unicode_version}/ucd"
      emoji_path ||= "#{BASE_URL}/emoji/#{emoji_version}"

      Dir.chdir(TMP_DIR) do
        UCD_FILES.each   { |f| fetch_file("#{ucd_path}/#{f}") }
        EMOJI_FILES.each { |f| fetch_file("#{emoji_path}/#{f}") }
      end

      true
    end

    # Fetches one URL into the current directory.
    #
    # wget is invoked without a shell, so the URL is passed as a single
    # argument and never interpreted as shell syntax.
    #
    # @raise [RuntimeError] if wget is missing or exits with a failure status
    def fetch_file(url)
      return if system('wget', url)

      raise "could not download #{url} - check the path and that wget is installed"
    end

    # Collects all known property values into @values and writes them,
    # sorted and newline-separated, to RegexpPropertyValues::VALUES_PATH.
    def write_values
      @values = Set.new

      # posix properties
      @values.merge(%w[
        Alpha Blank Cntrl Digit Graph Lower Print
        Punct Space Upper XDigit Word Alnum ASCII
        XPosixPunct
      ])

      # special properties
      @values.merge(%w[Any Assigned In_No_Block Unknown])

      # legacy properties
      @values.merge(%w[Newline])

      regexp = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
      %w[
        DerivedCoreProperties.txt
        PropList.txt
        Scripts.txt
        emoji-data.txt
      ].each { |file| scan(file, regexp) { |caps| @values << caps[:prop_name] } }

      scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |caps|
        @values << caps[:prop_name]
      end

      # block names become `In_` values with non-word characters underscored
      scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |caps|
        @values << 'In_' + caps[:block_name].gsub(/\W/, '_')
      end

      scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |caps|
        @values << 'Age=' + caps[:age_num]
      end

      File.write(RegexpPropertyValues::VALUES_PATH, @values.sort.join("\n"))
    end

    # Collects [alias, name] pairs into @aliases and writes them, sorted and
    # as `alias;name` lines, to RegexpPropertyValues::ALIASES_PATH.
    # Relies on @values, so write_values must have run before.
    def write_aliases
      @aliases = Set.new

      scan('PropertyAliases.txt', /^(?<alias>\w+) *; (?<name>\w+)/) do |caps|
        add_alias(caps[:alias], caps[:name])
      end

      scan('PropertyValueAliases.txt',
           /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/) do |caps|
        add_alias(caps[:alias1], caps[:name])
        add_alias(caps[:alias2], caps[:name])
      end

      File.write(RegexpPropertyValues::ALIASES_PATH,
                 @aliases.sort.map { |pair| pair.join(';') }.join("\n"))
    end

    # Records alias_name => name if name is a known value and alias_name
    # is present and not itself a known value.
    def add_alias(alias_name, name)
      return unless alias_name && in_values?(name) && !in_values?(alias_name)

      @aliases << [alias_name, name]
    end

    # @return [Boolean] whether string equals a collected value, ignoring case
    def in_values?(string)
      # String#casecmp (unlike #casecmp?) is also available on Ruby 2.3
      @values.any? { |value| value.casecmp(string) == 0 }
    end

    # Yields the MatchData of every match of pattern in the given TMP_DIR file.
    def scan(file, pattern)
      path = File.join(TMP_DIR, file)
      File.read(path).scan(pattern) { yield(Regexp.last_match) }
    end

    def remove_tmp_dir
      FileUtils.rm_rf(TMP_DIR)
    end

    def print_stats
      print "\nFetched #{@values.size} values and #{@aliases.size} aliases.\n\n"
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module RegexpPropertyValues
  class Value
    # Lookup methods backed by `OnigRegexpPropertyHelper`, which supplies
    # the codepoint ranges for a property name.
    module ExtAdapter
      # Surrogates are not encodable in UTF-8; Integer#chr raises for them.
      SURROGATE_CODEPOINTS = (0xD800..0xDFFF).freeze

      # @return [Array<String>] one UTF-8 String per matched codepoint
      #
      # Surrogate codepoints are skipped because they have no UTF-8 character.
      # NOTE(review): presumably the helper can report ranges covering the
      # surrogate block (e.g. for `Any`) - confirm against the extension.
      def matched_characters
        matched_codepoints
          .reject { |cp| SURROGATE_CODEPOINTS.cover?(cp) }
          .map { |cp| cp.chr('utf-8') }
      end

      # @return [Array<Integer>] all codepoints contained in #matched_ranges
      def matched_codepoints
        matched_ranges.flat_map(&:to_a)
      end

      # @return [Array<Range>] the ranges reported by OnigRegexpPropertyHelper
      #
      # An ArgumentError from the helper is translated via
      # #raise_unsupported_or_unknown_error.
      def matched_ranges
        OnigRegexpPropertyHelper.matched_ranges(name)
      rescue ArgumentError
        raise_unsupported_or_unknown_error
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module RegexpPropertyValues
  class Value
    # Pure-Ruby lookup methods: matches are found by scanning a string of
    # all UTF-8-encodable codepoints with the value's #regexp.
    module RubyFallback
      # All codepoints except the surrogate block (0xD800..0xDFFF).
      SCANNED_CODEPOINT_RANGES = [0..0xD7FF, 0xE000..0x10FFFF].freeze

      # @return [String] frozen UTF-8 String holding every scanned codepoint
      #   once, in ascending order; built on first use and then reused
      def self.test_str
        @test_str ||= SCANNED_CODEPOINT_RANGES.flat_map(&:to_a).pack('U*').freeze
      end

      # @return [Array<String>] one UTF-8 String per matched codepoint
      def matched_characters
        matched_codepoints.map { |cp| cp.chr('utf-8') }
      end

      # @return [Array<Integer>] codepoints of all matches of #regexp
      def matched_codepoints
        # turns out scanning one big string is the least slow way to do this
        RubyFallback.test_str.scan(regexp).flat_map(&:codepoints)
      end

      # @return the result of RangeCompressor.compress for #matched_codepoints
      def matched_ranges
        # loaded lazily, only needed when ranges are requested
        require 'range_compressor'
        RangeCompressor.compress(matched_codepoints)
      end
    end
  end
end
|