regexp_property_values 0.3.4-java → 1.2.0-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e8f0605316c390b868ad25d0b4fdcac012f494d4
4
- data.tar.gz: a03e81b7ade12815bf943390883a171b762025b5
2
+ SHA256:
3
+ metadata.gz: dacf307a51b5b959716893297f01121614b829741f3d3f9032a95a9a7c514551
4
+ data.tar.gz: cc90c8688ee4465e316c00e676af47cbd1c8083c9f5d00b575940a6c1a61bf31
5
5
  SHA512:
6
- metadata.gz: cc3879061028fd6ae38e0bd35b63faf49771eafadfb33a5960a59cfb72a7faaa8212746cbbb5f0fd9195d296588c96fcd92fe84d4bc6e9b58ddae60225f2dd63
7
- data.tar.gz: e8a7bbe4b0cf645a3e5a5064ab69d9c3b439cbc610c1756447d185db29b28c98c4bfdd097cda58a2ed47fb8f2d8c50dde6c36a3b92186a968c1084d320608a2a
6
+ metadata.gz: 923198aae109bc79d5b0632e021b2dbdc83e83351394e7abfd5cf68d337113a55b2f7e81cf0f6d367fd133f4b7a75875ba65f8d8459d98a6694422c2f820eb77
7
+ data.tar.gz: aff9b6e0d4822175cef337dfcded3be3d8016edd64d03d038c4ecb56ff7eac1ff07ef74242a74db55c2ca96293c41b674479d481b97b988c92da429b5442fd10
@@ -0,0 +1,22 @@
1
+ name: tests
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+
9
+ strategy:
10
+ matrix:
11
+ ruby: [ '2.3', '3.1', 'jruby-head' ] # TODO: 'ruby-head' after https://github.com/knu/sorted_set/issues/11
12
+
13
+ steps:
14
+ - uses: actions/checkout@v2
15
+ - name: Set up Ruby ${{ matrix.ruby }}
16
+ uses: ruby/setup-ruby@v1
17
+ with:
18
+ ruby-version: ${{ matrix.ruby }}
19
+ - name: Install dependencies
20
+ run: bundle install --jobs 4
21
+ - name: Test with Rake
22
+ run: bundle exec rake
data/.gitignore CHANGED
@@ -15,11 +15,13 @@
15
15
  .ruby-version
16
16
  .tags
17
17
  .tags1
18
+ .tool-versions
18
19
  bbin/
19
20
  binstubs/*
20
21
  bundler_stubs/*/.yardoc
21
22
  Gemfile.lock
22
23
  /.bundle/
24
+ /.vscode/
23
25
  /_yardoc/
24
26
  /coverage/
25
27
  /doc/
data/CHANGELOG.md ADDED
@@ -0,0 +1,30 @@
1
+ # Changelog
2
+ All notable changes to this project will be documented in this file.
3
+
4
+ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
+ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [1.2.0] - 2021-12-31
8
+
9
+ ### Added
10
+ - support for usage in Ractors
11
+
12
+ ## [1.1.0] - 2021-12-05
13
+
14
+ ### Added
15
+ - new properties from Ruby `3.1.0` in the output of `::all` and `::all_for_current_ruby`
16
+ - options to run `::update` with custom UCD/emoji source paths
17
+
18
+ ## [1.0.0] - 2019-06-16
19
+
20
+ ### Changed
21
+ - removed `::by_category`, `::by_matched_codepoints`, `::short_and_long_names`
22
+ - return values are now always instances of a custom `Value` class, no longer extended `String`s
23
+ - unknown properties now raise `RegexpPropertyValues::Error`, no longer an `ArgumentError`
24
+
25
+ ### Added
26
+ - `Value#identifier`
27
+ - `Value#full_name`
28
+
29
+ ### Fixed
30
+ - better codepoint determination speed for non-C Rubies (still slow)
data/Gemfile CHANGED
@@ -4,3 +4,9 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
5
  # Specify your gem's dependencies in regexp_property_values.gemspec
6
6
  gemspec
7
+
8
+ gem 'character_set', '~> 1.4.0'
9
+ gem 'rake', '~> 13.0'
10
+ gem 'rake-compiler', '~> 1.0'
11
+ gem 'range_compressor', '~> 1.0'
12
+ gem 'rspec', '~> 3.0'
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # RegexpPropertyValues
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/regexp_property_values.svg)](http://badge.fury.io/rb/regexp_property_values)
4
- [![Build Status](https://travis-ci.org/janosch-x/regexp_property_values.svg?branch=master)](https://travis-ci.org/janosch-x/regexp_property_values)
4
+ [![Build Status](https://github.com/jaynetics/regexp_property_values/workflows/tests/badge.svg)](https://github.com/jaynetics/regexp_property_values/actions)
5
5
 
6
6
  This small library lets you see which property values are supported by the regular expression engine of the Ruby version you are running, and read out their codepoint ranges directly from that engine.
7
7
 
@@ -16,22 +16,16 @@ require 'regexp_property_values'
16
16
 
17
17
  PV = RegexpPropertyValues
18
18
 
19
- PV.all # => ["Alpha", "Blank", "Cntrl", ...]
20
- PV.by_category # => {"POSIX brackets" => ["Alpha", ...], "Special" => ...}
21
- PV.short_and_long_names # => [["M", "Grek", ...], ["Mark", "Greek", ...]]
19
+ PV.all # => [<Value name='Alpha'>, <Value name='Blank'>, ...]
22
20
  ```
23
21
 
24
22
  ##### Browse property values supported by the Ruby you are running
25
23
 
26
24
  ```ruby
27
- PV.all_for_current_ruby # => ["Alpha", "Blank", "Cntrl", ...]
28
-
29
- PV.by_category.map { |k, v| [k, v.select(&:supported_by_current_ruby?] }
30
-
31
- # etc.
25
+ PV.all_for_current_ruby # => [<Value name='Alpha'>, <Value name='Blank'>, ...]
32
26
  ```
33
27
 
34
- ##### Inspect properties
28
+ ##### Inspect property values
35
29
 
36
30
  ```ruby
37
31
  PV['alpha'].supported_by_current_ruby? # => true
@@ -40,9 +34,11 @@ PV['foobar'].supported_by_current_ruby? # => false
40
34
  PV['AHex'].matched_characters # => %w[0 1 2 3 4 5 6 7 8 9 A B C ...]
41
35
  PV['AHex'].matched_codepoints # => [48, 49, 50, ...]
42
36
  PV['AHex'].matched_ranges # => [48..57, 65..70, 97..102]
37
+
38
+ PV['foobar'].matched_ranges # => RegexpPropertyValues::Error
43
39
  ```
44
40
 
45
- If [`character_set`](https://github.com/janosch-x/character_set) is installed, you can also do this:
41
+ If [`character_set`](https://github.com/jaynetics/character_set) is installed, you can also do this:
46
42
 
47
43
  ```ruby
48
44
  PV['AHex'].character_set # => #<CharacterSet: {48, 49...} (size: 22)>
@@ -51,9 +47,10 @@ PV['AHex'].character_set # => #<CharacterSet: {48, 49...} (size: 22)>
51
47
  ##### Utility methods
52
48
 
53
49
  ```ruby
54
- # This one takes a few seconds (or minutes, without the C extension)
55
- PV.alias_hash # => {"M" => "Mark", "Grek" => "Greek", ...}
50
+ # get a Hash of aliases for property names
51
+ PV.alias_hash # => { <Value name='M'> => <Value name='Mark'>, ... }
56
52
 
57
- # download the latest list of possible properties
53
+ # download a list of possible properties for the running Ruby version
54
+ # (only used for .all and .alias_hash, not needed for prop lookup via .[])
58
55
  PV.update
59
56
  ```
data/Rakefile CHANGED
@@ -17,6 +17,8 @@ namespace :java do
17
17
  java_gemspec.platform = 'java'
18
18
  java_gemspec.extensions = []
19
19
 
20
+ java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0'
21
+
20
22
  Gem::PackageTask.new(java_gemspec) do |pkg|
21
23
  pkg.need_zip = true
22
24
  pkg.need_tar = true
@@ -2,25 +2,29 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "ruby/oniguruma.h" // still in recent rubies f. backwards compatibility
4
4
 
5
- static int prop_name_to_ctype(char* name, rb_encoding *enc) {
5
+ static int prop_name_to_ctype(char *name, rb_encoding *enc)
6
+ {
6
7
  UChar *uname;
7
8
  int ctype;
8
9
 
9
- uname = (UChar*)name;
10
+ uname = (UChar *)name;
10
11
  ctype = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, uname, uname + strlen(name));
11
- if (ctype < 0) rb_raise(rb_eArgError, "Unknown property name `%s`", name);
12
+ if (ctype < 0)
13
+ rb_raise(rb_eArgError, "Unknown property name `%s`", name);
12
14
 
13
15
  return ctype;
14
16
  }
15
17
 
16
- VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges) {
18
+ VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges)
19
+ {
17
20
  unsigned int range_count, i;
18
21
  VALUE result, sub_range;
19
22
 
20
23
  range_count = onig_ranges[0];
21
24
  result = rb_ary_new2(range_count); // rb_ary_new_capa not avail. in Ruby 2.0
22
25
 
23
- for (i = 0; i < range_count; i++) {
26
+ for (i = 0; i < range_count; i++)
27
+ {
24
28
  sub_range = rb_range_new(INT2FIX(onig_ranges[(i * 2) + 1]),
25
29
  INT2FIX(onig_ranges[(i * 2) + 2]),
26
30
  0);
@@ -30,26 +34,32 @@ VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges) {
30
34
  return result;
31
35
  }
32
36
 
33
- VALUE rb_prop_ranges(char* name, rb_encoding *enc) {
37
+ VALUE rb_prop_ranges(char *name)
38
+ {
34
39
  int ctype;
35
40
  const OnigCodePoint *onig_ranges;
36
41
  OnigCodePoint sb_out;
42
+ rb_encoding *enc;
43
+ enc = rb_utf8_encoding();
37
44
 
38
45
  ctype = prop_name_to_ctype(name, enc);
39
46
  ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &onig_ranges);
40
47
  return onig_ranges_to_rb(onig_ranges);
41
48
  }
42
49
 
43
- VALUE method_matched_ranges(VALUE self, VALUE arg) {
50
+ VALUE method_matched_ranges(VALUE self, VALUE arg)
51
+ {
44
52
  char *prop_name;
45
- rb_encoding *enc;
46
-
47
53
  prop_name = StringValueCStr(arg);
48
- enc = rb_enc_get(arg);
49
- return rb_prop_ranges(prop_name, enc);
54
+ return rb_prop_ranges(prop_name);
50
55
  }
51
56
 
52
- void Init_regexp_property_values() {
57
+ void Init_regexp_property_values()
58
+ {
59
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
60
+ rb_ext_ractor_safe(true);
61
+ #endif
62
+
53
63
  VALUE module;
54
64
  module = rb_define_module("OnigRegexpPropertyHelper");
55
65
  rb_define_singleton_method(module, "matched_ranges", method_matched_ranges, 1);
data/lib/aliases ADDED
@@ -0,0 +1,241 @@
1
+ AHex;ASCII_Hex_Digit
2
+ Adlm;Adlam
3
+ Aghb;Caucasian_Albanian
4
+ Arab;Arabic
5
+ Armi;Imperial_Aramaic
6
+ Armn;Armenian
7
+ Avst;Avestan
8
+ Bali;Balinese
9
+ Bamu;Bamum
10
+ Bass;Bassa_Vah
11
+ Batk;Batak
12
+ Beng;Bengali
13
+ Bhks;Bhaiksuki
14
+ Bidi_C;Bidi_Control
15
+ Bopo;Bopomofo
16
+ Brah;Brahmi
17
+ Brai;Braille
18
+ Bugi;Buginese
19
+ Buhd;Buhid
20
+ C;Other
21
+ CI;Case_Ignorable
22
+ CWCF;Changes_When_Casefolded
23
+ CWCM;Changes_When_Casemapped
24
+ CWL;Changes_When_Lowercased
25
+ CWT;Changes_When_Titlecased
26
+ CWU;Changes_When_Uppercased
27
+ Cakm;Chakma
28
+ Cans;Canadian_Aboriginal
29
+ Cari;Carian
30
+ Cc;Control
31
+ Cf;Format
32
+ Cher;Cherokee
33
+ Chrs;Chorasmian
34
+ Cn;Unassigned
35
+ Co;Private_Use
36
+ Combining_Mark;Mark
37
+ Copt;Coptic
38
+ Cprt;Cypriot
39
+ Cs;Surrogate
40
+ Cyrl;Cyrillic
41
+ DI;Default_Ignorable_Code_Point
42
+ Dep;Deprecated
43
+ Deva;Devanagari
44
+ Dia;Diacritic
45
+ Diak;Dives_Akuru
46
+ Dogr;Dogra
47
+ Dsrt;Deseret
48
+ Dupl;Duployan
49
+ EBase;Emoji_Modifier_Base
50
+ EComp;Emoji_Component
51
+ EMod;Emoji_Modifier
52
+ EPres;Emoji_Presentation
53
+ Egyp;Egyptian_Hieroglyphs
54
+ Elba;Elbasan
55
+ Elym;Elymaic
56
+ Ethi;Ethiopic
57
+ Ext;Extender
58
+ Geor;Georgian
59
+ Glag;Glagolitic
60
+ Gong;Gunjala_Gondi
61
+ Gonm;Masaram_Gondi
62
+ Goth;Gothic
63
+ Gr_Base;Grapheme_Base
64
+ Gr_Ext;Grapheme_Extend
65
+ Gr_Link;Grapheme_Link
66
+ Gran;Grantha
67
+ Grek;Greek
68
+ Gujr;Gujarati
69
+ Guru;Gurmukhi
70
+ Hang;Hangul
71
+ Hani;Han
72
+ Hano;Hanunoo
73
+ Hatr;Hatran
74
+ Hebr;Hebrew
75
+ Hex;Hex_Digit
76
+ Hira;Hiragana
77
+ Hluw;Anatolian_Hieroglyphs
78
+ Hmng;Pahawh_Hmong
79
+ Hmnp;Nyiakeng_Puachue_Hmong
80
+ Hung;Old_Hungarian
81
+ IDC;ID_Continue
82
+ IDS;ID_Start
83
+ IDSB;IDS_Binary_Operator
84
+ IDST;IDS_Trinary_Operator
85
+ Ideo;Ideographic
86
+ Ital;Old_Italic
87
+ Java;Javanese
88
+ Join_C;Join_Control
89
+ Kali;Kayah_Li
90
+ Kana;Katakana
91
+ Khar;Kharoshthi
92
+ Khmr;Khmer
93
+ Khoj;Khojki
94
+ Kits;Khitan_Small_Script
95
+ Knda;Kannada
96
+ Kthi;Kaithi
97
+ L;Letter
98
+ LC;Cased_Letter
99
+ LOE;Logical_Order_Exception
100
+ Lana;Tai_Tham
101
+ Laoo;Lao
102
+ Latn;Latin
103
+ Lepc;Lepcha
104
+ Limb;Limbu
105
+ Lina;Linear_A
106
+ Linb;Linear_B
107
+ Ll;Lowercase_Letter
108
+ Lm;Modifier_Letter
109
+ Lo;Other_Letter
110
+ Lt;Titlecase_Letter
111
+ Lu;Uppercase_Letter
112
+ Lyci;Lycian
113
+ Lydi;Lydian
114
+ M;Mark
115
+ Mahj;Mahajani
116
+ Maka;Makasar
117
+ Mand;Mandaic
118
+ Mani;Manichaean
119
+ Marc;Marchen
120
+ Mc;Spacing_Mark
121
+ Me;Enclosing_Mark
122
+ Medf;Medefaidrin
123
+ Mend;Mende_Kikakui
124
+ Merc;Meroitic_Cursive
125
+ Mero;Meroitic_Hieroglyphs
126
+ Mlym;Malayalam
127
+ Mn;Nonspacing_Mark
128
+ Mong;Mongolian
129
+ Mroo;Mro
130
+ Mtei;Meetei_Mayek
131
+ Mult;Multani
132
+ Mymr;Myanmar
133
+ N;Number
134
+ NChar;Noncharacter_Code_Point
135
+ Nand;Nandinagari
136
+ Narb;Old_North_Arabian
137
+ Nbat;Nabataean
138
+ Nd;Decimal_Number
139
+ Nkoo;Nko
140
+ Nl;Letter_Number
141
+ No;Other_Number
142
+ Nshu;Nushu
143
+ OAlpha;Other_Alphabetic
144
+ ODI;Other_Default_Ignorable_Code_Point
145
+ OGr_Ext;Other_Grapheme_Extend
146
+ OIDC;Other_ID_Continue
147
+ OIDS;Other_ID_Start
148
+ OLower;Other_Lowercase
149
+ OMath;Other_Math
150
+ OUpper;Other_Uppercase
151
+ Ogam;Ogham
152
+ Olck;Ol_Chiki
153
+ Orkh;Old_Turkic
154
+ Orya;Oriya
155
+ Osge;Osage
156
+ Osma;Osmanya
157
+ P;Punctuation
158
+ PCM;Prepended_Concatenation_Mark
159
+ Palm;Palmyrene
160
+ Pat_Syn;Pattern_Syntax
161
+ Pat_WS;Pattern_White_Space
162
+ Pauc;Pau_Cin_Hau
163
+ Pc;Connector_Punctuation
164
+ Pd;Dash_Punctuation
165
+ Pe;Close_Punctuation
166
+ Perm;Old_Permic
167
+ Pf;Final_Punctuation
168
+ Phag;Phags_Pa
169
+ Phli;Inscriptional_Pahlavi
170
+ Phlp;Psalter_Pahlavi
171
+ Phnx;Phoenician
172
+ Pi;Initial_Punctuation
173
+ Plrd;Miao
174
+ Po;Other_Punctuation
175
+ Prti;Inscriptional_Parthian
176
+ Ps;Open_Punctuation
177
+ QMark;Quotation_Mark
178
+ Qaac;Coptic
179
+ Qaai;Inherited
180
+ RI;Regional_Indicator
181
+ Rjng;Rejang
182
+ Rohg;Hanifi_Rohingya
183
+ Runr;Runic
184
+ S;Symbol
185
+ SD;Soft_Dotted
186
+ STerm;Sentence_Terminal
187
+ Samr;Samaritan
188
+ Sarb;Old_South_Arabian
189
+ Saur;Saurashtra
190
+ Sc;Currency_Symbol
191
+ Sgnw;SignWriting
192
+ Shaw;Shavian
193
+ Shrd;Sharada
194
+ Sidd;Siddham
195
+ Sind;Khudawadi
196
+ Sinh;Sinhala
197
+ Sk;Modifier_Symbol
198
+ Sm;Math_Symbol
199
+ So;Other_Symbol
200
+ Sogd;Sogdian
201
+ Sogo;Old_Sogdian
202
+ Sora;Sora_Sompeng
203
+ Soyo;Soyombo
204
+ Sund;Sundanese
205
+ Sylo;Syloti_Nagri
206
+ Syrc;Syriac
207
+ Tagb;Tagbanwa
208
+ Takr;Takri
209
+ Tale;Tai_Le
210
+ Talu;New_Tai_Lue
211
+ Taml;Tamil
212
+ Tang;Tangut
213
+ Tavt;Tai_Viet
214
+ Telu;Telugu
215
+ Term;Terminal_Punctuation
216
+ Tfng;Tifinagh
217
+ Tglg;Tagalog
218
+ Thaa;Thaana
219
+ Tibt;Tibetan
220
+ Tirh;Tirhuta
221
+ UIdeo;Unified_Ideograph
222
+ Ugar;Ugaritic
223
+ VS;Variation_Selector
224
+ Vaii;Vai
225
+ WSpace;White_Space
226
+ Wara;Warang_Citi
227
+ Wcho;Wancho
228
+ XIDC;XID_Continue
229
+ XIDS;XID_Start
230
+ Xpeo;Old_Persian
231
+ Xsux;Cuneiform
232
+ Yezi;Yezidi
233
+ Yiii;Yi
234
+ Z;Separator
235
+ Zanb;Zanabazar_Square
236
+ Zinh;Inherited
237
+ Zl;Line_Separator
238
+ Zp;Paragraph_Separator
239
+ Zs;Space_Separator
240
+ Zyyy;Common
241
+ Zzzz;Unknown
@@ -0,0 +1,147 @@
1
+ module RegexpPropertyValues
2
+ module Updater
3
+ module_function
4
+
5
+ require 'fileutils'
6
+ require 'set'
7
+
8
+ BASE_URL = 'http://www.unicode.org/Public'
9
+
10
+ UCD_FILES = %w[
11
+ Blocks.txt
12
+ DerivedAge.txt
13
+ DerivedCoreProperties.txt
14
+ PropertyAliases.txt
15
+ PropertyValueAliases.txt
16
+ PropList.txt
17
+ Scripts.txt
18
+ ]
19
+
20
+ EMOJI_FILES = %w[
21
+ emoji-data.txt
22
+ ]
23
+
24
+ TMP_DIR = File.join(__dir__, 'tmp_ucd')
25
+
26
+ def call(ucd_path: nil, emoji_path: nil)
27
+ prepare_tmp_dir
28
+ download_ucd_files(ucd_path: ucd_path, emoji_path: emoji_path)
29
+ write_values
30
+ write_aliases
31
+ remove_tmp_dir
32
+ print_stats
33
+ end
34
+
35
+ def prepare_tmp_dir
36
+ FileUtils.rm_rf(TMP_DIR) if File.exist?(TMP_DIR)
37
+ FileUtils.mkdir(TMP_DIR)
38
+ end
39
+
40
+ def download_ucd_files(ucd_path: nil, emoji_path: nil)
41
+ unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION')
42
+ emoji_version = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION')
43
+
44
+ ucd_path ||= ENV['RPV_UCD_PATH']
45
+ emoji_path ||= ENV['RPV_EMOJI_PATH']
46
+
47
+ if ucd_path.nil? && emoji_path.nil?
48
+ puts <<-EOS.gsub(/\n */, ' ')
49
+ This try will load ucd and emoji data for the CURRENT RUBY (
50
+ (#{RUBY_VERSION} - ucd #{unicode_version}, emoji #{emoji_version}).
51
+ Run this on the latest Ruby version you want to support.
52
+ Unicode directory structure changes sometimes, so you might need to
53
+ pass the right path(s) as keyword args or ENV vars. Continue? [y/n]'
54
+ EOS
55
+
56
+ return puts 'download skipped.' unless $stdin.gets =~ /^y/i
57
+ end
58
+
59
+ ucd_path ||= "#{BASE_URL}/#{unicode_version}/ucd"
60
+ emoji_path ||= "#{BASE_URL}/emoji/#{emoji_version}"
61
+
62
+ Dir.chdir(TMP_DIR) do
63
+ UCD_FILES.each { |f| `wget #{ucd_path}/#{f}` }
64
+ EMOJI_FILES.each { |f| `wget #{emoji_path}/#{f}` }
65
+ end
66
+ end
67
+
68
+ def write_values
69
+ @values = Set.new
70
+
71
+ # posix properties
72
+ @values += %w[
73
+ Alpha Blank Cntrl Digit Graph Lower Print
74
+ Punct Space Upper XDigit Word Alnum ASCII
75
+ XPosixPunct
76
+ ]
77
+
78
+ # special properties
79
+ @values += %w[Any Assigned In_No_Block Unknown]
80
+
81
+ # legacy properties
82
+ @values += %w[Newline]
83
+
84
+ regexp = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
85
+ %w[
86
+ DerivedCoreProperties.txt
87
+ PropList.txt
88
+ Scripts.txt
89
+ emoji-data.txt
90
+ ].each { |file| scan(file, regexp) { |caps| @values << caps[:prop_name] } }
91
+
92
+ scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |caps|
93
+ @values << caps[:prop_name]
94
+ end
95
+
96
+ scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |caps|
97
+ @values << 'In_' + caps[:block_name].gsub(/\W/, '_')
98
+ end
99
+
100
+ scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |caps|
101
+ @values << 'Age=' + caps[:age_num]
102
+ end
103
+
104
+ File.write(RegexpPropertyValues::VALUES_PATH, @values.sort.join("\n"))
105
+ end
106
+
107
+ def write_aliases
108
+ @aliases = Set.new
109
+
110
+ scan('PropertyAliases.txt', /^(?<alias>\w+) *; (?<name>\w+)/) do |caps|
111
+ if in_values?(caps[:name]) && !in_values?(caps[:alias])
112
+ @aliases << [caps[:alias], caps[:name]]
113
+ end
114
+ end
115
+
116
+ scan('PropertyValueAliases.txt',
117
+ /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/) do |caps|
118
+ if in_values?(caps[:name]) && !in_values?(caps[:alias1])
119
+ @aliases << [caps[:alias1], caps[:name]]
120
+ end
121
+ if in_values?(caps[:name]) && caps[:alias2] && !in_values?(caps[:alias2])
122
+ @aliases << [caps[:alias2], caps[:name]]
123
+ end
124
+ end
125
+
126
+ File.write(RegexpPropertyValues::ALIASES_PATH,
127
+ @aliases.sort.map { |pair| pair.join(';') }.join("\n"))
128
+ end
129
+
130
+ def in_values?(string)
131
+ @values.any? { |value| value.casecmp?(string) }
132
+ end
133
+
134
+ def scan(file, pattern)
135
+ path = File.join(TMP_DIR, file)
136
+ File.read(path).scan(pattern) { yield(Regexp.last_match) }
137
+ end
138
+
139
+ def remove_tmp_dir
140
+ FileUtils.rm_rf(TMP_DIR)
141
+ end
142
+
143
+ def print_stats
144
+ print "\nFetched #{@values.size} values and #{@aliases.size} aliases.\n\n"
145
+ end
146
+ end
147
+ end
@@ -0,0 +1,19 @@
1
+ module RegexpPropertyValues
2
+ class Value
3
+ module ExtAdapter
4
+ def matched_characters
5
+ matched_codepoints.map { |cp| cp.chr('utf-8') }
6
+ end
7
+
8
+ def matched_codepoints
9
+ matched_ranges.flat_map(&:to_a)
10
+ end
11
+
12
+ def matched_ranges
13
+ OnigRegexpPropertyHelper.matched_ranges(name)
14
+ rescue ArgumentError
15
+ raise_unsupported_or_unknown_error
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ module RegexpPropertyValues
2
+ class Value
3
+ module RubyFallback
4
+ def matched_characters
5
+ matched_codepoints.map { |cp| cp.chr('utf-8') }
6
+ end
7
+
8
+ def matched_codepoints
9
+ # turns out scanning one big string is the least slow way to do this
10
+ @@test_str ||= (0..0xD7FF).map { |cp| cp.chr('utf-8') }.join <<
11
+ (0xE000..0x10FFFF).map { |cp| cp.chr('utf-8') }.join
12
+ @@test_str.scan(regexp).flat_map(&:codepoints)
13
+ end
14
+
15
+ def matched_ranges
16
+ require 'range_compressor'
17
+ RangeCompressor.compress(matched_codepoints)
18
+ end
19
+ end
20
+ end
21
+ end