regexp_property_values 0.3.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 26eb46070df0c2f41fe773860140f0b9fea4f9512284a49f2dcac8d77d7055e5
4
- data.tar.gz: d7f31346d39f7d00d0cc3d837b0f32685c2eec56f4e784b641428959134506c2
3
+ metadata.gz: 3295d2f712a9e6aca9af41f2d4b98775dfc39cec0c9dc026fcd50999d4757dd2
4
+ data.tar.gz: 2203633248c4cfb39489edede3be46fc3f935f8bbb6d958c9bd962ecf268a593
5
5
  SHA512:
6
- metadata.gz: 5f425e9a3d570bc612b75f1b113a2077cfeb35e106406655344a13d7c9cd6b74cdc0ce59552cd6017f3b6db3b654877f0cfe6ef02ef60d463c83158a7f51b23a
7
- data.tar.gz: bdf9f9fc7c2a8077af610f2285672fcc4af78fb774542c8cfcf61c8a1ccf03d4894dd79729119c5084ccae752171c92c96430f7d1a35a4214857389f59e9eba9
6
+ metadata.gz: eb72ccf725571146439af6ebaaec6a38420b39b19348827f08f3ca230af326a32653f05b4820f2fd66d98c5e7f7b394a4790e03badb95530311e1a5def209e2d
7
+ data.tar.gz: 26d720ddd4c306223e38e944b7b08cc4c7170b39c120b906766d599b77fc402d560aa1dcbd8921bad30301536800b29cbfc39d525318827561efe162343a08a5
data/.gitignore CHANGED
@@ -20,6 +20,7 @@ binstubs/*
20
20
  bundler_stubs/*/.yardoc
21
21
  Gemfile.lock
22
22
  /.bundle/
23
+ /.vscode/
23
24
  /_yardoc/
24
25
  /coverage/
25
26
  /doc/
@@ -6,6 +6,3 @@ rvm:
6
6
  - 2.5
7
7
  - 2.6
8
8
  - jruby-9.1.9.0
9
- before_install:
10
- - gem update --system
11
- - gem install bundler
@@ -0,0 +1,19 @@
1
+ # Changelog
2
+ All notable changes to this project will be documented in this file.
3
+
4
+ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
+ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [1.0.0] - 2019-06-16
8
+
9
+ ### Changed
10
+ - removed `::by_category`, `::by_matched_codepoints`, `::short_and_long_names`
11
+ - return values are now always instances of a custom `Value` class, no longer extended `String`s
12
+ - unknown properties now raise a `RegexpPropertyValues::Error` instead of an `ArgumentError`
13
+
14
+ ### Added
15
+ - `Value#identifier`
16
+ - `Value#full_name`
17
+
18
+ ### Fixed
19
+ - better codepoint determination speed for non-C Rubies (still slow)
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # RegexpPropertyValues
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/regexp_property_values.svg)](http://badge.fury.io/rb/regexp_property_values)
4
- [![Build Status](https://travis-ci.org/janosch-x/regexp_property_values.svg?branch=master)](https://travis-ci.org/janosch-x/regexp_property_values)
4
+ [![Build Status](https://travis-ci.org/jaynetics/regexp_property_values.svg?branch=master)](https://travis-ci.org/jaynetics/regexp_property_values)
5
5
 
6
6
  This small library lets you see which property values are supported by the regular expression engine of the Ruby version you are running, and reads out their codepoint ranges directly from that engine.
7
7
 
@@ -16,22 +16,16 @@ require 'regexp_property_values'
16
16
 
17
17
  PV = RegexpPropertyValues
18
18
 
19
- PV.all # => ["Alpha", "Blank", "Cntrl", ...]
20
- PV.by_category # => {"POSIX brackets" => ["Alpha", ...], "Special" => ...}
21
- PV.short_and_long_names # => [["M", "Grek", ...], ["Mark", "Greek", ...]]
19
+ PV.all # => [<Value name='Alpha'>, <Value name='Blank'>, ...]
22
20
  ```
23
21
 
24
22
  ##### Browse property values supported by the Ruby you are running
25
23
 
26
24
  ```ruby
27
- PV.all_for_current_ruby # => ["Alpha", "Blank", "Cntrl", ...]
28
-
29
- PV.by_category.map { |k, v| [k, v.select(&:supported_by_current_ruby?] }
30
-
31
- # etc.
25
+ PV.all_for_current_ruby # => [<Value name='Alpha'>, <Value name='Blank'>, ...]
32
26
  ```
33
27
 
34
- ##### Inspect properties
28
+ ##### Inspect property values
35
29
 
36
30
  ```ruby
37
31
  PV['alpha'].supported_by_current_ruby? # => true
@@ -42,7 +36,7 @@ PV['AHex'].matched_codepoints # => [48, 49, 50, ...]
42
36
  PV['AHex'].matched_ranges # => [48..57, 65..70, 97..102]
43
37
  ```
44
38
 
45
- If [`character_set`](https://github.com/janosch-x/character_set) is installed, you can also do this:
39
+ If [`character_set`](https://github.com/jaynetics/character_set) is installed, you can also do this:
46
40
 
47
41
  ```ruby
48
42
  PV['AHex'].character_set # => #<CharacterSet: {48, 49...} (size: 22)>
@@ -51,9 +45,9 @@ PV['AHex'].character_set # => #<CharacterSet: {48, 49...} (size: 22)>
51
45
  ##### Utility methods
52
46
 
53
47
  ```ruby
54
- # This one takes a few seconds (or minutes, without the C extension)
55
- PV.alias_hash # => {"M" => "Mark", "Grek" => "Greek", ...}
48
+ # get a Hash of aliases for property names
49
+ PV.alias_hash # => { <Value name='M'> => <Value name='Mark'>, ... }
56
50
 
57
- # download the latest list of possible properties
51
+ # download a list of possible properties for the running Ruby version
58
52
  PV.update
59
53
  ```
data/Rakefile CHANGED
@@ -17,6 +17,8 @@ namespace :java do
17
17
  java_gemspec.platform = 'java'
18
18
  java_gemspec.extensions = []
19
19
 
20
+ java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0'
21
+
20
22
  Gem::PackageTask.new(java_gemspec) do |pkg|
21
23
  pkg.need_zip = true
22
24
  pkg.need_tar = true
@@ -2,25 +2,29 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include "ruby/oniguruma.h" // still in recent rubies f. backwards compatibility
4
4
 
5
- static int prop_name_to_ctype(char* name, rb_encoding *enc) {
5
+ static int prop_name_to_ctype(char *name, rb_encoding *enc)
6
+ {
6
7
  UChar *uname;
7
8
  int ctype;
8
9
 
9
- uname = (UChar*)name;
10
+ uname = (UChar *)name;
10
11
  ctype = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, uname, uname + strlen(name));
11
- if (ctype < 0) rb_raise(rb_eArgError, "Unknown property name `%s`", name);
12
+ if (ctype < 0)
13
+ rb_raise(rb_eArgError, "Unknown property name `%s`", name);
12
14
 
13
15
  return ctype;
14
16
  }
15
17
 
16
- VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges) {
18
+ VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges)
19
+ {
17
20
  unsigned int range_count, i;
18
21
  VALUE result, sub_range;
19
22
 
20
23
  range_count = onig_ranges[0];
21
24
  result = rb_ary_new2(range_count); // rb_ary_new_capa not avail. in Ruby 2.0
22
25
 
23
- for (i = 0; i < range_count; i++) {
26
+ for (i = 0; i < range_count; i++)
27
+ {
24
28
  sub_range = rb_range_new(INT2FIX(onig_ranges[(i * 2) + 1]),
25
29
  INT2FIX(onig_ranges[(i * 2) + 2]),
26
30
  0);
@@ -30,26 +34,28 @@ VALUE onig_ranges_to_rb(const OnigCodePoint *onig_ranges) {
30
34
  return result;
31
35
  }
32
36
 
33
- VALUE rb_prop_ranges(char* name, rb_encoding *enc) {
37
+ VALUE rb_prop_ranges(char *name)
38
+ {
34
39
  int ctype;
35
40
  const OnigCodePoint *onig_ranges;
36
41
  OnigCodePoint sb_out;
42
+ rb_encoding *enc;
43
+ enc = rb_utf8_encoding();
37
44
 
38
45
  ctype = prop_name_to_ctype(name, enc);
39
46
  ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &onig_ranges);
40
47
  return onig_ranges_to_rb(onig_ranges);
41
48
  }
42
49
 
43
- VALUE method_matched_ranges(VALUE self, VALUE arg) {
50
+ VALUE method_matched_ranges(VALUE self, VALUE arg)
51
+ {
44
52
  char *prop_name;
45
- rb_encoding *enc;
46
-
47
53
  prop_name = StringValueCStr(arg);
48
- enc = rb_enc_get(arg);
49
- return rb_prop_ranges(prop_name, enc);
54
+ return rb_prop_ranges(prop_name);
50
55
  }
51
56
 
52
- void Init_regexp_property_values() {
57
+ void Init_regexp_property_values()
58
+ {
53
59
  VALUE module;
54
60
  module = rb_define_module("OnigRegexpPropertyHelper");
55
61
  rb_define_singleton_method(module, "matched_ranges", method_matched_ranges, 1);
@@ -0,0 +1,233 @@
1
+ AHex;ASCII_Hex_Digit
2
+ Adlm;Adlam
3
+ Aghb;Caucasian_Albanian
4
+ Arab;Arabic
5
+ Armi;Imperial_Aramaic
6
+ Armn;Armenian
7
+ Avst;Avestan
8
+ Bali;Balinese
9
+ Bamu;Bamum
10
+ Bass;Bassa_Vah
11
+ Batk;Batak
12
+ Beng;Bengali
13
+ Bhks;Bhaiksuki
14
+ Bidi_C;Bidi_Control
15
+ Bopo;Bopomofo
16
+ Brah;Brahmi
17
+ Brai;Braille
18
+ Bugi;Buginese
19
+ Buhd;Buhid
20
+ C;Other
21
+ CI;Case_Ignorable
22
+ CWCF;Changes_When_Casefolded
23
+ CWCM;Changes_When_Casemapped
24
+ CWL;Changes_When_Lowercased
25
+ CWT;Changes_When_Titlecased
26
+ CWU;Changes_When_Uppercased
27
+ Cakm;Chakma
28
+ Cans;Canadian_Aboriginal
29
+ Cari;Carian
30
+ Cc;Control
31
+ Cf;Format
32
+ Cher;Cherokee
33
+ Cn;Unassigned
34
+ Co;Private_Use
35
+ Combining_Mark;Mark
36
+ Copt;Coptic
37
+ Cprt;Cypriot
38
+ Cs;Surrogate
39
+ Cyrl;Cyrillic
40
+ DI;Default_Ignorable_Code_Point
41
+ Dep;Deprecated
42
+ Deva;Devanagari
43
+ Dia;Diacritic
44
+ Dogr;Dogra
45
+ Dsrt;Deseret
46
+ Dupl;Duployan
47
+ Egyp;Egyptian_Hieroglyphs
48
+ Elba;Elbasan
49
+ Elym;Elymaic
50
+ Ethi;Ethiopic
51
+ Ext;Extender
52
+ Geor;Georgian
53
+ Glag;Glagolitic
54
+ Gong;Gunjala_Gondi
55
+ Gonm;Masaram_Gondi
56
+ Goth;Gothic
57
+ Gr_Base;Grapheme_Base
58
+ Gr_Ext;Grapheme_Extend
59
+ Gr_Link;Grapheme_Link
60
+ Gran;Grantha
61
+ Grek;Greek
62
+ Gujr;Gujarati
63
+ Guru;Gurmukhi
64
+ Hang;Hangul
65
+ Hani;Han
66
+ Hano;Hanunoo
67
+ Hatr;Hatran
68
+ Hebr;Hebrew
69
+ Hex;Hex_Digit
70
+ Hira;Hiragana
71
+ Hluw;Anatolian_Hieroglyphs
72
+ Hmng;Pahawh_Hmong
73
+ Hmnp;Nyiakeng_Puachue_Hmong
74
+ Hung;Old_Hungarian
75
+ IDC;ID_Continue
76
+ IDS;ID_Start
77
+ IDSB;IDS_Binary_Operator
78
+ IDST;IDS_Trinary_Operator
79
+ Ideo;Ideographic
80
+ Ital;Old_Italic
81
+ Java;Javanese
82
+ Join_C;Join_Control
83
+ Kali;Kayah_Li
84
+ Kana;Katakana
85
+ Khar;Kharoshthi
86
+ Khmr;Khmer
87
+ Khoj;Khojki
88
+ Knda;Kannada
89
+ Kthi;Kaithi
90
+ L;Letter
91
+ LC;Cased_Letter
92
+ LOE;Logical_Order_Exception
93
+ Lana;Tai_Tham
94
+ Laoo;Lao
95
+ Latn;Latin
96
+ Lepc;Lepcha
97
+ Limb;Limbu
98
+ Lina;Linear_A
99
+ Linb;Linear_B
100
+ Ll;Lowercase_Letter
101
+ Lm;Modifier_Letter
102
+ Lo;Other_Letter
103
+ Lt;Titlecase_Letter
104
+ Lu;Uppercase_Letter
105
+ Lyci;Lycian
106
+ Lydi;Lydian
107
+ M;Mark
108
+ Mahj;Mahajani
109
+ Maka;Makasar
110
+ Mand;Mandaic
111
+ Mani;Manichaean
112
+ Marc;Marchen
113
+ Mc;Spacing_Mark
114
+ Me;Enclosing_Mark
115
+ Medf;Medefaidrin
116
+ Mend;Mende_Kikakui
117
+ Merc;Meroitic_Cursive
118
+ Mero;Meroitic_Hieroglyphs
119
+ Mlym;Malayalam
120
+ Mn;Nonspacing_Mark
121
+ Mong;Mongolian
122
+ Mroo;Mro
123
+ Mtei;Meetei_Mayek
124
+ Mult;Multani
125
+ Mymr;Myanmar
126
+ N;Number
127
+ NChar;Noncharacter_Code_Point
128
+ Nand;Nandinagari
129
+ Narb;Old_North_Arabian
130
+ Nbat;Nabataean
131
+ Nd;Decimal_Number
132
+ Nkoo;Nko
133
+ Nl;Letter_Number
134
+ No;Other_Number
135
+ Nshu;Nushu
136
+ OAlpha;Other_Alphabetic
137
+ ODI;Other_Default_Ignorable_Code_Point
138
+ OGr_Ext;Other_Grapheme_Extend
139
+ OIDC;Other_ID_Continue
140
+ OIDS;Other_ID_Start
141
+ OLower;Other_Lowercase
142
+ OMath;Other_Math
143
+ OUpper;Other_Uppercase
144
+ Ogam;Ogham
145
+ Olck;Ol_Chiki
146
+ Orkh;Old_Turkic
147
+ Orya;Oriya
148
+ Osge;Osage
149
+ Osma;Osmanya
150
+ P;Punctuation
151
+ PCM;Prepended_Concatenation_Mark
152
+ Palm;Palmyrene
153
+ Pat_Syn;Pattern_Syntax
154
+ Pat_WS;Pattern_White_Space
155
+ Pauc;Pau_Cin_Hau
156
+ Pc;Connector_Punctuation
157
+ Pd;Dash_Punctuation
158
+ Pe;Close_Punctuation
159
+ Perm;Old_Permic
160
+ Pf;Final_Punctuation
161
+ Phag;Phags_Pa
162
+ Phli;Inscriptional_Pahlavi
163
+ Phlp;Psalter_Pahlavi
164
+ Phnx;Phoenician
165
+ Pi;Initial_Punctuation
166
+ Plrd;Miao
167
+ Po;Other_Punctuation
168
+ Prti;Inscriptional_Parthian
169
+ Ps;Open_Punctuation
170
+ QMark;Quotation_Mark
171
+ Qaac;Coptic
172
+ Qaai;Inherited
173
+ RI;Regional_Indicator
174
+ Rjng;Rejang
175
+ Rohg;Hanifi_Rohingya
176
+ Runr;Runic
177
+ S;Symbol
178
+ SD;Soft_Dotted
179
+ STerm;Sentence_Terminal
180
+ Samr;Samaritan
181
+ Sarb;Old_South_Arabian
182
+ Saur;Saurashtra
183
+ Sc;Currency_Symbol
184
+ Sgnw;SignWriting
185
+ Shaw;Shavian
186
+ Shrd;Sharada
187
+ Sidd;Siddham
188
+ Sind;Khudawadi
189
+ Sinh;Sinhala
190
+ Sk;Modifier_Symbol
191
+ Sm;Math_Symbol
192
+ So;Other_Symbol
193
+ Sogd;Sogdian
194
+ Sogo;Old_Sogdian
195
+ Sora;Sora_Sompeng
196
+ Soyo;Soyombo
197
+ Sund;Sundanese
198
+ Sylo;Syloti_Nagri
199
+ Syrc;Syriac
200
+ Tagb;Tagbanwa
201
+ Takr;Takri
202
+ Tale;Tai_Le
203
+ Talu;New_Tai_Lue
204
+ Taml;Tamil
205
+ Tang;Tangut
206
+ Tavt;Tai_Viet
207
+ Telu;Telugu
208
+ Term;Terminal_Punctuation
209
+ Tfng;Tifinagh
210
+ Tglg;Tagalog
211
+ Thaa;Thaana
212
+ Tibt;Tibetan
213
+ Tirh;Tirhuta
214
+ UIdeo;Unified_Ideograph
215
+ Ugar;Ugaritic
216
+ VS;Variation_Selector
217
+ Vaii;Vai
218
+ WSpace;White_Space
219
+ Wara;Warang_Citi
220
+ Wcho;Wancho
221
+ XIDC;XID_Continue
222
+ XIDS;XID_Start
223
+ Xpeo;Old_Persian
224
+ Xsux;Cuneiform
225
+ Yiii;Yi
226
+ Z;Separator
227
+ Zanb;Zanabazar_Square
228
+ Zinh;Inherited
229
+ Zl;Line_Separator
230
+ Zp;Paragraph_Separator
231
+ Zs;Space_Separator
232
+ Zyyy;Common
233
+ Zzzz;Unknown
@@ -3,77 +3,35 @@ begin
3
3
  rescue LoadError
4
4
  warn 'regexp_property_values could not load C extension, using slower Ruby'
5
5
  end
6
- require 'regexp_property_values/extension'
6
+ require 'regexp_property_values/updater'
7
+ require 'regexp_property_values/value'
7
8
  require 'regexp_property_values/version'
8
9
 
9
10
  module RegexpPropertyValues
10
- module_function
11
+ Error = Class.new(StandardError)
11
12
 
12
- LIST_URL = 'https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/UnicodeProps.txt'
13
+ VALUES_PATH = File.join(__dir__, 'values')
14
+ ALIASES_PATH = File.join(__dir__, 'aliases')
13
15
 
14
- def update
15
- puts "Downloading #{LIST_URL}"
16
- require 'open-uri'
17
- File.open(file_path, 'w') { |f| IO.copy_stream(open(LIST_URL), f) }
18
- puts 'Done!'
16
+ def self.[](name)
17
+ Value.new(name)
19
18
  end
20
19
 
21
- def file_path
22
- File.expand_path('../UnicodeProps.txt', __FILE__)
20
+ def self.all_for_current_ruby
21
+ @all_for_current_ruby ||= all.select(&:supported_by_current_ruby?)
23
22
  end
24
23
 
25
- def all
26
- by_category.values.flatten
24
+ def self.all
25
+ @all ||= File.readlines(VALUES_PATH).map { |line| Value.new(line.chomp) }
27
26
  end
28
27
 
29
- def all_for_current_ruby
30
- all.select(&:supported_by_current_ruby?)
28
+ def self.alias_hash
29
+ @alias_hash ||= File.readlines(ALIASES_PATH).map do |line|
30
+ line.chomp.split(';').map { |name| Value.new(name) }
31
+ end.to_h
31
32
  end
32
33
 
33
- def by_category
34
- result = File.foreach(file_path).each_with_object({}) do |line, hash|
35
- if /^\* (?<category>\S.+)/ =~ line
36
- @current_category = category
37
- hash[@current_category] ||= []
38
- elsif /^ {4}(?<value_name>\S.*)/ =~ line
39
- hash[@current_category] << value_name.extend(Extension)
40
- end
41
- end
42
- add_oniguruma_properties(result)
43
- result
44
- end
45
-
46
- def add_oniguruma_properties(props_by_category)
47
- props_by_category['Special'] << 'Newline'.extend(Extension)
48
- end
49
-
50
- def alias_hash
51
- short_names, long_names = short_and_long_names
52
- return {} if short_names.empty?
53
-
54
- long_names -= by_category['POSIX brackets']
55
- by_matched_codepoints.each_value.each_with_object({}) do |props, hash|
56
- next if props.count < 2
57
- long_name = (props & long_names)[0] || fail("no long name for #{props}")
58
- (props & short_names).each { |short_name| hash[short_name] = long_name }
59
- end
60
- end
61
-
62
- def short_and_long_names
63
- short_name_categories = ['Major and General Categories',
64
- 'PropertyAliases',
65
- 'PropertyValueAliases (Script)']
66
- by_category.each_with_object([[], []]) do |(cat_name, props), (short, long)|
67
- (short_name_categories.include?(cat_name) ? short : long).concat(props)
68
- end
69
- end
70
-
71
- def by_matched_codepoints
72
- puts 'Establishing property codepoints, this may take a bit ...'
73
- all_for_current_ruby.group_by(&:matched_codepoints)
74
- end
75
-
76
- def [](prop)
77
- prop.extend(Extension)
34
+ def self.update
35
+ Updater.call
78
36
  end
79
37
  end