character_set 1.2.0-java → 1.5.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +3 -0
- data/.github/workflows/gouteur.yml +20 -0
- data/.github/workflows/lint.yml +29 -0
- data/.github/workflows/tests.yml +22 -0
- data/.gitignore +1 -0
- data/.gouteur.yml +2 -0
- data/.rubocop.yml +17 -0
- data/BENCHMARK.md +53 -17
- data/CHANGELOG.md +54 -0
- data/README.md +51 -12
- data/Rakefile +20 -18
- data/benchmarks/count_in.rb +13 -0
- data/benchmarks/delete_in.rb +1 -1
- data/benchmarks/scan.rb +13 -0
- data/benchmarks/shared.rb +5 -0
- data/benchmarks/z_add.rb +12 -0
- data/benchmarks/z_delete.rb +12 -0
- data/benchmarks/z_merge.rb +15 -0
- data/benchmarks/z_minmax.rb +12 -0
- data/bin/console +2 -0
- data/character_set.gemspec +17 -4
- data/ext/character_set/character_set.c +969 -415
- data/ext/character_set/unicode_casefold_table.h +44 -1
- data/ext/character_set/unicode_casefold_table.h.tmpl +11 -0
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/core_ext/regexp_ext.rb +1 -1
- data/lib/character_set/core_ext/string_ext.rb +3 -1
- data/lib/character_set/expression_converter.rb +41 -43
- data/lib/character_set/parser.rb +1 -1
- data/lib/character_set/predefined_sets/any.cps +1 -0
- data/lib/character_set/predefined_sets/ascii.cps +1 -0
- data/lib/character_set/predefined_sets/ascii_alnum.cps +3 -0
- data/lib/character_set/predefined_sets/ascii_letter.cps +2 -0
- data/lib/character_set/predefined_sets/assigned.cps +677 -0
- data/lib/character_set/predefined_sets/bmp.cps +2 -0
- data/lib/character_set/predefined_sets/crypt.cps +2 -0
- data/lib/character_set/predefined_sets/emoji.cps +152 -0
- data/lib/character_set/predefined_sets/newline.cps +3 -0
- data/lib/character_set/predefined_sets/surrogate.cps +1 -0
- data/lib/character_set/predefined_sets/unicode.cps +2 -0
- data/lib/character_set/predefined_sets/url_fragment.cps +8 -0
- data/lib/character_set/predefined_sets/url_host.cps +10 -0
- data/lib/character_set/predefined_sets/url_path.cps +7 -0
- data/lib/character_set/predefined_sets/url_query.cps +8 -0
- data/lib/character_set/predefined_sets/whitespace.cps +10 -0
- data/lib/character_set/predefined_sets.rb +25 -260
- data/lib/character_set/ruby_fallback/character_set_methods.rb +60 -9
- data/lib/character_set/ruby_fallback/set_methods.rb +25 -17
- data/lib/character_set/ruby_fallback.rb +5 -3
- data/lib/character_set/set_method_adapters.rb +4 -3
- data/lib/character_set/shared_methods.rb +69 -50
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +114 -17
- data/.travis.yml +0 -8
- data/lib/character_set/ruby_fallback/plane_methods.rb +0 -27
@@ -6,7 +6,7 @@ typedef struct casefold_mapping {
|
|
6
6
|
unsigned long to;
|
7
7
|
} casefold_mapping;
|
8
8
|
|
9
|
-
#define CASEFOLD_COUNT
|
9
|
+
#define CASEFOLD_COUNT 1426
|
10
10
|
|
11
11
|
static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
12
12
|
{0x0041,0x0061},
|
@@ -564,6 +564,41 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
564
564
|
{0x104D1,0x104F9},
|
565
565
|
{0x104D2,0x104FA},
|
566
566
|
{0x104D3,0x104FB},
|
567
|
+
{0x10570,0x10597},
|
568
|
+
{0x10571,0x10598},
|
569
|
+
{0x10572,0x10599},
|
570
|
+
{0x10573,0x1059A},
|
571
|
+
{0x10574,0x1059B},
|
572
|
+
{0x10575,0x1059C},
|
573
|
+
{0x10576,0x1059D},
|
574
|
+
{0x10577,0x1059E},
|
575
|
+
{0x10578,0x1059F},
|
576
|
+
{0x10579,0x105A0},
|
577
|
+
{0x1057A,0x105A1},
|
578
|
+
{0x1057C,0x105A3},
|
579
|
+
{0x1057D,0x105A4},
|
580
|
+
{0x1057E,0x105A5},
|
581
|
+
{0x1057F,0x105A6},
|
582
|
+
{0x10580,0x105A7},
|
583
|
+
{0x10581,0x105A8},
|
584
|
+
{0x10582,0x105A9},
|
585
|
+
{0x10583,0x105AA},
|
586
|
+
{0x10584,0x105AB},
|
587
|
+
{0x10585,0x105AC},
|
588
|
+
{0x10586,0x105AD},
|
589
|
+
{0x10587,0x105AE},
|
590
|
+
{0x10588,0x105AF},
|
591
|
+
{0x10589,0x105B0},
|
592
|
+
{0x1058A,0x105B1},
|
593
|
+
{0x1058C,0x105B3},
|
594
|
+
{0x1058D,0x105B4},
|
595
|
+
{0x1058E,0x105B5},
|
596
|
+
{0x1058F,0x105B6},
|
597
|
+
{0x10590,0x105B7},
|
598
|
+
{0x10591,0x105B8},
|
599
|
+
{0x10592,0x105B9},
|
600
|
+
{0x10594,0x105BB},
|
601
|
+
{0x10595,0x105BC},
|
567
602
|
{0x10A0,0x2D00},
|
568
603
|
{0x10A1,0x2D01},
|
569
604
|
{0x10A2,0x2D02},
|
@@ -1102,6 +1137,7 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
1102
1137
|
{0x2C2C,0x2C5C},
|
1103
1138
|
{0x2C2D,0x2C5D},
|
1104
1139
|
{0x2C2E,0x2C5E},
|
1140
|
+
{0x2C2F,0x2C5F},
|
1105
1141
|
{0x2C60,0x2C61},
|
1106
1142
|
{0x2C62,0x026B},
|
1107
1143
|
{0x2C63,0x1D7D},
|
@@ -1282,10 +1318,17 @@ static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {
|
|
1282
1318
|
{0xA7BA,0xA7BB},
|
1283
1319
|
{0xA7BC,0xA7BD},
|
1284
1320
|
{0xA7BE,0xA7BF},
|
1321
|
+
{0xA7C0,0xA7C1},
|
1285
1322
|
{0xA7C2,0xA7C3},
|
1286
1323
|
{0xA7C4,0xA794},
|
1287
1324
|
{0xA7C5,0x0282},
|
1288
1325
|
{0xA7C6,0x1D8E},
|
1326
|
+
{0xA7C7,0xA7C8},
|
1327
|
+
{0xA7C9,0xA7CA},
|
1328
|
+
{0xA7D0,0xA7D1},
|
1329
|
+
{0xA7D6,0xA7D7},
|
1330
|
+
{0xA7D8,0xA7D9},
|
1331
|
+
{0xA7F5,0xA7F6},
|
1289
1332
|
{0xAB70,0x13A0},
|
1290
1333
|
{0xAB71,0x13A1},
|
1291
1334
|
{0xAB72,0x13A2},
|
@@ -0,0 +1,11 @@
|
|
1
|
+
// THIS FILE IS GENERATED BY $ rake sync_casefold_data - DO NOT EDIT
|
2
|
+
// -*-C-*-
|
3
|
+
|
4
|
+
typedef struct casefold_mapping {
|
5
|
+
unsigned long from;
|
6
|
+
unsigned long to;
|
7
|
+
} casefold_mapping;
|
8
|
+
|
9
|
+
#define CASEFOLD_COUNT 0
|
10
|
+
|
11
|
+
static const casefold_mapping unicode_casefold_table[CASEFOLD_COUNT] = {};
|
@@ -6,11 +6,13 @@ class CharacterSet
|
|
6
6
|
end
|
7
7
|
|
8
8
|
{
|
9
|
+
count_by_character_set: :count_in,
|
9
10
|
covered_by_character_set?: :cover?,
|
10
11
|
delete_character_set: :delete_in,
|
11
12
|
delete_character_set!: :delete_in!,
|
12
13
|
keep_character_set: :keep_in,
|
13
14
|
keep_character_set!: :keep_in!,
|
15
|
+
scan_by_character_set: :scan,
|
14
16
|
uses_character_set?: :used_by?,
|
15
17
|
}.each do |string_method, set_method|
|
16
18
|
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
@@ -27,4 +29,4 @@ class CharacterSet
|
|
27
29
|
end
|
28
30
|
end
|
29
31
|
|
30
|
-
::String.
|
32
|
+
::String.instance_eval { include CharacterSet::CoreExt::StringExt }
|
@@ -4,62 +4,57 @@ class CharacterSet
|
|
4
4
|
|
5
5
|
Error = Class.new(ArgumentError)
|
6
6
|
|
7
|
-
def convert(expression)
|
8
|
-
CharacterSet.require_optional_dependency('regexp_parser')
|
7
|
+
def convert(expression, to = CharacterSet)
|
8
|
+
CharacterSet.require_optional_dependency('regexp_parser', __method__)
|
9
9
|
|
10
10
|
case expression
|
11
11
|
when Regexp::Expression::Root
|
12
12
|
if expression.count != 1
|
13
13
|
raise Error, 'Pass a Regexp with exactly one expression, e.g. /[a-z]/'
|
14
14
|
end
|
15
|
-
convert(expression[0])
|
15
|
+
convert(expression[0], to)
|
16
16
|
|
17
17
|
when Regexp::Expression::CharacterSet
|
18
|
-
content = expression.map { |subexp| convert(subexp) }.reduce(:+)
|
18
|
+
content = expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
19
|
+
content ||= to[]
|
19
20
|
expression.negative? ? content.inversion : content
|
20
21
|
|
21
22
|
when Regexp::Expression::CharacterSet::Intersection
|
22
|
-
expression.map { |subexp| convert(subexp) }.reduce(:&)
|
23
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:&)
|
23
24
|
|
24
25
|
when Regexp::Expression::CharacterSet::IntersectedSequence
|
25
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
26
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:+) || to[]
|
26
27
|
|
27
28
|
when Regexp::Expression::CharacterSet::Range
|
28
|
-
start, finish = expression.map { |subexp| convert(subexp) }
|
29
|
-
|
29
|
+
start, finish = expression.map { |subexp| convert(subexp, to) }
|
30
|
+
to.new((start.min)..(finish.max))
|
30
31
|
|
31
32
|
when Regexp::Expression::CharacterType::Any
|
32
|
-
|
33
|
-
|
34
|
-
when Regexp::Expression::CharacterType::
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
when Regexp::Expression::CharacterType::Word
|
53
|
-
CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122)
|
54
|
-
|
55
|
-
when Regexp::Expression::CharacterType::NonWord
|
56
|
-
CharacterSet.from_ranges(48..57, 65..90, 95..95, 97..122).inversion
|
33
|
+
to.unicode
|
34
|
+
|
35
|
+
when Regexp::Expression::CharacterType::Base
|
36
|
+
/(?<negative>non)?(?<base_name>.+)/ =~ expression.token
|
37
|
+
content =
|
38
|
+
if expression.unicode_classes?
|
39
|
+
# in u-mode, type shortcuts match the same as \p{<long type name>}
|
40
|
+
to.of_property(base_name)
|
41
|
+
else
|
42
|
+
# in normal mode, types match only ascii chars
|
43
|
+
case base_name.to_sym
|
44
|
+
when :digit then to.from_ranges(48..57)
|
45
|
+
when :hex then to.from_ranges(48..57, 65..70, 97..102)
|
46
|
+
when :space then to.from_ranges(9..13, 32..32)
|
47
|
+
when :word then to.from_ranges(48..57, 65..90, 95..95, 97..122)
|
48
|
+
else raise Error, "Unsupported CharacterType #{base_name}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
negative ? content.inversion : content
|
57
52
|
|
58
53
|
when Regexp::Expression::EscapeSequence::CodepointList
|
59
|
-
|
54
|
+
to.new(expression.codepoints)
|
60
55
|
|
61
56
|
when Regexp::Expression::EscapeSequence::Base
|
62
|
-
|
57
|
+
to[expression.codepoint]
|
63
58
|
|
64
59
|
when Regexp::Expression::Group::Capture,
|
65
60
|
Regexp::Expression::Group::Passive,
|
@@ -67,19 +62,19 @@ class CharacterSet
|
|
67
62
|
Regexp::Expression::Group::Atomic,
|
68
63
|
Regexp::Expression::Group::Options
|
69
64
|
case expression.count
|
70
|
-
when 0 then
|
71
|
-
when 1 then convert(expression.first)
|
65
|
+
when 0 then to[]
|
66
|
+
when 1 then convert(expression.first, to)
|
72
67
|
else
|
73
68
|
raise Error, 'Groups must contain exactly one expression, e.g. ([a-z])'
|
74
69
|
end
|
75
70
|
|
76
|
-
when Regexp::Expression::Alternation
|
77
|
-
expression.map { |subexp| convert(subexp) }.reduce(:+)
|
71
|
+
when Regexp::Expression::Alternation # rubocop:disable Lint/DuplicateBranch
|
72
|
+
expression.map { |subexp| convert(subexp, to) }.reduce(:+)
|
78
73
|
|
79
74
|
when Regexp::Expression::Alternative
|
80
75
|
case expression.count
|
81
|
-
when 0 then
|
82
|
-
when 1 then convert(expression.first)
|
76
|
+
when 0 then to[]
|
77
|
+
when 1 then convert(expression.first, to)
|
83
78
|
else
|
84
79
|
raise Error, 'Alternatives must contain exactly one expression'
|
85
80
|
end
|
@@ -88,11 +83,14 @@ class CharacterSet
|
|
88
83
|
if expression.set_level == 0 && expression.text.size != 1
|
89
84
|
raise Error, 'Literal runs outside of sets are codepoint *sequences*'
|
90
85
|
end
|
91
|
-
|
86
|
+
to[expression.text.ord]
|
92
87
|
|
93
88
|
when Regexp::Expression::UnicodeProperty::Base,
|
94
89
|
Regexp::Expression::PosixClass
|
95
|
-
content =
|
90
|
+
content = to.of_property(expression.token)
|
91
|
+
if expression.type == :posixclass && expression.ascii_classes?
|
92
|
+
content = content.ascii_part
|
93
|
+
end
|
96
94
|
expression.negative? ? content.inversion : content
|
97
95
|
|
98
96
|
when Regexp::Expression::Base
|
data/lib/character_set/parser.rb
CHANGED
@@ -5,7 +5,7 @@ class CharacterSet
|
|
5
5
|
def codepoints_from_enumerable(object)
|
6
6
|
raise ArgumentError, 'pass an Enumerable' unless object.respond_to?(:each)
|
7
7
|
# Use #each to check first element (only this works for all Enumerables)
|
8
|
-
object.each do |e|
|
8
|
+
object.each do |e| # rubocop:disable Lint/UnreachableLoop
|
9
9
|
return object if e.is_a?(Integer) && e >= 0 && e < 0x110000
|
10
10
|
return object.map(&:ord) if e.is_a?(String) && e.length == 1
|
11
11
|
raise ArgumentError, "#{e.inspect} is not valid as a codepoint"
|
@@ -0,0 +1 @@
|
|
1
|
+
0,10FFFF
|
@@ -0,0 +1 @@
|
|
1
|
+
0,7F
|