character_set 1.3.0-java → 1.4.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -1
- data/README.md +13 -3
- data/Rakefile +1 -1
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/shared_methods.rb +4 -0
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72bf4e9262b86b5d729e7e526e311a8617e70dff50f0f2c70a2081363c2bebb7
|
4
|
+
data.tar.gz: 26f6ff53583ebf1dae307076682df54e3c7bc365022abfc13b988131a0aecff6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '0392f8f135133ed9edd720633694a74df1ee6fcd0fcf806c1a4ecd26314976f129941efee839f9b1a9d2be95bc4d5f0964c36de2b15f309b10953628dc4a756f'
|
7
|
+
data.tar.gz: c52d3928e5c9dba3a130a0bd2e1c5b191ebd6211cc88cb9f5960fc389dea0447897633a4c5893f3ce457c76d9c1a518e11d8f4e40116cab7f728ccae972a3fd0
|
data/CHANGELOG.md
CHANGED
@@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
-
##
|
7
|
+
## [1.4.0] - 2019-06-07
|
8
|
+
|
9
|
+
### Added
|
10
|
+
- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
|
11
|
+
- allows for much shorter astral plane representations e.g. in JavaScript
|
12
|
+
- thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
|
13
|
+
- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
|
14
|
+
|
15
|
+
### Fixed
|
16
|
+
- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
|
8
17
|
|
9
18
|
## [1.3.0] - 2019-04-26
|
10
19
|
|
data/README.md
CHANGED
@@ -40,7 +40,7 @@ CharacterSet.parse('[a-c]')
|
|
40
40
|
CharacterSet.parse('\U00000061-\U00000063')
|
41
41
|
```
|
42
42
|
|
43
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
|
43
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
|
44
44
|
|
45
45
|
```ruby
|
46
46
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
@@ -167,8 +167,18 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
|
|
167
167
|
# disable abbreviation (grouping of codepoints in ranges)
|
168
168
|
set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
|
169
169
|
|
170
|
-
#
|
171
|
-
|
170
|
+
# astral members require some trickery if we want to target environments
|
171
|
+
# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
|
172
|
+
set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
|
173
|
+
|
174
|
+
# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
|
175
|
+
set.to_s_with_surrogate_ranges
|
176
|
+
# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
|
177
|
+
|
178
|
+
# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
|
179
|
+
# don't work in your target environment:
|
180
|
+
set.to_s_with_surrogate_alternation
|
181
|
+
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
172
182
|
```
|
173
183
|
|
174
184
|
### Unicode plane methods
|
data/Rakefile
CHANGED
@@ -126,7 +126,7 @@ task :sync_predefined_sets do
|
|
126
126
|
%w[assigned emoji whitespace].each do |prop|
|
127
127
|
require 'regexp_property_values'
|
128
128
|
ranges = RegexpPropertyValues[prop].matched_ranges
|
129
|
-
str = ranges.map { |r| r.
|
129
|
+
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
130
130
|
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
131
131
|
end
|
132
132
|
end
|
data/lib/character_set/shared_methods.rb
CHANGED
@@ -86,6 +86,10 @@ class CharacterSet
|
|
86
86
|
Writer.write(ranges, opts, &block)
|
87
87
|
end
|
88
88
|
|
89
|
+
def to_s_with_surrogate_ranges
|
90
|
+
Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
|
91
|
+
end
|
92
|
+
|
89
93
|
def to_s_with_surrogate_alternation
|
90
94
|
Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
|
91
95
|
end
|
data/lib/character_set/writer.rb
CHANGED
@@ -1,37 +1,108 @@
|
|
1
1
|
class CharacterSet
|
2
2
|
module Writer
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
class << self
|
4
|
+
def write(codepoint_ranges, opts = {}, &block)
|
5
|
+
content = codepoint_ranges.map do |range|
|
6
|
+
if range.size > 2 && opts[:abbreviate] != false
|
7
|
+
bounds = [range.min, range.max]
|
8
|
+
bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
|
9
|
+
else
|
10
|
+
range.map { |cp| write_codepoint(cp, opts, &block) }.join
|
11
|
+
end
|
12
|
+
end.join
|
13
|
+
opts[:in_brackets] ? "[#{content}]" : content
|
14
|
+
end
|
15
|
+
|
16
|
+
def write_codepoint(codepoint, opts = {}, &block)
|
17
|
+
Character.new(codepoint).escape(opts, &block)
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_surrogate_ranges(bmp_ranges, astral_ranges)
|
21
|
+
astral_branches = surrogate_range_expressions(astral_ranges)
|
22
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
23
|
+
end
|
24
|
+
|
25
|
+
def write_surrogate_alternation(bmp_ranges, astral_ranges)
|
26
|
+
astral_branches = surrogate_pairs(astral_ranges)
|
27
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def surrogate_range_expressions(astral_ranges)
|
33
|
+
compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
|
34
|
+
[hi_ranges, lo_ranges].map do |ranges|
|
35
|
+
use_brackets = ranges.size > 1 || ranges.first.size > 1
|
36
|
+
write(ranges, format: :js, in_brackets: use_brackets)
|
37
|
+
end.join
|
11
38
|
end
|
12
|
-
end
|
13
|
-
|
14
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
def compressed_surrogate_range_pairs(astral_ranges)
|
42
|
+
halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
|
15
43
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
44
|
+
# compress high surrogate codepoint ranges with common low range half
|
45
|
+
with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
|
46
|
+
hi_ranges = pairs.map(&:first)
|
47
|
+
compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
|
48
|
+
prev = arr.last
|
49
|
+
if prev.nil? || prev.max + 1 < range.min # first or gap
|
50
|
+
arr << range
|
51
|
+
else # continuous codepoints, expand previous range
|
52
|
+
arr[-1] = (prev.min)..(range.max)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
[compressed_hi_ranges, lo_range]
|
56
|
+
end
|
57
|
+
|
58
|
+
# compress low surrogate codepoint ranges with common high ranges
|
59
|
+
with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
|
60
|
+
(hash[hi_ranges] ||= []) << lo_range
|
61
|
+
end
|
23
62
|
end
|
24
|
-
end
|
25
63
|
|
26
|
-
|
27
|
-
|
28
|
-
|
64
|
+
def surrogate_half_ranges(astral_range)
|
65
|
+
hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
|
66
|
+
hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
|
67
|
+
hi_count = 1 + hi_max - hi_min
|
68
|
+
return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
|
69
|
+
|
70
|
+
ranges = []
|
71
|
+
|
72
|
+
# first high surrogate might be partially covered (if lo_min > 0xDC00)
|
73
|
+
ranges << [hi_min..hi_min, lo_min..0xDFFF]
|
74
|
+
|
75
|
+
# any high surrogates in between are fully covered
|
76
|
+
ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
|
29
77
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
78
|
+
# last high surrogate might be partially covered (if lo_max < 0xDFFF)
|
79
|
+
ranges << [hi_max..hi_max, 0xDC00..lo_max]
|
80
|
+
|
81
|
+
ranges
|
82
|
+
end
|
83
|
+
|
84
|
+
def surrogate_pair_codepoints(astral_codepoint)
|
85
|
+
base = astral_codepoint - 0x10000
|
86
|
+
high = base / 1024 + 0xD800
|
87
|
+
low = base % 1024 + 0xDC00
|
88
|
+
[high, low]
|
89
|
+
end
|
90
|
+
|
91
|
+
def bmp_set_with_alternatives(bmp_ranges, alternatives)
|
92
|
+
bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
|
93
|
+
return bmp_set if alternatives.empty? && bmp_ranges.any?
|
94
|
+
|
95
|
+
"(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
|
96
|
+
end
|
97
|
+
|
98
|
+
def surrogate_pairs(astral_ranges)
|
99
|
+
astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
|
100
|
+
end
|
101
|
+
|
102
|
+
def surrogate_pair(astral_codepoint)
|
103
|
+
surrogate_pair_codepoints(astral_codepoint)
|
104
|
+
.map { |half| write_codepoint(half, format: :js) }.join
|
105
|
+
end
|
35
106
|
end
|
36
107
|
end
|
37
108
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: character_set
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Janosch Müller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-26 00:00:00.000000000 Z
|
11
|
+
date: 2019-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|