character_set 1.3.0-java → 1.4.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -1
- data/README.md +13 -3
- data/Rakefile +1 -1
- data/lib/character_set/character.rb +1 -1
- data/lib/character_set/shared_methods.rb +4 -0
- data/lib/character_set/version.rb +1 -1
- data/lib/character_set/writer.rb +98 -27
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72bf4e9262b86b5d729e7e526e311a8617e70dff50f0f2c70a2081363c2bebb7
|
4
|
+
data.tar.gz: 26f6ff53583ebf1dae307076682df54e3c7bc365022abfc13b988131a0aecff6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '0392f8f135133ed9edd720633694a74df1ee6fcd0fcf806c1a4ecd26314976f129941efee839f9b1a9d2be95bc4d5f0964c36de2b15f309b10953628dc4a756f'
|
7
|
+
data.tar.gz: c52d3928e5c9dba3a130a0bd2e1c5b191ebd6211cc88cb9f5960fc389dea0447897633a4c5893f3ce457c76d9c1a518e11d8f4e40116cab7f728ccae972a3fd0
|
data/CHANGELOG.md
CHANGED
@@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
|
|
4
4
|
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
5
5
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
|
-
##
|
7
|
+
## [1.4.0] - 2019-06-07
|
8
|
+
|
9
|
+
### Added
|
10
|
+
- `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
|
11
|
+
- allows for much shorter astral plane representations e.g. in JavaScript
|
12
|
+
- thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
|
13
|
+
- improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
|
14
|
+
|
15
|
+
### Fixed
|
16
|
+
- '/' is now escaped by default when stringifying so as to work with //-regexp syntax
|
8
17
|
|
9
18
|
## [1.3.0] - 2019-04-26
|
10
19
|
|
data/README.md
CHANGED
@@ -40,7 +40,7 @@ CharacterSet.parse('[a-c]')
|
|
40
40
|
CharacterSet.parse('\U00000061-\U00000063')
|
41
41
|
```
|
42
42
|
|
43
|
-
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
|
43
|
+
If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
|
44
44
|
|
45
45
|
```ruby
|
46
46
|
CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
|
@@ -167,8 +167,18 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
|
|
167
167
|
# disable abbreviation (grouping of codepoints in ranges)
|
168
168
|
set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
|
169
169
|
|
170
|
-
#
|
171
|
-
|
170
|
+
# astral members require some trickery if we want to target environments
|
171
|
+
# that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
|
172
|
+
set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
|
173
|
+
|
174
|
+
# Use #to_s_with_surrogate_ranges e.g. for JavaScript:
|
175
|
+
set.to_s_with_surrogate_ranges
|
176
|
+
# => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
|
177
|
+
|
178
|
+
# Or use #to_s_with_surrogate_alternation if such surrogate set pairs
|
179
|
+
# don't work in your target environment:
|
180
|
+
set.to_s_with_surrogate_alternation
|
181
|
+
# => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
|
172
182
|
```
|
173
183
|
|
174
184
|
### Unicode plane methods
|
data/Rakefile
CHANGED
@@ -126,7 +126,7 @@ task :sync_predefined_sets do
|
|
126
126
|
%w[assigned emoji whitespace].each do |prop|
|
127
127
|
require 'regexp_property_values'
|
128
128
|
ranges = RegexpPropertyValues[prop].matched_ranges
|
129
|
-
str = ranges.map { |r| r.
|
129
|
+
str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
|
130
130
|
File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
|
131
131
|
end
|
132
132
|
end
|
@@ -86,6 +86,10 @@ class CharacterSet
|
|
86
86
|
Writer.write(ranges, opts, &block)
|
87
87
|
end
|
88
88
|
|
89
|
+
def to_s_with_surrogate_ranges
|
90
|
+
Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
|
91
|
+
end
|
92
|
+
|
89
93
|
def to_s_with_surrogate_alternation
|
90
94
|
Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
|
91
95
|
end
|
data/lib/character_set/writer.rb
CHANGED
@@ -1,37 +1,108 @@
|
|
1
1
|
class CharacterSet
|
2
2
|
module Writer
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
class << self
|
4
|
+
def write(codepoint_ranges, opts = {}, &block)
|
5
|
+
content = codepoint_ranges.map do |range|
|
6
|
+
if range.size > 2 && opts[:abbreviate] != false
|
7
|
+
bounds = [range.min, range.max]
|
8
|
+
bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
|
9
|
+
else
|
10
|
+
range.map { |cp| write_codepoint(cp, opts, &block) }.join
|
11
|
+
end
|
12
|
+
end.join
|
13
|
+
opts[:in_brackets] ? "[#{content}]" : content
|
14
|
+
end
|
15
|
+
|
16
|
+
def write_codepoint(codepoint, opts = {}, &block)
|
17
|
+
Character.new(codepoint).escape(opts, &block)
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_surrogate_ranges(bmp_ranges, astral_ranges)
|
21
|
+
astral_branches = surrogate_range_expressions(astral_ranges)
|
22
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
23
|
+
end
|
24
|
+
|
25
|
+
def write_surrogate_alternation(bmp_ranges, astral_ranges)
|
26
|
+
astral_branches = surrogate_pairs(astral_ranges)
|
27
|
+
bmp_set_with_alternatives(bmp_ranges, astral_branches)
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def surrogate_range_expressions(astral_ranges)
|
33
|
+
compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
|
34
|
+
[hi_ranges, lo_ranges].map do |ranges|
|
35
|
+
use_brackets = ranges.size > 1 || ranges.first.size > 1
|
36
|
+
write(ranges, format: :js, in_brackets: use_brackets)
|
37
|
+
end.join
|
11
38
|
end
|
12
|
-
end
|
13
|
-
|
14
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
def compressed_surrogate_range_pairs(astral_ranges)
|
42
|
+
halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
|
15
43
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
44
|
+
# compress high surrogate codepoint ranges with common low range half
|
45
|
+
with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
|
46
|
+
hi_ranges = pairs.map(&:first)
|
47
|
+
compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
|
48
|
+
prev = arr.last
|
49
|
+
if prev.nil? || prev.max + 1 < range.min # first or gap
|
50
|
+
arr << range
|
51
|
+
else # continuous codepoints, expand previous range
|
52
|
+
arr[-1] = (prev.min)..(range.max)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
[compressed_hi_ranges, lo_range]
|
56
|
+
end
|
57
|
+
|
58
|
+
# compress low surrogate codepoint ranges with common high ranges
|
59
|
+
with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
|
60
|
+
(hash[hi_ranges] ||= []) << lo_range
|
61
|
+
end
|
23
62
|
end
|
24
|
-
end
|
25
63
|
|
26
|
-
|
27
|
-
|
28
|
-
|
64
|
+
def surrogate_half_ranges(astral_range)
|
65
|
+
hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
|
66
|
+
hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
|
67
|
+
hi_count = 1 + hi_max - hi_min
|
68
|
+
return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
|
69
|
+
|
70
|
+
ranges = []
|
71
|
+
|
72
|
+
# first high surrogate might be partially covered (if lo_min > 0xDC00)
|
73
|
+
ranges << [hi_min..hi_min, lo_min..0xDFFF]
|
74
|
+
|
75
|
+
# any high surrogates in between are fully covered
|
76
|
+
ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
|
29
77
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
78
|
+
# last high surrogate might be partially covered (if lo_max < 0xDFFF)
|
79
|
+
ranges << [hi_max..hi_max, 0xDC00..lo_max]
|
80
|
+
|
81
|
+
ranges
|
82
|
+
end
|
83
|
+
|
84
|
+
def surrogate_pair_codepoints(astral_codepoint)
|
85
|
+
base = astral_codepoint - 0x10000
|
86
|
+
high = base / 1024 + 0xD800
|
87
|
+
low = base % 1024 + 0xDC00
|
88
|
+
[high, low]
|
89
|
+
end
|
90
|
+
|
91
|
+
def bmp_set_with_alternatives(bmp_ranges, alternatives)
|
92
|
+
bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
|
93
|
+
return bmp_set if alternatives.empty? && bmp_ranges.any?
|
94
|
+
|
95
|
+
"(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
|
96
|
+
end
|
97
|
+
|
98
|
+
def surrogate_pairs(astral_ranges)
|
99
|
+
astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
|
100
|
+
end
|
101
|
+
|
102
|
+
def surrogate_pair(astral_codepoint)
|
103
|
+
surrogate_pair_codepoints(astral_codepoint)
|
104
|
+
.map { |half| write_codepoint(half, format: :js) }.join
|
105
|
+
end
|
35
106
|
end
|
36
107
|
end
|
37
108
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: character_set
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Janosch Müller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: benchmark-ips
|