character_set 1.3.0-java → 1.4.0-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cd336b705f4a2c9dc5af1fc0d841bbfb6324a5c8eb955bb72747c9c4c8a8431d
4
- data.tar.gz: 965a1f84fe364d1e0d44039f2947f5c91dae2bcd485201fa262d1fbd41ba7dea
3
+ metadata.gz: 72bf4e9262b86b5d729e7e526e311a8617e70dff50f0f2c70a2081363c2bebb7
4
+ data.tar.gz: 26f6ff53583ebf1dae307076682df54e3c7bc365022abfc13b988131a0aecff6
5
5
  SHA512:
6
- metadata.gz: dbef79700f9cc6d00387d373fdd0d307c1b2ebbcdd78d3efd25cbdc54e067b0576c53ec2b7b46eacde8def6a0feb2acd58396af62b462666dde213664c400d73
7
- data.tar.gz: f66e839c472188f52511a4ff3a4fda3becbe0af177c8cc4c4d7aeeaf65bb55b256f10a03a4aec453c6800ccd0bd697b4adb514f6621177f08e750d448150333b
6
+ metadata.gz: '0392f8f135133ed9edd720633694a74df1ee6fcd0fcf806c1a4ecd26314976f129941efee839f9b1a9d2be95bc4d5f0964c36de2b15f309b10953628dc4a756f'
7
+ data.tar.gz: c52d3928e5c9dba3a130a0bd2e1c5b191ebd6211cc88cb9f5960fc389dea0447897633a4c5893f3ce457c76d9c1a518e11d8f4e40116cab7f728ccae972a3fd0
data/CHANGELOG.md CHANGED
@@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
- ## UNRELEASED
7
+ ## [1.4.0] - 2019-06-07
8
+
9
+ ### Added
10
+ - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
11
+ - allows for much shorter astral plane representations e.g. in JavaScript
12
+ - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
13
+ - improved performance for `#to_s` / `Writer` by avoiding `Range#minmax`, which iterates over the whole range on Ruby < 2.7
14
+
15
+ ### Fixed
16
+ - `/` is now escaped by default when stringifying, so that the output works inside `/.../` regexp literals
8
17
 
9
18
  ## [1.3.0] - 2019-04-26
10
19
 
data/README.md CHANGED
@@ -40,7 +40,7 @@ CharacterSet.parse('[a-c]')
40
40
  CharacterSet.parse('\U00000061-\U00000063')
41
41
  ```
42
42
 
43
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
43
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
44
44
 
45
45
  ```ruby
46
46
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -167,8 +167,18 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
167
167
  # disable abbreviation (grouping of codepoints in ranges)
168
168
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
169
169
 
170
- # for full js regex compatibility in case of astral members:
171
- set.to_s_with_surrogate_alternation # => '(?:[a-c\u0258]|\ud83e\udd29)'
170
+ # astral members require some trickery if we want to target environments
171
+ # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
172
+ set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
173
+
174
+ # Use #to_s_with_surrogate_ranges e.g. for JavaScript:
175
+ set.to_s_with_surrogate_ranges
176
+ # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
177
+
178
+ # Or use #to_s_with_surrogate_alternation if such surrogate set pairs
179
+ # don't work in your target environment:
180
+ set.to_s_with_surrogate_alternation
181
+ # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
172
182
  ```
173
183
 
174
184
  ### Unicode plane methods
data/Rakefile CHANGED
@@ -126,7 +126,7 @@ task :sync_predefined_sets do
126
126
  %w[assigned emoji whitespace].each do |prop|
127
127
  require 'regexp_property_values'
128
128
  ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| r.minmax.map { |n| n.to_s(16) }.join(',').upcase + "\n" }.join
129
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
130
  File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
131
  end
132
132
  end
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -86,6 +86,10 @@ class CharacterSet
86
86
  Writer.write(ranges, opts, &block)
87
87
  end
88
88
 
89
+ def to_s_with_surrogate_ranges
90
+ Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
91
+ end
92
+
89
93
  def to_s_with_surrogate_alternation
90
94
  Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
91
95
  end
@@ -1,3 +1,3 @@
1
1
  class CharacterSet
2
- VERSION = '1.3.0'
2
+ VERSION = '1.4.0'
3
3
  end
@@ -1,37 +1,108 @@
1
1
  class CharacterSet
2
2
  module Writer
3
- module_function
4
-
5
- def write(codepoint_ranges, opts = {}, &block)
6
- content = codepoint_ranges.map do |range|
7
- if range.size > 2 && opts[:abbreviate] != false
8
- range.minmax.map { |cp| Character.new(cp).escape(opts, &block) }.join('-')
9
- else
10
- range.map { |cp| Character.new(cp).escape(opts, &block) }.join
3
+ class << self
4
+ def write(codepoint_ranges, opts = {}, &block)
5
+ content = codepoint_ranges.map do |range|
6
+ if range.size > 2 && opts[:abbreviate] != false
7
+ bounds = [range.min, range.max]
8
+ bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
9
+ else
10
+ range.map { |cp| write_codepoint(cp, opts, &block) }.join
11
+ end
12
+ end.join
13
+ opts[:in_brackets] ? "[#{content}]" : content
14
+ end
15
+
16
+ def write_codepoint(codepoint, opts = {}, &block)
17
+ Character.new(codepoint).escape(opts, &block)
18
+ end
19
+
20
+ def write_surrogate_ranges(bmp_ranges, astral_ranges)
21
+ astral_branches = surrogate_range_expressions(astral_ranges)
22
+ bmp_set_with_alternatives(bmp_ranges, astral_branches)
23
+ end
24
+
25
+ def write_surrogate_alternation(bmp_ranges, astral_ranges)
26
+ astral_branches = surrogate_pairs(astral_ranges)
27
+ bmp_set_with_alternatives(bmp_ranges, astral_branches)
28
+ end
29
+
30
+ private
31
+
32
+ def surrogate_range_expressions(astral_ranges)
33
+ compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
34
+ [hi_ranges, lo_ranges].map do |ranges|
35
+ use_brackets = ranges.size > 1 || ranges.first.size > 1
36
+ write(ranges, format: :js, in_brackets: use_brackets)
37
+ end.join
11
38
  end
12
- end.join
13
- opts[:in_brackets] ? "[#{content}]" : content
14
- end
39
+ end
40
+
41
+ def compressed_surrogate_range_pairs(astral_ranges)
42
+ halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
15
43
 
16
- def write_surrogate_alternation(bmp_ranges, astral_ranges)
17
- bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
18
- if astral_ranges.empty?
19
- bmp_set
20
- else
21
- surrogate_pairs = surrogate_pairs(astral_ranges)
22
- "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + surrogate_pairs) * '|'})"
44
+ # compress high surrogate codepoint ranges with common low range half
45
+ with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
46
+ hi_ranges = pairs.map(&:first)
47
+ compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
48
+ prev = arr.last
49
+ if prev.nil? || prev.max + 1 < range.min # first or gap
50
+ arr << range
51
+ else # continuous codepoints, expand previous range
52
+ arr[-1] = (prev.min)..(range.max)
53
+ end
54
+ end
55
+ [compressed_hi_ranges, lo_range]
56
+ end
57
+
58
+ # compress low surrogate codepoint ranges with common high ranges
59
+ with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
60
+ (hash[hi_ranges] ||= []) << lo_range
61
+ end
23
62
  end
24
- end
25
63
 
26
- def surrogate_pairs(astral_ranges)
27
- astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
28
- end
64
+ def surrogate_half_ranges(astral_range)
65
+ hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
66
+ hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
67
+ hi_count = 1 + hi_max - hi_min
68
+ return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
69
+
70
+ ranges = []
71
+
72
+ # first high surrogate might be partially covered (if lo_min > 0xDC00)
73
+ ranges << [hi_min..hi_min, lo_min..0xDFFF]
74
+
75
+ # any high surrogates in between are fully covered
76
+ ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
29
77
 
30
- def surrogate_pair(astral_codepoint)
31
- base = astral_codepoint - 0x10000
32
- high = ((base / 1024).floor + 0xD800).to_s(16)
33
- low = (base % 1024 + 0xDC00).to_s(16)
34
- "\\u#{high}\\u#{low}"
78
+ # last high surrogate might be partially covered (if lo_max < 0xDFFF)
79
+ ranges << [hi_max..hi_max, 0xDC00..lo_max]
80
+
81
+ ranges
82
+ end
83
+
84
+ def surrogate_pair_codepoints(astral_codepoint)
85
+ base = astral_codepoint - 0x10000
86
+ high = base / 1024 + 0xD800
87
+ low = base % 1024 + 0xDC00
88
+ [high, low]
89
+ end
90
+
91
+ def bmp_set_with_alternatives(bmp_ranges, alternatives)
92
+ bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
93
+ return bmp_set if alternatives.empty? && bmp_ranges.any?
94
+
95
+ "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
96
+ end
97
+
98
+ def surrogate_pairs(astral_ranges)
99
+ astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
100
+ end
101
+
102
+ def surrogate_pair(astral_codepoint)
103
+ surrogate_pair_codepoints(astral_codepoint)
104
+ .map { |half| write_codepoint(half, format: :js) }.join
105
+ end
35
106
  end
36
107
  end
37
108
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: character_set
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: java
6
6
  authors:
7
7
  - Janosch Müller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-26 00:00:00.000000000 Z
11
+ date: 2019-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: benchmark-ips