character_set 1.3.0-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cd336b705f4a2c9dc5af1fc0d841bbfb6324a5c8eb955bb72747c9c4c8a8431d
4
- data.tar.gz: 965a1f84fe364d1e0d44039f2947f5c91dae2bcd485201fa262d1fbd41ba7dea
3
+ metadata.gz: 72bf4e9262b86b5d729e7e526e311a8617e70dff50f0f2c70a2081363c2bebb7
4
+ data.tar.gz: 26f6ff53583ebf1dae307076682df54e3c7bc365022abfc13b988131a0aecff6
5
5
  SHA512:
6
- metadata.gz: dbef79700f9cc6d00387d373fdd0d307c1b2ebbcdd78d3efd25cbdc54e067b0576c53ec2b7b46eacde8def6a0feb2acd58396af62b462666dde213664c400d73
7
- data.tar.gz: f66e839c472188f52511a4ff3a4fda3becbe0af177c8cc4c4d7aeeaf65bb55b256f10a03a4aec453c6800ccd0bd697b4adb514f6621177f08e750d448150333b
6
+ metadata.gz: '0392f8f135133ed9edd720633694a74df1ee6fcd0fcf806c1a4ecd26314976f129941efee839f9b1a9d2be95bc4d5f0964c36de2b15f309b10953628dc4a756f'
7
+ data.tar.gz: c52d3928e5c9dba3a130a0bd2e1c5b191ebd6211cc88cb9f5960fc389dea0447897633a4c5893f3ce457c76d9c1a518e11d8f4e40116cab7f728ccae972a3fd0
data/CHANGELOG.md CHANGED
@@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6
6
 
7
- ## UNRELEASED
7
+ ## [1.4.0] - 2019-06-07
8
+
9
+ ### Added
10
+ - `#to_s_with_surrogate_ranges` / `Writer::write_surrogate_ranges`
11
+ - allows for much shorter astral plane representations e.g. in JavaScript
12
+ - thanks to https://github.com/singpolyma for the suggestion and groundwork (#1)
13
+ - improved performance for `#to_s` / `Writer` by avoiding bugged `Range#minmax`
14
+
15
+ ### Fixed
16
+ - '/' is now escaped by default when stringifying so as to work with //-regexp syntax
8
17
 
9
18
  ## [1.3.0] - 2019-04-26
10
19
 
data/README.md CHANGED
@@ -40,7 +40,7 @@ CharacterSet.parse('[a-c]')
40
40
  CharacterSet.parse('\U00000061-\U00000063')
41
41
  ```
42
42
 
43
- If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting.
43
+ If the gems [`regexp_parser`](https://github.com/ammar/regexp_parser) and [`regexp_property_values`](https://github.com/jaynetics/regexp_property_values) are installed, `::of_regexp` and `::of_property` can also be used. `::of_regexp` can handle intersections, negations, and set nesting. Regexp's `i`-flag is ignored; call `#case_insensitive` on the result if needed.
44
44
 
45
45
  ```ruby
46
46
  CharacterSet.of_property('Thai') # => #<CharacterSet (size: 86)>
@@ -167,8 +167,18 @@ set.to_s(escape_all: true) { |c| "<#{c.hex}>" } # => "<61>-<63><258><1F929>"
167
167
  # disable abbreviation (grouping of codepoints in ranges)
168
168
  set.to_s(abbreviate: false) # => "abc\u0258\u{1F929}"
169
169
 
170
- # for full js regex compatibility in case of astral members:
171
- set.to_s_with_surrogate_alternation # => '(?:[a-c\u0258]|\ud83e\udd29)'
170
+ # astral members require some trickery if we want to target environments
171
+ # that are based on UTF-16 or "UCS-2 with surrogates", such as JavaScript.
172
+ set = CharacterSet['a', 'b', '🤩', '🤪', '🤫']
173
+
174
+ # Use #to_s_with_surrogate_ranges e.g. for JavaScript:
175
+ set.to_s_with_surrogate_ranges
176
+ # => '(?:[ab]|\uD83E[\uDD29-\uDD2B])'
177
+
178
+ # Or use #to_s_with_surrogate_alternation if such surrogate set pairs
179
+ # don't work in your target environment:
180
+ set.to_s_with_surrogate_alternation
181
+ # => '(?:[ab]|\uD83E\uDD29|\uD83E\uDD2A|\uD83E\uDD2B)'
172
182
  ```
173
183
 
174
184
  ### Unicode plane methods
data/Rakefile CHANGED
@@ -126,7 +126,7 @@ task :sync_predefined_sets do
126
126
  %w[assigned emoji whitespace].each do |prop|
127
127
  require 'regexp_property_values'
128
128
  ranges = RegexpPropertyValues[prop].matched_ranges
129
- str = ranges.map { |r| r.minmax.map { |n| n.to_s(16) }.join(',').upcase + "\n" }.join
129
+ str = ranges.map { |r| "#{r.min.to_s(16)},#{r.max.to_s(16)}\n" }.join.upcase
130
130
  File.write("./lib/character_set/predefined_sets/#{prop}.cps", str, mode: 'w')
131
131
  end
132
132
  end
@@ -1,7 +1,7 @@
1
1
  class CharacterSet
2
2
  class Character
3
3
  ENCODING = 'utf-8'.freeze
4
- SAFELY_PRINTABLE = (0x21..0x7E).to_a - ['-', '[', '\\', ']', '^'].map(&:ord)
4
+ SAFELY_PRINTABLE = (0x21..0x7E).to_a - %w(- / [ \\ ] ^).map(&:ord)
5
5
 
6
6
  attr_accessor :codepoint
7
7
 
@@ -86,6 +86,10 @@ class CharacterSet
86
86
  Writer.write(ranges, opts, &block)
87
87
  end
88
88
 
89
+ def to_s_with_surrogate_ranges
90
+ Writer.write_surrogate_ranges(bmp_part.ranges, astral_part.ranges)
91
+ end
92
+
89
93
  def to_s_with_surrogate_alternation
90
94
  Writer.write_surrogate_alternation(bmp_part.ranges, astral_part.ranges)
91
95
  end
@@ -1,3 +1,3 @@
1
1
  class CharacterSet
2
- VERSION = '1.3.0'
2
+ VERSION = '1.4.0'
3
3
  end
@@ -1,37 +1,108 @@
1
1
  class CharacterSet
2
2
  module Writer
3
- module_function
4
-
5
- def write(codepoint_ranges, opts = {}, &block)
6
- content = codepoint_ranges.map do |range|
7
- if range.size > 2 && opts[:abbreviate] != false
8
- range.minmax.map { |cp| Character.new(cp).escape(opts, &block) }.join('-')
9
- else
10
- range.map { |cp| Character.new(cp).escape(opts, &block) }.join
3
+ class << self
4
+ def write(codepoint_ranges, opts = {}, &block)
5
+ content = codepoint_ranges.map do |range|
6
+ if range.size > 2 && opts[:abbreviate] != false
7
+ bounds = [range.min, range.max]
8
+ bounds.map { |cp| write_codepoint(cp, opts, &block) }.join('-')
9
+ else
10
+ range.map { |cp| write_codepoint(cp, opts, &block) }.join
11
+ end
12
+ end.join
13
+ opts[:in_brackets] ? "[#{content}]" : content
14
+ end
15
+
16
+ def write_codepoint(codepoint, opts = {}, &block)
17
+ Character.new(codepoint).escape(opts, &block)
18
+ end
19
+
20
+ def write_surrogate_ranges(bmp_ranges, astral_ranges)
21
+ astral_branches = surrogate_range_expressions(astral_ranges)
22
+ bmp_set_with_alternatives(bmp_ranges, astral_branches)
23
+ end
24
+
25
+ def write_surrogate_alternation(bmp_ranges, astral_ranges)
26
+ astral_branches = surrogate_pairs(astral_ranges)
27
+ bmp_set_with_alternatives(bmp_ranges, astral_branches)
28
+ end
29
+
30
+ private
31
+
32
+ def surrogate_range_expressions(astral_ranges)
33
+ compressed_surrogate_range_pairs(astral_ranges).map do |hi_ranges, lo_ranges|
34
+ [hi_ranges, lo_ranges].map do |ranges|
35
+ use_brackets = ranges.size > 1 || ranges.first.size > 1
36
+ write(ranges, format: :js, in_brackets: use_brackets)
37
+ end.join
11
38
  end
12
- end.join
13
- opts[:in_brackets] ? "[#{content}]" : content
14
- end
39
+ end
40
+
41
+ def compressed_surrogate_range_pairs(astral_ranges)
42
+ halves = astral_ranges.flat_map { |range| surrogate_half_ranges(range) }
15
43
 
16
- def write_surrogate_alternation(bmp_ranges, astral_ranges)
17
- bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
18
- if astral_ranges.empty?
19
- bmp_set
20
- else
21
- surrogate_pairs = surrogate_pairs(astral_ranges)
22
- "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + surrogate_pairs) * '|'})"
44
+ # compress high surrogate codepoint ranges with common low range half
45
+ with_common_lo = halves.group_by(&:last).map do |lo_range, pairs|
46
+ hi_ranges = pairs.map(&:first)
47
+ compressed_hi_ranges = hi_ranges.each_with_object([]) do |range, arr|
48
+ prev = arr.last
49
+ if prev.nil? || prev.max + 1 < range.min # first or gap
50
+ arr << range
51
+ else # continuous codepoints, expand previous range
52
+ arr[-1] = (prev.min)..(range.max)
53
+ end
54
+ end
55
+ [compressed_hi_ranges, lo_range]
56
+ end
57
+
58
+ # compress low surrogate codepoint ranges with common high ranges
59
+ with_common_lo.each_with_object({}) do |(hi_ranges, lo_range), hash|
60
+ (hash[hi_ranges] ||= []) << lo_range
61
+ end
23
62
  end
24
- end
25
63
 
26
- def surrogate_pairs(astral_ranges)
27
- astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
28
- end
64
+ def surrogate_half_ranges(astral_range)
65
+ hi_min, lo_min = surrogate_pair_codepoints(astral_range.min)
66
+ hi_max, lo_max = surrogate_pair_codepoints(astral_range.max)
67
+ hi_count = 1 + hi_max - hi_min
68
+ return [[hi_min..hi_min, lo_min..lo_max]] if hi_count == 1
69
+
70
+ ranges = []
71
+
72
+ # first high surrogate might be partially covered (if lo_min > 0xDC00)
73
+ ranges << [hi_min..hi_min, lo_min..0xDFFF]
74
+
75
+ # any high surrogates in between are fully covered
76
+ ranges << [(hi_min + 1)..(hi_max - 1), 0xDC00..0xDFFF] if hi_count > 2
29
77
 
30
- def surrogate_pair(astral_codepoint)
31
- base = astral_codepoint - 0x10000
32
- high = ((base / 1024).floor + 0xD800).to_s(16)
33
- low = (base % 1024 + 0xDC00).to_s(16)
34
- "\\u#{high}\\u#{low}"
78
+ # last high surrogate might be partially covered (if lo_max < 0xDFFF)
79
+ ranges << [hi_max..hi_max, 0xDC00..lo_max]
80
+
81
+ ranges
82
+ end
83
+
84
+ def surrogate_pair_codepoints(astral_codepoint)
85
+ base = astral_codepoint - 0x10000
86
+ high = base / 1024 + 0xD800
87
+ low = base % 1024 + 0xDC00
88
+ [high, low]
89
+ end
90
+
91
+ def bmp_set_with_alternatives(bmp_ranges, alternatives)
92
+ bmp_set = write(bmp_ranges, format: :js, in_brackets: true)
93
+ return bmp_set if alternatives.empty? && bmp_ranges.any?
94
+
95
+ "(?:#{((bmp_ranges.any? ? [bmp_set] : []) + alternatives).join('|')})"
96
+ end
97
+
98
+ def surrogate_pairs(astral_ranges)
99
+ astral_ranges.flat_map { |range| range.map { |cp| surrogate_pair(cp) } }
100
+ end
101
+
102
+ def surrogate_pair(astral_codepoint)
103
+ surrogate_pair_codepoints(astral_codepoint)
104
+ .map { |half| write_codepoint(half, format: :js) }.join
105
+ end
35
106
  end
36
107
  end
37
108
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: character_set
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: java
6
6
  authors:
7
7
  - Janosch Müller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-26 00:00:00.000000000 Z
11
+ date: 2019-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: benchmark-ips