js_regex 2.2.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97bbd3590a59d72ec30e671104a4cde9485d4579acfbc8516f0e5b65395ea63d
4
- data.tar.gz: 6a78449fd51fa3c2cd7db2e09e9e00aef24fe2ce79ca7a5076239a65ce6ec90d
3
+ metadata.gz: 4f8f83148f3bcfeb5262259d0893fe92db40e59f64627c4430deee7eaee194c2
4
+ data.tar.gz: 9cf144827bd01a075552cf12bfd16152c417e82eab064f1fa6a65133381d95ac
5
5
  SHA512:
6
- metadata.gz: 1617eff1117554660bbbc2840fc3632e62246e31ec170a8df7ea5acb7277f525129e546cebf3c210f7819eb54be3415cb834b291db60fd98cf5cf7c1f459616e
7
- data.tar.gz: e41ec6d2af6543a14395fee5682b2a6423ab22e737b8dc9c79dc1844051d13b6df626590e0ecf7984b813bb0e1dcc563e0a02fbc7cbb2785f4de452ba35051c1
6
+ metadata.gz: bf9b4ff58756d2f12be785a803fda5e75aeffd556cdd49860e7474caf963957414b11e9fd1f3d35c6aee90375f3f23dc4435033ee1d3ba086534fdd7cf8d7caf
7
+ data.tar.gz: b6d4e6dd07949b8fa3394e4868214d1a1977ee3fe65713c7eb000cdffc50e1d485be9af2f7fcffcc1893c883eb36cd4bd5c4c687b2aebabd62f2454820f57db5
data/lib/js_regex.rb CHANGED
@@ -15,8 +15,8 @@ class JsRegex
15
15
 
16
16
  attr_reader :source, :options, :warnings
17
17
 
18
- def initialize(ruby_regex)
19
- @source, @options, @warnings = Conversion.of(ruby_regex)
18
+ def initialize(ruby_regex, options: nil)
19
+ @source, @options, @warnings = Conversion.of(ruby_regex, options: options)
20
20
  end
21
21
 
22
22
  def to_h
@@ -11,26 +11,26 @@ class JsRegex
11
11
  require_relative 'converter'
12
12
 
13
13
  class << self
14
- def of(ruby_regex)
14
+ def of(ruby_regex, options: nil)
15
15
  source, warnings = convert_source(ruby_regex)
16
- options = convert_options(ruby_regex)
17
- [source, options, warnings]
16
+ options_string = convert_options(ruby_regex, options)
17
+ [source, options_string, warnings]
18
18
  end
19
19
 
20
20
  private
21
21
 
22
22
  def convert_source(ruby_regex)
23
- context = Converter::Context.new(ruby_regex)
24
- expression_tree = Regexp::Parser.parse(ruby_regex)
23
+ context = Converter::Context.new(ruby_regex)
25
24
  [
26
- Converter::RootConverter.new.convert(expression_tree, context),
25
+ Converter::RootConverter.new.convert(context.ast, context),
27
26
  context.warnings
28
27
  ]
29
28
  end
30
29
 
31
- def convert_options(ruby_regex)
32
- ignore_case = (ruby_regex.options & Regexp::IGNORECASE).nonzero?
33
- ignore_case ? 'gi' : 'g'
30
+ def convert_options(ruby_regex, custom_options)
31
+ options = custom_options.to_s.scan(/[gimuy]/)
32
+ options << 'i' if (ruby_regex.options & Regexp::IGNORECASE).nonzero?
33
+ options.uniq.sort.join
34
34
  end
35
35
  end
36
36
  end
@@ -25,5 +25,16 @@ class JsRegex
25
25
  def self.for(expression)
26
26
  MAP[expression.type].new
27
27
  end
28
+
29
+ # Limit the number of generated surrogate pairs, else the output might
30
+ # get too large for certain applications. The chosen number is somewhat
31
+ # arbitrary. 100 pairs make for about 1 KB, uncompressed. The median char
32
+ # count of all properties supported by Ruby is 92. 75% are below 300 chars.
33
+ #
34
+ # Set this to nil if you need full unicode matches and size doesn't matter.
35
+ class << self
36
+ attr_accessor :surrogate_pair_limit
37
+ end
38
+ self.surrogate_pair_limit = 300
28
39
  end
29
40
  end
@@ -12,29 +12,63 @@ class JsRegex
12
12
 
13
13
  def convert_data
14
14
  case subtype
15
- when :number, :number_ref
16
- convert_number_ref
17
- when :number_rel_ref
18
- convert_number_rel_ref
19
- when :name_ref
20
- convert_name_ref
21
- else
15
+ when :name_ref then convert_name_ref
16
+ when :number, :number_ref then convert_number_ref
17
+ when :number_rel_ref then convert_number_rel_ref
18
+ when :name_call then convert_name_call
19
+ when :number_call then convert_number_call
20
+ when :number_rel_call then convert_number_rel_call
21
+ else # name_recursion_ref, number_recursion_ref, ...
22
22
  warn_of_unsupported_feature
23
23
  end
24
24
  end
25
25
 
26
+ def convert_name_ref
27
+ "\\#{context.named_group_positions.fetch(expression.name)}"
28
+ end
29
+
26
30
  def convert_number_ref
27
- "\\#{context.new_capturing_group_position(Integer(expression.number))}"
31
+ "\\#{context.new_capturing_group_position(expression.number)}"
28
32
  end
29
33
 
30
34
  def convert_number_rel_ref
31
- absolute_position = Integer(expression.number) +
32
- context.original_capturing_group_count + 1
33
35
  "\\#{context.new_capturing_group_position(absolute_position)}"
34
36
  end
35
37
 
36
- def convert_name_ref
37
- "\\#{context.named_group_positions.fetch(expression.name)}"
38
+ def absolute_position
39
+ expression.number + context.original_capturing_group_count + 1
40
+ end
41
+
42
+ def convert_name_call
43
+ replace_with_group do |group|
44
+ group.token == :named && group.name == expression.name
45
+ end
46
+ end
47
+
48
+ def convert_number_call
49
+ if expression.number == 0
50
+ return warn_of_unsupported_feature('whole-pattern recursion')
51
+ end
52
+ replace_with_group do |group|
53
+ [:capture, :options].include?(group.token) &&
54
+ group.number.equal?(expression.number)
55
+ end
56
+ end
57
+
58
+ def convert_number_rel_call
59
+ replace_with_group do |group|
60
+ [:capture, :options].include?(group.token) &&
61
+ group.number.equal?(absolute_position)
62
+ end
63
+ end
64
+
65
+ def replace_with_group
66
+ context.ast.each_expression do |subexp|
67
+ if subexp.type == :group && yield(subexp)
68
+ return Converter.for(subexp).convert(subexp, context)
69
+ end
70
+ end
71
+ ''
38
72
  end
39
73
  end
40
74
  end
@@ -8,12 +8,10 @@ class JsRegex
8
8
  # The Converters themselves are stateless.
9
9
  #
10
10
  class Context
11
- attr_reader :buffered_set_extractions,
12
- :buffered_set_members,
11
+ attr_reader :ast,
13
12
  :case_insensitive_root,
14
13
  :in_atomic_group,
15
14
  :named_group_positions,
16
- :negative_base_set,
17
15
  :warnings
18
16
 
19
17
  def initialize(ruby_regex)
@@ -22,20 +20,8 @@ class JsRegex
22
20
  self.named_group_positions = {}
23
21
  self.warnings = []
24
22
 
25
- self.case_insensitive_root =
26
- !(ruby_regex.options & Regexp::IGNORECASE).equal?(0)
27
- end
28
-
29
- # set context
30
-
31
- def negate_base_set
32
- self.negative_base_set = true
33
- end
34
-
35
- def reset_set_context
36
- self.buffered_set_extractions = []
37
- self.buffered_set_members = []
38
- self.negative_base_set = false
23
+ self.ast = Regexp::Parser.parse(ruby_regex)
24
+ self.case_insensitive_root = ast.case_insensitive?
39
25
  end
40
26
 
41
27
  # group context
@@ -75,10 +61,6 @@ class JsRegex
75
61
  capturing_group_count - total_added_capturing_groups
76
62
  end
77
63
 
78
- def total_added_capturing_groups
79
- added_capturing_groups_after_group.values.inject(0, &:+)
80
- end
81
-
82
64
  def store_named_group_position(name)
83
65
  named_group_positions[name] = capturing_group_count + 1
84
66
  end
@@ -88,13 +70,15 @@ class JsRegex
88
70
  attr_accessor :added_capturing_groups_after_group,
89
71
  :capturing_group_count
90
72
 
91
- attr_writer :buffered_set_extractions,
92
- :buffered_set_members,
73
+ attr_writer :ast,
93
74
  :case_insensitive_root,
94
75
  :in_atomic_group,
95
76
  :named_group_positions,
96
- :negative_base_set,
97
77
  :warnings
78
+
79
+ def total_added_capturing_groups
80
+ added_capturing_groups_after_group.values.inject(0, &:+)
81
+ end
98
82
  end
99
83
  end
100
84
  end
@@ -9,85 +9,63 @@ class JsRegex
9
9
  # Template class implementation.
10
10
  #
11
11
  class EscapeConverter < JsRegex::Converter::Base
12
- private
13
-
14
- ESCAPES_SHARED_BY_RUBY_AND_JS = [
15
- :backslash,
16
- :bol,
17
- :carriage,
18
- :codepoint,
19
- :dot,
20
- :eol,
21
- :form_feed,
22
- :group_close,
23
- :group_open,
24
- :hex,
25
- :interval_close,
26
- :interval_open,
27
- :newline,
28
- :octal,
29
- :one_or_more,
30
- :set_close,
31
- :set_open,
32
- :tab,
33
- :vertical_tab,
34
- :zero_or_more,
35
- :zero_or_one
12
+ ESCAPES_SHARED_BY_RUBY_AND_JS = %i[
13
+ alternation
14
+ backslash
15
+ backspace
16
+ bol
17
+ carriage
18
+ codepoint
19
+ dot
20
+ eol
21
+ form_feed
22
+ group_close
23
+ group_open
24
+ hex
25
+ interval_close
26
+ interval_open
27
+ newline
28
+ octal
29
+ one_or_more
30
+ set_close
31
+ set_open
32
+ tab
33
+ vertical_tab
34
+ zero_or_more
35
+ zero_or_one
36
36
  ].freeze
37
37
 
38
+ private
39
+
38
40
  def convert_data
39
41
  case subtype
40
42
  when :codepoint_list
41
43
  convert_codepoint_list
42
- when :control
43
- convert_control_sequence
44
+ when :control, :meta_sequence
45
+ unicode_escape_codepoint
44
46
  when :literal
45
47
  LiteralConverter.convert_data(data)
46
- when :meta_sequence
47
- convert_meta_sequence
48
48
  when *ESCAPES_SHARED_BY_RUBY_AND_JS
49
49
  pass_through
50
+ when :bell, :escape
51
+ hex_escape_codepoint
50
52
  else
51
- # Bell, Escape, HexWide, ...
52
53
  warn_of_unsupported_feature
53
54
  end
54
55
  end
55
56
 
56
57
  def convert_codepoint_list
57
- elements = data.scan(/\h+/).map do |codepoint|
58
- literal = Regexp.escape([codepoint.hex].pack('U'))
59
- LiteralConverter.convert_data(literal)
60
- end
61
- elements.join
62
- end
63
-
64
- def convert_control_sequence
65
- convert_meta_control_sequence ||
66
- unicode_escape_for(control_sequence_to_s(data))
67
- end
68
-
69
- def convert_meta_sequence
70
- convert_meta_control_sequence ||
71
- unicode_escape_for(meta_char_to_char_code(data[-1]))
72
- end
73
-
74
- def convert_meta_control_sequence
75
- return unless expression.class.to_s.include?('MetaControl')
76
- unicode_escape_for(meta_char_to_char_code(control_sequence_to_s(data)))
77
- end
78
-
79
- def unicode_escape_for(char)
80
- "\\u#{char.ord.to_s(16).upcase.rjust(4, '0')}"
58
+ expression.chars.map do |char|
59
+ LiteralConverter.convert_data(Regexp.escape(char))
60
+ end.join
81
61
  end
82
62
 
83
- def control_sequence_to_s(control_sequence)
84
- five_lsb = control_sequence.unpack('B*').first[-5..-1]
85
- ["000#{five_lsb}"].pack('B*')
63
+ def unicode_escape_codepoint
64
+ "\\u#{expression.codepoint.to_s(16).upcase.rjust(4, '0')}"
86
65
  end
87
66
 
88
- def meta_char_to_char_code(meta_char)
89
- byte_value = meta_char.ord
90
- byte_value < 128 ? byte_value + 128 : byte_value
67
+ def hex_escape_codepoint
68
+ "\\x#{expression.codepoint.to_s(16).upcase.rjust(2, '0')}"
91
69
  end
92
70
  end
93
71
  end
@@ -16,7 +16,7 @@ class JsRegex
16
16
  when :capture then build_group
17
17
  when :comment then drop_without_warning
18
18
  when :named then build_named_group
19
- when :options then build_options_group
19
+ when :options, :options_switch then build_options_group
20
20
  when :passive then build_passive_group
21
21
  when :absence then warn_of_unsupported_feature
22
22
  else build_unsupported_group
@@ -44,8 +44,7 @@ class JsRegex
44
44
  unless (encoding_options = data.scan(/[adu]/)).empty?
45
45
  warn_of_unsupported_feature("encoding options #{encoding_options}")
46
46
  end
47
- # TODO: replace this check in Regexp::Parser v1
48
- switch_only = !data.include?(':')
47
+ switch_only = subtype.equal?(:options_switch)
49
48
  switch_only ? drop_without_warning : build_group(head: '(')
50
49
  end
51
50
 
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'base'
4
- require_relative File.join('..', 'property_map')
4
+ require 'character_set'
5
5
 
6
6
  class JsRegex
7
7
  module Converter
@@ -9,24 +9,6 @@ class JsRegex
9
9
  # Template class implementation.
10
10
  #
11
11
  class PropertyConverter < JsRegex::Converter::Base
12
- class << self
13
- def property_replacement(property_name, negated = nil)
14
- replacement = PROPERTY_MAP[property_name.downcase.to_sym]
15
- negated ? negated_property_replacement(replacement) : replacement
16
- end
17
-
18
- private
19
-
20
- def negated_property_replacement(property_string)
21
- return nil unless property_string
22
- if property_string.start_with?('[^')
23
- property_string.sub('[^', '[')
24
- else
25
- property_string.sub('[', '[^')
26
- end
27
- end
28
- end
29
-
30
12
  private
31
13
 
32
14
  def convert_data
@@ -34,8 +16,29 @@ class JsRegex
34
16
  end
35
17
 
36
18
  def convert_property(negated = nil)
37
- replace = self.class.property_replacement(subtype, negated)
38
- replace || warn_of_unsupported_feature
19
+ content = CharacterSet.of_property(subtype)
20
+ if expression.case_insensitive? && !context.case_insensitive_root
21
+ content = content.case_insensitive
22
+ end
23
+
24
+ if negated
25
+ if content.astral_part.empty?
26
+ return "[^#{content.to_s(format: :js)}]"
27
+ else
28
+ warn_of_unsupported_feature('astral plane negation by property')
29
+ end
30
+ elsif Converter.surrogate_pair_limit.nil? ||
31
+ Converter.surrogate_pair_limit >= content.astral_part.size
32
+ return content.to_s_with_surrogate_alternation
33
+ else
34
+ warn_of_unsupported_feature('large astral plane match of property')
35
+ end
36
+
37
+ bmp_part = content.bmp_part
38
+ return '' if bmp_part.empty?
39
+
40
+ string = bmp_part.to_s(format: :js)
41
+ negated ? "[^#{string}]" : "[#{string}]"
39
42
  end
40
43
  end
41
44
  end
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'base'
4
- require_relative 'literal_converter'
5
- require_relative 'property_converter'
4
+ require_relative 'escape_converter'
5
+ require_relative 'type_converter'
6
+ require 'character_set'
6
7
 
7
8
  class JsRegex
8
9
  module Converter
@@ -13,143 +14,51 @@ class JsRegex
13
14
  private
14
15
 
15
16
  def convert_data
16
- if expression.set_level.equal?(0) # reached end of set expression
17
- context.reset_set_context
18
- context.negate_base_set if negative_set?
19
- process_members
20
- finalize_set
21
- elsif negative_set?
22
- warn_of_unsupported_feature('nested negative set data')
23
- else # positive subset
24
- process_members
17
+ if directly_compatible?
18
+ return expression.to_s(:base)
19
+ .gsub(%r{\\?([\f\n\r\t])}) { Regexp.escape($1) }
25
20
  end
26
- end
27
-
28
- def negative_set?
29
- expression.negative?
30
- end
31
-
32
- def process_members
33
- expression.each { |member| process_member(member) }
34
- end
35
21
 
36
- ASTRAL_PLANE_PATTERN = /[\u{10000}-\u{FFFFF}]/
37
- PROPERTY_PATTERN = /\A(?:\[:|\\([pP])\{)(\^?)([^:\}]+)/
38
-
39
- def process_member(member)
40
- return convert_subset(member) unless member.instance_of?(String)
41
-
42
- utf8_data = member.dup.force_encoding('UTF-8')
43
- case utf8_data
44
- when ASTRAL_PLANE_PATTERN
45
- warn_of_unsupported_feature('astral plane set member')
46
- when '\\h'
47
- handle_hex_type
48
- when '\\H'
49
- handle_nonhex_type
50
- when '&&'
51
- warn_of_unsupported_feature('set intersection')
52
- when PROPERTY_PATTERN
53
- handle_property($1, $2, $3)
54
- else
55
- handle_literal(utf8_data)
22
+ content = CharacterSet.of_expression(expression)
23
+ if expression.case_insensitive? && !context.case_insensitive_root
24
+ content = content.case_insensitive
25
+ elsif !expression.case_insensitive? && context.case_insensitive_root
26
+ warn_of_unsupported_feature('nested case-sensitive set')
56
27
  end
57
- end
58
-
59
- HEX_RANGES = 'A-Fa-f0-9'
60
- NONHEX_SET = '[^A-Fa-f0-9]'
61
28
 
62
- def handle_hex_type
63
- buffer_set_member(HEX_RANGES)
64
- end
65
-
66
- def handle_nonhex_type
67
- if context.negative_base_set
68
- warn_of_unsupported_feature('nonhex type in negative set')
29
+ if Converter.surrogate_pair_limit.nil? ||
30
+ Converter.surrogate_pair_limit >= content.astral_part.size
31
+ content.to_s_with_surrogate_alternation
69
32
  else
70
- buffer_set_extraction(NONHEX_SET)
33
+ warn_of_unsupported_feature('large astral plane match of set')
34
+ bmp_part = content.bmp_part
35
+ bmp_part.empty? ? '' : bmp_part.to_s(format: :js, in_brackets: true)
71
36
  end
72
37
  end
73
38
 
74
- def handle_property(sign, caret, name)
75
- if context.negative_base_set
76
- return warn_of_unsupported_feature('property in negative set')
39
+ def directly_compatible?
40
+ if expression.case_insensitive? && !context.case_insensitive_root
41
+ # casefolding needed
42
+ return false
77
43
  end
78
- std = standardize_property_name(name)
79
- negated = sign.eql?('P') ^ caret.eql?('^')
80
- if (replacement = PropertyConverter.property_replacement(std, negated))
81
- buffer_set_extraction(replacement)
82
- else
83
- warn_of_unsupported_feature('property')
84
- end
85
- end
86
-
87
- def handle_literal(utf8_data)
88
- conversion = LiteralConverter.convert_data(utf8_data)
89
- if context.case_insensitive_root && !expression.case_insensitive?
90
- warn_of_unsupported_feature('nested case-sensitive set member')
91
- elsif !context.case_insensitive_root && expression.case_insensitive?
92
- return handle_locally_case_insensitive_literal(conversion)
93
- end
94
- buffer_set_member(conversion)
95
- end
96
-
97
- DESCENDING_CASE_RANGE_PATTERN = /\p{upper}-\p{lower}/
98
44
 
99
- def handle_locally_case_insensitive_literal(literal)
100
- buffer_set_member(
101
- if literal =~ DESCENDING_CASE_RANGE_PATTERN
102
- warn_of_unsupported_feature(
103
- 'nested case-insensitive range going from upper to lower case'
104
- )
105
- literal
106
- else
107
- [literal, literal.swapcase].uniq
45
+ # check for subexpressions that need conversion
46
+ expression.each_expression do |node|
47
+ case node.type
48
+ when :literal
49
+ # surrogate pair substitution needed if astral
50
+ next if node.text.force_encoding('utf-8').ord <= 0xFFFF
51
+ when :set
52
+ # conversion needed for nested sets, intersections
53
+ next if node.token.equal?(:range)
54
+ when :type
55
+ next if TypeConverter::TYPES_SHARED_BY_RUBY_AND_JS.include?(node.token)
56
+ when :escape
57
+ next if EscapeConverter::ESCAPES_SHARED_BY_RUBY_AND_JS.include?(node.token)
108
58
  end
109
- )
110
- end
111
-
112
- def standardize_property_name(name)
113
- Regexp::Parser.parse("\\p{#{name}}").expressions.first.token
114
- end
115
-
116
- def buffer_set_member(data)
117
- context.buffered_set_members << data
118
- end
119
-
120
- def buffer_set_extraction(data)
121
- context.buffered_set_extractions << data
122
- end
123
-
124
- def convert_subset(subset)
125
- SetConverter.new.convert(subset, context)
126
- end
127
-
128
- def finalize_set
129
- buffered_members = context.buffered_set_members
130
- buffered_extractions = context.buffered_set_extractions
131
- if buffered_members.empty?
132
- finalize_depleted_set(buffered_extractions)
133
- else
134
- finalize_nondepleted_set(buffered_members, buffered_extractions)
135
- end
136
- end
137
-
138
- def finalize_depleted_set(buffered_extractions)
139
- case buffered_extractions.count
140
- when 0 then ''
141
- when 1 then buffered_extractions.first
142
- else "(?:#{buffered_extractions.join('|')})"
143
- end
144
- end
145
-
146
- def finalize_nondepleted_set(buffered_members, buffered_extractions)
147
- set = "[#{'^' if negative_set?}#{buffered_members.join}]"
148
- if buffered_extractions.empty?
149
- set
150
- else
151
- "(?:#{set}|#{buffered_extractions.join('|')})"
59
+ return false
152
60
  end
61
+ true
153
62
  end
154
63
  end
155
64
  end