js_regex 2.2.2 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97bbd3590a59d72ec30e671104a4cde9485d4579acfbc8516f0e5b65395ea63d
4
- data.tar.gz: 6a78449fd51fa3c2cd7db2e09e9e00aef24fe2ce79ca7a5076239a65ce6ec90d
3
+ metadata.gz: 4f8f83148f3bcfeb5262259d0893fe92db40e59f64627c4430deee7eaee194c2
4
+ data.tar.gz: 9cf144827bd01a075552cf12bfd16152c417e82eab064f1fa6a65133381d95ac
5
5
  SHA512:
6
- metadata.gz: 1617eff1117554660bbbc2840fc3632e62246e31ec170a8df7ea5acb7277f525129e546cebf3c210f7819eb54be3415cb834b291db60fd98cf5cf7c1f459616e
7
- data.tar.gz: e41ec6d2af6543a14395fee5682b2a6423ab22e737b8dc9c79dc1844051d13b6df626590e0ecf7984b813bb0e1dcc563e0a02fbc7cbb2785f4de452ba35051c1
6
+ metadata.gz: bf9b4ff58756d2f12be785a803fda5e75aeffd556cdd49860e7474caf963957414b11e9fd1f3d35c6aee90375f3f23dc4435033ee1d3ba086534fdd7cf8d7caf
7
+ data.tar.gz: b6d4e6dd07949b8fa3394e4868214d1a1977ee3fe65713c7eb000cdffc50e1d485be9af2f7fcffcc1893c883eb36cd4bd5c4c687b2aebabd62f2454820f57db5
data/lib/js_regex.rb CHANGED
@@ -15,8 +15,8 @@ class JsRegex
15
15
 
16
16
  attr_reader :source, :options, :warnings
17
17
 
18
- def initialize(ruby_regex)
19
- @source, @options, @warnings = Conversion.of(ruby_regex)
18
+ def initialize(ruby_regex, options: nil)
19
+ @source, @options, @warnings = Conversion.of(ruby_regex, options: options)
20
20
  end
21
21
 
22
22
  def to_h
@@ -11,26 +11,26 @@ class JsRegex
11
11
  require_relative 'converter'
12
12
 
13
13
  class << self
14
- def of(ruby_regex)
14
+ def of(ruby_regex, options: nil)
15
15
  source, warnings = convert_source(ruby_regex)
16
- options = convert_options(ruby_regex)
17
- [source, options, warnings]
16
+ options_string = convert_options(ruby_regex, options)
17
+ [source, options_string, warnings]
18
18
  end
19
19
 
20
20
  private
21
21
 
22
22
  def convert_source(ruby_regex)
23
- context = Converter::Context.new(ruby_regex)
24
- expression_tree = Regexp::Parser.parse(ruby_regex)
23
+ context = Converter::Context.new(ruby_regex)
25
24
  [
26
- Converter::RootConverter.new.convert(expression_tree, context),
25
+ Converter::RootConverter.new.convert(context.ast, context),
27
26
  context.warnings
28
27
  ]
29
28
  end
30
29
 
31
- def convert_options(ruby_regex)
32
- ignore_case = (ruby_regex.options & Regexp::IGNORECASE).nonzero?
33
- ignore_case ? 'gi' : 'g'
30
+ def convert_options(ruby_regex, custom_options)
31
+ options = custom_options.to_s.scan(/[gimuy]/)
32
+ options << 'i' if (ruby_regex.options & Regexp::IGNORECASE).nonzero?
33
+ options.uniq.sort.join
34
34
  end
35
35
  end
36
36
  end
@@ -25,5 +25,16 @@ class JsRegex
25
25
  def self.for(expression)
26
26
  MAP[expression.type].new
27
27
  end
28
+
29
+ # Limit the number of generated surrogate pairs, else the output might
30
+ # get too large for certain applications. The chosen number is somewhat
31
+ # arbitrary. 100 pairs make for about 1 KB, uncompressed. The median char
32
+ # count of all properties supported by Ruby is 92. 75% are below 300 chars.
33
+ #
34
+ # Set this to nil if you need full unicode matches and size doesn't matter.
35
+ class << self
36
+ attr_accessor :surrogate_pair_limit
37
+ end
38
+ self.surrogate_pair_limit = 300
28
39
  end
29
40
  end
@@ -12,29 +12,63 @@ class JsRegex
12
12
 
13
13
  def convert_data
14
14
  case subtype
15
- when :number, :number_ref
16
- convert_number_ref
17
- when :number_rel_ref
18
- convert_number_rel_ref
19
- when :name_ref
20
- convert_name_ref
21
- else
15
+ when :name_ref then convert_name_ref
16
+ when :number, :number_ref then convert_number_ref
17
+ when :number_rel_ref then convert_number_rel_ref
18
+ when :name_call then convert_name_call
19
+ when :number_call then convert_number_call
20
+ when :number_rel_call then convert_number_rel_call
21
+ else # name_recursion_ref, number_recursion_ref, ...
22
22
  warn_of_unsupported_feature
23
23
  end
24
24
  end
25
25
 
26
+ def convert_name_ref
27
+ "\\#{context.named_group_positions.fetch(expression.name)}"
28
+ end
29
+
26
30
  def convert_number_ref
27
- "\\#{context.new_capturing_group_position(Integer(expression.number))}"
31
+ "\\#{context.new_capturing_group_position(expression.number)}"
28
32
  end
29
33
 
30
34
  def convert_number_rel_ref
31
- absolute_position = Integer(expression.number) +
32
- context.original_capturing_group_count + 1
33
35
  "\\#{context.new_capturing_group_position(absolute_position)}"
34
36
  end
35
37
 
36
- def convert_name_ref
37
- "\\#{context.named_group_positions.fetch(expression.name)}"
38
+ def absolute_position
39
+ expression.number + context.original_capturing_group_count + 1
40
+ end
41
+
42
+ def convert_name_call
43
+ replace_with_group do |group|
44
+ group.token == :named && group.name == expression.name
45
+ end
46
+ end
47
+
48
+ def convert_number_call
49
+ if expression.number == 0
50
+ return warn_of_unsupported_feature('whole-pattern recursion')
51
+ end
52
+ replace_with_group do |group|
53
+ [:capture, :options].include?(group.token) &&
54
+ group.number.equal?(expression.number)
55
+ end
56
+ end
57
+
58
+ def convert_number_rel_call
59
+ replace_with_group do |group|
60
+ [:capture, :options].include?(group.token) &&
61
+ group.number.equal?(absolute_position)
62
+ end
63
+ end
64
+
65
+ def replace_with_group
66
+ context.ast.each_expression do |subexp|
67
+ if subexp.type == :group && yield(subexp)
68
+ return Converter.for(subexp).convert(subexp, context)
69
+ end
70
+ end
71
+ ''
38
72
  end
39
73
  end
40
74
  end
@@ -8,12 +8,10 @@ class JsRegex
8
8
  # The Converters themselves are stateless.
9
9
  #
10
10
  class Context
11
- attr_reader :buffered_set_extractions,
12
- :buffered_set_members,
11
+ attr_reader :ast,
13
12
  :case_insensitive_root,
14
13
  :in_atomic_group,
15
14
  :named_group_positions,
16
- :negative_base_set,
17
15
  :warnings
18
16
 
19
17
  def initialize(ruby_regex)
@@ -22,20 +20,8 @@ class JsRegex
22
20
  self.named_group_positions = {}
23
21
  self.warnings = []
24
22
 
25
- self.case_insensitive_root =
26
- !(ruby_regex.options & Regexp::IGNORECASE).equal?(0)
27
- end
28
-
29
- # set context
30
-
31
- def negate_base_set
32
- self.negative_base_set = true
33
- end
34
-
35
- def reset_set_context
36
- self.buffered_set_extractions = []
37
- self.buffered_set_members = []
38
- self.negative_base_set = false
23
+ self.ast = Regexp::Parser.parse(ruby_regex)
24
+ self.case_insensitive_root = ast.case_insensitive?
39
25
  end
40
26
 
41
27
  # group context
@@ -75,10 +61,6 @@ class JsRegex
75
61
  capturing_group_count - total_added_capturing_groups
76
62
  end
77
63
 
78
- def total_added_capturing_groups
79
- added_capturing_groups_after_group.values.inject(0, &:+)
80
- end
81
-
82
64
  def store_named_group_position(name)
83
65
  named_group_positions[name] = capturing_group_count + 1
84
66
  end
@@ -88,13 +70,15 @@ class JsRegex
88
70
  attr_accessor :added_capturing_groups_after_group,
89
71
  :capturing_group_count
90
72
 
91
- attr_writer :buffered_set_extractions,
92
- :buffered_set_members,
73
+ attr_writer :ast,
93
74
  :case_insensitive_root,
94
75
  :in_atomic_group,
95
76
  :named_group_positions,
96
- :negative_base_set,
97
77
  :warnings
78
+
79
+ def total_added_capturing_groups
80
+ added_capturing_groups_after_group.values.inject(0, &:+)
81
+ end
98
82
  end
99
83
  end
100
84
  end
@@ -9,85 +9,63 @@ class JsRegex
9
9
  # Template class implementation.
10
10
  #
11
11
  class EscapeConverter < JsRegex::Converter::Base
12
- private
13
-
14
- ESCAPES_SHARED_BY_RUBY_AND_JS = [
15
- :backslash,
16
- :bol,
17
- :carriage,
18
- :codepoint,
19
- :dot,
20
- :eol,
21
- :form_feed,
22
- :group_close,
23
- :group_open,
24
- :hex,
25
- :interval_close,
26
- :interval_open,
27
- :newline,
28
- :octal,
29
- :one_or_more,
30
- :set_close,
31
- :set_open,
32
- :tab,
33
- :vertical_tab,
34
- :zero_or_more,
35
- :zero_or_one
12
+ ESCAPES_SHARED_BY_RUBY_AND_JS = %i[
13
+ alternation
14
+ backslash
15
+ backspace
16
+ bol
17
+ carriage
18
+ codepoint
19
+ dot
20
+ eol
21
+ form_feed
22
+ group_close
23
+ group_open
24
+ hex
25
+ interval_close
26
+ interval_open
27
+ newline
28
+ octal
29
+ one_or_more
30
+ set_close
31
+ set_open
32
+ tab
33
+ vertical_tab
34
+ zero_or_more
35
+ zero_or_one
36
36
  ].freeze
37
37
 
38
+ private
39
+
38
40
  def convert_data
39
41
  case subtype
40
42
  when :codepoint_list
41
43
  convert_codepoint_list
42
- when :control
43
- convert_control_sequence
44
+ when :control, :meta_sequence
45
+ unicode_escape_codepoint
44
46
  when :literal
45
47
  LiteralConverter.convert_data(data)
46
- when :meta_sequence
47
- convert_meta_sequence
48
48
  when *ESCAPES_SHARED_BY_RUBY_AND_JS
49
49
  pass_through
50
+ when :bell, :escape
51
+ hex_escape_codepoint
50
52
  else
51
- # Bell, Escape, HexWide, ...
52
53
  warn_of_unsupported_feature
53
54
  end
54
55
  end
55
56
 
56
57
  def convert_codepoint_list
57
- elements = data.scan(/\h+/).map do |codepoint|
58
- literal = Regexp.escape([codepoint.hex].pack('U'))
59
- LiteralConverter.convert_data(literal)
60
- end
61
- elements.join
62
- end
63
-
64
- def convert_control_sequence
65
- convert_meta_control_sequence ||
66
- unicode_escape_for(control_sequence_to_s(data))
67
- end
68
-
69
- def convert_meta_sequence
70
- convert_meta_control_sequence ||
71
- unicode_escape_for(meta_char_to_char_code(data[-1]))
72
- end
73
-
74
- def convert_meta_control_sequence
75
- return unless expression.class.to_s.include?('MetaControl')
76
- unicode_escape_for(meta_char_to_char_code(control_sequence_to_s(data)))
77
- end
78
-
79
- def unicode_escape_for(char)
80
- "\\u#{char.ord.to_s(16).upcase.rjust(4, '0')}"
58
+ expression.chars.map do |char|
59
+ LiteralConverter.convert_data(Regexp.escape(char))
60
+ end.join
81
61
  end
82
62
 
83
- def control_sequence_to_s(control_sequence)
84
- five_lsb = control_sequence.unpack('B*').first[-5..-1]
85
- ["000#{five_lsb}"].pack('B*')
63
+ def unicode_escape_codepoint
64
+ "\\u#{expression.codepoint.to_s(16).upcase.rjust(4, '0')}"
86
65
  end
87
66
 
88
- def meta_char_to_char_code(meta_char)
89
- byte_value = meta_char.ord
90
- byte_value < 128 ? byte_value + 128 : byte_value
67
+ def hex_escape_codepoint
68
+ "\\x#{expression.codepoint.to_s(16).upcase.rjust(2, '0')}"
91
69
  end
92
70
  end
93
71
  end
@@ -16,7 +16,7 @@ class JsRegex
16
16
  when :capture then build_group
17
17
  when :comment then drop_without_warning
18
18
  when :named then build_named_group
19
- when :options then build_options_group
19
+ when :options, :options_switch then build_options_group
20
20
  when :passive then build_passive_group
21
21
  when :absence then warn_of_unsupported_feature
22
22
  else build_unsupported_group
@@ -44,8 +44,7 @@ class JsRegex
44
44
  unless (encoding_options = data.scan(/[adu]/)).empty?
45
45
  warn_of_unsupported_feature("encoding options #{encoding_options}")
46
46
  end
47
- # TODO: replace this check in Regexp::Parser v1
48
- switch_only = !data.include?(':')
47
+ switch_only = subtype.equal?(:options_switch)
49
48
  switch_only ? drop_without_warning : build_group(head: '(')
50
49
  end
51
50
 
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'base'
4
- require_relative File.join('..', 'property_map')
4
+ require 'character_set'
5
5
 
6
6
  class JsRegex
7
7
  module Converter
@@ -9,24 +9,6 @@ class JsRegex
9
9
  # Template class implementation.
10
10
  #
11
11
  class PropertyConverter < JsRegex::Converter::Base
12
- class << self
13
- def property_replacement(property_name, negated = nil)
14
- replacement = PROPERTY_MAP[property_name.downcase.to_sym]
15
- negated ? negated_property_replacement(replacement) : replacement
16
- end
17
-
18
- private
19
-
20
- def negated_property_replacement(property_string)
21
- return nil unless property_string
22
- if property_string.start_with?('[^')
23
- property_string.sub('[^', '[')
24
- else
25
- property_string.sub('[', '[^')
26
- end
27
- end
28
- end
29
-
30
12
  private
31
13
 
32
14
  def convert_data
@@ -34,8 +16,29 @@ class JsRegex
34
16
  end
35
17
 
36
18
  def convert_property(negated = nil)
37
- replace = self.class.property_replacement(subtype, negated)
38
- replace || warn_of_unsupported_feature
19
+ content = CharacterSet.of_property(subtype)
20
+ if expression.case_insensitive? && !context.case_insensitive_root
21
+ content = content.case_insensitive
22
+ end
23
+
24
+ if negated
25
+ if content.astral_part.empty?
26
+ return "[^#{content.to_s(format: :js)}]"
27
+ else
28
+ warn_of_unsupported_feature('astral plane negation by property')
29
+ end
30
+ elsif Converter.surrogate_pair_limit.nil? ||
31
+ Converter.surrogate_pair_limit >= content.astral_part.size
32
+ return content.to_s_with_surrogate_alternation
33
+ else
34
+ warn_of_unsupported_feature('large astral plane match of property')
35
+ end
36
+
37
+ bmp_part = content.bmp_part
38
+ return '' if bmp_part.empty?
39
+
40
+ string = bmp_part.to_s(format: :js)
41
+ negated ? "[^#{string}]" : "[#{string}]"
39
42
  end
40
43
  end
41
44
  end
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'base'
4
- require_relative 'literal_converter'
5
- require_relative 'property_converter'
4
+ require_relative 'escape_converter'
5
+ require_relative 'type_converter'
6
+ require 'character_set'
6
7
 
7
8
  class JsRegex
8
9
  module Converter
@@ -13,143 +14,51 @@ class JsRegex
13
14
  private
14
15
 
15
16
  def convert_data
16
- if expression.set_level.equal?(0) # reached end of set expression
17
- context.reset_set_context
18
- context.negate_base_set if negative_set?
19
- process_members
20
- finalize_set
21
- elsif negative_set?
22
- warn_of_unsupported_feature('nested negative set data')
23
- else # positive subset
24
- process_members
17
+ if directly_compatible?
18
+ return expression.to_s(:base)
19
+ .gsub(%r{\\?([\f\n\r\t])}) { Regexp.escape($1) }
25
20
  end
26
- end
27
-
28
- def negative_set?
29
- expression.negative?
30
- end
31
-
32
- def process_members
33
- expression.each { |member| process_member(member) }
34
- end
35
21
 
36
- ASTRAL_PLANE_PATTERN = /[\u{10000}-\u{FFFFF}]/
37
- PROPERTY_PATTERN = /\A(?:\[:|\\([pP])\{)(\^?)([^:\}]+)/
38
-
39
- def process_member(member)
40
- return convert_subset(member) unless member.instance_of?(String)
41
-
42
- utf8_data = member.dup.force_encoding('UTF-8')
43
- case utf8_data
44
- when ASTRAL_PLANE_PATTERN
45
- warn_of_unsupported_feature('astral plane set member')
46
- when '\\h'
47
- handle_hex_type
48
- when '\\H'
49
- handle_nonhex_type
50
- when '&&'
51
- warn_of_unsupported_feature('set intersection')
52
- when PROPERTY_PATTERN
53
- handle_property($1, $2, $3)
54
- else
55
- handle_literal(utf8_data)
22
+ content = CharacterSet.of_expression(expression)
23
+ if expression.case_insensitive? && !context.case_insensitive_root
24
+ content = content.case_insensitive
25
+ elsif !expression.case_insensitive? && context.case_insensitive_root
26
+ warn_of_unsupported_feature('nested case-sensitive set')
56
27
  end
57
- end
58
-
59
- HEX_RANGES = 'A-Fa-f0-9'
60
- NONHEX_SET = '[^A-Fa-f0-9]'
61
28
 
62
- def handle_hex_type
63
- buffer_set_member(HEX_RANGES)
64
- end
65
-
66
- def handle_nonhex_type
67
- if context.negative_base_set
68
- warn_of_unsupported_feature('nonhex type in negative set')
29
+ if Converter.surrogate_pair_limit.nil? ||
30
+ Converter.surrogate_pair_limit >= content.astral_part.size
31
+ content.to_s_with_surrogate_alternation
69
32
  else
70
- buffer_set_extraction(NONHEX_SET)
33
+ warn_of_unsupported_feature('large astral plane match of set')
34
+ bmp_part = content.bmp_part
35
+ bmp_part.empty? ? '' : bmp_part.to_s(format: :js, in_brackets: true)
71
36
  end
72
37
  end
73
38
 
74
- def handle_property(sign, caret, name)
75
- if context.negative_base_set
76
- return warn_of_unsupported_feature('property in negative set')
39
+ def directly_compatible?
40
+ if expression.case_insensitive? && !context.case_insensitive_root
41
+ # casefolding needed
42
+ return false
77
43
  end
78
- std = standardize_property_name(name)
79
- negated = sign.eql?('P') ^ caret.eql?('^')
80
- if (replacement = PropertyConverter.property_replacement(std, negated))
81
- buffer_set_extraction(replacement)
82
- else
83
- warn_of_unsupported_feature('property')
84
- end
85
- end
86
-
87
- def handle_literal(utf8_data)
88
- conversion = LiteralConverter.convert_data(utf8_data)
89
- if context.case_insensitive_root && !expression.case_insensitive?
90
- warn_of_unsupported_feature('nested case-sensitive set member')
91
- elsif !context.case_insensitive_root && expression.case_insensitive?
92
- return handle_locally_case_insensitive_literal(conversion)
93
- end
94
- buffer_set_member(conversion)
95
- end
96
-
97
- DESCENDING_CASE_RANGE_PATTERN = /\p{upper}-\p{lower}/
98
44
 
99
- def handle_locally_case_insensitive_literal(literal)
100
- buffer_set_member(
101
- if literal =~ DESCENDING_CASE_RANGE_PATTERN
102
- warn_of_unsupported_feature(
103
- 'nested case-insensitive range going from upper to lower case'
104
- )
105
- literal
106
- else
107
- [literal, literal.swapcase].uniq
45
+ # check for subexpressions that need conversion
46
+ expression.each_expression do |node|
47
+ case node.type
48
+ when :literal
49
+ # surrogate pair substitution needed if astral
50
+ next if node.text.force_encoding('utf-8').ord <= 0xFFFF
51
+ when :set
52
+ # conversion needed for nested sets, intersections
53
+ next if node.token.equal?(:range)
54
+ when :type
55
+ next if TypeConverter::TYPES_SHARED_BY_RUBY_AND_JS.include?(node.token)
56
+ when :escape
57
+ next if EscapeConverter::ESCAPES_SHARED_BY_RUBY_AND_JS.include?(node.token)
108
58
  end
109
- )
110
- end
111
-
112
- def standardize_property_name(name)
113
- Regexp::Parser.parse("\\p{#{name}}").expressions.first.token
114
- end
115
-
116
- def buffer_set_member(data)
117
- context.buffered_set_members << data
118
- end
119
-
120
- def buffer_set_extraction(data)
121
- context.buffered_set_extractions << data
122
- end
123
-
124
- def convert_subset(subset)
125
- SetConverter.new.convert(subset, context)
126
- end
127
-
128
- def finalize_set
129
- buffered_members = context.buffered_set_members
130
- buffered_extractions = context.buffered_set_extractions
131
- if buffered_members.empty?
132
- finalize_depleted_set(buffered_extractions)
133
- else
134
- finalize_nondepleted_set(buffered_members, buffered_extractions)
135
- end
136
- end
137
-
138
- def finalize_depleted_set(buffered_extractions)
139
- case buffered_extractions.count
140
- when 0 then ''
141
- when 1 then buffered_extractions.first
142
- else "(?:#{buffered_extractions.join('|')})"
143
- end
144
- end
145
-
146
- def finalize_nondepleted_set(buffered_members, buffered_extractions)
147
- set = "[#{'^' if negative_set?}#{buffered_members.join}]"
148
- if buffered_extractions.empty?
149
- set
150
- else
151
- "(?:#{set}|#{buffered_extractions.join('|')})"
59
+ return false
152
60
  end
61
+ true
153
62
  end
154
63
  end
155
64
  end