rley 0.6.00 → 0.6.01
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -1
- data/CHANGELOG.md +3 -0
- data/Gemfile +1 -1
- data/examples/NLP/benchmark_pico_en.rb +6 -10
- data/examples/NLP/nano_eng/nano_en_demo.rb +2 -2
- data/examples/NLP/nano_eng/nano_grammar.rb +1 -2
- data/examples/data_formats/JSON/json_ast_builder.rb +8 -8
- data/examples/general/SRL/lib/ast_builder.rb +74 -72
- data/examples/general/SRL/lib/grammar.rb +2 -2
- data/examples/general/SRL/lib/regex/abstract_method.rb +28 -28
- data/examples/general/SRL/lib/regex/alternation.rb +21 -25
- data/examples/general/SRL/lib/regex/anchor.rb +6 -9
- data/examples/general/SRL/lib/regex/atomic_expression.rb +10 -15
- data/examples/general/SRL/lib/regex/capturing_group.rb +15 -14
- data/examples/general/SRL/lib/regex/char_class.rb +10 -13
- data/examples/general/SRL/lib/regex/char_range.rb +45 -46
- data/examples/general/SRL/lib/regex/char_shorthand.rb +8 -9
- data/examples/general/SRL/lib/regex/character.rb +196 -191
- data/examples/general/SRL/lib/regex/compound_expression.rb +47 -50
- data/examples/general/SRL/lib/regex/concatenation.rb +23 -27
- data/examples/general/SRL/lib/regex/expression.rb +53 -56
- data/examples/general/SRL/lib/regex/lookaround.rb +23 -20
- data/examples/general/SRL/lib/regex/match_option.rb +26 -28
- data/examples/general/SRL/lib/regex/monadic_expression.rb +20 -23
- data/examples/general/SRL/lib/regex/multiplicity.rb +17 -20
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +9 -12
- data/examples/general/SRL/lib/regex/polyadic_expression.rb +51 -55
- data/examples/general/SRL/lib/regex/quantifiable.rb +14 -20
- data/examples/general/SRL/lib/regex/repetition.rb +20 -23
- data/examples/general/SRL/lib/regex/wildcard.rb +15 -19
- data/examples/general/SRL/lib/regex_repr.rb +1 -1
- data/examples/general/SRL/lib/tokenizer.rb +2 -2
- data/examples/general/SRL/spec/integration_spec.rb +17 -12
- data/examples/general/SRL/spec/regex/character_spec.rb +160 -153
- data/examples/general/SRL/spec/regex/multiplicity_spec.rb +27 -31
- data/examples/general/SRL/spec/spec_helper.rb +1 -1
- data/examples/general/SRL/spec/tokenizer_spec.rb +25 -27
- data/examples/general/calc_iter1/calc_ast_builder.rb +10 -10
- data/examples/general/calc_iter2/calc_ast_builder.rb +7 -9
- data/examples/general/calc_iter2/calc_ast_nodes.rb +5 -6
- data/examples/general/calc_iter2/calc_lexer.rb +3 -5
- data/examples/general/calc_iter2/spec/calculator_spec.rb +16 -14
- data/examples/general/left.rb +8 -8
- data/examples/general/right.rb +8 -8
- data/lib/rley/constants.rb +1 -1
- data/lib/rley/engine.rb +16 -20
- data/lib/rley/formatter/json.rb +1 -1
- data/lib/rley/gfg/grm_flow_graph.rb +1 -1
- data/lib/rley/gfg/item_vertex.rb +6 -5
- data/lib/rley/gfg/vertex.rb +3 -3
- data/lib/rley/lexical/token.rb +4 -3
- data/lib/rley/parse_rep/ast_base_builder.rb +4 -3
- data/lib/rley/parse_rep/parse_rep_creator.rb +1 -1
- data/lib/rley/parse_rep/parse_tree_builder.rb +3 -2
- data/lib/rley/parser/error_reason.rb +1 -1
- data/lib/rley/parser/gfg_chart.rb +6 -6
- data/lib/rley/parser/gfg_parsing.rb +19 -19
- data/lib/rley/parser/parse_entry.rb +3 -3
- data/lib/rley/parser/parse_entry_set.rb +1 -1
- data/lib/rley/parser/parse_walker_factory.rb +15 -15
- data/lib/rley/syntax/grammar.rb +1 -1
- data/lib/rley/syntax/grammar_builder.rb +2 -2
- data/lib/rley/syntax/production.rb +4 -3
- data/lib/rley/syntax/symbol_seq.rb +2 -2
- data/spec/rley/base/grm_items_builder_spec.rb +1 -1
- data/spec/rley/engine_spec.rb +3 -6
- data/spec/rley/formatter/asciitree_spec.rb +0 -1
- data/spec/rley/formatter/bracket_notation_spec.rb +0 -1
- data/spec/rley/formatter/debug_spec.rb +2 -3
- data/spec/rley/gfg/grm_flow_graph_spec.rb +19 -19
- data/spec/rley/parse_rep/ast_builder_spec.rb +12 -12
- data/spec/rley/parser/gfg_earley_parser_spec.rb +1 -1
- data/spec/rley/parser/parse_entry_set_spec.rb +5 -5
- data/spec/rley/parser/parse_state_spec.rb +8 -3
- data/spec/rley/parser/parse_tracer_spec.rb +3 -1
- data/spec/rley/parser/parse_walker_factory_spec.rb +1 -1
- data/spec/rley/ptree/parse_tree_node_spec.rb +1 -1
- data/spec/rley/syntax/grammar_builder_spec.rb +1 -1
- data/spec/rley/syntax/grammar_spec.rb +1 -1
- metadata +2 -3
- data/spec/rley/support/ast_builder.rb +0 -403
@@ -1,199 +1,204 @@
|
|
1
1
|
# File: character.rb
|
2
2
|
|
3
|
-
require_relative 'atomic_expression'
|
3
|
+
require_relative 'atomic_expression' # Access the superclass
|
4
4
|
|
5
5
|
module Regex # This module is used as a namespace
|
6
|
+
# A regular expression that matches a specific character in a given character set
|
7
|
+
class Character < AtomicExpression
|
8
|
+
# Constant with all special 2-characters escape sequences
|
9
|
+
DigramSequences = {
|
10
|
+
"\\a" => 0x7, # alarm
|
11
|
+
"\\n" => 0xA, # newline
|
12
|
+
"\\r" => 0xD, # carriage return
|
13
|
+
"\\t" => 0x9, # tab
|
14
|
+
"\\e" => 0x1B, # escape
|
15
|
+
"\\f" => 0xC, # form feed
|
16
|
+
"\\v" => 0xB, # vertical feed
|
17
|
+
# Single octal digit literals
|
18
|
+
"\\0" => 0,
|
19
|
+
"\\1" => 1,
|
20
|
+
"\\2" => 2,
|
21
|
+
"\\3" => 3,
|
22
|
+
"\\4" => 4,
|
23
|
+
"\\5" => 5,
|
24
|
+
"\\6" => 6,
|
25
|
+
"\\7" => 7
|
26
|
+
}.freeze
|
6
27
|
|
7
|
-
|
8
|
-
class Character < AtomicExpression
|
9
|
-
# Constant with all special 2-characters escape sequences
|
10
|
-
DigramSequences = {
|
11
|
-
"\\a" => 0x7, # alarm
|
12
|
-
"\\n" => 0xA, # newline
|
13
|
-
"\\r" => 0xD, # carriage return
|
14
|
-
"\\t" => 0x9, # tab
|
15
|
-
"\\e" => 0x1B, # escape
|
16
|
-
"\\f" => 0xC, # form feed
|
17
|
-
"\\v" => 0xB, # vertical feed
|
18
|
-
# Single octal digit literals
|
19
|
-
"\\0" => 0,
|
20
|
-
"\\1" => 1,
|
21
|
-
"\\2" => 2,
|
22
|
-
"\\3" => 3,
|
23
|
-
"\\4" => 4,
|
24
|
-
"\\5" => 5,
|
25
|
-
"\\6" => 6,
|
26
|
-
"\\7" => 7
|
27
|
-
}
|
28
|
-
|
29
|
-
MetaChars = '\^$+?.'
|
30
|
-
|
31
|
-
# The integer value that uniquely identifies the character.
|
32
|
-
attr_reader(:codepoint)
|
33
|
-
|
34
|
-
# The initial text representation of the character (if any).
|
35
|
-
attr_reader(:lexeme)
|
36
|
-
|
37
|
-
# Constructor.
|
38
|
-
# [aValue] Initialize the character with a either a String literal or a codepoint value.
|
39
|
-
# Examples:
|
40
|
-
# Initializing with codepoint value...
|
41
|
-
# RegAn::Character.new(0x3a3) # Represents: Σ (Unicode GREEK CAPITAL LETTER SIGMA)
|
42
|
-
# RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
|
43
|
-
#
|
44
|
-
# Initializing with a single character string
|
45
|
-
# RegAn::Character.new(?\u03a3) # Also represents: Σ
|
46
|
-
# RegAn::Character.new('Σ') # Obviously, represents a Σ
|
47
|
-
#
|
48
|
-
# Initializing with an escape sequence string
|
49
|
-
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
50
|
-
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC)
|
51
|
-
# \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex)
|
52
|
-
# Any other escaped character will be treated as a literal character
|
53
|
-
# RegAn::Character.new('\n') # Represents a newline
|
54
|
-
# RegAn::Character.new('\u03a3') # Represents a Σ
|
55
|
-
def initialize(aValue)
|
56
|
-
|
57
|
-
case aValue
|
58
|
-
when String
|
59
|
-
if aValue.size == 1
|
60
|
-
# Literal single character case...
|
61
|
-
@codepoint = self.class.char2codepoint(aValue)
|
62
|
-
else
|
63
|
-
# Should be an escape sequence...
|
64
|
-
@codepoint = self.class.esc2codepoint(aValue)
|
65
|
-
end
|
66
|
-
@lexeme = aValue
|
67
|
-
|
68
|
-
when Integer
|
69
|
-
@codepoint = aValue
|
70
|
-
else
|
71
|
-
raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
public
|
76
|
-
# Convertion method that returns a character given a codepoint (integer) value.
|
77
|
-
# Example:
|
78
|
-
# RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (The Unicode GREEK CAPITAL LETTER SIGMA)
|
79
|
-
def self.codepoint2char(aCodepoint)
|
80
|
-
return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
|
81
|
-
end
|
82
|
-
|
83
|
-
# Convertion method that returns the codepoint for the given single character.
|
84
|
-
# Example:
|
85
|
-
# RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
|
86
|
-
def self.char2codepoint(aChar)
|
87
|
-
return aChar.ord()
|
88
|
-
end
|
89
|
-
|
90
|
-
# Convertion method that returns the codepoint for the given escape sequence (a String).
|
91
|
-
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
92
|
-
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
|
93
|
-
# \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex)
|
94
|
-
# Any other escaped character will be treated as a literal character
|
95
|
-
# Example:
|
96
|
-
# RegAn::Character::esc2codepoint('\n') # Returns: 0xd
|
97
|
-
def self.esc2codepoint(anEscapeSequence)
|
98
|
-
raise StandardError, "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)." unless anEscapeSequence[0] == "\\"
|
99
|
-
result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
|
100
|
-
|
101
|
-
return result
|
102
|
-
end
|
103
|
-
|
104
|
-
# Return the character as a String object
|
105
|
-
def char()
|
106
|
-
self.class.codepoint2char(@codepoint)
|
107
|
-
end
|
108
|
-
|
109
|
-
# Returns true iff this Character and parameter 'another' represent the same character.
|
110
|
-
# [another] any Object. The way the equality is tested depends on the another's class
|
111
|
-
# Example:
|
112
|
-
# newOne = Character.new(?\u03a3)
|
113
|
-
# newOne == newOne # true. Identity
|
114
|
-
# newOne == Character.new(?\u03a3) # true. Both have same codepoint
|
115
|
-
# newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
|
116
|
-
# newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
|
117
|
-
# Will test equality with any Object that knows the to_s method
|
118
|
-
def ==(another)
|
119
|
-
result = case another
|
120
|
-
when Character
|
121
|
-
self.to_str == another.to_str
|
122
|
-
|
123
|
-
when Integer
|
124
|
-
self.codepoint == another
|
125
|
-
|
126
|
-
when String
|
127
|
-
(another.size > 1) ? false : self.to_str == another
|
128
|
-
|
129
|
-
else
|
130
|
-
# Unknown type: try with a convertion
|
131
|
-
self == another.to_s() # Recursive call
|
132
|
-
end
|
133
|
-
|
134
|
-
return result
|
135
|
-
end
|
136
|
-
|
137
|
-
# Return a plain English description of the character
|
138
|
-
def explain()
|
139
|
-
return "the character '#{to_str()}'"
|
140
|
-
end
|
141
|
-
|
142
|
-
protected
|
143
|
-
|
144
|
-
# Conversion method re-definition.
|
145
|
-
# Purpose: Return the String representation of the expression.
|
146
|
-
# If the Character was initially from a text (the lexeme), then the lexeme is returned back.
|
147
|
-
# Otherwise the character corresponding to the codepoint is returned.
|
148
|
-
def text_repr()
|
149
|
-
if lexeme.nil?
|
150
|
-
result = char()
|
151
|
-
else
|
152
|
-
result = lexeme.dup()
|
153
|
-
end
|
154
|
-
|
155
|
-
return result
|
156
|
-
end
|
157
|
-
|
158
|
-
private
|
159
|
-
# Convertion method that returns a codepoint for the given two characters (digram) escape sequence.
|
160
|
-
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
161
|
-
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
|
162
|
-
# Any other escape sequence will return the codepoint of the escaped character.
|
163
|
-
# [aDigram] A sequence of two characters that starts with a backslash.
|
164
|
-
def self.digram2codepoint(aDigram)
|
165
|
-
# Check that the digram is a special escape sequence
|
166
|
-
result = DigramSequences.fetch(aDigram, nil)
|
167
|
-
|
168
|
-
# If it not a special sequence, then escaped character is considered literally (the backslash is 'dummy')
|
169
|
-
result = char2codepoint(aDigram[-1]) if result.nil?
|
170
|
-
return result
|
171
|
-
end
|
172
|
-
|
173
|
-
# Convertion method that returns a codepoint for the given complex escape sequence.
|
174
|
-
# [anEscapeSequence] A String with the format:
|
175
|
-
# \uXXXX where XXXX is a 4 hex digits integer value,
|
176
|
-
# \u{X...} X 1 or more hex digits
|
177
|
-
# \ooo (1..3 octal digits literal)
|
178
|
-
# \xXX (1..2 hex digits literal)
|
179
|
-
def self.esc_number2codepoint(anEscapeSequence)
|
180
|
-
# Next line requires Ruby >= 1.9
|
181
|
-
unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
|
182
|
-
raise StandardError, "Unsupported escape sequence #{anEscapeSequence}."
|
183
|
-
else
|
184
|
-
#shorterSeq = anEscapeSequence[1..-1] # Remove the backslash
|
185
|
-
|
186
|
-
# Octal literal case?
|
187
|
-
return octal.oct() if octal # shorterSeq =~ /[0-7]{1,3}/
|
188
|
-
|
189
|
-
# Extract the hexadecimal number
|
190
|
-
hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
|
191
|
-
return hexliteral.hex()
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
end # class
|
28
|
+
MetaChars = '\^$+?.'.freeze
|
196
29
|
|
30
|
+
# The integer value that uniquely identifies the character.
|
31
|
+
attr_reader(:codepoint)
|
32
|
+
|
33
|
+
# The initial text representation of the character (if any).
|
34
|
+
attr_reader(:lexeme)
|
35
|
+
|
36
|
+
# Constructor.
|
37
|
+
# [aValue] Initialize the character with either a String literal or a
|
38
|
+
# codepoint value.
|
39
|
+
# Examples:
|
40
|
+
# Initializing with codepoint value...
|
41
|
+
# RegAn::Character.new(0x3a3) # Represents: Σ
|
42
|
+
# (Unicode GREEK CAPITAL LETTER SIGMA)
|
43
|
+
# RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
|
44
|
+
#
|
45
|
+
# Initializing with a single character string
|
46
|
+
# RegAn::Character.new(?\u03a3) # Also represents: Σ
|
47
|
+
# RegAn::Character.new('Σ') # Obviously, represents a Σ
|
48
|
+
#
|
49
|
+
# Initializing with an escape sequence string
|
50
|
+
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
51
|
+
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
|
52
|
+
# \f (form feed, 0xC)
|
53
|
+
# \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
|
54
|
+
# \xXX (hex)
|
55
|
+
# Any other escaped character will be treated as a literal character
|
56
|
+
# RegAn::Character.new('\n') # Represents a newline
|
57
|
+
# RegAn::Character.new('\u03a3') # Represents a Σ
|
58
|
+
def initialize(aValue)
|
59
|
+
case aValue
|
60
|
+
when String
|
61
|
+
if aValue.size == 1
|
62
|
+
# Literal single character case...
|
63
|
+
@codepoint = self.class.char2codepoint(aValue)
|
64
|
+
else
|
65
|
+
# Should be an escape sequence...
|
66
|
+
@codepoint = self.class.esc2codepoint(aValue)
|
67
|
+
end
|
68
|
+
@lexeme = aValue
|
69
|
+
|
70
|
+
when Integer
|
71
|
+
@codepoint = aValue
|
72
|
+
else
|
73
|
+
raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# Conversion method that returns a character given a codepoint (integer) value.
|
78
|
+
# Example:
|
79
|
+
# RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (
|
80
|
+
# The Unicode GREEK CAPITAL LETTER SIGMA)
|
81
|
+
def self.codepoint2char(aCodepoint)
|
82
|
+
return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
|
83
|
+
end
|
84
|
+
|
85
|
+
# Conversion method that returns the codepoint for the given single character.
|
86
|
+
# Example:
|
87
|
+
# RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
|
88
|
+
def self.char2codepoint(aChar)
|
89
|
+
return aChar.ord
|
90
|
+
end
|
91
|
+
|
92
|
+
# Conversion method that returns the codepoint for the given escape
|
93
|
+
# sequence (a String).
|
94
|
+
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
95
|
+
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
|
96
|
+
# 0xC), \v (vertical tab, 0xB)
|
97
|
+
# \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
|
98
|
+
# \xXX (hex)
|
99
|
+
# Any other escaped character will be treated as a literal character
|
100
|
+
# Example:
|
101
|
+
# RegAn::Character::esc2codepoint('\n') # Returns: 0xa
|
102
|
+
def self.esc2codepoint(anEscapeSequence)
|
103
|
+
msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)."
|
104
|
+
raise StandardError, msg unless anEscapeSequence[0] == "\\"
|
105
|
+
result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
|
106
|
+
|
107
|
+
return result
|
108
|
+
end
|
109
|
+
|
110
|
+
# Return the character as a String object
|
111
|
+
def char()
|
112
|
+
self.class.codepoint2char(@codepoint)
|
113
|
+
end
|
114
|
+
|
115
|
+
# Returns true iff this Character and parameter 'other' represent the same character.
|
116
|
+
# [other] any Object. The way the equality is tested depends on the class of other
|
117
|
+
# Example:
|
118
|
+
# newOne = Character.new(?\u03a3)
|
119
|
+
# newOne == newOne # true. Identity
|
120
|
+
# newOne == Character.new(?\u03a3) # true. Both have same codepoint
|
121
|
+
# newOne == ?\u03a3 # true. The single character String matches exactly the char attribute.
|
122
|
+
# newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
|
123
|
+
# Will test equality with any Object that knows the to_s method
|
124
|
+
def ==(other)
|
125
|
+
result = case other
|
126
|
+
when Character
|
127
|
+
self.to_str == other.to_str
|
128
|
+
|
129
|
+
when Integer
|
130
|
+
self.codepoint == other
|
131
|
+
|
132
|
+
when String
|
133
|
+
other.size > 1 ? false : to_str == other
|
134
|
+
|
135
|
+
else
|
136
|
+
# Unknown type: try with a conversion
|
137
|
+
self == other.to_s # Recursive call
|
138
|
+
end
|
139
|
+
|
140
|
+
return result
|
141
|
+
end
|
142
|
+
|
143
|
+
# Return a plain English description of the character
|
144
|
+
def explain()
|
145
|
+
return "the character '#{to_str}'"
|
146
|
+
end
|
147
|
+
|
148
|
+
protected
|
149
|
+
|
150
|
+
# Conversion method re-definition.
|
151
|
+
# Purpose: Return the String representation of the expression.
|
152
|
+
# If the Character was initially from a text (the lexeme), then the lexeme
|
153
|
+
# is returned back.
|
154
|
+
# Otherwise the character corresponding to the codepoint is returned.
|
155
|
+
def text_repr()
|
156
|
+
return char if lexeme.nil?
|
157
|
+
return lexeme.dup
|
158
|
+
end
|
159
|
+
|
160
|
+
# Conversion method that returns a codepoint for the given two characters
|
161
|
+
# (digram) escape sequence.
|
162
|
+
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
163
|
+
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
|
164
|
+
# \f (form feed, 0xC), \v (vertical tab, 0xB)
|
165
|
+
# Any other escape sequence will return the codepoint of the escaped
|
166
|
+
# character.
|
167
|
+
# [aDigram] A sequence of two characters that starts with a backslash.
|
168
|
+
def self.digram2codepoint(aDigram)
|
169
|
+
# Check that the digram is a special escape sequence
|
170
|
+
result = DigramSequences.fetch(aDigram, nil)
|
171
|
+
|
172
|
+
# If it is not a special sequence, then the escaped character is
|
173
|
+
# considered literally (the backslash is 'dummy')
|
174
|
+
result = char2codepoint(aDigram[-1]) if result.nil?
|
175
|
+
return result
|
176
|
+
end
|
177
|
+
|
178
|
+
private_class_method :digram2codepoint
|
179
|
+
|
180
|
+
# Conversion method that returns a codepoint for the given complex
|
181
|
+
# escape sequence.
|
182
|
+
# [anEscapeSequence] A String with the format:
|
183
|
+
# \uXXXX where XXXX is a 4 hex digits integer value,
|
184
|
+
# \u{X...} X 1 or more hex digits
|
185
|
+
# \ooo (1..3 octal digits literal)
|
186
|
+
# \xXX (1..2 hex digits literal)
|
187
|
+
def self.esc_number2codepoint(anEscapeSequence)
|
188
|
+
unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
|
189
|
+
raise StandardError, "Unsupported escape sequence #{anEscapeSequence}."
|
190
|
+
else
|
191
|
+
# Octal literal case?
|
192
|
+
return octal.oct if octal # shorterSeq =~ /[0-7]{1,3}/
|
193
|
+
|
194
|
+
# Extract the hexadecimal number
|
195
|
+
hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
|
196
|
+
return hexliteral.hex
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
private_class_method :esc_number2codepoint
|
201
|
+
end # class
|
197
202
|
end # module
|
198
203
|
|
199
|
-
# End of file
|
204
|
+
# End of file
|
@@ -1,60 +1,57 @@
|
|
1
1
|
# File: compound_expression.rb
|
2
2
|
|
3
|
-
require_relative
|
3
|
+
require_relative 'expression' # Access the superclass
|
4
4
|
|
5
5
|
module Regex # This module is used as a namespace
|
6
|
+
# Abstract class. An element that is part of a regular expression &
|
7
|
+
# that has its own child sub-expressions.
|
8
|
+
class CompoundExpression < Expression
|
9
|
+
# Redefined method. Return false since it may have one or more children.
|
10
|
+
def atomic?
|
11
|
+
return false
|
12
|
+
end
|
6
13
|
|
7
|
-
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
if top.kind_of?(Array)
|
30
|
-
if top.empty?
|
31
|
-
next
|
32
|
-
else
|
33
|
-
currChild = top.pop()
|
34
|
-
visit_stack.push top
|
35
|
-
end
|
36
|
-
else
|
37
|
-
currChild = top
|
38
|
-
end
|
39
|
-
|
40
|
-
result << currChild # Return the visited child
|
41
|
-
|
42
|
-
unless currChild.atomic?
|
43
|
-
children_to_enqueue = currChild.children.reverse() # in-order traversal implies LIFO queue
|
44
|
-
visit_stack.push(children_to_enqueue)
|
45
|
-
end
|
46
|
-
end until visit_stack.empty?
|
47
|
-
end
|
48
|
-
end
|
49
|
-
=end
|
14
|
+
=begin
|
15
|
+
# Build a depth-first in-order children visitor.
|
16
|
+
# The visitor is implemented as an Enumerator.
|
17
|
+
def df_visitor()
|
18
|
+
root = children # The visit will start from the children of this object
|
19
|
+
|
20
|
+
visitor = Enumerator.new do |result| # result is a Yielder
|
21
|
+
# Initialization part: will run once
|
22
|
+
visit_stack = [ root ] # The LIFO queue of nodes to visit
|
23
|
+
|
24
|
+
begin # Traversal part (as a loop)
|
25
|
+
top = visit_stack.pop()
|
26
|
+
if top.kind_of?(Array)
|
27
|
+
if top.empty?
|
28
|
+
next
|
29
|
+
else
|
30
|
+
currChild = top.pop()
|
31
|
+
visit_stack.push top
|
32
|
+
end
|
33
|
+
else
|
34
|
+
currChild = top
|
35
|
+
end
|
50
36
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
37
|
+
result << currChild # Return the visited child
|
38
|
+
|
39
|
+
unless currChild.atomic?
|
40
|
+
children_to_enqueue = currChild.children.reverse() # in-order traversal implies LIFO queue
|
41
|
+
visit_stack.push(children_to_enqueue)
|
42
|
+
end
|
43
|
+
end until visit_stack.empty?
|
44
|
+
end
|
45
|
+
end
|
46
|
+
=end
|
55
47
|
|
56
|
-
|
48
|
+
protected
|
57
49
|
|
50
|
+
# Abstract method. Return the text representation of the child (if any)
|
51
|
+
def all_child_text()
|
52
|
+
abstract_method
|
53
|
+
end
|
54
|
+
end # class
|
58
55
|
end # module
|
59
56
|
|
60
|
-
# End of file
|
57
|
+
# End of file
|