rley 0.5.08 → 0.5.09
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/examples/general/SRL/lib/ast_builder.rb +74 -78
- data/examples/general/SRL/lib/grammar.rb +11 -3
- data/examples/general/SRL/lib/regex/abstract_method.rb +35 -0
- data/examples/general/SRL/lib/regex/atomic_expression.rb +21 -0
- data/examples/general/SRL/lib/regex/char_class.rb +34 -0
- data/examples/general/SRL/lib/regex/char_range.rb +50 -0
- data/examples/general/SRL/lib/regex/character.rb +195 -0
- data/examples/general/SRL/lib/regex/compound_expression.rb +60 -0
- data/examples/general/SRL/lib/regex/expression.rb +42 -0
- data/examples/general/SRL/lib/regex/monadic_expression.rb +31 -0
- data/examples/general/SRL/lib/regex/polyadic_expression.rb +64 -0
- data/examples/general/SRL/lib/regex/quantifiable.rb +28 -0
- data/examples/general/SRL/lib/regex/repetition.rb +31 -0
- data/examples/general/SRL/lib/regex_repr.rb +5 -1
- data/examples/general/SRL/lib/tokenizer.rb +8 -5
- data/examples/general/SRL/spec/integration_spec.rb +64 -41
- data/examples/general/SRL/spec/regex/character_spec.rb +159 -0
- data/examples/general/SRL/spec/tokenizer_spec.rb +18 -4
- data/examples/general/SRL/srl_demo.rb +14 -4
- data/lib/rley/constants.rb +1 -1
- metadata +14 -3
- data/examples/general/SRL/lib/srl_demo.rb +0 -67
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d95e1ed6a38d8e1ed70e456f46c812275a1d4530
|
4
|
+
data.tar.gz: fcf7b54ff98d107fa38239139db10e7bbf1f7825
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e185aa4e7ca59e5995f6e87379efd4f921d4242307abd55718e46bf346e7414c487b56f753f446d8518ffab74e783774aaa2adc9b7fcf8a014fdba96b8da090d
|
7
|
+
data.tar.gz: 5c990777d79432c813c92dda0b21e7af3dcf587485b868e9723d30e3d1c1b3b2d6e1bc8dc7235fb1ba32c1b27ddeb6f829ef3e1b7b9248d5be1199f27e153849
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
-
### 0.5.
|
1
|
+
### 0.5.09 / 2017-12-02
|
2
|
+
* [CHANGE] Dir `examples/general/SRL/` Added support for letter range to Simple Regex Language parser.
|
3
|
+
|
4
|
+
### 0.5.08 / 2017-11-28
|
5
|
+
* [NEW] Dir `examples/general/SRL/` Added an initial version of the Simple Regex Language parser.
|
6
|
+
Supports the SRL quantifier syntax only.
|
7
|
+
* [FIX] Method `ParseTreeBuilder::place_TOS_child` was sometimes fooled when argument `aNode` was nil.
|
2
8
|
* [FIX] Method `BaseParser::initialize` missing parameter name in doc caused a YARD warning.
|
3
9
|
* [FIX] Method `GrmItemsBuilder::build_dotted_items` missing parameter name in doc caused a YARD warning.
|
4
10
|
* [FIX] Method `NonTerminalVertex::initialize` missing parameter name in doc caused a YARD warning.
|
@@ -35,9 +35,30 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
35
35
|
# @param theChildren [Array] Children nodes (one per rhs symbol)
|
36
36
|
def new_parent_node(aProduction, aRange, theTokens, theChildren)
|
37
37
|
node = case aProduction.name
|
38
|
-
when 'srl_0' # rule 'srl' => '
|
38
|
+
when 'srl_0' # rule 'srl' => 'term'
|
39
39
|
return_first_child(aRange, theTokens, theChildren)
|
40
40
|
|
41
|
+
when 'term_0' # rule 'term' => 'atom'
|
42
|
+
return_first_child(aRange, theTokens, theChildren)
|
43
|
+
|
44
|
+
when 'term_1' # rule 'term' => %w[atom quantifier]
|
45
|
+
reduce_term_1(aProduction, aRange, theTokens, theChildren)
|
46
|
+
|
47
|
+
when 'atom_0' #rule 'atom' => 'letter_range'
|
48
|
+
return_first_child(aRange, theTokens, theChildren)
|
49
|
+
|
50
|
+
when 'letter_range_0' # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
51
|
+
reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
|
52
|
+
|
53
|
+
when 'letter_range_1' #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
54
|
+
reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
|
55
|
+
|
56
|
+
when 'letter_range_2' # rule 'letter_range' => 'LETTER'
|
57
|
+
reduce_letter_range_2(aProduction, aRange, theTokens, theChildren)
|
58
|
+
|
59
|
+
when 'letter_range_3' # rule 'letter_range' => %w[UPPERCASE LETTER]
|
60
|
+
reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
|
61
|
+
|
41
62
|
when 'quantifier_0' # rule 'quantifier' => 'ONCE'
|
42
63
|
multiplicity(1, 1)
|
43
64
|
|
@@ -78,99 +99,74 @@ class ASTBuilder < Rley::Parser::ParseTreeBuilder
|
|
78
99
|
return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
|
79
100
|
end
|
80
101
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
# rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
|
88
|
-
def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
|
89
|
-
upper = theChildren[3].token.lexeme.to_i
|
90
|
-
# lower = theChildren[1].token.lexeme.to_i
|
91
|
-
multiplicity(3, upper)
|
92
|
-
end
|
93
|
-
|
94
|
-
# rule 'quantifier' => %w[AT LEAST count TIMES]
|
95
|
-
def reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
|
96
|
-
count = theChildren[2].token.lexeme.to_i
|
97
|
-
multiplicity(count, :more)
|
102
|
+
def char_range(lowerBound, upperBound)
|
103
|
+
# TODO fix module nesting
|
104
|
+
lower = Regex::Character.new(lowerBound)
|
105
|
+
upper = Regex::Character.new(upperBound)
|
106
|
+
return Regex::CharRange.new(lower, upper)
|
98
107
|
end
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
def reduce_binary_operator(theChildren)
|
103
|
-
operator_node = theChildren[1]
|
104
|
-
operator_node.children << theChildren[0]
|
105
|
-
operator_node.children << theChildren[2]
|
106
|
-
return operator_node
|
108
|
+
|
109
|
+
def char_class(toNegate, *theChildren)
|
110
|
+
Regex::CharClass.new(toNegate, *theChildren)
|
107
111
|
end
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
reduce_binary_operator(theChildren)
|
112
|
+
|
113
|
+
def repetition(expressionToRepeat, aMultiplicity)
|
114
|
+
return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
|
112
115
|
end
|
113
116
|
|
114
|
-
# rule 'term' => %w[
|
115
|
-
def reduce_term_1(
|
116
|
-
|
117
|
+
# rule 'term' => %w[atom quantifier]
|
118
|
+
def reduce_term_1(aProduction, aRange, theTokens, theChildren)
|
119
|
+
quantifier = theChildren.last
|
120
|
+
atom = theChildren.first
|
121
|
+
repetition(atom, quantifier)
|
117
122
|
end
|
118
123
|
|
119
|
-
# rule '
|
120
|
-
def
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
return result
|
124
|
+
# rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
125
|
+
def reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
|
126
|
+
lower = theChildren[2].token.lexeme
|
127
|
+
upper = theChildren[4].token.lexeme
|
128
|
+
ch_range = char_range(lower, upper)
|
129
|
+
char_class(false, ch_range)
|
126
130
|
end
|
127
|
-
|
128
|
-
# rule '
|
129
|
-
def
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
theChildren[1]
|
135
|
-
end
|
136
|
-
|
137
|
-
return result
|
131
|
+
|
132
|
+
# rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
133
|
+
def reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
|
134
|
+
lower = theChildren[3].token.lexeme
|
135
|
+
upper = theChildren[5].token.lexeme
|
136
|
+
ch_range = char_range(lower.upcase, upper.upcase)
|
137
|
+
char_class(false, ch_range)
|
138
138
|
end
|
139
|
-
|
140
|
-
# rule '
|
141
|
-
def
|
142
|
-
|
143
|
-
|
144
|
-
func.children << theChildren[1]
|
145
|
-
return func
|
139
|
+
|
140
|
+
# rule 'letter_range' => 'LETTER'
|
141
|
+
def reduce_letter_range_2(aProduction, aRange, theTokens, theChildren)
|
142
|
+
ch_range = char_range('a', 'z')
|
143
|
+
char_class(false, ch_range)
|
146
144
|
end
|
147
|
-
|
148
|
-
#
|
149
|
-
def
|
150
|
-
|
151
|
-
|
152
|
-
return negation
|
145
|
+
|
146
|
+
#rule 'letter_range' => %w[UPPERCASE LETTER]
|
147
|
+
def reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
|
148
|
+
ch_range = char_range('A', 'Z')
|
149
|
+
char_class(false, ch_range)
|
153
150
|
end
|
154
151
|
|
155
|
-
# rule '
|
156
|
-
def
|
157
|
-
|
152
|
+
# rule 'quantifier' => %w[EXACTLY count TIMES]
|
153
|
+
def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
|
154
|
+
count = theChildren[1].token.lexeme.to_i
|
155
|
+
multiplicity(count, count)
|
158
156
|
end
|
159
157
|
|
160
|
-
# rule '
|
161
|
-
def
|
162
|
-
|
158
|
+
# rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
|
159
|
+
def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
|
160
|
+
lower = theChildren[1].token.lexeme.to_i
|
161
|
+
upper = theChildren[3].token.lexeme.to_i
|
162
|
+
multiplicity(lower, upper)
|
163
163
|
end
|
164
164
|
|
165
|
-
# rule '
|
166
|
-
def
|
167
|
-
|
165
|
+
# rule 'quantifier' => %w[AT LEAST count TIMES]
|
166
|
+
def reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
|
167
|
+
count = theChildren[2].token.lexeme.to_i
|
168
|
+
multiplicity(count, :more)
|
168
169
|
end
|
169
170
|
|
170
|
-
# rule 'mul_operator' => 'DIVIDE'
|
171
|
-
def reduce_mul_operator_1(_production, aRange, _tokens, theChildren)
|
172
|
-
return CalcDivideNode.new(theChildren[0].symbol, aRange)
|
173
|
-
end
|
174
|
-
=end
|
175
171
|
end # class
|
176
172
|
# End of file
|
@@ -6,13 +6,21 @@ module SRL
|
|
6
6
|
# This is a very partial grammar of SRL.
|
7
7
|
# It will be expanded with the coming versions of Rley
|
8
8
|
builder = Rley::Syntax::GrammarBuilder.new do
|
9
|
-
add_terminals('
|
9
|
+
add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
|
10
|
+
add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
|
10
11
|
add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
|
11
12
|
add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
|
12
13
|
add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
|
13
14
|
|
14
15
|
# For the moment one focuses on quantifier syntax only...
|
15
|
-
rule 'srl' => '
|
16
|
+
rule 'srl' => 'term'
|
17
|
+
rule 'term' => 'atom'
|
18
|
+
rule 'term' => %w[atom quantifier]
|
19
|
+
rule 'atom' => 'letter_range'
|
20
|
+
rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
21
|
+
rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
22
|
+
rule 'letter_range' => 'LETTER'
|
23
|
+
rule 'letter_range' => %w[UPPERCASE LETTER]
|
16
24
|
rule 'quantifier' => 'ONCE'
|
17
25
|
rule 'quantifier' => 'TWICE'
|
18
26
|
rule 'quantifier' => %w[EXACTLY count TIMES]
|
@@ -21,7 +29,7 @@ module SRL
|
|
21
29
|
rule 'quantifier' => %w[ONCE OR MORE]
|
22
30
|
rule 'quantifier' => %w[NEVER OR MORE]
|
23
31
|
rule 'quantifier' => %w[AT LEAST count TIMES]
|
24
|
-
rule 'count' => '
|
32
|
+
rule 'count' => 'DIGIT_LIT'
|
25
33
|
rule 'count' => 'INTEGER'
|
26
34
|
rule 'times_suffix' => 'TIMES'
|
27
35
|
rule 'times_suffix' => []
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# File: abstract_method.rb
|
2
|
+
|
3
|
+
# Mix-in module. Provides the method 'abstract_method' that raises an exception
|
4
|
+
# with an appropriate message when called.
|
5
|
+
module AbstractMethod
|
6
|
+
public
|
7
|
+
|
8
|
+
# Call this method in the body of your abstract methods.
|
9
|
+
# Example:
|
10
|
+
# require 'AbstractMethod'
|
11
|
+
# class SomeClass
|
12
|
+
# include AbstractMethod # To add the behaviour from the mix-in module AbstractMethod
|
13
|
+
# ...
|
14
|
+
# Consider that SomeClass has an abstract method called 'some_method'
|
15
|
+
#
|
16
|
+
# def some_method() abstract_method
|
17
|
+
# end
|
18
|
+
def abstract_method()
|
19
|
+
# Determine the short class name of self
|
20
|
+
className = self.class.name.split(/::/).last
|
21
|
+
|
22
|
+
# Retrieve the top text line of the call stack
|
23
|
+
top_line = caller.first
|
24
|
+
|
25
|
+
# Extract the calling method name
|
26
|
+
callerNameInQuotes = top_line.scan(/`.+?$/).first
|
27
|
+
callerName = callerNameInQuotes.gsub(/`|'/, '') # Remove enclosing quotes
|
28
|
+
|
29
|
+
# Build the error message
|
30
|
+
error_message = "The method #{className}##{callerName} is abstract. It should be implemented in subclasses of #{className}."
|
31
|
+
raise NotImplementedError, error_message
|
32
|
+
end
|
33
|
+
end # module
|
34
|
+
|
35
|
+
# End of file
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# File: atomic_expression.rb
|
2
|
+
|
3
|
+
require_relative "expression" # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# Abstract class. A valid regular expression that
|
8
|
+
# cannot be further decomposed into sub-expressions.
|
9
|
+
class AtomicExpression < Expression
|
10
|
+
|
11
|
+
public
|
12
|
+
# Redefined method. Return true since it may not have any child.
|
13
|
+
def atomic?
|
14
|
+
return true
|
15
|
+
end
|
16
|
+
|
17
|
+
end # class
|
18
|
+
|
19
|
+
end # module
|
20
|
+
|
21
|
+
# End of file
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# File: char_class.rb
|
2
|
+
|
3
|
+
require_relative "polyadic_expression" # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# A n-ary matching operator that represents a character class.
|
8
|
+
# It succeeds when any one of its child expressions matches
|
9
|
+
# the subject text.
|
10
|
+
class CharClass < PolyadicExpression
|
11
|
+
# A flag that indicates whether the character is negated
|
12
|
+
attr_reader(:negated)
|
13
|
+
|
14
|
+
# Constructor.
|
15
|
+
def initialize(to_negate,*theChildren)
|
16
|
+
super(theChildren)
|
17
|
+
@negated = to_negate
|
18
|
+
end
|
19
|
+
|
20
|
+
public
|
21
|
+
# Conversion method re-definition.
|
22
|
+
# Purpose: Return the String representation of the character class (e.g. '[a-z]' or '[^a-z]').
|
23
|
+
def to_str()
|
24
|
+
result_children = children.inject('') { |subResult, aChild| subResult << aChild.to_str() }
|
25
|
+
result = '['+ (negated ? '^' : '') + result_children + ']'
|
26
|
+
|
27
|
+
return result
|
28
|
+
end
|
29
|
+
|
30
|
+
end # class
|
31
|
+
|
32
|
+
end # module
|
33
|
+
|
34
|
+
# End of file
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# File: char_range.rb
|
2
|
+
|
3
|
+
require_relative 'polyadic_expression' # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# A binary expression that represents a contiguous range of characters.
|
8
|
+
# Assumption: characters are ordered by codepoint
|
9
|
+
class CharRange < PolyadicExpression
|
10
|
+
|
11
|
+
# Constructor.
|
12
|
+
# [theLowerBound] A character that will be the lower bound value for the range.
|
13
|
+
# [theUpperBound] A character that will be the upper bound value for the range.
|
14
|
+
# TODO: optimisation. Build a Character if lower bound == upper bound.
|
15
|
+
def initialize(theLowerBound, theUpperBound)
|
16
|
+
range = validated_range(theLowerBound, theUpperBound)
|
17
|
+
super(range)
|
18
|
+
end
|
19
|
+
|
20
|
+
public
|
21
|
+
# Return the lower bound of the range.
|
22
|
+
def lower()
|
23
|
+
return children.first
|
24
|
+
end
|
25
|
+
|
26
|
+
# Return the upper bound of the range.
|
27
|
+
def upper()
|
28
|
+
return children.last
|
29
|
+
end
|
30
|
+
|
31
|
+
# Conversion method re-definition.
|
32
|
+
# Purpose: Return the String representation of the range (lower bound, '-', upper bound).
|
33
|
+
def to_str()
|
34
|
+
result = lower.to_str() + '-' + upper.to_str()
|
35
|
+
|
36
|
+
return result
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
# Validation method. Returns the pair of Characters after their validation.
|
41
|
+
def validated_range(theLowerBound, theUpperBound)
|
42
|
+
raise StandardError, "Character range error: lower bound is greater than upper bound." if theLowerBound.codepoint > theUpperBound.codepoint
|
43
|
+
return [theLowerBound, theUpperBound]
|
44
|
+
end
|
45
|
+
|
46
|
+
end # class
|
47
|
+
|
48
|
+
end # module
|
49
|
+
|
50
|
+
# End of file
|
@@ -0,0 +1,195 @@
|
|
1
|
+
# File: character.rb
|
2
|
+
|
3
|
+
require_relative 'atomic_expression' # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# A regular expression that matches a specific character in a given character set
|
8
|
+
class Character < AtomicExpression
|
9
|
+
# Constant with all special 2-characters escape sequences
|
10
|
+
DigramSequences = {
|
11
|
+
"\\a" => 0x7, # alarm
|
12
|
+
"\\n" => 0xA, # newline
|
13
|
+
"\\r" => 0xD, # carriage return
|
14
|
+
"\\t" => 0x9, # tab
|
15
|
+
"\\e" => 0x1B, # escape
|
16
|
+
"\\f" => 0xC, # form feed
|
17
|
+
"\\v" => 0xB, # vertical feed
|
18
|
+
# Single octal digit literals
|
19
|
+
"\\0" => 0,
|
20
|
+
"\\1" => 1,
|
21
|
+
"\\2" => 2,
|
22
|
+
"\\3" => 3,
|
23
|
+
"\\4" => 4,
|
24
|
+
"\\5" => 5,
|
25
|
+
"\\6" => 6,
|
26
|
+
"\\7" => 7
|
27
|
+
}
|
28
|
+
|
29
|
+
# The integer value that uniquely identifies the character.
|
30
|
+
attr_reader(:codepoint)
|
31
|
+
|
32
|
+
# The initial text representation of the character (if any).
|
33
|
+
attr_reader(:lexeme)
|
34
|
+
|
35
|
+
# Constructor.
|
36
|
+
# [aValue] Initialize the character with either a String literal or a codepoint value.
|
37
|
+
# Examples:
|
38
|
+
# Initializing with codepoint value...
|
39
|
+
# RegAn::Character.new(0x3a3) # Represents: Σ (Unicode GREEK CAPITAL LETTER SIGMA)
|
40
|
+
# RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
|
41
|
+
#
|
42
|
+
# Initializing with a single character string
|
43
|
+
# RegAn::Character.new(?\u03a3) # Also represents: Σ
|
44
|
+
# RegAn::Character.new('Σ') # Obviously, represents a Σ
|
45
|
+
#
|
46
|
+
# Initializing with an escape sequence string
|
47
|
+
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
48
|
+
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC)
|
49
|
+
# \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex)
|
50
|
+
# Any other escaped character will be treated as a literal character
|
51
|
+
# RegAn::Character.new('\n') # Represents a newline
|
52
|
+
# RegAn::Character.new('\u03a3') # Represents a Σ
|
53
|
+
def initialize(aValue)
|
54
|
+
|
55
|
+
case aValue
|
56
|
+
when String
|
57
|
+
if aValue.size == 1
|
58
|
+
# Literal single character case...
|
59
|
+
@codepoint = self.class.char2codepoint(aValue)
|
60
|
+
else
|
61
|
+
# Should be an escape sequence...
|
62
|
+
@codepoint = self.class.esc2codepoint(aValue)
|
63
|
+
end
|
64
|
+
@lexeme = aValue
|
65
|
+
|
66
|
+
when Fixnum
|
67
|
+
@codepoint = aValue
|
68
|
+
else
|
69
|
+
raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
public
|
74
|
+
# Conversion method that returns a character given a codepoint (integer) value.
|
75
|
+
# Example:
|
76
|
+
# RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (The Unicode GREEK CAPITAL LETTER SIGMA)
|
77
|
+
def self.codepoint2char(aCodepoint)
|
78
|
+
return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
|
79
|
+
end
|
80
|
+
|
81
|
+
# Conversion method that returns the codepoint for the given single character.
|
82
|
+
# Example:
|
83
|
+
# RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
|
84
|
+
def self.char2codepoint(aChar)
|
85
|
+
return aChar.ord()
|
86
|
+
end
|
87
|
+
|
88
|
+
# Conversion method that returns the codepoint for the given escape sequence (a String).
|
89
|
+
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
90
|
+
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
|
91
|
+
# \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal) \xXX (hex)
|
92
|
+
# Any other escaped character will be treated as a literal character
|
93
|
+
# Example:
|
94
|
+
# RegAn::Character::esc2codepoint('\n') # Returns: 0xa
|
95
|
+
def self.esc2codepoint(anEscapeSequence)
|
96
|
+
raise StandardError, "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)." unless anEscapeSequence[0] == "\\"
|
97
|
+
result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
|
98
|
+
|
99
|
+
return result
|
100
|
+
end
|
101
|
+
|
102
|
+
# Return the character as a String object
|
103
|
+
def char()
|
104
|
+
self.class.codepoint2char(@codepoint)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Conversion method re-definition.
|
108
|
+
# Purpose: Return the String representation of the expression.
|
109
|
+
# If the Character was initially from a text (the lexeme), then the lexeme is returned back.
|
110
|
+
# Otherwise the character corresponding to the codepoint is returned.
|
111
|
+
def to_str()
|
112
|
+
if lexeme.nil?
|
113
|
+
result = char()
|
114
|
+
else
|
115
|
+
result = lexeme.dup()
|
116
|
+
end
|
117
|
+
|
118
|
+
return result
|
119
|
+
end
|
120
|
+
|
121
|
+
# Returns true iff this Character and parameter 'another' represent the same character.
|
122
|
+
# [another] any Object. The way the equality is tested depends on the another's class
|
123
|
+
# Example:
|
124
|
+
# newOne = Character.new(?\u03a3)
|
125
|
+
# newOne == newOne # true. Identity
|
126
|
+
# newOne == Character.new(?\u03a3) # true. Both have same codepoint
|
127
|
+
# newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
|
128
|
+
# newOne == 0x03a3 # true. The Fixnum is compared to the codepoint value.
|
129
|
+
# Will test equality with any Object that knows the to_s method
|
130
|
+
def ==(another)
|
131
|
+
result = case another
|
132
|
+
when Character
|
133
|
+
self.to_str == another.to_str
|
134
|
+
|
135
|
+
when Fixnum
|
136
|
+
self.codepoint == another
|
137
|
+
|
138
|
+
when String
|
139
|
+
(another.size > 1) ? false : self.to_str == another
|
140
|
+
|
141
|
+
else
|
142
|
+
# Unknown type: try with a conversion
|
143
|
+
self == another.to_s() # Recursive call
|
144
|
+
end
|
145
|
+
|
146
|
+
return result
|
147
|
+
end
|
148
|
+
|
149
|
+
# Return a plain English description of the character
|
150
|
+
def explain()
|
151
|
+
return "the character '#{to_str()}'"
|
152
|
+
end
|
153
|
+
|
154
|
+
private
|
155
|
+
# Conversion method that returns a codepoint for the given two-character (digram) escape sequence.
|
156
|
+
# Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
|
157
|
+
# \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed, 0xC), \v (vertical feed, 0xB)
|
158
|
+
# Any other escape sequence will return the codepoint of the escaped character.
|
159
|
+
# [aDigram] A sequence of two characters that starts with a backslash.
|
160
|
+
def self.digram2codepoint(aDigram)
|
161
|
+
# Check that the digram is a special escape sequence
|
162
|
+
result = DigramSequences.fetch(aDigram, nil)
|
163
|
+
|
164
|
+
# If it is not a special sequence, then the escaped character is taken literally (the backslash is 'dummy')
|
165
|
+
result = char2codepoint(aDigram[-1]) if result.nil?
|
166
|
+
return result
|
167
|
+
end
|
168
|
+
|
169
|
+
# Conversion method that returns a codepoint for the given complex escape sequence.
|
170
|
+
# [anEscapeSequence] A String with the format:
|
171
|
+
# \uXXXX where XXXX is a 4 hex digits integer value,
|
172
|
+
# \u{X...} X 1 or more hex digits
|
173
|
+
# \ooo (1..3 octal digits literal)
|
174
|
+
# \xXX (1..2 hex digits literal)
|
175
|
+
def self.esc_number2codepoint(anEscapeSequence)
|
176
|
+
# Next line requires Ruby >= 1.9
|
177
|
+
unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
|
178
|
+
raise StandardError, "Unsupported escape sequence #{anEscapeSequence}."
|
179
|
+
else
|
180
|
+
#shorterSeq = anEscapeSequence[1..-1] # Remove the backslash
|
181
|
+
|
182
|
+
# Octal literal case?
|
183
|
+
return octal.oct() if octal # shorterSeq =~ /[0-7]{1,3}/
|
184
|
+
|
185
|
+
# Extract the hexadecimal number
|
186
|
+
hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
|
187
|
+
return hexliteral.hex()
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
end # class
|
192
|
+
|
193
|
+
end # module
|
194
|
+
|
195
|
+
# End of file
|