rley 0.6.01 → 0.6.02

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -1
  3. data/examples/NLP/engtagger.rb +58 -60
  4. data/lib/rley/constants.rb +1 -1
  5. metadata +2 -33
  6. data/examples/general/SRL/lib/ast_builder.rb +0 -382
  7. data/examples/general/SRL/lib/grammar.rb +0 -106
  8. data/examples/general/SRL/lib/regex/abstract_method.rb +0 -35
  9. data/examples/general/SRL/lib/regex/alternation.rb +0 -27
  10. data/examples/general/SRL/lib/regex/anchor.rb +0 -45
  11. data/examples/general/SRL/lib/regex/atomic_expression.rb +0 -16
  12. data/examples/general/SRL/lib/regex/capturing_group.rb +0 -51
  13. data/examples/general/SRL/lib/regex/char_class.rb +0 -38
  14. data/examples/general/SRL/lib/regex/char_range.rb +0 -51
  15. data/examples/general/SRL/lib/regex/char_shorthand.rb +0 -50
  16. data/examples/general/SRL/lib/regex/character.rb +0 -204
  17. data/examples/general/SRL/lib/regex/compound_expression.rb +0 -57
  18. data/examples/general/SRL/lib/regex/concatenation.rb +0 -29
  19. data/examples/general/SRL/lib/regex/expression.rb +0 -60
  20. data/examples/general/SRL/lib/regex/lookaround.rb +0 -50
  21. data/examples/general/SRL/lib/regex/match_option.rb +0 -34
  22. data/examples/general/SRL/lib/regex/monadic_expression.rb +0 -28
  23. data/examples/general/SRL/lib/regex/multiplicity.rb +0 -91
  24. data/examples/general/SRL/lib/regex/non_capturing_group.rb +0 -27
  25. data/examples/general/SRL/lib/regex/polyadic_expression.rb +0 -60
  26. data/examples/general/SRL/lib/regex/quantifiable.rb +0 -22
  27. data/examples/general/SRL/lib/regex/repetition.rb +0 -29
  28. data/examples/general/SRL/lib/regex/wildcard.rb +0 -23
  29. data/examples/general/SRL/lib/regex_repr.rb +0 -13
  30. data/examples/general/SRL/lib/tokenizer.rb +0 -147
  31. data/examples/general/SRL/spec/integration_spec.rb +0 -448
  32. data/examples/general/SRL/spec/regex/character_spec.rb +0 -166
  33. data/examples/general/SRL/spec/regex/multiplicity_spec.rb +0 -79
  34. data/examples/general/SRL/spec/spec_helper.rb +0 -25
  35. data/examples/general/SRL/spec/tokenizer_spec.rb +0 -148
  36. data/examples/general/SRL/srl_demo.rb +0 -75
@@ -1,106 +0,0 @@
1
- # Grammar for SRL (Simple Regex Language)
2
- require 'rley' # Load the gem
3
- module SRL
4
- ########################################
5
- # Work in progress.
6
- # This is a very partial grammar of SRL.
7
- # It will be expanded with the coming versions of Rley
8
- builder = Rley::Syntax::GrammarBuilder.new do
9
- add_terminals('LPAREN', 'RPAREN', 'COMMA')
10
- add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
11
- add_terminals('LITERALLY', 'STRING_LIT')
12
- add_terminals('BEGIN', 'STARTS', 'WITH')
13
- add_terminals('MUST', 'END')
14
- add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
15
- add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
16
- add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
17
- add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
18
- add_terminals('OF', 'ONE')
19
- add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
20
- add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
21
- add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
22
- add_terminals('IF', 'FOLLOWED', 'BY', 'NOT')
23
- add_terminals('ALREADY', 'HAD')
24
- add_terminals('CAPTURE', 'AS', 'UNTIL')
25
- add_terminals('CASE', 'INSENSITIVE', 'MULTI', 'ALL')
26
- add_terminals('LAZY')
27
-
28
- rule('srl' => 'expression').as 'start_rule'
29
- rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
30
- rule('expression' => 'pattern').as 'simple_expr'
31
- rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
32
- rule('pattern' => 'quantifiable').as 'basic_pattern'
33
- rule('separator' => 'COMMA').as 'comma_separator'
34
- rule('separator' => []).as 'void_separator'
35
- rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
36
- rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
37
- rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
38
- rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
39
- rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as 'pinned_quantifiable'
40
- rule('quantifiable' => %w[begin_anchor anchorable]).as 'begin_anchor_quantifiable'
41
- rule('quantifiable' => %w[anchorable end_anchor]).as 'end_anchor_quantifiable'
42
- rule('quantifiable' => 'anchorable').as 'simple_quantifiable'
43
- rule('begin_anchor' => %w[STARTS WITH]).as 'starts_with'
44
- rule('begin_anchor' => %w[BEGIN WITH]).as 'begin_with'
45
- rule('end_anchor' => %w[MUST END]).as 'end_anchor'
46
- rule('anchorable' => 'assertable').as 'simple_anchorable'
47
- rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
48
- rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
49
- rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
50
- rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
51
- rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
52
- rule('assertable' => 'term').as 'simple_assertable'
53
- rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
54
- rule('term' => 'atom').as 'atom_term'
55
- rule('term' => 'alternation').as 'alternation_term'
56
- rule('term' => 'grouping').as 'grouping_term'
57
- rule('term' => 'capturing_group').as 'capturing_group_atom'
58
- rule('atom' => 'letter_range').as 'letter_range_atom'
59
- rule('atom' => 'digit_range').as 'digit_range_atom'
60
- rule('atom' => 'character_class').as 'character_class_atom'
61
- rule('atom' => 'special_char').as 'special_char_atom'
62
- rule('atom' => 'literal').as 'literal_atom'
63
- rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
64
- rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
65
- rule('letter_range' => 'LETTER').as 'any_lowercase'
66
- rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
67
- rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
68
- rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
69
- rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
70
- rule('character_class' => %w[NO CHARACTER]).as 'no_character'
71
- rule('character_class' => 'WHITESPACE').as 'whitespace'
72
- rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
73
- rule('character_class' => 'ANYTHING').as 'anything'
74
- rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
75
- rule('special_char' => 'TAB').as 'tab'
76
- rule('special_char' => 'BACKSLASH').as 'backslash'
77
- rule('special_char' => %w[NEW LINE]).as 'new_line'
78
- rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
79
- rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
80
- rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
81
- rule('alternatives' => 'quantifiable').as 'simple_alternative'
82
- rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
83
- rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
84
- rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as 'capture_until'
85
- rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as 'named_capture'
86
- rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as 'named_capture_until'
87
- rule('var_name' => 'STRING_LIT').as 'var_name'
88
- rule('quantifier' => 'ONCE').as 'once'
89
- rule('quantifier' => 'TWICE').as 'twice'
90
- rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
91
- rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as 'between_and'
92
- rule('quantifier' => 'OPTIONAL').as 'optional'
93
- rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
94
- rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
95
- rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
96
- rule('digit_or_number' => 'DIGIT').as 'digit_keyword'
97
- rule('digit_or_number' => 'NUMBER').as 'number_keyword'
98
- rule('count' => 'DIGIT_LIT').as 'single_digit'
99
- rule('count' => 'INTEGER').as 'integer_count'
100
- rule('times_suffix' => 'TIMES').as 'times_keyword'
101
- rule('times_suffix' => []).as 'times_dropped'
102
- end
103
-
104
- # And now build the grammar and make it accessible via a global constant
105
- Grammar = builder.grammar
106
- end # module
@@ -1,35 +0,0 @@
1
- # File: abstract_method.rb
2
-
3
- # Mix-in module. Provides the method 'abstract_method' that raises an exception
4
- # with an appropriate message when called.
5
- module AbstractMethod
6
- # Call this method in the body of your abstract methods.
7
- # Example:
8
- # require 'AbstractMethod'
9
- # class SomeClass
10
- # include AbstractMethod # To add the behaviour from the mix-in module AbstractMethod
11
- # ...
12
- # Consider that SomeClass has an abstract method called 'some_method'
13
- #
14
- # def some_method() abstract_method
15
- # end
16
- def abstract_method()
17
- # Determine the short class name of self
18
- className = self.class.name.split(/::/).last
19
-
20
- # Retrieve the top text line of the call stack
21
- top_line = caller(1..1)
22
-
23
- # Extract the calling method name
24
- callerNameInQuotes = top_line.scan(/`.+?$/).first
25
- callerName = callerNameInQuotes.gsub(/`|'/, '') # Remove enclosing quotes
26
-
27
- # Build the error message
28
- prefix = "The method #{className}##{callerName} is abstract."
29
- suffix = " It should be implemented in subclasses of #{className}."
30
- error_message = prefix + suffix
31
- raise NotImplementedError, error_message
32
- end
33
- end # module
34
-
35
- # End of file
@@ -1,27 +0,0 @@
1
- # File: alternation.rb
2
-
3
- require_relative 'polyadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # Abstract class. An n-ary matching operator.
7
- # It succeeds when one child expression succeeds in matching the subject text
8
- class Alternation < PolyadicExpression
9
- # Constructor.
10
- def initialize(*theChildren)
11
- super(theChildren)
12
- end
13
-
14
- protected
15
-
16
- # Conversion method re-definition.
17
- # Purpose: Return the String representation of the alternation of the child expressions.
18
- def text_repr()
19
- result_children = children.map(&:to_str)
20
- result = '(?:' + result_children.join('|') + ')'
21
-
22
- return result
23
- end
24
- end # class
25
- end # module
26
-
27
- # End of file
@@ -1,45 +0,0 @@
1
- # File: anchor.rb
2
-
3
- require_relative 'atomic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # An anchor is a zero-width assertion based on the current position.
7
- class Anchor < AtomicExpression
8
- # A Hash for converting a lexeme to a symbolic value
9
- AnchorToSymbol = {
10
- # Lexeme => Symbol value
11
- '^' => :soLine, # Start of line
12
- '$' => :eoLine, # End of line
13
- '\A' => :soSubject,
14
- '\b' => :wordBoundary,
15
- '\B' => :nonAtWordBoundary,
16
- '\G' => :firstMatch,
17
- '\z' => :eoSubject,
18
- '\Z' => :eoSubjectOrBeforeNLAtEnd
19
- }.freeze
20
-
21
- # A symbolic value that identifies the type of assertion to perform
22
- attr_reader(:kind)
23
-
24
- # Constructor
25
- # @param aKind [String] Lexeme representation of the anchor
26
- def initialize(aKind)
27
- @kind = valid_kind(aKind)
28
- end
29
-
30
- # Conversion method re-definition.
31
- # Purpose: Return the String representation of the expression.
32
- def to_str()
33
- return AnchorToSymbol.rassoc(kind).first
34
- end
35
-
36
- private
37
-
38
- # Return the symbolic value corresponding to the given lexeme.
39
- def valid_kind(aKind)
40
- return AnchorToSymbol[aKind]
41
- end
42
- end # class
43
- end # module
44
-
45
- # End of file
@@ -1,16 +0,0 @@
1
- # File: atomic_expression.rb
2
-
3
- require_relative 'expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # Abstract class. A valid regular expression that
7
- # cannot be further decomposed into sub-expressions.
8
- class AtomicExpression < Expression
9
- # Redefined method. Return true since it may not have any child.
10
- def atomic?
11
- return true
12
- end
13
- end # class
14
- end # module
15
-
16
- # End of file
@@ -1,51 +0,0 @@
1
- # File: capturing_group.rb
2
-
3
- require_relative 'monadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # An association between a capture variable and a sub-expression:
7
- # the text matched by the sub-expression is assigned to the capture variable.
8
- class CapturingGroup < MonadicExpression
9
- # The capture variable id. It is a Fixnum when the capture group gets
10
- # a sequence number,
11
- # a String when it is a user-defined name
12
- attr_reader(:id)
13
-
14
- # When true, the capturing group forbids backtracking requests from its parent
15
- # expression.
16
- attr_reader(:no_backtrack)
17
-
18
- # Constructor.
19
- # [aChildExpression] A sub-expression to match. When successful
20
- # the matching text is assigned to the capture variable.
21
- # [theId] The id of the capture variable.
22
- # [noBacktrack] A flag that specifies whether the capturing group forbids
23
- # backtracking requests from its parent expression.
24
- def initialize(aChildExpression, theId = nil, noBacktrack = false)
25
- super(aChildExpression)
26
- @id = theId
27
- @no_backtrack = noBacktrack
28
- end
29
-
30
- # Return true iff the capturing group has a name (and not a sequence number)
31
- def named?()
32
- return id.kind_of?(String)
33
- end
34
-
35
- # Conversion method re-definition.
36
- # Purpose: Return the String representation of the captured expression.
37
- def to_str()
38
- prefix = named? ? "?<#{id}>" : ''
39
- atomic = no_backtrack ? '?>' : ''
40
- if child.is_a?(Regex::NonCapturingGroup)
41
- # Minor optimization
42
- result = '(' + atomic + prefix + child.child.to_str + ')'
43
- else
44
- result = '(' + atomic + prefix + child.to_str + ')'
45
- end
46
- return result
47
- end
48
- end # class
49
- end # module
50
-
51
- # End of file
@@ -1,38 +0,0 @@
1
- # File: char_class.rb
2
-
3
- require_relative 'polyadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # Abstract class. An n-ary matching operator.
7
- # It succeeds when one child expression succeeds in matching the subject text.
8
- class CharClass < PolyadicExpression
9
- # These are characters with special meaning in character classes
10
- Metachars = ']\^-'.codepoints
11
- # A flag that indicates whether the character class is negated
12
- attr_reader(:negated)
13
-
14
- # Constructor.
15
- def initialize(to_negate, *theChildren)
16
- super(theChildren)
17
- @negated = to_negate
18
- end
19
-
20
- protected
21
-
22
- # Conversion method re-definition.
23
- # Purpose: Return the String representation of the character class.
24
- def text_repr()
25
- result_children = children.inject('') do |subResult, aChild|
26
- if aChild.kind_of?(Regex::Character) && Metachars.include?(aChild.codepoint)
27
- subResult << "\\" # Escape meta-character...
28
- end
29
- subResult << aChild.to_str
30
- end
31
- result = '[' + (negated ? '^' : '') + result_children + ']'
32
-
33
- return result
34
- end
35
- end # class
36
- end # module
37
-
38
- # End of file
@@ -1,51 +0,0 @@
1
- # File: char_range.rb
2
-
3
- require_relative 'polyadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # A binary expression that represents a contiguous range of characters.
7
- # Assumption: characters are ordered by codepoint
8
- class CharRange < PolyadicExpression
9
- # Constructor.
10
- # [theLowerBound]
11
- # A character that will be the lower bound value for the range.
12
- # [theUpperBound]
13
- # A character that will be the upper bound value for the range.
14
- # TODO: optimisation. Build a Character if lower bound == upper bound.
15
- def initialize(theLowerBound, theUpperBound)
16
- range = validated_range(theLowerBound, theUpperBound)
17
- super(range)
18
- end
19
-
20
- # Return the lower bound of the range.
21
- def lower()
22
- return children.first
23
- end
24
-
25
- # Return the upper bound of the range.
26
- def upper()
27
- return children.last
28
- end
29
-
30
- protected
31
-
32
- # Conversion method re-definition.
33
- # Purpose: Return the String representation of the character range.
34
- def text_repr()
35
- result = lower.to_str + '-' + upper.to_str
36
-
37
- return result
38
- end
39
-
40
- private
41
-
42
- # Validation method. Returns a pair of Characters after their validation.
43
- def validated_range(theLowerBound, theUpperBound)
44
- msg = 'Character range error: lower bound is greater than upper bound.'
45
- raise StandardError, msg if theLowerBound.codepoint > theUpperBound.codepoint
46
- return [theLowerBound, theUpperBound]
47
- end
48
- end # class
49
- end # module
50
-
51
- # End of file
@@ -1,50 +0,0 @@
1
- # File: char_shorthand.rb
2
-
3
- require_relative 'atomic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # A pre-defined character class is in essence a name for a built-in, standard character class.
7
- class CharShorthand < AtomicExpression
8
- # A constant Hash that defines all the predefined character shorthands.
9
- # It contains pairs of the form:
10
- # a pre-defined character shorthand letter => the equivalent character class (as a String)
11
- StandardCClasses = {
12
- 'd' => '[0-9]',
13
- 'D' => '[^0-9]',
14
- 'h' => '[0-9a-fA-F]',
15
- 'H' => '[^0-9a-fA-F]',
16
- 's' => '[ \t\r\n\f]',
17
- 'S' => '[^ \t\r\n\f]',
18
- 'w' => '[0-9a-zA-Z_]',
19
- 'W' => '[^0-9a-zA-Z_]'
20
- }.freeze
21
-
22
- # A one-letter abbreviation
23
- attr_reader(:shortname)
24
-
25
- # Constructor
26
- def initialize(aShortname)
27
- @shortname = valid_shortname(aShortname)
28
- end
29
-
30
- protected
31
-
32
- # Conversion method re-definition.
33
- # Purpose: Return the String representation of the expression.
34
- def text_repr()
35
- return "\\#{shortname}"
36
- end
37
-
38
- private
39
-
40
- # Return the validated short name.
41
- def valid_shortname(aShortname)
42
- msg = "Unknown predefined character class \\#{aShortname}"
43
- raise StandardError, msg unless StandardCClasses.include? aShortname
44
-
45
- return aShortname
46
- end
47
- end # class
48
- end # module
49
-
50
- # End of file
@@ -1,204 +0,0 @@
1
- # File: character.rb
2
-
3
- require_relative 'atomic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # A regular expression that matches a specific character in a given character set
7
- class Character < AtomicExpression
8
- # Constant with all special 2-characters escape sequences
9
- DigramSequences = {
10
- "\\a" => 0x7, # alarm
11
- "\\n" => 0xA, # newline
12
- "\\r" => 0xD, # carriage return
13
- "\\t" => 0x9, # tab
14
- "\\e" => 0x1B, # escape
15
- "\\f" => 0xC, # form feed
16
- "\\v" => 0xB, # vertical feed
17
- # Single octal digit literals
18
- "\\0" => 0,
19
- "\\1" => 1,
20
- "\\2" => 2,
21
- "\\3" => 3,
22
- "\\4" => 4,
23
- "\\5" => 5,
24
- "\\6" => 6,
25
- "\\7" => 7
26
- }.freeze
27
-
28
- MetaChars = '\^$+?.'.freeze
29
-
30
- # The integer value that uniquely identifies the character.
31
- attr_reader(:codepoint)
32
-
33
- # The initial text representation of the character (if any).
34
- attr_reader(:lexeme)
35
-
36
- # Constructor.
37
- # [aValue] Initialize the character with either a String literal or a
38
- # codepoint value.
39
- # Examples:
40
- # Initializing with codepoint value...
41
- # RegAn::Character.new(0x3a3) # Represents: Σ
42
- # (Unicode GREEK CAPITAL LETTER SIGMA)
43
- # RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
44
- #
45
- # Initializing with a single character string
46
- # RegAn::Character.new(?\u03a3) # Also represents: Σ
47
- # RegAn::Character.new('Σ') # Obviously, represents a Σ
48
- #
49
- # Initializing with an escape sequence string
50
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
51
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
52
- # \f (form feed, 0xC)
53
- # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
54
- # \xXX (hex)
55
- # Any other escaped character will be treated as a literal character
56
- # RegAn::Character.new('\n') # Represents a newline
57
- # RegAn::Character.new('\u03a3') # Represents a Σ
58
- def initialize(aValue)
59
- case aValue
60
- when String
61
- if aValue.size == 1
62
- # Literal single character case...
63
- @codepoint = self.class.char2codepoint(aValue)
64
- else
65
- # Should be an escape sequence...
66
- @codepoint = self.class.esc2codepoint(aValue)
67
- end
68
- @lexeme = aValue
69
-
70
- when Integer
71
- @codepoint = aValue
72
- else
73
- raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
74
- end
75
- end
76
-
77
- # Conversion method that returns a character given a codepoint (integer) value.
78
- # Example:
79
- # RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (
80
- # The Unicode GREEK CAPITAL LETTER SIGMA)
81
- def self.codepoint2char(aCodepoint)
82
- return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
83
- end
84
-
85
- # Conversion method that returns the codepoint for the given single character.
86
- # Example:
87
- # RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
88
- def self.char2codepoint(aChar)
89
- return aChar.ord
90
- end
91
-
92
- # Conversion method that returns the codepoint for the given escape
93
- # sequence (a String).
94
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
95
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
96
- # 0xC), \v (vertical feed, 0xB)
97
- # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
98
- # \xXX (hex)
99
- # Any other escaped character will be treated as a literal character
100
- # Example:
101
- # RegAn::Character::esc2codepoint('\n') # Returns: 0xa
102
- def self.esc2codepoint(anEscapeSequence)
103
- msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)."
104
- raise StandardError, msg unless anEscapeSequence[0] == "\\"
105
- result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
106
-
107
- return result
108
- end
109
-
110
- # Return the character as a String object
111
- def char()
112
- self.class.codepoint2char(@codepoint)
113
- end
114
-
115
- # Returns true iff this Character and parameter 'other' represent the same character.
116
- # [other] any Object. The way the equality is tested depends on the class of 'other'
117
- # Example:
118
- # newOne = Character.new(?\u03a3)
119
- # newOne == newOne # true. Identity
120
- # newOne == Character.new(?\u03a3) # true. Both have same codepoint
121
- # newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
122
- # newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
123
- # Will test equality with any Object that knows the to_s method
124
- def ==(other)
125
- result = case other
126
- when Character
127
- self.to_str == other.to_str
128
-
129
- when Integer
130
- self.codepoint == other
131
-
132
- when String
133
- other.size > 1 ? false : to_str == other
134
-
135
- else
136
- # Unknown type: try with a conversion
137
- self == other.to_s # Recursive call
138
- end
139
-
140
- return result
141
- end
142
-
143
- # Return a plain English description of the character
144
- def explain()
145
- return "the character '#{to_str}'"
146
- end
147
-
148
- protected
149
-
150
- # Conversion method re-definition.
151
- # Purpose: Return the String representation of the expression.
152
- # If the Character was initially from a text (the lexeme), then the lexeme
153
- # is returned back.
154
- # Otherwise the character corresponding to the codepoint is returned.
155
- def text_repr()
156
- return char if lexeme.nil?
157
- return lexeme.dup
158
- end
159
-
160
- # Conversion method that returns a codepoint for the given two characters
161
- # (digram) escape sequence.
162
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
163
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
164
- # \f (form feed, 0xC), \v (vertical feed, 0xB)
165
- # Any other escape sequence will return the codepoint of the escaped
166
- # character.
167
- # [aDigram] A sequence of two characters that starts with a backslash.
168
- def self.digram2codepoint(aDigram)
169
- # Check that the digram is a special escape sequence
170
- result = DigramSequences.fetch(aDigram, nil)
171
-
172
- # If it is not a special sequence, then the escaped character is
173
- # considered literally (the backslash is 'dummy')
174
- result = char2codepoint(aDigram[-1]) if result.nil?
175
- return result
176
- end
177
-
178
- private_class_method :digram2codepoint
179
-
180
- # Conversion method that returns a codepoint for the given complex
181
- # escape sequence.
182
- # [anEscapeSequence] A String with the format:
183
- # \uXXXX where XXXX is a 4 hex digits integer value,
184
- # \u{X...} X 1 or more hex digits
185
- # \ooo (1..3 octal digits literal)
186
- # \xXX (1..2 hex digits literal)
187
- def self.esc_number2codepoint(anEscapeSequence)
188
- unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
189
- raise StandardError, "Unsupported escape sequence #{anEscapeSequence}."
190
- else
191
- # Octal literal case?
192
- return octal.oct if octal # shorterSeq =~ /[0-7]{1,3}/
193
-
194
- # Extract the hexadecimal number
195
- hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
196
- return hexliteral.hex
197
- end
198
- end
199
-
200
- private_class_method :esc_number2codepoint
201
- end # class
202
- end # module
203
-
204
- # End of file