rley 0.6.01 → 0.6.02

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -1
  3. data/examples/NLP/engtagger.rb +58 -60
  4. data/lib/rley/constants.rb +1 -1
  5. metadata +2 -33
  6. data/examples/general/SRL/lib/ast_builder.rb +0 -382
  7. data/examples/general/SRL/lib/grammar.rb +0 -106
  8. data/examples/general/SRL/lib/regex/abstract_method.rb +0 -35
  9. data/examples/general/SRL/lib/regex/alternation.rb +0 -27
  10. data/examples/general/SRL/lib/regex/anchor.rb +0 -45
  11. data/examples/general/SRL/lib/regex/atomic_expression.rb +0 -16
  12. data/examples/general/SRL/lib/regex/capturing_group.rb +0 -51
  13. data/examples/general/SRL/lib/regex/char_class.rb +0 -38
  14. data/examples/general/SRL/lib/regex/char_range.rb +0 -51
  15. data/examples/general/SRL/lib/regex/char_shorthand.rb +0 -50
  16. data/examples/general/SRL/lib/regex/character.rb +0 -204
  17. data/examples/general/SRL/lib/regex/compound_expression.rb +0 -57
  18. data/examples/general/SRL/lib/regex/concatenation.rb +0 -29
  19. data/examples/general/SRL/lib/regex/expression.rb +0 -60
  20. data/examples/general/SRL/lib/regex/lookaround.rb +0 -50
  21. data/examples/general/SRL/lib/regex/match_option.rb +0 -34
  22. data/examples/general/SRL/lib/regex/monadic_expression.rb +0 -28
  23. data/examples/general/SRL/lib/regex/multiplicity.rb +0 -91
  24. data/examples/general/SRL/lib/regex/non_capturing_group.rb +0 -27
  25. data/examples/general/SRL/lib/regex/polyadic_expression.rb +0 -60
  26. data/examples/general/SRL/lib/regex/quantifiable.rb +0 -22
  27. data/examples/general/SRL/lib/regex/repetition.rb +0 -29
  28. data/examples/general/SRL/lib/regex/wildcard.rb +0 -23
  29. data/examples/general/SRL/lib/regex_repr.rb +0 -13
  30. data/examples/general/SRL/lib/tokenizer.rb +0 -147
  31. data/examples/general/SRL/spec/integration_spec.rb +0 -448
  32. data/examples/general/SRL/spec/regex/character_spec.rb +0 -166
  33. data/examples/general/SRL/spec/regex/multiplicity_spec.rb +0 -79
  34. data/examples/general/SRL/spec/spec_helper.rb +0 -25
  35. data/examples/general/SRL/spec/tokenizer_spec.rb +0 -148
  36. data/examples/general/SRL/srl_demo.rb +0 -75
@@ -1,106 +0,0 @@
1
- # Grammar for SRL (Simple Regex Language)
2
- require 'rley' # Load the gem
3
- module SRL
4
- ########################################
5
- # Work in progress.
6
- # This is a very partial grammar of SRL.
7
- # It will be expanded with the coming versions of Rley
8
- builder = Rley::Syntax::GrammarBuilder.new do
9
- add_terminals('LPAREN', 'RPAREN', 'COMMA')
10
- add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
11
- add_terminals('LITERALLY', 'STRING_LIT')
12
- add_terminals('BEGIN', 'STARTS', 'WITH')
13
- add_terminals('MUST', 'END')
14
- add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
15
- add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
16
- add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
17
- add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
18
- add_terminals('OF', 'ONE')
19
- add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
20
- add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
21
- add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
22
- add_terminals('IF', 'FOLLOWED', 'BY', 'NOT')
23
- add_terminals('ALREADY', 'HAD')
24
- add_terminals('CAPTURE', 'AS', 'UNTIL')
25
- add_terminals('CASE', 'INSENSITIVE', 'MULTI', 'ALL')
26
- add_terminals('LAZY')
27
-
28
- rule('srl' => 'expression').as 'start_rule'
29
- rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
30
- rule('expression' => 'pattern').as 'simple_expr'
31
- rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
32
- rule('pattern' => 'quantifiable').as 'basic_pattern'
33
- rule('separator' => 'COMMA').as 'comma_separator'
34
- rule('separator' => []).as 'void_separator'
35
- rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
36
- rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
37
- rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
38
- rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
39
- rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as 'pinned_quantifiable'
40
- rule('quantifiable' => %w[begin_anchor anchorable]).as 'begin_anchor_quantifiable'
41
- rule('quantifiable' => %w[anchorable end_anchor]).as 'end_anchor_quantifiable'
42
- rule('quantifiable' => 'anchorable').as 'simple_quantifiable'
43
- rule('begin_anchor' => %w[STARTS WITH]).as 'starts_with'
44
- rule('begin_anchor' => %w[BEGIN WITH]).as 'begin_with'
45
- rule('end_anchor' => %w[MUST END]).as 'end_anchor'
46
- rule('anchorable' => 'assertable').as 'simple_anchorable'
47
- rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
48
- rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
49
- rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
50
- rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
51
- rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
52
- rule('assertable' => 'term').as 'simple_assertable'
53
- rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
54
- rule('term' => 'atom').as 'atom_term'
55
- rule('term' => 'alternation').as 'alternation_term'
56
- rule('term' => 'grouping').as 'grouping_term'
57
- rule('term' => 'capturing_group').as 'capturing_group_atom'
58
- rule('atom' => 'letter_range').as 'letter_range_atom'
59
- rule('atom' => 'digit_range').as 'digit_range_atom'
60
- rule('atom' => 'character_class').as 'character_class_atom'
61
- rule('atom' => 'special_char').as 'special_char_atom'
62
- rule('atom' => 'literal').as 'literal_atom'
63
- rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
64
- rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
65
- rule('letter_range' => 'LETTER').as 'any_lowercase'
66
- rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
67
- rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
68
- rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
69
- rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
70
- rule('character_class' => %w[NO CHARACTER]).as 'no_character'
71
- rule('character_class' => 'WHITESPACE').as 'whitespace'
72
- rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
73
- rule('character_class' => 'ANYTHING').as 'anything'
74
- rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
75
- rule('special_char' => 'TAB').as 'tab'
76
- rule('special_char' => 'BACKSLASH').as 'backslash'
77
- rule('special_char' => %w[NEW LINE]).as 'new_line'
78
- rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
79
- rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
80
- rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
81
- rule('alternatives' => 'quantifiable').as 'simple_alternative'
82
- rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
83
- rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
84
- rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as 'capture_until'
85
- rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as 'named_capture'
86
- rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as 'named_capture_until'
87
- rule('var_name' => 'STRING_LIT').as 'var_name'
88
- rule('quantifier' => 'ONCE').as 'once'
89
- rule('quantifier' => 'TWICE').as 'twice'
90
- rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
91
- rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as 'between_and'
92
- rule('quantifier' => 'OPTIONAL').as 'optional'
93
- rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
94
- rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
95
- rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
96
- rule('digit_or_number' => 'DIGIT').as 'digit_keyword'
97
- rule('digit_or_number' => 'NUMBER').as 'number_keyword'
98
- rule('count' => 'DIGIT_LIT').as 'single_digit'
99
- rule('count' => 'INTEGER').as 'integer_count'
100
- rule('times_suffix' => 'TIMES').as 'times_keyword'
101
- rule('times_suffix' => []).as 'times_dropped'
102
- end
103
-
104
- # And now build the grammar and make it accessible via a global constant
105
- Grammar = builder.grammar
106
- end # module
@@ -1,35 +0,0 @@
1
- # File: abstract_method.rb
2
-
3
- # Mix-in module. Provides the method 'abstract_method' that raises an exception
4
- # with an appropriate message when called.
5
- module AbstractMethod
6
- # Call this method in the body of your abstract methods.
7
- # Example:
8
- # require 'AbstractMethod'
9
- # class SomeClass
10
- # include AbstractMethod # To add the behaviour from the mix-in module AbstractMethod
11
- # ...
12
- # Consider that SomeClass has an abstract method called 'some_method'
13
- #
14
- # def some_method() abstract_method
15
- # end
16
- def abstract_method()
17
- # Determine the short class name of self
18
- className = self.class.name.split(/::/).last
19
-
20
- # Retrieve the top text line of the call stack
21
- top_line = caller(1..1)
22
-
23
- # Extract the calling method name
24
- callerNameInQuotes = top_line.scan(/`.+?$/).first
25
- callerName = callerNameInQuotes.gsub(/`|'/, '') # Remove enclosing quotes
26
-
27
- # Build the error message
28
- prefix = "The method #{className}##{callerName} is abstract."
29
- suffix = " It should be implemented in subclasses of #{className}."
30
- error_message = prefix + suffix
31
- raise NotImplementedError, error_message
32
- end
33
- end # module
34
-
35
- # End of file
@@ -1,27 +0,0 @@
1
- # File: alternation.rb
2
-
3
- require_relative 'polyadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # Abstract class. An n-ary matching operator.
7
- # It succeeds when one child expression succeeds to match the subject text
8
- class Alternation < PolyadicExpression
9
- # Constructor.
10
- def initialize(*theChildren)
11
- super(theChildren)
12
- end
13
-
14
- protected
15
-
16
- # Conversion method re-definition.
17
- # Purpose: Return the String representation of the alternation: the children joined by '|' inside a non-capturing group.
18
- def text_repr()
19
- result_children = children.map(&:to_str)
20
- result = '(?:' + result_children.join('|') + ')'
21
-
22
- return result
23
- end
24
- end # class
25
- end # module
26
-
27
- # End of file
@@ -1,45 +0,0 @@
1
- # File: anchor.rb
2
-
3
- require_relative 'atomic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # An anchor is a zero-width assertion based on the current position.
7
- class Anchor < AtomicExpression
8
- # A Hash for converting a lexeme to a symbolic value
9
- AnchorToSymbol = {
10
- # Lexeme => Symbol value
11
- '^' => :soLine, # Start of line
12
- '$' => :eoLine, # End of line
13
- '\A' => :soSubject,
14
- '\b' => :wordBoundary,
15
- '\B' => :nonAtWordBoundary,
16
- '\G' => :firstMatch,
17
- '\z' => :eoSubject,
18
- '\Z' => :eoSubjectOrBeforeNLAtEnd
19
- }.freeze
20
-
21
- # A symbolic value that identifies the type of assertion to perform
22
- attr_reader(:kind)
23
-
24
- # Constructor
25
- # @param aKind [String] Lexeme representation of the anchor
26
- def initialize(aKind)
27
- @kind = valid_kind(aKind)
28
- end
29
-
30
- # Conversion method re-definition.
31
- # Purpose: Return the String representation of the expression.
32
- def to_str()
33
- return AnchorToSymbol.rassoc(kind).first
34
- end
35
-
36
- private
37
-
38
- # Return the symbolic value corresponding to the given lexeme.
39
- def valid_kind(aKind)
40
- return AnchorToSymbol[aKind]
41
- end
42
- end # class
43
- end # module
44
-
45
- # End of file
@@ -1,16 +0,0 @@
1
- # File: atomic_expression.rb
2
-
3
- require_relative 'expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # Abstract class. A valid regular expression that
7
- # cannot be further decomposed into sub-expressions.
8
- class AtomicExpression < Expression
9
- # Redefined method. Return true since it may not have any child.
10
- def atomic?
11
- return true
12
- end
13
- end # class
14
- end # module
15
-
16
- # End of file
@@ -1,51 +0,0 @@
1
- # File: capturing_group.rb
2
-
3
- require_relative 'monadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # An association between a capture variable and an expression
7
- # that is matched against the subject text.
8
- class CapturingGroup < MonadicExpression
9
- # The capture variable id. It is a Fixnum when the capture group gets
10
- # a sequence number,
11
- # a String when it is a user-defined name
12
- attr_reader(:id)
13
-
14
- # When true, then capturing group forbids backtracking requests from its parent
15
- # expression.
16
- attr_reader(:no_backtrack)
17
-
18
- # Constructor.
19
- # [aChildExpression] A sub-expression to match. When successful
20
- # the matching text is assigned to the capture variable.
21
- # [theId] The id of the capture variable.
22
- # [noBacktrack] A flag that specifies whether the capturing group forbids
23
- # backtracking requests from its parent expression.
24
- def initialize(aChildExpression, theId = nil, noBacktrack = false)
25
- super(aChildExpression)
26
- @id = theId
27
- @no_backtrack = noBacktrack
28
- end
29
-
30
- # Return true iff the capturing group has a name (and not a sequence number)
31
- def named?()
32
- return id.kind_of?(String)
33
- end
34
-
35
- # Conversion method re-definition.
36
- # Purpose: Return the String representation of the captured expression.
37
- def to_str()
38
- prefix = named? ? "?<#{id}>" : ''
39
- atomic = no_backtrack ? '?>' : ''
40
- if child.is_a?(Regex::NonCapturingGroup)
41
- # Minor optimization
42
- result = '(' + atomic + prefix + child.child.to_str + ')'
43
- else
44
- result = '(' + atomic + prefix + child.to_str + ')'
45
- end
46
- return result
47
- end
48
- end # class
49
- end # module
50
-
51
- # End of file
@@ -1,38 +0,0 @@
1
- # File: char_class.rb
2
-
3
- require_relative 'polyadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # Abstract class. An n-ary matching operator.
7
- # It succeeds when one child expression succeeds to match the subject text.
8
- class CharClass < PolyadicExpression
9
- # These are characters with special meaning in character classes
10
- Metachars = ']\^-'.codepoints
11
- # A flag that indicates whether the character is negated
12
- attr_reader(:negated)
13
-
14
- # Constructor.
15
- def initialize(to_negate, *theChildren)
16
- super(theChildren)
17
- @negated = to_negate
18
- end
19
-
20
- protected
21
-
22
- # Conversion method re-definition.
23
- # Purpose: Return the String representation of the character class.
24
- def text_repr()
25
- result_children = children.inject('') do |subResult, aChild|
26
- if aChild.kind_of?(Regex::Character) && Metachars.include?(aChild.codepoint)
27
- subResult << "\\" # Escape meta-character...
28
- end
29
- subResult << aChild.to_str
30
- end
31
- result = '[' + (negated ? '^' : '') + result_children + ']'
32
-
33
- return result
34
- end
35
- end # class
36
- end # module
37
-
38
- # End of file
@@ -1,51 +0,0 @@
1
- # File: char_range.rb
2
-
3
- require_relative 'polyadic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # A binary expression that represents a contiguous range of characters.
7
- # Assumption: characters are ordered by codepoint
8
- class CharRange < PolyadicExpression
9
- # Constructor.
10
- # [theLowerBound]
11
- # A character that will be the lower bound value for the range.
12
- # [theUpperBound]
13
- # A character that will be the upper bound value for the range.
14
- # TODO: optimisation. Build a Character if lower bound == upper bound.
15
- def initialize(theLowerBound, theUpperBound)
16
- range = validated_range(theLowerBound, theUpperBound)
17
- super(range)
18
- end
19
-
20
- # Return the lower bound of the range.
21
- def lower()
22
- return children.first
23
- end
24
-
25
- # Return the upper bound of the range.
26
- def upper()
27
- return children.last
28
- end
29
-
30
- protected
31
-
32
- # Conversion method re-definition.
33
- # Purpose: Return the String representation of the character range (lower bound, '-', upper bound).
34
- def text_repr()
35
- result = lower.to_str + '-' + upper.to_str
36
-
37
- return result
38
- end
39
-
40
- private
41
-
42
- # Validation method. Returns a pair of Characters after their validation.
43
- def validated_range(theLowerBound, theUpperBound)
44
- msg = 'Character range error: lower bound is greater than upper bound.'
45
- raise StandardError, msg if theLowerBound.codepoint > theUpperBound.codepoint
46
- return [theLowerBound, theUpperBound]
47
- end
48
- end # class
49
- end # module
50
-
51
- # End of file
@@ -1,50 +0,0 @@
1
- # File: char_shorthand.rb
2
-
3
- require_relative 'atomic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # A pre-defined character class is in essence a name for a built-in, standard character class.
7
- class CharShorthand < AtomicExpression
8
- # A constant Hash that defines all the predefined character shorthands.
9
- # It contains pairs of the form:
10
- # a pre-defined character shorthand letter => the equivalent character class, as a String
11
- StandardCClasses = {
12
- 'd' => '[0-9]',
13
- 'D' => '[^0-9]',
14
- 'h' => '[0-9a-fA-F]',
15
- 'H' => '[^0-9a-fA-F]',
16
- 's' => '[ \t\r\n\f]',
17
- 'S' => '[^ \t\r\n\f]',
18
- 'w' => '[0-9a-zA-Z_]',
19
- 'W' => '[^0-9a-zA-Z_]'
20
- }.freeze
21
-
22
- # A one-letter abbreviation
23
- attr_reader(:shortname)
24
-
25
- # Constructor
26
- def initialize(aShortname)
27
- @shortname = valid_shortname(aShortname)
28
- end
29
-
30
- protected
31
-
32
- # Conversion method re-definition.
33
- # Purpose: Return the String representation of the expression.
34
- def text_repr()
35
- return "\\#{shortname}"
36
- end
37
-
38
- private
39
-
40
- # Return the validated short name.
41
- def valid_shortname(aShortname)
42
- msg = "Unknown predefined character class \\#{aShortname}"
43
- raise StandardError, msg unless StandardCClasses.include? aShortname
44
-
45
- return aShortname
46
- end
47
- end # class
48
- end # module
49
-
50
- # End of file
@@ -1,204 +0,0 @@
1
- # File: character.rb
2
-
3
- require_relative 'atomic_expression' # Access the superclass
4
-
5
- module Regex # This module is used as a namespace
6
- # A regular expression that matches a specific character in a given character set
7
- class Character < AtomicExpression
8
- # Constant with all special 2-characters escape sequences
9
- DigramSequences = {
10
- "\\a" => 0x7, # alarm
11
- "\\n" => 0xA, # newline
12
- "\\r" => 0xD, # carriage return
13
- "\\t" => 0x9, # tab
14
- "\\e" => 0x1B, # escape
15
- "\\f" => 0xC, # form feed
16
- "\\v" => 0xB, # vertical feed
17
- # Single octal digit literals
18
- "\\0" => 0,
19
- "\\1" => 1,
20
- "\\2" => 2,
21
- "\\3" => 3,
22
- "\\4" => 4,
23
- "\\5" => 5,
24
- "\\6" => 6,
25
- "\\7" => 7
26
- }.freeze
27
-
28
- MetaChars = '\^$+?.'.freeze
29
-
30
- # The integer value that uniquely identifies the character.
31
- attr_reader(:codepoint)
32
-
33
- # The initial text representation of the character (if any).
34
- attr_reader(:lexeme)
35
-
36
- # Constructor.
37
- # [aValue] Initialize the character with either a String literal or a
38
- # codepoint value.
39
- # Examples:
40
- # Initializing with codepoint value...
41
- # RegAn::Character.new(0x3a3) # Represents: Σ
42
- # (Unicode GREEK CAPITAL LETTER SIGMA)
43
- # RegAn::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
44
- #
45
- # Initializing with a single character string
46
- # RegAn::Character.new(?\u03a3) # Also represents: Σ
47
- # RegAn::Character.new('Σ') # Obviously, represents a Σ
48
- #
49
- # Initializing with an escape sequence string
50
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
51
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
52
- # \f (form feed, 0xC)
53
- # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
54
- # \xXX (hex)
55
- # Any other escaped character will be treated as a literal character
56
- # RegAn::Character.new('\n') # Represents a newline
57
- # RegAn::Character.new('\u03a3') # Represents a Σ
58
- def initialize(aValue)
59
- case aValue
60
- when String
61
- if aValue.size == 1
62
- # Literal single character case...
63
- @codepoint = self.class.char2codepoint(aValue)
64
- else
65
- # Should be an escape sequence...
66
- @codepoint = self.class.esc2codepoint(aValue)
67
- end
68
- @lexeme = aValue
69
-
70
- when Integer
71
- @codepoint = aValue
72
- else
73
- raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
74
- end
75
- end
76
-
77
- # Conversion method that returns a character given a codepoint (integer) value.
78
- # Example:
79
- # RegAn::Character::codepoint2char(0x3a3) # Returns: Σ (
80
- # The Unicode GREEK CAPITAL LETTER SIGMA)
81
- def self.codepoint2char(aCodepoint)
82
- return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
83
- end
84
-
85
- # Conversion method that returns the codepoint for the given single character.
86
- # Example:
87
- # RegAn::Character::char2codepoint('Σ') # Returns: 0x3a3
88
- def self.char2codepoint(aChar)
89
- return aChar.ord
90
- end
91
-
92
- # Conversion method that returns the codepoint for the given escape
93
- # sequence (a String).
94
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
95
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
96
- # 0xC), \v (vertical feed, 0xB)
97
- # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
98
- # \xXX (hex)
99
- # Any other escaped character will be treated as a literal character
100
- # Example:
101
- # RegAn::Character::esc2codepoint('\n') # Returns: 0xa
102
- def self.esc2codepoint(anEscapeSequence)
103
- msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\)."
104
- raise StandardError, msg unless anEscapeSequence[0] == "\\"
105
- result = (anEscapeSequence.length == 2)? digram2codepoint(anEscapeSequence) : esc_number2codepoint(anEscapeSequence)
106
-
107
- return result
108
- end
109
-
110
- # Return the character as a String object
111
- def char()
112
- self.class.codepoint2char(@codepoint)
113
- end
114
-
115
- # Returns true iff this Character and parameter 'another' represent the same character.
116
- # [another] any Object. The way the equality is tested depends on the another's class
117
- # Example:
118
- # newOne = Character.new(?\u03a3)
119
- # newOne == newOne # true. Identity
120
- # newOne == Character.new(?\u03a3) # true. Both have same codepoint
121
- # newOne == ?\u03a3 # true. The single character String matches exactly the char attribute.
122
- # newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
123
- # Will test equality with any Object that knows the to_s method
124
- def ==(other)
125
- result = case other
126
- when Character
127
- self.to_str == other.to_str
128
-
129
- when Integer
130
- self.codepoint == other
131
-
132
- when String
133
- other.size > 1 ? false : to_str == other
134
-
135
- else
136
- # Unknown type: try with a conversion
137
- self == other.to_s # Recursive call
138
- end
139
-
140
- return result
141
- end
142
-
143
- # Return a plain English description of the character
144
- def explain()
145
- return "the character '#{to_str}'"
146
- end
147
-
148
- protected
149
-
150
- # Conversion method re-definition.
151
- # Purpose: Return the String representation of the expression.
152
- # If the Character was initially from a text (the lexeme), then the lexeme
153
- # is returned back.
154
- # Otherwise the character corresponding to the codepoint is returned.
155
- def text_repr()
156
- return char if lexeme.nil?
157
- return lexeme.dup
158
- end
159
-
160
- # Conversion method that returns a codepoint for the given two characters
161
- # (digram) escape sequence.
162
- # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
163
- # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
164
- # \f (form feed, 0xC), \v (vertical feed, 0xB)
165
- # Any other escape sequence will return the codepoint of the escaped
166
- # character.
167
- # [aDigram] A sequence of two characters that starts with a backslash.
168
- def self.digram2codepoint(aDigram)
169
- # Check that the digram is a special escape sequence
170
- result = DigramSequences.fetch(aDigram, nil)
171
-
172
- # If it is not a special sequence, then escaped character is
173
- # considered literally (the backslash is 'dummy')
174
- result = char2codepoint(aDigram[-1]) if result.nil?
175
- return result
176
- end
177
-
178
- private_class_method :digram2codepoint
179
-
180
- # Conversion method that returns a codepoint for the given complex
181
- # escape sequence.
182
- # [anEscapeSequence] A String with the format:
183
- # \uXXXX where XXXX is a 4 hex digits integer value,
184
- # \u{X...} X 1 or more hex digits
185
- # \ooo (1..3 octal digits literal)
186
- # \xXX (1..2 hex digits literal)
187
- def self.esc_number2codepoint(anEscapeSequence)
188
- unless /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/ =~ anEscapeSequence
189
- raise StandardError, "Unsupported escape sequence #{anEscapeSequence}."
190
- else
191
- # Octal literal case?
192
- return octal.oct if octal # shorterSeq =~ /[0-7]{1,3}/
193
-
194
- # Extract the hexadecimal number
195
- hexliteral = hexa # shorterSeq.sub(/^[xXu]\{?([0-9a-fA-F]+)}?$/, '\1')
196
- return hexliteral.hex
197
- end
198
- end
199
-
200
- private_class_method :esc_number2codepoint
201
- end # class
202
- end # module
203
-
204
- # End of file