rley 0.6.01 → 0.6.02
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/examples/NLP/engtagger.rb +58 -60
- data/lib/rley/constants.rb +1 -1
- metadata +2 -33
- data/examples/general/SRL/lib/ast_builder.rb +0 -382
- data/examples/general/SRL/lib/grammar.rb +0 -106
- data/examples/general/SRL/lib/regex/abstract_method.rb +0 -35
- data/examples/general/SRL/lib/regex/alternation.rb +0 -27
- data/examples/general/SRL/lib/regex/anchor.rb +0 -45
- data/examples/general/SRL/lib/regex/atomic_expression.rb +0 -16
- data/examples/general/SRL/lib/regex/capturing_group.rb +0 -51
- data/examples/general/SRL/lib/regex/char_class.rb +0 -38
- data/examples/general/SRL/lib/regex/char_range.rb +0 -51
- data/examples/general/SRL/lib/regex/char_shorthand.rb +0 -50
- data/examples/general/SRL/lib/regex/character.rb +0 -204
- data/examples/general/SRL/lib/regex/compound_expression.rb +0 -57
- data/examples/general/SRL/lib/regex/concatenation.rb +0 -29
- data/examples/general/SRL/lib/regex/expression.rb +0 -60
- data/examples/general/SRL/lib/regex/lookaround.rb +0 -50
- data/examples/general/SRL/lib/regex/match_option.rb +0 -34
- data/examples/general/SRL/lib/regex/monadic_expression.rb +0 -28
- data/examples/general/SRL/lib/regex/multiplicity.rb +0 -91
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +0 -27
- data/examples/general/SRL/lib/regex/polyadic_expression.rb +0 -60
- data/examples/general/SRL/lib/regex/quantifiable.rb +0 -22
- data/examples/general/SRL/lib/regex/repetition.rb +0 -29
- data/examples/general/SRL/lib/regex/wildcard.rb +0 -23
- data/examples/general/SRL/lib/regex_repr.rb +0 -13
- data/examples/general/SRL/lib/tokenizer.rb +0 -147
- data/examples/general/SRL/spec/integration_spec.rb +0 -448
- data/examples/general/SRL/spec/regex/character_spec.rb +0 -166
- data/examples/general/SRL/spec/regex/multiplicity_spec.rb +0 -79
- data/examples/general/SRL/spec/spec_helper.rb +0 -25
- data/examples/general/SRL/spec/tokenizer_spec.rb +0 -148
- data/examples/general/SRL/srl_demo.rb +0 -75
@@ -1,106 +0,0 @@
|
|
1
|
-
# Grammar for SRL (Simple Regex Language)
|
2
|
-
require 'rley' # Load the gem
|
3
|
-
module SRL
  ########################################
  # Work in progress.
  # This is a very partial grammar of SRL (Simple Regex Language).
  # It will be expanded with the coming versions of Rley.
  #
  # The builder block first declares the terminal symbols, then the
  # productions. Every production is given a name through `as`.
  builder = Rley::Syntax::GrammarBuilder.new do
    # --- Terminal symbols ---
    add_terminals(*%w[LPAREN RPAREN COMMA])
    add_terminals(*%w[DIGIT_LIT INTEGER LETTER_LIT])
    add_terminals(*%w[LITERALLY STRING_LIT])
    add_terminals(*%w[BEGIN STARTS WITH])
    add_terminals(*%w[MUST END])
    add_terminals(*%w[UPPERCASE LETTER FROM TO])
    add_terminals(*%w[DIGIT NUMBER ANY NO])
    add_terminals(*%w[CHARACTER WHITESPACE ANYTHING])
    add_terminals(*%w[TAB BACKSLASH NEW LINE])
    add_terminals(*%w[OF ONE])
    add_terminals(*%w[EXACTLY TIMES ONCE TWICE])
    add_terminals(*%w[BETWEEN AND OPTIONAL OR])
    add_terminals(*%w[MORE NEVER AT LEAST])
    add_terminals(*%w[IF FOLLOWED BY NOT])
    add_terminals(*%w[ALREADY HAD])
    add_terminals(*%w[CAPTURE AS UNTIL])
    add_terminals(*%w[CASE INSENSITIVE MULTI ALL])
    add_terminals(*%w[LAZY])

    # --- Top-level structure: expression, pattern, separator ---
    rule('srl' => 'expression').as('start_rule')
    rule('expression' => %w[pattern separator flags]).as('flagged_expr')
    rule('expression' => 'pattern').as('simple_expr')
    rule('pattern' => %w[pattern separator quantifiable]).as('pattern_sequence')
    rule('pattern' => 'quantifiable').as('basic_pattern')
    rule('separator' => 'COMMA').as('comma_separator')
    rule('separator' => []).as('void_separator')

    # --- Flags ---
    # NOTE(review): among the rules of this block, 'flags' only has this
    # left-recursive production (no non-recursive alternative) -- confirm
    # whether a base rule such as 'flags' => 'single_flag' is missing.
    rule('flags' => %w[flags separator single_flag]).as('flag_sequence')
    rule('single_flag' => %w[CASE INSENSITIVE]).as('case_insensitive')
    rule('single_flag' => %w[MULTI LINE]).as('multi_line')
    rule('single_flag' => %w[ALL LAZY]).as('all_lazy')

    # --- Anchors ---
    rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as('pinned_quantifiable')
    rule('quantifiable' => %w[begin_anchor anchorable]).as('begin_anchor_quantifiable')
    rule('quantifiable' => %w[anchorable end_anchor]).as('end_anchor_quantifiable')
    rule('quantifiable' => 'anchorable').as('simple_quantifiable')
    rule('begin_anchor' => %w[STARTS WITH]).as('starts_with')
    rule('begin_anchor' => %w[BEGIN WITH]).as('begin_with')
    rule('end_anchor' => %w[MUST END]).as('end_anchor')

    # --- Assertions (lookahead / lookbehind phrases) ---
    rule('anchorable' => 'assertable').as('simple_anchorable')
    rule('anchorable' => %w[assertable assertion]).as('asserted_anchorable')
    rule('assertion' => %w[IF FOLLOWED BY assertable]).as('if_followed')
    rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as('if_not_followed')
    rule('assertion' => %w[IF ALREADY HAD assertable]).as('if_had')
    rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as('if_not_had')
    rule('assertable' => 'term').as('simple_assertable')
    rule('assertable' => %w[term quantifier]).as('quantified_assertable')

    # --- Terms and atoms ---
    rule('term' => 'atom').as('atom_term')
    rule('term' => 'alternation').as('alternation_term')
    rule('term' => 'grouping').as('grouping_term')
    rule('term' => 'capturing_group').as('capturing_group_atom')
    rule('atom' => 'letter_range').as('letter_range_atom')
    rule('atom' => 'digit_range').as('digit_range_atom')
    rule('atom' => 'character_class').as('character_class_atom')
    rule('atom' => 'special_char').as('special_char_atom')
    rule('atom' => 'literal').as('literal_atom')

    # --- Ranges, character classes, special characters, literals ---
    rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as('lowercase_from_to')
    rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as('uppercase_from_to')
    rule('letter_range' => 'LETTER').as('any_lowercase')
    rule('letter_range' => %w[UPPERCASE LETTER]).as('any_uppercase')
    rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as('digits_from_to')
    rule('digit_range' => 'digit_or_number').as('simple_digit_range')
    rule('character_class' => %w[ANY CHARACTER]).as('any_character')
    rule('character_class' => %w[NO CHARACTER]).as('no_character')
    rule('character_class' => 'WHITESPACE').as('whitespace')
    rule('character_class' => %w[NO WHITESPACE]).as('no_whitespace')
    rule('character_class' => 'ANYTHING').as('anything')
    rule('character_class' => %w[ONE OF STRING_LIT]).as('one_of')
    rule('special_char' => 'TAB').as('tab')
    rule('special_char' => 'BACKSLASH').as('backslash')
    rule('special_char' => %w[NEW LINE]).as('new_line')
    rule('literal' => %w[LITERALLY STRING_LIT]).as('literally')

    # --- Alternation, grouping, capturing ---
    rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as('any_of')
    rule('alternatives' => %w[alternatives separator quantifiable]).as('alternative_list')
    rule('alternatives' => 'quantifiable').as('simple_alternative')
    rule('grouping' => %w[LPAREN pattern RPAREN]).as('grouping_parenthenses')
    rule('capturing_group' => %w[CAPTURE assertable]).as('capture')
    rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as('capture_until')
    rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as('named_capture')
    rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as('named_capture_until')
    rule('var_name' => 'STRING_LIT').as('var_name')

    # --- Quantifiers ---
    rule('quantifier' => 'ONCE').as('once')
    rule('quantifier' => 'TWICE').as('twice')
    rule('quantifier' => %w[EXACTLY count TIMES]).as('exactly')
    rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as('between_and')
    rule('quantifier' => 'OPTIONAL').as('optional')
    rule('quantifier' => %w[ONCE OR MORE]).as('once_or_more')
    rule('quantifier' => %w[NEVER OR MORE]).as('never_or_more')
    rule('quantifier' => %w[AT LEAST count TIMES]).as('at_least')
    rule('digit_or_number' => 'DIGIT').as('digit_keyword')
    rule('digit_or_number' => 'NUMBER').as('number_keyword')
    rule('count' => 'DIGIT_LIT').as('single_digit')
    rule('count' => 'INTEGER').as('integer_count')
    rule('times_suffix' => 'TIMES').as('times_keyword')
    rule('times_suffix' => []).as('times_dropped')
  end

  # Build the grammar and expose it as the constant SRL::Grammar
  Grammar = builder.grammar
end # module
|
@@ -1,35 +0,0 @@
|
|
1
|
-
# File: abstract_method.rb
|
2
|
-
|
3
|
-
# Mix-in module. Provides the method 'abstract_method' that raises an exception
|
4
|
-
# with an appropriate message when called.
|
5
|
-
module AbstractMethod
  # Call this method in the body of your abstract methods.
  # Example:
  #   class SomeClass
  #     include AbstractMethod # To add the behaviour from the mix-in module
  #     ...
  #     # Consider that SomeClass has an abstract method called 'some_method'
  #     def some_method() abstract_method
  #     end
  #   end
  #
  # @raise [NotImplementedError] always; the message names the receiver's
  #   class and the method that invoked abstract_method.
  def abstract_method()
    # Short (un-namespaced) class name of self. Anonymous classes have no
    # name, so fall back to their default textual representation.
    class_name = self.class.name.to_s.split(/::/).last || self.class.to_s

    # Name of the calling method, taken from the structured backtrace
    # (robust against changes of the textual backtrace format).
    caller_name = caller_locations(1, 1).first.base_label

    # Build the error message
    prefix = "The method #{class_name}##{caller_name} is abstract."
    suffix = " It should be implemented in subclasses of #{class_name}."
    raise NotImplementedError, prefix + suffix
  end
end # module
|
34
|
-
|
35
|
-
# End of file
|
@@ -1,27 +0,0 @@
|
|
1
|
-
# File: alternation.rb
|
2
|
-
|
3
|
-
require_relative 'polyadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An n-ary matching operator.
  # It succeeds when one of its child expressions matches the subject text.
  class Alternation < PolyadicExpression
    # Constructor.
    # [theChildren] The alternative sub-expressions. They are collected in an
    # Array that is handed over to the superclass constructor.
    def initialize(*theChildren)
      super(theChildren)
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the alternation, that is,
    # the representations of the children separated by '|' and enclosed in a
    # non-capturing group.
    def text_repr()
      alternatives = children.map(&:to_str).join('|')
      "(?:#{alternatives})"
    end
  end # class
end # module
|
26
|
-
|
27
|
-
# End of file
|
@@ -1,45 +0,0 @@
|
|
1
|
-
# File: anchor.rb
|
2
|
-
|
3
|
-
require_relative 'atomic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An anchor is a zero-width assertion based on the current position.
  class Anchor < AtomicExpression
    # A Hash for converting a lexeme to a symbolic value
    AnchorToSymbol = {
      # Lexeme => Symbol value
      '^' => :soLine, # Start of line
      '$' => :eoLine, # End of line
      '\A' => :soSubject,
      '\b' => :wordBoundary,
      '\B' => :nonAtWordBoundary,
      '\G' => :firstMatch,
      '\z' => :eoSubject,
      '\Z' => :eoSubjectOrBeforeNLAtEnd
    }.freeze

    # A symbolic value that identifies the type of assertion to perform
    attr_reader(:kind)

    # Constructor
    # @param aKind [String] Lexeme representation of the anchor
    # @raise [StandardError] when the lexeme isn't a key of AnchorToSymbol
    def initialize(aKind)
      @kind = valid_kind(aKind)
    end

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression,
    # that is, the lexeme associated with the anchor kind.
    def to_str()
      return AnchorToSymbol.key(kind)
    end

    private

    # Return the symbolic value corresponding to the given lexeme.
    # An unknown lexeme is rejected immediately: otherwise kind would be nil
    # and the failure would only surface later, in to_str.
    def valid_kind(aKind)
      AnchorToSymbol.fetch(aKind) do
        raise StandardError, "Unknown anchor '#{aKind}'"
      end
    end
  end # class
end # module
|
44
|
-
|
45
|
-
# End of file
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# File: atomic_expression.rb
|
2
|
-
|
3
|
-
require_relative 'expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # Abstract class. A valid regular expression that
  # cannot be further decomposed into sub-expressions.
  class AtomicExpression < Expression
    # Redefined method.
    # Always answers true: an atomic expression may not have any child.
    def atomic?
      true
    end
  end # class
end # module
|
15
|
-
|
16
|
-
# End of file
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# File: capturing_group.rb
|
2
|
-
|
3
|
-
require_relative 'monadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An association between a capture variable (numbered or named)
  # and a child expression.
  class CapturingGroup < MonadicExpression
    # The capture variable id. It is an Integer when the capture group gets
    # a sequence number, a String when it is a user-defined name.
    attr_reader(:id)

    # When true, the capturing group forbids backtracking requests
    # from its parent expression.
    attr_reader(:no_backtrack)

    # Constructor.
    # [aChildExpression] A sub-expression to match. When successful
    #   the matching text is assigned to the capture variable.
    # [theId] The id of the capture variable.
    # [noBacktrack] A flag that specifies whether the capturing group forbids
    #   backtracking requests from its parent expression.
    def initialize(aChildExpression, theId = nil, noBacktrack = false)
      super(aChildExpression)
      @id = theId
      @no_backtrack = noBacktrack
    end

    # Return true iff the capturing group has a name, i.e. its id is a String.
    def named?()
      id.is_a?(String)
    end

    # Conversion method re-definition.
    # Purpose: Return the String representation of the captured expression.
    # NOTE(review): when the group is both named and no_backtrack, the text
    # starts with "(?>?<id>" -- confirm this is the intended regex syntax.
    def to_str()
      name_part = named? ? "?<#{id}>" : ''
      atomic_part = no_backtrack ? '?>' : ''

      # Minor optimization: a non-capturing group directly inside this group
      # is redundant, so its own child is rendered instead.
      inner = child.is_a?(Regex::NonCapturingGroup) ? child.child : child

      "(#{atomic_part}#{name_part}#{inner.to_str})"
    end
  end # class
end # module
|
50
|
-
|
51
|
-
# End of file
|
@@ -1,38 +0,0 @@
|
|
1
|
-
# File: char_class.rb
|
2
|
-
|
3
|
-
require_relative 'polyadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An n-ary matching operator: a (possibly negated) character class
  # built from its child expressions.
  class CharClass < PolyadicExpression
    # Codepoints of the characters with a special meaning in character classes
    Metachars = ']\^-'.codepoints

    # A flag that indicates whether the character class is negated
    attr_reader(:negated)

    # Constructor.
    # [to_negate] Truthy when the character class must be negated.
    # [theChildren] The members of the character class.
    def initialize(to_negate, *theChildren)
      super(theChildren)
      @negated = to_negate
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the character class.
    # A Character child whose codepoint is one of the Metachars is preceded
    # by a backslash.
    def text_repr()
      members = children.map do |a_child|
        to_escape = a_child.kind_of?(Regex::Character) && Metachars.include?(a_child.codepoint)
        (to_escape ? "\\" : '') + a_child.to_str
      end

      "[#{negated ? '^' : ''}#{members.join}]"
    end
  end # class
end # module
|
37
|
-
|
38
|
-
# End of file
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# File: char_range.rb
|
2
|
-
|
3
|
-
require_relative 'polyadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A binary expression that represents a contiguous range of characters.
  # Assumption: characters are ordered by codepoint
  class CharRange < PolyadicExpression
    # Constructor.
    # [theLowerBound]
    #   A character that will be the lower bound value for the range.
    # [theUpperBound]
    #   A character that will be the upper bound value for the range.
    # Raises a StandardError when the lower bound has a greater codepoint
    # than the upper bound.
    # TODO: optimisation. Build a Character if lower bound == upper bound.
    def initialize(theLowerBound, theUpperBound)
      super(validated_range(theLowerBound, theUpperBound))
    end

    # Return the lower bound of the range (the first child).
    def lower()
      children.first
    end

    # Return the upper bound of the range (the last child).
    def upper()
      children.last
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the range,
    # i.e. both bounds separated by a hyphen.
    def text_repr()
      "#{lower.to_str}-#{upper.to_str}"
    end

    private

    # Validation method. Returns the couple of bounds after their validation.
    def validated_range(theLowerBound, theUpperBound)
      if theLowerBound.codepoint > theUpperBound.codepoint
        raise StandardError, 'Character range error: lower bound is greater than upper bound.'
      end

      [theLowerBound, theUpperBound]
    end
  end # class
end # module
|
50
|
-
|
51
|
-
# End of file
|
@@ -1,50 +0,0 @@
|
|
1
|
-
# File: char_shorthand.rb
|
2
|
-
|
3
|
-
require_relative 'atomic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A pre-defined character class is in essence a name for a built-in,
  # standard character class.
  class CharShorthand < AtomicExpression
    # A constant Hash that defines all the predefined character shorthands.
    # It contains pairs of the form:
    # a pre-defined character shorthand letter => the text of the
    # corresponding bracketed character class
    StandardCClasses = {
      'd' => '[0-9]',
      'D' => '[^0-9]',
      'h' => '[0-9a-fA-F]',
      'H' => '[^0-9a-fA-F]',
      's' => '[ \t\r\n\f]',
      'S' => '[^ \t\r\n\f]',
      'w' => '[0-9a-zA-Z_]',
      'W' => '[^0-9a-zA-Z_]'
    }.freeze

    # A one-letter abbreviation
    attr_reader(:shortname)

    # Constructor
    # [aShortname] One of the keys of StandardCClasses; any other value
    # results in a StandardError.
    def initialize(aShortname)
      @shortname = valid_shortname(aShortname)
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression,
    # i.e. a backslash followed by the short name.
    def text_repr()
      "\\#{shortname}"
    end

    private

    # Return the short name when it is a known one, raise an error otherwise.
    def valid_shortname(aShortname)
      return aShortname if StandardCClasses.key?(aShortname)

      raise StandardError, "Unknown predefined character class \\#{aShortname}"
    end
  end # class
end # module
|
49
|
-
|
50
|
-
# End of file
|
@@ -1,204 +0,0 @@
|
|
1
|
-
# File: character.rb
|
2
|
-
|
3
|
-
require_relative 'atomic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A regular expression that matches a specific character in a given character set
  class Character < AtomicExpression
    # Constant with all special 2-characters escape sequences
    DigramSequences = {
      "\\a" => 0x7, # alarm
      "\\n" => 0xA, # newline
      "\\r" => 0xD, # carriage return
      "\\t" => 0x9, # tab
      "\\e" => 0x1B, # escape
      "\\f" => 0xC, # form feed
      "\\v" => 0xB, # vertical tab
      # Single octal digit literals
      "\\0" => 0,
      "\\1" => 1,
      "\\2" => 2,
      "\\3" => 3,
      "\\4" => 4,
      "\\5" => 5,
      "\\6" => 6,
      "\\7" => 7
    }.freeze

    MetaChars = '\^$+?.'.freeze

    # The integer value that uniquely identifies the character.
    attr_reader(:codepoint)

    # The initial text representation of the character (if any).
    attr_reader(:lexeme)

    # Constructor.
    # [aValue] Initialize the character with either a String literal or a
    # codepoint value.
    # Examples:
    # Initializing with codepoint value...
    #   Regex::Character.new(0x3a3) # Represents: Σ
    #   (Unicode GREEK CAPITAL LETTER SIGMA)
    #   Regex::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
    #
    # Initializing with a single character string
    #   Regex::Character.new(?\u03a3) # Also represents: Σ
    #   Regex::Character.new('Σ') # Obviously, represents a Σ
    #
    # Initializing with an escape sequence string
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
    # \f (form feed, 0xC)
    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
    # \xXX (hex)
    # Any other escaped character will be treated as a literal character
    #   Regex::Character.new('\n') # Represents a newline
    #   Regex::Character.new('\u03a3') # Represents a Σ
    #
    # Raises a StandardError when aValue is neither a String nor an Integer,
    # or when a multi-character String isn't a supported escape sequence.
    def initialize(aValue)
      case aValue
      when String
        @codepoint = if aValue.size == 1
                       # Literal single character case...
                       self.class.char2codepoint(aValue)
                     else
                       # Should be an escape sequence...
                       self.class.esc2codepoint(aValue)
                     end
        @lexeme = aValue

      when Integer
        @codepoint = aValue
      else
        raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
      end
    end

    # Conversion method that returns a character given a codepoint (integer) value.
    # Example:
    #   Regex::Character::codepoint2char(0x3a3) # Returns: Σ
    #   (The Unicode GREEK CAPITAL LETTER SIGMA)
    def self.codepoint2char(aCodepoint)
      return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
    end

    # Conversion method that returns the codepoint for the given single character.
    # Example:
    #   Regex::Character::char2codepoint('Σ') # Returns: 0x3a3
    def self.char2codepoint(aChar)
      return aChar.ord
    end

    # Conversion method that returns the codepoint for the given escape
    # sequence (a String).
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
    # 0xC), \v (vertical tab, 0xB)
    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
    # \xXX (hex)
    # Any other escaped character will be treated as a literal character
    # Example:
    #   Regex::Character::esc2codepoint('\n') # Returns: 0xa
    # Raises a StandardError when the text doesn't start with a backslash.
    def self.esc2codepoint(anEscapeSequence)
      unless anEscapeSequence[0] == "\\"
        # The backslash is doubled so that it really shows up in the message
        msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\\)."
        raise StandardError, msg
      end

      # Two characters: backslash + one character; otherwise a numeric escape
      return digram2codepoint(anEscapeSequence) if anEscapeSequence.length == 2

      esc_number2codepoint(anEscapeSequence)
    end

    # Return the character as a String object
    def char()
      self.class.codepoint2char(@codepoint)
    end

    # Returns true iff this Character and parameter 'other' represent the same character.
    # [other] any Object. The way the equality is tested depends on other's class
    # Example:
    #   newOne = Character.new(?\u03a3)
    #   newOne == newOne # true. Identity
    #   newOne == Character.new(?\u03a3) # true. Both have same codepoint
    #   newOne == ?\u03a3 # true. The single character String matches the text representation.
    #   newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
    # Will test equality with any Object that knows the to_s method
    def ==(other)
      case other
      when Character
        # Same codepoint means same character, whatever lexeme was used
        codepoint == other.codepoint
      when Integer
        codepoint == other
      when String
        other.size > 1 ? false : to_str == other
      else
        # Unknown type: try with a conversion
        self == other.to_s # Recursive call
      end
    end

    # Return a plain English description of the character
    def explain()
      return "the character '#{to_str}'"
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression.
    # If the Character was initially from a text (the lexeme), then a copy of
    # the lexeme is returned back.
    # Otherwise the character corresponding to the codepoint is returned.
    def text_repr()
      lexeme.nil? ? char : lexeme.dup
    end

    # Conversion method that returns a codepoint for the given two characters
    # (digram) escape sequence.
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
    # \f (form feed, 0xC), \v (vertical tab, 0xB)
    # Any other escape sequence will return the codepoint of the escaped
    # character.
    # [aDigram] A sequence of two characters that starts with a backslash.
    def self.digram2codepoint(aDigram)
      # If it is not a special sequence, then the escaped character is
      # considered literally (the backslash is 'dummy')
      DigramSequences.fetch(aDigram) { char2codepoint(aDigram[-1]) }
    end

    private_class_method :digram2codepoint

    # Conversion method that returns a codepoint for the given complex
    # escape sequence.
    # [anEscapeSequence] A String with the format:
    #   \uXXXX where XXXX is a 4 hex digits integer value,
    #   \u{X...} X 1 or more hex digits
    #   \ooo (1..3 octal digits literal)
    #   \xXX (1..2 hex digits literal)
    # Raises a StandardError when the text has none of these formats.
    def self.esc_number2codepoint(anEscapeSequence)
      matching = /^\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))$/.match(anEscapeSequence)
      raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." unless matching

      # Octal literal case?
      return matching[:octal].oct if matching[:octal]

      # Otherwise it is a hexadecimal number
      matching[:hexa].hex
    end

    private_class_method :esc_number2codepoint
  end # class
end # module
|
203
|
-
|
204
|
-
# End of file
|