rley 0.6.01 → 0.6.02
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/examples/NLP/engtagger.rb +58 -60
- data/lib/rley/constants.rb +1 -1
- metadata +2 -33
- data/examples/general/SRL/lib/ast_builder.rb +0 -382
- data/examples/general/SRL/lib/grammar.rb +0 -106
- data/examples/general/SRL/lib/regex/abstract_method.rb +0 -35
- data/examples/general/SRL/lib/regex/alternation.rb +0 -27
- data/examples/general/SRL/lib/regex/anchor.rb +0 -45
- data/examples/general/SRL/lib/regex/atomic_expression.rb +0 -16
- data/examples/general/SRL/lib/regex/capturing_group.rb +0 -51
- data/examples/general/SRL/lib/regex/char_class.rb +0 -38
- data/examples/general/SRL/lib/regex/char_range.rb +0 -51
- data/examples/general/SRL/lib/regex/char_shorthand.rb +0 -50
- data/examples/general/SRL/lib/regex/character.rb +0 -204
- data/examples/general/SRL/lib/regex/compound_expression.rb +0 -57
- data/examples/general/SRL/lib/regex/concatenation.rb +0 -29
- data/examples/general/SRL/lib/regex/expression.rb +0 -60
- data/examples/general/SRL/lib/regex/lookaround.rb +0 -50
- data/examples/general/SRL/lib/regex/match_option.rb +0 -34
- data/examples/general/SRL/lib/regex/monadic_expression.rb +0 -28
- data/examples/general/SRL/lib/regex/multiplicity.rb +0 -91
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +0 -27
- data/examples/general/SRL/lib/regex/polyadic_expression.rb +0 -60
- data/examples/general/SRL/lib/regex/quantifiable.rb +0 -22
- data/examples/general/SRL/lib/regex/repetition.rb +0 -29
- data/examples/general/SRL/lib/regex/wildcard.rb +0 -23
- data/examples/general/SRL/lib/regex_repr.rb +0 -13
- data/examples/general/SRL/lib/tokenizer.rb +0 -147
- data/examples/general/SRL/spec/integration_spec.rb +0 -448
- data/examples/general/SRL/spec/regex/character_spec.rb +0 -166
- data/examples/general/SRL/spec/regex/multiplicity_spec.rb +0 -79
- data/examples/general/SRL/spec/spec_helper.rb +0 -25
- data/examples/general/SRL/spec/tokenizer_spec.rb +0 -148
- data/examples/general/SRL/srl_demo.rb +0 -75
@@ -1,106 +0,0 @@
|
|
1
|
-
# Grammar for SRL (Simple Regex Language)
|
2
|
-
require 'rley' # Load the gem
|
3
|
-
module SRL
  ########################################
  # Work in progress.
  # Partial grammar of SRL (Simple Regex Language); the rule set is expected
  # to grow with the coming versions of Rley.
  #
  # Each production is tagged with a name through `.as`.
  builder = Rley::Syntax::GrammarBuilder.new do
    # Terminal symbols, declared in a single call (same order as before).
    add_terminals(
      *%w[
        LPAREN RPAREN COMMA
        DIGIT_LIT INTEGER LETTER_LIT
        LITERALLY STRING_LIT
        BEGIN STARTS WITH
        MUST END
        UPPERCASE LETTER FROM TO
        DIGIT NUMBER ANY NO
        CHARACTER WHITESPACE ANYTHING
        TAB BACKSLASH NEW LINE
        OF ONE
        EXACTLY TIMES ONCE TWICE
        BETWEEN AND OPTIONAL OR
        MORE NEVER AT LEAST
        IF FOLLOWED BY NOT
        ALREADY HAD
        CAPTURE AS UNTIL
        CASE INSENSITIVE MULTI ALL
        LAZY
      ]
    )

    # --- Top-level structure: expression, pattern sequence, separators ---
    rule('srl' => 'expression').as 'start_rule'
    rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
    rule('expression' => 'pattern').as 'simple_expr'
    rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
    rule('pattern' => 'quantifiable').as 'basic_pattern'
    rule('separator' => 'COMMA').as 'comma_separator'
    rule('separator' => []).as 'void_separator'

    # --- Flags ---
    # NOTE(review): 'flags' only has this left-recursive production here;
    # no base case (e.g. flags => single_flag) is visible -- confirm whether
    # 'flagged_expr' can actually be derived.
    rule('flags' => %w[flags separator single_flag]).as 'flag_sequence'
    rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
    rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
    rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'

    # --- Anchors ---
    rule('quantifiable' => %w[begin_anchor anchorable end_anchor]).as 'pinned_quantifiable'
    rule('quantifiable' => %w[begin_anchor anchorable]).as 'begin_anchor_quantifiable'
    rule('quantifiable' => %w[anchorable end_anchor]).as 'end_anchor_quantifiable'
    rule('quantifiable' => 'anchorable').as 'simple_quantifiable'
    rule('begin_anchor' => %w[STARTS WITH]).as 'starts_with'
    rule('begin_anchor' => %w[BEGIN WITH]).as 'begin_with'
    rule('end_anchor' => %w[MUST END]).as 'end_anchor'

    # --- Lookaround assertions ---
    rule('anchorable' => 'assertable').as 'simple_anchorable'
    rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
    rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
    rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
    rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
    rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'

    # --- Terms and atoms ---
    rule('assertable' => 'term').as 'simple_assertable'
    rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
    rule('term' => 'atom').as 'atom_term'
    rule('term' => 'alternation').as 'alternation_term'
    rule('term' => 'grouping').as 'grouping_term'
    rule('term' => 'capturing_group').as 'capturing_group_atom'
    rule('atom' => 'letter_range').as 'letter_range_atom'
    rule('atom' => 'digit_range').as 'digit_range_atom'
    rule('atom' => 'character_class').as 'character_class_atom'
    rule('atom' => 'special_char').as 'special_char_atom'
    rule('atom' => 'literal').as 'literal_atom'

    # --- Letter and digit ranges ---
    rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
    rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
    rule('letter_range' => 'LETTER').as 'any_lowercase'
    rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
    rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
    rule('digit_range' => 'digit_or_number').as 'simple_digit_range'

    # --- Character classes, special characters, literals ---
    rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
    rule('character_class' => %w[NO CHARACTER]).as 'no_character'
    rule('character_class' => 'WHITESPACE').as 'whitespace'
    rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
    rule('character_class' => 'ANYTHING').as 'anything'
    rule('character_class' => %w[ONE OF STRING_LIT]).as 'one_of'
    rule('special_char' => 'TAB').as 'tab'
    rule('special_char' => 'BACKSLASH').as 'backslash'
    rule('special_char' => %w[NEW LINE]).as 'new_line'
    rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'

    # --- Alternation, grouping, capturing ---
    rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
    rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
    rule('alternatives' => 'quantifiable').as 'simple_alternative'
    rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
    rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
    rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as 'capture_until'
    rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as 'named_capture'
    rule('capturing_group' => %w[CAPTURE assertable AS var_name UNTIL assertable]).as 'named_capture_until'
    rule('var_name' => 'STRING_LIT').as 'var_name'

    # --- Quantifiers ---
    rule('quantifier' => 'ONCE').as 'once'
    rule('quantifier' => 'TWICE').as 'twice'
    rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
    rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as 'between_and'
    rule('quantifier' => 'OPTIONAL').as 'optional'
    rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
    rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
    rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
    rule('digit_or_number' => 'DIGIT').as 'digit_keyword'
    rule('digit_or_number' => 'NUMBER').as 'number_keyword'
    rule('count' => 'DIGIT_LIT').as 'single_digit'
    rule('count' => 'INTEGER').as 'integer_count'
    rule('times_suffix' => 'TIMES').as 'times_keyword'
    rule('times_suffix' => []).as 'times_dropped'
  end

  # Build the grammar and expose it as the constant SRL::Grammar
  Grammar = builder.grammar
end # module
|
@@ -1,35 +0,0 @@
|
|
1
|
-
# File: abstract_method.rb
|
2
|
-
|
3
|
-
# Mix-in module. Provides the method 'abstract_method' that raises an exception
|
4
|
-
# with an appropriate message when called.
|
5
|
-
module AbstractMethod
  # Call this method in the body of your abstract methods.
  # Example:
  #   require 'AbstractMethod'
  #   class SomeClass
  #     include AbstractMethod # To add the behaviour from the mix-in module AbstractMethod
  #     ...
  #     # Consider that SomeClass has an abstract method called 'some_method'
  #     def some_method() abstract_method
  #     end
  #   end
  #
  # @raise [NotImplementedError] always; the message names the receiver's
  #   short class name and the method that called abstract_method.
  def abstract_method()
    # Short (un-namespaced) class name of self. Anonymous classes have no
    # name, so fall back on their to_s representation.
    class_name = (self.class.name || self.class.to_s).split('::').last

    # Name of the calling method. caller_locations yields structured frames,
    # so there is no need to parse the textual backtrace line (whose quoting
    # style differs between Ruby versions).
    caller_name = caller_locations(1, 1).first.base_label

    # Build the error message
    prefix = "The method #{class_name}##{caller_name} is abstract."
    suffix = " It should be implemented in subclasses of #{class_name}."
    raise NotImplementedError, prefix + suffix
  end
end # module
|
34
|
-
|
35
|
-
# End of file
|
@@ -1,27 +0,0 @@
|
|
1
|
-
# File: alternation.rb
|
2
|
-
|
3
|
-
require_relative 'polyadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An n-ary matching operator: its text representation lists the child
  # expressions as '|'-separated alternatives.
  class Alternation < PolyadicExpression
    # Constructor.
    # @param theChildren [Array] the alternative sub-expressions, in order
    def initialize(*theChildren)
      super(theChildren)
    end

    protected

    # Conversion method re-definition.
    # @return [String] the alternatives joined with '|' and wrapped in a
    #   non-capturing group '(?:...)'
    def text_repr()
      alternatives = children.map(&:to_str).join('|')
      "(?:#{alternatives})"
    end
  end # class
end # module
|
26
|
-
|
27
|
-
# End of file
|
@@ -1,45 +0,0 @@
|
|
1
|
-
# File: anchor.rb
|
2
|
-
|
3
|
-
require_relative 'atomic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An anchor is a zero-width assertion based on the current position.
  class Anchor < AtomicExpression
    # A Hash for converting a lexeme to a symbolic value
    AnchorToSymbol = {
      # Lexeme => Symbol value
      '^' => :soLine, # Start of line
      '$' => :eoLine, # End of line
      '\A' => :soSubject,
      '\b' => :wordBoundary,
      '\B' => :nonAtWordBoundary,
      '\G' => :firstMatch,
      '\z' => :eoSubject,
      '\Z' => :eoSubjectOrBeforeNLAtEnd
    }.freeze

    # A symbolic value that identifies the type of assertion to perform
    attr_reader(:kind)

    # Constructor
    # @param aKind [String] Lexeme representation of the anchor
    # @raise [StandardError] when aKind is not a key of AnchorToSymbol
    def initialize(aKind)
      @kind = valid_kind(aKind)
    end

    # Conversion method re-definition.
    # @return [String] the lexeme that maps to the anchor's kind
    def to_str()
      AnchorToSymbol.key(kind)
    end

    private

    # Return the symbolic value corresponding to the given lexeme.
    # An unknown lexeme is rejected here, at construction time, instead of
    # storing a nil kind that would only fail later in to_str.
    # @param aKind [String] Lexeme representation of the anchor
    # @return [Symbol]
    # @raise [StandardError] when the lexeme is unknown
    def valid_kind(aKind)
      AnchorToSymbol.fetch(aKind) do
        raise StandardError, "Unknown anchor '#{aKind}'."
      end
    end
  end # class
end # module
|
44
|
-
|
45
|
-
# End of file
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# File: atomic_expression.rb
|
2
|
-
|
3
|
-
require_relative 'expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # Abstract class. Represents a regular expression that cannot be
  # broken down further into sub-expressions.
  class AtomicExpression < Expression
    # Redefined method.
    # @return [true] always, since an atomic expression may not have children
    def atomic?
      true
    end
  end # class
end # module
|
15
|
-
|
16
|
-
# End of file
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# File: capturing_group.rb
|
2
|
-
|
3
|
-
require_relative 'monadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # An association between a capture variable and a child expression.
  class CapturingGroup < MonadicExpression
    # The capture variable id: a String when it is a user-defined name
    # (otherwise e.g. an Integer sequence number, or nil by default).
    attr_reader(:id)

    # When true, then capturing group forbids backtracking requests from its
    # parent expression.
    attr_reader(:no_backtrack)

    # Constructor.
    # [aChildExpression] A sub-expression to match. When successful
    #   the matching text is assigned to the capture variable.
    # [theId] The id of the capture variable.
    # [noBacktrack] A flag that specifies whether the capturing group forbids
    #   backtracking requests from its parent expression.
    def initialize(aChildExpression, theId = nil, noBacktrack = false)
      super(aChildExpression)
      @id = theId
      @no_backtrack = noBacktrack
    end

    # Return true iff the capturing group has a name, that is, its id is a
    # String.
    def named?()
      id.kind_of?(String)
    end

    # Conversion method re-definition.
    # Purpose: Return the String representation of the captured expression.
    # NOTE(review): when the group is both named and no_backtrack the result
    # starts with '(?>?<name>' -- confirm that this combination is intended.
    def to_str()
      opening = no_backtrack ? '(?>' : '('
      name_tag = named? ? "?<#{id}>" : ''

      # Minor optimization: a non-capturing group child is unwrapped, its
      # own child is emitted directly inside this group's parentheses.
      inner = child.is_a?(Regex::NonCapturingGroup) ? child.child : child

      opening + name_tag + inner.to_str + ')'
    end
  end # class
end # module
|
50
|
-
|
51
|
-
# End of file
|
@@ -1,38 +0,0 @@
|
|
1
|
-
# File: char_class.rb
|
2
|
-
|
3
|
-
require_relative 'polyadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A character class: an n-ary expression whose children are rendered
  # between square brackets, optionally negated.
  class CharClass < PolyadicExpression
    # These are characters with special meaning in character classes
    Metachars = ']\^-'.codepoints

    # A flag that indicates whether the character class is negated
    attr_reader(:negated)

    # Constructor.
    # @param to_negate [Boolean] true when the class must be negated
    # @param theChildren [Array] the members of the character class
    def initialize(to_negate, *theChildren)
      super(theChildren)
      @negated = to_negate
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the character class.
    def text_repr()
      members = children.map do |member|
        # A Character child that is a class meta-character gets a backslash
        needs_escape = member.kind_of?(Regex::Character) &&
                       Metachars.include?(member.codepoint)
        needs_escape ? "\\" + member.to_str : member.to_str
      end
      negation = negated ? '^' : ''

      '[' + negation + members.join + ']'
    end
  end # class
end # module
|
37
|
-
|
38
|
-
# End of file
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# File: char_range.rb
|
2
|
-
|
3
|
-
require_relative 'polyadic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A binary expression that represents a contiguous range of characters.
  # Assumption: characters are ordered by codepoint
  class CharRange < PolyadicExpression
    # Constructor.
    # [theLowerBound]
    #   A character that will be the lower bound value for the range.
    # [theUpperBound]
    #   A character that will be the upper bound value for the range.
    # Raises a StandardError when the lower bound's codepoint exceeds the
    # upper bound's one.
    # TODO: optimisation. Build a Character if lower bound == upper bound.
    def initialize(theLowerBound, theUpperBound)
      super(validated_range(theLowerBound, theUpperBound))
    end

    # Return the lower bound of the range.
    def lower()
      children.first
    end

    # Return the upper bound of the range.
    def upper()
      children.last
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the range, that is,
    # both bounds separated by a hyphen.
    def text_repr()
      "#{lower.to_str}-#{upper.to_str}"
    end

    private

    # Validation method. Returns the pair [lower, upper] once the ordering
    # of the bounds has been checked.
    def validated_range(theLowerBound, theUpperBound)
      if theLowerBound.codepoint > theUpperBound.codepoint
        raise StandardError, 'Character range error: lower bound is greater than upper bound.'
      end

      [theLowerBound, theUpperBound]
    end
  end # class
end # module
|
50
|
-
|
51
|
-
# End of file
|
@@ -1,50 +0,0 @@
|
|
1
|
-
# File: char_shorthand.rb
|
2
|
-
|
3
|
-
require_relative 'atomic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A pre-defined character class is in essence a name for a built-in,
  # standard character class.
  class CharShorthand < AtomicExpression
    # A constant Hash that defines all the predefined character shorthands.
    # It contains pairs of the form:
    # a pre-defined character shorthand letter => the text of the equivalent
    # bracketed character class
    StandardCClasses = {
      'd' => '[0-9]',
      'D' => '[^0-9]',
      'h' => '[0-9a-fA-F]',
      'H' => '[^0-9a-fA-F]',
      's' => '[ \t\r\n\f]',
      'S' => '[^ \t\r\n\f]',
      'w' => '[0-9a-zA-Z_]',
      'W' => '[^0-9a-zA-Z_]'
    }.freeze

    # An one-letter abbreviation
    attr_reader(:shortname)

    # Constructor
    # @param aShortname [String] one of the keys of StandardCClasses
    # @raise [StandardError] when the short name is unknown
    def initialize(aShortname)
      @shortname = valid_shortname(aShortname)
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression, that is,
    # a backslash followed by the short name.
    def text_repr()
      "\\" + shortname
    end

    private

    # Return the short name when it is a known one, raise an error otherwise.
    def valid_shortname(aShortname)
      return aShortname if StandardCClasses.key?(aShortname)

      raise StandardError, "Unknown predefined character class \\#{aShortname}"
    end
  end # class
end # module
|
49
|
-
|
50
|
-
# End of file
|
@@ -1,204 +0,0 @@
|
|
1
|
-
# File: character.rb
|
2
|
-
|
3
|
-
require_relative 'atomic_expression' # Access the superclass
|
4
|
-
|
5
|
-
module Regex # This module is used as a namespace
  # A regular expression that matches a specific character in a given character set
  class Character < AtomicExpression
    # Constant with all special 2-characters escape sequences
    DigramSequences = {
      "\\a" => 0x7, # alarm
      "\\n" => 0xA, # newline
      "\\r" => 0xD, # carriage return
      "\\t" => 0x9, # tab
      "\\e" => 0x1B, # escape
      "\\f" => 0xC, # form feed
      "\\v" => 0xB, # vertical feed
      # Single octal digit literals
      "\\0" => 0,
      "\\1" => 1,
      "\\2" => 2,
      "\\3" => 3,
      "\\4" => 4,
      "\\5" => 5,
      "\\6" => 6,
      "\\7" => 7
    }.freeze

    MetaChars = '\^$+?.'.freeze

    # The integer value that uniquely identifies the character.
    attr_reader(:codepoint)

    # The initial text representation of the character (if any).
    attr_reader(:lexeme)

    # Constructor.
    # [aValue] Initialize the character with a either a String literal or a
    # codepoint value.
    # Examples:
    #   Initializing with codepoint value...
    #   Regex::Character.new(0x3a3) # Represents: Σ
    #   (Unicode GREEK CAPITAL LETTER SIGMA)
    #   Regex::Character.new(931) # Also represents: Σ (931 dec == 3a3 hex)
    #
    #   Initializing with a single character string
    #   Regex::Character.new(?\u03a3) # Also represents: Σ
    #   Regex::Character.new('Σ') # Obviously, represents a Σ
    #
    #   Initializing with an escape sequence string
    #   Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    #   \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
    #   \f (form feed, 0xC)
    #   \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
    #   \xXX (hex)
    #   Any other escaped character will be treated as a literal character
    #   Regex::Character.new('\n') # Represents a newline
    #   Regex::Character.new('\u03a3') # Represents a Σ
    # Raises a StandardError when aValue is neither a String nor an Integer,
    # or when a multi-character String is not a supported escape sequence.
    def initialize(aValue)
      case aValue
      when String
        @codepoint = if aValue.size == 1
                       # Literal single character case...
                       self.class.char2codepoint(aValue)
                     else
                       # Should be an escape sequence...
                       self.class.esc2codepoint(aValue)
                     end
        @lexeme = aValue

      when Integer
        @codepoint = aValue
      else
        raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
      end
    end

    # Conversion method that returns a character given a codepoint (integer) value.
    # Example:
    #   Regex::Character::codepoint2char(0x3a3) # Returns: Σ (
    #   The Unicode GREEK CAPITAL LETTER SIGMA)
    def self.codepoint2char(aCodepoint)
      return [aCodepoint].pack('U') # Remark: chr() fails with codepoints > 256
    end

    # Conversion method that returns the codepoint for the given single character.
    # Example:
    #   Regex::Character::char2codepoint('Σ') # Returns: 0x3a3
    def self.char2codepoint(aChar)
      return aChar.ord
    end

    # Conversion method that returns the codepoint for the given escape
    # sequence (a String).
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B), \f (form feed,
    # 0xC), \v (vertical feed, 0xB)
    # \uXXXX where XXXX is a 4 hex digits integer value, \u{X...}, \ooo (octal)
    # \xXX (hex)
    # Any other escaped character will be treated as a literal character
    # Example:
    #   Regex::Character::esc2codepoint('\n') # Returns: 0xA
    # Raises a StandardError when the text doesn't start with a backslash or
    # isn't a supported escape sequence.
    def self.esc2codepoint(anEscapeSequence)
      # The backslash is doubled so that it really appears in the message
      msg = "Escape sequence #{anEscapeSequence} does not begin with a backslash (\\)."
      raise StandardError, msg unless anEscapeSequence[0] == "\\"

      # Two characters: backslash + one char; anything longer is numeric
      if anEscapeSequence.length == 2
        digram2codepoint(anEscapeSequence)
      else
        esc_number2codepoint(anEscapeSequence)
      end
    end

    # Return the character as a String object
    def char()
      self.class.codepoint2char(@codepoint)
    end

    # Returns true iff this Character and parameter 'another' represent the same character.
    # [other] any Object. The way the equality is tested depends on the other's class:
    #   Character: both text representations (to_str) are compared.
    #   Integer:   it is compared to the codepoint value.
    #   String:    it must be a single character equal to the text representation.
    #   Otherwise: the object is converted with to_s and compared as a String.
    # Example:
    #   newOne = Character.new(?\u03a3)
    #   newOne == newOne # true. Identity
    #   newOne == Character.new(?\u03a3) # true. Same text representation
    #   newOne == ?\u03a3 # true. The single character String match exactly the char attribute.
    #   newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
    def ==(other)
      case other
      when Character
        to_str == other.to_str
      when Integer
        codepoint == other
      when String
        other.size > 1 ? false : to_str == other
      else
        # Unknown type: try with a conversion
        self == other.to_s # Recursive call
      end
    end

    # Return a plain English description of the character
    def explain()
      return "the character '#{to_str}'"
    end

    protected

    # Conversion method re-definition.
    # Purpose: Return the String representation of the expression.
    # If the Character was initially from a text (the lexeme), then the lexeme
    # is returned back.
    # Otherwise the character corresponding to the codepoint is returned.
    def text_repr()
      return char if lexeme.nil?

      return lexeme.dup
    end

    # Conversion method that returns a codepoint for the given two characters
    # (digram) escape sequence.
    # Recognized escaped characters are: \a (alarm, 0x07), \n (newline, 0xA),
    # \r (carriage return, 0xD), \t (tab, 0x9), \e (escape, 0x1B),
    # \f (form feed, 0xC), \v (vertical feed, 0xB)
    # Any other escape sequence will return the codepoint of the escaped
    # character.
    # [aDigram] A sequence of two characters that starts with a backslash.
    def self.digram2codepoint(aDigram)
      # A special escape sequence maps to its own codepoint; otherwise the
      # escaped character is taken literally (the backslash is 'dummy')
      DigramSequences.fetch(aDigram) { char2codepoint(aDigram[-1]) }
    end

    private_class_method :digram2codepoint

    # Conversion method that returns a codepoint for the given complex
    # escape sequence.
    # [anEscapeSequence] A String with the format:
    #   \uXXXX where XXXX is a 4 hex digits integer value,
    #   \u{X...} X 1 or more hex digits
    #   \ooo (1..3 octal digits literal)
    #   \xXX (1..2 hex digits literal)
    # Raises a StandardError when the text doesn't have one of these formats.
    def self.esc_number2codepoint(anEscapeSequence)
      # \A and \z anchor the whole text; ^ and $ would anchor per line and
      # thus accept multi-line input or a trailing newline.
      pattern = /\A\\(?:(?:(?<prefix>[uxX])\{?(?<hexa>\h+)\}?)|(?<octal>[0-7]{1,3}))\z/
      matching = pattern.match(anEscapeSequence)
      raise StandardError, "Unsupported escape sequence #{anEscapeSequence}." unless matching

      # Octal literal case?
      return matching[:octal].oct if matching[:octal]

      # Otherwise it is a hexadecimal literal
      matching[:hexa].hex
    end

    private_class_method :esc_number2codepoint
  end # class
end # module
|
203
|
-
|
204
|
-
# End of file
|