rley 0.5.10 → 0.5.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/LICENSE.txt +1 -1
- data/README.md +2 -1
- data/appveyor.yml +6 -5
- data/examples/NLP/engtagger.rb +176 -0
- data/examples/general/SRL/lib/ast_builder.rb +217 -21
- data/examples/general/SRL/lib/grammar.rb +33 -5
- data/examples/general/SRL/lib/regex/alternation.rb +30 -0
- data/examples/general/SRL/lib/regex/char_class.rb +28 -22
- data/examples/general/SRL/lib/regex/char_shorthand.rb +50 -0
- data/examples/general/SRL/lib/regex/character.rb +5 -3
- data/examples/general/SRL/lib/regex/concatenation.rb +32 -0
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +29 -0
- data/examples/general/SRL/lib/regex/wildcard.rb +26 -0
- data/examples/general/SRL/lib/regex_repr.rb +5 -0
- data/examples/general/SRL/lib/tokenizer.rb +28 -3
- data/examples/general/SRL/spec/integration_spec.rb +151 -8
- data/examples/general/SRL/spec/tokenizer_spec.rb +12 -0
- data/examples/general/left.rb +36 -0
- data/examples/general/right.rb +36 -0
- data/lib/rley/constants.rb +1 -1
- data/lib/rley/gfg/edge.rb +12 -1
- data/lib/rley/gfg/grm_flow_graph.rb +21 -1
- data/lib/rley/gfg/item_vertex.rb +1 -1
- data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
- data/lib/rley/gfg/start_vertex.rb +1 -0
- data/lib/rley/gfg/vertex.rb +27 -0
- data/lib/rley/lexical/token.rb +1 -0
- data/lib/rley/parser/error_reason.rb +2 -1
- data/lib/rley/parser/gfg_chart.rb +14 -0
- data/lib/rley/parser/gfg_earley_parser.rb +0 -1
- data/lib/rley/parser/gfg_parsing.rb +4 -3
- data/lib/rley/parser/parse_entry.rb +33 -3
- data/lib/rley/parser/parse_entry_set.rb +14 -2
- data/lib/rley/parser/parse_tree_builder.rb +1 -1
- data/lib/rley/parser/parse_walker_factory.rb +0 -1
- data/lib/rley/syntax/grm_symbol.rb +2 -0
- data/lib/rley/syntax/production.rb +15 -3
- data/lib/rley/syntax/symbol_seq.rb +16 -1
- data/spec/rley/gfg/end_vertex_spec.rb +9 -1
- data/spec/rley/gfg/grm_flow_graph_spec.rb +9 -0
- data/spec/rley/gfg/item_vertex_spec.rb +9 -0
- data/spec/rley/gfg/start_vertex_spec.rb +9 -1
- data/spec/rley/parser/gfg_parsing_spec.rb +0 -1
- data/spec/rley/parser/parse_entry_set_spec.rb +15 -0
- data/spec/rley/parser/parse_entry_spec.rb +24 -13
- data/spec/rley/parser/parse_tracer_spec.rb +1 -1
- data/spec/rley/syntax/production_spec.rb +10 -0
- data/spec/rley/syntax/symbol_seq_spec.rb +5 -0
- metadata +10 -2
@@ -6,25 +6,53 @@ module SRL
|
|
6
6
|
# This is a very partial grammar of SRL.
|
7
7
|
# It will be expanded with the coming versions of Rley
|
8
8
|
builder = Rley::Syntax::GrammarBuilder.new do
|
9
|
+
add_terminals('LPAREN', 'RPAREN', 'COMMA')
|
9
10
|
add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
|
11
|
+
add_terminals('LITERALLY', 'STRING_LIT')
|
10
12
|
add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
|
11
|
-
add_terminals('DIGIT', 'NUMBER')
|
13
|
+
add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
|
14
|
+
add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
|
15
|
+
add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
|
16
|
+
add_terminals('OF', 'ONE')
|
12
17
|
add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
|
13
18
|
add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
|
14
19
|
add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
|
15
20
|
|
16
|
-
|
17
|
-
rule '
|
21
|
+
rule 'srl' => 'pattern'
|
22
|
+
rule 'pattern' => %w[pattern COMMA quantifiable]
|
23
|
+
rule 'pattern' => %w[pattern quantifiable]
|
24
|
+
rule 'pattern' => 'quantifiable'
|
25
|
+
rule 'quantifiable' => 'term'
|
26
|
+
rule 'quantifiable' => %w[term quantifier]
|
18
27
|
rule 'term' => 'atom'
|
19
|
-
rule 'term' =>
|
28
|
+
rule 'term' => 'alternation'
|
29
|
+
rule 'term' => 'grouping'
|
20
30
|
rule 'atom' => 'letter_range'
|
21
31
|
rule 'atom' => 'digit_range'
|
32
|
+
rule 'atom' => 'character_class'
|
33
|
+
rule 'atom' => 'special_char'
|
34
|
+
rule 'atom' => 'literal'
|
22
35
|
rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
|
23
36
|
rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
|
24
37
|
rule 'letter_range' => 'LETTER'
|
25
38
|
rule 'letter_range' => %w[UPPERCASE LETTER]
|
26
39
|
rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
|
27
|
-
rule 'digit_range' => 'digit_or_number'
|
40
|
+
rule 'digit_range' => 'digit_or_number'
|
41
|
+
rule 'character_class' => %w[ANY CHARACTER]
|
42
|
+
rule 'character_class' => %w[NO CHARACTER]
|
43
|
+
rule 'character_class' => 'WHITESPACE'
|
44
|
+
rule 'character_class' => %w[NO WHITESPACE]
|
45
|
+
rule 'character_class' => 'ANYTHING'
|
46
|
+
rule 'character_class' => %w[ONE OF STRING_LIT]
|
47
|
+
rule 'special_char' => 'TAB'
|
48
|
+
rule 'special_char' => 'BACKSLASH'
|
49
|
+
rule 'special_char' => %w[NEW LINE]
|
50
|
+
rule 'literal' => %w[LITERALLY STRING_LIT]
|
51
|
+
rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
|
52
|
+
rule 'alternatives' => %w[alternatives COMMA quantifiable]
|
53
|
+
rule 'alternatives' => %w[alternatives quantifiable]
|
54
|
+
rule 'alternatives' => 'quantifiable'
|
55
|
+
rule 'grouping' => %w[LPAREN pattern RPAREN]
|
28
56
|
rule 'quantifier' => 'ONCE'
|
29
57
|
rule 'quantifier' => 'TWICE'
|
30
58
|
rule 'quantifier' => %w[EXACTLY count TIMES]
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# File: alternation.rb
|
2
|
+
|
3
|
+
require_relative 'polyadic_expression' # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# Abstract class. An n-ary matching operator.
|
8
|
+
# It succeeds when one child expression succeeds to match the subject text
|
9
|
+
class Alternation < PolyadicExpression
|
10
|
+
|
11
|
+
# Constructor.
|
12
|
+
def initialize(*theChildren)
|
13
|
+
super(theChildren)
|
14
|
+
end
|
15
|
+
|
16
|
+
public
|
17
|
+
# Conversion method re-definition.
|
18
|
+
# Purpose: Return the String representation of the alternation, i.e. the child expressions separated by '|' inside a non-capturing group.
|
19
|
+
def to_str()
|
20
|
+
result_children = children.map { |aChild| aChild.to_str() }
|
21
|
+
result = '(?:' + result_children.join('|') + ')'
|
22
|
+
|
23
|
+
return result
|
24
|
+
end
|
25
|
+
|
26
|
+
end # class
|
27
|
+
|
28
|
+
end # module
|
29
|
+
|
30
|
+
# End of file
|
@@ -4,30 +4,36 @@ require_relative "polyadic_expression" # Access the superclass
|
|
4
4
|
|
5
5
|
module Regex # This module is used as a namespace
|
6
6
|
|
7
|
-
# Abstract class. A n-ary matching operator.
|
8
|
-
# It succeeds when one child expression succeeds to match the subject text
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
7
|
+
# Abstract class. An n-ary matching operator.
|
8
|
+
# It succeeds when one child expression succeeds to match the subject text.
|
9
|
+
class CharClass < PolyadicExpression
|
10
|
+
# These are characters with special meaning in character classes
|
11
|
+
Metachars = ']\^-'.codepoints
|
12
|
+
# A flag that indicates whether the character class is negated
|
13
|
+
attr_reader(:negated)
|
14
|
+
|
15
|
+
# Constructor.
|
16
|
+
def initialize(to_negate,*theChildren)
|
17
|
+
super(theChildren)
|
18
|
+
@negated = to_negate
|
19
|
+
end
|
19
20
|
|
20
|
-
public
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
21
|
+
public
|
22
|
+
# Conversion method re-definition.
|
23
|
+
# Purpose: Return the String representation of the character class.
|
24
|
+
def to_str()
|
25
|
+
result_children = children.inject('') do |subResult, aChild|
|
26
|
+
if aChild.kind_of?(Regex::Character) && Metachars.include?(aChild.codepoint)
|
27
|
+
subResult << "\\" # Escape meta-character...
|
28
|
+
end
|
29
|
+
subResult << aChild.to_str()
|
30
|
+
end
|
31
|
+
result = '['+ (negated ? '^' : '') + result_children + ']'
|
32
|
+
|
33
|
+
return result
|
34
|
+
end
|
29
35
|
|
30
|
-
end # class
|
36
|
+
end # class
|
31
37
|
|
32
38
|
end # module
|
33
39
|
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# File: char_shorthand.rb
|
2
|
+
|
3
|
+
require_relative "atomic_expression" # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# A pre-defined character class is in essence a name for a built-in, standard character class.
|
8
|
+
class CharShorthand < AtomicExpression
|
9
|
+
# A constant Hash that defines all the predefined character shorthands.
|
10
|
+
# It contains pairs of the form:
|
11
|
+
# a pre-defined character shorthand letter => the equivalent character class (as a String)
|
12
|
+
StandardCClasses = {
|
13
|
+
'd' => '[0-9]',
|
14
|
+
'D' => '[^0-9]',
|
15
|
+
'h' => '[0-9a-fA-F]',
|
16
|
+
'H' => '[^0-9a-fA-F]',
|
17
|
+
's' => '[ \t\r\n\f]',
|
18
|
+
'S' => '[^ \t\r\n\f]',
|
19
|
+
'w' => '[0-9a-zA-Z_]',
|
20
|
+
'W' => '[^0-9a-zA-Z_]'
|
21
|
+
}
|
22
|
+
|
23
|
+
# A one-letter abbreviation
|
24
|
+
attr_reader(:shortname)
|
25
|
+
|
26
|
+
# Constructor
|
27
|
+
def initialize(aShortname)
|
28
|
+
@shortname = valid_shortname(aShortname)
|
29
|
+
end
|
30
|
+
|
31
|
+
public
|
32
|
+
# Conversion method re-definition.
|
33
|
+
# Purpose: Return the String representation of the expression.
|
34
|
+
def to_str()
|
35
|
+
return "\\#{shortname}"
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
# Return the validated short name.
|
40
|
+
def valid_shortname(aShortname)
|
41
|
+
raise StandardError, "Unknown predefined character class \\#{aShortname}" unless StandardCClasses.include? aShortname
|
42
|
+
|
43
|
+
return aShortname
|
44
|
+
end
|
45
|
+
|
46
|
+
end # class
|
47
|
+
|
48
|
+
end # module
|
49
|
+
|
50
|
+
# End of file
|
@@ -25,6 +25,8 @@ class Character < AtomicExpression
|
|
25
25
|
"\\6" => 6,
|
26
26
|
"\\7" => 7
|
27
27
|
}
|
28
|
+
|
29
|
+
MetaChars = '\^$+?.'
|
28
30
|
|
29
31
|
# The integer value that uniquely identifies the character.
|
30
32
|
attr_reader(:codepoint)
|
@@ -63,7 +65,7 @@ class Character < AtomicExpression
|
|
63
65
|
end
|
64
66
|
@lexeme = aValue
|
65
67
|
|
66
|
-
when
|
68
|
+
when Integer
|
67
69
|
@codepoint = aValue
|
68
70
|
else
|
69
71
|
raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
|
@@ -125,14 +127,14 @@ public
|
|
125
127
|
# newOne == newOne # true. Identity
|
126
128
|
# newOne == Character.new(?\u03a3) # true. Both have same codepoint
|
127
129
|
# newOne == ?\u03a3 # true. The single character String matches exactly the char attribute.
|
128
|
-
# newOne == 0x03a3 # true. The
|
130
|
+
# newOne == 0x03a3 # true. The Integer is compared to the codepoint value.
|
129
131
|
# Will test equality with any Object that knows the to_s method
|
130
132
|
def ==(another)
|
131
133
|
result = case another
|
132
134
|
when Character
|
133
135
|
self.to_str == another.to_str
|
134
136
|
|
135
|
-
when
|
137
|
+
when Integer
|
136
138
|
self.codepoint == another
|
137
139
|
|
138
140
|
when String
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# File: concatenation.rb
|
2
|
+
|
3
|
+
require_relative 'polyadic_expression' # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# Abstract class. An n-ary matching operator.
|
8
|
+
# It succeeds when each child succeeds to match the subject text in the same
|
9
|
+
# serial arrangement as defined by this concatenation.
|
10
|
+
class Concatenation < PolyadicExpression
|
11
|
+
|
12
|
+
# Constructor.
|
13
|
+
def initialize(*theChildren)
|
14
|
+
super(theChildren)
|
15
|
+
end
|
16
|
+
|
17
|
+
public
|
18
|
+
# Conversion method re-definition.
|
19
|
+
# Purpose: Return the String representation of the concatenated expressions.
|
20
|
+
def to_str()
|
21
|
+
result = children.inject('') { |result, aChild|
|
22
|
+
result << aChild.to_str()
|
23
|
+
}
|
24
|
+
|
25
|
+
return result
|
26
|
+
end
|
27
|
+
|
28
|
+
end # class
|
29
|
+
|
30
|
+
end # module
|
31
|
+
|
32
|
+
# End of file
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# File: non_capturing_group.rb
|
2
|
+
|
3
|
+
require_relative "monadic_expression" # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# A non-capturing group; in other words, it is a pure grouping of sub-expressions
|
8
|
+
class NonCapturingGroup < MonadicExpression
|
9
|
+
|
10
|
+
# Constructor.
|
11
|
+
# [aChildExpression] A sub-expression to match. When successful
|
12
|
+
# the matching text is not captured (the group is non-capturing).
|
13
|
+
def initialize(aChildExpression)
|
14
|
+
super(aChildExpression)
|
15
|
+
end
|
16
|
+
|
17
|
+
public
|
18
|
+
# Conversion method re-definition.
|
19
|
+
# Purpose: Return the String representation of the non-capturing group.
|
20
|
+
def to_str()
|
21
|
+
result = '(?:' + all_child_text() + ")"
|
22
|
+
return result
|
23
|
+
end
|
24
|
+
|
25
|
+
end # class
|
26
|
+
|
27
|
+
end # module
|
28
|
+
|
29
|
+
# End of file
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# File: wildcard.rb
|
2
|
+
|
3
|
+
require_relative 'atomic_expression' # Access the superclass
|
4
|
+
|
5
|
+
module Regex # This module is used as a namespace
|
6
|
+
|
7
|
+
# A wildcard matches any character (except for the newline).
|
8
|
+
class Wildcard < AtomicExpression
|
9
|
+
|
10
|
+
# Constructor
|
11
|
+
def initialize()
|
12
|
+
super
|
13
|
+
end
|
14
|
+
|
15
|
+
public
|
16
|
+
# Conversion method re-definition.
|
17
|
+
# Purpose: Return the String representation of the expression.
|
18
|
+
def to_str()
|
19
|
+
return '.'
|
20
|
+
end
|
21
|
+
|
22
|
+
end # class
|
23
|
+
|
24
|
+
end # module
|
25
|
+
|
26
|
+
# End of file
|
@@ -1,5 +1,10 @@
|
|
1
1
|
require_relative './regex/character'
|
2
2
|
require_relative './regex/char_range'
|
3
|
+
require_relative './regex/concatenation'
|
3
4
|
require_relative './regex/multiplicity'
|
4
5
|
require_relative './regex/repetition'
|
5
6
|
require_relative './regex/char_class'
|
7
|
+
require_relative './regex/char_shorthand'
|
8
|
+
require_relative './regex/wildcard'
|
9
|
+
require_relative './regex/alternation'
|
10
|
+
require_relative './regex/non_capturing_group'
|
@@ -26,23 +26,35 @@ module SRL
|
|
26
26
|
# Here are all the SRL keywords (in uppercase)
|
27
27
|
@@keywords = %w[
|
28
28
|
AND
|
29
|
+
ANY
|
30
|
+
ANYTHING
|
29
31
|
AT
|
32
|
+
BACKSLASH
|
30
33
|
BETWEEN
|
34
|
+
CHARACTER
|
31
35
|
DIGIT
|
32
36
|
EXACTLY
|
33
37
|
FROM
|
34
38
|
LEAST
|
35
39
|
LETTER
|
40
|
+
LINE
|
41
|
+
LITERALLY
|
36
42
|
MORE
|
37
43
|
NEVER
|
44
|
+
NEW
|
45
|
+
NO
|
38
46
|
NUMBER
|
47
|
+
OF
|
39
48
|
ONCE
|
49
|
+
ONE
|
40
50
|
OPTIONAL
|
41
51
|
OR
|
52
|
+
TAB
|
42
53
|
TIMES
|
43
54
|
TO
|
44
55
|
TWICE
|
45
56
|
UPPERCASE
|
57
|
+
WHITESPACE
|
46
58
|
].map { |x| [x, x] } .to_h
|
47
59
|
|
48
60
|
class ScanError < StandardError; end
|
@@ -68,7 +80,7 @@ module SRL
|
|
68
80
|
def _next_token()
|
69
81
|
skip_whitespaces
|
70
82
|
curr_ch = scanner.peek(1)
|
71
|
-
return nil if curr_ch.nil?
|
83
|
+
return nil if curr_ch.nil? || curr_ch.empty?
|
72
84
|
|
73
85
|
token = nil
|
74
86
|
|
@@ -83,7 +95,13 @@ module SRL
|
|
83
95
|
token = build_token(@@keywords[lexeme.upcase], lexeme)
|
84
96
|
# TODO: handle the case of an unknown identifier
|
85
97
|
elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s)|$)/))
|
86
|
-
token = build_token('LETTER_LIT', lexeme)
|
98
|
+
token = build_token('LETTER_LIT', lexeme)
|
99
|
+
elsif (lexeme = scanner.scan(/"([^"]|\\")*"/)) # Double quotes literal?
|
100
|
+
unquoted = lexeme.gsub(/(^")|("$)/, '')
|
101
|
+
token = build_token('STRING_LIT', unquoted)
|
102
|
+
elsif (lexeme = scanner.scan(/'([^']|\\')*'/)) # Single quotes literal?
|
103
|
+
unquoted = lexeme.gsub(/(^')|('$)/, '')
|
104
|
+
token = build_token('STRING_LIT', unquoted)
|
87
105
|
else # Unknown token
|
88
106
|
erroneous = curr_ch.nil? ? '' : curr_ch
|
89
107
|
sequel = scanner.scan(/.{1,20}/)
|
@@ -96,7 +114,14 @@ module SRL
|
|
96
114
|
|
97
115
|
def build_token(aSymbolName, aLexeme)
|
98
116
|
token_type = name2symbol[aSymbolName]
|
99
|
-
|
117
|
+
begin
|
118
|
+
token = Rley::Lexical::Token.new(aLexeme, token_type)
|
119
|
+
rescue Exception => ex
|
120
|
+
puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
|
121
|
+
raise ex
|
122
|
+
end
|
123
|
+
|
124
|
+
return token
|
100
125
|
end
|
101
126
|
|
102
127
|
def skip_whitespaces()
|
@@ -16,7 +16,6 @@ describe 'Integration tests:' do
|
|
16
16
|
end
|
17
17
|
|
18
18
|
context 'Parsing character ranges:' do
|
19
|
-
|
20
19
|
it "should parse 'letter from ... to ...' syntax" do
|
21
20
|
result = parse('letter from a to f')
|
22
21
|
expect(result).to be_success
|
@@ -56,13 +55,41 @@ describe 'Integration tests:' do
|
|
56
55
|
regexp = regexp_repr(result)
|
57
56
|
expect(regexp.to_str).to eq('[1-4]')
|
58
57
|
end
|
58
|
+
end # context
|
59
|
+
|
60
|
+
context 'Parsing string literals:' do
|
61
|
+
it 'should parse double quotes literal string' do
|
62
|
+
result = parse('literally "hello"')
|
63
|
+
expect(result).to be_success
|
64
|
+
|
65
|
+
regexp = regexp_repr(result)
|
66
|
+
expect(regexp.to_str).to eq('hello')
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should parse single quotes literal string' do
|
70
|
+
result = parse("literally 'hello'")
|
71
|
+
expect(result).to be_success
|
72
|
+
|
73
|
+
regexp = regexp_repr(result)
|
74
|
+
expect(regexp.to_str).to eq('hello')
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'should escape special characters' do
|
78
|
+
result = parse("literally '.'")
|
79
|
+
expect(result).to be_success
|
80
|
+
|
81
|
+
regexp = regexp_repr(result)
|
82
|
+
expect(regexp.to_str).to eq('\.')
|
83
|
+
end
|
84
|
+
end
|
59
85
|
|
86
|
+
context 'Parsing character classes:' do
|
60
87
|
it "should parse 'digit' syntax" do
|
61
88
|
result = parse('digit')
|
62
89
|
expect(result).to be_success
|
63
90
|
|
64
91
|
regexp = regexp_repr(result)
|
65
|
-
expect(regexp.to_str).to eq('
|
92
|
+
expect(regexp.to_str).to eq('\d')
|
66
93
|
end
|
67
94
|
|
68
95
|
it "should parse 'number' syntax" do
|
@@ -70,9 +97,126 @@ describe 'Integration tests:' do
|
|
70
97
|
expect(result).to be_success
|
71
98
|
|
72
99
|
regexp = regexp_repr(result)
|
73
|
-
expect(regexp.to_str).to eq('
|
100
|
+
expect(regexp.to_str).to eq('\d')
|
101
|
+
end
|
102
|
+
|
103
|
+
it "should parse 'any character' syntax" do
|
104
|
+
result = parse('any character')
|
105
|
+
expect(result).to be_success
|
106
|
+
|
107
|
+
regexp = regexp_repr(result)
|
108
|
+
expect(regexp.to_str).to eq('\w')
|
109
|
+
end
|
110
|
+
|
111
|
+
it "should parse 'no character' syntax" do
|
112
|
+
result = parse('no character')
|
113
|
+
expect(result).to be_success
|
114
|
+
|
115
|
+
regexp = regexp_repr(result)
|
116
|
+
expect(regexp.to_str).to eq('\W')
|
117
|
+
end
|
118
|
+
|
119
|
+
it "should parse 'whitespace' syntax" do
|
120
|
+
result = parse('whitespace')
|
121
|
+
expect(result).to be_success
|
122
|
+
|
123
|
+
regexp = regexp_repr(result)
|
124
|
+
expect(regexp.to_str).to eq('\s')
|
125
|
+
end
|
126
|
+
|
127
|
+
it "should parse 'no whitespace' syntax" do
|
128
|
+
result = parse('no whitespace')
|
129
|
+
expect(result).to be_success
|
130
|
+
|
131
|
+
regexp = regexp_repr(result)
|
132
|
+
expect(regexp.to_str).to eq('\S')
|
74
133
|
end
|
75
134
|
|
135
|
+
it "should parse 'anything' syntax" do
|
136
|
+
result = parse('anything')
|
137
|
+
expect(result).to be_success
|
138
|
+
|
139
|
+
regexp = regexp_repr(result)
|
140
|
+
expect(regexp.to_str).to eq('.')
|
141
|
+
end
|
142
|
+
|
143
|
+
it "should parse 'one of' syntax" do
|
144
|
+
result = parse('one of "._%+-"')
|
145
|
+
expect(result).to be_success
|
146
|
+
|
147
|
+
regexp = regexp_repr(result)
|
148
|
+
# Remark: reference implementation less readable
|
149
|
+
# (escapes more characters than required)
|
150
|
+
expect(regexp.to_str).to eq('[._%+\-]')
|
151
|
+
end
|
152
|
+
end # context
|
153
|
+
|
154
|
+
|
155
|
+
context 'Parsing special character declarations:' do
|
156
|
+
it "should parse 'tab' syntax" do
|
157
|
+
result = parse('tab')
|
158
|
+
expect(result).to be_success
|
159
|
+
|
160
|
+
regexp = regexp_repr(result)
|
161
|
+
expect(regexp.to_str).to eq('\t')
|
162
|
+
end
|
163
|
+
|
164
|
+
it "should parse 'backslash' syntax" do
|
165
|
+
result = parse('backslash')
|
166
|
+
expect(result).to be_success
|
167
|
+
|
168
|
+
regexp = regexp_repr(result)
|
169
|
+
expect(regexp.to_str).to eq('\\')
|
170
|
+
end
|
171
|
+
|
172
|
+
it "should parse 'new line' syntax" do
|
173
|
+
result = parse('new line')
|
174
|
+
expect(result).to be_success
|
175
|
+
|
176
|
+
regexp = regexp_repr(result)
|
177
|
+
expect(regexp.to_str).to eq('\n')
|
178
|
+
end
|
179
|
+
end # context
|
180
|
+
|
181
|
+
context 'Parsing alternations:' do
|
182
|
+
it "should parse 'any of' syntax" do
|
183
|
+
source = 'any of (any character, one of "._%-+")'
|
184
|
+
result = parse(source)
|
185
|
+
expect(result).to be_success
|
186
|
+
|
187
|
+
regexp = regexp_repr(result)
|
188
|
+
expect(regexp.to_str).to eq('(?:\w|[._%\-+])')
|
189
|
+
end
|
190
|
+
end # context
|
191
|
+
|
192
|
+
context 'Parsing concatenation:' do
|
193
|
+
it "should reject dangling comma" do
|
194
|
+
source = 'literally "a",'
|
195
|
+
result = parse(source)
|
196
|
+
expect(result).not_to be_success
|
197
|
+
message_prefix = /Premature end of input after ','/
|
198
|
+
expect(result.failure_reason.message).to match(message_prefix)
|
199
|
+
end
|
200
|
+
|
201
|
+
it "should parse a sequence of patterns" do
|
202
|
+
#
|
203
|
+
# DEBUG When I put a comma at the end ... looping endlessly
|
204
|
+
#
|
205
|
+
source = <<-ENDS
|
206
|
+
any of (any character, one of "._%-+") once or more,
|
207
|
+
literally "@",
|
208
|
+
any of (digit, letter, one of ".-") once or more,
|
209
|
+
literally ".",
|
210
|
+
letter at least 2 times
|
211
|
+
ENDS
|
212
|
+
|
213
|
+
result = parse(source)
|
214
|
+
expect(result).to be_success
|
215
|
+
|
216
|
+
regexp = regexp_repr(result)
|
217
|
+
# SRL expect: (?:\w|[\._%\-\+])+(?:@)(?:[0-9]|[a-z]|[\.\-])+(?:\.)[a-z]{2,}
|
218
|
+
expect(regexp.to_str).to eq('(?:\w|[._%\-+])+@(?:\d|[a-z]|[.\-])+\.[a-z]{2,}')
|
219
|
+
end
|
76
220
|
end # context
|
77
221
|
|
78
222
|
context 'Parsing quantifiers:' do
|
@@ -87,19 +231,19 @@ describe 'Integration tests:' do
|
|
87
231
|
end
|
88
232
|
|
89
233
|
it "should parse 'twice' syntax" do
|
90
|
-
result = parse(
|
234
|
+
result = parse('digit twice')
|
91
235
|
expect(result).to be_success
|
92
236
|
|
93
237
|
regexp = regexp_repr(result)
|
94
|
-
expect(regexp.to_str).to eq('
|
238
|
+
expect(regexp.to_str).to eq('\d{2}')
|
95
239
|
end
|
96
240
|
|
97
241
|
it "should parse 'optional' syntax" do
|
98
|
-
result = parse(
|
242
|
+
result = parse('anything optional')
|
99
243
|
expect(result).to be_success
|
100
244
|
|
101
245
|
regexp = regexp_repr(result)
|
102
|
-
expect(regexp.to_str).to eq('
|
246
|
+
expect(regexp.to_str).to eq('.?')
|
103
247
|
end
|
104
248
|
|
105
249
|
it "should parse 'exactly ... times' syntax" do
|
@@ -121,7 +265,6 @@ describe 'Integration tests:' do
|
|
121
265
|
expect(regexp.to_str).to eq('[p-t]{2,4}')
|
122
266
|
end
|
123
267
|
|
124
|
-
|
125
268
|
it "should parse 'once or more' syntax" do
|
126
269
|
result = parse(prefix + 'once or more')
|
127
270
|
expect(result).to be_success
|