rley 0.6.09 → 0.7.00
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +13 -2
- data/examples/NLP/benchmark_pico_en.rb +4 -1
- data/examples/NLP/engtagger.rb +4 -1
- data/examples/NLP/nano_eng/nano_en_demo.rb +15 -4
- data/examples/NLP/pico_en_demo.rb +2 -17
- data/examples/data_formats/JSON/json_ast_builder.rb +2 -2
- data/examples/data_formats/JSON/json_ast_nodes.rb +18 -2
- data/examples/data_formats/JSON/json_lexer.rb +10 -4
- data/examples/general/calc_iter1/calc_lexer.rb +5 -4
- data/examples/general/calc_iter2/calc_lexer.rb +2 -1
- data/examples/general/left.rb +4 -1
- data/examples/general/right.rb +4 -1
- data/lib/rley/constants.rb +1 -1
- data/lib/rley/lexical/token.rb +14 -2
- data/lib/rley/parser/error_reason.rb +1 -1
- data/lib/rley/parser/gfg_earley_parser.rb +4 -0
- data/lib/rley/syntax/terminal.rb +6 -2
- data/lib/support/base_tokenizer.rb +197 -0
- data/spec/rley/engine_spec.rb +2 -1
- data/spec/rley/formatter/asciitree_spec.rb +2 -1
- data/spec/rley/formatter/bracket_notation_spec.rb +2 -1
- data/spec/rley/formatter/debug_spec.rb +4 -2
- data/spec/rley/formatter/json_spec.rb +2 -1
- data/spec/rley/lexical/token_spec.rb +10 -5
- data/spec/rley/parse_rep/ambiguous_parse_spec.rb +1 -1
- data/spec/rley/parse_rep/ast_builder_spec.rb +1 -1
- data/spec/rley/parse_rep/cst_builder_spec.rb +2 -2
- data/spec/rley/parse_rep/groucho_spec.rb +2 -1
- data/spec/rley/parse_rep/parse_forest_builder_spec.rb +1 -1
- data/spec/rley/parse_tree_visitor_spec.rb +2 -1
- data/spec/rley/parser/error_reason_spec.rb +6 -4
- data/spec/rley/parser/gfg_earley_parser_spec.rb +59 -57
- data/spec/rley/parser/gfg_parsing_spec.rb +1 -1
- data/spec/rley/parser/parse_tracer_spec.rb +3 -2
- data/spec/rley/sppf/token_node_spec.rb +9 -6
- data/spec/rley/support/ambiguous_grammar_helper.rb +2 -1
- data/spec/rley/support/expectation_helper.rb +1 -0
- data/spec/rley/support/grammar_ambig01_helper.rb +15 -6
- data/spec/rley/support/grammar_arr_int_helper.rb +16 -15
- data/spec/rley/support/grammar_b_expr_helper.rb +16 -7
- data/spec/rley/support/grammar_helper.rb +6 -2
- data/spec/rley/support/grammar_l0_helper.rb +12 -4
- data/spec/rley/support/grammar_pb_helper.rb +46 -21
- data/spec/support/base_tokenizer_spec.rb +77 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2b462d4c492ffb698715478492a962d65e41834c
+  data.tar.gz: 282ab2ed83d7b1ead2646c8dd98176d1753a142e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 514e4a9429b4fd1231001269cd18e96fa70d4b9145c60115f042f338e0a0063871f979ba9c04e971ee589c8e2d3919fece6b3af4499af50f03899f44318a0598
+  data.tar.gz: 870e01cb9e693c126b9fa13915dadcec72559553147e0bf220f6e39a6431c6b59c7fb1b67aa1a19d8eadf0076fc6378948fbf1f56850c88d9f99584fefb7f259
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
+### 0.7.00 / 2018-11-24
+- Version bump. Core class `Token` is changed.
+
+* [NEW] Structure `Lexical::Position` to hold the line and column position of a token.
+* [NEW] Class `BaseTokenizer`: provides basic tokenizer operations to be customized through subclassing.
+* [CHANGE] Class `Lexical::Token`: attribute `position` added.
+* [CHANGE] Method `Lexical::Token#initialize`: a third argument added for specifying the position of the token.
+* [CHANGE] Many classes and examples updated to conform to the `Token` class change.
+* [FIX] Missing methods in class `JSONPair` added.
+
 ### 0.6.09 / 2018-10-20
 * [FIXED] Method `GrmFlowGraph#traverse_df` now returns a meaningful message when the grammar uses a terminal symbol without declaring it first.
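The heart of this release shows up in the `lib/rley/lexical/token.rb` hunk further down: `Token#initialize` now takes a third `position` argument. A minimal usage sketch of the new API (the terminal name `'word'` is a placeholder; since 0.7.00 a plain String name is also accepted, as the `gfg_earley_parser.rb` hunk shows):

```ruby
require 'rley'

# Lexical::Position is a Struct(line, column) with a readable #to_s.
pos = Rley::Lexical::Position.new(3, 14)
token = Rley::Lexical::Token.new('hello', 'word', pos)
puts token.position # => line 3, column 14
```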
data/README.md
CHANGED
@@ -148,14 +148,25 @@ The subset of English grammar is based on an example from the NLTK book.

 ### Creating a tokenizer
 ```ruby
+require 'strscan'
+
 # A tokenizer reads the input string and converts it into a sequence of tokens.
 # Remark: Rley doesn't provide tokenizer functionality.
 # Highly simplified tokenizer implementation
 def tokenizer(aTextToParse)
-
+  scanner = StringScanner.new(aTextToParse)
+  tokens = []
+
+  loop do
+    scanner.skip(/\s+/)
+    curr_pos = scanner.pos
+    word = scanner.scan(/\S+/)
+    break unless word
+
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    Rley::Lexical::
+    pos = Rley::Lexical::Position.new(1, curr_pos + 1)
+    tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end

   return tokens
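One detail worth noting in the rewritten README snippet: `StringScanner#pos` is zero-based, so `curr_pos + 1` turns the scanner offset into a one-based column. A quick illustration, assuming the `Lexicon` defined earlier in the README:

```ruby
# In 'John saw Mary', the word 'saw' starts at scanner offset 5,
# so its token is reported at column 6.
tokens = tokenizer('John saw Mary')
puts tokens[1].position # => line 1, column 6
```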
data/examples/NLP/benchmark_pico_en.rb
CHANGED
@@ -63,10 +63,13 @@ Lexicon = {
 # Rley doesn't provide tokenizer functionality.
 # (Highly simplified tokenizer implementation).
 def tokenizer(aTextToParse)
+  offset = -1
   tokens = aTextToParse.scan(/\S+/).map do |word|
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    Rley::Lexical::
+    pos = Rley::Lexical::Position.new(1, offset + 1)
+    offset += word.length
+    Rley::Lexical::Token.new(word, term_name, pos)
   end

   return tokens
data/examples/NLP/engtagger.rb
CHANGED
@@ -147,10 +147,13 @@ lexicon = clean_text(text)
 tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }

 def tokenizer(lexicon, tokens)
+  pos = -1
   rley_tokens = []
   lexicon.each_with_index do |word, i|
     term_name = tokens[i].last
-
+    rank = Rley::Lexical::Position.new(1, pos + 1)
+    pos += word.length + 1 # Assuming one space between words.
+    rley_tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end
   return rley_tokens
 end
data/examples/NLP/nano_eng/nano_en_demo.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'strscan'
 require 'rley' # Load Rley library

 ########################################
@@ -67,16 +68,26 @@ Lexicon = {
 # Step 4. Creating a tokenizer
 # A tokenizer reads the input string and converts it into a sequence of tokens
 # Highly simplified tokenizer implementation.
-def tokenizer(aTextToParse)
-
+def tokenizer(aTextToParse)
+  scanner = StringScanner.new(aTextToParse)
+  tokens = []
+
+  loop do
+    scanner.skip(/\s+/)
+    curr_pos = scanner.pos
+    word = scanner.scan(/\S+/)
+    break unless word
+
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    Rley::Lexical::
+    pos = Rley::Lexical::Position.new(1, curr_pos + 1)
+    tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end

-  return tokens
+  return tokens
 end

+
 ########################################
 # Step 5. Parsing the input
 input_to_parse = 'John saw Mary'
data/examples/NLP/pico_en_demo.rb
CHANGED
@@ -56,21 +56,6 @@ Lexicon = {
   'with' => 'Preposition'
 }.freeze

-Position = Struct.new(:line, :column) do
-  def to_s()
-    "line #{line}, column #{column}"
-  end
-end
-
-class NLPToken < Rley::Lexical::Token
-  attr_reader(:position)
-
-  def initialize(theLexeme, aTerminal, aPosition)
-    super(theLexeme, aTerminal)
-    @position = aPosition
-  end
-end
-
 ########################################
 # Step 4. Create a tokenizer
 # A tokenizer reads the input string and converts it into a sequence of tokens.
@@ -88,8 +73,8 @@ def tokenizer(aTextToParse)

     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    pos = Position.new(1, curr_pos + 1)
-    tokens <<
+    pos = Rley::Lexical::Position.new(1, curr_pos + 1)
+    tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end

   return tokens
data/examples/data_formats/JSON/json_ast_builder.rb
CHANGED
@@ -7,7 +7,7 @@ require_relative 'json_ast_nodes'
 # The Builder pattern creates a complex object
 # (say, a parse tree) from simpler objects (terminal and non-terminal
 # nodes) and using a step by step approach.
-class JSONASTBuilder < Rley::ParseRep::ParseTreeBuilder
+class JSONASTBuilder < Rley::ParseRep::ASTBaseBuilder
   Terminal2NodeClass = {
     'false' => JSONBooleanNode,
     'true' => JSONBooleanNode,
@@ -63,7 +63,7 @@ class JSONASTBuilder < Rley::ParseRep::ParseTreeBuilder
     return JSONPair.new(theChildren[0], theChildren[2], aProduction.lhs)
   end

-  # rule '
+  # rule 'array' => %w[begin-array array-items end-array]
   def reduce_array_0(aProduction, _range, _tokens, theChildren)
     second_child = theChildren[1]
     second_child.symbol = aProduction.lhs
data/examples/data_formats/JSON/json_ast_nodes.rb
CHANGED
@@ -27,6 +27,9 @@ JSONTerminalNode = Struct.new(:token, :value, :position) do
   def accept(aVisitor)
     aVisitor.visit_terminal(self)
   end
+
+  def done!
+  end
 end


@@ -71,6 +74,9 @@ class JSONCompositeNode
   def accept(aVisitor)
     aVisitor.visit_nonterminal(self)
   end
+
+  def done!
+  end

   alias subnodes children
 end # class
@@ -96,7 +102,7 @@ end # class
 class JSONPair
   attr_reader(:name)
   attr_reader(:value)
-
+  attr_accessor(:symbol)

   def initialize(aName, aValue, aSymbol)
     @name = aName
@@ -115,6 +121,16 @@ class JSONPair
   def accept(aVisitor)
     aVisitor.visit_nonterminal(self)
   end
+
+  def done!
+  end
+
+  def to_ruby
+    rep = {}
+    rep[name.to_ruby] = value.to_ruby
+
+    return rep
+  end
 end # class

 class JSONObjectNode < JSONCompositeNode
@@ -123,7 +139,7 @@ class JSONObjectNode < JSONCompositeNode
 end

 # Convert this tree node in a simpler Ruby representation.
-# Basically a JSON object corresponds to a
+# Basically a JSON object corresponds to a Ruby Hash
 def to_ruby()
   rep = {}
   members.each do |pair|
data/examples/data_formats/JSON/json_lexer.rb
CHANGED
@@ -23,6 +23,7 @@ class JSONLexer
   def initialize(source)
     @scanner = StringScanner.new(source)
     @lineno = 1
+    @line_start = 0
   end

   def tokens()
@@ -48,7 +49,7 @@ class JSONLexer
     case curr_ch
       when '{', '}', '[', ']', ',', ':'
         token_type = @@lexeme2name[curr_ch]
-        token =
+        token = build_token(curr_ch, token_type)

       when /[ftn]/ # First letter of keywords
         @scanner.pos = scanner.pos - 1 # Simulate putback
@@ -57,7 +58,7 @@ class JSONLexer
           invalid_keyw = scanner.scan(/\w+/)
           raise ScanError.new("Invalid keyword: #{invalid_keyw}")
         else
-          token =
+          token = build_token(keyw, keyw)
         end

       # LITERALS
@@ -66,12 +67,12 @@ class JSONLexer
         end_delimiter = scanner.getch
         err_msg = 'No closing quotes (") found'
         raise ScanError.new(err_msg) if end_delimiter.nil?
-        token =
+        token = build_token(value, 'string')

       when /[-0-9]/ # Start character of number literal found
         @scanner.pos = scanner.pos - 1 # Simulate putback
         value = scanner.scan(/-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9])?/)
-        token =
+        token = build_token(value, 'number')

       else # Unknown token
         erroneous = curr_ch.nil? ? '' : curr_ch
@@ -84,6 +85,11 @@ class JSONLexer

     return token
   end
+
+  def build_token(lexeme, token)
+    pos = Rley::Lexical::Position.new(lineno, scanner.pos - line_start)
+    Rley::Lexical::Token.new(lexeme, token, pos)
+  end

   def skip_whitespaces()
     matched = scanner.scan(/[ \t\f\n\r]+/)
data/examples/general/calc_iter1/calc_lexer.rb
CHANGED
@@ -42,13 +42,13 @@ class CalcLexer
     skip_whitespaces
     curr_ch = scanner.peek(1)
     return nil if curr_ch.nil?
-
+
     token = nil

     if '()+/'.include? curr_ch
       # Single character token
       token = build_token(@@lexeme2name[curr_ch], scanner.getch)
-
+
     elsif (lexeme = scanner.scan(/\*\*/))
       token = build_token(@@lexeme2name[lexeme], lexeme)
     elsif (lexeme = scanner.scan(/\*/))
@@ -66,9 +66,10 @@ class CalcLexer

     return token
   end
-
+
   def build_token(aSymbolName, aLexeme)
-
+    pos = Rley::Lexical::Position.new(1, scanner.pos)
+    return Rley::Lexical::Token.new(aLexeme, aSymbolName, pos)
   end

   def skip_whitespaces()
data/examples/general/calc_iter2/calc_lexer.rb
CHANGED
@@ -75,7 +75,8 @@ class CalcLexer
   end

   def build_token(aSymbolName, aLexeme)
-
+    pos = Rley::Lexical::Position.new(1, scanner.pos)
+    return Rley::Lexical::Token.new(aLexeme, aSymbolName, pos)
   end

   def skip_whitespaces()
data/examples/general/left.rb
CHANGED
@@ -17,9 +17,12 @@ grammar = builder.grammar

 # Highly simplified tokenizer implementation.
 def tokenizer(aText, aGrammar)
+  index = 0
   tokens = aText.scan(/\./).map do |dot|
     terminal = aGrammar.name2symbol['DOT']
-
+    index += 1
+    pos = Rley::Lexical::Position.new(1, index)
+    Rley::Lexical::Token.new(dot, terminal, pos)
   end

   return tokens
data/examples/general/right.rb
CHANGED
@@ -17,9 +17,12 @@ grammar = builder.grammar

 # Highly simplified tokenizer implementation.
 def tokenizer(aText, aGrammar)
+  index = 0
   tokens = aText.scan(/\./).map do |dot|
     terminal = aGrammar.name2symbol['DOT']
-
+    index += 1
+    pos = Rley::Lexical::Position.new(1, index)
+    Rley::Lexical::Token.new(dot, terminal, pos)
   end

   return tokens
data/lib/rley/constants.rb
CHANGED
data/lib/rley/lexical/token.rb
CHANGED
@@ -1,5 +1,13 @@
 module Rley # This module is used as a namespace
   module Lexical # This module is used as a namespace
+    # A Position is the location of a lexeme within a source file.
+    Position = Struct.new(:line, :column) do
+      def to_s
+        "line #{line}, column #{column}"
+      end
+    end
+
+
     # In Rley, a (lexical) token is an object created by a lexer (tokenizer)
     # and passed to the parser. Such token an object is created when a lexer
     # detects that a sequence of characters(a lexeme) from the input stream
@@ -17,15 +25,19 @@ module Rley # This module is used as a namespace

     # @return [Syntax::Terminal] Terminal symbol corresponding to the lexeme.
     attr_reader(:terminal)
+
+    # @return [Position] The position of the lexeme in the source file.
+    attr_reader(:position)

     # Constructor.
     # @param theLexeme [String] the lexeme (= piece of text from input)
-    # @param aTerminal [Syntax::Terminal]
+    # @param aTerminal [Syntax::Terminal, String]
     #   The terminal symbol corresponding to the lexeme.
-    def initialize(theLexeme, aTerminal)
+    def initialize(theLexeme, aTerminal, aPosition)
       raise 'Internal error: nil terminal symbol detected' if aTerminal.nil?
       @lexeme = theLexeme
       @terminal = aTerminal
+      @position = aPosition
     end
   end # class
 end # module
data/lib/rley/parser/error_reason.rb
CHANGED
@@ -84,7 +84,7 @@ module Rley # Module used as a namespace
       err_msg = "Syntax error at or near token #{position} "
       err_msg << ">>>#{last_token.lexeme}<<<\n"
       err_msg << expectations
-      err_msg << ", found a '#{last_token.terminal
+      err_msg << ", found a '#{last_token.terminal}' instead."

       return err_msg
     end
data/lib/rley/parser/gfg_earley_parser.rb
CHANGED
@@ -30,6 +30,10 @@ module Rley # This module is used as a namespace

       aTokenSequence.each_with_index do |token, i|
         parse_for_token(result, i)
+        if token.terminal.kind_of?(String)
+          symb = grammar.name2symbol[token.terminal]
+          token.instance_variable_set(:@terminal, symb)
+        end
         scan_success = scan_rule(result, i, token)
         break unless scan_success
       end
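The hunk above is the compatibility half of the `Token` change: a token may now carry its terminal as a plain String name, and `GFGEarleyParser` swaps in the grammar's `Terminal` object (via `grammar.name2symbol`) just before the scan step. A minimal sketch of what this permits (the `'integer'` terminal name and the `parser` variable are assumptions):

```ruby
pos = Rley::Lexical::Position.new(1, 1)
# The lexer no longer needs access to the grammar's symbol table:
token = Rley::Lexical::Token.new('42', 'integer', pos)
# parser.parse([token]) # 'integer' resolved via grammar.name2symbol['integer']
```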
data/lib/rley/syntax/terminal.rb
CHANGED
@@ -14,16 +14,20 @@ module Rley # This module is used as a namespace
       end

       # Return true iff the symbol is a terminal
-      def terminal?
+      def terminal?
         return true
       end

       # @return [false] Return true if the symbol derives
       # the empty string. As terminal symbol corresponds to a input token
       # it is by definition non-nullable.
-      def nullable?
+      def nullable?
         false
       end
+
+      def to_s
+        name
+      end
     end # class
   end # module
 end # module
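With `Terminal#to_s` returning the symbol's name, string interpolation such as the `error_reason.rb` message above now prints the terminal's grammar name instead of a default object dump. A small illustration (constructing a `Terminal` directly like this is an assumption, for demonstration only):

```ruby
term = Rley::Syntax::Terminal.new('NOUN')
puts "found a '#{term}' instead." # => found a 'NOUN' instead.
```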
data/lib/support/base_tokenizer.rb
ADDED
@@ -0,0 +1,197 @@
+require 'strscan'
+require_relative '../rley/lexical/token'
+
+class BaseTokenizer
+  attr_reader(:scanner)
+  attr_reader(:lineno)
+  attr_reader(:line_start)
+
+  class ScanError < StandardError; end
+
+  # Constructor. Initialize a tokenizer for Skeem.
+  # @param source [String] Skeem text to tokenize.
+  def initialize(source)
+    @scanner = StringScanner.new('')
+    restart(source)
+  end
+
+  # @param source [String] Skeem text to tokenize.
+  def restart(source)
+    @scanner.string = source
+    @lineno = 1
+    @line_start = 0
+  end
+
+  # @return [Array<SkmToken>] | Returns a sequence of tokens
+  def tokens
+    tok_sequence = []
+    until @scanner.eos?
+      token = _next_token
+      tok_sequence << token unless token.nil?
+    end
+
+    return tok_sequence
+  end
+
+  protected
+
+  # Patterns:
+  # Unambiguous single character
+  # Conditional single character (e.g. '+' operator, '+' prefix for positive numbers)
+  def _next_token
+    skip_whitespaces
+    curr_ch = scanner.peek(1)
+    return nil if curr_ch.nil? || curr_ch.empty?
+
+    token = recognize_token()
+    if token.nil? # Unknown token
+      curr_ch = scanner.peek(1)
+      erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
+      sequel = scanner.scan(/.{1,20}/)
+      erroneous += sequel unless sequel.nil?
+      raise ScanError, "Unknown token #{erroneous} on line #{lineno}"
+    end
+
+    return token
+  end
+
+  def recognize_token()
+=begin
+    if "()'`".include? curr_ch # Single characters
+      # Delimiters, separators => single character token
+      token = build_token(@@lexeme2name[curr_ch], scanner.getch)
+    elsif (lexeme = scanner.scan(/(?:\.)(?=\s)/)) # Single char occurring alone
+      token = build_token('PERIOD', lexeme)
+    elsif (lexeme = scanner.scan(/,@?/))
+      token = build_token(@@lexeme2name[lexeme], lexeme)
+    elsif (lexeme = scanner.scan(/#(?:(?:true)|(?:false)|(?:u8)|[\\\(tfeiodx]|(?:\d+[=#]))/))
+      token = cardinal_token(lexeme)
+    elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?=\s|[|()";]|$)/))
+      token = build_token('INTEGER', lexeme) # Decimal radix
+    elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?:\.[0-9]+)?(?:(?:e|E)[+-]?[0-9]+)?/))
+      # Order dependency: must be tested after INTEGER case
+      token = build_token('REAL', lexeme)
+    elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
+      token = build_token('STRING_LIT', lexeme)
+    elsif (lexeme = scanner.scan(/[a-zA-Z!$%&*\/:<=>?@^_~][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
+      keyw = @@keywords[lexeme.upcase]
+      tok_type = keyw ? keyw : 'IDENTIFIER'
+      token = build_token(tok_type, lexeme)
+    elsif (lexeme = scanner.scan(/\|(?:[^|])*\|/)) # Vertical bar delimited
+      token = build_token('IDENTIFIER', lexeme)
+    elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
+      # R7RS peculiar identifiers case 1: isolated plus and minus as identifiers
+      token = build_token('IDENTIFIER', lexeme)
+    elsif (lexeme = scanner.scan(/[+-][a-zA-Z!$%&*\/:<=>?@^_~+-@][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
+      # R7RS peculiar identifiers case 2
+      token = build_token('IDENTIFIER', lexeme)
+    elsif (lexeme = scanner.scan(/\.[a-zA-Z!$%&*\/:<=>?@^_~+-@.][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
+      # R7RS peculiar identifiers case 4
+      token = build_token('IDENTIFIER', lexeme)
+=end
+  end
+
+  def build_token(aSymbolName, aLexeme, aFormat = :default)
+    begin
+      value = convert_to(aLexeme, aSymbolName, aFormat)
+      col = scanner.pos - aLexeme.size - @line_start + 1
+      pos = Rley::Lexical::Position.new(@lineno, col)
+      token = Rley::Lexical::Token.new(value, aSymbolName, pos)
+    rescue StandardError => exc
+      puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
+      raise exc
+    end
+
+    return token
+  end
+
+  def convert_to(aLexeme, aSymbolName, aFormat)
+    return aLexeme
+  end
+
+  def skip_whitespaces
+    pre_pos = scanner.pos
+
+    loop do
+      ws_found = false
+      cmt_found = false
+      found = scanner.skip(/[ \t\f]+/)
+      ws_found = true if found
+      found = scanner.skip(/(?:\r\n)|\r|\n/)
+      if found
+        ws_found = true
+        next_line
+      end
+      # next_ch = scanner.peek(1)
+      # if next_ch == ';'
+      #   cmt_found = true
+      #   scanner.skip(/;[^\r\n]*(?:(?:\r\n)|\r|\n)?/)
+      #   next_line
+      # end
+      break unless ws_found or cmt_found
+    end
+
+    curr_pos = scanner.pos
+    return if curr_pos == pre_pos
+  end
+
+  def next_line
+    @lineno += 1
+    @line_start = scanner.pos
+  end
+end # class
+=begin
+require 'base_tokenizer'
+
+class PB_Tokenizer < BaseTokenizer
+  @@lexeme2name = {
+    '(' => 'LPAREN',
+    ')' => 'RPAREN',
+    '+' => 'PLUS',
+  }.freeze
+
+  protected
+
+  def recognize_token()
+    token = nil
+    curr_ch = scanner.peek(1)
+
+    if '()'.include? curr_ch # Single characters
+      # Delimiters, separators => single character token
+      token = build_token(@@lexeme2name[curr_ch], scanner.getch)
+    elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # Single char occurring alone
+      token = build_token(@@lexeme2name[lexeme], lexeme)
+    elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
+      token = build_token('INTEGER', lexeme)
+    end
+  end
+end # class
+
+# Basic tokenizer
+# @return [Array<Rley::Lexical::Token>]
+def tokenize(aText)
+  tokenizer = PB_Tokenizer.new(aText)
+  tokenizer.token
+end
+
+=end
+=begin
+# Basic expression tokenizer
+def tokenize(aText)
+  tokens = aText.scan(/\S+/).map do |lexeme|
+    case lexeme
+    when '+', '(', ')'
+      terminal = @grammar.name2symbol[lexeme]
+    when /^[-+]?\d+$/
+      terminal = @grammar.name2symbol['int']
+    else
+      msg = "Unknown input text '#{lexeme}'"
+      raise StandardError, msg
+    end
+    pos = Rley::Lexical::Position.new(1, 4) # Dummy position
+    Rley::Lexical::Token.new(lexeme, terminal, pos)
+  end
+
+  return tokens
+end
+=end
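`BaseTokenizer` is meant to be subclassed: override the protected `recognize_token` and emit tokens with `build_token`, which computes the line/column `Position` for you. A working sketch modeled on the commented-out `PB_Tokenizer` above (the require path and the `'INTEGER'` terminal name are assumptions; they must match your file layout and grammar):

```ruby
require_relative 'base_tokenizer' # path is an assumption

class IntegerTokenizer < BaseTokenizer
  protected

  # Return a token for the lexeme at the current scanner position, or nil.
  def recognize_token
    if (lexeme = scanner.scan(/[+-]?[0-9]+/))
      build_token('INTEGER', lexeme) # 'INTEGER' must name a grammar terminal
    end
  end
end

tokens = IntegerTokenizer.new('12 34').tokens
tokens.map { |t| t.position.to_s } # => ["line 1, column 1", "line 1, column 4"]
```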