rley 0.6.09 → 0.7.00
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +13 -2
- data/examples/NLP/benchmark_pico_en.rb +4 -1
- data/examples/NLP/engtagger.rb +4 -1
- data/examples/NLP/nano_eng/nano_en_demo.rb +15 -4
- data/examples/NLP/pico_en_demo.rb +2 -17
- data/examples/data_formats/JSON/json_ast_builder.rb +2 -2
- data/examples/data_formats/JSON/json_ast_nodes.rb +18 -2
- data/examples/data_formats/JSON/json_lexer.rb +10 -4
- data/examples/general/calc_iter1/calc_lexer.rb +5 -4
- data/examples/general/calc_iter2/calc_lexer.rb +2 -1
- data/examples/general/left.rb +4 -1
- data/examples/general/right.rb +4 -1
- data/lib/rley/constants.rb +1 -1
- data/lib/rley/lexical/token.rb +14 -2
- data/lib/rley/parser/error_reason.rb +1 -1
- data/lib/rley/parser/gfg_earley_parser.rb +4 -0
- data/lib/rley/syntax/terminal.rb +6 -2
- data/lib/support/base_tokenizer.rb +197 -0
- data/spec/rley/engine_spec.rb +2 -1
- data/spec/rley/formatter/asciitree_spec.rb +2 -1
- data/spec/rley/formatter/bracket_notation_spec.rb +2 -1
- data/spec/rley/formatter/debug_spec.rb +4 -2
- data/spec/rley/formatter/json_spec.rb +2 -1
- data/spec/rley/lexical/token_spec.rb +10 -5
- data/spec/rley/parse_rep/ambiguous_parse_spec.rb +1 -1
- data/spec/rley/parse_rep/ast_builder_spec.rb +1 -1
- data/spec/rley/parse_rep/cst_builder_spec.rb +2 -2
- data/spec/rley/parse_rep/groucho_spec.rb +2 -1
- data/spec/rley/parse_rep/parse_forest_builder_spec.rb +1 -1
- data/spec/rley/parse_tree_visitor_spec.rb +2 -1
- data/spec/rley/parser/error_reason_spec.rb +6 -4
- data/spec/rley/parser/gfg_earley_parser_spec.rb +59 -57
- data/spec/rley/parser/gfg_parsing_spec.rb +1 -1
- data/spec/rley/parser/parse_tracer_spec.rb +3 -2
- data/spec/rley/sppf/token_node_spec.rb +9 -6
- data/spec/rley/support/ambiguous_grammar_helper.rb +2 -1
- data/spec/rley/support/expectation_helper.rb +1 -0
- data/spec/rley/support/grammar_ambig01_helper.rb +15 -6
- data/spec/rley/support/grammar_arr_int_helper.rb +16 -15
- data/spec/rley/support/grammar_b_expr_helper.rb +16 -7
- data/spec/rley/support/grammar_helper.rb +6 -2
- data/spec/rley/support/grammar_l0_helper.rb +12 -4
- data/spec/rley/support/grammar_pb_helper.rb +46 -21
- data/spec/support/base_tokenizer_spec.rb +77 -0
- metadata +5 -2
@@ -29,7 +29,8 @@ module AmbiguousGrammarHelper
|
|
29
29
|
msg = "Unknown input text '#{lexeme}'"
|
30
30
|
raise StandardError, msg
|
31
31
|
end
|
32
|
-
Rley::Lexical::
|
32
|
+
pos = Rley::Lexical::Position.new(3, 4) # dummy pos
|
33
|
+
Rley::Lexical::Token.new(lexeme, terminal, pos)
|
33
34
|
end
|
34
35
|
|
35
36
|
return tokens
|
@@ -7,6 +7,7 @@ module ExpectationHelper
|
|
7
7
|
# Helper method. Compare the data from all the parse entries
|
8
8
|
# of a given ParseEntrySet with an array of expectation strings.
|
9
9
|
def compare_entry_texts(anEntrySet, expectations)
|
10
|
+
raise StandardError, "Nil entry set" if anEntrySet.nil?
|
10
11
|
(0...expectations.size).each do |i|
|
11
12
|
expect(anEntrySet.entries[i].to_s).to eq(expectations[i])
|
12
13
|
end
|
@@ -21,19 +21,28 @@ module GrammarAmbig01Helper
|
|
21
21
|
end
|
22
22
|
|
23
23
|
# Highly simplified tokenizer implementation.
|
24
|
-
def tokenizer_ambig01(aText
|
25
|
-
|
24
|
+
def tokenizer_ambig01(aText)
|
25
|
+
scanner = StringScanner.new(aText)
|
26
|
+
tokens = []
|
27
|
+
|
28
|
+
loop do
|
29
|
+
scanner.skip(/\s+/)
|
30
|
+
curr_pos = scanner.pos
|
31
|
+
lexeme = scanner.scan(/\S+/)
|
32
|
+
break unless lexeme
|
26
33
|
case lexeme
|
27
34
|
when '+', '*'
|
28
|
-
terminal =
|
35
|
+
terminal = lexeme
|
29
36
|
when /^[-+]?\d+$/
|
30
|
-
terminal =
|
37
|
+
terminal = 'integer'
|
31
38
|
else
|
32
39
|
msg = "Unknown input text '#{lexeme}'"
|
33
40
|
raise StandardError, msg
|
34
41
|
end
|
35
|
-
|
36
|
-
|
42
|
+
|
43
|
+
pos = Rley::Lexical::Position.new(1, curr_pos + 1)
|
44
|
+
tokens << Rley::Lexical::Token.new(lexeme, terminal, pos)
|
45
|
+
end
|
37
46
|
|
38
47
|
return tokens
|
39
48
|
end
|
@@ -23,29 +23,30 @@ module GrammarArrIntHelper
|
|
23
23
|
end
|
24
24
|
|
25
25
|
# Basic tokenizer for array of integers
|
26
|
-
def arr_int_tokenizer(aText
|
27
|
-
tokens = []
|
26
|
+
def arr_int_tokenizer(aText)
|
28
27
|
scanner = StringScanner.new(aText)
|
29
|
-
|
30
|
-
|
28
|
+
tokens = []
|
29
|
+
|
30
|
+
loop do
|
31
31
|
scanner.skip(/\s+/)
|
32
|
-
|
33
|
-
if
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
lexeme = scanner.scan(
|
39
|
-
|
40
|
-
terminal = aGrammar.name2symbol['integer']
|
41
|
-
tokens << Rley::Lexical::Token.new(lexeme, terminal)
|
32
|
+
curr_ch = scanner.peek(1)
|
33
|
+
break if curr_ch.nil? || curr_ch.empty?
|
34
|
+
curr_pos = scanner.pos
|
35
|
+
|
36
|
+
if (lexeme = scanner.scan(/[\[\],]/))
|
37
|
+
terminal = lexeme
|
38
|
+
elsif (lexeme = scanner.scan(/[-+]?\d+/))
|
39
|
+
terminal = 'integer'
|
42
40
|
else
|
43
41
|
msg = "Unknown input text '#{lexeme}'"
|
44
42
|
raise StandardError, msg
|
45
43
|
end
|
44
|
+
|
45
|
+
pos = Rley::Lexical::Position.new(1, curr_pos + 1)
|
46
|
+
tokens << Rley::Lexical::Token.new(lexeme, terminal, pos)
|
46
47
|
end
|
47
48
|
|
48
|
-
return tokens
|
49
|
+
return tokens
|
49
50
|
end
|
50
51
|
end # module
|
51
52
|
# End of file
|
@@ -21,21 +21,30 @@ module GrammarBExprHelper
|
|
21
21
|
end
|
22
22
|
|
23
23
|
# Basic expression tokenizer
|
24
|
-
def expr_tokenizer(aText
|
25
|
-
|
24
|
+
def expr_tokenizer(aText)
|
25
|
+
scanner = StringScanner.new(aText)
|
26
|
+
tokens = []
|
27
|
+
|
28
|
+
loop do
|
29
|
+
scanner.skip(/\s+/)
|
30
|
+
curr_pos = scanner.pos
|
31
|
+
lexeme = scanner.scan(/\S+/)
|
32
|
+
break unless lexeme
|
26
33
|
case lexeme
|
27
34
|
when '+', '*'
|
28
|
-
terminal =
|
35
|
+
terminal = lexeme
|
29
36
|
when /^[-+]?\d+$/
|
30
|
-
terminal =
|
37
|
+
terminal = 'integer'
|
31
38
|
else
|
32
39
|
msg = "Unknown input text '#{lexeme}'"
|
33
40
|
raise StandardError, msg
|
34
41
|
end
|
35
|
-
Rley::Lexical::Token.new(lexeme, terminal)
|
36
|
-
end
|
37
42
|
|
38
|
-
|
43
|
+
pos = Rley::Lexical::Position.new(1, curr_pos + 1)
|
44
|
+
tokens << Rley::Lexical::Token.new(lexeme, terminal, pos)
|
45
|
+
end
|
46
|
+
|
47
|
+
return tokens
|
39
48
|
end
|
40
49
|
end # module
|
41
50
|
# End of file
|
@@ -8,19 +8,23 @@ module GrammarHelper
|
|
8
8
|
# Synopsis:
|
9
9
|
# build_token_sequence(%w(a a b c c), grm1)
|
10
10
|
def build_token_sequence(literals, aGrammar)
|
11
|
+
col = 1
|
11
12
|
tokens = literals.map do |lexeme|
|
13
|
+
pos = Rley::Lexical::Position.new(1, col)
|
12
14
|
case lexeme
|
13
15
|
when String
|
14
16
|
terminal = aGrammar.name2symbol[lexeme]
|
15
|
-
Rley::Lexical::Token.new(lexeme, terminal)
|
17
|
+
token = Rley::Lexical::Token.new(lexeme, terminal, pos)
|
16
18
|
|
17
19
|
when Hash # lexeme is reality a Hash: literal => terminal name
|
18
20
|
sub_array = lexeme.to_a
|
19
21
|
sub_array.map do |(literal, name)|
|
20
22
|
terminal = aGrammar.name2symbol[name]
|
21
|
-
Rley::Lexical::Token.new(literal, terminal)
|
23
|
+
token = Rley::Lexical::Token.new(literal, terminal, pos)
|
22
24
|
end
|
23
25
|
end
|
26
|
+
col += lexeme.length + 1
|
27
|
+
token
|
24
28
|
end
|
25
29
|
|
26
30
|
return tokens.flatten
|
@@ -66,14 +66,22 @@ module GrammarL0Helper
|
|
66
66
|
end
|
67
67
|
|
68
68
|
# Highly simplified tokenizer implementation.
|
69
|
-
def tokenizer_l0(aText
|
70
|
-
|
69
|
+
def tokenizer_l0(aText)
|
70
|
+
scanner = StringScanner.new(aText)
|
71
|
+
tokens = []
|
72
|
+
|
73
|
+
loop do
|
74
|
+
scanner.skip(/\s+/)
|
75
|
+
curr_pos = scanner.pos
|
76
|
+
word = scanner.scan(/\S+/)
|
77
|
+
break unless word
|
78
|
+
|
71
79
|
term_name = lexicon_l0[word]
|
72
80
|
if term_name.nil?
|
73
81
|
raise StandardError, "Word '#{word}' not found in lexicon"
|
74
82
|
end
|
75
|
-
|
76
|
-
Rley::Lexical::Token.new(word,
|
83
|
+
pos = Rley::Lexical::Position.new(1, curr_pos + 1)
|
84
|
+
tokens << Rley::Lexical::Token.new(word, term_name, pos)
|
77
85
|
end
|
78
86
|
|
79
87
|
return tokens
|
@@ -1,5 +1,6 @@
|
|
1
1
|
# Load the builder class
|
2
2
|
require_relative '../../../lib/rley/syntax/grammar_builder'
|
3
|
+
require_relative '../../../lib/support/base_tokenizer'
|
3
4
|
require_relative '../../../lib/rley/lexical/token'
|
4
5
|
|
5
6
|
|
@@ -12,36 +13,60 @@ class GrammarPBHelper
|
|
12
13
|
def grammar()
|
13
14
|
@grammar ||= begin
|
14
15
|
builder = Rley::Syntax::GrammarBuilder.new do
|
15
|
-
|
16
|
-
t_plus = Rley::Syntax::VerbatimSymbol.new('+')
|
17
|
-
t_lparen = Rley::Syntax::VerbatimSymbol.new('(')
|
18
|
-
t_rparen = Rley::Syntax::VerbatimSymbol.new(')')
|
19
|
-
add_terminals(t_int, t_plus, t_lparen, t_rparen)
|
16
|
+
add_terminals('int', '+', '(', ')')
|
20
17
|
rule 'S' => 'E'
|
21
18
|
rule 'E' => 'int'
|
22
|
-
rule 'E' =>
|
23
|
-
rule 'E' =>
|
19
|
+
rule 'E' => '( E + E )'
|
20
|
+
rule 'E' => 'E + E'
|
24
21
|
end
|
25
22
|
builder.grammar
|
26
23
|
end
|
27
24
|
end
|
28
25
|
|
29
|
-
# Basic expression tokenizer
|
30
|
-
def tokenize(aText)
|
31
|
-
tokens = aText.scan(/\S+/).map do |lexeme|
|
32
|
-
case lexeme
|
33
|
-
when '+', '(', ')'
|
34
|
-
terminal = @grammar.name2symbol[lexeme]
|
35
|
-
when /^[-+]?\d+$/
|
36
|
-
terminal = @grammar.name2symbol['int']
|
37
|
-
else
|
38
|
-
msg = "Unknown input text '#{lexeme}'"
|
39
|
-
raise StandardError, msg
|
26
|
+
# # Basic expression tokenizer
|
27
|
+
# def tokenize(aText)
|
28
|
+
# tokens = aText.scan(/\S+/).map do |lexeme|
|
29
|
+
# case lexeme
|
30
|
+
# when '+', '(', ')'
|
31
|
+
# terminal = @grammar.name2symbol[lexeme]
|
32
|
+
# when /^[-+]?\d+$/
|
33
|
+
# terminal = @grammar.name2symbol['int']
|
34
|
+
# else
|
35
|
+
# msg = "Unknown input text '#{lexeme}'"
|
36
|
+
# raise StandardError, msg
|
37
|
+
# end
|
38
|
+
# pos = Rley::Lexical::Position.new(1, 4) # Dummy position
|
39
|
+
# Rley::Lexical::Token.new(lexeme, terminal, pos)
|
40
|
+
# end
|
41
|
+
|
42
|
+
# return tokens
|
43
|
+
# end
|
44
|
+
|
45
|
+
|
46
|
+
class PB_Tokenizer < BaseTokenizer
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
def recognize_token()
|
51
|
+
token = nil
|
52
|
+
|
53
|
+
if (lexeme = scanner.scan(/[\(\)]/)) # Single characters
|
54
|
+
# Delimiters, separators => single character token
|
55
|
+
token = build_token(lexeme, lexeme)
|
56
|
+
elsif (lexeme = scanner.scan(/(?:\+)(?=\s|$)/)) # Single char occurring alone
|
57
|
+
token = build_token(lexeme, lexeme)
|
58
|
+
elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
|
59
|
+
token = build_token('int', lexeme)
|
40
60
|
end
|
41
|
-
Rley::Lexical::Token.new(lexeme, terminal)
|
42
61
|
end
|
62
|
+
end # class
|
43
63
|
|
44
|
-
|
64
|
+
# Basic tokenizer
|
65
|
+
# @return [Array<Rley::Lexical::Token>]
|
66
|
+
def tokenize(aText)
|
67
|
+
tokenizer = PB_Tokenizer.new(aText)
|
68
|
+
tokenizer.tokens
|
45
69
|
end
|
46
|
-
|
70
|
+
|
71
|
+
end # class
|
47
72
|
# End of file
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
# Load the class under test
|
4
|
+
require_relative '../../lib/support/base_tokenizer'
|
5
|
+
|
6
|
+
describe BaseTokenizer do
|
7
|
+
let(:sample_input) { '7 + (8 + 9)' }
|
8
|
+
context 'Standard creation & initialization:' do
|
9
|
+
subject { BaseTokenizer.new(sample_input) }
|
10
|
+
|
11
|
+
it 'should be initialized with a text argument' do
|
12
|
+
expect { BaseTokenizer.new(sample_input) }.not_to raise_error
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should have a scanner initialized' do
|
16
|
+
expect(subject.scanner).to be_kind_of(StringScanner)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should have line number initialized' do
|
20
|
+
expect(subject.lineno).to eq(1)
|
21
|
+
end
|
22
|
+
end # context
|
23
|
+
|
24
|
+
|
25
|
+
context 'Provided services:' do
|
26
|
+
class PB_Tokenizer < BaseTokenizer
|
27
|
+
@@lexeme2name = {
|
28
|
+
'(' => 'LPAREN',
|
29
|
+
')' => 'RPAREN',
|
30
|
+
'+' => 'PLUS',
|
31
|
+
}.freeze
|
32
|
+
|
33
|
+
protected
|
34
|
+
|
35
|
+
def recognize_token()
|
36
|
+
token = nil
|
37
|
+
|
38
|
+
if (lexeme = scanner.scan(/[\(\)]/)) # Single characters
|
39
|
+
# Delimiters, separators => single character token
|
40
|
+
token = build_token(@@lexeme2name[lexeme], lexeme)
|
41
|
+
elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # Single char occurring alone
|
42
|
+
token = build_token(@@lexeme2name[lexeme], lexeme)
|
43
|
+
elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
|
44
|
+
token = build_token('int', lexeme)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end # class
|
48
|
+
|
49
|
+
# Basic tokenizer
|
50
|
+
# @return [Array<Rley::Lexical::Token>]
|
51
|
+
def tokenize(aText)
|
52
|
+
tokenizer = PB_Tokenizer.new(aText)
|
53
|
+
tokenizer.tokens
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should return a sequence of tokens' do
|
57
|
+
sequence = tokenize(sample_input)
|
58
|
+
checks = [
|
59
|
+
['int', 7, [1, 1]],
|
60
|
+
['PLUS', '+', [1, 3]],
|
61
|
+
['LPAREN', '(', [1, 5]],
|
62
|
+
['int', 8, [1, 6]],
|
63
|
+
['PLUS', '+', [1, 8]],
|
64
|
+
['int', 9, [1, 10]],
|
65
|
+
['RPAREN', ')', [1, 11]]
|
66
|
+
]
|
67
|
+
sequence.each_with_index do |token, i|
|
68
|
+
(tok_type, tok_value, tok_pos) = checks[i]
|
69
|
+
(line, col) = tok_pos
|
70
|
+
expect(token.terminal).to eq(tok_type)
|
71
|
+
expect(token.lexeme).to eq(tok_value.to_s)
|
72
|
+
expect(token.position.line).to eq(line)
|
73
|
+
expect(token.position.column).to eq(col)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end # describe
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rley
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.00
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitri Geshef
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: coveralls
|
@@ -224,6 +224,7 @@ files:
|
|
224
224
|
- lib/rley/syntax/symbol_seq.rb
|
225
225
|
- lib/rley/syntax/terminal.rb
|
226
226
|
- lib/rley/syntax/verbatim_symbol.rb
|
227
|
+
- lib/support/base_tokenizer.rb
|
227
228
|
- spec/rley/base/dotted_item_spec.rb
|
228
229
|
- spec/rley/base/grm_items_builder_spec.rb
|
229
230
|
- spec/rley/engine_spec.rb
|
@@ -291,6 +292,7 @@ files:
|
|
291
292
|
- spec/rley/syntax/terminal_spec.rb
|
292
293
|
- spec/rley/syntax/verbatim_symbol_spec.rb
|
293
294
|
- spec/spec_helper.rb
|
295
|
+
- spec/support/base_tokenizer_spec.rb
|
294
296
|
homepage: https://github.com/famished-tiger/Rley
|
295
297
|
licenses:
|
296
298
|
- MIT
|
@@ -376,3 +378,4 @@ test_files:
|
|
376
378
|
- spec/rley/syntax/symbol_seq_spec.rb
|
377
379
|
- spec/rley/syntax/terminal_spec.rb
|
378
380
|
- spec/rley/syntax/verbatim_symbol_spec.rb
|
381
|
+
- spec/support/base_tokenizer_spec.rb
|