rley 0.6.09 → 0.7.00

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +13 -2
  4. data/examples/NLP/benchmark_pico_en.rb +4 -1
  5. data/examples/NLP/engtagger.rb +4 -1
  6. data/examples/NLP/nano_eng/nano_en_demo.rb +15 -4
  7. data/examples/NLP/pico_en_demo.rb +2 -17
  8. data/examples/data_formats/JSON/json_ast_builder.rb +2 -2
  9. data/examples/data_formats/JSON/json_ast_nodes.rb +18 -2
  10. data/examples/data_formats/JSON/json_lexer.rb +10 -4
  11. data/examples/general/calc_iter1/calc_lexer.rb +5 -4
  12. data/examples/general/calc_iter2/calc_lexer.rb +2 -1
  13. data/examples/general/left.rb +4 -1
  14. data/examples/general/right.rb +4 -1
  15. data/lib/rley/constants.rb +1 -1
  16. data/lib/rley/lexical/token.rb +14 -2
  17. data/lib/rley/parser/error_reason.rb +1 -1
  18. data/lib/rley/parser/gfg_earley_parser.rb +4 -0
  19. data/lib/rley/syntax/terminal.rb +6 -2
  20. data/lib/support/base_tokenizer.rb +197 -0
  21. data/spec/rley/engine_spec.rb +2 -1
  22. data/spec/rley/formatter/asciitree_spec.rb +2 -1
  23. data/spec/rley/formatter/bracket_notation_spec.rb +2 -1
  24. data/spec/rley/formatter/debug_spec.rb +4 -2
  25. data/spec/rley/formatter/json_spec.rb +2 -1
  26. data/spec/rley/lexical/token_spec.rb +10 -5
  27. data/spec/rley/parse_rep/ambiguous_parse_spec.rb +1 -1
  28. data/spec/rley/parse_rep/ast_builder_spec.rb +1 -1
  29. data/spec/rley/parse_rep/cst_builder_spec.rb +2 -2
  30. data/spec/rley/parse_rep/groucho_spec.rb +2 -1
  31. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +1 -1
  32. data/spec/rley/parse_tree_visitor_spec.rb +2 -1
  33. data/spec/rley/parser/error_reason_spec.rb +6 -4
  34. data/spec/rley/parser/gfg_earley_parser_spec.rb +59 -57
  35. data/spec/rley/parser/gfg_parsing_spec.rb +1 -1
  36. data/spec/rley/parser/parse_tracer_spec.rb +3 -2
  37. data/spec/rley/sppf/token_node_spec.rb +9 -6
  38. data/spec/rley/support/ambiguous_grammar_helper.rb +2 -1
  39. data/spec/rley/support/expectation_helper.rb +1 -0
  40. data/spec/rley/support/grammar_ambig01_helper.rb +15 -6
  41. data/spec/rley/support/grammar_arr_int_helper.rb +16 -15
  42. data/spec/rley/support/grammar_b_expr_helper.rb +16 -7
  43. data/spec/rley/support/grammar_helper.rb +6 -2
  44. data/spec/rley/support/grammar_l0_helper.rb +12 -4
  45. data/spec/rley/support/grammar_pb_helper.rb +46 -21
  46. data/spec/support/base_tokenizer_spec.rb +77 -0
  47. metadata +5 -2
@@ -29,7 +29,8 @@ module AmbiguousGrammarHelper
29
29
  msg = "Unknown input text '#{lexeme}'"
30
30
  raise StandardError, msg
31
31
  end
32
- Rley::Lexical::Token.new(lexeme, terminal)
32
+ pos = Rley::Lexical::Position.new(3, 4) # dummy pos
33
+ Rley::Lexical::Token.new(lexeme, terminal, pos)
33
34
  end
34
35
 
35
36
  return tokens
@@ -7,6 +7,7 @@ module ExpectationHelper
7
7
  # Helper method. Compare the data from all the parse entries
8
8
  # of a given ParseEntrySet with an array of expectation strings.
9
9
  def compare_entry_texts(anEntrySet, expectations)
10
+ raise StandardError, "Nil entry set" if anEntrySet.nil?
10
11
  (0...expectations.size).each do |i|
11
12
  expect(anEntrySet.entries[i].to_s).to eq(expectations[i])
12
13
  end
@@ -21,19 +21,28 @@ module GrammarAmbig01Helper
21
21
  end
22
22
 
23
23
  # Highly simplified tokenizer implementation.
24
- def tokenizer_ambig01(aText, aGrammar)
25
- tokens = aText.scan(/\S+/).map do |lexeme|
24
+ def tokenizer_ambig01(aText)
25
+ scanner = StringScanner.new(aText)
26
+ tokens = []
27
+
28
+ loop do
29
+ scanner.skip(/\s+/)
30
+ curr_pos = scanner.pos
31
+ lexeme = scanner.scan(/\S+/)
32
+ break unless lexeme
26
33
  case lexeme
27
34
  when '+', '*'
28
- terminal = aGrammar.name2symbol[lexeme]
35
+ terminal = lexeme
29
36
  when /^[-+]?\d+$/
30
- terminal = aGrammar.name2symbol['integer']
37
+ terminal = 'integer'
31
38
  else
32
39
  msg = "Unknown input text '#{lexeme}'"
33
40
  raise StandardError, msg
34
41
  end
35
- Rley::Lexical::Token.new(lexeme, terminal)
36
- end
42
+
43
+ pos = Rley::Lexical::Position.new(1, curr_pos + 1)
44
+ tokens << Rley::Lexical::Token.new(lexeme, terminal, pos)
45
+ end
37
46
 
38
47
  return tokens
39
48
  end
@@ -23,29 +23,30 @@ module GrammarArrIntHelper
23
23
  end
24
24
 
25
25
  # Basic tokenizer for array of integers
26
- def arr_int_tokenizer(aText, aGrammar)
27
- tokens = []
26
+ def arr_int_tokenizer(aText)
28
27
  scanner = StringScanner.new(aText)
29
-
30
- until scanner.eos?
28
+ tokens = []
29
+
30
+ loop do
31
31
  scanner.skip(/\s+/)
32
- lexeme = scanner.scan(/[\[,\]]/)
33
- if lexeme
34
- terminal = aGrammar.name2symbol[lexeme]
35
- tokens << Rley::Lexical::Token.new(lexeme, terminal)
36
- next
37
- end
38
- lexeme = scanner.scan(/^[-+]?\d+/)
39
- if lexeme
40
- terminal = aGrammar.name2symbol['integer']
41
- tokens << Rley::Lexical::Token.new(lexeme, terminal)
32
+ curr_ch = scanner.peek(1)
33
+ break if curr_ch.nil? || curr_ch.empty?
34
+ curr_pos = scanner.pos
35
+
36
+ if (lexeme = scanner.scan(/[\[\],]/))
37
+ terminal = lexeme
38
+ elsif (lexeme = scanner.scan(/[-+]?\d+/))
39
+ terminal = 'integer'
42
40
  else
43
41
  msg = "Unknown input text '#{lexeme}'"
44
42
  raise StandardError, msg
45
43
  end
44
+
45
+ pos = Rley::Lexical::Position.new(1, curr_pos + 1)
46
+ tokens << Rley::Lexical::Token.new(lexeme, terminal, pos)
46
47
  end
47
48
 
48
- return tokens
49
+ return tokens
49
50
  end
50
51
  end # module
51
52
  # End of file
@@ -21,21 +21,30 @@ module GrammarBExprHelper
21
21
  end
22
22
 
23
23
  # Basic expression tokenizer
24
- def expr_tokenizer(aText, aGrammar)
25
- tokens = aText.scan(/\S+/).map do |lexeme|
24
+ def expr_tokenizer(aText)
25
+ scanner = StringScanner.new(aText)
26
+ tokens = []
27
+
28
+ loop do
29
+ scanner.skip(/\s+/)
30
+ curr_pos = scanner.pos
31
+ lexeme = scanner.scan(/\S+/)
32
+ break unless lexeme
26
33
  case lexeme
27
34
  when '+', '*'
28
- terminal = aGrammar.name2symbol[lexeme]
35
+ terminal = lexeme
29
36
  when /^[-+]?\d+$/
30
- terminal = aGrammar.name2symbol['integer']
37
+ terminal = 'integer'
31
38
  else
32
39
  msg = "Unknown input text '#{lexeme}'"
33
40
  raise StandardError, msg
34
41
  end
35
- Rley::Lexical::Token.new(lexeme, terminal)
36
- end
37
42
 
38
- return tokens
43
+ pos = Rley::Lexical::Position.new(1, curr_pos + 1)
44
+ tokens << Rley::Lexical::Token.new(lexeme, terminal, pos)
45
+ end
46
+
47
+ return tokens
39
48
  end
40
49
  end # module
41
50
  # End of file
@@ -8,19 +8,23 @@ module GrammarHelper
8
8
  # Synopsis:
9
9
  # build_token_sequence(%w(a a b c c), grm1)
10
10
  def build_token_sequence(literals, aGrammar)
11
+ col = 1
11
12
  tokens = literals.map do |lexeme|
13
+ pos = Rley::Lexical::Position.new(1, col)
12
14
  case lexeme
13
15
  when String
14
16
  terminal = aGrammar.name2symbol[lexeme]
15
- Rley::Lexical::Token.new(lexeme, terminal)
17
+ token = Rley::Lexical::Token.new(lexeme, terminal, pos)
16
18
 
17
19
  when Hash # lexeme is in reality a Hash: literal => terminal name
18
20
  sub_array = lexeme.to_a
19
21
  sub_array.map do |(literal, name)|
20
22
  terminal = aGrammar.name2symbol[name]
21
- Rley::Lexical::Token.new(literal, terminal)
23
+ token = Rley::Lexical::Token.new(literal, terminal, pos)
22
24
  end
23
25
  end
26
+ col += lexeme.length + 1
27
+ token
24
28
  end
25
29
 
26
30
  return tokens.flatten
@@ -66,14 +66,22 @@ module GrammarL0Helper
66
66
  end
67
67
 
68
68
  # Highly simplified tokenizer implementation.
69
- def tokenizer_l0(aText, aGrammar)
70
- tokens = aText.scan(/\S+/).map do |word|
69
+ def tokenizer_l0(aText)
70
+ scanner = StringScanner.new(aText)
71
+ tokens = []
72
+
73
+ loop do
74
+ scanner.skip(/\s+/)
75
+ curr_pos = scanner.pos
76
+ word = scanner.scan(/\S+/)
77
+ break unless word
78
+
71
79
  term_name = lexicon_l0[word]
72
80
  if term_name.nil?
73
81
  raise StandardError, "Word '#{word}' not found in lexicon"
74
82
  end
75
- terminal = aGrammar.name2symbol[term_name]
76
- Rley::Lexical::Token.new(word, terminal)
83
+ pos = Rley::Lexical::Position.new(1, curr_pos + 1)
84
+ tokens << Rley::Lexical::Token.new(word, term_name, pos)
77
85
  end
78
86
 
79
87
  return tokens
@@ -1,5 +1,6 @@
1
1
  # Load the builder class
2
2
  require_relative '../../../lib/rley/syntax/grammar_builder'
3
+ require_relative '../../../lib/support/base_tokenizer'
3
4
  require_relative '../../../lib/rley/lexical/token'
4
5
 
5
6
 
@@ -12,36 +13,60 @@ class GrammarPBHelper
12
13
  def grammar()
13
14
  @grammar ||= begin
14
15
  builder = Rley::Syntax::GrammarBuilder.new do
15
- t_int = Rley::Syntax::Literal.new('int', /[-+]?\d+/)
16
- t_plus = Rley::Syntax::VerbatimSymbol.new('+')
17
- t_lparen = Rley::Syntax::VerbatimSymbol.new('(')
18
- t_rparen = Rley::Syntax::VerbatimSymbol.new(')')
19
- add_terminals(t_int, t_plus, t_lparen, t_rparen)
16
+ add_terminals('int', '+', '(', ')')
20
17
  rule 'S' => 'E'
21
18
  rule 'E' => 'int'
22
- rule 'E' => %w[( E + E )]
23
- rule 'E' => %w[E + E]
19
+ rule 'E' => '( E + E )'
20
+ rule 'E' => 'E + E'
24
21
  end
25
22
  builder.grammar
26
23
  end
27
24
  end
28
25
 
29
- # Basic expression tokenizer
30
- def tokenize(aText)
31
- tokens = aText.scan(/\S+/).map do |lexeme|
32
- case lexeme
33
- when '+', '(', ')'
34
- terminal = @grammar.name2symbol[lexeme]
35
- when /^[-+]?\d+$/
36
- terminal = @grammar.name2symbol['int']
37
- else
38
- msg = "Unknown input text '#{lexeme}'"
39
- raise StandardError, msg
26
+ # # Basic expression tokenizer
27
+ # def tokenize(aText)
28
+ # tokens = aText.scan(/\S+/).map do |lexeme|
29
+ # case lexeme
30
+ # when '+', '(', ')'
31
+ # terminal = @grammar.name2symbol[lexeme]
32
+ # when /^[-+]?\d+$/
33
+ # terminal = @grammar.name2symbol['int']
34
+ # else
35
+ # msg = "Unknown input text '#{lexeme}'"
36
+ # raise StandardError, msg
37
+ # end
38
+ # pos = Rley::Lexical::Position.new(1, 4) # Dummy position
39
+ # Rley::Lexical::Token.new(lexeme, terminal, pos)
40
+ # end
41
+
42
+ # return tokens
43
+ # end
44
+
45
+
46
+ class PB_Tokenizer < BaseTokenizer
47
+
48
+ protected
49
+
50
+ def recognize_token()
51
+ token = nil
52
+
53
+ if (lexeme = scanner.scan(/[\(\)]/)) # Single characters
54
+ # Delimiters, separators => single character token
55
+ token = build_token(lexeme, lexeme)
56
+ elsif (lexeme = scanner.scan(/(?:\+)(?=\s|$)/)) # Single char occurring alone
57
+ token = build_token(lexeme, lexeme)
58
+ elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
59
+ token = build_token('int', lexeme)
40
60
  end
41
- Rley::Lexical::Token.new(lexeme, terminal)
42
61
  end
62
+ end # class
43
63
 
44
- return tokens
64
+ # Basic tokenizer
65
+ # @return [Array<Rley::Lexical::Token>]
66
+ def tokenize(aText)
67
+ tokenizer = PB_Tokenizer.new(aText)
68
+ tokenizer.tokens
45
69
  end
46
- end # module
70
+
71
+ end # class
47
72
  # End of file
@@ -0,0 +1,77 @@
1
+ require_relative '../spec_helper'
2
+
3
+ # Load the class under test
4
+ require_relative '../../lib/support/base_tokenizer'
5
+
6
+ describe BaseTokenizer do
7
+ let(:sample_input) { '7 + (8 + 9)' }
8
+ context 'Standard creation & initialization:' do
9
+ subject { BaseTokenizer.new(sample_input) }
10
+
11
+ it 'should be initialized with a text argument' do
12
+ expect { BaseTokenizer.new(sample_input) }.not_to raise_error
13
+ end
14
+
15
+ it 'should have a scanner initialized' do
16
+ expect(subject.scanner).to be_kind_of(StringScanner)
17
+ end
18
+
19
+ it 'should have line number initialized' do
20
+ expect(subject.lineno).to eq(1)
21
+ end
22
+ end # context
23
+
24
+
25
+ context 'Provided services:' do
26
+ class PB_Tokenizer < BaseTokenizer
27
+ @@lexeme2name = {
28
+ '(' => 'LPAREN',
29
+ ')' => 'RPAREN',
30
+ '+' => 'PLUS',
31
+ }.freeze
32
+
33
+ protected
34
+
35
+ def recognize_token()
36
+ token = nil
37
+
38
+ if (lexeme = scanner.scan(/[\(\)]/)) # Single characters
39
+ # Delimiters, separators => single character token
40
+ token = build_token(@@lexeme2name[lexeme], lexeme)
41
+ elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # Single char occurring alone
42
+ token = build_token(@@lexeme2name[lexeme], lexeme)
43
+ elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
44
+ token = build_token('int', lexeme)
45
+ end
46
+ end
47
+ end # class
48
+
49
+ # Basic tokenizer
50
+ # @return [Array<Rley::Lexical::Token>]
51
+ def tokenize(aText)
52
+ tokenizer = PB_Tokenizer.new(aText)
53
+ tokenizer.tokens
54
+ end
55
+
56
+ it 'should return a sequence of tokens' do
57
+ sequence = tokenize(sample_input)
58
+ checks = [
59
+ ['int', 7, [1, 1]],
60
+ ['PLUS', '+', [1, 3]],
61
+ ['LPAREN', '(', [1, 5]],
62
+ ['int', 8, [1, 6]],
63
+ ['PLUS', '+', [1, 8]],
64
+ ['int', 9, [1, 10]],
65
+ ['RPAREN', ')', [1, 11]]
66
+ ]
67
+ sequence.each_with_index do |token, i|
68
+ (tok_type, tok_value, tok_pos) = checks[i]
69
+ (line, col) = tok_pos
70
+ expect(token.terminal).to eq(tok_type)
71
+ expect(token.lexeme).to eq(tok_value.to_s)
72
+ expect(token.position.line).to eq(line)
73
+ expect(token.position.column).to eq(col)
74
+ end
75
+ end
76
+ end
77
+ end # describe
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rley
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.09
4
+ version: 0.7.00
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitri Geshef
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-10-20 00:00:00.000000000 Z
11
+ date: 2018-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: coveralls
@@ -224,6 +224,7 @@ files:
224
224
  - lib/rley/syntax/symbol_seq.rb
225
225
  - lib/rley/syntax/terminal.rb
226
226
  - lib/rley/syntax/verbatim_symbol.rb
227
+ - lib/support/base_tokenizer.rb
227
228
  - spec/rley/base/dotted_item_spec.rb
228
229
  - spec/rley/base/grm_items_builder_spec.rb
229
230
  - spec/rley/engine_spec.rb
@@ -291,6 +292,7 @@ files:
291
292
  - spec/rley/syntax/terminal_spec.rb
292
293
  - spec/rley/syntax/verbatim_symbol_spec.rb
293
294
  - spec/spec_helper.rb
295
+ - spec/support/base_tokenizer_spec.rb
294
296
  homepage: https://github.com/famished-tiger/Rley
295
297
  licenses:
296
298
  - MIT
@@ -376,3 +378,4 @@ test_files:
376
378
  - spec/rley/syntax/symbol_seq_spec.rb
377
379
  - spec/rley/syntax/terminal_spec.rb
378
380
  - spec/rley/syntax/verbatim_symbol_spec.rb
381
+ - spec/support/base_tokenizer_spec.rb