rley 0.6.09 → 0.7.00

Files changed (47)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/README.md +13 -2
  4. data/examples/NLP/benchmark_pico_en.rb +4 -1
  5. data/examples/NLP/engtagger.rb +4 -1
  6. data/examples/NLP/nano_eng/nano_en_demo.rb +15 -4
  7. data/examples/NLP/pico_en_demo.rb +2 -17
  8. data/examples/data_formats/JSON/json_ast_builder.rb +2 -2
  9. data/examples/data_formats/JSON/json_ast_nodes.rb +18 -2
  10. data/examples/data_formats/JSON/json_lexer.rb +10 -4
  11. data/examples/general/calc_iter1/calc_lexer.rb +5 -4
  12. data/examples/general/calc_iter2/calc_lexer.rb +2 -1
  13. data/examples/general/left.rb +4 -1
  14. data/examples/general/right.rb +4 -1
  15. data/lib/rley/constants.rb +1 -1
  16. data/lib/rley/lexical/token.rb +14 -2
  17. data/lib/rley/parser/error_reason.rb +1 -1
  18. data/lib/rley/parser/gfg_earley_parser.rb +4 -0
  19. data/lib/rley/syntax/terminal.rb +6 -2
  20. data/lib/support/base_tokenizer.rb +197 -0
  21. data/spec/rley/engine_spec.rb +2 -1
  22. data/spec/rley/formatter/asciitree_spec.rb +2 -1
  23. data/spec/rley/formatter/bracket_notation_spec.rb +2 -1
  24. data/spec/rley/formatter/debug_spec.rb +4 -2
  25. data/spec/rley/formatter/json_spec.rb +2 -1
  26. data/spec/rley/lexical/token_spec.rb +10 -5
  27. data/spec/rley/parse_rep/ambiguous_parse_spec.rb +1 -1
  28. data/spec/rley/parse_rep/ast_builder_spec.rb +1 -1
  29. data/spec/rley/parse_rep/cst_builder_spec.rb +2 -2
  30. data/spec/rley/parse_rep/groucho_spec.rb +2 -1
  31. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +1 -1
  32. data/spec/rley/parse_tree_visitor_spec.rb +2 -1
  33. data/spec/rley/parser/error_reason_spec.rb +6 -4
  34. data/spec/rley/parser/gfg_earley_parser_spec.rb +59 -57
  35. data/spec/rley/parser/gfg_parsing_spec.rb +1 -1
  36. data/spec/rley/parser/parse_tracer_spec.rb +3 -2
  37. data/spec/rley/sppf/token_node_spec.rb +9 -6
  38. data/spec/rley/support/ambiguous_grammar_helper.rb +2 -1
  39. data/spec/rley/support/expectation_helper.rb +1 -0
  40. data/spec/rley/support/grammar_ambig01_helper.rb +15 -6
  41. data/spec/rley/support/grammar_arr_int_helper.rb +16 -15
  42. data/spec/rley/support/grammar_b_expr_helper.rb +16 -7
  43. data/spec/rley/support/grammar_helper.rb +6 -2
  44. data/spec/rley/support/grammar_l0_helper.rb +12 -4
  45. data/spec/rley/support/grammar_pb_helper.rb +46 -21
  46. data/spec/support/base_tokenizer_spec.rb +77 -0
  47. metadata +5 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8a924baa9568e3076c5c8ebb0d3f1e9ff162ab09
-  data.tar.gz: f01496851e2679a598a34f6635caf27e90aeda19
+  metadata.gz: 2b462d4c492ffb698715478492a962d65e41834c
+  data.tar.gz: 282ab2ed83d7b1ead2646c8dd98176d1753a142e
 SHA512:
-  metadata.gz: 80b5ac648702c5ab11e2e84e6748b0be336f8ff39c61d6f05b49fda2a50ed7213cd82f4711590d8173434feaa83e4236a0b7ed56c74242eac53f2608d6ab1d4c
-  data.tar.gz: b8cf7f4a64c2526a784b0f6396e031040e57244a15ae760d5b050900fb26ad38c98e5fb66316fa29381384482fa2b40722502e11b7ede794fb1f65e31508a2f8
+  metadata.gz: 514e4a9429b4fd1231001269cd18e96fa70d4b9145c60115f042f338e0a0063871f979ba9c04e971ee589c8e2d3919fece6b3af4499af50f03899f44318a0598
+  data.tar.gz: 870e01cb9e693c126b9fa13915dadcec72559553147e0bf220f6e39a6431c6b59c7fb1b67aa1a19d8eadf0076fc6378948fbf1f56850c88d9f99584fefb7f259
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+### 0.7.00 / 2018-11-24
+- Version bump. Core class `Token` is changed.
+
+* [NEW] Structure `Lexical::Position` to hold the line and column position of a token.
+* [NEW] Class `BaseTokenizer`: provides basic tokenizer operations to be customized through subclassing.
+* [CHANGE] Class `Lexical::Token`: attribute `position` added.
+* [CHANGE] Method `Lexical::Token#initialize`: added a third argument for specifying the position of the token.
+* [CHANGE] Many classes and examples updated to conform to the `Token` class change.
+* [FIX] Missing methods in class `JSONPair` added.
+
 ### 0.6.09 / 2018-10-20
 * [FIXED] Method `GrmFlowGraph#traverse_df` now returns a meaningful message when the grammar uses a terminal symbol without declaring it first.
 
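In short, upgrading code from 0.6.x means threading a `Position` through every `Token.new` call. A minimal sketch of the new API (`'WORD'` is a placeholder terminal name; per the updated `Token#initialize` docs, the terminal may be passed as a `Syntax::Terminal` or as a name string):

```ruby
require 'rley'

# Lexical::Position is a plain Struct(:line, :column) with a friendly to_s.
pos = Rley::Lexical::Position.new(3, 14)
puts pos.to_s # => "line 3, column 14"

# Before 0.7.00:  Rley::Lexical::Token.new('hello', 'WORD')
# From 0.7.00 on, the position is a required third argument:
token = Rley::Lexical::Token.new('hello', 'WORD', pos)
puts token.position # => "line 3, column 14"
```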
data/README.md CHANGED
@@ -148,14 +148,25 @@ The subset of English grammar is based on an example from the NLTK book.
 
 ### Creating a tokenizer
 ```ruby
+require 'strscan'
+
 # A tokenizer reads the input string and converts it into a sequence of tokens.
 # Remark: Rley doesn't provide tokenizer functionality.
 # Highly simplified tokenizer implementation
 def tokenizer(aTextToParse)
-  tokens = aTextToParse.scan(/\S+/).map do |word|
+  scanner = StringScanner.new(aTextToParse)
+  tokens = []
+
+  loop do
+    scanner.skip(/\s+/)
+    curr_pos = scanner.pos
+    word = scanner.scan(/\S+/)
+    break unless word
+
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    Rley::Lexical::Token.new(word, term_name)
+    pos = Rley::Lexical::Position.new(1, curr_pos + 1)
+    tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end
 
   return tokens
data/examples/NLP/benchmark_pico_en.rb CHANGED
@@ -63,10 +63,13 @@ Lexicon = {
 # Rley doesn't provide tokenizer functionality.
 # (Highly simplified tokenizer implementation).
 def tokenizer(aTextToParse)
+  offset = -1
   tokens = aTextToParse.scan(/\S+/).map do |word|
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    Rley::Lexical::Token.new(word, term_name)
+    pos = Rley::Lexical::Position.new(1, offset + 1)
+    offset += word.length
+    Rley::Lexical::Token.new(word, term_name, pos)
   end
 
   return tokens
data/examples/NLP/engtagger.rb CHANGED
@@ -147,10 +147,13 @@ lexicon = clean_text(text)
 tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
 
 def tokenizer(lexicon, tokens)
+  pos = -1
   rley_tokens = []
   lexicon.each_with_index do |word, i|
     term_name = tokens[i].last
-    rley_tokens << Rley::Lexical::Token.new(word, term_name)
+    rank = Rley::Lexical::Position.new(1, pos + 1)
+    pos += word.length + 1 # Assuming one space between words.
+    rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
   end
   return rley_tokens
 end
data/examples/NLP/nano_eng/nano_en_demo.rb CHANGED
@@ -1,3 +1,4 @@
+require 'strscan'
 require 'rley' # Load Rley library
 
 ########################################
@@ -67,16 +68,26 @@ Lexicon = {
 # Step 4. Creating a tokenizer
 # A tokenizer reads the input string and converts it into a sequence of tokens
 # Highly simplified tokenizer implementation.
-def tokenizer(aTextToParse)
-  tokens = aTextToParse.scan(/\S+/).map do |word|
+def tokenizer(aTextToParse)
+  scanner = StringScanner.new(aTextToParse)
+  tokens = []
+
+  loop do
+    scanner.skip(/\s+/)
+    curr_pos = scanner.pos
+    word = scanner.scan(/\S+/)
+    break unless word
+
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    Rley::Lexical::Token.new(word, term_name)
+    pos = Rley::Lexical::Position.new(1, curr_pos + 1)
+    tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end
 
-  return tokens
+  return tokens
 end
 
+
 ########################################
 # Step 5. Parsing the input
 input_to_parse = 'John saw Mary'
data/examples/NLP/pico_en_demo.rb CHANGED
@@ -56,21 +56,6 @@ Lexicon = {
   'with' => 'Preposition'
 }.freeze
 
-Position = Struct.new(:line, :column) do
-  def to_s()
-    "line #{line}, column #{column}"
-  end
-end
-
-class NLPToken < Rley::Lexical::Token
-  attr_reader(:position)
-
-  def initialize(theLexeme, aTerminal, aPosition)
-    super(theLexeme, aTerminal)
-    @position = aPosition
-  end
-end
-
 ########################################
 # Step 4. Create a tokenizer
 # A tokenizer reads the input string and converts it into a sequence of tokens.
@@ -88,8 +73,8 @@ def tokenizer(aTextToParse)
 
     term_name = Lexicon[word]
     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
-    pos = Position.new(1, curr_pos + 1)
-    tokens << NLPToken.new(word, term_name, pos)
+    pos = Rley::Lexical::Position.new(1, curr_pos + 1)
+    tokens << Rley::Lexical::Token.new(word, term_name, pos)
   end
 
   return tokens
data/examples/data_formats/JSON/json_ast_builder.rb CHANGED
@@ -7,7 +7,7 @@ require_relative 'json_ast_nodes'
 # The Builder pattern creates a complex object
 # (say, a parse tree) from simpler objects (terminal and non-terminal
 # nodes) and using a step by step approach.
-class JSONASTBuilder < Rley::ParseRep::ParseTreeBuilder
+class JSONASTBuilder < Rley::ParseRep::ASTBaseBuilder
   Terminal2NodeClass = {
     'false' => JSONBooleanNode,
     'true' => JSONBooleanNode,
@@ -63,7 +63,7 @@ class JSONASTBuilder < Rley::ParseRep::ParseTreeBuilder
     return JSONPair.new(theChildren[0], theChildren[2], aProduction.lhs)
   end
 
-  # rule 'object' => %w[begin-object member-list end-object]
+  # rule 'array' => %w[begin-array array-items end-array]
   def reduce_array_0(aProduction, _range, _tokens, theChildren)
     second_child = theChildren[1]
     second_child.symbol = aProduction.lhs
data/examples/data_formats/JSON/json_ast_nodes.rb CHANGED
@@ -27,6 +27,9 @@ JSONTerminalNode = Struct.new(:token, :value, :position) do
   def accept(aVisitor)
     aVisitor.visit_terminal(self)
   end
+
+  def done!
+  end
 end
 
 
@@ -71,6 +74,9 @@ class JSONCompositeNode
   def accept(aVisitor)
     aVisitor.visit_nonterminal(self)
   end
+
+  def done!
+  end
 
   alias subnodes children
 end # class
@@ -96,7 +102,7 @@ end # class
 class JSONPair
   attr_reader(:name)
   attr_reader(:value)
-  attr_reader(:symbol)
+  attr_accessor(:symbol)
 
   def initialize(aName, aValue, aSymbol)
     @name = aName
@@ -115,6 +121,16 @@ class JSONPair
   def accept(aVisitor)
     aVisitor.visit_nonterminal(self)
   end
+
+  def done!
+  end
+
+  def to_ruby
+    rep = {}
+    rep[name.to_ruby] = value.to_ruby
+
+    return rep
+  end
 end # class
 
 class JSONObjectNode < JSONCompositeNode
@@ -123,7 +139,7 @@ class JSONObjectNode < JSONCompositeNode
   end
 
   # Convert this tree node in a simpler Ruby representation.
-  # Basically a JSON object corresponds to a Ruhy Hash
+  # Basically a JSON object corresponds to a Ruby Hash
   def to_ruby()
     rep = {}
     members.each do |pair|
data/examples/data_formats/JSON/json_lexer.rb CHANGED
@@ -23,6 +23,7 @@ class JSONLexer
   def initialize(source)
     @scanner = StringScanner.new(source)
    @lineno = 1
+    @line_start = 0
   end
 
   def tokens()
@@ -48,7 +49,7 @@ class JSONLexer
     case curr_ch
       when '{', '}', '[', ']', ',', ':'
         token_type = @@lexeme2name[curr_ch]
-        token = Rley::Lexical::Token.new(curr_ch, token_type)
+        token = build_token(curr_ch, token_type)
 
       when /[ftn]/ # First letter of keywords
         @scanner.pos = scanner.pos - 1 # Simulate putback
@@ -57,7 +58,7 @@ class JSONLexer
           invalid_keyw = scanner.scan(/\w+/)
           raise ScanError.new("Invalid keyword: #{invalid_keyw}")
         else
-          token = Rley::Lexical::Token.new(keyw, keyw)
+          token = build_token(keyw, keyw)
         end
 
       # LITERALS
@@ -66,12 +67,12 @@ class JSONLexer
         end_delimiter = scanner.getch
         err_msg = 'No closing quotes (") found'
         raise ScanError.new(err_msg) if end_delimiter.nil?
-        token = Rley::Lexical::Token.new(value, 'string')
+        token = build_token(value, 'string')
 
       when /[-0-9]/ # Start character of number literal found
         @scanner.pos = scanner.pos - 1 # Simulate putback
         value = scanner.scan(/-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9])?/)
-        token = Rley::Lexical::Token.new(value, 'number')
+        token = build_token(value, 'number')
 
       else # Unknown token
         erroneous = curr_ch.nil? ? '' : curr_ch
@@ -84,6 +85,11 @@ class JSONLexer
 
     return token
   end
+
+  def build_token(lexeme, token)
+    pos = Rley::Lexical::Position.new(lineno, scanner.pos - line_start)
+    Rley::Lexical::Token.new(lexeme, token, pos)
+  end
 
   def skip_whitespaces()
     matched = scanner.scan(/[ \t\f\n\r]+/)
data/examples/general/calc_iter1/calc_lexer.rb CHANGED
@@ -42,13 +42,13 @@ class CalcLexer
     skip_whitespaces
     curr_ch = scanner.peek(1)
     return nil if curr_ch.nil?
-
+
     token = nil
 
     if '()+/'.include? curr_ch
       # Single character token
       token = build_token(@@lexeme2name[curr_ch], scanner.getch)
-
+
     elsif (lexeme = scanner.scan(/\*\*/))
       token = build_token(@@lexeme2name[lexeme], lexeme)
     elsif (lexeme = scanner.scan(/\*/))
@@ -66,9 +66,10 @@ class CalcLexer
 
     return token
   end
-
+
   def build_token(aSymbolName, aLexeme)
-    return Rley::Lexical::Token.new(aLexeme, aSymbolName)
+    pos = Rley::Lexical::Position.new(1, scanner.pos)
+    return Rley::Lexical::Token.new(aLexeme, aSymbolName, pos)
   end
 
   def skip_whitespaces()
data/examples/general/calc_iter2/calc_lexer.rb CHANGED
@@ -75,7 +75,8 @@ class CalcLexer
   end
 
   def build_token(aSymbolName, aLexeme)
-    return Rley::Lexical::Token.new(aLexeme, aSymbolName)
+    pos = Rley::Lexical::Position.new(1, scanner.pos)
+    return Rley::Lexical::Token.new(aLexeme, aSymbolName, pos)
   end
 
   def skip_whitespaces()
data/examples/general/left.rb CHANGED
@@ -17,9 +17,12 @@ grammar = builder.grammar
 
 # Highly simplified tokenizer implementation.
 def tokenizer(aText, aGrammar)
+  index = 0
   tokens = aText.scan(/\./).map do |dot|
     terminal = aGrammar.name2symbol['DOT']
-    Rley::Lexical::Token.new(dot, terminal)
+    index += 1
+    pos = Rley::Lexical::Position.new(1, index)
+    Rley::Lexical::Token.new(dot, terminal, pos)
   end
 
   return tokens
data/examples/general/right.rb CHANGED
@@ -17,9 +17,12 @@ grammar = builder.grammar
 
 # Highly simplified tokenizer implementation.
 def tokenizer(aText, aGrammar)
+  index = 0
   tokens = aText.scan(/\./).map do |dot|
     terminal = aGrammar.name2symbol['DOT']
-    Rley::Lexical::Token.new(dot, terminal)
+    index += 1
+    pos = Rley::Lexical::Position.new(1, index)
+    Rley::Lexical::Token.new(dot, terminal, pos)
   end
 
   return tokens
data/lib/rley/constants.rb CHANGED
@@ -3,7 +3,7 @@
 
 module Rley # Module used as a namespace
   # The version number of the gem.
-  Version = '0.6.09'.freeze
+  Version = '0.7.00'.freeze
 
   # Brief description of the gem.
   Description = "Ruby implementation of the Earley's parsing algorithm".freeze
data/lib/rley/lexical/token.rb CHANGED
@@ -1,5 +1,13 @@
 module Rley # This module is used as a namespace
   module Lexical # This module is used as a namespace
+    # A Position is the location of a lexeme within a source file.
+    Position = Struct.new(:line, :column) do
+      def to_s
+        "line #{line}, column #{column}"
+      end
+    end
+
+
     # In Rley, a (lexical) token is an object created by a lexer (tokenizer)
     # and passed to the parser. Such an object is created when a lexer
     # detects that a sequence of characters (a lexeme) from the input stream
@@ -17,15 +25,19 @@ module Rley # This module is used as a namespace
 
       # @return [Syntax::Terminal] Terminal symbol corresponding to the lexeme.
       attr_reader(:terminal)
+
+      # @return [Position] The position of the lexeme in the source file.
+      attr_reader(:position)
 
       # Constructor.
       # @param theLexeme [String] the lexeme (= piece of text from input)
-      # @param aTerminal [Syntax::Terminal]
+      # @param aTerminal [Syntax::Terminal, String]
      #   The terminal symbol corresponding to the lexeme.
-      def initialize(theLexeme, aTerminal)
+      def initialize(theLexeme, aTerminal, aPosition)
        raise 'Internal error: nil terminal symbol detected' if aTerminal.nil?
        @lexeme = theLexeme
        @terminal = aTerminal
+        @position = aPosition
      end
    end # class
  end # module
data/lib/rley/parser/error_reason.rb CHANGED
@@ -84,7 +84,7 @@ module Rley # Module used as a namespace
       err_msg = "Syntax error at or near token #{position} "
       err_msg << ">>>#{last_token.lexeme}<<<\n"
       err_msg << expectations
-      err_msg << ", found a '#{last_token.terminal.name}' instead."
+      err_msg << ", found a '#{last_token.terminal}' instead."
 
       return err_msg
     end
data/lib/rley/parser/gfg_earley_parser.rb CHANGED
@@ -30,6 +30,10 @@ module Rley # This module is used as a namespace
 
       aTokenSequence.each_with_index do |token, i|
         parse_for_token(result, i)
+        if token.terminal.kind_of?(String)
+          symb = grammar.name2symbol[token.terminal]
+          token.instance_variable_set(:@terminal, symb)
+        end
         scan_success = scan_rule(result, i, token)
         break unless scan_success
       end
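This hook is what makes string terminal names workable: just before the scan rule fires, a token whose `terminal` is a `String` gets it swapped for the matching `Syntax::Terminal` from the grammar. A sketch of the two equivalent token styles, assuming a `grammar` object with a terminal named 'DOT' (as in the `left.rb` example above):

```ruby
pos = Rley::Lexical::Position.new(1, 1)

# Style 1: resolve the terminal symbol yourself...
token_a = Rley::Lexical::Token.new('.', grammar.name2symbol['DOT'], pos)

# Style 2: ...or pass the terminal name as a plain string; from 0.7.00 on,
# GFGEarleyParser#parse resolves it against the grammar before scanning.
token_b = Rley::Lexical::Token.new('.', 'DOT', pos)
```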
data/lib/rley/syntax/terminal.rb CHANGED
@@ -14,16 +14,20 @@ module Rley # This module is used as a namespace
       end
 
       # Return true iff the symbol is a terminal
-      def terminal?()
+      def terminal?
         return true
       end
 
       # @return [false] Return true if the symbol derives
       # the empty string. As a terminal symbol corresponds to an input token
       # it is by definition non-nullable.
-      def nullable?()
+      def nullable?
         false
       end
+
+      def to_s
+        name
+      end
     end # class
   end # module
 end # module
data/lib/support/base_tokenizer.rb ADDED
@@ -0,0 +1,197 @@
+require 'strscan'
+require_relative '../rley/lexical/token'
+
+class BaseTokenizer
+  attr_reader(:scanner)
+  attr_reader(:lineno)
+  attr_reader(:line_start)
+
+  class ScanError < StandardError; end
+
+  # Constructor. Initialize the tokenizer.
+  # @param source [String] Source text to tokenize.
+  def initialize(source)
+    @scanner = StringScanner.new('')
+    restart(source)
+  end
+
+  # @param source [String] Source text to tokenize.
+  def restart(source)
+    @scanner.string = source
+    @lineno = 1
+    @line_start = 0
+  end
+
+  # @return [Array<Rley::Lexical::Token>] Returns a sequence of tokens
+  def tokens
+    tok_sequence = []
+    until @scanner.eos?
+      token = _next_token
+      tok_sequence << token unless token.nil?
+    end
+
+    return tok_sequence
+  end
+
+  protected
+
+  # Patterns:
+  # Unambiguous single character
+  # Conditional single character (e.g. '+' operator, '+' prefix for positive numbers)
+  def _next_token
+    skip_whitespaces
+    curr_ch = scanner.peek(1)
+    return nil if curr_ch.nil? || curr_ch.empty?
+
+    token = recognize_token()
+    if token.nil? # Unknown token
+      curr_ch = scanner.peek(1)
+      erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
+      sequel = scanner.scan(/.{1,20}/)
+      erroneous += sequel unless sequel.nil?
+      raise ScanError, "Unknown token #{erroneous} on line #{lineno}"
+    end
+
+    return token
+  end
+
+  def recognize_token()
+=begin
+    if "()'`".include? curr_ch # Single characters
+      # Delimiters, separators => single character token
+      token = build_token(@@lexeme2name[curr_ch], scanner.getch)
+    elsif (lexeme = scanner.scan(/(?:\.)(?=\s)/)) # Single char occurring alone
+      token = build_token('PERIOD', lexeme)
+    elsif (lexeme = scanner.scan(/,@?/))
+      token = build_token(@@lexeme2name[lexeme], lexeme)
+    elsif (lexeme = scanner.scan(/#(?:(?:true)|(?:false)|(?:u8)|[\\\(tfeiodx]|(?:\d+[=#]))/))
+      token = cardinal_token(lexeme)
+    elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?=\s|[|()";]|$)/))
+      token = build_token('INTEGER', lexeme) # Decimal radix
+    elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?:\.[0-9]+)?(?:(?:e|E)[+-]?[0-9]+)?/))
+      # Order dependency: must be tested after INTEGER case
+      token = build_token('REAL', lexeme)
+    elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
+      token = build_token('STRING_LIT', lexeme)
+    elsif (lexeme = scanner.scan(/[a-zA-Z!$%&*\/:<=>?@^_~][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
+      keyw = @@keywords[lexeme.upcase]
+      tok_type = keyw ? keyw : 'IDENTIFIER'
+      token = build_token(tok_type, lexeme)
+    elsif (lexeme = scanner.scan(/\|(?:[^|])*\|/)) # Vertical bar delimited
+      token = build_token('IDENTIFIER', lexeme)
+    elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
+      # R7RS peculiar identifiers case 1: isolated plus and minus as identifiers
+      token = build_token('IDENTIFIER', lexeme)
+    elsif (lexeme = scanner.scan(/[+-][a-zA-Z!$%&*\/:<=>?@^_~+-@][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
+      # R7RS peculiar identifiers case 2
+      token = build_token('IDENTIFIER', lexeme)
+    elsif (lexeme = scanner.scan(/\.[a-zA-Z!$%&*\/:<=>?@^_~+-@.][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
+      # R7RS peculiar identifiers case 4
+      token = build_token('IDENTIFIER', lexeme)
+=end
+  end
+
+  def build_token(aSymbolName, aLexeme, aFormat = :default)
+    begin
+      value = convert_to(aLexeme, aSymbolName, aFormat)
+      col = scanner.pos - aLexeme.size - @line_start + 1
+      pos = Rley::Lexical::Position.new(@lineno, col)
+      token = Rley::Lexical::Token.new(value, aSymbolName, pos)
+    rescue StandardError => exc
+      puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
+      raise exc
+    end
+
+    return token
+  end
+
+  def convert_to(aLexeme, aSymbolName, aFormat)
+    return aLexeme
+  end
+
+  def skip_whitespaces
+    pre_pos = scanner.pos
+
+    loop do
+      ws_found = false
+      cmt_found = false
+      found = scanner.skip(/[ \t\f]+/)
+      ws_found = true if found
+      found = scanner.skip(/(?:\r\n)|\r|\n/)
+      if found
+        ws_found = true
+        next_line
+      end
+      # next_ch = scanner.peek(1)
+      # if next_ch == ';'
+      #   cmt_found = true
+      #   scanner.skip(/;[^\r\n]*(?:(?:\r\n)|\r|\n)?/)
+      #   next_line
+      # end
+      break unless ws_found or cmt_found
+    end
+
+    curr_pos = scanner.pos
+    return if curr_pos == pre_pos
+  end
+
+  def next_line
+    @lineno += 1
+    @line_start = scanner.pos
+  end
+end # class
+=begin
+require 'base_tokenizer'
+
+class PB_Tokenizer < BaseTokenizer
+  @@lexeme2name = {
+    '(' => 'LPAREN',
+    ')' => 'RPAREN',
+    '+' => 'PLUS',
+  }.freeze
+
+  protected
+
+  def recognize_token()
+    token = nil
+    curr_ch = scanner.peek(1)
+
+    if '()'.include? curr_ch # Single characters
+      # Delimiters, separators => single character token
+      token = build_token(@@lexeme2name[curr_ch], scanner.getch)
+    elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # Single char occurring alone
+      token = build_token(@@lexeme2name[lexeme], lexeme)
+    elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
+      token = build_token('INTEGER', lexeme)
+    end
+  end
+end # class
+
+# Basic tokenizer
+# @return [Array<Rley::Lexical::Token>]
+def tokenize(aText)
+  tokenizer = PB_Tokenizer.new(aText)
+  tokenizer.tokens
+end
+
+=end
+=begin
+# Basic expression tokenizer
+def tokenize(aText)
+  tokens = aText.scan(/\S+/).map do |lexeme|
+    case lexeme
+    when '+', '(', ')'
+      terminal = @grammar.name2symbol[lexeme]
+    when /^[-+]?\d+$/
+      terminal = @grammar.name2symbol['int']
+    else
+      msg = "Unknown input text '#{lexeme}'"
+      raise StandardError, msg
+    end
+    pos = Rley::Lexical::Position.new(1, 4) # Dummy position
+    Rley::Lexical::Token.new(lexeme, terminal, pos)
+  end
+
+  return tokens
+end
+=end
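The commented-out `PB_Tokenizer` above shows the intended pattern: subclass `BaseTokenizer` and override the protected `recognize_token` hook, leaving the line/column bookkeeping to `build_token`. A runnable sketch along the same lines; the terminal names and the require path are illustrative assumptions, not part of the gem:

```ruby
require 'rley'
require 'support/base_tokenizer' # assumes the gem's lib dir is on $LOAD_PATH

# Tokenizer for sums of integers, e.g. "2 + 3".
class SumTokenizer < BaseTokenizer
  protected

  # Overridden hook: return a Token, or nil to signal an unknown token.
  def recognize_token
    if (lexeme = scanner.scan(/\+/))
      build_token('PLUS', lexeme)
    elsif (lexeme = scanner.scan(/[0-9]+/))
      build_token('INTEGER', lexeme)
    end
  end
end

tokens = SumTokenizer.new('2 + 3').tokens
tokens.map { |t| [t.lexeme, t.terminal, t.position.to_s] }
# => [['2', 'INTEGER', 'line 1, column 1'],
#     ['+', 'PLUS', 'line 1, column 3'],
#     ['3', 'INTEGER', 'line 1, column 5']]
```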