RubyGems - dendroid - Versions diffs - 0.0.10 → 0.0.12 - Mend

dendroid 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/.rubocop.yml +1 -1
data/CHANGELOG.md +19 -0
data/lib/dendroid/recognizer/chart.rb +55 -0
data/lib/dendroid/recognizer/e_item.rb +47 -0
data/lib/dendroid/recognizer/item_set.rb +38 -0
data/lib/dendroid/recognizer/recognizer.rb +286 -0
data/lib/dendroid/syntax/grammar.rb +1 -1
data/spec/dendroid/grm_analysis/grm_analyzer_spec.rb +1 -72
data/spec/dendroid/recognizer/chart_spec.rb +1 -0
data/spec/dendroid/recognizer/e_item_spec.rb +59 -0
data/spec/dendroid/recognizer/item_set_spec.rb +63 -0
data/spec/dendroid/recognizer/recognizer_spec.rb +761 -0
data/spec/dendroid/support/sample_grammars.rb +319 -0
data/spec/dendroid/syntax/grammar_spec.rb +145 -0
data/version.txt +1 -1
metadata +11 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 270fc74811d70652e19c4ed42cd11138a1fe9fc413e9b1856b982edfa28c5d51
-  data.tar.gz: 280351b252bd5c4a63f3082375053ea7d3bf9a9d0d32acc055dc33cce91ed628
+  metadata.gz: 56842965215f0cef73b768223b5acb907fc1642b57528a9e616852ae6adab2cc
+  data.tar.gz: d53478ebcb86c89a407d648c67bfd34dd1f3333f41f7b6e0eac1dcb3e2a25cb6
 SHA512:
-  metadata.gz: 7a34047f56f1f488377afd88c4049b935d03d8a0a902cd44f8ffba3d58578c212c5ef7f0b1229192a7f4606b1d683d70ca479273d45d716d98154a38663f233f
-  data.tar.gz: 36578ffb40a0463a2e411000b24fa8005166c1ede8f6a856293c0122e44fdbb46d3758159042db0c9c4ccacf9c1bf071e49cfb86a64792b98fac8bb89447a85a
+  metadata.gz: 7ef9e4766ad0c786471d08ba6cffcdffaec2d9acf734e25dafa796e714e1103ee838421b3b817c0c732b91c1a238c5fd32c9c3a6f2954926880336b70caab8b9
+  data.tar.gz: 7e389e83762cedfbdbdf23acbcf821f478a237d8e2b6da6c4299db17d0ade7e760f77328e8f9f59cb10c251f6b1711257793c2b94c12e7ddd8bde2bcad5add66

data/.rubocop.yml CHANGED Viewed

@@ -12,7 +12,7 @@ Metrics/BlockLength:
 Metrics/ClassLength:
   Enabled: true
-  Max: 200
+  Max: 300
 Metrics/CyclomaticComplexity:
   Enabled: true

data/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,25 @@
 ## [Unreleased]
+## [0.0.12] - 2023-11-02
+Added more tests.
+### Added
+- Added more tests to spec file of `Grammar` class.
+- Added more tests to spec file of `Recognizer` class.
+## [0.0.11] - 2023-11-02
+Added Earley recognizer and its ancillary classes.
+### Added
+- Class `Chart` and its spec file
+- Class `EItem` and its spec file
+- Class `ItemSet` and its spec file
+- Class `Recognizer` and its spec file
+### Changed
+- RSpec tests: moved module `SampleGrammars` to separate file in folder `support`
 ## [0.0.10] - 2023-11-01
 Added missing class and method documentation, fixed some `Rubocop` offenses.

data/lib/dendroid/recognizer/chart.rb ADDED Viewed

@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+require_relative 'item_set'
+module Dendroid
+  module Recognizer
+    # Also called a parse table. It records the progress of the
+    # Earley recognizer when it verifies the compliance of the input text
+    # with the language grammar rules.
+    # It essentially consists of an array of item sets.
+    # If n is the number of input tokens then the chart has n + 1 entry sets.
+    class Chart
+      extend Forwardable
+      # @return [Array<Recognizer::ItemSet>] The array of item sets
+      attr_reader :item_sets
+      # @return [Boolean] Indicates whether the recognizer successfully processed the whole input
+      attr_writer :success
+      # @return [StandardError] The exception class in case of an error found by the recognizer
+      attr_accessor :failure_class
+      # @return [String] The error message
+      attr_accessor :failure_reason
+      def_delegators :@item_sets, :[], :last, :size
+      # Constructor
+      # Initialize the chart with one empty item set.
+      def initialize
+        @item_sets = []
+        @success = false
+        append_new_set
+      end
+      # Add a new empty item set at the end of the array of item sets
+      def append_new_set
+        item_sets << ItemSet.new
+      end
+      # Add an EItem to the last item set
+      # @param e_item [EItem]
+      def seed_last_set(e_item)
+        item_sets.last.add_item(e_item)
+      end
+      # Return true if the input text is valid according to the grammar.
+      # @return [Boolean]
+      def successful?
+        @success
+      end
+    end # class
+  end # module
+end # module

data/lib/dendroid/recognizer/e_item.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+require 'forwardable'
+module Dendroid
+  module Recognizer
+    # An Earley item is essentially a pair consisting of a dotted item and the rank of a token.
+    # It helps to keep track of the progress of an Earley recognizer.
+    class EItem
+      extend Forwardable
+      # @return [Dendroid::GrmAnalysis::DottedItem]
+      attr_reader :dotted_item
+      # @return [Integer] the rank of the token that corresponds to the start of the rule.
+      attr_reader :origin
+      def_delegators :@dotted_item, :completed?, :expecting?, :next_symbol, :pre_scan?
+      # @param aDottedItem [Dendroid::GrmAnalysis::DottedItem]
+      # @param origin [Integer]
+      def initialize(aDottedItem, origin)
+        @dotted_item = aDottedItem
+        @origin = origin
+      end
+      # @return [Dendroid::Syntax::NonTerminal] the head of the production rule
+      def lhs
+        dotted_item.rule.lhs
+      end
+      # Equality test.
+      # @return [Boolean] true iff dotted items and origins are equal
+      def ==(other)
+        return true if eql?(other)
+        di = dotted_item
+        (origin == other.origin) && (di == other.dotted_item)
+      end
+      # @return [String] the text representation of the Earley item
+      def to_s
+        "#{dotted_item} @ #{origin}"
+      end
+    end # class
+  end # module
+end # module

data/lib/dendroid/recognizer/item_set.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+module Dendroid
+  module Recognizer
+    # Holds the EItems identified by the recognizer when processing the token at a given rank.
+    class ItemSet
+      extend Forwardable
+      # @return [Array<Recognizer::EItem>]
+      attr_reader :items
+      def_delegators :@items, :clear, :each, :empty?, :select, :size
+      def initialize
+        @items = []
+      end
+      # Add an Earley item to the set
+      # @param anItem [Recognizer::EItem]
+      def add_item(anItem)
+        @items << anItem unless items.include? anItem
+      end
+      # Find the items that expect a given grammar symbol
+      # @param aSymbol [Dendroid::Syntax::GrmSymbol]
+      # @return [Array<Recognizer::EItem>] the items that expect the given symbol
+      def items_expecting(aSymbol)
+        items.select { |itm| itm.expecting?(aSymbol) }
+      end
+      # Return a text representation of the item set
+      # @return [String]
+      def to_s
+        items.join("\n")
+      end
+    end # class
+  end # module
+end # module

data/lib/dendroid/recognizer/recognizer.rb ADDED Viewed

@@ -0,0 +1,286 @@
+# frozen_string_literal: true
+require_relative '../grm_analysis/grm_analyzer'
+require_relative 'e_item'
+require_relative 'chart'
+module Dendroid
+  # This module hosts the classes needed to implement an Earley recognizer
+  module Recognizer
+    # A recognizer determines whether the input text complies with the grammar (syntax) rules.
+    # This class implements the Earley recognition algorithm.
+    class Recognizer
+      # @return [GrmAnalysis::GrmAnalyzer]
+      attr_reader :grm_analysis
+      # @return [Object]
+      attr_reader :tokenizer
+      # @param grammar [Dendroid::Syntax::Grammar]
+      # @param tokenizer [Object]
+      def initialize(grammar, tokenizer)
+        @grm_analysis = GrmAnalysis::GrmAnalyzer.new(grammar)
+        @tokenizer = tokenizer
+      end
+      # Try to read the `source` text and verify that it is syntactically correct.
+      # @param source [String] Input text to recognize
+      # @return [Dendroid::Recognizer::Chart]
+      def run(source)
+        tokenizer.input = source
+        tok = tokenizer.next_token
+        if tok.nil? && !grm_analysis.grammar.start_symbol.nullable?
+          chart = new_chart
+          chart.failure_class = StandardError
+          chart.failure_reason = 'Error: Input may not be empty nor blank.'
+          chart
+        else
+          earley_parse(tok)
+        end
+      end
+      # Run the Earley algorithm
+      # @param initial_token [Dendroid::Lexical::Token]
+      def earley_parse(initial_token)
+        chart = new_chart
+        tokens = [initial_token]
+        predicted_symbols = [Set.new]
+        eos_reached = initial_token.nil?
+        rank = 0
+        loop do
+          eos_reached ||= advance_next_token(tokens, predicted_symbols)
+          advance = false
+          curr_rank = rank
+          curr_set = chart[curr_rank]
+          curr_set.each do |entry|
+            # For each entry, do either completer, scanner or predictor action
+            tick = do_entry_action(chart, entry, curr_rank, tokens, :genuine, predicted_symbols)
+            advance ||= tick
+          end
+          rank += 1 if advance
+          break if eos_reached && !advance
+          break if !advance
+        end
+        determine_outcome(chart, tokens)
+        chart
+      end
+      private
+      def new_chart
+        top_symbol = grm_analysis.grammar.start_symbol
+        # Reminder: there might be multiple rules for the start symbol
+        prods = grm_analysis.grammar.nonterm2productions[top_symbol]
+        chart = Chart.new
+        prods.each do |prd|
+          seed_items = prd.predicted_items
+          seed_items.each { |item| chart.seed_last_set(EItem.new(item, 0)) }
+        end
+        chart
+      end
+      def advance_next_token(tokens, predicted_symbols)
+        eos_reached = false
+        tok = tokenizer.next_token
+        if tok
+          tokens << tok
+        else
+          eos_reached = true
+        end
+        predicted_symbols << Set.new unless eos_reached
+        eos_reached
+      end
+      def do_entry_action(chart, entry, rank, tokens, mode, predicted_symbols)
+        advance = false
+        if entry.completed?
+          completer(chart, entry, rank, tokens, mode)
+        else
+          if entry.next_symbol.terminal?
+            advance = scanner(chart, entry, rank, tokens)
+          else
+            predictor(chart, entry, rank, tokens, mode, predicted_symbols)
+          end
+        end
+        advance
+      end
+      # procedure PREDICTOR((A → α•Bβ, j), k)
+      #     for each (B → γ) in GRAMMAR_RULES_FOR(B) do
+      #         ADD_TO_SET((B → •γ, k), S[k])
+      #     end
+      #   Assuming next symbol is a non-terminal
+      #
+      #   Error case: next actual token matches none of the expected tokens.
+      def predictor(chart, item, rank, tokens, mode, predicted_symbols)
+        next_symbol = item.next_symbol
+        if mode == :genuine
+          predicted_symbols << Set.new if rank == predicted_symbols.size
+          predicted = predicted_symbols[rank]
+          return if predicted.include?(next_symbol)
+          predicted.add(next_symbol)
+        end
+        prods = grm_analysis.symbol2productions[next_symbol]
+        curr_set = chart[rank]
+        next_token = tokens[rank]
+        prods.each do |prd|
+          entry_items = prd.predicted_items
+          entry_items.each do |entry|
+            member = entry.next_symbol
+            if member&.terminal?
+              next unless next_token
+              next if (member.name != next_token.terminal) && mode == :genuine
+            end
+            new_item = EItem.new(entry, rank)
+            curr_set.add_item(new_item)
+          end
+        end
+        # Use the trick from the paper by John Aycock and R. Nigel Horspool: "Practical Earley Parsing"
+        if next_symbol.nullable?
+          next_item = grm_analysis.next_item(item.dotted_item)
+          if next_item
+            new_item = EItem.new(next_item, item.origin)
+            curr_set.add_item(new_item)
+          end
+        end
+      end
+      # procedure SCANNER((A → α•aβ, j), k, words)
+      #     if j < LENGTH(words) and a ⊂ PARTS_OF_SPEECH(words[k]) then
+      #         ADD_TO_SET((A → αa•β, j), S[k+1])
+      #     end
+      # Assuming next symbol is a terminal
+      def scanner(chart, scan_item, rank, tokens)
+        advance = false
+        dit = scan_item.dotted_item
+        if rank < tokens.size && dit.next_symbol.name == tokens[rank].terminal
+          new_rank = rank + 1
+          chart.append_new_set if chart[new_rank].nil?
+          next_dotted_item = grm_analysis.next_item(dit)
+          new_item = EItem.new(next_dotted_item, scan_item.origin)
+          chart[new_rank].add_item(new_item)
+          advance = true
+        end
+        advance
+      end
+      # procedure COMPLETER((B → γ•, x), k)
+      #     for each (A → α•Bβ, j) in S[x] do
+      #         ADD_TO_SET((A → αB•β, j), S[k])
+      #     end
+      def completer(chart, item, rank, tokens, mode)
+        origin = item.origin
+        curr_set = chart[rank]
+        set_at_origin = chart[origin]
+        next_token = tokens[rank]
+        callers = set_at_origin.items_expecting(item.lhs)
+        callers.each do |call_item|
+          return_item = grm_analysis.next_item(call_item.dotted_item)
+          next unless return_item
+          member = return_item.next_symbol
+          if member&.terminal? && (mode == :genuine)
+            next unless next_token
+            next if member.name != next_token.terminal
+          end
+          new_item = EItem.new(return_item, call_item.origin)
+          curr_set.add_item(new_item)
+        end
+      end
+      def seed_set(chart, rank)
+        curr_set = chart[rank]
+        previous_set = chart[rank - 1]
+        curr_set.clear
+        scan_entries = previous_set.select { |ent| ent.dotted_item.next_symbol&.terminal? }
+        scan_entries.map do |ent|
+          new_item = grm_analysis.next_item(ent.dotted_item)
+          curr_set.add_item(EItem.new(new_item, ent.origin))
+        end
+      end
+      def determine_outcome(chart, tokens)
+        success = false
+        if chart.size == tokens.size + 1
+          top_symbol = grm_analysis.grammar.start_symbol
+          top_rules = grm_analysis.grammar.nonterm2productions[top_symbol]
+          final_items = top_rules.reduce([]) do |items, rule|
+            items.concat(rule.reduce_items)
+          end
+          last_set = chart.item_sets.last
+          last_set.each do |entry|
+            next if ((!entry.origin.zero?) || !final_items.include?(entry.dotted_item))
+            success = true
+          end
+        end
+        unless success
+          # Error detected...
+          replay_last_set(chart, tokens)
+          if chart.size < tokens.size + 1
+            # Recognizer stopped prematurely...
+            offending_token = tokens[chart.size - 1]
+            pos = offending_token.position
+            (line, col) = [pos.lineno, pos.column]
+            last_set = chart.last
+            terminals = last_set.items.reduce([]) do |result, ent|
+              result << ent.next_symbol if ent.pre_scan?
+              result
+            end
+            terminals.uniq!
+            prefix = "Syntax error at or near token line #{line}, column #{col} >>>#{offending_token.source}<<<"
+            expectation = terminals.size == 1 ? terminals[0].name.to_s : "one of: [#{terminals.map(&:name).join(', ')}]"
+            err_msg = "#{prefix} Expected #{expectation}, found a #{offending_token.terminal} instead."
+            chart.failure_class = StandardError
+            chart.failure_reason = err_msg
+          elsif chart.size == tokens.size + 1
+            # EOS unexpected...
+            last_token = tokens.last
+            pos = last_token.position
+            (line, col) = [pos.lineno, pos.column]
+            last_set = chart.last
+            terminals = last_set.items.reduce([]) do |result, ent|
+              result << ent.next_symbol if ent.pre_scan?
+              result
+            end
+            terminals.uniq!
+            prefix = "Line #{line}, column #{col}: Premature end of input after '#{last_token.source}'"
+            expectation = terminals.size == 1 ? terminals[0].name.to_s : "one of: [#{terminals.map(&:name).join(', ')}]"
+            err_msg = "#{prefix}, expected: #{expectation}."
+            chart.failure_class = StandardError
+            chart.failure_reason = err_msg
+          end
+        end
+        chart.success = success
+      end
+      def replay_last_set(chart, tokens)
+        rank = chart.size - 1
+        seed_set(chart, rank) # Re-initialize last set with scan entries
+        # Replay in full the actions for last set
+        chart[rank].each do |entry|
+          do_entry_action(chart, entry, rank, tokens, :error, [Set.new])
+        end
+      end
+    end # class
+  end # module
+end # module

data/lib/dendroid/syntax/grammar.rb CHANGED Viewed

@@ -47,7 +47,7 @@ module Dendroid
         end
         # TODO: add test for duplicate productions
         if nonterm2productions[rule.head]&.include? rule
-          raise StandardError, "Production rule '#{production}' appears more than once in the grammar."
+          raise StandardError, "Production rule '#{rule}' appears more than once in the grammar."
         end
         add_symbol(rule.head)

data/spec/dendroid/grm_analysis/grm_analyzer_spec.rb CHANGED Viewed

@@ -1,80 +1,9 @@
 # frozen_string_literal: true
 require_relative '../../spec_helper'
-require_relative '../../../lib/dendroid/grm_dsl/base_grm_builder'
+require_relative '../support/sample_grammars'
 require_relative '../../../lib/dendroid/grm_analysis/grm_analyzer'
-module SampleGrammars
-  def grammar_l1
-    builder = Dendroid::GrmDSL::BaseGrmBuilder.new do
-      # Grammar inspired from Wikipedia entry on Earley parsing
-      declare_terminals('PLUS', 'STAR', 'INTEGER')
-      rule('p' => 's')
-      rule('s' => ['s PLUS m', 'm'])
-      rule('m' => ['m STAR t', 't'])
-      rule('t' => 'INTEGER')
-    end
-    builder.grammar
-  end
-  def tokenizer_l1
-    Utils::BaseTokenizer.new do
-      map_verbatim2terminal({ '+' => :PLUS, '*' => :STAR })
-      scan_verbatim(['+', '*'])
-      scan_value(/\d+/, :INTEGER, ->(txt) { txt.to_i })
-    end
-  end
-  def grammar_l2
-    builder = GrmDSL::BaseGrmBuilder.new do
-      # Grammar inspired from Loup Vaillant's example
-      # https://loup-vaillant.fr/tutorials/earley-parsing/recogniser
-      declare_terminals('PLUS', 'MINUS',  'STAR', 'SLASH')
-      declare_terminals('LPAREN', 'RPAREN', 'NUMBER')
-      rule('p' => 'sum')
-      rule('sum' => ['sum PLUS product', 'sum MINUS product', 'product'])
-      rule('product' => ['product STAR factor', 'product SLASH factor', 'factor'])
-      rule('factor' => ['LPAREN sum RPAREN', 'NUMBER'])
-    end
-    builder.grammar
-  end
-  def tokenizer_l2
-    Utils::BaseTokenizer.new do
-      map_verbatim2terminal({
-                              '+' => :PLUS,
-                              '-' => :MINUS,
-                              '*' => :STAR,
-                              '/' => :SLASH,
-                              '(' => :LPAREN,
-                              ')' => :RPAREN
-                            })
-      scan_verbatim(['+', '-', '*', '/', '(', ')'])
-      scan_value(/\d+/, :NUMBER, ->(txt) { txt.to_i })
-    end
-  end
-  def grammar_l3
-    builder = Dendroid::GrmDSL::BaseGrmBuilder.new do
-      # Grammar inspired from Andrew Appel's example
-      # Modern Compiler Implementation in Java
-      declare_terminals('a', 'c', 'd')
-      rule('Z' => ['d', 'X Y Z'])
-      rule('Y' => ['', 'c'])
-      rule('X' => %w[Y a])
-    end
-    builder.grammar
-  end
-end # module
 describe Dendroid::GrmAnalysis::GrmAnalyzer do
   include SampleGrammars
   let(:grammar) { grammar_l1 }

data/spec/dendroid/recognizer/chart_spec.rb ADDED Viewed

@@ -0,0 +1 @@
+# frozen_string_literal: true

data/spec/dendroid/recognizer/e_item_spec.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+require_relative '../../spec_helper'
+require_relative '../../../lib/dendroid/syntax/terminal'
+require_relative '../../../lib/dendroid/syntax/non_terminal'
+require_relative '../../../lib/dendroid/syntax/symbol_seq'
+require_relative '../../../lib/dendroid/syntax/production'
+require_relative '../../../lib/dendroid/grm_analysis/dotted_item'
+require_relative '../../../lib/dendroid/recognizer/e_item'
+describe Dendroid::Recognizer::EItem do
+  let(:num_symb) { Dendroid::Syntax::Terminal.new('NUMBER') }
+  let(:plus_symb) { Dendroid::Syntax::Terminal.new('PLUS') }
+  let(:expr_symb) { Dendroid::Syntax::NonTerminal.new('expression') }
+  let(:rhs) { Dendroid::Syntax::SymbolSeq.new([num_symb, plus_symb, num_symb]) }
+  let(:empty_body) { Dendroid::Syntax::SymbolSeq.new([]) }
+  let(:prod) { Dendroid::Syntax::Production.new(expr_symb, rhs) }
+  let(:empty_prod) { Dendroid::Syntax::Production.new(expr_symb, empty_body) }
+  let(:sample_dotted) { Dendroid::GrmAnalysis::DottedItem.new(prod, 1) }
+  let(:other_dotted) { Dendroid::GrmAnalysis::DottedItem.new(empty_prod, 0) }
+  let(:sample_origin) { 3 }
+  subject { described_class.new(sample_dotted, sample_origin) }
+  context 'Initialization:' do
+    it 'is initialized with a dotted item and an origin position' do
+      expect { described_class.new(sample_dotted, sample_origin) }.not_to raise_error
+    end
+    it 'knows its related dotted item' do
+      expect(subject.dotted_item).to eq(sample_dotted)
+    end
+    it 'knows its origin value' do
+      expect(subject.origin).to eq(sample_origin)
+    end
+  end # context
+  context 'Provided service:' do
+    it 'knows the lhs of related production' do
+      expect(subject.lhs).to eq(expr_symb)
+    end # context
+    # rubocop: disable Lint/BinaryOperatorWithIdenticalOperands
+    it 'can compare with another EItem' do
+      expect(subject == subject).to be_truthy
+      expect(subject == described_class.new(sample_dotted, sample_origin)).to be_truthy
+      expect(subject == described_class.new(sample_dotted, 2)).to be_falsey
+      expect(subject == described_class.new(other_dotted, sample_origin)).to be_falsey
+    end
+    # rubocop: enable Lint/BinaryOperatorWithIdenticalOperands
+    it 'can renders a String representation of itself' do
+      expect(subject.to_s).to eq("#{sample_dotted} @ #{sample_origin}")
+    end
+  end # context
+end # describe