RubyGems - rley - Versions diffs - 0.1.12 → 0.2.00 - Mend

rley 0.1.12 → 0.2.00

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +8 -8
data/CHANGELOG.md +9 -0
data/examples/parsers/parsing_L0.rb +125 -0
data/examples/parsers/parsing_b_expr.rb +84 -84
data/lib/rley/constants.rb +1 -1
data/lib/rley/formatter/json.rb +4 -0
data/lib/rley/parser/chart.rb +12 -0
data/lib/rley/parser/parse_state_tracker.rb +55 -0
data/lib/rley/parser/parse_tree_builder.rb +181 -0
data/lib/rley/parser/parsing.rb +92 -55
data/lib/rley/ptree/parse_tree.rb +3 -86
data/lib/rley/ptree/token_range.rb +2 -0
data/lib/rley/syntax/grammar.rb +6 -0
data/lib/rley/syntax/grammar_builder.rb +1 -1
data/spec/rley/formatter/json_spec.rb +2 -2
data/spec/rley/parser/chart_spec.rb +5 -1
data/spec/rley/parser/parse_tree_builder_spec.rb +179 -0
data/spec/rley/parser/parsing_spec.rb +83 -9
data/spec/rley/ptree/parse_tree_spec.rb +7 -60
data/spec/rley/ptree/token_range_spec.rb +1 -0
data/spec/rley/syntax/grammar_spec.rb +4 -0
metadata +7 -2

checksums.yaml CHANGED

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    NTc2NGE4MzYxOTc1ZDUyMDVkYjdmNGFhODllNmEwM2YxMjVkZDk1OQ==
+    ZTAxMjk3Yzc3YTFjZmUxY2Q2OGI1MDFhZjMxMTk5M2Q1YmMzZWMwNw==
   data.tar.gz: !binary |-
-    ZDBkMTdmZWM2NTMwYWMwNDFkNDQ4NGI2YzdkNjk3NDU0ZGExMGYzNA==
+    ODMxNDQ0ODZjYmI0YWRlYTE5NWQwZjE3Njg2ZGI2MzBjMjc5ZmVlMQ==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-    YWZjYmQxNDNiNjVmMDYyYWI1YzM0YzMyN2VjMzk4ZjUxOTIyMmQxNTE4Y2Y4
-    YjM4NTAyNGNhNjhiOWRhNWMyZWVmYzRhYjFjNzhhOWEzMDY1ZTgzMzRiMTVh
-    Njg1MDRjMjQ5NTlhYmU3NTk2MDBiMWQyZmI3MDIyMWUwNGM1NjM=
+    NmVkNTY4MzE3ZWNhMDhlNWI5OTJlODBkM2ZmOTg1Y2RjODViOThiMzAyYmUy
+    ZTY5N2RhN2EyNzFhMmY3MjhlNmJkNjI4MGVlOTBjZDgwYmM4ZTY0ZjhlNjE2
+    OGU5ZTRlNzAzOGRiOTlmMmZkZDg0NzcyMGQwZDFlYjVmYjAwMzI=
   data.tar.gz: !binary |-
-    ODZiMmY1ZDMwNTVlNmM3ZGJmZDIzOTAzYjQ1MTFlMmY4OTBlODhhZWZhN2M3
-    MjEzYjI0YjZhNTA4NzkzYjJiMzMwY2Y0NzliMGMyZDdlMjI3NjkzMTliYWNh
-    YzQ2MGVmNWM1NTA5M2IxYjcxNzliNzhhMzViMDE4OTM3NWY1NjI=
+    N2E0OGJhNjVjOWM3YTMzMzk5YjdjZDdiY2IzMGE5N2RkNTJkNDlkMDcyYTI3
+    YzY5Y2YzZTE4NjcxYjFlNzhhMmM4NDExYzk0NGEzMTE1ZmMwNDUzNzExMTA5
+    OGI3MDFlMzFkYWQ2Y2UzNmI1MGQ5ZGEwNGYyNTg4N2MzMTk3NTk=

data/CHANGELOG.md CHANGED

@@ -1,3 +1,12 @@
+### 0.2.00 / 2015-01-03
+Version number bump: major re-design of the parse tree generation.
+* [NEW] Class `ParseTreeBuilder`: builder for creating parse tree.
+* [NEW] Class `ParseStateTracker`: helper class used in parse tree generation.
+* [NEW] Example file `parsing_L0.rb`: demo using a (highly simplified) English grammar.
+* [CHANGE]  Class `ParseTree`: construction methods removed.
+* [CHANGE]  Method `Parsing#parse_tree` completely rewritten.
+* [FIX] Method `Parsing#parse_tree` now handles situations where there are multiple complete parse states for a non-terminal.
 ### 0.1.12 / 2014-12-22
 * [FIX] Fixed `Parsing#parse_tree`: code couldn't cope with parse state set containing more
   than one parse state that expected the same symbol.

data/examples/parsers/parsing_L0.rb ADDED

@@ -0,0 +1,125 @@
+# Purpose: to demonstrate how to build and render a parse tree for the L0
+# language
+require 'pp'
+require 'rley'  # Load the gem
+# Steps to render a parse tree (of a valid parsed input):
+# 1. Define a grammar
+# 2. Create a tokenizer for the language
+# 3. Create a parser for that grammar
+# 4. Tokenize the input
+# 5. Let the parser process the input
+# 6. Generate a parse tree from the parse result
+# 7. Render the parse tree (in JSON)
+########################################
+# Step 1. Define a grammar for a micro English-like language
+# based on Jurafsky & Martin L0 language.
+# It defines the syntax of a sentence in a language with a
+# very limited syntax and lexicon in the context of airline reservation.
+builder = Rley::Syntax::GrammarBuilder.new
+builder.add_terminals('Noun', 'Verb', 'Pronoun', 'Proper-Noun')
+builder.add_terminals('Determiner', 'Preposition', )
+builder.add_production('S' => %w[NP VP])
+builder.add_production('NP' => 'Pronoun')
+builder.add_production('NP' => 'Proper-Noun')
+builder.add_production('NP' => %w[Determiner Nominal])
+builder.add_production('Nominal' => %w[Nominal Noun])
+builder.add_production('Nominal' => 'Noun')
+builder.add_production('VP' => 'Verb')
+builder.add_production('VP' => %w[Verb NP])
+builder.add_production('VP' => %w[Verb NP PP])
+builder.add_production('VP' => %w[Verb PP])
+builder.add_production('PP' => %w[Preposition PP])
+# And now build the grammar...
+grammar_l0 = builder.grammar
+########################################
+# 2. Create a tokenizer for the language
+# The tokenizer transforms the input into an array of tokens
+# This is a very simplistic implementation for demo purposes.
+# The lexicon is just a Hash with pairs of the form:
+# word =>terminal symbol name
+L0_lexicon = {
+  'flight' => 'Noun',
+  'breeze' => 'Noun',
+  'trip' => 'Noun',
+  'morning' => 'Noun',
+  'is' => 'Verb',
+  'prefer' => 'Verb',
+  'like' => 'Verb',
+  'need' => 'Verb',
+  'want' => 'Verb',
+  'fly' => 'Verb',
+  'me' => 'Pronoun',
+  'I' => 'Pronoun',
+  'you' => 'Pronoun',
+  'it' => 'Pronoun',
+  'Alaska' => 'Proper-Noun',
+  'Baltimore' => 'Proper-Noun',
+  'Chicago' => 'Proper-Noun',
+  'United' => 'Proper-Noun',
+  'American' => 'Proper-Noun',
+  'the' => 'Determiner',
+  'a' => 'Determiner',
+  'an' => 'Determiner',
+  'this' => 'Determiner',
+  'these' => 'Determiner',
+  'that' => 'Determiner',
+  'from' => 'Preposition',
+  'to' => 'Preposition',
+  'on' => 'Preposition',
+  'near' => 'Preposition'
+}
+# Highly simplified tokenizer implementation.
+def tokenizer(aText, aGrammar)
+  tokens = aText.scan(/\S+/).map do |word|
+    term_name = L0_lexicon[word]
+    if term_name.nil?
+      fail StandardError, "Word '#{word}' not found in lexicon"
+    end
+    terminal = aGrammar.name2symbol[term_name]
+    Rley::Parser::Token.new(word, terminal)
+  end
+  return tokens
+end
+########################################
+# Step 3. Create a parser for that grammar
+parser = Rley::Parser::EarleyParser.new(grammar_l0)
+########################################
+# Step 4. Tokenize the input
+valid_input = 'I prefer a morning flight'
+# Another sentence: it is a flight from Chicago
+tokens = tokenizer(valid_input, grammar_l0)
+########################################
+# Step 5. Let the parser process the input
+result = parser.parse(tokens)
+puts "Parsing success? #{result.success?}"
+########################################
+# Step 6. Generate a parse tree from the parse result
+ptree = result.parse_tree
+pp ptree
+########################################
+# Step 7. Render the parse tree (in JSON)
+# Let's create a parse tree visitor
+visitor = Rley::ParseTreeVisitor.new(ptree)
+#Here we create a renderer object...
+renderer = Rley::Formatter::Json.new(STDOUT)
+# Now emit the parse tree as JSON on the console output
+puts "JSON rendering of the parse tree for '#{valid_input}' input:"
+renderer.render(visitor)
+# End of file

data/examples/parsers/parsing_b_expr.rb CHANGED

@@ -1,85 +1,85 @@
-# Purpose: to demonstrate how to parse basic arithmetic expressions
-# and render a parse tree
-require 'pp' # TODO remove this dependency
-require 'rley'  # Load the gem
-# Steps to render a parse tree (of a valid parsed input):
-# 1. Define a grammar
-# 2. Create a tokenizer for the language
-# 3. Create a parser for that grammar
-# 4. Tokenize the input
-# 5. Let the parser process the input
-# 6. Generate a parse tree from the parse result
-# 7. Render the parse tree (in JSON)
-########################################
-# Step 1. Define a grammar for a very simple arithmetic expression language
-# (based on example in article on Earley's algorithm in Wikipedia)
-# Let's create the grammar piece by piece
-builder = Rley::Syntax::GrammarBuilder.new
-builder.add_terminals('+', '*', 'integer')
-builder.add_production('P' => 'S')
-builder.add_production('S' => %w(S + M))
-builder.add_production('S' => 'M')
-builder.add_production('M' => %w(M * T))
-builder.add_production('M' => 'T')
-builder.add_production('T' => 'integer')
-# And now build the grammar...
-grammar_s_expr = builder.grammar
-########################################
-# 2. Create a tokenizer for the language
-# The tokenizer transforms the input into an array of tokens
-def tokenizer(aText, aGrammar)
-  tokens = aText.scan(/\S+/).map do |lexeme|
-    case lexeme
-      when '+', '*'
-        terminal = aGrammar.name2symbol[lexeme]
-      when /^[-+]?\d+$/
-        terminal = aGrammar.name2symbol['integer']
-      else
-        msg = "Unknown input text '#{lexeme}'"
-        fail StandardError, msg
-    end
-    Rley::Parser::Token.new(lexeme, terminal)
-  end
-  return tokens
-end
-########################################
-# Step 3. Create a parser for that grammar
-parser = Rley::Parser::EarleyParser.new(grammar_s_expr)
-########################################
-# Step 3. Tokenize the input
-valid_input = '2 + 3 * 4'
-tokens = tokenizer(valid_input, grammar_s_expr)
-########################################
-# Step 5. Let the parser process the input
-result = parser.parse(tokens)
-puts "Parse successful? #{result.success?}"
-pp result
-########################################
-# Step 6. Generate a parse tree from the parse result
-ptree = result.parse_tree
-=begin
-########################################
-# Step 7. Render the parse tree (in JSON)
-# Let's create a parse tree visitor
-visitor = Rley::ParseTreeVisitor.new(ptree)
-#Here we create a renderer object...
-renderer = Rley::Formatter::Json.new(STDOUT)
-# Now emit the parse tree as JSON on the console output
-puts "JSON rendering of the parse tree for '#{valid_input}' input:"
-renderer.render(visitor)
-=end
+# Purpose: to demonstrate how to parse basic arithmetic expressions
+# and render a parse tree
+require 'pp' # TODO remove this dependency
+require 'rley'  # Load the gem
+# Steps to render a parse tree (of a valid parsed input):
+# 1. Define a grammar
+# 2. Create a tokenizer for the language
+# 3. Create a parser for that grammar
+# 4. Tokenize the input
+# 5. Let the parser process the input
+# 6. Generate a parse tree from the parse result
+# 7. Render the parse tree (in JSON)
+########################################
+# Step 1. Define a grammar for a very simple arithmetic expression language
+# (based on example in article on Earley's algorithm in Wikipedia)
+# Let's create the grammar piece by piece
+builder = Rley::Syntax::GrammarBuilder.new
+builder.add_terminals('+', '*', 'integer')
+builder.add_production('P' => 'S')
+builder.add_production('S' => %w(S + M))
+builder.add_production('S' => 'M')
+builder.add_production('M' => %w(M * T))
+builder.add_production('M' => 'T')
+builder.add_production('T' => 'integer')
+# And now build the grammar...
+grammar_s_expr = builder.grammar
+########################################
+# 2. Create a tokenizer for the language
+# The tokenizer transforms the input into an array of tokens
+def tokenizer(aText, aGrammar)
+  tokens = aText.scan(/\S+/).map do |lexeme|
+    case lexeme
+      when '+', '*'
+        terminal = aGrammar.name2symbol[lexeme]
+      when /^[-+]?\d+$/
+        terminal = aGrammar.name2symbol['integer']
+      else
+        msg = "Unknown input text '#{lexeme}'"
+        fail StandardError, msg
+    end
+    Rley::Parser::Token.new(lexeme, terminal)
+  end
+  return tokens
+end
+########################################
+# Step 3. Create a parser for that grammar
+parser = Rley::Parser::EarleyParser.new(grammar_s_expr)
+########################################
+# Step 4. Tokenize the input
+valid_input = '2 + 3 * 4'
+tokens = tokenizer(valid_input, grammar_s_expr)
+########################################
+# Step 5. Let the parser process the input
+result = parser.parse(tokens)
+puts "Parse successful? #{result.success?}"
+########################################
+# Step 6. Generate a parse tree from the parse result
+ptree = result.parse_tree
+pp ptree
+########################################
+# Step 7. Render the parse tree (in JSON)
+# Let's create a parse tree visitor
+visitor = Rley::ParseTreeVisitor.new(ptree)
+#Here we create a renderer object...
+renderer = Rley::Formatter::Json.new(STDOUT)
+# Now emit the parse tree as JSON on the console output
+puts "JSON rendering of the parse tree for '#{valid_input}' input:"
+renderer.render(visitor)
 # End of file

data/lib/rley/constants.rb CHANGED

@@ -3,7 +3,7 @@
 module Rley # Module used as a namespace
   # The version number of the gem.
-  Version = '0.1.12'
+  Version = '0.2.00'
   # Brief description of the gem.
   Description = "Ruby implementation of the Earley's parsing algorithm"

data/lib/rley/formatter/json.rb CHANGED

@@ -64,6 +64,10 @@ module Rley # This module is used as a namespace
       def before_terminal(term_node)
         separator = sibling_flags[-1] ? ",\n" : "\n"
         name = term_node.symbol.name
+        if term_node.token.nil?
+          msg = "No token associated with #{name}"
+          fail StandardError, msg
+        end
         lexeme = term_node.token.lexeme
         print_text(separator, "{\"#{name}\": \"#{lexeme}\"}")
         sibling_flags[-1] = true

data/lib/rley/parser/chart.rb CHANGED

@@ -24,6 +24,18 @@ module Rley # This module is used as a namespace
       def [](index)
         return state_sets[index]
       end
+      # Return the index value of the last non-empty state set.
+      def last_index()
+        first_empty =  state_sets.find_index { |set| set.empty? }
+        if first_empty.nil?
+          index = state_sets.size - 1
+        else
+          index = first_empty == 0 ? 0 : first_empty - 1
+        end
+        return index
+      end
       # Push a parse state for the chart entry with given index
       def push_state(aDottedItem, anOrigin, anIndex)

data/lib/rley/parser/parse_state_tracker.rb ADDED

@@ -0,0 +1,55 @@
+module Rley # This module is used as a namespace
+  module Parser # This module is used as a namespace
+    # Helper class that keeps track of the parse states used
+    # while a Parsing instance is constructing a parse tree.
+    class ParseStateTracker
+      # The index of the current state set
+      attr_reader(:state_set_index)
+      # The current parse state
+      attr_reader(:parse_state)
+      # The already processed states from current state set
+      attr_reader(:processed_states)
+      # Constructor. Starts tracking at the state set with the given index.
+      def initialize(aStateSetIndex)
+        self.state_set_index = aStateSetIndex
+      end
+      # Write accessor. Sets the value of the state set index
+      def state_set_index=(anIndex)
+        @state_set_index = anIndex
+        @processed_states = {}
+      end
+      # Write accessor. Set the given parse state as the current one.
+      def parse_state=(aParseState)
+        @parse_state = aParseState
+        self.processed_states[parse_state] = true
+      end
+      # Take the first provided state that wasn't processed yet.
+      def select_state(theStates)
+        a_state = theStates.find { |st| ! processed_states.include?(st) }
+        self.parse_state = a_state
+      end
+      # The dotted item for the current parse state.
+      def curr_dotted_item()
+        parse_state.dotted_rule
+      end
+      def symbol_on_left()
+        return curr_dotted_item.prev_symbol
+      end
+      # Notification that one begins with the previous state set
+      def to_prev_state_set()
+        self.state_set_index = self.state_set_index - 1
+      end
+    end # class
+  end # module
+end # module
+# End of file

data/lib/rley/parser/parse_tree_builder.rb ADDED

@@ -0,0 +1,181 @@
+require 'ostruct' # TODO delete this
+require_relative '../ptree/terminal_node'
+require_relative '../ptree/non_terminal_node'
+require_relative '../ptree/parse_tree'
+module Rley # This module is used as a namespace
+  module Parser # This module is used as a namespace
+    # Builder GoF pattern. Builder pattern builds a complex object
+    # (say, a parse tree) from simpler objects (terminal and non-terminal
+    # nodes) and using a step by step approach.
+    class ParseTreeBuilder
+      attr_reader(:root)
+      attr_reader(:current_path)
+      def initialize(aStartProduction, aRange)
+        @current_path = []
+        start_symbol = aStartProduction.lhs
+        add_node(start_symbol, aRange)
+        use_production(aStartProduction, aRange)
+        move_down
+      end
+      # Return the active node.
+      def current_node()
+        return current_path.last
+      end
+      # Factory method.
+      def parse_tree()
+        return PTree::ParseTree.new(root)
+      end
+      # Given that the current node is also lhs of the production
+      # associated with the complete parse state,
+      # Then add the rhs constituents as child nodes of the current node.
+      # Assumption: current node is lhs of the production associated
+      # with the parse state.
+      # @param aCompleteState [ParseState] A complete parse state
+      # (dot is at end of rhs)
+      def use_complete_state(aCompleteState)
+        prod = aCompleteState.dotted_rule.production
+        use_production(prod, {low: aCompleteState.origin})
+      end
+      # Given that the current node is a non-terminal
+      # Make its last child node the current node.
+      def move_down()
+        curr_node = current_node
+        unless curr_node.is_a?(PTree::NonTerminalNode)
+          msg = "Current node isn't a non-terminal node #{curr_node.class}"
+          fail StandardError, msg
+        end
+        children = curr_node.children
+        path_increment = [children.size - 1, children.last]
+        @current_path.concat(path_increment)
+      end
+      # Make the predecessor of current node the
+      # new current node.
+      def move_back()
+        begin
+          if current_path.length == 1
+            msg = 'Cannot move further back'
+            fail StandardError, msg
+          end
+          (parent, pos, child_node) = current_path[-3, 3]
+          current_path.pop(2)
+          if pos > 0
+            new_pos = pos - 1
+            new_curr_node = parent.children[new_pos]
+            current_path << new_pos
+            current_path << new_curr_node
+            range = high_bound(child_node.range.low)
+          end
+        end while pos == 0 && new_curr_node.is_a?(PTree::NonTerminalNode)
+      end
+      # Add a child node to the current node.
+      def add_node(aSymbol, aRange)
+        # Create the node
+        a_node = new_node(aSymbol, aRange)
+        # Add it to the current node
+        add_child(a_node)
+      end
+      # Set unbound endpoints of current node range
+      # to the given range.
+      def range=(aRange)
+        curr_node = current_node
+        return if curr_node.nil?
+        lower = low_bound(aRange)
+        unless lower.nil?
+          current_node.range = lower
+          if curr_node.is_a?(PTree::TerminalNode)
+            current_node.range = high_bound(lower[:low] + 1)
+          end
+        end
+        upper = high_bound(aRange)
+        current_node.range = upper unless upper.nil?
+      end
+      private
+      def new_node(aSymbol, aRange)
+        case aSymbol
+          when Syntax::Terminal
+            new_node = PTree::TerminalNode.new(aSymbol, aRange)
+          when Syntax::NonTerminal
+            new_node = PTree::NonTerminalNode.new(aSymbol, aRange)
+        end
+        return new_node
+      end
+      # Add children nodes to current one.
+      # The children correspond to the members of the rhs of the production.
+      def use_production(aProduction, aRange)
+        prod = aProduction
+        curr_node = current_node
+        if curr_node.symbol != prod.lhs
+          msg = "Current node is a #{curr_node.symbol} instead of #{prod.lhs}"
+          fail StandardError, msg
+        end
+        self.range = aRange
+        prod.rhs.each { |symb| add_node(symb, {}) }
+        unless curr_node.children.empty?
+          curr_node.children.first.range.assign({ low: curr_node.range.low })
+          curr_node.children.last.range.assign({ high: curr_node.range.high })
+        end
+      end
+      # Add the given node as child node of current node
+      def add_child(aNode)
+        curr_node = current_node
+        if curr_node.nil?
+          self.root = aNode
+        else
+          curr_node.children << aNode
+        end
+      end
+      # Set the root node of the tree.
+      def root=(aNode)
+        @root = aNode
+        @current_path = [ @root ]
+        root.range = low_bound(0)
+      end
+      def low_bound(aRange)
+        result = case aRange
+          when Fixnum then aRange
+          when Hash then aRange[:low]
+          when PTree::TokenRange then aRange.low
+        end
+        return { low: result }
+      end
+      def high_bound(aRange)
+        result = case aRange
+          when Fixnum then aRange
+          when Hash then aRange[:high]
+          when PTree::TokenRange then aRange.high
+        end
+        return { high: result }
+      end
+    end # class
+  end # module
+end # module
+# End of file