RubyGems - rley - Versions diffs - 0.5.10 → 0.5.11 - Mend

rley 0.5.10 → 0.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +15 -0
data/LICENSE.txt +1 -1
data/README.md +2 -1
data/appveyor.yml +6 -5
data/examples/NLP/engtagger.rb +176 -0
data/examples/general/SRL/lib/ast_builder.rb +217 -21
data/examples/general/SRL/lib/grammar.rb +33 -5
data/examples/general/SRL/lib/regex/alternation.rb +30 -0
data/examples/general/SRL/lib/regex/char_class.rb +28 -22
data/examples/general/SRL/lib/regex/char_shorthand.rb +50 -0
data/examples/general/SRL/lib/regex/character.rb +5 -3
data/examples/general/SRL/lib/regex/concatenation.rb +32 -0
data/examples/general/SRL/lib/regex/non_capturing_group.rb +29 -0
data/examples/general/SRL/lib/regex/wildcard.rb +26 -0
data/examples/general/SRL/lib/regex_repr.rb +5 -0
data/examples/general/SRL/lib/tokenizer.rb +28 -3
data/examples/general/SRL/spec/integration_spec.rb +151 -8
data/examples/general/SRL/spec/tokenizer_spec.rb +12 -0
data/examples/general/left.rb +36 -0
data/examples/general/right.rb +36 -0
data/lib/rley/constants.rb +1 -1
data/lib/rley/gfg/edge.rb +12 -1
data/lib/rley/gfg/grm_flow_graph.rb +21 -1
data/lib/rley/gfg/item_vertex.rb +1 -1
data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
data/lib/rley/gfg/start_vertex.rb +1 -0
data/lib/rley/gfg/vertex.rb +27 -0
data/lib/rley/lexical/token.rb +1 -0
data/lib/rley/parser/error_reason.rb +2 -1
data/lib/rley/parser/gfg_chart.rb +14 -0
data/lib/rley/parser/gfg_earley_parser.rb +0 -1
data/lib/rley/parser/gfg_parsing.rb +4 -3
data/lib/rley/parser/parse_entry.rb +33 -3
data/lib/rley/parser/parse_entry_set.rb +14 -2
data/lib/rley/parser/parse_tree_builder.rb +1 -1
data/lib/rley/parser/parse_walker_factory.rb +0 -1
data/lib/rley/syntax/grm_symbol.rb +2 -0
data/lib/rley/syntax/production.rb +15 -3
data/lib/rley/syntax/symbol_seq.rb +16 -1
data/spec/rley/gfg/end_vertex_spec.rb +9 -1
data/spec/rley/gfg/grm_flow_graph_spec.rb +9 -0
data/spec/rley/gfg/item_vertex_spec.rb +9 -0
data/spec/rley/gfg/start_vertex_spec.rb +9 -1
data/spec/rley/parser/gfg_parsing_spec.rb +0 -1
data/spec/rley/parser/parse_entry_set_spec.rb +15 -0
data/spec/rley/parser/parse_entry_spec.rb +24 -13
data/spec/rley/parser/parse_tracer_spec.rb +1 -1
data/spec/rley/syntax/production_spec.rb +10 -0
data/spec/rley/syntax/symbol_seq_spec.rb +5 -0
metadata +10 -2

data/examples/general/SRL/lib/grammar.rb CHANGED

@@ -6,25 +6,53 @@ module SRL
   # This is a very partial grammar of SRL.
   # It will be expanded with the coming versions of Rley
   builder = Rley::Syntax::GrammarBuilder.new do
+    add_terminals('LPAREN', 'RPAREN', 'COMMA')
     add_terminals('DIGIT_LIT', 'INTEGER', 'LETTER_LIT')
+    add_terminals('LITERALLY', 'STRING_LIT')
     add_terminals('UPPERCASE', 'LETTER', 'FROM', 'TO')
-    add_terminals('DIGIT', 'NUMBER')
+    add_terminals('DIGIT', 'NUMBER', 'ANY', 'NO')
+    add_terminals('CHARACTER', 'WHITESPACE', 'ANYTHING')
+    add_terminals('TAB', 'BACKSLASH', 'NEW', 'LINE')
+    add_terminals('OF', 'ONE')
     add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
     add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
     add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
-    # For the moment one focuses on quantifier syntax only...
-    rule 'srl' => 'term'
+    rule 'srl' => 'pattern'
+    rule 'pattern' => %w[pattern COMMA quantifiable]
+    rule 'pattern' => %w[pattern quantifiable]
+    rule 'pattern' => 'quantifiable'
+    rule 'quantifiable' => 'term'
+    rule 'quantifiable' => %w[term quantifier]
     rule 'term' => 'atom'
-    rule 'term' => %w[atom quantifier]
+    rule 'term' => 'alternation'
+    rule 'term' => 'grouping'
     rule 'atom' => 'letter_range'
     rule 'atom' => 'digit_range'
+    rule 'atom' => 'character_class'
+    rule 'atom' => 'special_char'
+    rule 'atom' => 'literal'
     rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
     rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
     rule 'letter_range' => 'LETTER'
     rule 'letter_range' => %w[UPPERCASE LETTER]
     rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
-    rule 'digit_range' => 'digit_or_number'
+    rule 'digit_range' => 'digit_or_number'
+    rule 'character_class' => %w[ANY CHARACTER]
+    rule 'character_class' => %w[NO CHARACTER]
+    rule 'character_class' => 'WHITESPACE'
+    rule 'character_class' => %w[NO WHITESPACE]
+    rule 'character_class' => 'ANYTHING'
+    rule 'character_class' => %w[ONE OF STRING_LIT]
+    rule 'special_char' => 'TAB'
+    rule 'special_char' => 'BACKSLASH'
+    rule 'special_char' => %w[NEW LINE]
+    rule 'literal' => %w[LITERALLY STRING_LIT]
+    rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
+    rule 'alternatives' => %w[alternatives COMMA quantifiable]
+    rule 'alternatives' => %w[alternatives quantifiable]
+    rule 'alternatives' => 'quantifiable'
+    rule 'grouping' => %w[LPAREN pattern RPAREN]
     rule 'quantifier' => 'ONCE'
     rule 'quantifier' => 'TWICE'
     rule 'quantifier' => %w[EXACTLY count TIMES]

data/examples/general/SRL/lib/regex/alternation.rb ADDED

@@ -0,0 +1,30 @@
+# File: alternation.rb
+require_relative 'polyadic_expression'	# Access the superclass
+module Regex # This module is used as a namespace
+# An n-ary matching operator that represents a choice between sub-expressions.
+# It succeeds when any one of its child expressions matches the subject text.
+class Alternation < PolyadicExpression
+	# Constructor. theChildren: the alternatives, handed to the superclass as one Array.
+	def initialize(*theChildren)
+		super(theChildren)
+	end
+public
+	# Conversion method re-definition.
+	# Purpose: Return the String representation of the alternation: each child's to_str joined by '|' inside '(?:' ... ')'.
+	def to_str()
+		result_children = children.map { |aChild| aChild.to_str() }
+		result =  '(?:' + result_children.join('|') + ')'
+		return result
+	end
+end # class
+end # module
+# End of file

data/examples/general/SRL/lib/regex/char_class.rb CHANGED

@@ -4,30 +4,36 @@ require_relative "polyadic_expression"	# Access the superclass
 module Regex # This module is used as a namespace
-# Abstract class. A n-ary matching operator.
-# It succeeds when one child expression succeeds to match the subject text
-# than defined by this concatenation.
-class CharClass < PolyadicExpression
-	# A flag that indicates whether the character is negated
-	attr_reader(:negated)
-	# Constructor.
-	def initialize(to_negate,*theChildren)
-		super(theChildren)
-		@negated = to_negate
-	end
+  # Abstract class. A n-ary matching operator.
+  # It succeeds when one child expression succeeds to match the subject text.
+  class CharClass < PolyadicExpression
+    # These are characters with special meaning in character classes
+    Metachars = ']\^-'.codepoints
+    # A flag that indicates whether the character is negated
+    attr_reader(:negated)
+    # Constructor.
+    def initialize(to_negate,*theChildren)
+      super(theChildren)
+      @negated = to_negate
+    end
-public
-	# Conversion method re-definition.
-	# Purpose: Return the String representation of the concatented expressions.
-	def to_str()
-		result_children = children.inject('') { |subResult, aChild| subResult << aChild.to_str() }
-		result = '['+ (negated ? '^' : '')  + result_children + ']'
-		return result
-	end
+  public
+    # Conversion method re-definition.
+    # Purpose: Return the String representation of the character class.
+    def to_str()
+      result_children = children.inject('') do |subResult, aChild|
+        if aChild.kind_of?(Regex::Character) && Metachars.include?(aChild.codepoint)
+          subResult << "\\" # Escape meta-character...
+        end
+        subResult << aChild.to_str()
+      end
+      result = '['+ (negated ? '^' : '')  + result_children + ']'
+      return result
+    end
-end # class
+  end # class
 end # module

data/examples/general/SRL/lib/regex/char_shorthand.rb ADDED

@@ -0,0 +1,50 @@
+# File: char_shorthand.rb
+require_relative "atomic_expression"	# Access the superclass
+module Regex # This module is used as a namespace
+  # A pre-defined character class is in essence a name for a built-in, standard character class.
+  class CharShorthand < AtomicExpression
+    # A constant Hash that defines all the predefined character shorthands.
+    # It contains pairs of the form:
+    # a pre-defined character shorthand letter => the equivalent bracket expression (a String)
+    StandardCClasses = {
+      'd' => '[0-9]',
+      'D' => '[^0-9]',
+      'h' => '[0-9a-fA-F]',
+      'H' => '[^0-9a-fA-F]',
+      's' => '[ \t\r\n\f]',
+      'S' => '[^ \t\r\n\f]',
+      'w' => '[0-9a-zA-Z_]',
+      'W' => '[^0-9a-zA-Z_]'
+    }
+    # A one-letter abbreviation (one of the keys of StandardCClasses)
+    attr_reader(:shortname)
+    # Constructor. aShortname must be a key of StandardCClasses, otherwise a StandardError is raised.
+    def initialize(aShortname)
+      @shortname = valid_shortname(aShortname)
+    end
+  public
+    # Conversion method re-definition.
+    # Purpose: Return the String representation of the expression: a backslash followed by the shortname.
+    def to_str()
+      return "\\#{shortname}"
+    end
+  private
+    # Return the validated short name. Raises a StandardError when it is not a key of StandardCClasses.
+    def valid_shortname(aShortname)
+      raise StandardError, "Unknown predefined character class \\#{aShortname}" unless StandardCClasses.include? aShortname
+      return aShortname
+    end
+  end # class
+end # module
+# End of file

data/examples/general/SRL/lib/regex/character.rb CHANGED

@@ -25,6 +25,8 @@ class Character < AtomicExpression
 		"\\6" => 6,
 		"\\7" => 7
 	}
+  MetaChars = '\^$+?.'
 	# The integer value that uniquely identifies the character.
 	attr_reader(:codepoint)
@@ -63,7 +65,7 @@ class Character < AtomicExpression
 				end
 				@lexeme = aValue
-			when Fixnum
+			when Integer
 				@codepoint = aValue
 			else
 				raise StandardError, "Cannot initialize a Character with a '#{aValue}'."
@@ -125,14 +127,14 @@ public
 	# newOne == newOne	# true. Identity
 	# newOne == Character.new(?\u03a3)	# true. Both have same codepoint
 	# newOne == ?\u03a3	# true. The single character String match exactly the char attribute.
-	# newOne == 0x03a3	# true. The Fixnum is compared to the codepoint value.
+	# newOne == 0x03a3	# true. The Integer is compared to the codepoint value.
 	# Will test equality with any Object that knows the to_s method
 	def ==(another)
 		result = case another
 			when Character
 				self.to_str == another.to_str
-			when Fixnum
+			when Integer
 				self.codepoint == another
 			when String

data/examples/general/SRL/lib/regex/concatenation.rb ADDED

@@ -0,0 +1,32 @@
+# File: concatenation.rb
+require_relative 'polyadic_expression'	# Access the superclass
+module Regex # This module is used as a namespace
+# An n-ary matching operator that represents a sequence of sub-expressions.
+# It succeeds when each child succeeds to match the subject text in the same
+# serial arrangement as defined by this concatenation.
+class Concatenation < PolyadicExpression
+	# Constructor. theChildren: the sub-expressions, handed to the superclass as one Array.
+	def initialize(*theChildren)
+		super(theChildren)
+	end
+public
+	# Conversion method re-definition.
+	# Purpose: Return the String representation of the concatenated expressions (each child's to_str appended in order).
+	def to_str()
+		result = children.inject('') { |result, aChild|
+			result << aChild.to_str()
+		}
+		return result
+	end
+end # class
+end # module
+# End of file

data/examples/general/SRL/lib/regex/non_capturing_group.rb ADDED

@@ -0,0 +1,29 @@
+# File: non_capturing_group.rb
+require_relative "monadic_expression"	# Access the superclass
+module Regex # This module is used as a namespace
+  # A non-capturing group, in other words a pure grouping of sub-expressions
+  class NonCapturingGroup < MonadicExpression
+    # Constructor.
+    # [aChildExpression]	The sub-expression to group. The group is emitted
+    # with the '(?:' prefix, i.e. the regex syntax for a group without capture.
+    def initialize(aChildExpression)
+      super(aChildExpression)
+    end
+  public
+    # Conversion method re-definition.
+    # Purpose: Return the String representation of the group: the result of all_child_text() wrapped in '(?:' and ')'.
+    def to_str()
+      result = '(?:' + all_child_text() + ")"
+      return result
+    end
+  end # class
+end # module
+# End of file

data/examples/general/SRL/lib/regex/wildcard.rb ADDED

@@ -0,0 +1,26 @@
+# File: wildcard.rb
+require_relative 'atomic_expression'	# Access the superclass
+module Regex # This module is used as a namespace
+# A wildcard matches any character (except for the newline).
+class Wildcard < AtomicExpression
+	# Constructor. Takes no argument and only invokes the superclass initializer.
+	def initialize()
+		super
+	end
+public
+	# Conversion method re-definition.
+	# Purpose: Return the String representation of the expression, which is always '.'.
+	def to_str()
+		return '.'
+	end
+end # class
+end # module
+# End of file

data/examples/general/SRL/lib/regex_repr.rb CHANGED

@@ -1,5 +1,10 @@
 require_relative './regex/character'
 require_relative './regex/char_range'
+require_relative './regex/concatenation'
 require_relative './regex/multiplicity'
 require_relative './regex/repetition'
 require_relative './regex/char_class'
+require_relative './regex/char_shorthand'
+require_relative './regex/wildcard'
+require_relative './regex/alternation'
+require_relative './regex/non_capturing_group'

data/examples/general/SRL/lib/tokenizer.rb CHANGED

@@ -26,23 +26,35 @@ module SRL
     # Here are all the SRL keywords (in uppercase)
     @@keywords = %w[
       AND
+      ANY
+      ANYTHING
       AT
+      BACKSLASH
       BETWEEN
+      CHARACTER
       DIGIT
       EXACTLY
       FROM
       LEAST
       LETTER
+      LINE
+      LITERALLY
       MORE
       NEVER
+      NEW
+      NO
       NUMBER
+      OF
       ONCE
+      ONE
       OPTIONAL
       OR
+      TAB
       TIMES
       TO
       TWICE
       UPPERCASE
+      WHITESPACE
     ].map { |x| [x, x] } .to_h
     class ScanError < StandardError; end
@@ -68,7 +80,7 @@ module SRL
     def _next_token()
       skip_whitespaces
       curr_ch = scanner.peek(1)
-      return nil if curr_ch.nil?
+      return nil if curr_ch.nil? || curr_ch.empty?
       token = nil
@@ -83,7 +95,13 @@ module SRL
         token = build_token(@@keywords[lexeme.upcase], lexeme)
         # TODO: handle case unknown identifier
       elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s)|$)/))
-        token = build_token('LETTER_LIT', lexeme)
+        token = build_token('LETTER_LIT', lexeme)
+      elsif (lexeme = scanner.scan(/"([^"]|\\")*"/)) # Double quotes literal?
+        unquoted = lexeme.gsub(/(^")|("$)/, '')
+        token = build_token('STRING_LIT', unquoted)
+      elsif (lexeme = scanner.scan(/'([^']|\\')*'/)) # Single quotes literal?
+        unquoted = lexeme.gsub(/(^')|('$)/, '')
+        token = build_token('STRING_LIT', unquoted)
       else # Unknown token
         erroneous = curr_ch.nil? ? '' : curr_ch
         sequel = scanner.scan(/.{1,20}/)
@@ -96,7 +114,14 @@ module SRL
     def build_token(aSymbolName, aLexeme)
       token_type = name2symbol[aSymbolName]
-      return Rley::Lexical::Token.new(aLexeme, token_type)
+      begin
+        token = Rley::Lexical::Token.new(aLexeme, token_type)
+      rescue Exception => ex
+        puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
+        raise ex
+      end
+      return token
     end
     def skip_whitespaces()

data/examples/general/SRL/spec/integration_spec.rb CHANGED

@@ -16,7 +16,6 @@ describe 'Integration tests:' do
   end
   context 'Parsing character ranges:' do
     it "should parse 'letter from ... to ...' syntax" do
       result = parse('letter from a to f')
       expect(result).to be_success
@@ -56,13 +55,41 @@ describe 'Integration tests:' do
       regexp = regexp_repr(result)
       expect(regexp.to_str).to eq('[1-4]')
     end
+  end # context
+  context 'Parsing string literals:' do
+    it 'should parse double quotes literal string' do
+      result = parse('literally "hello"')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('hello')
+    end
+    it 'should parse single quotes literal string' do
+      result = parse("literally 'hello'")
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('hello')
+    end
+    it 'should escape special characters' do
+      result = parse("literally '.'")
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\.')
+    end
+  end
+  context 'Parsing character classes:' do
     it "should parse 'digit' syntax" do
       result = parse('digit')
       expect(result).to be_success
       regexp = regexp_repr(result)
-      expect(regexp.to_str).to eq('[0-9]')
+      expect(regexp.to_str).to eq('\d')
     end
     it "should parse 'number' syntax" do
@@ -70,9 +97,126 @@ describe 'Integration tests:' do
       expect(result).to be_success
       regexp = regexp_repr(result)
-      expect(regexp.to_str).to eq('[0-9]')
+      expect(regexp.to_str).to eq('\d')
+    end
+    it "should parse 'any character' syntax" do
+      result = parse('any character')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\w')
+    end
+    it "should parse 'no character' syntax" do
+      result = parse('no character')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\W')
+    end
+    it "should parse 'whitespace' syntax" do
+      result = parse('whitespace')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\s')
+    end
+    it "should parse 'no whitespace' syntax" do
+      result = parse('no whitespace')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\S')
     end
+    it "should parse 'anything' syntax" do
+      result = parse('anything')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('.')
+    end
+    it "should parse 'one of' syntax" do
+      result = parse('one of "._%+-"')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      # Remark: reference implementation less readable
+      # (escapes more characters than required)
+      expect(regexp.to_str).to eq('[._%+\-]')
+    end
+  end # context
+  context 'Parsing special character declarations:' do
+    it "should parse 'tab' syntax" do
+      result = parse('tab')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\t')
+    end
+    it "should parse 'backslash' syntax" do
+      result = parse('backslash')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\\')
+    end
+    it "should parse 'new line' syntax" do
+      result = parse('new line')
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('\n')
+    end
+  end # context
+  context 'Parsing alternations:' do
+    it "should parse 'any of' syntax" do
+      source = 'any of (any character, one of "._%-+")'
+      result = parse(source)
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      expect(regexp.to_str).to eq('(?:\w|[._%\-+])')
+    end
+  end # context
+  context 'Parsing concatenation:' do
+    it "should reject dangling comma" do
+      source = 'literally "a",'
+      result = parse(source)
+      expect(result).not_to be_success
+      message_prefix = /Premature end of input after ','/
+      expect(result.failure_reason.message).to match(message_prefix)
+    end
+    it "should parse a sequence of patterns" do
+      #
+      # DEBUG When I put a comma at the end ... looping endlessly
+      #
+      source = <<-ENDS
+      any of (any character, one of "._%-+") once or more,
+      literally "@",
+      any of (digit, letter, one of ".-") once or more,
+      literally ".",
+      letter at least 2 times
+ENDS
+      result = parse(source)
+      expect(result).to be_success
+      regexp = regexp_repr(result)
+      # SRL expect: (?:\w|[\._%\-\+])+(?:@)(?:[0-9]|[a-z]|[\.\-])+(?:\.)[a-z]{2,}
+      expect(regexp.to_str).to eq('(?:\w|[._%\-+])+@(?:\d|[a-z]|[.\-])+\.[a-z]{2,}')
+    end
   end # context
   context 'Parsing quantifiers:' do
@@ -87,19 +231,19 @@ describe 'Integration tests:' do
     end
     it "should parse 'twice' syntax" do
-      result = parse(prefix + 'twice')
+      result = parse('digit twice')
       expect(result).to be_success
       regexp = regexp_repr(result)
-      expect(regexp.to_str).to eq('[p-t]{2}')
+      expect(regexp.to_str).to eq('\d{2}')
     end
     it "should parse 'optional' syntax" do
-      result = parse(prefix + 'optional')
+      result = parse('anything optional')
       expect(result).to be_success
       regexp = regexp_repr(result)
-      expect(regexp.to_str).to eq('[p-t]?')
+      expect(regexp.to_str).to eq('.?')
     end
     it "should parse 'exactly ... times' syntax" do
@@ -121,7 +265,6 @@ describe 'Integration tests:' do
       expect(regexp.to_str).to eq('[p-t]{2,4}')
     end
     it "should parse 'once or more' syntax" do
       result = parse(prefix + 'once or more')
       expect(result).to be_success