RubyGems - dhaka - Versions diffs - 2.0.1 → 2.1.0 - Mend

dhaka 2.0.1 → 2.1.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (48)

data/Rakefile +64 -0
data/lib/dhaka.rb +12 -0
data/lib/dot/dot.rb +29 -0
data/lib/evaluator/evaluator.rb +35 -26
data/lib/grammar/grammar.rb +42 -17
data/lib/grammar/grammar_symbol.rb +4 -3
data/lib/grammar/production.rb +9 -3
data/lib/lexer/compiled_lexer.rb +46 -0
data/lib/lexer/dfa.rb +71 -0
data/lib/lexer/lexeme.rb +33 -0
data/lib/lexer/lexer.rb +61 -0
data/lib/lexer/lexer_run.rb +66 -0
data/lib/lexer/regex_grammar.rb +368 -0
data/lib/lexer/regex_parser.rb +1888 -0
data/lib/lexer/regex_tokenizer.rb +14 -0
data/lib/lexer/specification.rb +69 -0
data/lib/lexer/state.rb +45 -0
data/lib/lexer/state_machine.rb +37 -0
data/lib/parser/action.rb +3 -3
data/lib/parser/compiled_parser.rb +11 -3
data/lib/parser/parse_result.rb +3 -5
data/lib/parser/parse_tree.rb +6 -17
data/lib/parser/parser.rb +15 -14
data/lib/parser/parser_run.rb +4 -2
data/lib/parser/parser_state.rb +16 -8
data/lib/tokenizer/tokenizer.rb +5 -3
data/test/arithmetic_precedence/arithmetic_precedence_lexer_specification.rb +23 -0
data/test/arithmetic_precedence/arithmetic_precedence_parser_test.rb +4 -2
data/test/chittagong/chittagong_driver.rb +12 -13
data/test/chittagong/chittagong_driver_test.rb +18 -11
data/test/chittagong/chittagong_evaluator.rb +7 -16
data/test/chittagong/chittagong_evaluator_test.rb +7 -4
data/test/chittagong/chittagong_grammar.rb +0 -6
data/test/chittagong/chittagong_lexer.rb +109 -0
data/test/chittagong/chittagong_lexer_specification.rb +39 -0
data/test/chittagong/{chittagong_tokenizer_test.rb → chittagong_lexer_test.rb} +12 -6
data/test/chittagong/chittagong_parser.rb +879 -0
data/test/chittagong/chittagong_parser_test.rb +8 -10
data/test/chittagong/chittagong_test.rb +17 -13
data/test/compiled_parser_test.rb +7 -2
data/test/evaluator_test.rb +0 -1
data/test/grammar_test.rb +19 -1
data/test/lexer_test.rb +215 -0
data/test/parse_result_test.rb +8 -8
data/test/parser_state_test.rb +0 -12
metadata +21 -5
data/test/arithmetic_precedence/arithmetic_precedence_tokenizer.rb +0 -39
data/test/chittagong/chittagong_tokenizer.rb +0 -88

data/test/chittagong/chittagong_parser_test.rb CHANGED

@@ -1,31 +1,29 @@
 require File.dirname(__FILE__) + '/../dhaka_test_helper'
 require File.dirname(__FILE__) + '/chittagong_grammar'
+begin
+  require File.dirname(__FILE__) + "/chittagong_parser"
+rescue LoadError
+  puts "Please run the rake command in the root folder to generate the lexer and parser required for this test."
+  exit
+end
 class TestChittagongParser < Test::Unit::TestCase
-  def setup
-    fake_logger = FakeLogger.new
-    @parser = Dhaka::Parser.new(ChittagongGrammar, fake_logger)
-    assert_equal(80, fake_logger.warnings.size)
-    assert_equal(0, fake_logger.errors.size)
-    eval(@parser.compile_to_ruby_source_as(:ChittagongParser)) unless defined? ChittagongParser
-  end
   def test_parses_a_series_of_statements
     token_stream = build_tokens(
       'newline',
-      'word_literal', '=', 'int_literal', 'newline',
+      'word_literal', '=', 'numeric_literal', 'newline',
       'print', 'word_literal', 'newline',
       'newline',
       'word_literal', '=', 'word_literal', 'newline',
       'newline', Dhaka::END_SYMBOL_NAME
     )
-    result = @parser.parse(token_stream)
+    result = ChittagongParser.parse(token_stream)
     assert_equal(["single_term",
      "some_terms",
      "variable_name",
-     "integer",
      "literal",
      "assignment_statement",
      "main_body_simple_statement",

data/test/chittagong/chittagong_test.rb CHANGED

@@ -1,10 +1,14 @@
 require File.dirname(__FILE__) + '/../dhaka_test_helper'
 require File.dirname(__FILE__) + "/chittagong_grammar"
-require File.dirname(__FILE__) + "/chittagong_tokenizer"
-require File.dirname(__FILE__) + "/chittagong_evaluator"
-unless defined? ChittagongParser
-  eval(Dhaka::Parser.new(ChittagongGrammar, FakeLogger.new).compile_to_ruby_source_as(:ChittagongParser))
+require File.dirname(__FILE__) + "/chittagong_lexer_specification"
+begin
+  require File.dirname(__FILE__) + "/chittagong_parser"
+  require File.dirname(__FILE__) + "/chittagong_lexer"
+rescue LoadError
+  puts "Please run the rake command in the root folder to generate the lexer and parser required for this test."
+  exit
 end
+require File.dirname(__FILE__) + "/chittagong_evaluator"
 class TestChittagong < Test::Unit::TestCase
@@ -15,7 +19,7 @@ class TestChittagong < Test::Unit::TestCase
   def program_output program
     output_stream = []
-    parse_result = ChittagongParser.parse(ChittagongTokenizer.tokenize(program))
+    parse_result = ChittagongParser.parse(ChittagongLexer.lex(program))
     result = ChittagongEvaluator.new([{}], output_stream).evaluate(parse_result)
     return result, output_stream
   end
@@ -37,7 +41,7 @@ class TestChittagong < Test::Unit::TestCase
     "
     result, output_stream = program_output(program)
-    assert_equal(["1", "1", "2", "3", "5", "8", "13", "21", "34"], output_stream)
+    assert_equal(["1.0", "1.0", "2.0", "3.0", "5.0", "8.0", "13.0", "21.0", "34.0"], output_stream)
   end
   def test_iterative_fibonacci_with_functions
@@ -64,7 +68,7 @@ class TestChittagong < Test::Unit::TestCase
     "
     result, output_stream = program_output(program)
-    assert_equal(["1", "1", "2", "3", "5", "8", "13", "21", "34"], output_stream)
+    assert_equal(["1.0", "1.0", "2.0", "3.0", "5.0", "8.0", "13.0", "21.0", "34.0"], output_stream)
   end
   def test_recursive_factorial
@@ -83,7 +87,7 @@ class TestChittagong < Test::Unit::TestCase
     end"
     result, output_stream = program_output(program)
-    assert_equal((1..10).collect {|i| fact(i).to_s}, output_stream)
+    assert_equal((1..10).collect {|i| fact(i).to_f.to_s}, output_stream)
   end
   def test_various_things
@@ -106,7 +110,7 @@ class TestChittagong < Test::Unit::TestCase
     "
     result, output_stream = program_output(program)
-    assert_equal(["4", "1", "6"], output_stream)
+    assert_equal(["4.0", "1.0", "6.0"], output_stream)
   end
   def test_if_else_block
@@ -126,7 +130,7 @@ class TestChittagong < Test::Unit::TestCase
     "
     result, output_stream = program_output(program)
-    assert_equal(["1", "2"], output_stream)
+    assert_equal(["1.0", "2.0"], output_stream)
   end
   def test_no_arg_functions
@@ -141,15 +145,15 @@ class TestChittagong < Test::Unit::TestCase
     "
     result, output_stream = program_output(program)
-    assert_equal(["1", "2"], output_stream)
+    assert_equal(["1.0", "2.0"], output_stream)
   end
   def test_decimal_numbers
     program = "
-    print .2347 * 23.34
+    print 0.2347 * 23.34
     a = 1.012
     b = 345.44
-    c = .234
+    c = 0.234
     print (a^b)/c
     def foo(a)
       print a

data/test/compiled_parser_test.rb CHANGED

@@ -4,7 +4,7 @@ eval(Dhaka::Parser.new(SimpleGrammar).compile_to_ruby_source_as(:SimpleParser))
 class TestCompiledParser < Test::Unit::TestCase
   def test_compiled_parser_generates_parse_tree_for_simple_grammar
-    parse_tree = SimpleParser.parse(build_tokens(['(','n','-','(','n','-','n',')',')','-','n','#',Dhaka::END_SYMBOL_NAME]))
+    parse_tree = SimpleParser.parse(build_tokens(%w| ( n - ( n - n ) ) - n # | +[Dhaka::END_SYMBOL_NAME]))
     assert_equal \
       ["literal",
        "term",
@@ -26,9 +26,14 @@ class TestCompiledParser < Test::Unit::TestCase
     parse_result = SimpleParser.parse(build_tokens([Dhaka::END_SYMBOL_NAME]))
     assert parse_result.has_error?
   end
+  def test_parse_result_is_nil_if_no_end_token
+    parse_result = SimpleParser.parse(build_tokens(%w| n - n |))
+    assert_nil(parse_result)
+  end
   def test_parser_returns_error_result_with_index_of_bad_token_if_parse_error
-    parse_result = SimpleParser.parse(build_tokens(['(', '-', ')',Dhaka::END_SYMBOL_NAME]))
+    parse_result = SimpleParser.parse(build_tokens(['(', '-', ')', Dhaka::END_SYMBOL_NAME]))
     assert parse_result.has_error?
     assert_equal '-', parse_result.unexpected_token.symbol_name
   end

data/test/evaluator_test.rb CHANGED

@@ -18,6 +18,5 @@ class TestEvaluator < Test::Unit::TestCase
         end
       end")
     end
   end
 end

data/test/grammar_test.rb CHANGED

@@ -6,7 +6,7 @@ class SimpleGrammarTest < Test::Unit::TestCase
     @grammar = SimpleGrammar
   end
-  def test_loads_symbol_and_classifies_them
+  def test_loads_symbols_and_classifies_them
     expected_non_terminals = Set.new(['E', 'S', 'T', Dhaka::START_SYMBOL_NAME])
     expected_terminals     = Set.new(['-', 'n', '(', ')', '#', Dhaka::END_SYMBOL_NAME])
     assert_equal(expected_non_terminals, Set.new(@grammar.non_terminal_symbols.collect {|symbol| symbol.name}))
@@ -62,4 +62,22 @@ class SimpleGrammarTest < Test::Unit::TestCase
     assert_equal(expected_items, Set.new(closure.values.collect{|item| item.to_s}))
     assert_equal(expected_channels, Set.new(channels.collect{|item| item.to_s}))
   end
+  def test_export_grammar_to_bnf
+    assert_equal(
+'
+"_Start_" :
+  | "S" "#"
+"S" :
+  | "E"
+"E" :
+  | "E" "-" "T"
+  | "T"
+"T" :
+  | "n"
+  | "(" "E" ")"', @grammar.to_bnf)
+  end
 end

data/test/lexer_test.rb ADDED

@@ -0,0 +1,215 @@
+require File.dirname(__FILE__) + '/dhaka_test_helper'
+class TestLexer < Test::Unit::TestCase
+  def test_build_AST_from_parse_tree_and_compute_follow_first_and_last
+    root      = Dhaka::LexerSupport::RegexParser.parse(Dhaka::LexerSupport::RegexTokenizer.tokenize("(a|b)*abb"))
+    star_node = root.left.left.left.left
+    or_node   = star_node.child
+    first_a   = or_node.children[0]
+    first_b   = or_node.children[1]
+    second_a  = root.left.left.left.right
+    second_b  = root.left.left.right
+    last_b    = root.left.right
+    sentinel  = root.right
+    assert(!root.nullable)
+    assert(!root.left.nullable)
+    assert(!root.left.left.nullable)
+    assert(star_node.nullable)
+    assert_equal(Set.new([first_a, first_b, second_a]), root.first)
+    assert_equal(Set.new([last_b]), root.left.last)
+    root.calculate_follow_sets
+    assert_equal(Set.new([first_a, first_b, second_a]), first_a.follow_set)
+    assert_equal(Set.new([first_a, first_b, second_a]), first_b.follow_set)
+    assert_equal(Set.new([second_b]), second_a.follow_set)
+    assert_equal(Set.new([last_b]), second_b.follow_set)
+    assert_equal(Set.new([sentinel]), last_b.follow_set)
+  end
+  def test_DFA_raises_exception_if_empty_regex
+    machine = Dhaka::LexerSupport::DFA.new("")
+    flunk "Should have thrown an unexpected end of regex exception"
+  rescue Dhaka::LexerSupport::InvalidRegexException => e
+    assert_equal("Unexpected end of regex.", e.message)
+  end
+  def test_DFA_raises_exception_if_error_parsing_regex
+    machine = Dhaka::LexerSupport::DFA.new("(a|b)*+abb")
+    flunk "Should have thrown an unexpected token exception"
+  rescue Dhaka::LexerSupport::InvalidRegexException => e
+    assert_equal("Unexpected token +: (a|b)*>>>+abb", e.message)
+  end
+  def test_match_a_regex
+    machine = Dhaka::LexerSupport::DFA.new("(a|b)*abb")
+    assert(machine.matches("abababb"))
+    assert(machine.matches("ababaabb"))
+    assert(!machine.matches("abababab"))
+    assert(!machine.matches("abababbc"))
+    assert(!machine.matches("abababbaa"))
+  end
+  def test_match_a_regex_with_optional_characters_at_the_end
+    machine = Dhaka::LexerSupport::DFA.new("bad(c|d)+(ab)*")
+    assert(machine.matches("badccddabab"))
+    assert(machine.matches("baddcc"))
+    assert(!machine.matches("badab"))
+    assert(!machine.matches("bacdab"))
+  end
+  def test_match_a_nullable_regex
+    machine = Dhaka::LexerSupport::DFA.new("(ab)*")
+    assert(machine.matches("abab"))
+    assert(machine.matches("ab"))
+    assert(machine.matches(""))
+    assert(!machine.matches("b"))
+  end
+  def test_match_a_regex_with_the_dot_character
+    machine = Dhaka::LexerSupport::DFA.new("ab.*cd")
+    assert(machine.matches("abacd"))
+    assert(machine.matches("abcd"))
+    assert(machine.matches("abAcd"))
+    assert(!machine.matches("ab999c"))
+  end
+  def test_match_a_regex_with_sets
+    machine = Dhaka::LexerSupport::DFA.new("ab[j-lu]*cd")
+    assert(!machine.matches("abacd"))
+    assert(machine.matches("abcd"))
+    assert(machine.matches("abjklucd"))
+    assert(!machine.matches("abijklucd"))
+    assert(!machine.matches("ab999c"))
+  end
+  def test_match_a_regex_with_negative_sets
+    machine = Dhaka::LexerSupport::DFA.new("ab[^j-lr]*cd")
+    assert(machine.matches("abcd"))
+    assert(!machine.matches("abjcd"))
+    assert(!machine.matches("abrcd"))
+    assert(!machine.matches("abijklucd"))
+    assert(machine.matches("abyqcd"))
+  end
+  def test_match_a_regex_with_sets_containing_escaped_characters
+    machine = Dhaka::LexerSupport::DFA.new("ab[\\^\\-.]*cd")
+    assert(machine.matches("abcd"))
+    assert(!machine.matches("abjcd"))
+    assert(machine.matches("ab^-.cd"))
+    assert(!machine.matches("abijklucd"))
+    assert(!machine.matches("ab\\cd"))
+  end
+  def test_match_a_regex_using_unescaped_caret_and_dash_characters
+    machine = Dhaka::LexerSupport::DFA.new("(\\^-)+")
+    assert(machine.matches("^-"))
+    assert(machine.matches("^-^-"))
+    assert(!machine.matches("?cd"))
+  end
+  def test_match_a_regex_using_escape_characters
+    machine = Dhaka::LexerSupport::DFA.new(%q/(-\?\(\)\\\\)*/)
+    assert(machine.matches("-?()\\"))
+  end
+  def test_match_a_regex_using_lt_and_gt
+    machine = Dhaka::LexerSupport::DFA.new('<.+>')
+    assert(machine.matches("<ab>"))
+    assert(machine.matches("<absdf><sdg><sse>"))
+    assert(!machine.matches("ab>"))
+  end
+  def test_simulating_curly_brace_quantifiers
+    machine = Dhaka::LexerSupport::DFA.new('aaa?a?a?')
+    assert(machine.matches("aa"))
+    assert(machine.matches("aaa"))
+    assert(machine.matches("aaaa"))
+    assert(machine.matches("aaaaa"))
+    assert(!machine.matches("aaaaaa"))
+    assert(!machine.matches("a"))
+  end
+  class LexerSpec < Dhaka::LexerSpecification
+    for_pattern 'zz' do
+      "recognized two zs"
+    end
+    for_pattern '\w(\w|\d)*' do
+      "recognized word token #{current_lexeme.value}"
+    end
+    for_pattern '(\d)*(\.\d+)?' do
+      "recognized number #{current_lexeme.value}"
+    end
+    for_pattern '<.*>' do
+      "recognized tag #{current_lexeme.value}"
+    end
+    for_pattern ' +' do
+      #ignores whitespace
+    end
+    for_pattern "\n+" do
+      "recognized newline"
+    end
+  end
+  def test_lexer_with_valid_input
+    lexer = Dhaka::Lexer.new(LexerSpec)
+    eval(lexer.compile_to_ruby_source_as(:SomeLexer))
+    input = "these are words a z zz caPITALIZED word
+    this is a float 12.00 an integer 134 a float without a leading digit .2335 another word1"
+    results =  SomeLexer.lex(input).collect
+    assert_equal(
+    ["recognized word token these",
+     "recognized word token are",
+     "recognized word token words",
+     "recognized word token a",
+     "recognized word token z",
+     "recognized two zs",
+     "recognized word token caPITALIZED",
+     "recognized word token word",
+     "recognized newline",
+     "recognized word token this",
+     "recognized word token is",
+     "recognized word token a",
+     "recognized word token float",
+     "recognized number 12.00",
+     "recognized word token an",
+     "recognized word token integer",
+     "recognized number 134",
+     "recognized word token a",
+     "recognized word token float",
+     "recognized word token without",
+     "recognized word token a",
+     "recognized word token leading",
+     "recognized word token digit",
+     "recognized number .2335",
+     "recognized word token another",
+     "recognized word token word1"], results[0..-2])
+  end
+  def test_lexer_with_invalid_input
+    lexer = Dhaka::Lexer.new(LexerSpec)
+    result = lexer.lex("this will cause an error here 123.").each do |result|
+    end
+    assert(result.has_error?)
+    assert_equal(34, result.unexpected_char_index)
+  end
+  def test_lexer_with_greedy_character_consumption
+    lexer = Dhaka::Lexer.new(LexerSpec)
+    results = lexer.lex("<html></html>this is a word").collect
+    assert_equal(["recognized tag <html></html>",
+     "recognized word token this",
+     "recognized word token is",
+     "recognized word token a",
+     "recognized word token word"], results[0..-2])
+  end
+end

data/test/parse_result_test.rb CHANGED

@@ -7,13 +7,13 @@ class TestParseSuccessResult < Test::Unit::TestCase
   def composite_node(production, child_nodes, dot_name)
     node = ParseTreeCompositeNode.new(SimpleGrammar.production_named(production))
     node.child_nodes.concat child_nodes
-    node.stubs(:dot_name).returns(dot_name)
+    node.stubs(:object_id).returns(dot_name)
     node
   end
   def leaf_node(token, value, dot_name)
     node = ParseTreeLeafNode.new(Token.new(token, value, nil))
-    node.stubs(:dot_name).returns(dot_name)
+    node.stubs(:object_id).returns(dot_name)
     node
   end
@@ -25,17 +25,17 @@ class TestParseSuccessResult < Test::Unit::TestCase
     result        = ParseSuccessResult.new(tree)
     assert_equal(
 %(digraph x {
-node [fontsize="10" shape=box size="5"]
+node [fontsize="10" shape="box" size="5"]
 expression [label="subtraction E ::= E - T"]
-expression -> first_term
+expression -> first_term
 first_term [label="literal T ::= n"]
-first_term -> literal_1
+first_term -> literal_1
 literal_1 [label="n : 1"]
-expression -> subtraction_operator
+expression -> subtraction_operator
 subtraction_operator [label="-"]
-expression -> second_term
+expression -> second_term
 second_term [label="literal T ::= n"]
-second_term -> literal_2
+second_term -> literal_2
 literal_2 [label="n : 2"]
 }),
     result.to_dot)