RubyGems - rbpar - Versions diffs - 0.1.0 - Mend

rbpar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/README +58 -0
data/bin/rbpar.rb +94 -0
data/lib/rbpar_engine.rb +196 -0
data/lib/rbpar_main.rb +68 -0
data/lib/rbpar_paragraph.rb +344 -0
data/test/rbpar_engine_test.rb +152 -0
data/test/rbpar_main_test.rb +212 -0
data/test/rbpar_paragraph_test.rb +305 -0
data/test/rbpar_test.rb +11 -0
metadata +54 -0

data/lib/rbpar_paragraph.rb ADDED Viewed

@@ -0,0 +1,344 @@
+#!/usr/bin/ruby
+# Paragraph is the main building block for the paragraph processing
+# library. The paragraph keeps information about its prefix and can
+# split itself to subparagraphs and break its lines to optimal line
+# widths.
+#
+# Copyright Ismo Puustinen 2007.
+require 'pp'
+class Paragraph < Array
+    QUOTE_REGEXP_WITH_WHITESPACE = /\A(\s*>)+\s*/
+    QUOTE_REGEXP = /\A(\s*>)+/
+    attr_accessor :quote_prefix
+    def initialize(array = [])
+        super(array)
+        @quote_prefix = ""
+        @paragraphs = nil
+        @whitespace = nil
+    end
+    def prefix_library(original)
+        new_prefix = @prefix_db[original]
+        if new_prefix.nil?
+            return original
+        else
+            return new_prefix
+        end
+    end
+    def get_paragraphs
+        if @paragraphs.nil?
+            @paragraphs = Array.new()
+        end
+        # start the recursion, results are collected to @paragraphs
+        tmp_paragraphs = get_sub_paragraphs(self)
+        @paragraphs
+    end
+    def get_sub_paragraphs(paragraph)
+        # return the list of the sub-paragraphs
+        paragraph.find_quote_prefix()
+        paragraphs = paragraph.split_to_quoted()
+        # end recursion
+        if paragraphs.length == 1
+            # this is a depth-first search, thus we are processing the
+            # paragraphs in order.
+            @paragraphs << paragraph
+            return [paragraph]
+        end
+        paragraphs.each_with_index do |new_paragraph, i|
+            # new_paragraph.find_quote_prefix
+            paragraphs[i] = get_sub_paragraphs(new_paragraph)
+        end
+        return paragraphs
+    end
+    def find_quote_prefix()
+        longest_match = ""
+        first = true
+        self.each do |line|
+            found = line.match(QUOTE_REGEXP_WITH_WHITESPACE)
+            @quote_prefix = "" ; @whitespace = 0 ; return "" unless found # no change
+            if first
+                # initialize
+                longest_match = $&
+                first = false
+            end
+            if $&.length < longest_match.length
+                longest_match = $&
+            end
+        end
+        # puts "quote_prefix: '" + longest_match + "'"
+        # pp self
+        @quote_prefix = longest_match.rstrip
+        @whitespace = longest_match.size - @quote_prefix.size
+        # pp @quote_prefix
+        @quote_prefix
+    end
+    def quoted?(line)
+        return false unless line.length > @quote_prefix.length
+        # puts "line for comparison: '" + line[@quote_prefix.length..-1] + "'"
+        # puts "match: '" + (line[@quote_prefix.length..-1].match(QUOTE_REGEXP)).to_s + "'"
+        return !line[@quote_prefix.length..-1].match(QUOTE_REGEXP).nil?
+    end
+    def do_process?
+        # decide if this paragraph needs to be processed
+        #
+        # This is a problem case: we have found a prefix that is
+        # longer than the maximum width of the paragraph. For
+        # example, when maximum width is 5:
+        #
+        # >>>> This is text
+        # >>>> And so is this
+        #
+        # If we cannot do solve the problem, let's return the data
+        # as it was.
+        # FIXME: compare agains the new prefix
+        # when @width <= @quote_prefix.length: false
+        true
+    end
+    def do_process_without_quotes?
+        # decide if this (now unquoted) paragraph needs to be processed
+        # * do not process mail signatures
+        # * do not process code
+        return case
+            # The line marks the beginning of an email signature: do not
+            # process
+            # when self[0] == "-- " || self[0] == "-- \n": false
+            when self[0] =~ /\ *-- \n?/: false
+            else true
+        end
+    end
+    def change_quote()
+        return @quote_prefix.strip.squeeze(" ") + " " unless @quote_prefix.empty?
+        return ""
+    end
+    def process!(line_breaker, width)
+        @width = width
+        error = false
+        # pp self
+        # this is the time and place to change the quote_prefix to a better one
+        new_quote_prefix = change_quote()
+        # some sanity checks
+        # see if we want to process this at all
+        error = true unless do_process?
+        # remove the original prefixes from the lines
+        self.each do |line|
+            line.slice!(0...@quote_prefix.length)
+        end
+        # pp self
+        # see if we want to process the unquoted text
+        error = true unless !error and do_process_without_quotes?
+        # error case: just change the prefixes, otherwise do not do
+        # anything
+        if error
+            self.each do |line|
+                line.chomp!
+                line.slice!(0...@whitespace) # take away the extra whitespace after the quote_prefix
+                line.insert(0, new_quote_prefix)
+            end
+            return self
+        end
+        # break the lines to a word array
+        words = self.inject([]) do |value, line|
+            value += line.chomp.split(' ')
+        end
+        # there should be something over the prefix anyway...
+        width = new_quote_prefix.length + 1 unless (width - new_quote_prefix.length) > 0
+        # find linebreaks and recreate the prefix: the real thing!
+        self[0..-1] = line_breaker.parse(words, width - new_quote_prefix.length).collect do |line|
+            # pp line
+            line.insert(0, new_quote_prefix)
+        end
+        if self.empty? and !@quote_prefix.empty?
+            # If there were no result words, the line might be only a
+            # quoted empty line. By default, we want to save it.
+            # However, if there is no quote prefix, this is a paragraph
+            # break, and they are handled elsewhere. TODO: might be
+            # better to also handle this case here.
+            self[0..-1] = new_quote_prefix
+        end
+        return self
+    end
+    def split_to_quoted()
+        results = Array.new
+        tmp_paragraph = Paragraph.new()
+        previous_quoted = false
+        first = true
+        if @whitespace.nil? # not initialized yet!
+            self.find_quote_prefix()
+        end
+        # puts "split (quote '" + @quote_prefix + "'):"
+        # pp self
+        return self if self.length == 1
+        self.each do |line|
+            # check if the line is empty after removing the prefix
+            # pp "line: '" + line + "', quote: '" + @quote_prefix + "'"
+            if (line.strip == @quote_prefix.strip)
+                # puts "empty line with only the quote prefix!"
+                # end the paragraph
+                results << tmp_paragraph unless tmp_paragraph.empty?
+                # add the empty line as a new paragraph
+                empty_paragraph = Paragraph.new()
+                empty_paragraph << line
+                results << empty_paragraph
+                # start a new paragraph
+                tmp_paragraph = Paragraph.new()
+                # reset the quoting status
+                first = true
+                next # this line is now handled properly, go get the next one
+            end
+            if first
+                # note that we need to call the paragraph's own quoted?
+                previous_quoted = self.quoted?(line)
+                # puts "quoting state initialized to " + previous_quoted.to_s
+                # puts "prefix: " + paragraph.quote_prefix
+                first = false
+            end
+            if (previous_quoted and self.quoted?(line)) or
+               (!previous_quoted and !self.quoted?(line))
+                # add to tmp_paragraph if the quoting status status is the
+                # same
+                tmp_paragraph << line
+                # puts "added: " +line
+            else
+                # one quoting block is now at end
+                results << tmp_paragraph
+                # initialize with the old prefix
+                tmp_paragraph = Paragraph.new()
+                tmp_paragraph << line
+                previous_quoted = !previous_quoted
+                # puts "quoting state changed to " + previous_quoted.to_s
+                # puts "added: " +line
+            end
+        end
+        # also get the last paragraph block
+        results << tmp_paragraph unless tmp_paragraph.empty?
+        # pp results
+        return results
+    end
+    def find_prefix(suspicious_level)
+        suspicious = 0
+        # no use finding prefix if we only have one line
+        return "", "" if self.length <= 1
+        # only two lines... let's be careful now
+        suspicious += 1 if self.length == 2
+        # find the prefix (incredible find from the web)
+        min, max = self.sort.values_at(0, -1)
+        prefix = (min+max).match(/\A(.*).*(?=.{#{max.length}}\z)\1/m)[1]
+        old_prefix = prefix
+        # puts "prefix: '" + prefix + "'"
+        # is there a prefix?
+        return "", "" if prefix.nil? or prefix.empty?
+        # see if the prefix is only whitespace indent
+        return prefix, old_prefix if /\A\s*\Z/.match(prefix)
+        # Remove whitespace from right. It's also suspicious if there
+        # isn't any...
+        suspicious += 1 unless prefix.rstrip!
+        # see if the prefix library already has a better alternative for
+        # this prefix
+        newprefix = prefix_library(prefix.lstrip)
+        return newprefix, old_prefix unless newprefix.nil?
+        # our prefix wasn't in the library... this could be a mismatch
+        suspicious += 1
+        # only letters in the prefix
+        suspicious += 1 if /[a-zA-Z]/.match(prefix) && prefix.length < 3
+        # see if we dare to return the suspected prefix
+        return prefix + " ", old_prefix if suspicious < suspicious_level
+        # assume that there is actually no real prefix
+        return "", ""
+    end
+end

data/test/rbpar_engine_test.rb ADDED Viewed

@@ -0,0 +1,152 @@
+#!/usr/bin/ruby
+require 'test/unit'
+require 'rbpar_engine'
+class TestRbParEngine < Test::Unit::TestCase
+    def setup
+        @breaker = DynamicBreaker.new()
+    end
+    def test_char_length
+        words = "This is a test"
+        # FIXME: index should go from 0 to 3
+        assert_equal(words.length, @breaker.charLength(words.split(" "), 0, 4))
+        # Do it twice to see if the caching doesn't corrupt anything
+        assert_equal(words.length, @breaker.charLength(words.split(" "), 0, 4))
+    end
+    def test_real_char_length
+        words = "This is a test"
+        # FIXME: index should go from 0 to 3
+        assert_equal(words.length, @breaker.realCharLength(words.split(" "), 0, 4))
+    end
+    def test_parse
+        words = "Sed eget ligula. Nunc fringilla. In ullamcorper turpis quis tortor. Maecenas fringilla dui aliquet leo. Nulla nec mi ut mauris ultrices sollicitudin. Mauris feugiat ornare massa. Ut vitae dolor sed urna blandit imperdiet. Cras tempus, orci sollicitudin pulvinar ultricies, sapien urna fringilla risus, eu rhoncus metus nisi a risus. Aliquam erat volutpat. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Duis iaculis lorem sit amet neque. Cras quis tellus. Ut molestie eros sit amet nibh blandit luctus. Quisque ac sem. Nulla condimentum eros rhoncus ipsum. Duis enim. Phasellus mattis posuere augue."
+        # let's try parsing with several values
+        # 1. desired width 45
+        resultlines = @breaker.parse(words.split(" "), 45)
+        assert_not_nil(resultlines)
+        # see that no resulting lines are longer than the maximum length
+        resultlines.each do |line|
+            assert(line.length <= 45)
+        end
+        # see that we aren't missing any words or that new words haven't
+        # crept in
+        newwords = resultlines.inject("") do |value, line|
+            if value == ""
+                value = line
+            else
+                value = value + " " + line
+            end
+        end
+        assert_equal(words, newwords)
+        # 2. desired width 25
+        resultlines = @breaker.parse(words.split(" "), 25)
+        assert_not_nil(resultlines)
+        # see that no resulting lines are longer than the maximum length
+        resultlines.each do |line|
+            assert(line.length <= 25, "line length was " + line.length.to_s)
+        end
+        # see that we aren't missing any words or that new words haven't
+        # crept in
+        newwords = resultlines.inject("") do |value, line|
+            if value == ""
+                value = line
+            else
+                value = value + " " + line
+            end
+        end
+        assert_equal(words, newwords)
+        # 3. desired width 5. Now there are words that are longer than
+        # 5, and that's okay. Still, there must not be lines consisting
+        # of several words that are over 5 characters long.
+        resultlines = @breaker.parse(words.split(" "), 5)
+        assert_not_nil(resultlines)
+        # see that no resulting lines are longer than the maximum length
+        resultlines.each do |line|
+            # puts line.length
+            if (line.length > 5)
+                assert_no_match(/ /, line)
+            end
+        end
+        # see that we aren't missing any words or that new words haven't
+        # crept in
+        newwords = resultlines.inject("") do |value, line|
+            if value == ""
+                value = line
+            else
+                value = value + " " + line
+            end
+        end
+        assert_equal(words, newwords)
+        # 4. Test from Wikipedia
+        # (http://en.wikipedia.org/wiki/Word_wrap)
+        words2 = "aaa bb cc ddddd"
+        # line width is 6
+        #
+        # greedy algorithm:
+        #
+        # aaa bb
+        # cc
+        # ddddd
+        #
+        # optimal algorithm:
+        #
+        # aaa
+        # bb cc
+        # ddddd
+        resultlines = @breaker.parse(words2.split(" "), 6)
+        # resultlines.each do |line|
+        #     puts line
+        # end
+        assert_equal("aaa", resultlines[0])
+        assert_equal("bb cc", resultlines[1])
+        assert_equal("ddddd", resultlines[2])
+        # 5. Test only one line
+        words3 = "laama fdsf fds f eee ddd lll eee linna lumi lunni"
+        resultlines = @breaker.parse(words3.split(" "), 63)
+        assert_equal("laama fdsf fds f eee ddd lll eee linna lumi lunni", resultlines[0])
+        # 6. Test only one word
+        resultlines = @breaker.parse(["test"], 63)
+        assert_equal("test", resultlines[0])
+        # 6. Test only one word (longer than allowed)
+        resultlines = @breaker.parse(["testingtesting"], 10)
+        assert_equal("testingtesting", resultlines[0])
+        # 7. Test only two words
+        resultlines = @breaker.parse(["testing1", "testing2"], 10)
+        assert_equal("testing1", resultlines[0])
+        assert_equal("testing2", resultlines[1])
+    end
+end

data/test/rbpar_main_test.rb ADDED Viewed

@@ -0,0 +1,212 @@
+#!/usr/bin/ruby
+require 'test/unit'
+require 'rbpar_main'
+require 'pp'
+class TestRbParMain < Test::Unit::TestCase
+    def test_lines
+        rbpar1 = RbParIterator.new(63)
+        lines1 = ["\n"]
+        rbpar1 << lines1
+        lines1 = rbpar1.collect
+        assert_equal("", lines1[0][0])
+        assert_nil(lines1[0][1])
+        assert_nil(lines1[1])
+        rbpar2 = RbParIterator.new(63)
+        lines2 = ["\n\n"]
+        rbpar2 << lines2
+        lines2 = rbpar2.collect
+        assert_equal("", lines2[0][0])
+        assert_equal("", lines2[0][1])
+        assert_nil(lines2[1])
+    end
+    def test_paragraphs
+        # very basic one line test
+        rbpar1 = RbParIterator.new(63)
+        lines1 = ["lunni\n", "lumi\n", "lumikki\n", "luoto\n"]
+        rbpar1 << lines1
+        lines1 = rbpar1.collect
+        assert_equal("lunni lumi lumikki luoto", lines1[0][0])
+        # very basic one line test
+        rbpar2 = RbParIterator.new(10)
+        lines2 = ["This is a test of splitting the lines to two lines\n"]
+        rbpar2 << lines2
+        lines2 = rbpar2.collect
+        # check the line lengths
+        lines2.each do |paragraph|
+            paragraph.each do |line|
+                assert(line.length <= 10, "line length was " + line.length.to_s)
+            end
+        end
+        # for regression testing: the results should be like this
+        assert_equal("This is", lines2[0][0])
+        assert_equal("a test of", lines2[0][1])
+        assert_equal("splitting", lines2[0][2])
+        assert_equal("the lines", lines2[0][3])
+        assert_equal("to two", lines2[0][4])
+        assert_equal("lines", lines2[0][5])
+        # test with an actual email
+        rbpar3 = RbParIterator.new(50)
+        lines3 = [
+            "On 12/1/07, Lorem Ipsum wrote:\n",
+            "\n",
+            "> Lorem ipsum dolor sit amet, consectetuer adipiscing elit.\n",
+            "> Praesent viverra magna sed urna.\n",
+            "> \n",
+            "> On 11/29/07, The previous Lorem Ipsum wrote:\n",
+            "> > Vivamus imperdiet purus eget velit. Pellentesque vehicula\n",
+            "> > gravida justo. In facilisis. Curabitur at tortor eu ante\n",
+            "> > mattis pulvinar. Cras lobortis, augue malesuada accumsan\n",
+            "> > vehicula, erat tortor ultricies nulla, ac interdum odio tortor.\n",
+            "\n",
+            "Phasellus faucibus porta nunc.\n",
+            "Vestibulum vel lectus.\n",
+            "Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos hymenaeos."
+        ]
+        rbpar3 << lines3
+        lines3 = rbpar3.collect
+        assert_equal("On 12/1/07, Lorem Ipsum wrote:", lines3[0][0])
+        # Paragraph break is part of the lines list: this is so that the
+        # empty lines would not be counted twice as paragraph breaks
+        # anymore, but we still would get two '\n' characters after a
+        # line break in the data.
+        assert_equal("", lines3[0][1])
+        assert_equal("> Lorem ipsum dolor sit amet, consectetuer", lines3[1][0])
+        assert_equal("> adipiscing elit. Praesent viverra magna sed", lines3[1][1])
+        assert_equal("> urna.", lines3[1][2])
+        assert_equal("> ", lines3[1][3])
+        # Note that these are now part of the same paragraph as the
+        # previous lines. This is because we don't want an empty line
+        # between the two paragraphs.
+        assert_equal("> On 11/29/07, The previous Lorem Ipsum wrote:", lines3[1][4])
+        assert_equal("> > Vivamus imperdiet purus eget velit.", lines3[1][5])
+        assert_equal("> > Pellentesque vehicula gravida justo. In", lines3[1][6])
+        assert_equal("> > facilisis. Curabitur at tortor eu ante mattis", lines3[1][7])
+        assert_equal("> > pulvinar. Cras lobortis, augue malesuada", lines3[1][8])
+        assert_equal("> > accumsan vehicula, erat tortor ultricies", lines3[1][9])
+        assert_equal("> > nulla, ac interdum odio tortor.", lines3[1][10])
+        assert_equal("", lines3[1][11])
+        assert_equal("Phasellus faucibus porta nunc. Vestibulum", lines3[2][0])
+        assert_equal("vel lectus. Class aptent taciti sociosqu ad", lines3[2][1])
+        assert_equal("litora torquent per conubia nostra, per inceptos", lines3[2][2])
+        assert_equal("hymenaeos.", lines3[2][3])
+        rbpar4 = RbParIterator.new(6)
+        lines4 = [ "aaa bb cc ddddd" ]
+        rbpar4 << lines4
+        lines4 = rbpar4.collect
+        assert_equal("aaa", lines4[0][0])
+        assert_equal("bb cc", lines4[0][1])
+        assert_equal("ddddd", lines4[0][2])
+        # Check the handling of empty quoted lines.
+        # if this happens:
+        #
+        # > text text
+        # >
+        # > text
+        #
+        # the paragraph break should be respected no matter how much or
+        # how little whitespace is on the middle line.
+        rbpar5 = RbParIterator.new(15)
+        lines5 = [ "> text text\n", ">\n", "> text\n" ]
+        rbpar5 << lines5
+        lines5 = rbpar5.collect
+        assert_equal("> text text", lines5[0][0])
+        assert_equal("> ", lines5[0][1])
+        assert_equal("> text", lines5[0][2])
+        rbpar6 = RbParIterator.new(15)
+        lines6 = [ "> text text\n", ">       \n", "> text\n" ]
+        rbpar6 << lines6
+        lines6 = rbpar6.collect
+        assert_equal("> text text", lines6[0][0])
+        assert_equal("> ", lines6[0][1])
+        assert_equal("> text", lines6[0][2])
+        rbpar7 = RbParIterator.new(15)
+        lines7 = [ "> text text\n", "\n", "> text\n" ]
+        rbpar7 << lines7
+        lines7 = rbpar7.collect
+        assert_equal("> text text", lines7[0][0])
+        assert_equal("", lines7[0][1])
+        assert_equal("> text", lines7[1][0])
+    end
+    def test_signature
+        # test that signature paragraphs behave correctly
+        # TODO: refactor to rbpar_paragraph_test?
+        lines_sig_1 = ["-- \n", "    - NanoNano\n"]
+        rbpar_sig_1 = RbParIterator.new(68)
+        rbpar_sig_1 << lines_sig_1
+        lines_sig_1 = rbpar_sig_1.collect
+        assert_equal("-- ", lines_sig_1[0][0])
+        assert_equal("    - NanoNano", lines_sig_1[0][1])
+        # pp lines_sig_1
+        lines_sig_2 = [">-- \n", ">    - NanoNano\n"]
+        rbpar_sig_2 = RbParIterator.new(68)
+        rbpar_sig_2 << lines_sig_2
+        lines_sig_2 = rbpar_sig_2.collect
+        assert_equal("> -- ", lines_sig_2[0][0])
+        assert_equal(">     - NanoNano", lines_sig_2[0][1])
+        # pp lines_sig_2
+        lines_sig_3 = ["> -- \n", ">     - NanoNano\n"]
+        rbpar_sig_3 = RbParIterator.new(68)
+        rbpar_sig_3 << lines_sig_3
+        lines_sig_3 = rbpar_sig_3.collect
+        assert_equal("> -- ", lines_sig_3[0][0])
+        assert_equal(">     - NanoNano", lines_sig_3[0][1])
+        # pp lines_sig_3
+    end
+end