RubyGems - docdiff - Versions diffs - 0.6.5 → 0.6.6 - Mend

docdiff 0.6.5 → 0.6.6

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +4 -4
data/Gemfile +7 -7
data/Guardfile +4 -4
data/Makefile +1 -1
data/Rakefile +6 -6
data/bin/docdiff +1 -1
data/devutil/Rakefile +12 -5
data/devutil/char_by_charclass.rb +43 -20
data/devutil/charclass_by_char.rb +40 -19
data/devutil/jis0208.rb +263 -231
data/devutil/jis0208_test.rb +196 -0
data/doc/news.md +8 -0
data/docdiff.gemspec +12 -10
data/lib/doc_diff.rb +59 -60
data/lib/docdiff/charstring.rb +225 -241
data/lib/docdiff/cli.rb +285 -250
data/lib/docdiff/diff/contours.rb +1 -1
data/lib/docdiff/diff/editscript.rb +1 -1
data/lib/docdiff/diff/rcsdiff.rb +1 -1
data/lib/docdiff/diff/shortestpath.rb +1 -1
data/lib/docdiff/diff/speculative.rb +1 -1
data/lib/docdiff/diff/subsequence.rb +1 -1
data/lib/docdiff/diff/unidiff.rb +1 -1
data/lib/docdiff/diff.rb +1 -1
data/lib/docdiff/difference.rb +71 -70
data/lib/docdiff/document.rb +129 -109
data/lib/docdiff/encoding/en_ascii.rb +64 -58
data/lib/docdiff/encoding/ja_eucjp.rb +250 -235
data/lib/docdiff/encoding/ja_sjis.rb +240 -226
data/lib/docdiff/encoding/ja_utf8.rb +6952 -6939
data/lib/docdiff/version.rb +1 -1
data/lib/docdiff/view.rb +522 -438
data/lib/docdiff.rb +2 -2
data/test/charstring_test.rb +475 -351
data/test/cli_test.rb +103 -101
data/test/diff_test.rb +15 -16
data/test/difference_test.rb +40 -31
data/test/docdiff_test.rb +162 -136
data/test/document_test.rb +280 -175
data/test/test_helper.rb +2 -1
data/test/view_test.rb +636 -497
metadata +8 -8
data/devutil/testjis0208.rb +0 -38

data/lib/docdiff/charstring.rb CHANGED

@@ -4,300 +4,284 @@
 # 2003- Hisashi MORITA
 class DocDiff
-module CharString
-  Encodings = {}
-  EOLChars = {}  # End-of-line characters, such as CR, LF, CRLF.
-  def initialize(string)
-=begin unnecessary
-#    @encoding = CharString.guess_encoding(string)
-#    @eol     = CharString.guess_eol(string)
-=end unnecessary
-    super
-  end
+  module CharString
+    Encodings = {}
+    EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
+    def initialize(string)
+      # unnecessary
+      # @encoding = CharString.guess_encoding(string)
+      # @eol      = CharString.guess_eol(string)
+      super
+    end
-  def eol()
-    @eol
-#     if @eol
-#       @eol
-#     else
-#       @eol = CharString.guess_eol(self)
-#       # raise "eol is not set.\n"
-#     end
-  end
+    def eol
+      @eol
+      # if @eol
+      #   @eol
+      # else
+      #   @eol = CharString.guess_eol(self)
+      #   # raise "eol is not set.\n"
+      # end
+    end
-  def eol=(e)
-    @eol = e
-    extend EOLChars[@eol]
-  end
+    def eol=(e)
+      @eol = e
+      extend(EOLChars[@eol])
+    end
-  def eol_char()
-    if @eol_char
-      @eol_char
-    else
-      nil
-#       extend EOLChars[eol]
-#       eol_char
+    def eol_char
+      if @eol_char
+        @eol_char
+      else
+        nil
+        # extend EOLChars[eol]
+        # eol_char
+      end
     end
-  end
-  def debug()
-    case
-    when @encoding  == nil
-      raise "@encoding is nil."
-    when Encodings[@encoding] == nil
-      raise "Encodings[@encoding(=#{@encoding})] is nil."
-    when Encodings[@encoding].class != Module
-      raise "Encodings[@encoding].class(=#{Encodings[@encoding].class}) is not a module."
-    when @eol == nil
-      raise "@eol is nil."
-    when EOLChars[@eol] == nil
-      raise "EOLChars[@eol(=#{@eol})] is nil."
-    else
-      # should I do some alert?
-    end
-    ["id: #{self.id}, class: #{self.class}, self: #{self}, ",
-     "module: #{Encodings[@encoding]}, #{EOLChars[@eol]}"].join
-  end
+    def debug
+      if @encoding.nil?
+        raise "@encoding is nil."
+      elsif Encodings[@encoding].nil?
+        raise "Encodings[@encoding(=#{@encoding})] is nil."
+      elsif Encodings[@encoding].class != Module
+        raise "Encodings[@encoding].class(=#{Encodings[@encoding].class}) is not a module."
+      elsif @eol.nil?
+        raise "@eol is nil."
+      elsif EOLChars[@eol].nil?
+        raise "EOLChars[@eol(=#{@eol})] is nil."
+      else
+        # should I do some alert?
+      end
+      [
+        "id: #{id}, class: #{self.class}, self: #{self}, ",
+        "module: #{Encodings[@encoding]}, #{EOLChars[@eol]}",
+      ].join
+    end
-  def CharString.register_encoding(mod)
-    Encodings[mod::Encoding] = mod
-  end
+    class << self
+      def register_encoding(mod)
+        Encodings[mod::ENCODING] = mod
+      end
+      def register_eol(mod)
+        EOLChars[mod::EOL] = mod
+      end
+      def guess_eol(string)
+        # returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
+        # 'NONE'(1-line), or nil
+        return if string.nil? #=> nil (argument missing)
+        bin_string = string.dup.force_encoding("ASCII-8BIT")
+        eol_counts = {
+          "CR"   => bin_string.scan(/(\r)(?!\n)/o).size,
+          "LF"   => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
+          "CRLF" => bin_string.scan(/(\r\n)/o).size,
+        }
+        eol_counts.delete_if { |_eol, count| count == 0 } # Remove missing EOL
+        eols = eol_counts.keys
+        eol_variety = eols.size # numbers of flavors found
+        if eol_variety == 1     # Only one type of EOL found
+          eols[0]               #=> 'CR', 'LF', or 'CRLF'
+        elsif eol_variety == 0  # No EOL found
+          "NONE"                #=> 'NONE' (might be 1-line file)
+        else                    # Multiple types of EOL found
+          "UNKNOWN"             #=> 'UNKNOWN' (might be binary data)
+        end
+      end
+    end
-  def CharString.register_eol(mod)
-    EOLChars[mod::EOL] = mod
-  end
+    # Note that some languages (like Japanese) do not have 'word' or 'phrase',
+    # thus some of the following methods are not 'linguistically correct'.
-  def CharString.guess_eol(string)
-    # returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
-    # 'NONE'(1-line), or nil
-    return nil if string == nil  #=> nil (argument missing)
-    bin_string = string.dup.force_encoding("ASCII-8BIT")
-    eol_counts = {'CR'   => bin_string.scan(/(\r)(?!\n)/o).size,
-                  'LF'   => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
-                  'CRLF' => bin_string.scan(/(\r\n)/o).size}
-    eol_counts.delete_if{|eol, count| count == 0}  # Remove missing EOL
-    eols = eol_counts.keys
-    eol_variety = eols.size  # numbers of flavors found
-    if eol_variety == 1          # Only one type of EOL found
-      return eols[0]         #=> 'CR', 'LF', or 'CRLF'
-    elsif eol_variety == 0       # No EOL found
-      return 'NONE'              #=> 'NONE' (might be 1-line file)
-    else                         # Multiple types of EOL found
-      return 'UNKNOWN'           #=> 'UNKNOWN' (might be binary data)
+    def count_byte
+      split_to_byte.size
     end
-  end
-  # Note that some languages (like Japanese) do not have 'word' or 'phrase',
-  # thus some of the following methods are not 'linguistically correct'.
-  def count_byte()
-    split_to_byte().size
-  end
+    def count_char  # eol = 1 char
+      split_to_char.size
+    end
-  def count_char()  # eol = 1 char
-    split_to_char().size
-  end
+    def count_graph_char
+      count_latin_graph_char + count_ja_graph_char
+    end
-  def count_graph_char()
-    count_latin_graph_char() + count_ja_graph_char()
-  end
+    def count_blank_char
+      count_latin_blank_char + count_ja_blank_char
+    end
-  def count_blank_char()
-    count_latin_blank_char() + count_ja_blank_char()
-  end
+    def count_word
+      split_to_word.size
+    end
-  def count_word()
-    split_to_word().size
-  end
+    def count_valid_word
+      count_latin_valid_word + count_ja_valid_word
+    end
-  def count_valid_word()
-    count_latin_valid_word() + count_ja_valid_word()
-  end
+    def count_line  # this is common to all encodings.
+      split_to_line.size
+    end
-  def count_line()  # this is common to all encodings.
-    split_to_line.size
-  end
+    def count_empty_line
+      split_to_line.count { |line| /^(?:#{eol_char})|^$/m.match(line) }
+    end
-  def count_empty_line()
-    split_to_line.collect{|line|
-      line if /^(?:#{eol_char})|^$/m.match line
-    }.compact.size
-  end
+    # for Ruby-1.9
+    def encoding
+      String.new(self).encoding.to_s
+    end
-  # for Ruby-1.9
-  def encoding()
-    String.new(self).encoding.to_s
-  end
+    def encoding=(cs)
+      force_encoding(cs) if self
+    end
-  def encoding=(cs)
-    force_encoding(cs) if self
-  end
+    class << self
+      def guess_encoding(string)
+        if string
+          string.encoding.to_s
+        end
+      end
+    end
-  def CharString.guess_encoding(string)
-    if string
-      string.encoding.to_s
-    else
-      nil
+    def split_to_byte
+      encode("ASCII-8BIT").scan(/./nm)
     end
-  end
-  def split_to_byte()
-    encode("ASCII-8BIT").scan(/./nm)
-  end
+    def split_to_char
+      re =
+        if eol_char # sometimes string has no end-of-line char
+          Regexp.new("(?:#{eol_char})|(?:.)", Regexp::MULTILINE)
+        else        # it seems that no EOL module was extended...
+          Regexp.new("(?:.)", Regexp::MULTILINE)
+        end
+      encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
+    end
-  def split_to_char()
-    if eol_char  # sometimes string has no end-of-line char
-      encode('UTF-8').scan(Regexp.new("(?:#{eol_char})|(?:.)",
-                      Regexp::MULTILINE)
-      ).map{|e| e.encode(self.encoding)}
-    else                  # it seems that no EOL module was extended...
-      encode('UTF-8').scan(Regexp.new("(?:.)",
-                      Regexp::MULTILINE)
-      ).map{|e| e.encode(self.encoding)}
+    def count_latin_graph_char
+      re = Regexp.new("[#{Encodings["UTF-8"]::GRAPH}]", Regexp::MULTILINE)
+      encode("UTF-8").scan(re).size
     end
-  end
-  def count_latin_graph_char()
-    encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::GRAPH}]",
-                    Regexp::MULTILINE)
-    ).size
-  end
+    def count_ja_graph_char
+      re = Regexp.new("[#{Encodings["UTF-8"]::JA_GRAPH}]", Regexp::MULTILINE)
+      encode("UTF-8").scan(re).size
+    end
-  def count_ja_graph_char()
-    encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
-                    Regexp::MULTILINE)
-    ).size
-  end
+    def count_latin_blank_char
+      re = Regexp.new("[#{Encodings["UTF-8"]::BLANK}]", Regexp::MULTILINE)
+      encode("UTF-8").scan(re).size
+    end
-  def count_latin_blank_char()
-    encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::BLANK}]",
-                    Regexp::MULTILINE)
-    ).size
-  end
+    def count_ja_blank_char
+      re = Regexp.new("[#{Encodings["UTF-8"]::JA_BLANK}]", Regexp::MULTILINE)
+      encode("UTF-8").scan(re).size
+    end
-  def count_ja_blank_char()
-    encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]",
-                    Regexp::MULTILINE)
-    ).size
-  end
+    def split_to_word
+      re = Regexp.new(Encodings["UTF-8"]::WORD_REGEXP_SRC, Regexp::MULTILINE)
+      encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
+    end
-  def split_to_word()
-    encode('UTF-8').scan(Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC,
-                    Regexp::MULTILINE)
-    ).map{|e| e.encode(self.encoding)}
-  end
+    def count_latin_word
+      re = Regexp.new("[#{Encodings["UTF-8"]::PRINT}]", Regexp::MULTILINE)
+      split_to_word.count { |word| re.match(word.encode("UTF-8")) }
+    end
-  def count_latin_word()
-    split_to_word.collect{|word|
-      word if Regexp.new("[#{Encodings['UTF-8']::PRINT}]",
-                         Regexp::MULTILINE).match word.encode('UTF-8')
-    }.compact.size
-  end
+    def count_ja_word
+      re = Regexp.new("[#{Encodings["UTF-8"]::JA_PRINT}]", Regexp::MULTILINE)
+      split_to_word.count { |word| re.match(word.encode("UTF-8")) }
+    end
-  def count_ja_word()
-    split_to_word.collect{|word|
-      word if Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]",
-                         Regexp::MULTILINE).match word.encode('UTF-8')
-    }.compact.size
-  end
+    def count_latin_valid_word
+      re = Regexp.new("[#{Encodings["UTF-8"]::ALNUM}]", Regexp::MULTILINE)
+      split_to_word.count { |word| re.match(word.encode("UTF-8")) }
+    end
-  def count_latin_valid_word()
-    split_to_word.collect{|word|
-      word if Regexp.new("[#{Encodings['UTF-8']::ALNUM}]",
-                         Regexp::MULTILINE).match word.encode('UTF-8')
-    }.compact.size
-  end
+    def count_ja_valid_word
+      re = Regexp.new("[#{Encodings["UTF-8"]::JA_GRAPH}]", Regexp::MULTILINE)
+      split_to_word.count { |word| re.match(word.encode("UTF-8")) }
+    end
-  def count_ja_valid_word()
-    split_to_word.collect{|word|
-      word if Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
-                         Regexp::MULTILINE).match word.encode('UTF-8')
-    }.compact.size
-  end
+    def split_to_line
+      raise <<~EOS.chomp unless EOLChars[eol]
+        EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed.
+      EOS
+      re =
+        if defined? eol_char
+          Regexp.new(".*?#{eol_char}|.+", Regexp::MULTILINE)
+        else
+          Regexp.new(".+", Regexp::MULTILINE)
+        end
+      encode("UTF-8").scan(re).map { |e| e.encode(encoding) }
+    end
-  def split_to_line()
-    raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
-    if defined? eol_char
-      encode('UTF-8').scan(Regexp.new(".*?#{eol_char}|.+",
-                      Regexp::MULTILINE)
-      ).map{|e| e.encode(self.encoding)}
-    else
-      encode('UTF-8').scan(Regexp.new(".+",
-                      Regexp::MULTILINE)
-      ).map{|e| e.encode(self.encoding)}
+    def count_graph_line
+      graph = (Encodings["UTF-8"]::GRAPH + Encodings["UTF-8"]::JA_GRAPH).chars.uniq.join
+      re = Regexp.new("[#{Regexp.quote(graph)}]", Regexp::MULTILINE)
+      split_to_line.count { |line| re.match(line.encode("UTF-8")) }
     end
-  end
-  def count_graph_line()
-    graph = (Encodings['UTF-8']::GRAPH +
-             Encodings['UTF-8']::JA_GRAPH).chars.uniq.join
-    re_graph = Regexp.new("[#{Regexp.quote(graph)}]", Regexp::MULTILINE)
-    split_to_line.collect{|line|
-      line if re_graph.match line.encode('UTF-8')
-    }.compact.size
-  end
+    def count_blank_line
+      blank = (Encodings["UTF-8"]::BLANK + Encodings["UTF-8"]::JA_BLANK).chars.uniq.join
+      re = Regexp.new("^[#{blank}]+(?:#{eol_char})?", Regexp::MULTILINE)
+      split_to_line.count { |line| re.match(line.encode("UTF-8")) }
+    end
-  def count_blank_line()
-    split_to_line.collect{|line|
-      line if Regexp.new("^[#{Encodings['UTF-8']::BLANK}" +
-                         "#{Encodings['UTF-8']::JA_BLANK}]+(?:#{eol_char})?",
-                         Regexp::MULTILINE).match line.encode('UTF-8')
-    }.compact.size
-  end
+    # load encoding modules
+    require "docdiff/encoding/en_ascii"
+    require "docdiff/encoding/ja_eucjp"
+    require "docdiff/encoding/ja_sjis"
+    require "docdiff/encoding/ja_utf8"
+    alias_method :to_bytes, :split_to_byte
+    alias_method :to_chars, :split_to_char
+    alias_method :to_words, :split_to_word
+    alias_method :to_lines, :split_to_line
-  # load encoding modules
-  require 'docdiff/encoding/en_ascii'
-  require 'docdiff/encoding/ja_eucjp'
-  require 'docdiff/encoding/ja_sjis'
-  require 'docdiff/encoding/ja_utf8'
-  alias to_bytes split_to_byte
-  alias to_chars split_to_char
-  alias to_words split_to_word
-  alias to_lines split_to_line
+    module CR
+      EOL = "CR"
-  module CR
-    EOL = 'CR'
+      def eol_char
+        "\r"
+      end
-    def eol_char()
-      "\r"
+      CharString.register_eol(self)
     end
-    CharString.register_eol(self)
-  end
+    module LF
+      EOL = "LF"
-  module LF
-    EOL = 'LF'
+      def eol_char
+        "\n"
+      end
-    def eol_char()
-      "\n"
+      CharString.register_eol(self)
     end
-    CharString.register_eol(self)
-  end
+    module CRLF
+      EOL = "CRLF"
-  module CRLF
-    EOL = 'CRLF'
+      def eol_char
+        "\r\n"
+      end
-    def eol_char()
-      "\r\n"
+      CharString.register_eol(self)
     end
-    CharString.register_eol(self)
-  end
+    module NoEOL
+      EOL = "NONE"
-  module NoEOL
-    EOL = 'NONE'
-    def eol_char()
-      nil
-    end
+      def eol_char
+        nil
+      end
-    CharString.register_eol(self)
+      CharString.register_eol(self)
+    end
   end
-end  # module CharString
-end  # class DocDiff
+end
 # class String
 #   include CharString