RubyGems - rchardet - Versions diffs - 1.3.1 → 1.4.0 - Mend

rchardet 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

data/lib/rchardet.rb +1 -3
data/lib/rchardet/big5freq.rb +2 -2
data/lib/rchardet/big5prober.rb +2 -2
data/lib/rchardet/chardistribution.rb +74 -69
data/lib/rchardet/charsetgroupprober.rb +50 -52
data/lib/rchardet/charsetprober.rb +2 -7
data/lib/rchardet/codingstatemachine.rb +14 -13
data/lib/rchardet/constants.rb +0 -0
data/lib/rchardet/escprober.rb +34 -34
data/lib/rchardet/escsm.rb +33 -32
data/lib/rchardet/eucjpprober.rb +28 -28
data/lib/rchardet/euckrfreq.rb +2 -1
data/lib/rchardet/euckrprober.rb +2 -2
data/lib/rchardet/euctwfreq.rb +2 -1
data/lib/rchardet/euctwprober.rb +2 -2
data/lib/rchardet/gb2312freq.rb +2 -2
data/lib/rchardet/gb2312prober.rb +2 -2
data/lib/rchardet/hebrewprober.rb +40 -40
data/lib/rchardet/jisfreq.rb +2 -1
data/lib/rchardet/jpcntx.rb +131 -130
data/lib/rchardet/langbulgarianmodel.rb +6 -6
data/lib/rchardet/langcyrillicmodel.rb +13 -13
data/lib/rchardet/langgreekmodel.rb +5 -5
data/lib/rchardet/langhebrewmodel.rb +3 -3
data/lib/rchardet/langhungarianmodel.rb +5 -5
data/lib/rchardet/langthaimodel.rb +3 -3
data/lib/rchardet/latin1prober.rb +18 -18
data/lib/rchardet/mbcharsetprober.rb +30 -30
data/lib/rchardet/mbcsgroupprober.rb +9 -9
data/lib/rchardet/mbcssm.rb +72 -72
data/lib/rchardet/sbcharsetprober.rb +48 -50
data/lib/rchardet/sbcsgroupprober.rb +16 -16
data/lib/rchardet/sjisprober.rb +28 -28
data/lib/rchardet/universaldetector.rb +92 -90
data/lib/rchardet/utf8prober.rb +25 -25
data/lib/rchardet/version.rb +3 -0
metadata +30 -47
data/COPYING +0 -504
data/README +0 -12

data/lib/rchardet.rb CHANGED

@@ -15,8 +15,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
-$LOAD_PATH.unshift File.expand_path(File.dirname(__FILE__))
+require 'rchardet/version'
 require 'rchardet/charsetprober'
 require 'rchardet/mbcharsetprober'
@@ -56,7 +55,6 @@ require 'rchardet/universaldetector'
 require 'rchardet/utf8prober'
 module CharDet
-  VERSION = "1.3.1"
   def CharDet.detect(aBuf)
     u = UniversalDetector.new
     u.reset

data/lib/rchardet/big5freq.rb CHANGED

@@ -922,6 +922,6 @@ Big5CharToFreqOrder = [
 13920,13921,13922,13923,13924,13925,13926,13927,13928,13929,13930,13931,13932,13933,13934,13935, #13936
 13936,13937,13938,13939,13940,13941,13942,13943,13944,13945,13946,13947,13948,13949,13950,13951, #13952
 13952,13953,13954,13955,13956,13957,13958,13959,13960,13961,13962,13963,13964,13965,13966,13967, #13968
-13968,13969,13970,13971,13972] #13973
+13968,13969,13970,13971,13972 #13973
+].freeze
 end

data/lib/rchardet/big5prober.rb CHANGED

@@ -30,8 +30,8 @@ module CharDet
   class Big5Prober < MultiByteCharSetProber
     def initialize
       super
-      @_mCodingSM = CodingStateMachine.new(Big5SMModel)
-      @_mDistributionAnalyzer = Big5DistributionAnalysis.new()
+      @codingSM = CodingStateMachine.new(Big5SMModel)
+      @distributionAnalyzer = Big5DistributionAnalysis.new()
       reset()
     end

data/lib/rchardet/chardistribution.rb CHANGED

@@ -33,50 +33,50 @@ module CharDet
   class CharDistributionAnalysis
     def initialize
-      @_mCharToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
-      @_mTableSize = nil # Size of above table
-      @_mTypicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence.  See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
+      @charToFreqOrder = nil # Mapping table to get frequency order from char order (get from GetOrder())
+      @tableSize = nil # Size of above table
+      @typicalDistributionRatio = nil # This is a constant value which varies from language to language, used in calculating confidence.  See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
       reset()
     end
     def reset
       # # """reset analyser, clear any state"""
-      @_mDone = false # If this flag is set to constants.True, detection is done and conclusion has been made
-      @_mTotalChars = 0 # Total characters encountered
-      @_mFreqChars = 0 # The number of characters whose frequency order is less than 512
+      @done = false # If this flag is set to constants.True, detection is done and conclusion has been made
+      @totalChars = 0 # Total characters encountered
+      @freqChars = 0 # The number of characters whose frequency order is less than 512
     end
     def feed(aStr, aCharLen)
       # # """feed a character with known length"""
       if aCharLen == 2
-	# we only care about 2-bytes character in our distribution analysis
-	order = get_order(aStr)
+        # we only care about 2-bytes character in our distribution analysis
+        order = get_order(aStr)
       else
-	order = -1
+        order = -1
       end
       if order >= 0
-	@_mTotalChars += 1
-	# order is valid
-	if order < @_mTableSize
-	  if 512 > @_mCharToFreqOrder[order]
-	    @_mFreqChars += 1
-	  end
-	end
+        @totalChars += 1
+        # order is valid
+        if order < @tableSize
+          if 512 > @charToFreqOrder[order]
+            @freqChars += 1
+          end
+        end
       end
     end
     def get_confidence
       # """return confidence based on existing data"""
       # if we didn't receive any character in our consideration range, return negative answer
-      if @_mTotalChars <= 0
-	return SURE_NO
+      if @totalChars <= 0
+        return SURE_NO
       end
-      if @_mTotalChars != @_mFreqChars
-	r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
-	if r < SURE_YES
-	  return r
-	end
+      if @totalChars != @freqChars
+        r = @freqChars / ((@totalChars - @freqChars) * @typicalDistributionRatio)
+        if r < SURE_YES
+          return r
+        end
       end
       # normalize confidence (we don't want to be 100% sure)
@@ -86,7 +86,7 @@ module CharDet
     def got_enough_data
       # It is not necessary to receive all data to draw conclusion. For charset detection,
       # certain amount of data is enough
-      return @_mTotalChars > ENOUGH_DATA_THRESHOLD
+      return @totalChars > ENOUGH_DATA_THRESHOLD
     end
     def get_order(aStr)
@@ -100,9 +100,9 @@ module CharDet
   class EUCTWDistributionAnalysis < CharDistributionAnalysis
     def initialize
       super()
-      @_mCharToFreqOrder = EUCTWCharToFreqOrder
-      @_mTableSize = EUCTW_TABLE_SIZE
-      @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
+      @charToFreqOrder = EUCTWCharToFreqOrder
+      @tableSize = EUCTW_TABLE_SIZE
+      @typicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
     end
     def get_order(aStr)
@@ -110,10 +110,11 @@ module CharDet
       #   first  byte range: 0xc4 -- 0xfe
       #   second byte range: 0xa1 -- 0xfe
       # no validation needed here. State machine has done that
-      if aStr[0..0] >= "\xC4"
-	return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
+      if aStr[0, 1] >= "\xC4"
+        bytes = aStr.bytes
+        return 94 * (bytes[0] - 0xC4) + bytes[1] - 0xA1
       else
-	return -1
+        return -1
       end
     end
   end
@@ -121,9 +122,9 @@ module CharDet
   class EUCKRDistributionAnalysis < CharDistributionAnalysis
     def initialize
       super()
-      @_mCharToFreqOrder = EUCKRCharToFreqOrder
-      @_mTableSize = EUCKR_TABLE_SIZE
-      @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
+      @charToFreqOrder = EUCKRCharToFreqOrder
+      @tableSize = EUCKR_TABLE_SIZE
+      @typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
     end
     def get_order(aStr)
@@ -131,10 +132,11 @@ module CharDet
       #   first  byte range: 0xb0 -- 0xfe
       #   second byte range: 0xa1 -- 0xfe
       # no validation needed here. State machine has done that
-      if aStr[0..0] >= "\xB0"
-	return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
+      if aStr[0, 1] >= "\xB0"
+        bytes = aStr.bytes
+        return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
       else
-	return -1
+        return -1
       end
     end
   end
@@ -142,9 +144,9 @@ module CharDet
   class GB2312DistributionAnalysis < CharDistributionAnalysis
     def initialize
       super()
-      @_mCharToFreqOrder = GB2312CharToFreqOrder
-      @_mTableSize = GB2312_TABLE_SIZE
-      @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
+      @charToFreqOrder = GB2312CharToFreqOrder
+      @tableSize = GB2312_TABLE_SIZE
+      @typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
     end
     def get_order(aStr)
@@ -152,10 +154,11 @@ module CharDet
       #  first  byte range: 0xb0 -- 0xfe
       #  second byte range: 0xa1 -- 0xfe
       # no validation needed here. State machine has done that
-      if (aStr[0..0] >= "\xB0") and (aStr[1..1] >= "\xA1")
-	return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
+      if (aStr[0, 1] >= "\xB0") and (aStr[1, 1] >= "\xA1")
+        bytes = aStr.bytes
+        return 94 * (bytes[0] - 0xB0) + bytes[1] - 0xA1
       else
-	return -1
+        return -1
       end
     end
   end
@@ -163,9 +166,9 @@ module CharDet
   class Big5DistributionAnalysis < CharDistributionAnalysis
     def initialize
       super
-      @_mCharToFreqOrder = Big5CharToFreqOrder
-      @_mTableSize = BIG5_TABLE_SIZE
-      @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
+      @charToFreqOrder = Big5CharToFreqOrder
+      @tableSize = BIG5_TABLE_SIZE
+      @typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
     end
     def get_order(aStr)
@@ -173,14 +176,15 @@ module CharDet
       #   first  byte range: 0xa4 -- 0xfe
       #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
       # no validation needed here. State machine has done that
-      if aStr[0..0] >= "\xA4"
-	if aStr[1..1] >= "\xA1"
-	  return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
-	else
-	  return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
-	end
+      if aStr[0, 1] >= "\xA4"
+        bytes = aStr.bytes
+        if aStr[1, 1] >= "\xA1"
+          return 157 * (bytes[0] - 0xA4) + bytes[1] - 0xA1 + 63
+        else
+          return 157 * (bytes[0] - 0xA4) + bytes[1] - 0x40
+        end
       else
-	return -1
+        return -1
       end
     end
   end
@@ -188,9 +192,9 @@ module CharDet
   class SJISDistributionAnalysis < CharDistributionAnalysis
     def initialize
       super()
-      @_mCharToFreqOrder = JISCharToFreqOrder
-      @_mTableSize = JIS_TABLE_SIZE
-      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
+      @charToFreqOrder = JISCharToFreqOrder
+      @tableSize = JIS_TABLE_SIZE
+      @typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
     end
     def get_order(aStr)
@@ -198,17 +202,17 @@ module CharDet
       #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
       #   second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
       # no validation needed here. State machine has done that
-      aStr = aStr[0..1].join if aStr.class == Array
-      if (aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")
-	order = 188 * (aStr[0] - 0x81)
-      elsif (aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xEF")
-	order = 188 * (aStr[0] - 0xE0 + 31)
+      bytes = aStr.bytes
+      if (aStr[0, 1] >= "\x81") and (aStr[0, 1] <= "\x9F")
+        order = 188 * (bytes[0] - 0x81)
+      elsif (aStr[0, 1] >= "\xE0") and (aStr[0, 1] <= "\xEF")
+        order = 188 * (bytes[0] - 0xE0 + 31)
       else
-	return -1
+        return -1
       end
-      order = order + aStr[1] - 0x40
-      if aStr[1..1] > "\x7F"
-	order =- 1
+      order = order + bytes[1] - 0x40
+      if aStr[1, 1] > "\x7F"
+        order =- 1
       end
       return order
     end
@@ -217,9 +221,9 @@ module CharDet
   class EUCJPDistributionAnalysis < CharDistributionAnalysis
     def initialize
       super()
-      @_mCharToFreqOrder = JISCharToFreqOrder
-      @_mTableSize = JIS_TABLE_SIZE
-      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
+      @charToFreqOrder = JISCharToFreqOrder
+      @tableSize = JIS_TABLE_SIZE
+      @typicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
     end
     def get_order(aStr)
@@ -227,8 +231,9 @@ module CharDet
       #   first  byte range: 0xa0 -- 0xfe
       #   second byte range: 0xa1 -- 0xfe
       # no validation needed here. State machine has done that
-      if aStr[0..0] >= "\xA0"
-        return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
+      if aStr[0, 1] >= "\xA0"
+        bytes = aStr.bytes
+        return 94 * (bytes[0] - 0xA1) + bytes[1] - 0xa1
       else
         return -1
       end

data/lib/rchardet/charsetgroupprober.rb CHANGED

@@ -28,54 +28,55 @@
 module CharDet
   class CharSetGroupProber < CharSetProber
-    attr_accessor :_mProbers
+    attr_accessor :probers
     def initialize
       super
-      @_mActiveNum = 0
-      @_mProbers = []
-      @_mBestGuessProber = nil
+      @activeNum = 0
+      @probers = []
+      @bestGuessProber = nil
     end
     def reset
       super
-      @_mActiveNum = 0
+      @activeNum = 0
-      for prober in @_mProbers
-	if prober
-	  prober.reset()
-	  prober.active = true
-	  @_mActiveNum += 1
-	end
+      for prober in @probers
+        if prober
+          prober.reset()
+          prober.active = true
+          @activeNum += 1
+        end
       end
-      @_mBestGuessProber = nil
+      @bestGuessProber = nil
     end
     def get_charset_name
-      if not @_mBestGuessProber
-	get_confidence()
-	return nil unless @_mBestGuessProber
-	#                self._mBestGuessProber = self._mProbers[0]
+      if !@bestGuessProber
+        get_confidence()
+        if !@bestGuessProber
+          return nil
+        end
       end
-      return @_mBestGuessProber.get_charset_name()
+      return @bestGuessProber.get_charset_name()
     end
     def feed(aBuf)
-      for prober in @_mProbers
-	next unless prober
-	next unless prober.active
-	st = prober.feed(aBuf)
-	next unless st
-	if st == EFoundIt
-	  @_mBestGuessProber = prober
-	  return get_state()
-	elsif st == ENotMe
-	  prober.active = false
-	  @_mActiveNum -= 1
-	  if @_mActiveNum <= 0
-	    @_mState = ENotMe
-	    return get_state()
-	  end
-	end
+      for prober in @probers
+        next unless prober
+        next unless prober.active
+        st = prober.feed(aBuf)
+        next unless st
+        if st == EFoundIt
+          @bestGuessProber = prober
+          return get_state()
+        elsif st == ENotMe
+          prober.active = false
+          @activeNum -= 1
+          if @activeNum <= 0
+            @state = ENotMe
+            return get_state()
+          end
+        end
       end
       return get_state()
     end
@@ -83,30 +84,27 @@ module CharDet
     def get_confidence()
       st = get_state()
       if st == EFoundIt
-	return 0.99
+        return 0.99
       elsif st == ENotMe
-	return 0.01
+        return 0.01
       end
       bestConf = 0.0
-      @_mBestGuessProber = nil
-      for prober in @_mProbers
-	next unless prober
-	unless prober.active
-	  $stderr << "#{prober.get_charset_name()} not active\n" if $debug
-	  next
-	end
-	cf = prober.get_confidence()
-	$stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
-	if bestConf < cf
-	  bestConf = cf
-	  @_mBestGuessProber = prober
-	end
+      @bestGuessProber = nil
+      for prober in @probers
+        next unless prober
+        unless prober.active
+          $stderr << "#{prober.get_charset_name()} not active\n" if $debug
+          next
+        end
+        cf = prober.get_confidence()
+        $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
+        if bestConf < cf
+          bestConf = cf
+          @bestGuessProber = prober
+        end
       end
-      return 0.0 unless @_mBestGuessProber
+      return 0.0 unless @bestGuessProber
       return bestConf
-      #        else:
-      #            self._mBestGuessProber = self._mProbers[0]
-      #            return self._mBestGuessProber.get_confidence()
     end
   end
 end

data/lib/rchardet/charsetprober.rb CHANGED

@@ -34,7 +34,7 @@ module CharDet
     end
     def reset
-      @_mState = EDetecting
+      @state = EDetecting
     end
     def get_charset_name
@@ -45,7 +45,7 @@ module CharDet
     end
     def get_state
-      return @_mState
+      return @state
     end
     def get_confidence
@@ -53,11 +53,6 @@ module CharDet
     end
     def filter_high_bit_only(aBuf)
-      # DO NOT USE `gsub!`
-      # It will remove all characters from the buffer that is later used by
-      # other probers.  This is because gsub! removes data from the instance variable
-      # that will be passed to later probers, while gsub makes a new instance variable
-      # that will not.
       newBuf = aBuf.gsub(/([\x00-\x7F])+/, ' ')
       return newBuf
     end