RubyGems - rchardet - Versions diffs - 1.3.1 → 1.4.0 - Mend

rchardet 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

data/lib/rchardet.rb +1 -3
data/lib/rchardet/big5freq.rb +2 -2
data/lib/rchardet/big5prober.rb +2 -2
data/lib/rchardet/chardistribution.rb +74 -69
data/lib/rchardet/charsetgroupprober.rb +50 -52
data/lib/rchardet/charsetprober.rb +2 -7
data/lib/rchardet/codingstatemachine.rb +14 -13
data/lib/rchardet/constants.rb +0 -0
data/lib/rchardet/escprober.rb +34 -34
data/lib/rchardet/escsm.rb +33 -32
data/lib/rchardet/eucjpprober.rb +28 -28
data/lib/rchardet/euckrfreq.rb +2 -1
data/lib/rchardet/euckrprober.rb +2 -2
data/lib/rchardet/euctwfreq.rb +2 -1
data/lib/rchardet/euctwprober.rb +2 -2
data/lib/rchardet/gb2312freq.rb +2 -2
data/lib/rchardet/gb2312prober.rb +2 -2
data/lib/rchardet/hebrewprober.rb +40 -40
data/lib/rchardet/jisfreq.rb +2 -1
data/lib/rchardet/jpcntx.rb +131 -130
data/lib/rchardet/langbulgarianmodel.rb +6 -6
data/lib/rchardet/langcyrillicmodel.rb +13 -13
data/lib/rchardet/langgreekmodel.rb +5 -5
data/lib/rchardet/langhebrewmodel.rb +3 -3
data/lib/rchardet/langhungarianmodel.rb +5 -5
data/lib/rchardet/langthaimodel.rb +3 -3
data/lib/rchardet/latin1prober.rb +18 -18
data/lib/rchardet/mbcharsetprober.rb +30 -30
data/lib/rchardet/mbcsgroupprober.rb +9 -9
data/lib/rchardet/mbcssm.rb +72 -72
data/lib/rchardet/sbcharsetprober.rb +48 -50
data/lib/rchardet/sbcsgroupprober.rb +16 -16
data/lib/rchardet/sjisprober.rb +28 -28
data/lib/rchardet/universaldetector.rb +92 -90
data/lib/rchardet/utf8prober.rb +25 -25
data/lib/rchardet/version.rb +3 -0
metadata +30 -47
data/COPYING +0 -504
data/README +0 -12

data/lib/rchardet/sbcharsetprober.rb CHANGED

@@ -40,68 +40,68 @@ module CharDet
   class SingleByteCharSetProber < CharSetProber
     def initialize(model, reversed=false, nameProber=nil)
       super()
-      @_mModel = model
-      @_mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
-      @_mNameProber = nameProber # Optional auxiliary prober for name decision
+      @model = model
+      @reversed = reversed # TRUE if we need to reverse every pair in the model lookup
+      @nameProber = nameProber # Optional auxiliary prober for name decision
       reset()
     end
     def reset
       super()
-      @_mLastOrder = 255 # char order of last character
-      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
-      @_mTotalSeqs = 0
-      @_mTotalChar = 0
-      @_mFreqChar = 0 # characters that fall in our sampling range
+      @lastOrder = 255 # char order of last character
+      @seqCounters = [0] * NUMBER_OF_SEQ_CAT
+      @totalSeqs = 0
+      @totalChar = 0
+      @freqChar = 0 # characters that fall in our sampling range
     end
     def get_charset_name
-      if @_mNameProber
-	return @_mNameProber.get_charset_name()
+      if @nameProber
+        return @nameProber.get_charset_name()
       else
-	return @_mModel['charsetName']
+        return @model['charsetName']
       end
     end
     def feed(aBuf)
-      if not @_mModel['keepEnglishLetter']
-	aBuf = filter_without_english_letters(aBuf)
+      if !@model['keepEnglishLetter']
+        aBuf = filter_without_english_letters(aBuf)
       end
       aLen = aBuf.length
-      if not aLen
-	return get_state()
+      if aLen == 0
+        return get_state()
       end
       aBuf.each_byte do |b|
-	c = b.chr
-	order = @_mModel['charToOrderMap'][c[0]]
-	if order < SYMBOL_CAT_ORDER
-	  @_mTotalChar += 1
-	end
-	if order < SAMPLE_SIZE
-	  @_mFreqChar += 1
-	  if @_mLastOrder < SAMPLE_SIZE
-	    @_mTotalSeqs += 1
-	    if not @_mReversed
-	      @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
-	    else # reverse the order of the letters in the lookup
-	      @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
-	    end
-	  end
-	end
-	@_mLastOrder = order
+        c = b.chr
+        order = @model['charToOrderMap'][c.bytes.first]
+        if order < SYMBOL_CAT_ORDER
+          @totalChar += 1
+        end
+        if order < SAMPLE_SIZE
+          @freqChar += 1
+          if @lastOrder < SAMPLE_SIZE
+            @totalSeqs += 1
+            if !@reversed
+              @seqCounters[@model['precedenceMatrix'][(@lastOrder * SAMPLE_SIZE) + order]] += 1
+            else # reverse the order of the letters in the lookup
+              @seqCounters[@model['precedenceMatrix'][(order * SAMPLE_SIZE) + @lastOrder]] += 1
+            end
+          end
+        end
+        @lastOrder = order
       end
       if get_state() == EDetecting
-	if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
-	  cf = get_confidence()
-	  if cf > POSITIVE_SHORTCUT_THRESHOLD
-	    $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
-	    @_mState = EFoundIt
-	  elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
-	    $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
-	    @_mState = ENotMe
-	  end
-	end
+        if @totalSeqs > SB_ENOUGH_REL_THRESHOLD
+          cf = get_confidence()
+          if cf > POSITIVE_SHORTCUT_THRESHOLD
+            $stderr << "#{@model['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
+            @state = EFoundIt
+          elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
+            $stderr << "#{@model['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
+            @state = ENotMe
+          end
+        end
       end
       return get_state()
@@ -109,14 +109,12 @@ module CharDet
     def get_confidence
       r = 0.01
-      if @_mTotalSeqs > 0
-	#            print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
-	r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
-	#            print r, self._mFreqChar, self._mTotalChar
-	r = r * @_mFreqChar / @_mTotalChar
-	if r >= 1.0
-	  r = 0.99
-	end
+      if @totalSeqs > 0
+        r = (1.0 * @seqCounters[POSITIVE_CAT]) / @totalSeqs / @model['mTypicalPositiveRatio']
+        r = r * @freqChar / @totalChar
+        if r >= 1.0
+          r = 0.99
+        end
       end
       return r
     end

data/lib/rchardet/sbcsgroupprober.rb CHANGED

@@ -31,26 +31,26 @@ module CharDet
   class SBCSGroupProber < CharSetGroupProber
     def initialize
       super
-      @_mProbers = [
-	SingleByteCharSetProber.new(Win1251CyrillicModel),
-	SingleByteCharSetProber.new(Koi8rModel),
-	SingleByteCharSetProber.new(Latin5CyrillicModel),
-	SingleByteCharSetProber.new(MacCyrillicModel),
-	SingleByteCharSetProber.new(Ibm866Model),
-	SingleByteCharSetProber.new(Ibm855Model),
-	SingleByteCharSetProber.new(Latin7GreekModel),
-	SingleByteCharSetProber.new(Win1253GreekModel),
-	SingleByteCharSetProber.new(Latin5BulgarianModel),
-	SingleByteCharSetProber.new(Win1251BulgarianModel),
-	SingleByteCharSetProber.new(Latin2HungarianModel),
-	SingleByteCharSetProber.new(Win1250HungarianModel),
-	SingleByteCharSetProber.new(TIS620ThaiModel),
-      ]
+      @probers = [
+                    SingleByteCharSetProber.new(Win1251CyrillicModel),
+                    SingleByteCharSetProber.new(Koi8rModel),
+                    SingleByteCharSetProber.new(Latin5CyrillicModel),
+                    SingleByteCharSetProber.new(MacCyrillicModel),
+                    SingleByteCharSetProber.new(Ibm866Model),
+                    SingleByteCharSetProber.new(Ibm855Model),
+                    SingleByteCharSetProber.new(Latin7GreekModel),
+                    SingleByteCharSetProber.new(Win1253GreekModel),
+                    SingleByteCharSetProber.new(Latin5BulgarianModel),
+                    SingleByteCharSetProber.new(Win1251BulgarianModel),
+                    SingleByteCharSetProber.new(Latin2HungarianModel),
+                    SingleByteCharSetProber.new(Win1250HungarianModel),
+                    SingleByteCharSetProber.new(TIS620ThaiModel),
+                   ]
       hebrewProber = HebrewProber.new()
       logicalHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrewProber)
       visualHebrewProber = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrewProber)
       hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
-      @_mProbers += [hebrewProber, logicalHebrewProber, visualHebrewProber]
+      @probers += [hebrewProber, logicalHebrewProber, visualHebrewProber]
       reset()
     end

data/lib/rchardet/sjisprober.rb CHANGED

@@ -30,15 +30,15 @@ module CharDet
   class SJISProber < MultiByteCharSetProber
     def initialize
       super()
-      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
-      @_mDistributionAnalyzer = SJISDistributionAnalysis.new()
-      @_mContextAnalyzer = SJISContextAnalysis.new()
+      @codingSM = CodingStateMachine.new(SJISSMModel)
+      @distributionAnalyzer = SJISDistributionAnalysis.new()
+      @contextAnalyzer = SJISContextAnalysis.new()
       reset()
     end
     def reset
       super()
-      @_mContextAnalyzer.reset()
+      @contextAnalyzer.reset()
     end
     def get_charset_name
@@ -48,40 +48,40 @@ module CharDet
     def feed(aBuf)
       aLen = aBuf.length
       for i in (0...aLen)
-	codingState = @_mCodingSM.next_state(aBuf[i..i])
-	if codingState == EError
-	  $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
-	  @_mState = ENotMe
-	  break
-	elsif codingState == EItsMe
-	  @_mState = EFoundIt
-	  break
-	elsif codingState == EStart
-	  charLen = @_mCodingSM.get_current_charlen()
-	  if i == 0
-	    @_mLastChar[1] = aBuf[0..0]
-	    @_mContextAnalyzer.feed(@_mLastChar[2 - charLen..-1], charLen)
-	    @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
-	  else
-	    @_mContextAnalyzer.feed(aBuf[i + 1 - charLen ... i + 3 - charLen], charLen)
-	    @_mDistributionAnalyzer.feed(aBuf[i - 1 ... i + 1], charLen)
-	  end
-	end
+        codingState = @codingSM.next_state(aBuf[i,1])
+        if codingState == EError
+          $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
+          @state = ENotMe
+          break
+        elsif codingState == EItsMe
+          @state = EFoundIt
+          break
+        elsif codingState == EStart
+          charLen = @codingSM.get_current_charlen()
+          if i == 0
+            @lastChar[1] = aBuf[0, 1]
+            @contextAnalyzer.feed(@lastChar[2-charLen, 1], charLen)
+            @distributionAnalyzer.feed(@lastChar, charLen)
+          else
+            @contextAnalyzer.feed(aBuf[i+1-charLen, 2], charLen)
+            @distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
+          end
+        end
       end
-      @_mLastChar[0] = aBuf[aLen - 1.. aLen-1]
+      @lastChar[0] = aBuf[aLen-1, 1]
       if get_state() == EDetecting
-	if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
-	  @_mState = EFoundIt
-	end
+        if @contextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
+          @state = EFoundIt
+        end
       end
       return get_state()
     end
     def get_confidence
-      l = [@_mContextAnalyzer.get_confidence(), @_mDistributionAnalyzer.get_confidence()]
+      l = [@contextAnalyzer.get_confidence(), @distributionAnalyzer.get_confidence()]
       return l.max
     end
   end

data/lib/rchardet/universaldetector.rb CHANGED

@@ -1,3 +1,4 @@
+# encoding: US-ASCII
 ######################## BEGIN LICENSE BLOCK ########################
 # The Original Code is Mozilla Universal charset detector code.
 #
@@ -34,27 +35,28 @@ module CharDet
   EHighbyte = 2
   class UniversalDetector
-    attr_accessor :result
+    attr_reader :done, :result
     def initialize
-      @_highBitDetector = /[\x80-\xFF]/
-      @_escDetector = /(\033|\~\{)/
-      @_mEscCharSetProber = nil
-      @_mCharSetProbers = []
+      @highBitDetector = /[\x80-\xFF]/
+      @escDetector = /(\033|\~\{)/
+      @escCharSetProber = nil
+      @charSetProbers = []
       reset()
     end
     def reset
       @result = {'encoding' => nil, 'confidence' => 0.0}
       @done = false
-      @_mStart = true
-      @_mGotData = false
-      @_mInputState = EPureAscii
-      @_mLastChar = ''
-      if @_mEscCharSetProber
-	@_mEscCharSetProber.reset()
+      @start = true
+      @gotData = false
+      @inputState = EPureAscii
+      @lastChar = ''
+      if @escCharSetProber
+        @escCharSetProber.reset()
       end
-      for prober in @_mCharSetProbers
-	prober.reset()
+      for prober in @charSetProbers
+        prober.reset()
       end
     end
@@ -62,104 +64,104 @@ module CharDet
       return if @done
       aLen = aBuf.length
-      return if not aLen
+      return if aLen == 0
-      if not @_mGotData
-	# If the data starts with BOM, we know it is UTF
-	if aBuf[0...3] == "\xEF\xBB\xBF"
-	  # EF BB BF  UTF-8 with BOM
-	  @result = {'encoding' => "UTF-8", 'confidence' => 1.0}
-	elsif aBuf[0...4] == "\xFF\xFE\x00\x00"
-	  # FF FE 00 00  UTF-32, little-endian BOM
-	  @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
-	elsif aBuf[0...4] == "\x00\x00\xFE\xFF"
-	  # 00 00 FE FF  UTF-32, big-endian BOM
-	  @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
-	elsif aBuf[0...4] == "\xFE\xFF\x00\x00"
-	  # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
-	  @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
-	elsif aBuf[0...4] == "\x00\x00\xFF\xFE"
-	  # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
-	  @result = {'encoding' =>  "X-ISO-10646-UCS-4-2143", 'confidence' =>  1.0}
-	elsif aBuf[0...2] == "\xFF\xFE"
-	  # FF FE  UTF-16, little endian BOM
-	  @result = {'encoding' =>  "UTF-16LE", 'confidence' =>  1.0}
-	elsif aBuf[0...2] == "\xFE\xFF"
-	  # FE FF  UTF-16, big endian BOM
-	  @result = {'encoding' =>  "UTF-16BE", 'confidence' =>  1.0}
-	end
+      if !@gotData
+        # If the data starts with BOM, we know it is UTF
+        if aBuf[0, 3] == "\xEF\xBB\xBF"
+          # EF BB BF  UTF-8 with BOM
+          @result = {'encoding' => "UTF-8", 'confidence' => 1.0}
+        elsif aBuf[0, 4] == "\xFF\xFE\x00\x00"
+          # FF FE 00 00  UTF-32, little-endian BOM
+          @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
+        elsif aBuf[0, 4] == "\x00\x00\xFE\xFF"
+          # 00 00 FE FF  UTF-32, big-endian BOM
+          @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
+        elsif aBuf[0, 4] == "\xFE\xFF\x00\x00"
+          # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
+          @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
+        elsif aBuf[0, 4] == "\x00\x00\xFF\xFE"
+          # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
+          @result = {'encoding' =>  "X-ISO-10646-UCS-4-2143", 'confidence' =>  1.0}
+        elsif aBuf[0, 2] == "\xFF\xFE"
+          # FF FE  UTF-16, little endian BOM
+          @result = {'encoding' =>  "UTF-16LE", 'confidence' =>  1.0}
+        elsif aBuf[0, 2] == "\xFE\xFF"
+          # FE FF  UTF-16, big endian BOM
+          @result = {'encoding' =>  "UTF-16BE", 'confidence' =>  1.0}
+        end
       end
-      @_mGotData = true
+      @gotData = true
       if @result['encoding'] and (@result['confidence'] > 0.0)
-	@done = true
-	return
+        @done = true
+        return
       end
-      if @_mInputState == EPureAscii
-	if @_highBitDetector =~ (aBuf)
-	  @_mInputState = EHighbyte
-	elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
-	  @_mInputState = EEscAscii
-	end
+      if @inputState == EPureAscii
+        if @highBitDetector =~ (aBuf)
+          @inputState = EHighbyte
+        elsif (@inputState == EPureAscii) and @escDetector =~ (@lastChar + aBuf)
+          @inputState = EEscAscii
+        end
       end
-      @_mLastChar = aBuf[-1..-1]
-      if @_mInputState == EEscAscii
-	if not @_mEscCharSetProber
-	  @_mEscCharSetProber = EscCharSetProber.new()
-	end
-	if @_mEscCharSetProber.feed(aBuf) == EFoundIt
-	  @result = {'encoding' =>  @_mEscCharSetProber.get_charset_name(),
-			       'confidence' =>  @_mEscCharSetProber.get_confidence()
-	  }
-	  @done = true
-	end
-      elsif @_mInputState == EHighbyte
-	if not @_mCharSetProbers or @_mCharSetProbers.empty?
-	  @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
-	end
-	for prober in @_mCharSetProbers
-	  if prober.feed(aBuf) == EFoundIt
-	    @result = {'encoding' =>  prober.get_charset_name(),
-				   'confidence' =>  prober.get_confidence()}
-	    @done = true
-	    break
-	  end
-	end
+      @lastChar = aBuf[-1, 1]
+      if @inputState == EEscAscii
+        if !@escCharSetProber
+          @escCharSetProber = EscCharSetProber.new()
+        end
+        if @escCharSetProber.feed(aBuf) == EFoundIt
+          @result = {'encoding' =>  @escCharSetProber.get_charset_name(),
+            'confidence' =>  @escCharSetProber.get_confidence()
+          }
+          @done = true
+        end
+      elsif @inputState == EHighbyte
+        if @charSetProbers.nil? || @charSetProbers.empty?
+          @charSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
+        end
+        for prober in @charSetProbers
+          if prober.feed(aBuf) == EFoundIt
+            @result = {'encoding' =>  prober.get_charset_name(),
+              'confidence' =>  prober.get_confidence()}
+            @done = true
+            break
+          end
+        end
       end
     end
     def close
       return if @done
-      if not @_mGotData
-	$stderr << "no data received!\n" if $debug
-	return
+      if !@gotData
+        $stderr << "no data received!\n" if $debug
+        return
       end
       @done = true
-      if @_mInputState == EPureAscii
-	@result = {'encoding' => 'ascii', 'confidence' => 1.0}
-	return @result
+      if @inputState == EPureAscii
+        @result = {'encoding' => 'ascii', 'confidence' => 1.0}
+        return @result
       end
-      if @_mInputState == EHighbyte
-	confidences = {}
-        @_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
-	maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
-	if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
-	  @result = {'encoding' =>  maxProber.get_charset_name(),
-			       'confidence' =>  maxProber.get_confidence()}
-	  return @result
-	end
+      if @inputState == EHighbyte
+        confidences = {}
+        @charSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
+        maxProber = @charSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
+        if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
+          @result = {'encoding' =>  maxProber.get_charset_name(),
+            'confidence' =>  maxProber.get_confidence()}
+          return @result
+        end
       end
       if $debug
-	$stderr << "no probers hit minimum threshhold\n" if $debug
-	for prober in @_mCharSetProbers[0]._mProbers
-	  next if not prober
-	  $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug
-	end
+        $stderr << "no probers hit minimum threshhold\n" if $debug
+        for prober in @charSetProbers[0].probers
+          next if !prober
+          $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug
+        end
       end
     end
   end