RubyGems - chardet2 - Versions diffs - 1.0.0 - Mend

chardet2 1.0.0

Files changed (36) hide show

data/COPYING +504 -0
data/README.markdown +29 -0
data/lib/Big5Freq.rb +913 -0
data/lib/Big5Prober.rb +48 -0
data/lib/CharDistributionAnalysis.rb +245 -0
data/lib/CharSetGroupProber.rb +114 -0
data/lib/CharSetProber.rb +70 -0
data/lib/CodingStateMachine.rb +74 -0
data/lib/ESCSM.rb +242 -0
data/lib/EUCJPProber.rb +97 -0
data/lib/EUCKRFreq.rb +600 -0
data/lib/EUCKRProber.rb +48 -0
data/lib/EUCTWFreq.rb +432 -0
data/lib/EUCTWProber.rb +48 -0
data/lib/EscCharSetProber.rb +94 -0
data/lib/GB2312Freq.rb +475 -0
data/lib/GB2312Prober.rb +48 -0
data/lib/HebrewProber.rb +292 -0
data/lib/JISFreq.rb +573 -0
data/lib/JapaneseContextAnalysis.rb +234 -0
data/lib/LangBulgarianModel.rb +231 -0
data/lib/LangCyrillicModel.rb +332 -0
data/lib/LangGreekModel.rb +229 -0
data/lib/LangHebrewModel.rb +202 -0
data/lib/LangHungarianModel.rb +228 -0
data/lib/LangThaiModel.rb +203 -0
data/lib/Latin1Prober.rb +155 -0
data/lib/MBCSGroupProber.rb +57 -0
data/lib/MBCSSM.rb +513 -0
data/lib/MultiByteCharSetProber.rb +94 -0
data/lib/SBCSGroupProber.rb +71 -0
data/lib/SJISProber.rb +99 -0
data/lib/SingleByteCharSetProber.rb +131 -0
data/lib/UTF8Prober.rb +91 -0
data/lib/UniversalDetector.rb +209 -0
metadata +83 -0

data/lib/SJISProber.rb ADDED Viewed

@@ -0,0 +1,99 @@
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#   Hui (zhengzhengzheng@gmail.com) - port to Ruby
+#   Mark Pilgrim - first port to Python
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+require 'UniversalDetector'
+require 'MultiByteCharSetProber'
+require 'CodingStateMachine'
+require 'JapaneseContextAnalysis'
+require 'CharDistributionAnalysis'
+require 'MBCSSM'
+module UniversalDetector
+    class SJISProber < MultiByteCharSetProber
+        def initialize
+            super
+            @_mCodingSM = CodingStateMachine.new(SJISSMModel)
+            @_mDistributionAnalyzer = SJISDistributionAnalysis.new
+            @_mContextAnalyzer = SJISContextAnalysis.new
+            reset()
+        end
+        def reset
+            super
+            @_mContextAnalyzer.reset()
+        end
+        def get_charset_name
+            return "SHIFT_JIS"
+        end
+        def feed(aBuf)
+            aLen = aBuf.length
+            for i in 0...aLen
+                codingState = @_mCodingSM.next_state(aBuf[i])
+                if codingState == :Error
+                    if DEBUG
+                        p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
+                    end
+                    @_mState = :NotMe
+                    break
+                elsif codingState == :ItsMe
+                    @_mState = :FoundIt
+                    break
+                elsif codingState == :Start
+                    charLen = @_mCodingSM.get_current_charlen()
+                    if i == 0
+                        @_mLastChar[1] = aBuf[0]
+                        @_mContextAnalyzer.feed(@_mLastChar[2 - charLen..@_mLastChar.length], charLen)
+                        @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
+                    else
+                        @_mContextAnalyzer.feed(aBuf[i + 1 - charLen .. i + 3 - charLen], charLen)
+                        @_mDistributionAnalyzer.feed(aBuf[i - 1 .. i + 1], charLen)
+                    end
+                end
+            end
+            @_mLastChar[0] = aBuf[aLen - 1]
+            if get_state() == :Detecting
+                if @_mContextAnalyzer.got_enough_data() and \
+                       (get_confidence() > SHORTCUT_THRESHOLD)
+                    @_mState = :FoundIt
+                end
+            end
+            return get_state()
+        end
+        def get_confidence
+            contxtCf = @_mContextAnalyzer.get_confidence()
+            distribCf = @_mDistributionAnalyzer.get_confidence()
+            return [contxtCf, distribCf].max
+        end
+    end
+end

data/lib/SingleByteCharSetProber.rb ADDED Viewed

@@ -0,0 +1,131 @@
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#   Hui (zhengzhengzheng@gmail.com) - port to Ruby
+#   Mark Pilgrim - first port to Python
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+require 'UniversalDetector'
+require 'CharSetProber'
+module UniversalDetector
+    SAMPLE_SIZE = 64
+    SB_ENOUGH_REL_THRESHOLD = 1024
+    POSITIVE_SHORTCUT_THRESHOLD = 0.95
+    NEGATIVE_SHORTCUT_THRESHOLD = 0.05
+    SYMBOL_CAT_ORDER = 250
+    NUMBER_OF_SEQ_CAT = 4
+    POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
+    class SingleByteCharSetProber < CharSetProber
+        def initialize(model, reversed=false, nameProber=nil)
+            super()
+            @_mModel = model
+            @_mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
+            @_mNameProber = nameProber # Optional auxiliary prober for name decision
+            reset()
+        end
+        def reset
+            super
+            @_mLastOrder = 255 # char order of last character
+            @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
+            @_mTotalSeqs = 0
+            @_mTotalChar = 0
+            @_mFreqChar = 0 # characters that fall in our sampling range
+        end
+        def get_charset_name
+            if @_mNameProber
+                return @_mNameProber.get_charset_name()
+            else
+                return @_mModel['charsetName']
+            end
+        end
+        def feed(aBuf)
+            unless @_mModel['keepEnglishLetter']
+                aBuf = filter_without_english_letters(aBuf)
+            end
+            aLen = aBuf.length
+            unless aLen
+                return get_state()
+            end
+            for i in 0...aLen
+                c = aBuf[i]
+                order = @_mModel['charToOrderMap'][c]
+                if order < SYMBOL_CAT_ORDER
+                    @_mTotalChar += 1
+                end
+                if order < SAMPLE_SIZE
+                    @_mFreqChar += 1
+                    if @_mLastOrder < SAMPLE_SIZE
+                        @_mTotalSeqs += 1
+                        unless @_mReversed
+                            @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
+                        else # reverse the order of the letters in the lookup
+                            @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
+                        end
+                    end
+                end
+                @_mLastOrder = order
+            end
+            if get_state() == :Detecting
+                if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
+                    cf = get_confidence()
+                    if cf > POSITIVE_SHORTCUT_THRESHOLD
+                        if DEBUG
+                            p('%s confidence = %s, we have a winner\n' % [@_mModel['charsetName'], cf])
+                        end
+                        @_mState = :FoundIt
+                    elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
+                        if DEBUG
+                            p('%s confidence = %s, below negative shortcut threshhold %s\n' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
+                        end
+                        @_mState = :NotMe
+                    end
+                end
+            end
+            return get_state()
+        end
+        def get_confidence
+            r = 0.01
+            if @_mTotalSeqs > 0
+    #            print @_mSeqCounters[POSITIVE_CAT], @_mTotalSeqs, @_mModel['mTypicalPositiveRatio']
+                r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
+    #            print r, @_mFreqChar, @_mTotalChar
+                r = r * @_mFreqChar / @_mTotalChar
+                if r >= 1.0
+                    r = 0.99
+                end
+            end
+            return r
+        end
+    end
+end

data/lib/UTF8Prober.rb ADDED Viewed

@@ -0,0 +1,91 @@
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#   Hui (zhengzhengzheng@gmail.com) - port to Ruby
+#   Mark Pilgrim - first port to Python
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+require 'UniversalDetector'
+require 'CharSetProber'
+require 'CodingStateMachine'
+require 'MBCSSM'
+module UniversalDetector
+    ONE_CHAR_PROB = 0.5
+    class UTF8Prober < CharSetProber
+        def initialize
+            super()
+            @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
+            reset()
+        end
+        def reset
+            super
+            @_mCodingSM.reset()
+            @_mNumOfMBChar = 0
+        end
+        def get_charset_name
+            return "utf-8"
+        end
+        def feed(aBuf)
+            aLen = aBuf.length
+            for i in 0...aLen
+                codingState = @_mCodingSM.next_state(aBuf[i])
+                if codingState == :Error
+                    @_mState = :NotMe
+                    break
+                elsif codingState == :ItsMe
+                    @_mState = :FoundIt
+                    break
+                elsif codingState == :Start
+                    if @_mCodingSM.get_current_charlen() >= 2
+                        @_mNumOfMBChar += 1
+                    end
+                end
+            end
+            if get_state() == :Detecting
+                if get_confidence() > SHORTCUT_THRESHOLD
+                    @_mState = :FoundIt
+                end
+            end
+            return get_state()
+        end
+        def get_confidence
+            unlike = 0.99
+            if @_mNumOfMBChar < 6
+                for i in 0...@_mNumOfMBChar
+                    unlike = unlike * ONE_CHAR_PROB
+                end
+                return 1.0 - unlike
+            else
+                return unlike
+            end
+        end
+    end
+end

data/lib/UniversalDetector.rb ADDED Viewed

@@ -0,0 +1,209 @@
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#   Hui (zhengzhengzheng@gmail.com) - port to Ruby
+#   Mark Pilgrim - first port to Python
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+require "EscCharSetProber"
+require "MBCSGroupProber"
+require "SBCSGroupProber"
+require "Latin1Prober"
+require "singleton"
+module UniversalDetector
+    class << self
+        def encoding(data)
+            chardet(data)['encoding']
+        end
+        def chardet(data)
+            u = UniversalDetector::Detector.instance
+            u.reset()
+            u.feed(data)
+            u.close()
+            u.result
+        end
+    end
+    DEBUG = nil
+    Detectiong = 0
+    FoundIt = 1
+    NotMe = 2
+    Start = 0
+    Error = 1
+    ItsMe = 2
+    MINIMUM_THRESHOLD = 0.20
+    PureAscii = 0
+    EscAscii = 1
+    Highbyte = 2
+    SHORTCUT_THRESHOLD = 0.95
+    class Detector
+        include Singleton
+        attr_reader :result
+        def initialize
+            @_highBitDetector = /[\x80-\xFF]/n
+            @_escDetector = /\033|~\{/n
+            @_mEscCharSetProber = nil
+            @_mCharSetProbers = []
+            reset
+        end
+        def reset
+            @result = {"encoding"=> nil, "confidence"=> 0.0}
+            @done = false
+            @_mStart = true
+            @_mGotData = false
+            @_mInputState = :PureAscii
+            @_mLastChar = ""
+            if @_mEscCharSetProber
+                @_mEscCharSetProber.reset
+            end
+            for prober in @_mCharSetProbers
+                prober.reset
+            end
+        end
+        def feed(data)
+            if @done || data.empty?
+                return
+            end
+            unless  @_mGotData
+                # If the data starts with BOM, we know it is UTF
+                if data[0,3] == "\xEF\xBB\xBF"
+                    # EF BB BF  UTF-8 with BOM
+                    @result = {"encoding"=> "UTF-8", "confidence"=> 1.0}
+                elsif data[0,4] == "\xFF\xFE\x00\x00"
+                    # FF FE 00 00  UTF-32, little-endian BOM
+                    @result = {"encoding"=> "UTF-32LE", "confidence"=> 1.0}
+                elsif data[0,4] == "\x00\x00\xFE\xFF"
+                    # 00 00 FE FF  UTF-32, big-endian BOM
+                    @result = {"encoding"=> "UTF-32BE", "confidence"=> 1.0}
+                elsif data[0,4] == "\xFE\xFF\x00\x00"
+                    # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
+                    @result = {"encoding"=> "X-ISO-10646-UCS-4-3412", "confidence"=> 1.0}
+                elsif data[0,4] == "\x00\x00\xFF\xFE"
+                    # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
+                    @result = {"encoding"=> "X-ISO-10646-UCS-4-2143", "confidence"=> 1.0}
+                elsif data[0,4] == "\xFF\xFE"
+                    # FF FE  UTF-16, little endian BOM
+                    @result = {"encoding"=> "UTF-16LE", "confidence"=> 1.0}
+                elsif data[0,2] == "\xFE\xFF"
+                    # FE FF  UTF-16, big endian BOM
+                    @result = {"encoding"=> "UTF-16BE", "confidence"=> 1.0}
+                end
+            end
+            @_mGotData = true
+            if @result["encoding"] && @result["confidence"] > 0.0
+                @done = true
+                return
+            end
+            if @_mInputState == :PureAscii
+                if data =~ @_highBitDetector
+                    @_mInputState = :Highbyte
+                elsif (@_mLastChar + data) =~ @_escDetector
+                    @_mInputState = :EscAscii
+                end
+            end
+            @_mLastChar = data[-1]
+            if @_mInputState == :EscAscii
+                unless @_mEscCharSetProber
+                    @_mEscCharSetProber = EscCharSetProber.new
+                end
+                if @_mEscCharSetProber.feed(data) == constants.eFoundIt
+                    @result = {"encoding"=> @_mEscCharSetProber.get_charset_name() ,"confidence"=> @_mEscCharSetProber.get_confidence()}
+                    @done = true
+                end
+            elsif @_mInputState == :Highbyte
+                if @_mCharSetProbers.empty?
+                    @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
+                end
+                @_mCharSetProbers.each do |prober|
+                    if prober.feed(data) == :FoundIt
+                        @result = {"encoding"=> prober.get_charset_name(), "confidence"=> prober.get_confidence()}
+                        @done = true
+                        break
+                    end
+                end #for
+            end
+        end #feed
+        def close
+            if @done then return end
+            unless @_mGotData
+                if DEBUG
+                    p("no data received!\n")
+                end
+                return
+            end
+            @done = true
+            if @_mInputState == :PureAscii
+                @result = {"encoding" =>  "ascii", "confidence" => 1.0}
+                return @result
+            end
+            if @_mInputState == :Highbyte
+                proberConfidence = nil
+                maxProberConfidence = 0.0
+                maxProber = nil
+                for prober in @_mCharSetProbers
+                    unless prober then next end
+                    proberConfidence = prober.get_confidence()
+                    if proberConfidence > maxProberConfidence
+                        maxProberConfidence = proberConfidence
+                        maxProber = prober
+                    end
+                end
+                if maxProber and (maxProberConfidence > MINIMUM_THRESHOLD)
+                    @result = {"encoding" => maxProber.get_charset_name(),
+                                   "confidence" => maxProber.get_confidence()}
+                    return @result
+                end
+            end #if
+            if DEBUG
+                p("no probers hit minimum threshhold\n")
+                for prober in @_mCharSetProbers
+                    unless prober then next end
+                    p("%s confidence = %s\n" % \
+                                     [prober.get_charset_name(), \
+                                      prober.get_confidence()])
+                end
+            end
+        end #close
+    end #class
+end #module

metadata ADDED Viewed

@@ -0,0 +1,83 @@
+--- !ruby/object:Gem::Specification
+name: chardet2
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+  prerelease:
+platform: ruby
+authors:
+- Jan Xie
+- Felipe Tanus
+- Hui
+autorequire: UniversalDetector
+bindir: bin
+cert_chain: []
+date: 2013-05-17 00:00:00.000000000 Z
+dependencies: []
+description:
+email:
+- jan.h.xie@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/MBCSSM.rb
+- lib/MultiByteCharSetProber.rb
+- lib/JapaneseContextAnalysis.rb
+- lib/LangCyrillicModel.rb
+- lib/EUCKRFreq.rb
+- lib/GB2312Freq.rb
+- lib/EUCKRProber.rb
+- lib/CodingStateMachine.rb
+- lib/LangHungarianModel.rb
+- lib/HebrewProber.rb
+- lib/Big5Prober.rb
+- lib/CharSetGroupProber.rb
+- lib/SingleByteCharSetProber.rb
+- lib/EUCTWFreq.rb
+- lib/MBCSGroupProber.rb
+- lib/SBCSGroupProber.rb
+- lib/LangBulgarianModel.rb
+- lib/SJISProber.rb
+- lib/Big5Freq.rb
+- lib/UniversalDetector.rb
+- lib/CharDistributionAnalysis.rb
+- lib/UTF8Prober.rb
+- lib/Latin1Prober.rb
+- lib/ESCSM.rb
+- lib/EscCharSetProber.rb
+- lib/JISFreq.rb
+- lib/EUCJPProber.rb
+- lib/EUCTWProber.rb
+- lib/LangGreekModel.rb
+- lib/LangHebrewModel.rb
+- lib/GB2312Prober.rb
+- lib/LangThaiModel.rb
+- lib/CharSetProber.rb
+- COPYING
+- README.markdown
+homepage: https://github.com/janx/chardet
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Based
+  on Mark Pilgrim's Python port and Hui's Ruby port.
+test_files: []