chardet2 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +504 -0
- data/README.markdown +29 -0
- data/lib/Big5Freq.rb +913 -0
- data/lib/Big5Prober.rb +48 -0
- data/lib/CharDistributionAnalysis.rb +245 -0
- data/lib/CharSetGroupProber.rb +114 -0
- data/lib/CharSetProber.rb +70 -0
- data/lib/CodingStateMachine.rb +74 -0
- data/lib/ESCSM.rb +242 -0
- data/lib/EUCJPProber.rb +97 -0
- data/lib/EUCKRFreq.rb +600 -0
- data/lib/EUCKRProber.rb +48 -0
- data/lib/EUCTWFreq.rb +432 -0
- data/lib/EUCTWProber.rb +48 -0
- data/lib/EscCharSetProber.rb +94 -0
- data/lib/GB2312Freq.rb +475 -0
- data/lib/GB2312Prober.rb +48 -0
- data/lib/HebrewProber.rb +292 -0
- data/lib/JISFreq.rb +573 -0
- data/lib/JapaneseContextAnalysis.rb +234 -0
- data/lib/LangBulgarianModel.rb +231 -0
- data/lib/LangCyrillicModel.rb +332 -0
- data/lib/LangGreekModel.rb +229 -0
- data/lib/LangHebrewModel.rb +202 -0
- data/lib/LangHungarianModel.rb +228 -0
- data/lib/LangThaiModel.rb +203 -0
- data/lib/Latin1Prober.rb +155 -0
- data/lib/MBCSGroupProber.rb +57 -0
- data/lib/MBCSSM.rb +513 -0
- data/lib/MultiByteCharSetProber.rb +94 -0
- data/lib/SBCSGroupProber.rb +71 -0
- data/lib/SJISProber.rb +99 -0
- data/lib/SingleByteCharSetProber.rb +131 -0
- data/lib/UTF8Prober.rb +91 -0
- data/lib/UniversalDetector.rb +209 -0
- metadata +83 -0
data/lib/SJISProber.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require 'UniversalDetector'
|
30
|
+
require 'MultiByteCharSetProber'
|
31
|
+
require 'CodingStateMachine'
|
32
|
+
require 'JapaneseContextAnalysis'
|
33
|
+
require 'CharDistributionAnalysis'
|
34
|
+
require 'MBCSSM'
|
35
|
+
|
36
|
+
module UniversalDetector
  # Multi-byte prober reporting the "SHIFT_JIS" charset. Every byte is run
  # through a coding state machine built from SJISSMModel; each completed
  # character is handed to a context analyzer and a distribution analyzer,
  # and the better of their two confidences is reported.
  class SJISProber < MultiByteCharSetProber
    # Builds the state machine and both analyzers, then resets the prober.
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new
      @_mContextAnalyzer = SJISContextAnalysis.new
      reset
    end

    # Resets the inherited prober state and the context analyzer.
    def reset
      super
      @_mContextAnalyzer.reset
    end

    # Name of the charset this prober recognises.
    def get_charset_name
      "SHIFT_JIS"
    end

    # Consumes one chunk of input and returns the prober state afterwards.
    #
    # aBuf - indexable buffer; each element is passed to the state machine.
    #
    # Returns whatever get_state() reports (:NotMe and :FoundIt are set here).
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |idx|
        case @_mCodingSM.next_state(aBuf[idx])
        when :Error
          if DEBUG
            p(get_charset_name() + ' prober hit error at byte ' + idx.to_s + '\n')
          end
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          # A character has just been completed; charLen is its length as
          # reported by the state machine.
          charLen = @_mCodingSM.get_current_charlen()
          if idx > 0
            # Both slices use inclusive ranges, so each one spans three
            # buffer positions.
            ctx_from = idx + 1 - charLen
            @_mContextAnalyzer.feed(aBuf[ctx_from..ctx_from + 2], charLen)
            @_mDistributionAnalyzer.feed(aBuf[idx - 1..idx + 1], charLen)
          else
            # The character may have begun in the previous chunk: pair the
            # byte remembered in slot 0 with the first byte of this chunk.
            @_mLastChar[1] = aBuf[0]
            @_mContextAnalyzer.feed(@_mLastChar[2 - charLen..@_mLastChar.length], charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          end
        end
      end

      # Remember the final byte of this chunk for the next call.
      @_mLastChar[0] = aBuf[aLen - 1]

      if get_state() == :Detecting &&
         @_mContextAnalyzer.got_enough_data() &&
         get_confidence() > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end

      get_state()
    end

    # Returns the larger of the context and distribution confidences.
    def get_confidence
      context_conf = @_mContextAnalyzer.get_confidence()
      distribution_conf = @_mDistributionAnalyzer.get_confidence()
      context_conf > distribution_conf ? context_conf : [context_conf, distribution_conf].max
    end
  end
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require 'UniversalDetector'
|
30
|
+
require 'CharSetProber'
|
31
|
+
|
32
|
+
module UniversalDetector

  # Character orders below this value are counted as frequent characters and
  # take part in the two-character sequence statistics.
  SAMPLE_SIZE = 64
  # Number of counted sequences required before feed() tries a shortcut.
  SB_ENOUGH_REL_THRESHOLD = 1024
  # Confidence above which feed() declares :FoundIt early.
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  # Confidence below which feed() declares :NotMe early.
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  # Character orders below this value are counted in the total character tally.
  SYMBOL_CAT_ORDER = 250
  # Number of sequence categories tracked in the counters array.
  NUMBER_OF_SEQ_CAT = 4
  # Index of the sequence category used by get_confidence.
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

  # Prober for a single-byte charset driven by a language model hash. The
  # model keys read here are 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
  class SingleByteCharSetProber < CharSetProber
    # model      - language model hash (see the class comment for the keys).
    # reversed   - true if every pair must be reversed in the model lookup.
    # nameProber - optional auxiliary prober that decides the charset name.
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset()
    end

    # Clears all statistics gathered so far.
    def reset
      super
      @_mLastOrder = 255 # char order of last character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Returns the name from the auxiliary prober when one was supplied,
    # otherwise the model's own 'charsetName'.
    def get_charset_name
      if @_mNameProber
        @_mNameProber.get_charset_name()
      else
        @_mModel['charsetName']
      end
    end

    # Feeds a chunk of input into the statistics and returns the prober state.
    #
    # aBuf - buffer to analyse; it is first passed through
    #        filter_without_english_letters unless the model sets
    #        'keepEnglishLetter'.
    #
    # Returns whatever get_state() reports (:FoundIt and :NotMe are set here).
    def feed(aBuf)
      unless @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
      end
      aLen = aBuf.length
      # 0 is truthy in Ruby, so the length has to be compared explicitly;
      # an empty chunk must not trigger the shortcut evaluation below.
      return get_state() if aLen == 0

      aLen.times do |i|
        order = @_mModel['charToOrderMap'][aBuf[i]]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            # The precedence matrix is a flat SAMPLE_SIZE-wide table; a
            # reversed model swaps the roles of the two characters.
            cell = if @_mReversed
                     (order * SAMPLE_SIZE) + @_mLastOrder
                   else
                     (@_mLastOrder * SAMPLE_SIZE) + order
                   end
            @_mSeqCounters[@_mModel['precedenceMatrix'][cell]] += 1
          end
        end
        @_mLastOrder = order
      end

      if get_state() == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
          if DEBUG
            p('%s confidence = %s, we have a winner\n' % [@_mModel['charsetName'], cf])
          end
          @_mState = :FoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
          if DEBUG
            p('%s confidence = %s, below negative shortcut threshhold %s\n' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
          end
          @_mState = :NotMe
        end
      end

      get_state()
    end

    # Returns the confidence computed from the gathered statistics: 0.01
    # until at least one sequence has been counted, and never above 0.99.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        # Share of positive-category sequences, normalised by the model's
        # typical ratio and scaled by the share of frequent characters.
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      r
    end
  end
end
|
data/lib/UTF8Prober.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require 'UniversalDetector'
|
30
|
+
require 'CharSetProber'
|
31
|
+
require 'CodingStateMachine'
|
32
|
+
require 'MBCSSM'
|
33
|
+
|
34
|
+
module UniversalDetector
  # Factor applied to the "unlikely" estimate for every multi-byte character
  # seen by UTF8Prober#get_confidence.
  ONE_CHAR_PROB = 0.5

  # Prober reporting the "utf-8" charset. Input is run through a coding
  # state machine built from UTF8SMModel, and the number of completed
  # characters of length two or more drives the confidence.
  class UTF8Prober < CharSetProber
    # Builds the state machine and resets the prober.
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset
    end

    # Resets the inherited state, the state machine and the character count.
    def reset
      super
      @_mCodingSM.reset
      @_mNumOfMBChar = 0
    end

    # Name of the charset this prober recognises.
    def get_charset_name
      "utf-8"
    end

    # Consumes one chunk of input and returns the prober state afterwards.
    #
    # aBuf - indexable buffer; each element is passed to the state machine.
    #
    # Returns whatever get_state() reports (:NotMe and :FoundIt are set here).
    def feed(aBuf)
      aBuf.length.times do |pos|
        case @_mCodingSM.next_state(aBuf[pos])
        when :Error
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          # A character was completed; count it only if it was multi-byte.
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen() >= 2
        end
      end

      if get_state() == :Detecting && get_confidence() > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end
      get_state()
    end

    # Returns 0.99 once six or more multi-byte characters were seen;
    # otherwise 1.0 minus 0.99 scaled by ONE_CHAR_PROB per character seen.
    def get_confidence
      unlike = 0.99
      return unlike unless @_mNumOfMBChar < 6

      @_mNumOfMBChar.times { unlike = unlike * ONE_CHAR_PROB }
      1.0 - unlike
    end
  end
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require "EscCharSetProber"
|
30
|
+
require "MBCSGroupProber"
|
31
|
+
require "SBCSGroupProber"
|
32
|
+
require "Latin1Prober"
|
33
|
+
require "singleton"
|
34
|
+
|
35
|
+
module UniversalDetector

  class << self
    # Detects the character encoding of +data+ and returns only its name.
    #
    # @param data [String] text to analyse
    # @return [String, nil] the "encoding" entry of the detection result
    def encoding(data)
      chardet(data)['encoding']
    end

    # Runs one complete detection pass (reset, feed, close) over +data+ on
    # the shared Detector singleton and returns its result hash.
    #
    # @param data [String] text to analyse
    # @return [Hash] {"encoding" => String or nil, "confidence" => Float}
    def chardet(data)
      detector = UniversalDetector::Detector.instance
      detector.reset
      detector.feed(data)
      detector.close
      detector.result
    end
  end

  DEBUG = nil

  # NOTE: the misspelled name "Detectiong" is kept as-is so that any code
  # referring to the constant keeps working.
  Detectiong = 0
  FoundIt = 1
  NotMe = 2

  Start = 0
  Error = 1
  ItsMe = 2

  MINIMUM_THRESHOLD = 0.20
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  SHORTCUT_THRESHOLD = 0.95

  # Top-level detector. It looks for a byte-order mark, classifies the input
  # as pure ASCII, escape-sequence ASCII or high-byte data, and delegates to
  # the matching probers. The outcome is exposed through #result.
  class Detector

    include Singleton

    # Byte-order marks and the encoding each one implies, as binary strings.
    # Four-byte marks are listed before the two-byte UTF-16 marks because
    # the UTF-32LE mark starts with the UTF-16LE one.
    BOM_ENCODINGS = [
      ["\xEF\xBB\xBF",     "UTF-8"],
      ["\xFF\xFE\x00\x00", "UTF-32LE"],
      ["\x00\x00\xFE\xFF", "UTF-32BE"],
      ["\xFE\xFF\x00\x00", "X-ISO-10646-UCS-4-3412"],
      ["\x00\x00\xFF\xFE", "X-ISO-10646-UCS-4-2143"],
      ["\xFF\xFE",         "UTF-16LE"],
      ["\xFE\xFF",         "UTF-16BE"]
    ].map { |bom, name| [bom.dup.force_encoding(Encoding::BINARY), name] }.freeze

    # Hash of the form {"encoding" => String or nil, "confidence" => Float}.
    attr_reader :result

    def initialize
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /\033|~\{/n
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-document state, and resets every prober
    # that has been created so far.
    def reset
      @result = {"encoding"=> nil, "confidence"=> 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of the document into the detector. Does nothing once a
    # result has been found or when +data+ is empty.
    #
    # @param data [String] next chunk of the document
    # @return [void]
    def feed(data)
      return if @done || data.empty?

      # All byte-level checks run on a binary-tagged copy: character-indexed
      # slicing and encoding-sensitive comparison give wrong answers for
      # BOMs, and a binary regexp is only safe against binary text. The
      # probers still receive the caller's original string.
      bytes = data.dup.force_encoding(Encoding::BINARY)

      unless @_mGotData
        # If the data starts with a BOM, we know it is UTF.
        match = BOM_ENCODINGS.find { |bom, _| bytes.start_with?(bom) }
        @result = {"encoding"=> match[1], "confidence"=> 1.0} if match
      end
      @_mGotData = true
      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      if @_mInputState == :PureAscii
        if bytes =~ @_highBitDetector
          @_mInputState = :Highbyte
        elsif (@_mLastChar + bytes) =~ @_escDetector
          # The previous chunk's last byte is prepended so that a "~{"
          # split across two chunks is still recognised.
          @_mInputState = :EscAscii
        end
      end

      @_mLastChar = bytes[-1, 1]
      if @_mInputState == :EscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new
        # Compared against the :FoundIt symbol, the state value the other
        # probers are compared with in this class.
        if @_mEscCharSetProber.feed(data) == :FoundIt
          @result = {"encoding"=> @_mEscCharSetProber.get_charset_name() ,"confidence"=> @_mEscCharSetProber.get_confidence()}
          @done = true
        end
      elsif @_mInputState == :Highbyte
        if @_mCharSetProbers.empty?
          @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
        end
        @_mCharSetProbers.each do |prober|
          if prober.feed(data) == :FoundIt
            @result = {"encoding"=> prober.get_charset_name(), "confidence"=> prober.get_confidence()}
            @done = true
            break
          end
        end
      end
    end

    # Finishes detection. Pure-ASCII input yields "ascii"; for high-byte
    # input the prober with the highest confidence wins, provided it exceeds
    # MINIMUM_THRESHOLD. In every other case #result is left unchanged.
    #
    # @return [Hash, nil] the result hash when one is decided here
    def close
      return if @done
      unless @_mGotData
        p("no data received!\n") if DEBUG
        return
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        maxProberConfidence = 0.0
        maxProber = nil
        @_mCharSetProbers.each do |prober|
          next unless prober
          proberConfidence = prober.get_confidence()
          if proberConfidence > maxProberConfidence
            maxProberConfidence = proberConfidence
            maxProber = prober
          end
        end
        if maxProber && maxProberConfidence > MINIMUM_THRESHOLD
          @result = {"encoding" => maxProber.get_charset_name(),
                     "confidence" => maxProber.get_confidence()}
          return @result
        end
      end

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.each do |prober|
          next unless prober
          p("%s confidence = %s\n" % [prober.get_charset_name(), prober.get_confidence()])
        end
      end
    end

  end

end
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: chardet2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jan Xie
|
9
|
+
- Felipe Tanus
|
10
|
+
- Hui
|
11
|
+
autorequire: UniversalDetector
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2013-05-17 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description:
|
17
|
+
email:
|
18
|
+
- jan.h.xie@gmail.com
|
19
|
+
executables: []
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- lib/MBCSSM.rb
|
24
|
+
- lib/MultiByteCharSetProber.rb
|
25
|
+
- lib/JapaneseContextAnalysis.rb
|
26
|
+
- lib/LangCyrillicModel.rb
|
27
|
+
- lib/EUCKRFreq.rb
|
28
|
+
- lib/GB2312Freq.rb
|
29
|
+
- lib/EUCKRProber.rb
|
30
|
+
- lib/CodingStateMachine.rb
|
31
|
+
- lib/LangHungarianModel.rb
|
32
|
+
- lib/HebrewProber.rb
|
33
|
+
- lib/Big5Prober.rb
|
34
|
+
- lib/CharSetGroupProber.rb
|
35
|
+
- lib/SingleByteCharSetProber.rb
|
36
|
+
- lib/EUCTWFreq.rb
|
37
|
+
- lib/MBCSGroupProber.rb
|
38
|
+
- lib/SBCSGroupProber.rb
|
39
|
+
- lib/LangBulgarianModel.rb
|
40
|
+
- lib/SJISProber.rb
|
41
|
+
- lib/Big5Freq.rb
|
42
|
+
- lib/UniversalDetector.rb
|
43
|
+
- lib/CharDistributionAnalysis.rb
|
44
|
+
- lib/UTF8Prober.rb
|
45
|
+
- lib/Latin1Prober.rb
|
46
|
+
- lib/ESCSM.rb
|
47
|
+
- lib/EscCharSetProber.rb
|
48
|
+
- lib/JISFreq.rb
|
49
|
+
- lib/EUCJPProber.rb
|
50
|
+
- lib/EUCTWProber.rb
|
51
|
+
- lib/LangGreekModel.rb
|
52
|
+
- lib/LangHebrewModel.rb
|
53
|
+
- lib/GB2312Prober.rb
|
54
|
+
- lib/LangThaiModel.rb
|
55
|
+
- lib/CharSetProber.rb
|
56
|
+
- COPYING
|
57
|
+
- README.markdown
|
58
|
+
homepage: https://github.com/janx/chardet
|
59
|
+
licenses: []
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements: []
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 1.8.23
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Based
|
82
|
+
on Mark Pilgrim's Python port and Hui's ruby port.
|
83
|
+
test_files: []
|