rchardet 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,123 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Orders below this value take part in two-character sequence statistics.
  SAMPLE_SIZE = 64
  # Number of sequences that must be seen before feed() may short-circuit.
  SB_ENOUGH_REL_THRESHOLD = 1024
  # Confidence above which feed() declares EFoundIt.
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  # Confidence below which feed() declares ENotMe.
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  # Orders below this value are counted in the total-character tally.
  SYMBOL_CAT_ORDER = 250
  # Number of sequence categories tracked in @_mSeqCounters.
  NUMBER_OF_SEQ_CAT = 4
  # Index of the counter that get_confidence() treats as "positive".
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
  #NEGATIVE_CAT = 0

  # Prober for a single-byte charset driven by a language model hash.
  #
  # The model is read through the keys 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
  class SingleByteCharSetProber < CharSetProber
    # model      - language model hash (see the keys listed above).
    # reversed   - true if every pair must be reversed in the model lookup.
    # nameProber - optional auxiliary prober that decides the reported name.
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset()
    end

    # Clears all statistics gathered so far (after resetting the superclass).
    def reset
      super()
      @_mLastOrder = 255 # char order of last character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Returns the charset name from the auxiliary name prober when one was
    # supplied, otherwise the model's own 'charsetName'.
    def get_charset_name
      if @_mNameProber
        @_mNameProber.get_charset_name()
      else
        @_mModel['charsetName']
      end
    end

    # Feeds a chunk of data into the statistics and returns the prober state.
    #
    # aBuf - String of raw bytes to analyse.
    def feed(aBuf)
      if not @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
      end

      # 0 is truthy in Ruby, so emptiness has to be tested explicitly.
      return get_state() if aBuf.length == 0

      order_map = @_mModel['charToOrderMap']
      precedence = @_mModel['precedenceMatrix']

      # Iterate over byte values: indexing a one-character String with [0]
      # only yields an Integer on Ruby 1.8; each_byte yields one everywhere.
      aBuf.each_byte do |byte|
        order = order_map[byte]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            index = if @_mReversed
                      # reverse the order of the letters in the lookup
                      (order * SAMPLE_SIZE) + @_mLastOrder
                    else
                      (@_mLastOrder * SAMPLE_SIZE) + order
                    end
            @_mSeqCounters[precedence[index]] += 1
          end
        end
        @_mLastOrder = order
      end

      # Once enough sequences were seen, decide early on a clear verdict.
      if get_state() == EDetecting
        if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
          cf = get_confidence()
          if cf > POSITIVE_SHORTCUT_THRESHOLD
            $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
            @_mState = EFoundIt
          elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
            $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
            @_mState = ENotMe
          end
        end
      end

      get_state()
    end

    # Returns the confidence as a Float: 0.01 when no sequence has been
    # counted yet, otherwise the positive-sequence ratio scaled by the
    # model's typical ratio and by the share of frequent characters,
    # reported as 0.99 whenever the computed value reaches 1.0.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      r
    end
  end
end
@@ -0,0 +1,58 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Group prober that bundles every single-byte charset prober: one
  # SingleByteCharSetProber per language model, followed by the Hebrew
  # prober together with its logical and visual model probers.
  class SBCSGroupProber < CharSetGroupProber
    def initialize
      super

      models = [
        Win1251CyrillicModel,
        Koi8rModel,
        Latin5CyrillicModel,
        MacCyrillicModel,
        Ibm866Model,
        Ibm855Model,
        Latin7GreekModel,
        Win1253GreekModel,
        Latin5BulgarianModel,
        Win1251BulgarianModel,
        Latin2HungarianModel,
        Win1250HungarianModel,
        TIS620ThaiModel,
      ]
      @_mProbers = models.map { |model| SingleByteCharSetProber.new(model) }

      # Both Hebrew model probers share one HebrewProber as their name
      # prober; the second one is created with the reversed flag set.
      hebrew = HebrewProber.new()
      logical = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrew)
      visual = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrew)
      hebrew.set_model_probers(logical, visual)
      @_mProbers.push(hebrew, logical, visual)

      reset()
    end
  end
end
@@ -0,0 +1,88 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober for SHIFT_JIS. It runs a coding state machine over the input and
  # feeds every completed character to a context analyzer and a
  # distribution analyzer.
  class SJISProber < MultiByteCharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new()
      @_mContextAnalyzer = SJISContextAnalysis.new()
      reset()
    end

    # Resets the superclass state and the context analyzer.
    def reset
      super()
      @_mContextAnalyzer.reset()
    end

    # Returns the name of the charset this prober detects.
    def get_charset_name
      "SHIFT_JIS"
    end

    # Feeds a chunk of data to the prober and returns the resulting state.
    #
    # aBuf - String holding the next chunk of input.
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i..i])
        if codingState == EError
          $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
          @_mState = ENotMe
          break
        elsif codingState == EItsMe
          @_mState = EFoundIt
          break
        elsif codingState == EStart
          charLen = @_mCodingSM.get_current_charlen()
          if i == 0
            # A character finished on the first element of this chunk, so it
            # is analysed through @_mLastChar, whose slot 0 was saved at the
            # end of the previous feed.
            @_mLastChar[1] = aBuf[0..0]
            @_mContextAnalyzer.feed(@_mLastChar[2 - charLen..-1], charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            @_mContextAnalyzer.feed(aBuf[i + 1 - charLen...i + 3 - charLen], charLen)
            @_mDistributionAnalyzer.feed(aBuf[i - 1...i + 1], charLen)
          end
        end
      end

      # Remember the final element for the next chunk. An empty chunk has
      # none (the slice would be nil), so the saved value is left untouched.
      @_mLastChar[0] = aBuf[aLen - 1..aLen - 1] if aLen > 0

      if get_state() == EDetecting
        if @_mContextAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
          @_mState = EFoundIt
        end
      end

      get_state()
    end

    # Returns the larger of the context and distribution confidences.
    def get_confidence
      [@_mContextAnalyzer.get_confidence(), @_mDistributionAnalyzer.get_confidence()].max
    end
  end
end
@@ -0,0 +1,166 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Minimum confidence a prober must exceed for close() to report it.
  MINIMUM_THRESHOLD = 0.20
  # Input states tracked by UniversalDetector.
  EPureAscii = 0
  EEscAscii = 1
  EHighbyte = 2

  # Incremental charset detector.
  #
  # Call feed() with successive chunks of data, then close(). The outcome is
  # available through #result, a Hash with the keys 'encoding' and
  # 'confidence'.
  class UniversalDetector
    # Byte-order-mark signatures as byte values, paired with the encoding
    # they announce. They are checked in order, so the four-byte marks must
    # stay ahead of the two-byte UTF-16 marks that share their prefix.
    BOM_SIGNATURES = [
      [[0xEF, 0xBB, 0xBF], "UTF-8"],                         # EF BB BF
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],                # FF FE 00 00
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],                # 00 00 FE FF
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"],  # unusual octet order (3412)
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"],  # unusual octet order (2143)
      [[0xFF, 0xFE], "UTF-16LE"],                            # FF FE
      [[0xFE, 0xFF], "UTF-16BE"],                            # FE FF
    ].freeze

    attr_accessor :result

    def initialize
      @_escDetector = /(\033|\~\{)/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset()
    end

    # Returns the detector to its initial state and resets every prober
    # created so far.
    def reset
      @result = {'encoding' => nil, 'confidence' => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = EPureAscii
      @_mLastChar = ''
      @_mEscCharSetProber.reset() if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset() }
    end

    # Feeds the next chunk of data to the detector.
    #
    # aBuf - String of raw bytes. Empty chunks are ignored and do not count
    #        as received data.
    #
    # Sets #result (and stops accepting data) as soon as a byte order mark
    # or one of the probers identifies the encoding.
    def feed(aBuf)
      return if @done

      # 0 is truthy in Ruby, so emptiness has to be tested explicitly.
      return if aBuf.length == 0

      if not @_mGotData
        # If the data starts with BOM, we know it is UTF
        encoding = detect_bom(aBuf)
        @result = {'encoding' => encoding, 'confidence' => 1.0} if encoding
      end

      @_mGotData = true
      if @result['encoding'] && (@result['confidence'] > 0.0)
        @done = true
        return
      end

      if @_mInputState == EPureAscii
        if high_byte?(aBuf)
          @_mInputState = EHighbyte
        elsif @_escDetector =~ (@_mLastChar + aBuf)
          # The previous chunk's last character is prepended so that a "~{"
          # split across two chunks is still recognised.
          @_mInputState = EEscAscii
        end
      end

      @_mLastChar = aBuf[-1..-1]
      if @_mInputState == EEscAscii
        @_mEscCharSetProber = EscCharSetProber.new() if not @_mEscCharSetProber
        if @_mEscCharSetProber.feed(aBuf) == EFoundIt
          @result = {'encoding' => @_mEscCharSetProber.get_charset_name(),
                     'confidence' => @_mEscCharSetProber.get_confidence()}
          @done = true
        end
      elsif @_mInputState == EHighbyte
        if not @_mCharSetProbers or @_mCharSetProbers.empty?
          @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
        end
        for prober in @_mCharSetProbers
          if prober.feed(aBuf) == EFoundIt
            @result = {'encoding' => prober.get_charset_name(),
                       'confidence' => prober.get_confidence()}
            @done = true
            break
          end
        end
      end
    end

    # Finishes detection.
    #
    # Returns the result Hash when the input was pure ASCII or when the most
    # confident prober exceeds MINIMUM_THRESHOLD. Otherwise #result is left
    # unchanged; the return value is nil when detection had already
    # finished, when no data was received, and whenever $debug is unset.
    def close
      return if @done
      if not @_mGotData
        $stderr << "no data received!\n" if $debug
        return
      end
      @done = true

      if @_mInputState == EPureAscii
        @result = {'encoding' => 'ascii', 'confidence' => 1.0}
        return @result
      end

      if @_mInputState == EHighbyte
        confidences = {}
        @_mCharSetProbers.each { |prober| confidences[prober] = prober.get_confidence }
        maxProber = @_mCharSetProbers.max { |a, b| confidences[a] <=> confidences[b] }
        if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
          @result = {'encoding' => maxProber.get_charset_name(),
                     'confidence' => maxProber.get_confidence()}
          return @result
        end
      end

      if $debug
        $stderr << "no probers hit minimum threshhold\n"
        # The prober list is empty unless high-byte input was seen, so the
        # first entry is checked before its sub-probers are dumped.
        group = @_mCharSetProbers[0]
        if group && group.respond_to?(:_mProbers)
          for prober in group._mProbers
            next if not prober
            $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
          end
        end
      end
    end

    private

    # Returns the encoding name announced by a byte order mark at the start
    # of aBuf, or nil when there is none. Comparing byte values keeps the
    # check independent of the String's encoding tag.
    def detect_bom(aBuf)
      head = aBuf.unpack('C4')
      match = BOM_SIGNATURES.find { |signature, _name| head[0, signature.length] == signature }
      match && match[1]
    end

    # Returns true when aBuf contains at least one byte in 0x80..0xFF.
    def high_byte?(aBuf)
      aBuf.each_byte { |byte| return true if byte > 0x7F }
      false
    end
  end
end
@@ -0,0 +1,86 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Factor applied to the "not UTF-8" likelihood for every multi-byte
  # character seen (see UTF8Prober#get_confidence).
  ONE_CHAR_PROB = 0.5

  # Prober for UTF-8. It runs a coding state machine over the input and
  # counts the multi-byte characters that were completed.
  class UTF8Prober < CharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset()
    end

    # Resets the superclass state, the state machine and the character count.
    def reset
      super()
      @_mCodingSM.reset()
      @_mNumOfMBChar = 0
    end

    # Returns the name of the charset this prober detects.
    def get_charset_name
      "utf-8"
    end

    # Feeds a chunk of data to the prober and returns the resulting state.
    #
    # aBuf - String of raw bytes.
    def feed(aBuf)
      # Hand the state machine one byte at a time. split('') splits by
      # character whenever the String is treated as multi-byte, which would
      # pass whole multi-byte sequences in a single step.
      # NOTE(review): assumes next_state accepts a one-byte String, as it is
      # given by SJISProber#feed -- confirm against CodingStateMachine.
      aBuf.each_byte do |byte|
        codingState = @_mCodingSM.next_state(byte.chr)
        if codingState == EError
          @_mState = ENotMe
          break
        elsif codingState == EItsMe
          @_mState = EFoundIt
          break
        elsif codingState == EStart
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen() >= 2
        end
      end

      if get_state() == EDetecting
        @_mState = EFoundIt if get_confidence() > SHORTCUT_THRESHOLD
      end

      get_state()
    end

    # Returns the confidence as a Float. With fewer than six multi-byte
    # characters it is 1.0 - 0.99 * ONE_CHAR_PROB**count; from six
    # characters on it is a flat 0.99.
    def get_confidence
      unlike = 0.99
      return unlike unless @_mNumOfMBChar < 6

      @_mNumOfMBChar.times { unlike *= ONE_CHAR_PROB }
      1.0 - unlike
    end
  end
end