rchardet 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
+ module CharDet
31
+ FREQ_CAT_NUM = 4 # size of Latin1Prober's frequency counter: one slot per likelihood value 0..3
32
+
33
+ UDF = 0 # undefined
34
+ OTH = 1 # other
35
+ ASC = 2 # ascii capital letter
36
+ ASS = 3 # ascii small letter
37
+ ACV = 4 # accent capital vowel
38
+ ACO = 5 # accent capital other
39
+ ASV = 6 # accent small vowel
40
+ ASO = 7 # accent small other
41
+ CLASS_NUM = 8 # total classes; also the row width used to index Latin1ClassModel
42
+
43
+ Latin1_CharToClass = [
44
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
45
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
46
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
47
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
48
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
49
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
50
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
51
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
52
+ OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
53
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
54
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
55
+ ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
56
+ OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
57
+ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
58
+ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
59
+ ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
60
+ OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
61
+ OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
62
+ UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
63
+ OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
64
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
65
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
66
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
67
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
68
+ ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
69
+ ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
70
+ ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
71
+ ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
72
+ ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
73
+ ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
74
+ ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
75
+ ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
76
+ ]
77
+
78
+ # Likelihood of each class pair (row = previous class, column = current class):
+ # 0 : illegal
79
+ # 1 : very unlikely
80
+ # 2 : normal
81
+ # 3 : very likely
82
+ Latin1ClassModel = [
83
+ # UDF OTH ASC ASS ACV ACO ASV ASO
84
+ 0, 0, 0, 0, 0, 0, 0, 0, # UDF
85
+ 0, 3, 3, 3, 3, 3, 3, 3, # OTH
86
+ 0, 3, 3, 3, 3, 3, 3, 3, # ASC
87
+ 0, 3, 3, 3, 1, 1, 3, 3, # ASS
88
+ 0, 3, 3, 3, 1, 2, 1, 2, # ACV
89
+ 0, 3, 3, 3, 3, 3, 3, 3, # ACO
90
+ 0, 3, 1, 3, 1, 1, 1, 3, # ASV
91
+ 0, 3, 1, 3, 1, 1, 3, 3, # ASO
92
+ ]
93
+
94
# Single-byte prober that reports "windows-1252".
#
# Every byte is mapped to a character class through Latin1_CharToClass, and
# each (previous class, current class) pair is looked up in Latin1ClassModel
# to obtain a likelihood category (0 = illegal ... 3 = very likely).  The
# per-category counts drive get_confidence.
class Latin1Prober < CharSetProber
  def initialize
    super
    reset()
  end

  # Clears the pair statistics and then delegates to the superclass reset.
  def reset
    @_mLastCharClass = OTH
    @_mFreqCounter = [0] * FREQ_CAT_NUM
    super
  end

  # Returns the name of the charset this prober detects.
  def get_charset_name
    return "windows-1252"
  end

  # Feeds a chunk of input to the prober and returns the current state.
  #
  # aBuf:: String of raw input.  It is first passed through
  #        filter_with_english_letters (not defined in this class;
  #        presumably inherited from CharSetProber).
  #
  # The prober switches to ENotMe as soon as an illegal class pair is seen.
  def feed(aBuf)
    aBuf = filter_with_english_letters(aBuf)
    # Iterate over raw bytes.  The previous `split('')` + `c[0]` form only
    # produced an Integer on Ruby 1.8; on 1.9+ `c[0]` is a String and the
    # table lookup raised TypeError.  each_byte yields Integers everywhere.
    aBuf.each_byte do |byte|
      charClass = Latin1_CharToClass[byte]
      # Row = class of the previous byte, column = class of this byte.
      freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
      if freq == 0
        @_mState = ENotMe
        break
      end
      @_mFreqCounter[freq] += 1
      @_mLastCharClass = charClass
    end

    return get_state()
  end

  # Returns a Float confidence for the input seen so far.
  #
  # 0.01 is returned once the prober has ruled itself out (ENotMe).
  # Otherwise the score is the share of "very likely" pairs minus twenty
  # times the share of "very unlikely" pairs, clamped at 0.0 and halved.
  def get_confidence
    if get_state() == ENotMe
      return 0.01
    end

    total = @_mFreqCounter.inject(0) { |sum, count| sum + count }
    if total.zero?
      confidence = 0.0
    else
      # The 20.0 literal forces Float arithmetic for the whole expression.
      # The previous `@_mFreqCounter[3] / total` was Integer division and
      # evaluated to 0 unless every counted pair was "very likely".
      confidence = (@_mFreqCounter[3] - (@_mFreqCounter[1] * 20.0)) / total
    end
    if confidence < 0.0
      confidence = 0.0
    end
    # lower the confidence of latin1 so that other more accurate detectors
    # can take priority.
    confidence = confidence * 0.5
    return confidence
  end
end
146
+ end
@@ -0,0 +1,89 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ # Proofpoint, Inc.
14
+ #
15
+ # This library is free software; you can redistribute it and/or
16
+ # modify it under the terms of the GNU Lesser General Public
17
+ # License as published by the Free Software Foundation; either
18
+ # version 2.1 of the License, or (at your option) any later version.
19
+ #
20
+ # This library is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
+ # Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public
26
+ # License along with this library; if not, write to the Free Software
27
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
+ # 02110-1301 USA
29
+ ######################### END LICENSE BLOCK #########################
30
+
31
+ module CharDet
32
# Base class for multi-byte charset probers.
#
# This class only initialises @_mCodingSM (a coding state machine) and
# @_mDistributionAnalyzer (a character distribution analyzer) to nil;
# concrete probers are presumably expected to assign them before feed is
# called -- feed dereferences both without a nil check.
class MultiByteCharSetProber < CharSetProber
  def initialize
    super
    @_mDistributionAnalyzer = nil
    @_mCodingSM = nil
    # Two-byte scratch buffer: [0] holds the last byte of the previous
    # chunk, [1] receives the first byte of the current chunk.  It is
    # mutated in place, so take a private copy of the literal (keeps working
    # when string literals are frozen).
    @_mLastChar = "\x00\x00".dup
  end

  # Resets the superclass state, the state machine and the analyzer (when
  # they have been assigned) and clears the carry-over byte buffer.
  def reset
    super
    if @_mCodingSM
      @_mCodingSM.reset()
    end
    if @_mDistributionAnalyzer
      @_mDistributionAnalyzer.reset()
    end
    @_mLastChar = "\x00\x00".dup
  end

  # Placeholder: returns nil here.
  def get_charset_name
  end

  # Feeds a chunk of input through the coding state machine and returns the
  # current prober state.
  #
  # aBuf:: String of raw input; an empty string is accepted and leaves the
  #        carry-over byte untouched.
  #
  # Each time the state machine returns to EStart a character has been
  # completed; its last two bytes and its length are handed to the
  # distribution analyzer.
  def feed(aBuf)
    aLen = aBuf.length
    for i in (0...aLen)
      codingState = @_mCodingSM.next_state(aBuf[i..i])
      if codingState == EError
        # NOTE(review): $debug is an ordinary global, not Ruby's built-in
        # $DEBUG; it is not assigned anywhere in this class.
        $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
        @_mState = ENotMe
        break
      elsif codingState == EItsMe
        @_mState = EFoundIt
        break
      elsif codingState == EStart
        charLen = @_mCodingSM.get_current_charlen()
        if i == 0
          # The character may have started in the previous chunk: pair the
          # remembered byte with the first byte of this chunk.
          @_mLastChar[1] = aBuf[0..0]
          @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
        else
          @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
        end
      end
    end
    # Remember the final byte for the next call.  Guarded because for an
    # empty buffer aBuf[-1..-1] is nil and the assignment raised TypeError.
    if aLen > 0
      @_mLastChar[0] = aBuf[aLen-1..aLen-1]
    end

    if get_state() == EDetecting
      if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
        @_mState = EFoundIt
      end
    end
    return get_state()
  end

  # Returns the confidence reported by the distribution analyzer.
  def get_confidence
    return @_mDistributionAnalyzer.get_confidence()
  end
end
89
+ end
@@ -0,0 +1,47 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ # Proofpoint, Inc.
14
+ #
15
+ # This library is free software; you can redistribute it and/or
16
+ # modify it under the terms of the GNU Lesser General Public
17
+ # License as published by the Free Software Foundation; either
18
+ # version 2.1 of the License, or (at your option) any later version.
19
+ #
20
+ # This library is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
+ # Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public
26
+ # License along with this library; if not, write to the Free Software
27
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
+ # 02110-1301 USA
29
+ ######################### END LICENSE BLOCK #########################
30
+
31
+ module CharDet
32
# Group prober holding one instance of every multi-byte charset prober
# listed below.  How the group is consulted is defined by CharSetGroupProber,
# which is not part of this file.
class MBCSGroupProber < CharSetGroupProber
  def initialize
    super
    # Instantiate the member probers in this fixed order.
    prober_classes = [
      UTF8Prober,
      SJISProber,
      EUCJPProber,
      GB2312Prober,
      EUCKRProber,
      Big5Prober,
      EUCTWProber
    ]
    @_mProbers = prober_classes.map { |prober_class| prober_class.new }
    reset()
  end
end
47
+ end
@@ -0,0 +1,542 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ module CharDet
30
+ # BIG5
31
+
32
+ BIG5_cls = [
33
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
34
+ 1,1,1,1,1,1,0,0, # 08 - 0f
35
+ 1,1,1,1,1,1,1,1, # 10 - 17
36
+ 1,1,1,0,1,1,1,1, # 18 - 1f
37
+ 1,1,1,1,1,1,1,1, # 20 - 27
38
+ 1,1,1,1,1,1,1,1, # 28 - 2f
39
+ 1,1,1,1,1,1,1,1, # 30 - 37
40
+ 1,1,1,1,1,1,1,1, # 38 - 3f
41
+ 2,2,2,2,2,2,2,2, # 40 - 47
42
+ 2,2,2,2,2,2,2,2, # 48 - 4f
43
+ 2,2,2,2,2,2,2,2, # 50 - 57
44
+ 2,2,2,2,2,2,2,2, # 58 - 5f
45
+ 2,2,2,2,2,2,2,2, # 60 - 67
46
+ 2,2,2,2,2,2,2,2, # 68 - 6f
47
+ 2,2,2,2,2,2,2,2, # 70 - 77
48
+ 2,2,2,2,2,2,2,1, # 78 - 7f
49
+ 4,4,4,4,4,4,4,4, # 80 - 87
50
+ 4,4,4,4,4,4,4,4, # 88 - 8f
51
+ 4,4,4,4,4,4,4,4, # 90 - 97
52
+ 4,4,4,4,4,4,4,4, # 98 - 9f
53
+ 4,3,3,3,3,3,3,3, # a0 - a7
54
+ 3,3,3,3,3,3,3,3, # a8 - af
55
+ 3,3,3,3,3,3,3,3, # b0 - b7
56
+ 3,3,3,3,3,3,3,3, # b8 - bf
57
+ 3,3,3,3,3,3,3,3, # c0 - c7
58
+ 3,3,3,3,3,3,3,3, # c8 - cf
59
+ 3,3,3,3,3,3,3,3, # d0 - d7
60
+ 3,3,3,3,3,3,3,3, # d8 - df
61
+ 3,3,3,3,3,3,3,3, # e0 - e7
62
+ 3,3,3,3,3,3,3,3, # e8 - ef
63
+ 3,3,3,3,3,3,3,3, # f0 - f7
64
+ 3,3,3,3,3,3,3,0 # f8 - ff
65
+ ]
66
+
67
+ BIG5_st = [
68
+ EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
69
+ EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,#08-0f
70
+ EError,EStart,EStart,EStart,EStart,EStart,EStart,EStart #10-17
71
+ ]
72
+
73
+ Big5CharLenTable = [0, 1, 1, 2, 0]
74
+
75
+ Big5SMModel = {'classTable' => BIG5_cls,
76
+ 'classFactor' => 5,
77
+ 'stateTable' => BIG5_st,
78
+ 'charLenTable' => Big5CharLenTable,
79
+ 'name' => 'Big5'
80
+ }
81
+
82
+ # EUC-JP
83
+
84
+ EUCJP_cls = [
85
+ 4,4,4,4,4,4,4,4, # 00 - 07
86
+ 4,4,4,4,4,4,5,5, # 08 - 0f
87
+ 4,4,4,4,4,4,4,4, # 10 - 17
88
+ 4,4,4,5,4,4,4,4, # 18 - 1f
89
+ 4,4,4,4,4,4,4,4, # 20 - 27
90
+ 4,4,4,4,4,4,4,4, # 28 - 2f
91
+ 4,4,4,4,4,4,4,4, # 30 - 37
92
+ 4,4,4,4,4,4,4,4, # 38 - 3f
93
+ 4,4,4,4,4,4,4,4, # 40 - 47
94
+ 4,4,4,4,4,4,4,4, # 48 - 4f
95
+ 4,4,4,4,4,4,4,4, # 50 - 57
96
+ 4,4,4,4,4,4,4,4, # 58 - 5f
97
+ 4,4,4,4,4,4,4,4, # 60 - 67
98
+ 4,4,4,4,4,4,4,4, # 68 - 6f
99
+ 4,4,4,4,4,4,4,4, # 70 - 77
100
+ 4,4,4,4,4,4,4,4, # 78 - 7f
101
+ 5,5,5,5,5,5,5,5, # 80 - 87
102
+ 5,5,5,5,5,5,1,3, # 88 - 8f
103
+ 5,5,5,5,5,5,5,5, # 90 - 97
104
+ 5,5,5,5,5,5,5,5, # 98 - 9f
105
+ 5,2,2,2,2,2,2,2, # a0 - a7
106
+ 2,2,2,2,2,2,2,2, # a8 - af
107
+ 2,2,2,2,2,2,2,2, # b0 - b7
108
+ 2,2,2,2,2,2,2,2, # b8 - bf
109
+ 2,2,2,2,2,2,2,2, # c0 - c7
110
+ 2,2,2,2,2,2,2,2, # c8 - cf
111
+ 2,2,2,2,2,2,2,2, # d0 - d7
112
+ 2,2,2,2,2,2,2,2, # d8 - df
113
+ 0,0,0,0,0,0,0,0, # e0 - e7
114
+ 0,0,0,0,0,0,0,0, # e8 - ef
115
+ 0,0,0,0,0,0,0,0, # f0 - f7
116
+ 0,0,0,0,0,0,0,5 # f8 - ff
117
+ ]
118
+
119
+ EUCJP_st = [
120
+ 3, 4, 3, 5,EStart,EError,EError,EError,#00-07
121
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
122
+ EItsMe,EItsMe,EStart,EError,EStart,EError,EError,EError,#10-17
123
+ EError,EError,EStart,EError,EError,EError, 3,EError,#18-1f
124
+ 3,EError,EError,EError,EStart,EStart,EStart,EStart #20-27
125
+ ]
126
+
127
+ EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
128
+
129
+ EUCJPSMModel = {'classTable' => EUCJP_cls,
130
+ 'classFactor' => 6,
131
+ 'stateTable' => EUCJP_st,
132
+ 'charLenTable' => EUCJPCharLenTable,
133
+ 'name' => 'EUC-JP'
134
+ }
135
+
136
+ # EUC-KR
137
+
138
+ EUCKR_cls = [
139
+ 1,1,1,1,1,1,1,1, # 00 - 07
140
+ 1,1,1,1,1,1,0,0, # 08 - 0f
141
+ 1,1,1,1,1,1,1,1, # 10 - 17
142
+ 1,1,1,0,1,1,1,1, # 18 - 1f
143
+ 1,1,1,1,1,1,1,1, # 20 - 27
144
+ 1,1,1,1,1,1,1,1, # 28 - 2f
145
+ 1,1,1,1,1,1,1,1, # 30 - 37
146
+ 1,1,1,1,1,1,1,1, # 38 - 3f
147
+ 1,1,1,1,1,1,1,1, # 40 - 47
148
+ 1,1,1,1,1,1,1,1, # 48 - 4f
149
+ 1,1,1,1,1,1,1,1, # 50 - 57
150
+ 1,1,1,1,1,1,1,1, # 58 - 5f
151
+ 1,1,1,1,1,1,1,1, # 60 - 67
152
+ 1,1,1,1,1,1,1,1, # 68 - 6f
153
+ 1,1,1,1,1,1,1,1, # 70 - 77
154
+ 1,1,1,1,1,1,1,1, # 78 - 7f
155
+ 0,0,0,0,0,0,0,0, # 80 - 87
156
+ 0,0,0,0,0,0,0,0, # 88 - 8f
157
+ 0,0,0,0,0,0,0,0, # 90 - 97
158
+ 0,0,0,0,0,0,0,0, # 98 - 9f
159
+ 0,2,2,2,2,2,2,2, # a0 - a7
160
+ 2,2,2,2,2,3,3,3, # a8 - af
161
+ 2,2,2,2,2,2,2,2, # b0 - b7
162
+ 2,2,2,2,2,2,2,2, # b8 - bf
163
+ 2,2,2,2,2,2,2,2, # c0 - c7
164
+ 2,3,2,2,2,2,2,2, # c8 - cf
165
+ 2,2,2,2,2,2,2,2, # d0 - d7
166
+ 2,2,2,2,2,2,2,2, # d8 - df
167
+ 2,2,2,2,2,2,2,2, # e0 - e7
168
+ 2,2,2,2,2,2,2,2, # e8 - ef
169
+ 2,2,2,2,2,2,2,2, # f0 - f7
170
+ 2,2,2,2,2,2,2,0 # f8 - ff
171
+ ]
172
+
173
+ EUCKR_st = [
174
+ EError,EStart, 3,EError,EError,EError,EError,EError,#00-07
175
+ EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,EStart#08-0f
176
+ ]
177
+
178
+ EUCKRCharLenTable = [0, 1, 2, 0]
179
+
180
+ EUCKRSMModel = {'classTable' => EUCKR_cls,
181
+ 'classFactor' => 4,
182
+ 'stateTable' => EUCKR_st,
183
+ 'charLenTable' => EUCKRCharLenTable,
184
+ 'name' => 'EUC-KR'
185
+ }
186
+
187
+ # EUC-TW
188
+
189
+ EUCTW_cls = [
190
+ 2,2,2,2,2,2,2,2, # 00 - 07
191
+ 2,2,2,2,2,2,0,0, # 08 - 0f
192
+ 2,2,2,2,2,2,2,2, # 10 - 17
193
+ 2,2,2,0,2,2,2,2, # 18 - 1f
194
+ 2,2,2,2,2,2,2,2, # 20 - 27
195
+ 2,2,2,2,2,2,2,2, # 28 - 2f
196
+ 2,2,2,2,2,2,2,2, # 30 - 37
197
+ 2,2,2,2,2,2,2,2, # 38 - 3f
198
+ 2,2,2,2,2,2,2,2, # 40 - 47
199
+ 2,2,2,2,2,2,2,2, # 48 - 4f
200
+ 2,2,2,2,2,2,2,2, # 50 - 57
201
+ 2,2,2,2,2,2,2,2, # 58 - 5f
202
+ 2,2,2,2,2,2,2,2, # 60 - 67
203
+ 2,2,2,2,2,2,2,2, # 68 - 6f
204
+ 2,2,2,2,2,2,2,2, # 70 - 77
205
+ 2,2,2,2,2,2,2,2, # 78 - 7f
206
+ 0,0,0,0,0,0,0,0, # 80 - 87
207
+ 0,0,0,0,0,0,6,0, # 88 - 8f
208
+ 0,0,0,0,0,0,0,0, # 90 - 97
209
+ 0,0,0,0,0,0,0,0, # 98 - 9f
210
+ 0,3,4,4,4,4,4,4, # a0 - a7
211
+ 5,5,1,1,1,1,1,1, # a8 - af
212
+ 1,1,1,1,1,1,1,1, # b0 - b7
213
+ 1,1,1,1,1,1,1,1, # b8 - bf
214
+ 1,1,3,1,3,3,3,3, # c0 - c7
215
+ 3,3,3,3,3,3,3,3, # c8 - cf
216
+ 3,3,3,3,3,3,3,3, # d0 - d7
217
+ 3,3,3,3,3,3,3,3, # d8 - df
218
+ 3,3,3,3,3,3,3,3, # e0 - e7
219
+ 3,3,3,3,3,3,3,3, # e8 - ef
220
+ 3,3,3,3,3,3,3,3, # f0 - f7
221
+ 3,3,3,3,3,3,3,0 # f8 - ff
222
+ ]
223
+
224
+ EUCTW_st = [
225
+ EError,EError,EStart, 3, 3, 3, 4,EError,#00-07
226
+ EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
227
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EStart,EError,#10-17
228
+ EStart,EStart,EStart,EError,EError,EError,EError,EError,#18-1f
229
+ 5,EError,EError,EError,EStart,EError,EStart,EStart,#20-27
230
+ EStart,EError,EStart,EStart,EStart,EStart,EStart,EStart #28-2f
231
+ ]
232
+
233
+ EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
234
+
235
+ EUCTWSMModel = {'classTable' => EUCTW_cls,
236
+ 'classFactor' => 7,
237
+ 'stateTable' => EUCTW_st,
238
+ 'charLenTable' => EUCTWCharLenTable,
239
+ 'name' => 'x-euc-tw'
240
+ }
241
+
242
+ # GB2312
243
+
244
+ GB2312_cls = [
245
+ 1,1,1,1,1,1,1,1, # 00 - 07
246
+ 1,1,1,1,1,1,0,0, # 08 - 0f
247
+ 1,1,1,1,1,1,1,1, # 10 - 17
248
+ 1,1,1,0,1,1,1,1, # 18 - 1f
249
+ 1,1,1,1,1,1,1,1, # 20 - 27
250
+ 1,1,1,1,1,1,1,1, # 28 - 2f
251
+ 3,3,3,3,3,3,3,3, # 30 - 37
252
+ 3,3,1,1,1,1,1,1, # 38 - 3f
253
+ 2,2,2,2,2,2,2,2, # 40 - 47
254
+ 2,2,2,2,2,2,2,2, # 48 - 4f
255
+ 2,2,2,2,2,2,2,2, # 50 - 57
256
+ 2,2,2,2,2,2,2,2, # 58 - 5f
257
+ 2,2,2,2,2,2,2,2, # 60 - 67
258
+ 2,2,2,2,2,2,2,2, # 68 - 6f
259
+ 2,2,2,2,2,2,2,2, # 70 - 77
260
+ 2,2,2,2,2,2,2,4, # 78 - 7f
261
+ 5,6,6,6,6,6,6,6, # 80 - 87
262
+ 6,6,6,6,6,6,6,6, # 88 - 8f
263
+ 6,6,6,6,6,6,6,6, # 90 - 97
264
+ 6,6,6,6,6,6,6,6, # 98 - 9f
265
+ 6,6,6,6,6,6,6,6, # a0 - a7
266
+ 6,6,6,6,6,6,6,6, # a8 - af
267
+ 6,6,6,6,6,6,6,6, # b0 - b7
268
+ 6,6,6,6,6,6,6,6, # b8 - bf
269
+ 6,6,6,6,6,6,6,6, # c0 - c7
270
+ 6,6,6,6,6,6,6,6, # c8 - cf
271
+ 6,6,6,6,6,6,6,6, # d0 - d7
272
+ 6,6,6,6,6,6,6,6, # d8 - df
273
+ 6,6,6,6,6,6,6,6, # e0 - e7
274
+ 6,6,6,6,6,6,6,6, # e8 - ef
275
+ 6,6,6,6,6,6,6,6, # f0 - f7
276
+ 6,6,6,6,6,6,6,0 # f8 - ff
277
+ ]
278
+
279
+ GB2312_st = [
280
+ EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
281
+ EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
282
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,#10-17
283
+ 4,EError,EStart,EStart,EError,EError,EError,EError,#18-1f
284
+ EError,EError, 5,EError,EError,EError,EItsMe,EError,#20-27
285
+ EError,EError,EStart,EStart,EStart,EStart,EStart,EStart#28-2f
286
+ ]
287
+
288
+ # To be accurate, the length of class 6 can be either 2 or 4.
289
+ # But it is not necessary to discriminate between the two since
290
+ # it is used for frequency analysis only, and we are validating
291
+ # each code range there as well. So it is safe to set it to be
292
+ # 2 here.
293
+ GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
294
+
295
+ GB2312SMModel = {'classTable' => GB2312_cls,
296
+ 'classFactor' => 7,
297
+ 'stateTable' => GB2312_st,
298
+ 'charLenTable' => GB2312CharLenTable,
299
+ 'name' => 'GB2312'
300
+ }
301
+
302
+ # Shift_JIS
303
+
304
+ SJIS_cls = [
305
+ 1,1,1,1,1,1,1,1, # 00 - 07
306
+ 1,1,1,1,1,1,0,0, # 08 - 0f
307
+ 1,1,1,1,1,1,1,1, # 10 - 17
308
+ 1,1,1,0,1,1,1,1, # 18 - 1f
309
+ 1,1,1,1,1,1,1,1, # 20 - 27
310
+ 1,1,1,1,1,1,1,1, # 28 - 2f
311
+ 1,1,1,1,1,1,1,1, # 30 - 37
312
+ 1,1,1,1,1,1,1,1, # 38 - 3f
313
+ 2,2,2,2,2,2,2,2, # 40 - 47
314
+ 2,2,2,2,2,2,2,2, # 48 - 4f
315
+ 2,2,2,2,2,2,2,2, # 50 - 57
316
+ 2,2,2,2,2,2,2,2, # 58 - 5f
317
+ 2,2,2,2,2,2,2,2, # 60 - 67
318
+ 2,2,2,2,2,2,2,2, # 68 - 6f
319
+ 2,2,2,2,2,2,2,2, # 70 - 77
320
+ 2,2,2,2,2,2,2,1, # 78 - 7f
321
+ 3,3,3,3,3,3,3,3, # 80 - 87
322
+ 3,3,3,3,3,3,3,3, # 88 - 8f
323
+ 3,3,3,3,3,3,3,3, # 90 - 97
324
+ 3,3,3,3,3,3,3,3, # 98 - 9f
325
+ #0xa0 is illegal in sjis encoding, but some pages do
326
+ #contain such bytes. We need to be more error forgiving.
327
+ 2,2,2,2,2,2,2,2, # a0 - a7
328
+ 2,2,2,2,2,2,2,2, # a8 - af
329
+ 2,2,2,2,2,2,2,2, # b0 - b7
330
+ 2,2,2,2,2,2,2,2, # b8 - bf
331
+ 2,2,2,2,2,2,2,2, # c0 - c7
332
+ 2,2,2,2,2,2,2,2, # c8 - cf
333
+ 2,2,2,2,2,2,2,2, # d0 - d7
334
+ 2,2,2,2,2,2,2,2, # d8 - df
335
+ 3,3,3,3,3,3,3,3, # e0 - e7
336
+ 3,3,3,3,3,4,4,4, # e8 - ef
337
+ 4,4,4,4,4,4,4,4, # f0 - f7
338
+ 4,4,4,4,4,0,0,0 # f8 - ff
339
+ ]
340
+
341
+ SJIS_st = [
342
+ EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
343
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
344
+ EItsMe,EItsMe,EError,EError,EStart,EStart,EStart,EStart#10-17
345
+ ]
346
+
347
+ SJISCharLenTable = [0, 1, 1, 2, 0, 0]
348
+
349
+ SJISSMModel = {'classTable' => SJIS_cls,
350
+ 'classFactor' => 6,
351
+ 'stateTable' => SJIS_st,
352
+ 'charLenTable' => SJISCharLenTable,
353
+ 'name' => 'Shift_JIS'
354
+ }
355
+
356
+ # UCS2-BE
357
+
358
+ UCS2BE_cls = [
359
+ 0,0,0,0,0,0,0,0, # 00 - 07
360
+ 0,0,1,0,0,2,0,0, # 08 - 0f
361
+ 0,0,0,0,0,0,0,0, # 10 - 17
362
+ 0,0,0,3,0,0,0,0, # 18 - 1f
363
+ 0,0,0,0,0,0,0,0, # 20 - 27
364
+ 0,3,3,3,3,3,0,0, # 28 - 2f
365
+ 0,0,0,0,0,0,0,0, # 30 - 37
366
+ 0,0,0,0,0,0,0,0, # 38 - 3f
367
+ 0,0,0,0,0,0,0,0, # 40 - 47
368
+ 0,0,0,0,0,0,0,0, # 48 - 4f
369
+ 0,0,0,0,0,0,0,0, # 50 - 57
370
+ 0,0,0,0,0,0,0,0, # 58 - 5f
371
+ 0,0,0,0,0,0,0,0, # 60 - 67
372
+ 0,0,0,0,0,0,0,0, # 68 - 6f
373
+ 0,0,0,0,0,0,0,0, # 70 - 77
374
+ 0,0,0,0,0,0,0,0, # 78 - 7f
375
+ 0,0,0,0,0,0,0,0, # 80 - 87
376
+ 0,0,0,0,0,0,0,0, # 88 - 8f
377
+ 0,0,0,0,0,0,0,0, # 90 - 97
378
+ 0,0,0,0,0,0,0,0, # 98 - 9f
379
+ 0,0,0,0,0,0,0,0, # a0 - a7
380
+ 0,0,0,0,0,0,0,0, # a8 - af
381
+ 0,0,0,0,0,0,0,0, # b0 - b7
382
+ 0,0,0,0,0,0,0,0, # b8 - bf
383
+ 0,0,0,0,0,0,0,0, # c0 - c7
384
+ 0,0,0,0,0,0,0,0, # c8 - cf
385
+ 0,0,0,0,0,0,0,0, # d0 - d7
386
+ 0,0,0,0,0,0,0,0, # d8 - df
387
+ 0,0,0,0,0,0,0,0, # e0 - e7
388
+ 0,0,0,0,0,0,0,0, # e8 - ef
389
+ 0,0,0,0,0,0,0,0, # f0 - f7
390
+ 0,0,0,0,0,0,4,5 # f8 - ff
391
+ ]
392
+
393
+ UCS2BE_st = [
394
+ 5, 7, 7,EError, 4, 3,EError,EError,#00-07
395
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
396
+ EItsMe,EItsMe, 6, 6, 6, 6,EError,EError,#10-17
397
+ 6, 6, 6, 6, 6,EItsMe, 6, 6,#18-1f
398
+ 6, 6, 6, 6, 5, 7, 7,EError,#20-27
399
+ 5, 8, 6, 6,EError, 6, 6, 6,#28-2f
400
+ 6, 6, 6, 6,EError,EError,EStart,EStart#30-37
401
+ ]
402
+
403
+ UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
404
+
405
+ UCS2BESMModel = {'classTable' => UCS2BE_cls,
406
+ 'classFactor' => 6,
407
+ 'stateTable' => UCS2BE_st,
408
+ 'charLenTable' => UCS2BECharLenTable,
409
+ 'name' => 'UTF-16BE'
410
+ }
411
+
412
+ # UCS2-LE
413
+
414
+ UCS2LE_cls = [
415
+ 0,0,0,0,0,0,0,0, # 00 - 07
416
+ 0,0,1,0,0,2,0,0, # 08 - 0f
417
+ 0,0,0,0,0,0,0,0, # 10 - 17
418
+ 0,0,0,3,0,0,0,0, # 18 - 1f
419
+ 0,0,0,0,0,0,0,0, # 20 - 27
420
+ 0,3,3,3,3,3,0,0, # 28 - 2f
421
+ 0,0,0,0,0,0,0,0, # 30 - 37
422
+ 0,0,0,0,0,0,0,0, # 38 - 3f
423
+ 0,0,0,0,0,0,0,0, # 40 - 47
424
+ 0,0,0,0,0,0,0,0, # 48 - 4f
425
+ 0,0,0,0,0,0,0,0, # 50 - 57
426
+ 0,0,0,0,0,0,0,0, # 58 - 5f
427
+ 0,0,0,0,0,0,0,0, # 60 - 67
428
+ 0,0,0,0,0,0,0,0, # 68 - 6f
429
+ 0,0,0,0,0,0,0,0, # 70 - 77
430
+ 0,0,0,0,0,0,0,0, # 78 - 7f
431
+ 0,0,0,0,0,0,0,0, # 80 - 87
432
+ 0,0,0,0,0,0,0,0, # 88 - 8f
433
+ 0,0,0,0,0,0,0,0, # 90 - 97
434
+ 0,0,0,0,0,0,0,0, # 98 - 9f
435
+ 0,0,0,0,0,0,0,0, # a0 - a7
436
+ 0,0,0,0,0,0,0,0, # a8 - af
437
+ 0,0,0,0,0,0,0,0, # b0 - b7
438
+ 0,0,0,0,0,0,0,0, # b8 - bf
439
+ 0,0,0,0,0,0,0,0, # c0 - c7
440
+ 0,0,0,0,0,0,0,0, # c8 - cf
441
+ 0,0,0,0,0,0,0,0, # d0 - d7
442
+ 0,0,0,0,0,0,0,0, # d8 - df
443
+ 0,0,0,0,0,0,0,0, # e0 - e7
444
+ 0,0,0,0,0,0,0,0, # e8 - ef
445
+ 0,0,0,0,0,0,0,0, # f0 - f7
446
+ 0,0,0,0,0,0,4,5 # f8 - ff
447
+ ]
448
+
449
+ UCS2LE_st = [
450
+ 6, 6, 7, 6, 4, 3,EError,EError,#00-07
451
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
452
+ EItsMe,EItsMe, 5, 5, 5,EError,EItsMe,EError,#10-17
453
+ 5, 5, 5,EError, 5,EError, 6, 6,#18-1f
454
+ 7, 6, 8, 8, 5, 5, 5,EError,#20-27
455
+ 5, 5, 5,EError,EError,EError, 5, 5,#28-2f
456
+ 5, 5, 5,EError, 5,EError,EStart,EStart#30-37
457
+ ]
458
+
459
+ UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
460
+
461
+ UCS2LESMModel = {'classTable' => UCS2LE_cls,
462
+ 'classFactor' => 6,
463
+ 'stateTable' => UCS2LE_st,
464
+ 'charLenTable' => UCS2LECharLenTable,
465
+ 'name' => 'UTF-16LE'
466
+ }
467
+
468
+ # UTF-8
469
+
470
+ UTF8_cls = [
471
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
472
+ 1,1,1,1,1,1,0,0, # 08 - 0f
473
+ 1,1,1,1,1,1,1,1, # 10 - 17
474
+ 1,1,1,0,1,1,1,1, # 18 - 1f
475
+ 1,1,1,1,1,1,1,1, # 20 - 27
476
+ 1,1,1,1,1,1,1,1, # 28 - 2f
477
+ 1,1,1,1,1,1,1,1, # 30 - 37
478
+ 1,1,1,1,1,1,1,1, # 38 - 3f
479
+ 1,1,1,1,1,1,1,1, # 40 - 47
480
+ 1,1,1,1,1,1,1,1, # 48 - 4f
481
+ 1,1,1,1,1,1,1,1, # 50 - 57
482
+ 1,1,1,1,1,1,1,1, # 58 - 5f
483
+ 1,1,1,1,1,1,1,1, # 60 - 67
484
+ 1,1,1,1,1,1,1,1, # 68 - 6f
485
+ 1,1,1,1,1,1,1,1, # 70 - 77
486
+ 1,1,1,1,1,1,1,1, # 78 - 7f
487
+ 2,2,2,2,3,3,3,3, # 80 - 87
488
+ 4,4,4,4,4,4,4,4, # 88 - 8f
489
+ 4,4,4,4,4,4,4,4, # 90 - 97
490
+ 4,4,4,4,4,4,4,4, # 98 - 9f
491
+ 5,5,5,5,5,5,5,5, # a0 - a7
492
+ 5,5,5,5,5,5,5,5, # a8 - af
493
+ 5,5,5,5,5,5,5,5, # b0 - b7
494
+ 5,5,5,5,5,5,5,5, # b8 - bf
495
+ 0,0,6,6,6,6,6,6, # c0 - c7
496
+ 6,6,6,6,6,6,6,6, # c8 - cf
497
+ 6,6,6,6,6,6,6,6, # d0 - d7
498
+ 6,6,6,6,6,6,6,6, # d8 - df
499
+ 7,8,8,8,8,8,8,8, # e0 - e7
500
+ 8,8,8,8,8,9,8,8, # e8 - ef
501
+ 10,11,11,11,11,11,11,11, # f0 - f7
502
+ 12,13,13,13,14,15,0,0 # f8 - ff
503
+ ]
504
+
505
+ UTF8_st = [
506
+ EError,EStart,EError,EError,EError,EError, 12, 10,#00-07
507
+ 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
508
+ EError,EError,EError,EError,EError,EError,EError,EError,#10-17
509
+ EError,EError,EError,EError,EError,EError,EError,EError,#18-1f
510
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#20-27
511
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#28-2f
512
+ EError,EError, 5, 5, 5, 5,EError,EError,#30-37
513
+ EError,EError,EError,EError,EError,EError,EError,EError,#38-3f
514
+ EError,EError,EError, 5, 5, 5,EError,EError,#40-47
515
+ EError,EError,EError,EError,EError,EError,EError,EError,#48-4f
516
+ EError,EError, 7, 7, 7, 7,EError,EError,#50-57
517
+ EError,EError,EError,EError,EError,EError,EError,EError,#58-5f
518
+ EError,EError,EError,EError, 7, 7,EError,EError,#60-67
519
+ EError,EError,EError,EError,EError,EError,EError,EError,#68-6f
520
+ EError,EError, 9, 9, 9, 9,EError,EError,#70-77
521
+ EError,EError,EError,EError,EError,EError,EError,EError,#78-7f
522
+ EError,EError,EError,EError,EError, 9,EError,EError,#80-87
523
+ EError,EError,EError,EError,EError,EError,EError,EError,#88-8f
524
+ EError,EError, 12, 12, 12, 12,EError,EError,#90-97
525
+ EError,EError,EError,EError,EError,EError,EError,EError,#98-9f
526
+ EError,EError,EError,EError,EError, 12,EError,EError,#a0-a7
527
+ EError,EError,EError,EError,EError,EError,EError,EError,#a8-af
528
+ EError,EError, 12, 12, 12,EError,EError,EError,#b0-b7
529
+ EError,EError,EError,EError,EError,EError,EError,EError,#b8-bf
530
+ EError,EError,EStart,EStart,EStart,EStart,EError,EError,#c0-c7
531
+ EError,EError,EError,EError,EError,EError,EError,EError#c8-cf
532
+ ]
533
+
534
+ UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
535
+
536
+ UTF8SMModel = {'classTable' => UTF8_cls,
537
+ 'classFactor' => 16,
538
+ 'stateTable' => UTF8_st,
539
+ 'charLenTable' => UTF8CharLenTable,
540
+ 'name' => 'UTF-8'
541
+ }
542
+ end