chardet 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,160 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module Enumerable
  # Folds the collection into a single value.
  #
  # Two call shapes are supported:
  #
  # * Legacy chardet form, reduce(fn, res): +fn+ is a method name (Symbol or
  #   String) and +res+ the initial accumulator. For every element +n+ the
  #   accumulator becomes res.send(fn, n); the final accumulator is returned.
  # * Any other call shape (block form, reduce(sym), reduce(initial, sym))
  #   is delegated to Enumerable#inject.
  #
  # The original definition required exactly (fn, res), which replaced the
  # built-in Enumerable#reduce for the whole process and made the standard
  # call shapes raise ArgumentError. Delegating keeps both worlds working.
  #
  # A two-argument call is treated as the legacy form when the first
  # argument is a Symbol/String and the second is not a Symbol.
  def reduce(*args, &block)
    legacy_call = args.length == 2 &&
                  (args[0].is_a?(Symbol) || args[0].is_a?(String)) &&
                  !args[1].is_a?(Symbol)
    return inject(*args, &block) unless legacy_call

    fn, res = args
    each { |n| res = res.send(fn, n) }
    res
  end

  # Folds the collection with a block: starting from +res+, each element +n+
  # replaces the accumulator with the value of yield(res, n). Returns the
  # final accumulator (+res+ itself when the collection is empty).
  def reduceBlock(res)
    each { |n| res = yield(res, n) }
    res
  end
end
43
+
44
+ module UniversalDetector
45
# Number of frequency categories stored in Latin1ClassModel (values 0..3);
# Latin1Prober#reset sizes its counter array with it.
FREQ_CAT_NUM = 4

# Character classes assigned to each byte by Latin1_CharToClass.
UDF = 0       # undefined
OTH = 1       # other
ASC = 2       # ascii capital letter
ASS = 3       # ascii small letter
ACV = 4       # accent capital vowel
ACO = 5       # accent capital other
ASV = 6       # accent small vowel
ASO = 7       # accent small other
CLASS_NUM = 8 # total classes

# Byte value -> character class; one entry per byte (256 in total).
# The trailing comment on each row is the hex range of byte values it covers.
Latin1_CharToClass = [
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   # 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   # 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   # 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   # 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   # 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   # C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   # C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   # D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   # D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   # E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   # E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   # F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO    # F8 - FF
]

# Likelihood of one character class following another, stored as a flat
# CLASS_NUM x CLASS_NUM matrix: row = previous class, column = current class
# (Latin1Prober#feed reads it as [last_class * CLASS_NUM + current_class]).
#
# Cell values:
#   0 : illegal
#   1 : very unlikely
#   2 : normal
#   3 : very likely
Latin1ClassModel = [
# UDF OTH ASC ASS ACV ACO ASV ASO
  0,  0,  0,  0,  0,  0,  0,  0,   # UDF
  0,  3,  3,  3,  3,  3,  3,  3,   # OTH
  0,  3,  3,  3,  3,  3,  3,  3,   # ASC
  0,  3,  3,  3,  1,  1,  3,  3,   # ASS
  0,  3,  3,  3,  1,  2,  1,  2,   # ACV
  0,  3,  3,  3,  3,  3,  3,  3,   # ACO
  0,  3,  1,  3,  1,  1,  1,  3,   # ASV
  0,  3,  1,  3,  1,  1,  3,  3    # ASO
]
107
+
108
# Prober for windows-1252 (Latin-1) text.
#
# Every byte fed in is mapped to a character class (Latin1_CharToClass) and
# the transition from the previous class to the current one is looked up in
# Latin1ClassModel. The prober counts how often each likelihood category
# (1 = very unlikely, 2 = normal, 3 = very likely) occurs and turns those
# counts into a confidence value. A transition rated 0 (illegal) rules the
# charset out.
class Latin1Prober < CharSetProber
  def initialize
    super
    reset()
  end

  # Clears the per-stream statistics, then lets the superclass reset its own
  # state.
  def reset
    @_mLastCharClass = OTH
    @_mFreqCounter = [0] * FREQ_CAT_NUM
    super
  end

  def get_charset_name
    return "windows-1252"
  end

  # Feeds a chunk of data to the prober and returns the prober state.
  #
  # The buffer is walked byte by byte. The original `for c in aBuf` with
  # `c[0]` only produced byte values on Ruby 1.8 (and iterated lines rather
  # than bytes for a String); on Ruby 1.9+ a String has no #each at all.
  def feed(aBuf)
    aBuf = filter_with_english_letters(aBuf)
    latin1_buffer_bytes(aBuf).each do |byte|
      charClass = Latin1_CharToClass[byte]
      freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
      if freq == 0
        # Illegal class transition: this cannot be windows-1252.
        @_mState = :NotMe
        break
      end
      @_mFreqCounter[freq] += 1
      @_mLastCharClass = charClass
    end

    return get_state()
  end

  # Returns the confidence (a Float) that the data seen so far is
  # windows-1252:
  #
  #   (very_likely - 20 * very_unlikely) / total, clamped at 0, then halved
  #
  # Returns 0.01 once the prober has been ruled out and 0.0 when nothing has
  # been counted yet.
  def get_confidence()
    return 0.01 if get_state() == :NotMe

    # Plain inject with a block, so the sum does not rely on any
    # project-specific redefinition of Enumerable#reduce.
    total = @_mFreqCounter.inject(0) { |sum, count| sum + count }
    return 0.0 if total == 0

    # The 20.0 literal forces float arithmetic. The original computed
    # `@_mFreqCounter[3] / total` with integer division, which truncated the
    # "very likely" ratio to 0 (or 1 when every transition was very likely).
    confidence = (@_mFreqCounter[3] - (@_mFreqCounter[1] * 20.0)) / total
    confidence = 0.0 if confidence < 0.0

    # lower the confidence of latin1 so that other more accurate detector
    # can take priority.
    confidence * 0.5
  end

  private

  # Normalizes a buffer into an Array of byte values (0..255).
  # A String is unpacked byte-wise; any other collection is assumed to hold
  # Integers or one-character Strings -- TODO confirm against the return type
  # of CharSetProber#filter_with_english_letters.
  def latin1_buffer_bytes(aBuf)
    return aBuf.unpack('C*') if aBuf.is_a?(String)
    aBuf.map { |c| c.is_a?(Integer) ? c : c.unpack('C')[0] }
  end
end
160
+ end
@@ -0,0 +1,57 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'UTF8Prober'
32
+ require 'SJISProber'
33
+ require 'EUCJPProber'
34
+ require 'GB2312Prober'
35
+ require 'EUCKRProber'
36
+ require 'Big5Prober'
37
+ require 'EUCTWProber'
38
+
39
+ module UniversalDetector
40
# Group prober that bundles the multi-byte charset probers.
class MBCSGroupProber < CharSetGroupProber
  # The member probers, in creation order.
  attr_reader :mProbers

  # Instantiates one prober per multi-byte charset (UTF-8, Shift_JIS, EUC-JP,
  # GB2312, EUC-KR, Big5, EUC-TW, in that order) and then resets the group.
  def initialize
    super
    prober_classes = [
      UTF8Prober,
      SJISProber,
      EUCJPProber,
      GB2312Prober,
      EUCKRProber,
      Big5Prober,
      EUCTWProber
    ]
    @mProbers = prober_classes.map { |prober_class| prober_class.new }
    reset()
  end
end
57
+ end
@@ -0,0 +1,513 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ module UniversalDetector
30
# Big5

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
BIG5_cls = [
  1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07 (0x00 is allowed as a legal value)
  1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
  1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
  1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
  1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
  1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
  1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 37
  1, 1, 1, 1, 1, 1, 1, 1,  # 38 - 3f
  2, 2, 2, 2, 2, 2, 2, 2,  # 40 - 47
  2, 2, 2, 2, 2, 2, 2, 2,  # 48 - 4f
  2, 2, 2, 2, 2, 2, 2, 2,  # 50 - 57
  2, 2, 2, 2, 2, 2, 2, 2,  # 58 - 5f
  2, 2, 2, 2, 2, 2, 2, 2,  # 60 - 67
  2, 2, 2, 2, 2, 2, 2, 2,  # 68 - 6f
  2, 2, 2, 2, 2, 2, 2, 2,  # 70 - 77
  2, 2, 2, 2, 2, 2, 2, 1,  # 78 - 7f
  4, 4, 4, 4, 4, 4, 4, 4,  # 80 - 87
  4, 4, 4, 4, 4, 4, 4, 4,  # 88 - 8f
  4, 4, 4, 4, 4, 4, 4, 4,  # 90 - 97
  4, 4, 4, 4, 4, 4, 4, 4,  # 98 - 9f
  4, 3, 3, 3, 3, 3, 3, 3,  # a0 - a7
  3, 3, 3, 3, 3, 3, 3, 3,  # a8 - af
  3, 3, 3, 3, 3, 3, 3, 3,  # b0 - b7
  3, 3, 3, 3, 3, 3, 3, 3,  # b8 - bf
  3, 3, 3, 3, 3, 3, 3, 3,  # c0 - c7
  3, 3, 3, 3, 3, 3, 3, 3,  # c8 - cf
  3, 3, 3, 3, 3, 3, 3, 3,  # d0 - d7
  3, 3, 3, 3, 3, 3, 3, 3,  # d8 - df
  3, 3, 3, 3, 3, 3, 3, 3,  # e0 - e7
  3, 3, 3, 3, 3, 3, 3, 3,  # e8 - ef
  3, 3, 3, 3, 3, 3, 3, 3,  # f0 - f7
  3, 3, 3, 3, 3, 3, 3, 0   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
# NOTE(review): presumably read as [state * classFactor + class] by
# CodingStateMachine -- confirm there.
BIG5_st = [
  :Error, :Start, :Start,      3, :Error, :Error, :Error, :Error,  # 00-07
  :Error, :Error, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :Error,  # 08-0f
  :Error, :Start, :Start, :Start, :Start, :Start, :Start, :Start   # 10-17
]

# One length entry per character class (5 classes).
Big5CharLenTable = [0, 1, 1, 2, 0]

# Everything the state machine needs for Big5, bundled under string keys.
Big5SMModel = {
  'classTable'   => BIG5_cls,
  'classFactor'  => 5,
  'stateTable'   => BIG5_st,
  'charLenTable' => Big5CharLenTable,
  'name'         => 'Big5'
}
76
+
77
# EUC-JP

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
EUCJP_cls = [
  4, 4, 4, 4, 4, 4, 4, 4,  # 00 - 07
  4, 4, 4, 4, 4, 4, 5, 5,  # 08 - 0f
  4, 4, 4, 4, 4, 4, 4, 4,  # 10 - 17
  4, 4, 4, 5, 4, 4, 4, 4,  # 18 - 1f
  4, 4, 4, 4, 4, 4, 4, 4,  # 20 - 27
  4, 4, 4, 4, 4, 4, 4, 4,  # 28 - 2f
  4, 4, 4, 4, 4, 4, 4, 4,  # 30 - 37
  4, 4, 4, 4, 4, 4, 4, 4,  # 38 - 3f
  4, 4, 4, 4, 4, 4, 4, 4,  # 40 - 47
  4, 4, 4, 4, 4, 4, 4, 4,  # 48 - 4f
  4, 4, 4, 4, 4, 4, 4, 4,  # 50 - 57
  4, 4, 4, 4, 4, 4, 4, 4,  # 58 - 5f
  4, 4, 4, 4, 4, 4, 4, 4,  # 60 - 67
  4, 4, 4, 4, 4, 4, 4, 4,  # 68 - 6f
  4, 4, 4, 4, 4, 4, 4, 4,  # 70 - 77
  4, 4, 4, 4, 4, 4, 4, 4,  # 78 - 7f
  5, 5, 5, 5, 5, 5, 5, 5,  # 80 - 87
  5, 5, 5, 5, 5, 5, 1, 3,  # 88 - 8f
  5, 5, 5, 5, 5, 5, 5, 5,  # 90 - 97
  5, 5, 5, 5, 5, 5, 5, 5,  # 98 - 9f
  5, 2, 2, 2, 2, 2, 2, 2,  # a0 - a7
  2, 2, 2, 2, 2, 2, 2, 2,  # a8 - af
  2, 2, 2, 2, 2, 2, 2, 2,  # b0 - b7
  2, 2, 2, 2, 2, 2, 2, 2,  # b8 - bf
  2, 2, 2, 2, 2, 2, 2, 2,  # c0 - c7
  2, 2, 2, 2, 2, 2, 2, 2,  # c8 - cf
  2, 2, 2, 2, 2, 2, 2, 2,  # d0 - d7
  2, 2, 2, 2, 2, 2, 2, 2,  # d8 - df
  0, 0, 0, 0, 0, 0, 0, 0,  # e0 - e7
  0, 0, 0, 0, 0, 0, 0, 0,  # e8 - ef
  0, 0, 0, 0, 0, 0, 0, 0,  # f0 - f7
  0, 0, 0, 0, 0, 0, 0, 5   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
EUCJP_st = [
       3,      4,      3,      5, :Start, :Error, :Error, :Error,  # 00-07
  :Error, :Error, :Error, :Error, :ItsMe, :ItsMe, :ItsMe, :ItsMe,  # 08-0f
  :ItsMe, :ItsMe, :Start, :Error, :Start, :Error, :Error, :Error,  # 10-17
  :Error, :Error, :Start, :Error, :Error, :Error,      3, :Error,  # 18-1f
       3, :Error, :Error, :Error, :Start, :Start, :Start, :Start   # 20-27
]

# One length entry per character class (6 classes).
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]

# Everything the state machine needs for EUC-JP, bundled under string keys.
EUCJPSMModel = {
  'classTable'   => EUCJP_cls,
  'classFactor'  => 6,
  'stateTable'   => EUCJP_st,
  'charLenTable' => EUCJPCharLenTable,
  'name'         => 'EUC-JP'
}
127
+
128
# EUC-KR

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
EUCKR_cls = [
  1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
  1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
  1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
  1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
  1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
  1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
  1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 37
  1, 1, 1, 1, 1, 1, 1, 1,  # 38 - 3f
  1, 1, 1, 1, 1, 1, 1, 1,  # 40 - 47
  1, 1, 1, 1, 1, 1, 1, 1,  # 48 - 4f
  1, 1, 1, 1, 1, 1, 1, 1,  # 50 - 57
  1, 1, 1, 1, 1, 1, 1, 1,  # 58 - 5f
  1, 1, 1, 1, 1, 1, 1, 1,  # 60 - 67
  1, 1, 1, 1, 1, 1, 1, 1,  # 68 - 6f
  1, 1, 1, 1, 1, 1, 1, 1,  # 70 - 77
  1, 1, 1, 1, 1, 1, 1, 1,  # 78 - 7f
  0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
  0, 0, 0, 0, 0, 0, 0, 0,  # 88 - 8f
  0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
  0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
  0, 2, 2, 2, 2, 2, 2, 2,  # a0 - a7
  2, 2, 2, 2, 2, 3, 3, 3,  # a8 - af
  2, 2, 2, 2, 2, 2, 2, 2,  # b0 - b7
  2, 2, 2, 2, 2, 2, 2, 2,  # b8 - bf
  2, 2, 2, 2, 2, 2, 2, 2,  # c0 - c7
  2, 3, 2, 2, 2, 2, 2, 2,  # c8 - cf
  2, 2, 2, 2, 2, 2, 2, 2,  # d0 - d7
  2, 2, 2, 2, 2, 2, 2, 2,  # d8 - df
  2, 2, 2, 2, 2, 2, 2, 2,  # e0 - e7
  2, 2, 2, 2, 2, 2, 2, 2,  # e8 - ef
  2, 2, 2, 2, 2, 2, 2, 2,  # f0 - f7
  2, 2, 2, 2, 2, 2, 2, 0   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
EUCKR_st = [
  :Error, :Start,      3, :Error, :Error, :Error, :Error, :Error,  # 00-07
  :ItsMe, :ItsMe, :ItsMe, :ItsMe, :Error, :Error, :Start, :Start   # 08-0f
]

# One length entry per character class (4 classes).
EUCKRCharLenTable = [0, 1, 2, 0]

# Everything the state machine needs for EUC-KR, bundled under string keys.
EUCKRSMModel = {
  'classTable'   => EUCKR_cls,
  'classFactor'  => 4,
  'stateTable'   => EUCKR_st,
  'charLenTable' => EUCKRCharLenTable,
  'name'         => 'EUC-KR'
}
175
+
176
# EUC-TW

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
EUCTW_cls = [
  2, 2, 2, 2, 2, 2, 2, 2,  # 00 - 07
  2, 2, 2, 2, 2, 2, 0, 0,  # 08 - 0f
  2, 2, 2, 2, 2, 2, 2, 2,  # 10 - 17
  2, 2, 2, 0, 2, 2, 2, 2,  # 18 - 1f
  2, 2, 2, 2, 2, 2, 2, 2,  # 20 - 27
  2, 2, 2, 2, 2, 2, 2, 2,  # 28 - 2f
  2, 2, 2, 2, 2, 2, 2, 2,  # 30 - 37
  2, 2, 2, 2, 2, 2, 2, 2,  # 38 - 3f
  2, 2, 2, 2, 2, 2, 2, 2,  # 40 - 47
  2, 2, 2, 2, 2, 2, 2, 2,  # 48 - 4f
  2, 2, 2, 2, 2, 2, 2, 2,  # 50 - 57
  2, 2, 2, 2, 2, 2, 2, 2,  # 58 - 5f
  2, 2, 2, 2, 2, 2, 2, 2,  # 60 - 67
  2, 2, 2, 2, 2, 2, 2, 2,  # 68 - 6f
  2, 2, 2, 2, 2, 2, 2, 2,  # 70 - 77
  2, 2, 2, 2, 2, 2, 2, 2,  # 78 - 7f
  0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
  0, 0, 0, 0, 0, 0, 6, 0,  # 88 - 8f
  0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
  0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
  0, 3, 4, 4, 4, 4, 4, 4,  # a0 - a7
  5, 5, 1, 1, 1, 1, 1, 1,  # a8 - af
  1, 1, 1, 1, 1, 1, 1, 1,  # b0 - b7
  1, 1, 1, 1, 1, 1, 1, 1,  # b8 - bf
  1, 1, 3, 1, 3, 3, 3, 3,  # c0 - c7
  3, 3, 3, 3, 3, 3, 3, 3,  # c8 - cf
  3, 3, 3, 3, 3, 3, 3, 3,  # d0 - d7
  3, 3, 3, 3, 3, 3, 3, 3,  # d8 - df
  3, 3, 3, 3, 3, 3, 3, 3,  # e0 - e7
  3, 3, 3, 3, 3, 3, 3, 3,  # e8 - ef
  3, 3, 3, 3, 3, 3, 3, 3,  # f0 - f7
  3, 3, 3, 3, 3, 3, 3, 0   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
EUCTW_st = [
  :Error, :Error, :Start,      3,      3,      3,      4, :Error,  # 00-07
  :Error, :Error, :Error, :Error, :Error, :Error, :ItsMe, :ItsMe,  # 08-0f
  :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :Error, :Start, :Error,  # 10-17
  :Start, :Start, :Start, :Error, :Error, :Error, :Error, :Error,  # 18-1f
       5, :Error, :Error, :Error, :Start, :Error, :Start, :Start,  # 20-27
  :Start, :Error, :Start, :Start, :Start, :Start, :Start, :Start   # 28-2f
]

# One length entry per character class (7 classes).
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]

# Everything the state machine needs for EUC-TW, bundled under string keys.
EUCTWSMModel = {
  'classTable'   => EUCTW_cls,
  'classFactor'  => 7,
  'stateTable'   => EUCTW_st,
  'charLenTable' => EUCTWCharLenTable,
  'name'         => 'x-euc-tw'
}
227
+
228
# GB2312

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
GB2312_cls = [
  1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
  1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
  1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
  1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
  1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
  1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
  3, 3, 3, 3, 3, 3, 3, 3,  # 30 - 37
  3, 3, 1, 1, 1, 1, 1, 1,  # 38 - 3f
  2, 2, 2, 2, 2, 2, 2, 2,  # 40 - 47
  2, 2, 2, 2, 2, 2, 2, 2,  # 48 - 4f
  2, 2, 2, 2, 2, 2, 2, 2,  # 50 - 57
  2, 2, 2, 2, 2, 2, 2, 2,  # 58 - 5f
  2, 2, 2, 2, 2, 2, 2, 2,  # 60 - 67
  2, 2, 2, 2, 2, 2, 2, 2,  # 68 - 6f
  2, 2, 2, 2, 2, 2, 2, 2,  # 70 - 77
  2, 2, 2, 2, 2, 2, 2, 4,  # 78 - 7f
  5, 6, 6, 6, 6, 6, 6, 6,  # 80 - 87
  6, 6, 6, 6, 6, 6, 6, 6,  # 88 - 8f
  6, 6, 6, 6, 6, 6, 6, 6,  # 90 - 97
  6, 6, 6, 6, 6, 6, 6, 6,  # 98 - 9f
  6, 6, 6, 6, 6, 6, 6, 6,  # a0 - a7
  6, 6, 6, 6, 6, 6, 6, 6,  # a8 - af
  6, 6, 6, 6, 6, 6, 6, 6,  # b0 - b7
  6, 6, 6, 6, 6, 6, 6, 6,  # b8 - bf
  6, 6, 6, 6, 6, 6, 6, 6,  # c0 - c7
  6, 6, 6, 6, 6, 6, 6, 6,  # c8 - cf
  6, 6, 6, 6, 6, 6, 6, 6,  # d0 - d7
  6, 6, 6, 6, 6, 6, 6, 6,  # d8 - df
  6, 6, 6, 6, 6, 6, 6, 6,  # e0 - e7
  6, 6, 6, 6, 6, 6, 6, 6,  # e8 - ef
  6, 6, 6, 6, 6, 6, 6, 6,  # f0 - f7
  6, 6, 6, 6, 6, 6, 6, 0   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
GB2312_st = [
  :Error, :Start, :Start, :Start, :Start, :Start,      3, :Error,  # 00-07
  :Error, :Error, :Error, :Error, :Error, :Error, :ItsMe, :ItsMe,  # 08-0f
  :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :Error, :Error, :Start,  # 10-17
       4, :Error, :Start, :Start, :Error, :Error, :Error, :Error,  # 18-1f
  :Error, :Error,      5, :Error, :Error, :Error, :ItsMe, :Error,  # 20-27
  :Error, :Error, :Start, :Start, :Start, :Start, :Start, :Start   # 28-2f
]

# One length entry per character class (7 classes).
#
# Strictly speaking, the length of class 6 can be either 2 or 4. There is
# no need to tell the two apart here: the length is used for frequency
# analysis only, and each code range is validated there as well, so it is
# safe to record it as 2.
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]

# Everything the state machine needs for GB2312, bundled under string keys.
GB2312SMModel = {
  'classTable'   => GB2312_cls,
  'classFactor'  => 7,
  'stateTable'   => GB2312_st,
  'charLenTable' => GB2312CharLenTable,
  'name'         => 'GB2312'
}
284
+
285
# Shift_JIS

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
SJIS_cls = [
  1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
  1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
  1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
  1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
  1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
  1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
  1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 37
  1, 1, 1, 1, 1, 1, 1, 1,  # 38 - 3f
  2, 2, 2, 2, 2, 2, 2, 2,  # 40 - 47
  2, 2, 2, 2, 2, 2, 2, 2,  # 48 - 4f
  2, 2, 2, 2, 2, 2, 2, 2,  # 50 - 57
  2, 2, 2, 2, 2, 2, 2, 2,  # 58 - 5f
  2, 2, 2, 2, 2, 2, 2, 2,  # 60 - 67
  2, 2, 2, 2, 2, 2, 2, 2,  # 68 - 6f
  2, 2, 2, 2, 2, 2, 2, 2,  # 70 - 77
  2, 2, 2, 2, 2, 2, 2, 1,  # 78 - 7f
  3, 3, 3, 3, 3, 3, 3, 3,  # 80 - 87
  3, 3, 3, 3, 3, 3, 3, 3,  # 88 - 8f
  3, 3, 3, 3, 3, 3, 3, 3,  # 90 - 97
  3, 3, 3, 3, 3, 3, 3, 3,  # 98 - 9f
  # 0xa0 is illegal in the Shift_JIS encoding, but some pages do contain
  # that byte, so the table is deliberately forgiving about it.
  2, 2, 2, 2, 2, 2, 2, 2,  # a0 - a7
  2, 2, 2, 2, 2, 2, 2, 2,  # a8 - af
  2, 2, 2, 2, 2, 2, 2, 2,  # b0 - b7
  2, 2, 2, 2, 2, 2, 2, 2,  # b8 - bf
  2, 2, 2, 2, 2, 2, 2, 2,  # c0 - c7
  2, 2, 2, 2, 2, 2, 2, 2,  # c8 - cf
  2, 2, 2, 2, 2, 2, 2, 2,  # d0 - d7
  2, 2, 2, 2, 2, 2, 2, 2,  # d8 - df
  3, 3, 3, 3, 3, 3, 3, 3,  # e0 - e7
  3, 3, 3, 3, 3, 4, 4, 4,  # e8 - ef
  4, 4, 4, 4, 4, 4, 4, 4,  # f0 - f7
  4, 4, 4, 4, 4, 0, 0, 0   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
SJIS_st = [
  :Error, :Start, :Start,      3, :Error, :Error, :Error, :Error,  # 00-07
  :Error, :Error, :Error, :Error, :ItsMe, :ItsMe, :ItsMe, :ItsMe,  # 08-0f
  :ItsMe, :ItsMe, :Error, :Error, :Start, :Start, :Start, :Start   # 10-17
]

# One length entry per character class (6 classes).
SJISCharLenTable = [0, 1, 1, 2, 0, 0]

# Everything the state machine needs for Shift_JIS, bundled under string keys.
SJISSMModel = {
  'classTable'   => SJIS_cls,
  'classFactor'  => 6,
  'stateTable'   => SJIS_st,
  'charLenTable' => SJISCharLenTable,
  'name'         => 'Shift_JIS'
}
335
+
336
# UCS2-BE

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
UCS2BE_cls = [
  0, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
  0, 0, 1, 0, 0, 2, 0, 0,  # 08 - 0f
  0, 0, 0, 0, 0, 0, 0, 0,  # 10 - 17
  0, 0, 0, 3, 0, 0, 0, 0,  # 18 - 1f
  0, 0, 0, 0, 0, 0, 0, 0,  # 20 - 27
  0, 3, 3, 3, 3, 3, 0, 0,  # 28 - 2f
  0, 0, 0, 0, 0, 0, 0, 0,  # 30 - 37
  0, 0, 0, 0, 0, 0, 0, 0,  # 38 - 3f
  0, 0, 0, 0, 0, 0, 0, 0,  # 40 - 47
  0, 0, 0, 0, 0, 0, 0, 0,  # 48 - 4f
  0, 0, 0, 0, 0, 0, 0, 0,  # 50 - 57
  0, 0, 0, 0, 0, 0, 0, 0,  # 58 - 5f
  0, 0, 0, 0, 0, 0, 0, 0,  # 60 - 67
  0, 0, 0, 0, 0, 0, 0, 0,  # 68 - 6f
  0, 0, 0, 0, 0, 0, 0, 0,  # 70 - 77
  0, 0, 0, 0, 0, 0, 0, 0,  # 78 - 7f
  0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
  0, 0, 0, 0, 0, 0, 0, 0,  # 88 - 8f
  0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
  0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
  0, 0, 0, 0, 0, 0, 0, 0,  # a0 - a7
  0, 0, 0, 0, 0, 0, 0, 0,  # a8 - af
  0, 0, 0, 0, 0, 0, 0, 0,  # b0 - b7
  0, 0, 0, 0, 0, 0, 0, 0,  # b8 - bf
  0, 0, 0, 0, 0, 0, 0, 0,  # c0 - c7
  0, 0, 0, 0, 0, 0, 0, 0,  # c8 - cf
  0, 0, 0, 0, 0, 0, 0, 0,  # d0 - d7
  0, 0, 0, 0, 0, 0, 0, 0,  # d8 - df
  0, 0, 0, 0, 0, 0, 0, 0,  # e0 - e7
  0, 0, 0, 0, 0, 0, 0, 0,  # e8 - ef
  0, 0, 0, 0, 0, 0, 0, 0,  # f0 - f7
  0, 0, 0, 0, 0, 0, 4, 5   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
UCS2BE_st = [
       5,      7,      7, :Error,      4,      3, :Error, :Error,  # 00-07
  :Error, :Error, :Error, :Error, :ItsMe, :ItsMe, :ItsMe, :ItsMe,  # 08-0f
  :ItsMe, :ItsMe,      6,      6,      6,      6, :Error, :Error,  # 10-17
       6,      6,      6,      6,      6, :ItsMe,      6,      6,  # 18-1f
       6,      6,      6,      6,      5,      7,      7, :Error,  # 20-27
       5,      8,      6,      6, :Error,      6,      6,      6,  # 28-2f
       6,      6,      6,      6, :Error, :Error, :Start, :Start   # 30-37
]

# One length entry per character class (6 classes).
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]

# Everything the state machine needs for UCS2-BE, bundled under string keys;
# the charset is reported as 'UTF-16BE'.
UCS2BESMModel = {
  'classTable'   => UCS2BE_cls,
  'classFactor'  => 6,
  'stateTable'   => UCS2BE_st,
  'charLenTable' => UCS2BECharLenTable,
  'name'         => 'UTF-16BE'
}
388
+
389
# UCS2-LE

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
UCS2LE_cls = [
  0, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
  0, 0, 1, 0, 0, 2, 0, 0,  # 08 - 0f
  0, 0, 0, 0, 0, 0, 0, 0,  # 10 - 17
  0, 0, 0, 3, 0, 0, 0, 0,  # 18 - 1f
  0, 0, 0, 0, 0, 0, 0, 0,  # 20 - 27
  0, 3, 3, 3, 3, 3, 0, 0,  # 28 - 2f
  0, 0, 0, 0, 0, 0, 0, 0,  # 30 - 37
  0, 0, 0, 0, 0, 0, 0, 0,  # 38 - 3f
  0, 0, 0, 0, 0, 0, 0, 0,  # 40 - 47
  0, 0, 0, 0, 0, 0, 0, 0,  # 48 - 4f
  0, 0, 0, 0, 0, 0, 0, 0,  # 50 - 57
  0, 0, 0, 0, 0, 0, 0, 0,  # 58 - 5f
  0, 0, 0, 0, 0, 0, 0, 0,  # 60 - 67
  0, 0, 0, 0, 0, 0, 0, 0,  # 68 - 6f
  0, 0, 0, 0, 0, 0, 0, 0,  # 70 - 77
  0, 0, 0, 0, 0, 0, 0, 0,  # 78 - 7f
  0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
  0, 0, 0, 0, 0, 0, 0, 0,  # 88 - 8f
  0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
  0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
  0, 0, 0, 0, 0, 0, 0, 0,  # a0 - a7
  0, 0, 0, 0, 0, 0, 0, 0,  # a8 - af
  0, 0, 0, 0, 0, 0, 0, 0,  # b0 - b7
  0, 0, 0, 0, 0, 0, 0, 0,  # b8 - bf
  0, 0, 0, 0, 0, 0, 0, 0,  # c0 - c7
  0, 0, 0, 0, 0, 0, 0, 0,  # c8 - cf
  0, 0, 0, 0, 0, 0, 0, 0,  # d0 - d7
  0, 0, 0, 0, 0, 0, 0, 0,  # d8 - df
  0, 0, 0, 0, 0, 0, 0, 0,  # e0 - e7
  0, 0, 0, 0, 0, 0, 0, 0,  # e8 - ef
  0, 0, 0, 0, 0, 0, 0, 0,  # f0 - f7
  0, 0, 0, 0, 0, 0, 4, 5   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
UCS2LE_st = [
       6,      6,      7,      6,      4,      3, :Error, :Error,  # 00-07
  :Error, :Error, :Error, :Error, :ItsMe, :ItsMe, :ItsMe, :ItsMe,  # 08-0f
  :ItsMe, :ItsMe,      5,      5,      5, :Error, :ItsMe, :Error,  # 10-17
       5,      5,      5, :Error,      5, :Error,      6,      6,  # 18-1f
       7,      6,      8,      8,      5,      5,      5, :Error,  # 20-27
       5,      5,      5, :Error, :Error, :Error,      5,      5,  # 28-2f
       5,      5,      5, :Error,      5, :Error, :Start, :Start   # 30-37
]

# One length entry per character class (6 classes).
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]

# Everything the state machine needs for UCS2-LE, bundled under string keys;
# the charset is reported as 'UTF-16LE'.
UCS2LESMModel = {
  'classTable'   => UCS2LE_cls,
  'classFactor'  => 6,
  'stateTable'   => UCS2LE_st,
  'charLenTable' => UCS2LECharLenTable,
  'name'         => 'UTF-16LE'
}
441
+
442
# UTF-8

# Byte value -> character class, one entry per byte value (256 entries).
# The trailing comment on each row is the hex byte range it covers.
UTF8_cls = [
   1,  1,  1,  1,  1,  1,  1,  1,  # 00 - 07 (0x00 is allowed as a legal value)
   1,  1,  1,  1,  1,  1,  0,  0,  # 08 - 0f
   1,  1,  1,  1,  1,  1,  1,  1,  # 10 - 17
   1,  1,  1,  0,  1,  1,  1,  1,  # 18 - 1f
   1,  1,  1,  1,  1,  1,  1,  1,  # 20 - 27
   1,  1,  1,  1,  1,  1,  1,  1,  # 28 - 2f
   1,  1,  1,  1,  1,  1,  1,  1,  # 30 - 37
   1,  1,  1,  1,  1,  1,  1,  1,  # 38 - 3f
   1,  1,  1,  1,  1,  1,  1,  1,  # 40 - 47
   1,  1,  1,  1,  1,  1,  1,  1,  # 48 - 4f
   1,  1,  1,  1,  1,  1,  1,  1,  # 50 - 57
   1,  1,  1,  1,  1,  1,  1,  1,  # 58 - 5f
   1,  1,  1,  1,  1,  1,  1,  1,  # 60 - 67
   1,  1,  1,  1,  1,  1,  1,  1,  # 68 - 6f
   1,  1,  1,  1,  1,  1,  1,  1,  # 70 - 77
   1,  1,  1,  1,  1,  1,  1,  1,  # 78 - 7f
   2,  2,  2,  2,  3,  3,  3,  3,  # 80 - 87
   4,  4,  4,  4,  4,  4,  4,  4,  # 88 - 8f
   4,  4,  4,  4,  4,  4,  4,  4,  # 90 - 97
   4,  4,  4,  4,  4,  4,  4,  4,  # 98 - 9f
   5,  5,  5,  5,  5,  5,  5,  5,  # a0 - a7
   5,  5,  5,  5,  5,  5,  5,  5,  # a8 - af
   5,  5,  5,  5,  5,  5,  5,  5,  # b0 - b7
   5,  5,  5,  5,  5,  5,  5,  5,  # b8 - bf
   0,  0,  6,  6,  6,  6,  6,  6,  # c0 - c7
   6,  6,  6,  6,  6,  6,  6,  6,  # c8 - cf
   6,  6,  6,  6,  6,  6,  6,  6,  # d0 - d7
   6,  6,  6,  6,  6,  6,  6,  6,  # d8 - df
   7,  8,  8,  8,  8,  8,  8,  8,  # e0 - e7
   8,  8,  8,  8,  8,  9,  8,  8,  # e8 - ef
  10, 11, 11, 11, 11, 11, 11, 11,  # f0 - f7
  12, 13, 13, 13, 14, 15,  0,  0   # f8 - ff
]

# Flat state-transition table. Entries are either a named state
# (:Start, :Error, :ItsMe) or the number of an intermediate state.
# The trailing comment on each row is the hex index range of its entries.
UTF8_st = [
  :Error, :Start, :Error, :Error, :Error, :Error,     12,     10,  # 00-07
       9,     11,      8,      7,      6,      5,      4,      3,  # 08-0f
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 10-17
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 18-1f
  :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe,  # 20-27
  :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe, :ItsMe,  # 28-2f
  :Error, :Error,      5,      5,      5,      5, :Error, :Error,  # 30-37
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 38-3f
  :Error, :Error, :Error,      5,      5,      5, :Error, :Error,  # 40-47
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 48-4f
  :Error, :Error,      7,      7,      7,      7, :Error, :Error,  # 50-57
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 58-5f
  :Error, :Error, :Error, :Error,      7,      7, :Error, :Error,  # 60-67
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 68-6f
  :Error, :Error,      9,      9,      9,      9, :Error, :Error,  # 70-77
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 78-7f
  :Error, :Error, :Error, :Error, :Error,      9, :Error, :Error,  # 80-87
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 88-8f
  :Error, :Error,     12,     12,     12,     12, :Error, :Error,  # 90-97
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # 98-9f
  :Error, :Error, :Error, :Error, :Error,     12, :Error, :Error,  # a0-a7
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # a8-af
  :Error, :Error,     12,     12,     12, :Error, :Error, :Error,  # b0-b7
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error,  # b8-bf
  :Error, :Error, :Start, :Start, :Start, :Start, :Error, :Error,  # c0-c7
  :Error, :Error, :Error, :Error, :Error, :Error, :Error, :Error   # c8-cf
]

# One length entry per character class (16 classes).
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]

# Everything the state machine needs for UTF-8, bundled under string keys.
UTF8SMModel = {
  'classTable'   => UTF8_cls,
  'classFactor'  => 16,
  'stateTable'   => UTF8_st,
  'charLenTable' => UTF8CharLenTable,
  'name'         => 'UTF-8'
}
513
+ end