chardet 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,160 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
module Enumerable
  # Folds the collection into a single value.
  #
  # Legacy form used by this library:
  #   reduce(fn, res) -- fn is a method name (Symbol or String), res is the
  #   starting accumulator; each element n updates it via res.send(fn, n).
  #
  # This definition replaces Ruby's own Enumerable#reduce for every
  # Enumerable in the process, so every other call shape (no arguments plus a
  # block, a single Symbol, an initial value plus a block, or
  # (initial, symbol)) is forwarded to the built-in #inject instead of
  # raising ArgumentError as the former two-argument-only version did.
  #
  # A two-argument call is treated as the legacy form when the first argument
  # is a Symbol, or is a String and the second argument is not a Symbol.
  # Anything else, e.g. reduce(0, :+) or reduce("", :+), is the standard
  # (initial, symbol) form.
  #
  # Returns the final accumulator (res itself for an empty collection in the
  # legacy form).
  def reduce(*args, &block)
    legacy_form = args.length == 2 &&
                  (args[0].is_a?(Symbol) ||
                   (args[0].is_a?(String) && !args[1].is_a?(Symbol)))
    return inject(*args, &block) unless legacy_form

    fn, res = args
    each { |n| res = res.send(fn, n) }
    res
  end

  # Folds the collection with the given block.
  #
  # res is the starting accumulator; the block receives (accumulator, element)
  # and its result becomes the new accumulator. Returns the final accumulator,
  # or res unchanged when the collection is empty.
  def reduceBlock(res)
    each { |n| res = yield(res, n) }
    res
  end
end
43
+
module UniversalDetector
  # Number of likelihood categories stored in Latin1ClassModel (values 0..3);
  # also the size of the per-category counter kept by Latin1Prober.
  FREQ_CAT_NUM = 4

  # Character classes assigned to each byte by Latin1_CharToClass.
  UDF = 0 # undefined
  OTH = 1 # other
  ASC = 2 # ascii capital letter
  ASS = 3 # ascii small letter
  ACV = 4 # accent capital vowel
  ACO = 5 # accent capital other
  ASV = 6 # accent small vowel
  ASO = 7 # accent small other
  CLASS_NUM = 8 # total classes

  # Byte value (0x00..0xFF) -> character class, eight bytes per row.
  Latin1_CharToClass = [
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO  # F8 - FF
  ]

  # Likelihood of one character class following another, stored row-major:
  # the row is the class of the previous byte, the column the class of the
  # current byte (index = previous * CLASS_NUM + current).
  #   0 : illegal
  #   1 : very unlikely
  #   2 : normal
  #   3 : very likely
  Latin1ClassModel = [
  # UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3   # ASO
  ]

  # Prober reporting "windows-1252". It classifies every byte, looks up how
  # likely each (previous class, current class) pair is, and derives a
  # confidence from how many pairs were "very likely" versus "very unlikely".
  class Latin1Prober < CharSetProber
    def initialize
      super
      reset()
    end

    # Clears the per-category counters and the remembered previous class,
    # then runs the superclass reset.
    def reset
      @_mLastCharClass = OTH
      @_mFreqCounter = [0] * FREQ_CAT_NUM
      super
    end

    # Returns the charset name this prober reports.
    def get_charset_name
      return "windows-1252"
    end

    # Feeds a buffer into the prober and returns get_state().
    #
    # As soon as an "illegal" class pair (likelihood 0) is met, @_mState is
    # set to :NotMe and the rest of the buffer is ignored.
    #
    # NOTE(review): filter_with_english_letters comes from CharSetProber,
    # which is not visible here; this assumes it returns a String or a
    # collection of one-character strings -- confirm.
    def feed(aBuf)
      aBuf = filter_with_english_letters(aBuf)
      each_byte_value(aBuf) do |byte|
        charClass = Latin1_CharToClass[byte]
        freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
        if freq == 0
          @_mState = :NotMe
          break
        end
        @_mFreqCounter[freq] += 1
        @_mLastCharClass = charClass
      end

      return get_state()
    end

    # Returns the confidence that the input is windows-1252.
    #
    # 0.01 once the prober has ruled itself out; otherwise
    # (very_likely - 20 * very_unlikely) / total, clamped at 0.0 and halved
    # so that other, more accurate detectors can take priority.
    def get_confidence()
      return 0.01 if get_state() == :NotMe

      # #inject is used rather than the file-local Enumerable#reduce patch so
      # this method does not depend on that monkey-patch.
      total = @_mFreqCounter.inject(0) { |sum, count| sum + count }
      if total == 0
        confidence = 0.0
      else
        # The 20.0 literal forces float arithmetic; Integer / Integer would
        # truncate the "very likely" share to 0 or 1.
        confidence = (@_mFreqCounter[3] - @_mFreqCounter[1] * 20.0) / total
      end
      confidence = 0.0 if confidence < 0.0

      return confidence * 0.5
    end

    private

    # Yields each byte of buf as an Integer, independent of Ruby version.
    #
    # Objects that respond to each_byte (e.g. a String) are walked byte by
    # byte. Any other collection is walked with each: Integers are yielded
    # as-is, anything else contributes the first byte of its string form
    # (String#[] only returns an Integer on Ruby 1.8). Empty elements are
    # skipped.
    def each_byte_value(buf)
      if buf.respond_to?(:each_byte)
        buf.each_byte { |byte| yield byte }
      else
        buf.each do |item|
          byte = item.is_a?(Integer) ? item : item.to_s.unpack('C')[0]
          yield byte unless byte.nil?
        end
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'UTF8Prober'
32
+ require 'SJISProber'
33
+ require 'EUCJPProber'
34
+ require 'GB2312Prober'
35
+ require 'EUCKRProber'
36
+ require 'Big5Prober'
37
+ require 'EUCTWProber'
38
+
module UniversalDetector
  # CharSetGroupProber subclass whose prober list is the seven multi-byte
  # charset probers below, instantiated in this order.
  class MBCSGroupProber < CharSetGroupProber
    # The prober instances created in #initialize.
    attr_reader :mProbers

    def initialize
      super
      prober_classes = [
        UTF8Prober,
        SJISProber,
        EUCJPProber,
        GB2312Prober,
        EUCKRProber,
        Big5Prober,
        EUCTWProber
      ]
      @mProbers = prober_classes.map { |prober_class| prober_class.new }
      # reset is not defined in this class; presumably inherited from
      # CharSetGroupProber.
      reset()
    end
  end
end
@@ -0,0 +1,513 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ module UniversalDetector
# Big5
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 1 (0x00 is deliberately allowed
# as a legal value).
BIG5_cls = Array.new(256, 1)
[
  [0x0e..0x0f, 0],
  [0x1b..0x1b, 0],
  [0x40..0x7e, 2],
  [0x80..0xa0, 4],
  [0xa1..0xfe, 3],
  [0xff..0xff, 0]
].each do |byte_range, char_class|
  byte_range.each { |byte| BIG5_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
BIG5_st = [
  err,    start,  start,  3,      err,    err,    err,    err,    # 00-07
  err,    err,    its_me, its_me, its_me, its_me, its_me, err,    # 08-0f
  err,    start,  start,  start,  start,  start,  start,  start   # 10-17
]

# One entry per character class.
Big5CharLenTable = [0, 1, 1, 2, 0]

Big5SMModel = {
  'classTable'   => BIG5_cls,
  'classFactor'  => 5,
  'stateTable'   => BIG5_st,
  'charLenTable' => Big5CharLenTable,
  'name'         => 'Big5'
}
76
+
# EUC-JP
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 4.
EUCJP_cls = Array.new(256, 4)
[
  [0x0e..0x0f, 5],
  [0x1b..0x1b, 5],
  [0x80..0x8d, 5],
  [0x8e..0x8e, 1],
  [0x8f..0x8f, 3],
  [0x90..0xa0, 5],
  [0xa1..0xdf, 2],
  [0xe0..0xfe, 0],
  [0xff..0xff, 5]
].each do |byte_range, char_class|
  byte_range.each { |byte| EUCJP_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
EUCJP_st = [
  3,      4,      3,      5,      start,  err,    err,    err,    # 00-07
  err,    err,    err,    err,    its_me, its_me, its_me, its_me, # 08-0f
  its_me, its_me, start,  err,    start,  err,    err,    err,    # 10-17
  err,    err,    start,  err,    err,    err,    3,      err,    # 18-1f
  3,      err,    err,    err,    start,  start,  start,  start   # 20-27
]

# One entry per character class.
EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]

EUCJPSMModel = {
  'classTable'   => EUCJP_cls,
  'classFactor'  => 6,
  'stateTable'   => EUCJP_st,
  'charLenTable' => EUCJPCharLenTable,
  'name'         => 'EUC-JP'
}
127
+
# EUC-KR
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 1.
EUCKR_cls = Array.new(256, 1)
[
  [0x0e..0x0f, 0],
  [0x1b..0x1b, 0],
  [0x80..0xa0, 0],
  [0xa1..0xac, 2],
  [0xad..0xaf, 3],
  [0xb0..0xc8, 2],
  [0xc9..0xc9, 3],
  [0xca..0xfe, 2],
  [0xff..0xff, 0]
].each do |byte_range, char_class|
  byte_range.each { |byte| EUCKR_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
EUCKR_st = [
  err,    start,  3,      err,    err,    err,    err,    err,    # 00-07
  its_me, its_me, its_me, its_me, err,    err,    start,  start   # 08-0f
]

# One entry per character class.
EUCKRCharLenTable = [0, 1, 2, 0]

EUCKRSMModel = {
  'classTable'   => EUCKR_cls,
  'classFactor'  => 4,
  'stateTable'   => EUCKR_st,
  'charLenTable' => EUCKRCharLenTable,
  'name'         => 'EUC-KR'
}
175
+
# EUC-TW
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 2.
EUCTW_cls = Array.new(256, 2)
[
  [0x0e..0x0f, 0],
  [0x1b..0x1b, 0],
  [0x80..0x8d, 0],
  [0x8e..0x8e, 6],
  [0x8f..0xa0, 0],
  [0xa1..0xa1, 3],
  [0xa2..0xa7, 4],
  [0xa8..0xa9, 5],
  [0xaa..0xc1, 1],
  [0xc2..0xc2, 3],
  [0xc3..0xc3, 1],
  [0xc4..0xfe, 3],
  [0xff..0xff, 0]
].each do |byte_range, char_class|
  byte_range.each { |byte| EUCTW_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
EUCTW_st = [
  err,    err,    start,  3,      3,      3,      4,      err,    # 00-07
  err,    err,    err,    err,    err,    err,    its_me, its_me, # 08-0f
  its_me, its_me, its_me, its_me, its_me, err,    start,  err,    # 10-17
  start,  start,  start,  err,    err,    err,    err,    err,    # 18-1f
  5,      err,    err,    err,    start,  err,    start,  start,  # 20-27
  start,  err,    start,  start,  start,  start,  start,  start   # 28-2f
]

# One entry per character class.
EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]

EUCTWSMModel = {
  'classTable'   => EUCTW_cls,
  'classFactor'  => 7,
  'stateTable'   => EUCTW_st,
  'charLenTable' => EUCTWCharLenTable,
  'name'         => 'x-euc-tw'
}
227
+
# GB2312
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 1.
GB2312_cls = Array.new(256, 1)
[
  [0x0e..0x0f, 0],
  [0x1b..0x1b, 0],
  [0x30..0x39, 3],
  [0x40..0x7e, 2],
  [0x7f..0x7f, 4],
  [0x80..0x80, 5],
  [0x81..0xfe, 6],
  [0xff..0xff, 0]
].each do |byte_range, char_class|
  byte_range.each { |byte| GB2312_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
GB2312_st = [
  err,    start,  start,  start,  start,  start,  3,      err,    # 00-07
  err,    err,    err,    err,    err,    err,    its_me, its_me, # 08-0f
  its_me, its_me, its_me, its_me, its_me, err,    err,    start,  # 10-17
  4,      err,    start,  start,  err,    err,    err,    err,    # 18-1f
  err,    err,    5,      err,    err,    err,    its_me, err,    # 20-27
  err,    err,    start,  start,  start,  start,  start,  start   # 28-2f
]

# One entry per character class. Strictly speaking a class-6 character can be
# either 2 or 4 bytes long, but the length is only used for frequency
# analysis, where each code range is validated again, so recording 2 is safe.
GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]

GB2312SMModel = {
  'classTable'   => GB2312_cls,
  'classFactor'  => 7,
  'stateTable'   => GB2312_st,
  'charLenTable' => GB2312CharLenTable,
  'name'         => 'GB2312'
}
284
+
# Shift_JIS
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 1.
#
# 0xa0 is illegal in Shift_JIS, but some pages do contain that byte, so it is
# tolerated here: it shares class 2 with 0xa1..0xdf.
SJIS_cls = Array.new(256, 1)
[
  [0x0e..0x0f, 0],
  [0x1b..0x1b, 0],
  [0x40..0x7e, 2],
  [0x80..0x9f, 3],
  [0xa0..0xdf, 2],
  [0xe0..0xec, 3],
  [0xed..0xfc, 4],
  [0xfd..0xff, 0]
].each do |byte_range, char_class|
  byte_range.each { |byte| SJIS_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
SJIS_st = [
  err,    start,  start,  3,      err,    err,    err,    err,    # 00-07
  err,    err,    err,    err,    its_me, its_me, its_me, its_me, # 08-0f
  its_me, its_me, err,    err,    start,  start,  start,  start   # 10-17
]

# One entry per character class.
SJISCharLenTable = [0, 1, 1, 2, 0, 0]

SJISSMModel = {
  'classTable'   => SJIS_cls,
  'classFactor'  => 6,
  'stateTable'   => SJIS_st,
  'charLenTable' => SJISCharLenTable,
  'name'         => 'Shift_JIS'
}
335
+
# UCS2-BE
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 0.
UCS2BE_cls = Array.new(256, 0)
[
  [0x0a..0x0a, 1],
  [0x0d..0x0d, 2],
  [0x1b..0x1b, 3],
  [0x29..0x2d, 3],
  [0xfe..0xfe, 4],
  [0xff..0xff, 5]
].each do |byte_range, char_class|
  byte_range.each { |byte| UCS2BE_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
UCS2BE_st = [
  5,      7,      7,      err,    4,      3,      err,    err,    # 00-07
  err,    err,    err,    err,    its_me, its_me, its_me, its_me, # 08-0f
  its_me, its_me, 6,      6,      6,      6,      err,    err,    # 10-17
  6,      6,      6,      6,      6,      its_me, 6,      6,      # 18-1f
  6,      6,      6,      6,      5,      7,      7,      err,    # 20-27
  5,      8,      6,      6,      err,    6,      6,      6,      # 28-2f
  6,      6,      6,      6,      err,    err,    start,  start   # 30-37
]

# One entry per character class.
UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]

UCS2BESMModel = {
  'classTable'   => UCS2BE_cls,
  'classFactor'  => 6,
  'stateTable'   => UCS2BE_st,
  'charLenTable' => UCS2BECharLenTable,
  'name'         => 'UTF-16BE'
}
388
+
# UCS2-LE
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 0.
UCS2LE_cls = Array.new(256, 0)
[
  [0x0a..0x0a, 1],
  [0x0d..0x0d, 2],
  [0x1b..0x1b, 3],
  [0x29..0x2d, 3],
  [0xfe..0xfe, 4],
  [0xff..0xff, 5]
].each do |byte_range, char_class|
  byte_range.each { |byte| UCS2LE_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
UCS2LE_st = [
  6,      6,      7,      6,      4,      3,      err,    err,    # 00-07
  err,    err,    err,    err,    its_me, its_me, its_me, its_me, # 08-0f
  its_me, its_me, 5,      5,      5,      err,    its_me, err,    # 10-17
  5,      5,      5,      err,    5,      err,    6,      6,      # 18-1f
  7,      6,      8,      8,      5,      5,      5,      err,    # 20-27
  5,      5,      5,      err,    err,    err,    5,      5,      # 28-2f
  5,      5,      5,      err,    5,      err,    start,  start   # 30-37
]

# One entry per character class.
UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]

UCS2LESMModel = {
  'classTable'   => UCS2LE_cls,
  'classFactor'  => 6,
  'stateTable'   => UCS2LE_st,
  'charLenTable' => UCS2LECharLenTable,
  'name'         => 'UTF-16LE'
}
441
+
# UTF-8
#
# Byte-to-class table: 256 entries, one per byte value. Bytes outside the
# ranges listed below keep the default class 1 (0x00 is deliberately allowed
# as a legal value).
UTF8_cls = Array.new(256, 1)
[
  [0x0e..0x0f, 0],
  [0x1b..0x1b, 0],
  [0x80..0x83, 2],
  [0x84..0x87, 3],
  [0x88..0x9f, 4],
  [0xa0..0xbf, 5],
  [0xc0..0xc1, 0],
  [0xc2..0xdf, 6],
  [0xe0..0xe0, 7],
  [0xe1..0xec, 8],
  [0xed..0xed, 9],
  [0xee..0xef, 8],
  [0xf0..0xf0, 10],
  [0xf1..0xf7, 11],
  [0xf8..0xf8, 12],
  [0xf9..0xfb, 13],
  [0xfc..0xfc, 14],
  [0xfd..0xfd, 15],
  [0xfe..0xff, 0]
].each do |byte_range, char_class|
  byte_range.each { |byte| UTF8_cls[byte] = char_class }
end

# State table, eight entries per row; each entry is one of the three symbols
# below or an integer state number.
# NOTE(review): presumably indexed as state * classFactor + class by the
# state machine that consumes this model -- confirm.
err, start, its_me = :Error, :Start, :ItsMe
UTF8_st = [
  err,    start,  err,    err,    err,    err,    12,     10,     # 00-07
  9,      11,     8,      7,      6,      5,      4,      3,      # 08-0f
  err,    err,    err,    err,    err,    err,    err,    err,    # 10-17
  err,    err,    err,    err,    err,    err,    err,    err,    # 18-1f
  its_me, its_me, its_me, its_me, its_me, its_me, its_me, its_me, # 20-27
  its_me, its_me, its_me, its_me, its_me, its_me, its_me, its_me, # 28-2f
  err,    err,    5,      5,      5,      5,      err,    err,    # 30-37
  err,    err,    err,    err,    err,    err,    err,    err,    # 38-3f
  err,    err,    err,    5,      5,      5,      err,    err,    # 40-47
  err,    err,    err,    err,    err,    err,    err,    err,    # 48-4f
  err,    err,    7,      7,      7,      7,      err,    err,    # 50-57
  err,    err,    err,    err,    err,    err,    err,    err,    # 58-5f
  err,    err,    err,    err,    7,      7,      err,    err,    # 60-67
  err,    err,    err,    err,    err,    err,    err,    err,    # 68-6f
  err,    err,    9,      9,      9,      9,      err,    err,    # 70-77
  err,    err,    err,    err,    err,    err,    err,    err,    # 78-7f
  err,    err,    err,    err,    err,    9,      err,    err,    # 80-87
  err,    err,    err,    err,    err,    err,    err,    err,    # 88-8f
  err,    err,    12,     12,     12,     12,     err,    err,    # 90-97
  err,    err,    err,    err,    err,    err,    err,    err,    # 98-9f
  err,    err,    err,    err,    err,    12,     err,    err,    # a0-a7
  err,    err,    err,    err,    err,    err,    err,    err,    # a8-af
  err,    err,    12,     12,     12,     err,    err,    err,    # b0-b7
  err,    err,    err,    err,    err,    err,    err,    err,    # b8-bf
  err,    err,    start,  start,  start,  start,  err,    err,    # c0-c7
  err,    err,    err,    err,    err,    err,    err,    err     # c8-cf
]

# One entry per character class.
UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]

UTF8SMModel = {
  'classTable'   => UTF8_cls,
  'classFactor'  => 16,
  'stateTable'   => UTF8_st,
  'charLenTable' => UTF8CharLenTable,
  'name'         => 'UTF-8'
}
513
+ end