gigo 1.4.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/README.md +2 -8
  2. data/gemfiles/activesupport30.gemfile.lock +1 -1
  3. data/gemfiles/activesupport31.gemfile.lock +1 -1
  4. data/gemfiles/activesupport32.gemfile.lock +1 -1
  5. data/gemfiles/activesupport40.gemfile.lock +2 -2
  6. data/lib/gigo.rb +0 -1
  7. data/lib/gigo/version.rb +1 -1
  8. data/test/cases/gigo_test.rb +0 -1
  9. metadata +3 -39
  10. data/lib/gigo/rchardet.rb +0 -67
  11. data/lib/gigo/rchardet/big5freq.rb +0 -927
  12. data/lib/gigo/rchardet/big5prober.rb +0 -43
  13. data/lib/gigo/rchardet/chardistribution.rb +0 -238
  14. data/lib/gigo/rchardet/charsetgroupprober.rb +0 -113
  15. data/lib/gigo/rchardet/charsetprober.rb +0 -76
  16. data/lib/gigo/rchardet/codingstatemachine.rb +0 -66
  17. data/lib/gigo/rchardet/constants.rb +0 -43
  18. data/lib/gigo/rchardet/escprober.rb +0 -90
  19. data/lib/gigo/rchardet/escsm.rb +0 -245
  20. data/lib/gigo/rchardet/eucjpprober.rb +0 -89
  21. data/lib/gigo/rchardet/euckrfreq.rb +0 -598
  22. data/lib/gigo/rchardet/euckrprober.rb +0 -43
  23. data/lib/gigo/rchardet/euctwfreq.rb +0 -431
  24. data/lib/gigo/rchardet/euctwprober.rb +0 -43
  25. data/lib/gigo/rchardet/gb2312freq.rb +0 -475
  26. data/lib/gigo/rchardet/gb2312prober.rb +0 -43
  27. data/lib/gigo/rchardet/hebrewprober.rb +0 -291
  28. data/lib/gigo/rchardet/jisfreq.rb +0 -571
  29. data/lib/gigo/rchardet/jpcntx.rb +0 -230
  30. data/lib/gigo/rchardet/langbulgarianmodel.rb +0 -230
  31. data/lib/gigo/rchardet/langcyrillicmodel.rb +0 -331
  32. data/lib/gigo/rchardet/langgreekmodel.rb +0 -228
  33. data/lib/gigo/rchardet/langhebrewmodel.rb +0 -203
  34. data/lib/gigo/rchardet/langhungarianmodel.rb +0 -227
  35. data/lib/gigo/rchardet/langthaimodel.rb +0 -202
  36. data/lib/gigo/rchardet/latin1prober.rb +0 -148
  37. data/lib/gigo/rchardet/mbcharsetprober.rb +0 -91
  38. data/lib/gigo/rchardet/mbcsgroupprober.rb +0 -48
  39. data/lib/gigo/rchardet/mbcssm.rb +0 -543
  40. data/lib/gigo/rchardet/sbcharsetprober.rb +0 -125
  41. data/lib/gigo/rchardet/sbcsgroupprober.rb +0 -59
  42. data/lib/gigo/rchardet/sjisprober.rb +0 -89
  43. data/lib/gigo/rchardet/universaldetector.rb +0 -169
  44. data/lib/gigo/rchardet/utf8prober.rb +0 -87
  45. data/lib/gigo/transcoders/rchardet.rb +0 -22
@@ -1,148 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- #
14
- # This library is free software; you can redistribute it and/or
15
- # modify it under the terms of the GNU Lesser General Public
16
- # License as published by the Free Software Foundation; either
17
- # version 2.1 of the License, or (at your option) any later version.
18
- #
19
- # This library is distributed in the hope that it will be useful,
20
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
- # Lesser General Public License for more details.
23
- #
24
- # You should have received a copy of the GNU Lesser General Public
25
- # License along with this library; if not, write to the Free Software
26
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
- # 02110-1301 USA
28
- ######################### END LICENSE BLOCK #########################
29
- module GIGO
30
- module CharDet
31
- FREQ_CAT_NUM = 4
32
-
33
- UDF = 0 # undefined
34
- OTH = 1 # other
35
- ASC = 2 # ascii capital letter
36
- ASS = 3 # ascii small letter
37
- ACV = 4 # accent capital vowel
38
- ACO = 5 # accent capital other
39
- ASV = 6 # accent small vowel
40
- ASO = 7 # accent small other
41
- CLASS_NUM = 8 # total classes
42
-
43
- Latin1_CharToClass = [
44
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
45
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
46
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
47
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
48
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
49
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
50
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
51
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
52
- OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
53
- ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
54
- ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
55
- ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
56
- OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
57
- ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
58
- ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
59
- ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
60
- OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
61
- OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
62
- UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
63
- OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
64
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
65
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
66
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
67
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
68
- ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
69
- ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
70
- ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
71
- ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
72
- ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
73
- ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
74
- ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
75
- ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
76
- ]
77
-
78
- # 0 : illegal
79
- # 1 : very unlikely
80
- # 2 : normal
81
- # 3 : very likely
82
- Latin1ClassModel = [
83
- # UDF OTH ASC ASS ACV ACO ASV ASO
84
- 0, 0, 0, 0, 0, 0, 0, 0, # UDF
85
- 0, 3, 3, 3, 3, 3, 3, 3, # OTH
86
- 0, 3, 3, 3, 3, 3, 3, 3, # ASC
87
- 0, 3, 3, 3, 1, 1, 3, 3, # ASS
88
- 0, 3, 3, 3, 1, 2, 1, 2, # ACV
89
- 0, 3, 3, 3, 3, 3, 3, 3, # ACO
90
- 0, 3, 1, 3, 1, 1, 1, 3, # ASV
91
- 0, 3, 1, 3, 1, 1, 3, 3, # ASO
92
- ]
93
-
94
- class Latin1Prober < CharSetProber
95
- def initialize
96
- super
97
- reset()
98
- end
99
-
100
- def reset
101
- @_mLastCharClass = OTH
102
- @_mFreqCounter = [0] * FREQ_CAT_NUM
103
- super
104
- end
105
-
106
- def get_charset_name
107
- return "windows-1252"
108
- end
109
-
110
- def feed(aBuf)
111
- aBuf = filter_with_english_letters(aBuf)
112
- for c in aBuf.split('')
113
- char = c.respond_to?(:bytes) ? c.bytes.first : c[0]
114
- charClass = Latin1_CharToClass[char]
115
- freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
- if freq == 0
117
- @_mState = ENotMe
118
- break
119
- end
120
- @_mFreqCounter[freq] += 1
121
- @_mLastCharClass = charClass
122
- end
123
-
124
- return get_state()
125
- end
126
-
127
- def get_confidence
128
- if get_state() == ENotMe
129
- return 0.01
130
- end
131
-
132
- total = @_mFreqCounter.inject{|a,b| a+b}
133
- if total < 0.01
134
- confidence = 0.0
135
- else
136
- confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
137
- end
138
- if confidence < 0.0
139
- confidence = 0.0
140
- end
141
- # lower the confidence of latin1 so that other, more accurate detectors
142
- # can take priority.
143
- confidence = confidence * 0.5
144
- return confidence
145
- end
146
- end
147
- end
148
- end
@@ -1,91 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- # Proofpoint, Inc.
14
- #
15
- # This library is free software; you can redistribute it and/or
16
- # modify it under the terms of the GNU Lesser General Public
17
- # License as published by the Free Software Foundation; either
18
- # version 2.1 of the License, or (at your option) any later version.
19
- #
20
- # This library is distributed in the hope that it will be useful,
21
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- # Lesser General Public License for more details.
24
- #
25
- # You should have received a copy of the GNU Lesser General Public
26
- # License along with this library; if not, write to the Free Software
27
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
- # 02110-1301 USA
29
- ######################### END LICENSE BLOCK #########################
30
-
31
- module GIGO
32
- module CharDet
33
- class MultiByteCharSetProber < CharSetProber
34
- def initialize
35
- super
36
- @_mDistributionAnalyzer = nil
37
- @_mCodingSM = nil
38
- @_mLastChar = "\x00\x00"
39
- end
40
-
41
- def reset
42
- super
43
- if @_mCodingSM
44
- @_mCodingSM.reset()
45
- end
46
- if @_mDistributionAnalyzer
47
- @_mDistributionAnalyzer.reset()
48
- end
49
- @_mLastChar = "\x00\x00"
50
- end
51
-
52
- def get_charset_name
53
- end
54
-
55
- def feed(aBuf)
56
- aLen = aBuf.length
57
- for i in (0...aLen)
58
- codingState = @_mCodingSM.next_state(aBuf[i..i])
59
- if codingState == EError
60
- $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
61
- @_mState = ENotMe
62
- break
63
- elsif codingState == EItsMe
64
- @_mState = EFoundIt
65
- break
66
- elsif codingState == EStart
67
- charLen = @_mCodingSM.get_current_charlen()
68
- if i == 0
69
- @_mLastChar[1] = aBuf[0..0]
70
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
71
- else
72
- @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
73
- end
74
- end
75
- end
76
- @_mLastChar[0] = aBuf[aLen-1..aLen-1]
77
-
78
- if get_state() == EDetecting
79
- if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
80
- @_mState = EFoundIt
81
- end
82
- end
83
- return get_state()
84
- end
85
-
86
- def get_confidence
87
- return @_mDistributionAnalyzer.get_confidence()
88
- end
89
- end
90
- end
91
- end
@@ -1,48 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- # Proofpoint, Inc.
14
- #
15
- # This library is free software; you can redistribute it and/or
16
- # modify it under the terms of the GNU Lesser General Public
17
- # License as published by the Free Software Foundation; either
18
- # version 2.1 of the License, or (at your option) any later version.
19
- #
20
- # This library is distributed in the hope that it will be useful,
21
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- # Lesser General Public License for more details.
24
- #
25
- # You should have received a copy of the GNU Lesser General Public
26
- # License along with this library; if not, write to the Free Software
27
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
- # 02110-1301 USA
29
- ######################### END LICENSE BLOCK #########################
30
- module GIGO
31
- module CharDet
32
- class MBCSGroupProber < CharSetGroupProber
33
- def initialize
34
- super
35
- @_mProbers = [
36
- UTF8Prober.new,
37
- SJISProber.new,
38
- EUCJPProber.new,
39
- GB2312Prober.new,
40
- EUCKRProber.new,
41
- Big5Prober.new,
42
- EUCTWProber.new
43
- ]
44
- reset()
45
- end
46
- end
47
- end
48
- end
@@ -1,543 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is mozilla.org code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 1998
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- #
13
- # This library is free software; you can redistribute it and/or
14
- # modify it under the terms of the GNU Lesser General Public
15
- # License as published by the Free Software Foundation; either
16
- # version 2.1 of the License, or (at your option) any later version.
17
- #
18
- # This library is distributed in the hope that it will be useful,
19
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
- # Lesser General Public License for more details.
22
- #
23
- # You should have received a copy of the GNU Lesser General Public
24
- # License along with this library; if not, write to the Free Software
25
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
- # 02110-1301 USA
27
- ######################### END LICENSE BLOCK #########################
28
- module GIGO
29
- module CharDet
30
- # BIG5
31
-
32
- BIG5_cls = [
33
- 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
34
- 1,1,1,1,1,1,0,0, # 08 - 0f
35
- 1,1,1,1,1,1,1,1, # 10 - 17
36
- 1,1,1,0,1,1,1,1, # 18 - 1f
37
- 1,1,1,1,1,1,1,1, # 20 - 27
38
- 1,1,1,1,1,1,1,1, # 28 - 2f
39
- 1,1,1,1,1,1,1,1, # 30 - 37
40
- 1,1,1,1,1,1,1,1, # 38 - 3f
41
- 2,2,2,2,2,2,2,2, # 40 - 47
42
- 2,2,2,2,2,2,2,2, # 48 - 4f
43
- 2,2,2,2,2,2,2,2, # 50 - 57
44
- 2,2,2,2,2,2,2,2, # 58 - 5f
45
- 2,2,2,2,2,2,2,2, # 60 - 67
46
- 2,2,2,2,2,2,2,2, # 68 - 6f
47
- 2,2,2,2,2,2,2,2, # 70 - 77
48
- 2,2,2,2,2,2,2,1, # 78 - 7f
49
- 4,4,4,4,4,4,4,4, # 80 - 87
50
- 4,4,4,4,4,4,4,4, # 88 - 8f
51
- 4,4,4,4,4,4,4,4, # 90 - 97
52
- 4,4,4,4,4,4,4,4, # 98 - 9f
53
- 4,3,3,3,3,3,3,3, # a0 - a7
54
- 3,3,3,3,3,3,3,3, # a8 - af
55
- 3,3,3,3,3,3,3,3, # b0 - b7
56
- 3,3,3,3,3,3,3,3, # b8 - bf
57
- 3,3,3,3,3,3,3,3, # c0 - c7
58
- 3,3,3,3,3,3,3,3, # c8 - cf
59
- 3,3,3,3,3,3,3,3, # d0 - d7
60
- 3,3,3,3,3,3,3,3, # d8 - df
61
- 3,3,3,3,3,3,3,3, # e0 - e7
62
- 3,3,3,3,3,3,3,3, # e8 - ef
63
- 3,3,3,3,3,3,3,3, # f0 - f7
64
- 3,3,3,3,3,3,3,0 # f8 - ff
65
- ]
66
-
67
- BIG5_st = [
68
- EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
69
- EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,#08-0f
70
- EError,EStart,EStart,EStart,EStart,EStart,EStart,EStart #10-17
71
- ]
72
-
73
- Big5CharLenTable = [0, 1, 1, 2, 0]
74
-
75
- Big5SMModel = {'classTable' => BIG5_cls,
76
- 'classFactor' => 5,
77
- 'stateTable' => BIG5_st,
78
- 'charLenTable' => Big5CharLenTable,
79
- 'name' => 'Big5'
80
- }
81
-
82
- # EUC-JP
83
-
84
- EUCJP_cls = [
85
- 4,4,4,4,4,4,4,4, # 00 - 07
86
- 4,4,4,4,4,4,5,5, # 08 - 0f
87
- 4,4,4,4,4,4,4,4, # 10 - 17
88
- 4,4,4,5,4,4,4,4, # 18 - 1f
89
- 4,4,4,4,4,4,4,4, # 20 - 27
90
- 4,4,4,4,4,4,4,4, # 28 - 2f
91
- 4,4,4,4,4,4,4,4, # 30 - 37
92
- 4,4,4,4,4,4,4,4, # 38 - 3f
93
- 4,4,4,4,4,4,4,4, # 40 - 47
94
- 4,4,4,4,4,4,4,4, # 48 - 4f
95
- 4,4,4,4,4,4,4,4, # 50 - 57
96
- 4,4,4,4,4,4,4,4, # 58 - 5f
97
- 4,4,4,4,4,4,4,4, # 60 - 67
98
- 4,4,4,4,4,4,4,4, # 68 - 6f
99
- 4,4,4,4,4,4,4,4, # 70 - 77
100
- 4,4,4,4,4,4,4,4, # 78 - 7f
101
- 5,5,5,5,5,5,5,5, # 80 - 87
102
- 5,5,5,5,5,5,1,3, # 88 - 8f
103
- 5,5,5,5,5,5,5,5, # 90 - 97
104
- 5,5,5,5,5,5,5,5, # 98 - 9f
105
- 5,2,2,2,2,2,2,2, # a0 - a7
106
- 2,2,2,2,2,2,2,2, # a8 - af
107
- 2,2,2,2,2,2,2,2, # b0 - b7
108
- 2,2,2,2,2,2,2,2, # b8 - bf
109
- 2,2,2,2,2,2,2,2, # c0 - c7
110
- 2,2,2,2,2,2,2,2, # c8 - cf
111
- 2,2,2,2,2,2,2,2, # d0 - d7
112
- 2,2,2,2,2,2,2,2, # d8 - df
113
- 0,0,0,0,0,0,0,0, # e0 - e7
114
- 0,0,0,0,0,0,0,0, # e8 - ef
115
- 0,0,0,0,0,0,0,0, # f0 - f7
116
- 0,0,0,0,0,0,0,5 # f8 - ff
117
- ]
118
-
119
- EUCJP_st = [
120
- 3, 4, 3, 5,EStart,EError,EError,EError,#00-07
121
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
122
- EItsMe,EItsMe,EStart,EError,EStart,EError,EError,EError,#10-17
123
- EError,EError,EStart,EError,EError,EError, 3,EError,#18-1f
124
- 3,EError,EError,EError,EStart,EStart,EStart,EStart #20-27
125
- ]
126
-
127
- EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
128
-
129
- EUCJPSMModel = {'classTable' => EUCJP_cls,
130
- 'classFactor' => 6,
131
- 'stateTable' => EUCJP_st,
132
- 'charLenTable' => EUCJPCharLenTable,
133
- 'name' => 'EUC-JP'
134
- }
135
-
136
- # EUC-KR
137
-
138
- EUCKR_cls = [
139
- 1,1,1,1,1,1,1,1, # 00 - 07
140
- 1,1,1,1,1,1,0,0, # 08 - 0f
141
- 1,1,1,1,1,1,1,1, # 10 - 17
142
- 1,1,1,0,1,1,1,1, # 18 - 1f
143
- 1,1,1,1,1,1,1,1, # 20 - 27
144
- 1,1,1,1,1,1,1,1, # 28 - 2f
145
- 1,1,1,1,1,1,1,1, # 30 - 37
146
- 1,1,1,1,1,1,1,1, # 38 - 3f
147
- 1,1,1,1,1,1,1,1, # 40 - 47
148
- 1,1,1,1,1,1,1,1, # 48 - 4f
149
- 1,1,1,1,1,1,1,1, # 50 - 57
150
- 1,1,1,1,1,1,1,1, # 58 - 5f
151
- 1,1,1,1,1,1,1,1, # 60 - 67
152
- 1,1,1,1,1,1,1,1, # 68 - 6f
153
- 1,1,1,1,1,1,1,1, # 70 - 77
154
- 1,1,1,1,1,1,1,1, # 78 - 7f
155
- 0,0,0,0,0,0,0,0, # 80 - 87
156
- 0,0,0,0,0,0,0,0, # 88 - 8f
157
- 0,0,0,0,0,0,0,0, # 90 - 97
158
- 0,0,0,0,0,0,0,0, # 98 - 9f
159
- 0,2,2,2,2,2,2,2, # a0 - a7
160
- 2,2,2,2,2,3,3,3, # a8 - af
161
- 2,2,2,2,2,2,2,2, # b0 - b7
162
- 2,2,2,2,2,2,2,2, # b8 - bf
163
- 2,2,2,2,2,2,2,2, # c0 - c7
164
- 2,3,2,2,2,2,2,2, # c8 - cf
165
- 2,2,2,2,2,2,2,2, # d0 - d7
166
- 2,2,2,2,2,2,2,2, # d8 - df
167
- 2,2,2,2,2,2,2,2, # e0 - e7
168
- 2,2,2,2,2,2,2,2, # e8 - ef
169
- 2,2,2,2,2,2,2,2, # f0 - f7
170
- 2,2,2,2,2,2,2,0 # f8 - ff
171
- ]
172
-
173
- EUCKR_st = [
174
- EError,EStart, 3,EError,EError,EError,EError,EError,#00-07
175
- EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,EStart#08-0f
176
- ]
177
-
178
- EUCKRCharLenTable = [0, 1, 2, 0]
179
-
180
- EUCKRSMModel = {'classTable' => EUCKR_cls,
181
- 'classFactor' => 4,
182
- 'stateTable' => EUCKR_st,
183
- 'charLenTable' => EUCKRCharLenTable,
184
- 'name' => 'EUC-KR'
185
- }
186
-
187
- # EUC-TW
188
-
189
- EUCTW_cls = [
190
- 2,2,2,2,2,2,2,2, # 00 - 07
191
- 2,2,2,2,2,2,0,0, # 08 - 0f
192
- 2,2,2,2,2,2,2,2, # 10 - 17
193
- 2,2,2,0,2,2,2,2, # 18 - 1f
194
- 2,2,2,2,2,2,2,2, # 20 - 27
195
- 2,2,2,2,2,2,2,2, # 28 - 2f
196
- 2,2,2,2,2,2,2,2, # 30 - 37
197
- 2,2,2,2,2,2,2,2, # 38 - 3f
198
- 2,2,2,2,2,2,2,2, # 40 - 47
199
- 2,2,2,2,2,2,2,2, # 48 - 4f
200
- 2,2,2,2,2,2,2,2, # 50 - 57
201
- 2,2,2,2,2,2,2,2, # 58 - 5f
202
- 2,2,2,2,2,2,2,2, # 60 - 67
203
- 2,2,2,2,2,2,2,2, # 68 - 6f
204
- 2,2,2,2,2,2,2,2, # 70 - 77
205
- 2,2,2,2,2,2,2,2, # 78 - 7f
206
- 0,0,0,0,0,0,0,0, # 80 - 87
207
- 0,0,0,0,0,0,6,0, # 88 - 8f
208
- 0,0,0,0,0,0,0,0, # 90 - 97
209
- 0,0,0,0,0,0,0,0, # 98 - 9f
210
- 0,3,4,4,4,4,4,4, # a0 - a7
211
- 5,5,1,1,1,1,1,1, # a8 - af
212
- 1,1,1,1,1,1,1,1, # b0 - b7
213
- 1,1,1,1,1,1,1,1, # b8 - bf
214
- 1,1,3,1,3,3,3,3, # c0 - c7
215
- 3,3,3,3,3,3,3,3, # c8 - cf
216
- 3,3,3,3,3,3,3,3, # d0 - d7
217
- 3,3,3,3,3,3,3,3, # d8 - df
218
- 3,3,3,3,3,3,3,3, # e0 - e7
219
- 3,3,3,3,3,3,3,3, # e8 - ef
220
- 3,3,3,3,3,3,3,3, # f0 - f7
221
- 3,3,3,3,3,3,3,0 # f8 - ff
222
- ]
223
-
224
- EUCTW_st = [
225
- EError,EError,EStart, 3, 3, 3, 4,EError,#00-07
226
- EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
227
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EStart,EError,#10-17
228
- EStart,EStart,EStart,EError,EError,EError,EError,EError,#18-1f
229
- 5,EError,EError,EError,EStart,EError,EStart,EStart,#20-27
230
- EStart,EError,EStart,EStart,EStart,EStart,EStart,EStart #28-2f
231
- ]
232
-
233
- EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
234
-
235
- EUCTWSMModel = {'classTable' => EUCTW_cls,
236
- 'classFactor' => 7,
237
- 'stateTable' => EUCTW_st,
238
- 'charLenTable' => EUCTWCharLenTable,
239
- 'name' => 'x-euc-tw'
240
- }
241
-
242
- # GB2312
243
-
244
- GB2312_cls = [
245
- 1,1,1,1,1,1,1,1, # 00 - 07
246
- 1,1,1,1,1,1,0,0, # 08 - 0f
247
- 1,1,1,1,1,1,1,1, # 10 - 17
248
- 1,1,1,0,1,1,1,1, # 18 - 1f
249
- 1,1,1,1,1,1,1,1, # 20 - 27
250
- 1,1,1,1,1,1,1,1, # 28 - 2f
251
- 3,3,3,3,3,3,3,3, # 30 - 37
252
- 3,3,1,1,1,1,1,1, # 38 - 3f
253
- 2,2,2,2,2,2,2,2, # 40 - 47
254
- 2,2,2,2,2,2,2,2, # 48 - 4f
255
- 2,2,2,2,2,2,2,2, # 50 - 57
256
- 2,2,2,2,2,2,2,2, # 58 - 5f
257
- 2,2,2,2,2,2,2,2, # 60 - 67
258
- 2,2,2,2,2,2,2,2, # 68 - 6f
259
- 2,2,2,2,2,2,2,2, # 70 - 77
260
- 2,2,2,2,2,2,2,4, # 78 - 7f
261
- 5,6,6,6,6,6,6,6, # 80 - 87
262
- 6,6,6,6,6,6,6,6, # 88 - 8f
263
- 6,6,6,6,6,6,6,6, # 90 - 97
264
- 6,6,6,6,6,6,6,6, # 98 - 9f
265
- 6,6,6,6,6,6,6,6, # a0 - a7
266
- 6,6,6,6,6,6,6,6, # a8 - af
267
- 6,6,6,6,6,6,6,6, # b0 - b7
268
- 6,6,6,6,6,6,6,6, # b8 - bf
269
- 6,6,6,6,6,6,6,6, # c0 - c7
270
- 6,6,6,6,6,6,6,6, # c8 - cf
271
- 6,6,6,6,6,6,6,6, # d0 - d7
272
- 6,6,6,6,6,6,6,6, # d8 - df
273
- 6,6,6,6,6,6,6,6, # e0 - e7
274
- 6,6,6,6,6,6,6,6, # e8 - ef
275
- 6,6,6,6,6,6,6,6, # f0 - f7
276
- 6,6,6,6,6,6,6,0 # f8 - ff
277
- ]
278
-
279
- GB2312_st = [
280
- EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
281
- EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
282
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,#10-17
283
- 4,EError,EStart,EStart,EError,EError,EError,EError,#18-1f
284
- EError,EError, 5,EError,EError,EError,EItsMe,EError,#20-27
285
- EError,EError,EStart,EStart,EStart,EStart,EStart,EStart#28-2f
286
- ]
287
-
288
- # To be accurate, the length of class 6 can be either 2 or 4.
289
- # But it is not necessary to discriminate between the two since
290
- # it is used for frequency analysis only, and we are validating
291
- # each code range there as well. So it is safe to set it to be
292
- # 2 here.
293
- GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
294
-
295
- GB2312SMModel = {'classTable' => GB2312_cls,
296
- 'classFactor' => 7,
297
- 'stateTable' => GB2312_st,
298
- 'charLenTable' => GB2312CharLenTable,
299
- 'name' => 'GB2312'
300
- }
301
-
302
- # Shift_JIS
303
-
304
- SJIS_cls = [
305
- 1,1,1,1,1,1,1,1, # 00 - 07
306
- 1,1,1,1,1,1,0,0, # 08 - 0f
307
- 1,1,1,1,1,1,1,1, # 10 - 17
308
- 1,1,1,0,1,1,1,1, # 18 - 1f
309
- 1,1,1,1,1,1,1,1, # 20 - 27
310
- 1,1,1,1,1,1,1,1, # 28 - 2f
311
- 1,1,1,1,1,1,1,1, # 30 - 37
312
- 1,1,1,1,1,1,1,1, # 38 - 3f
313
- 2,2,2,2,2,2,2,2, # 40 - 47
314
- 2,2,2,2,2,2,2,2, # 48 - 4f
315
- 2,2,2,2,2,2,2,2, # 50 - 57
316
- 2,2,2,2,2,2,2,2, # 58 - 5f
317
- 2,2,2,2,2,2,2,2, # 60 - 67
318
- 2,2,2,2,2,2,2,2, # 68 - 6f
319
- 2,2,2,2,2,2,2,2, # 70 - 77
320
- 2,2,2,2,2,2,2,1, # 78 - 7f
321
- 3,3,3,3,3,3,3,3, # 80 - 87
322
- 3,3,3,3,3,3,3,3, # 88 - 8f
323
- 3,3,3,3,3,3,3,3, # 90 - 97
324
- 3,3,3,3,3,3,3,3, # 98 - 9f
325
- #0xa0 is illegal in sjis encoding, but some pages do
326
- #contain such a byte. We need to be more error-forgiving.
327
- 2,2,2,2,2,2,2,2, # a0 - a7
328
- 2,2,2,2,2,2,2,2, # a8 - af
329
- 2,2,2,2,2,2,2,2, # b0 - b7
330
- 2,2,2,2,2,2,2,2, # b8 - bf
331
- 2,2,2,2,2,2,2,2, # c0 - c7
332
- 2,2,2,2,2,2,2,2, # c8 - cf
333
- 2,2,2,2,2,2,2,2, # d0 - d7
334
- 2,2,2,2,2,2,2,2, # d8 - df
335
- 3,3,3,3,3,3,3,3, # e0 - e7
336
- 3,3,3,3,3,4,4,4, # e8 - ef
337
- 4,4,4,4,4,4,4,4, # f0 - f7
338
- 4,4,4,4,4,0,0,0 # f8 - ff
339
- ]
340
-
341
- SJIS_st = [
342
- EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
343
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
344
- EItsMe,EItsMe,EError,EError,EStart,EStart,EStart,EStart#10-17
345
- ]
346
-
347
- SJISCharLenTable = [0, 1, 1, 2, 0, 0]
348
-
349
- SJISSMModel = {'classTable' => SJIS_cls,
350
- 'classFactor' => 6,
351
- 'stateTable' => SJIS_st,
352
- 'charLenTable' => SJISCharLenTable,
353
- 'name' => 'Shift_JIS'
354
- }
355
-
356
- # UCS2-BE
357
-
358
- UCS2BE_cls = [
359
- 0,0,0,0,0,0,0,0, # 00 - 07
360
- 0,0,1,0,0,2,0,0, # 08 - 0f
361
- 0,0,0,0,0,0,0,0, # 10 - 17
362
- 0,0,0,3,0,0,0,0, # 18 - 1f
363
- 0,0,0,0,0,0,0,0, # 20 - 27
364
- 0,3,3,3,3,3,0,0, # 28 - 2f
365
- 0,0,0,0,0,0,0,0, # 30 - 37
366
- 0,0,0,0,0,0,0,0, # 38 - 3f
367
- 0,0,0,0,0,0,0,0, # 40 - 47
368
- 0,0,0,0,0,0,0,0, # 48 - 4f
369
- 0,0,0,0,0,0,0,0, # 50 - 57
370
- 0,0,0,0,0,0,0,0, # 58 - 5f
371
- 0,0,0,0,0,0,0,0, # 60 - 67
372
- 0,0,0,0,0,0,0,0, # 68 - 6f
373
- 0,0,0,0,0,0,0,0, # 70 - 77
374
- 0,0,0,0,0,0,0,0, # 78 - 7f
375
- 0,0,0,0,0,0,0,0, # 80 - 87
376
- 0,0,0,0,0,0,0,0, # 88 - 8f
377
- 0,0,0,0,0,0,0,0, # 90 - 97
378
- 0,0,0,0,0,0,0,0, # 98 - 9f
379
- 0,0,0,0,0,0,0,0, # a0 - a7
380
- 0,0,0,0,0,0,0,0, # a8 - af
381
- 0,0,0,0,0,0,0,0, # b0 - b7
382
- 0,0,0,0,0,0,0,0, # b8 - bf
383
- 0,0,0,0,0,0,0,0, # c0 - c7
384
- 0,0,0,0,0,0,0,0, # c8 - cf
385
- 0,0,0,0,0,0,0,0, # d0 - d7
386
- 0,0,0,0,0,0,0,0, # d8 - df
387
- 0,0,0,0,0,0,0,0, # e0 - e7
388
- 0,0,0,0,0,0,0,0, # e8 - ef
389
- 0,0,0,0,0,0,0,0, # f0 - f7
390
- 0,0,0,0,0,0,4,5 # f8 - ff
391
- ]
392
-
393
- UCS2BE_st = [
394
- 5, 7, 7,EError, 4, 3,EError,EError,#00-07
395
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
396
- EItsMe,EItsMe, 6, 6, 6, 6,EError,EError,#10-17
397
- 6, 6, 6, 6, 6,EItsMe, 6, 6,#18-1f
398
- 6, 6, 6, 6, 5, 7, 7,EError,#20-27
399
- 5, 8, 6, 6,EError, 6, 6, 6,#28-2f
400
- 6, 6, 6, 6,EError,EError,EStart,EStart#30-37
401
- ]
402
-
403
- UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
404
-
405
- UCS2BESMModel = {'classTable' => UCS2BE_cls,
406
- 'classFactor' => 6,
407
- 'stateTable' => UCS2BE_st,
408
- 'charLenTable' => UCS2BECharLenTable,
409
- 'name' => 'UTF-16BE'
410
- }
411
-
412
- # UCS2-LE
413
-
414
- UCS2LE_cls = [
415
- 0,0,0,0,0,0,0,0, # 00 - 07
416
- 0,0,1,0,0,2,0,0, # 08 - 0f
417
- 0,0,0,0,0,0,0,0, # 10 - 17
418
- 0,0,0,3,0,0,0,0, # 18 - 1f
419
- 0,0,0,0,0,0,0,0, # 20 - 27
420
- 0,3,3,3,3,3,0,0, # 28 - 2f
421
- 0,0,0,0,0,0,0,0, # 30 - 37
422
- 0,0,0,0,0,0,0,0, # 38 - 3f
423
- 0,0,0,0,0,0,0,0, # 40 - 47
424
- 0,0,0,0,0,0,0,0, # 48 - 4f
425
- 0,0,0,0,0,0,0,0, # 50 - 57
426
- 0,0,0,0,0,0,0,0, # 58 - 5f
427
- 0,0,0,0,0,0,0,0, # 60 - 67
428
- 0,0,0,0,0,0,0,0, # 68 - 6f
429
- 0,0,0,0,0,0,0,0, # 70 - 77
430
- 0,0,0,0,0,0,0,0, # 78 - 7f
431
- 0,0,0,0,0,0,0,0, # 80 - 87
432
- 0,0,0,0,0,0,0,0, # 88 - 8f
433
- 0,0,0,0,0,0,0,0, # 90 - 97
434
- 0,0,0,0,0,0,0,0, # 98 - 9f
435
- 0,0,0,0,0,0,0,0, # a0 - a7
436
- 0,0,0,0,0,0,0,0, # a8 - af
437
- 0,0,0,0,0,0,0,0, # b0 - b7
438
- 0,0,0,0,0,0,0,0, # b8 - bf
439
- 0,0,0,0,0,0,0,0, # c0 - c7
440
- 0,0,0,0,0,0,0,0, # c8 - cf
441
- 0,0,0,0,0,0,0,0, # d0 - d7
442
- 0,0,0,0,0,0,0,0, # d8 - df
443
- 0,0,0,0,0,0,0,0, # e0 - e7
444
- 0,0,0,0,0,0,0,0, # e8 - ef
445
- 0,0,0,0,0,0,0,0, # f0 - f7
446
- 0,0,0,0,0,0,4,5 # f8 - ff
447
- ]
448
-
449
- UCS2LE_st = [
450
- 6, 6, 7, 6, 4, 3,EError,EError,#00-07
451
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
452
- EItsMe,EItsMe, 5, 5, 5,EError,EItsMe,EError,#10-17
453
- 5, 5, 5,EError, 5,EError, 6, 6,#18-1f
454
- 7, 6, 8, 8, 5, 5, 5,EError,#20-27
455
- 5, 5, 5,EError,EError,EError, 5, 5,#28-2f
456
- 5, 5, 5,EError, 5,EError,EStart,EStart#30-37
457
- ]
458
-
459
- UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
460
-
461
- UCS2LESMModel = {'classTable' => UCS2LE_cls,
462
- 'classFactor' => 6,
463
- 'stateTable' => UCS2LE_st,
464
- 'charLenTable' => UCS2LECharLenTable,
465
- 'name' => 'UTF-16LE'
466
- }
467
-
468
- # UTF-8
469
-
470
- UTF8_cls = [
471
- 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
472
- 1,1,1,1,1,1,0,0, # 08 - 0f
473
- 1,1,1,1,1,1,1,1, # 10 - 17
474
- 1,1,1,0,1,1,1,1, # 18 - 1f
475
- 1,1,1,1,1,1,1,1, # 20 - 27
476
- 1,1,1,1,1,1,1,1, # 28 - 2f
477
- 1,1,1,1,1,1,1,1, # 30 - 37
478
- 1,1,1,1,1,1,1,1, # 38 - 3f
479
- 1,1,1,1,1,1,1,1, # 40 - 47
480
- 1,1,1,1,1,1,1,1, # 48 - 4f
481
- 1,1,1,1,1,1,1,1, # 50 - 57
482
- 1,1,1,1,1,1,1,1, # 58 - 5f
483
- 1,1,1,1,1,1,1,1, # 60 - 67
484
- 1,1,1,1,1,1,1,1, # 68 - 6f
485
- 1,1,1,1,1,1,1,1, # 70 - 77
486
- 1,1,1,1,1,1,1,1, # 78 - 7f
487
- 2,2,2,2,3,3,3,3, # 80 - 87
488
- 4,4,4,4,4,4,4,4, # 88 - 8f
489
- 4,4,4,4,4,4,4,4, # 90 - 97
490
- 4,4,4,4,4,4,4,4, # 98 - 9f
491
- 5,5,5,5,5,5,5,5, # a0 - a7
492
- 5,5,5,5,5,5,5,5, # a8 - af
493
- 5,5,5,5,5,5,5,5, # b0 - b7
494
- 5,5,5,5,5,5,5,5, # b8 - bf
495
- 0,0,6,6,6,6,6,6, # c0 - c7
496
- 6,6,6,6,6,6,6,6, # c8 - cf
497
- 6,6,6,6,6,6,6,6, # d0 - d7
498
- 6,6,6,6,6,6,6,6, # d8 - df
499
- 7,8,8,8,8,8,8,8, # e0 - e7
500
- 8,8,8,8,8,9,8,8, # e8 - ef
501
- 10,11,11,11,11,11,11,11, # f0 - f7
502
- 12,13,13,13,14,15,0,0 # f8 - ff
503
- ]
504
-
505
- UTF8_st = [
506
- EError,EStart,EError,EError,EError,EError, 12, 10,#00-07
507
- 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
508
- EError,EError,EError,EError,EError,EError,EError,EError,#10-17
509
- EError,EError,EError,EError,EError,EError,EError,EError,#18-1f
510
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#20-27
511
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#28-2f
512
- EError,EError, 5, 5, 5, 5,EError,EError,#30-37
513
- EError,EError,EError,EError,EError,EError,EError,EError,#38-3f
514
- EError,EError,EError, 5, 5, 5,EError,EError,#40-47
515
- EError,EError,EError,EError,EError,EError,EError,EError,#48-4f
516
- EError,EError, 7, 7, 7, 7,EError,EError,#50-57
517
- EError,EError,EError,EError,EError,EError,EError,EError,#58-5f
518
- EError,EError,EError,EError, 7, 7,EError,EError,#60-67
519
- EError,EError,EError,EError,EError,EError,EError,EError,#68-6f
520
- EError,EError, 9, 9, 9, 9,EError,EError,#70-77
521
- EError,EError,EError,EError,EError,EError,EError,EError,#78-7f
522
- EError,EError,EError,EError,EError, 9,EError,EError,#80-87
523
- EError,EError,EError,EError,EError,EError,EError,EError,#88-8f
524
- EError,EError, 12, 12, 12, 12,EError,EError,#90-97
525
- EError,EError,EError,EError,EError,EError,EError,EError,#98-9f
526
- EError,EError,EError,EError,EError, 12,EError,EError,#a0-a7
527
- EError,EError,EError,EError,EError,EError,EError,EError,#a8-af
528
- EError,EError, 12, 12, 12,EError,EError,EError,#b0-b7
529
- EError,EError,EError,EError,EError,EError,EError,EError,#b8-bf
530
- EError,EError,EStart,EStart,EStart,EStart,EError,EError,#c0-c7
531
- EError,EError,EError,EError,EError,EError,EError,EError#c8-cf
532
- ]
533
-
534
- UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
535
-
536
- UTF8SMModel = {'classTable' => UTF8_cls,
537
- 'classFactor' => 16,
538
- 'stateTable' => UTF8_st,
539
- 'charLenTable' => UTF8CharLenTable,
540
- 'name' => 'UTF-8'
541
- }
542
- end
543
- end