gigo 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/README.md +2 -8
  2. data/gemfiles/activesupport30.gemfile.lock +1 -1
  3. data/gemfiles/activesupport31.gemfile.lock +1 -1
  4. data/gemfiles/activesupport32.gemfile.lock +1 -1
  5. data/gemfiles/activesupport40.gemfile.lock +2 -2
  6. data/lib/gigo.rb +0 -1
  7. data/lib/gigo/version.rb +1 -1
  8. data/test/cases/gigo_test.rb +0 -1
  9. metadata +3 -39
  10. data/lib/gigo/rchardet.rb +0 -67
  11. data/lib/gigo/rchardet/big5freq.rb +0 -927
  12. data/lib/gigo/rchardet/big5prober.rb +0 -43
  13. data/lib/gigo/rchardet/chardistribution.rb +0 -238
  14. data/lib/gigo/rchardet/charsetgroupprober.rb +0 -113
  15. data/lib/gigo/rchardet/charsetprober.rb +0 -76
  16. data/lib/gigo/rchardet/codingstatemachine.rb +0 -66
  17. data/lib/gigo/rchardet/constants.rb +0 -43
  18. data/lib/gigo/rchardet/escprober.rb +0 -90
  19. data/lib/gigo/rchardet/escsm.rb +0 -245
  20. data/lib/gigo/rchardet/eucjpprober.rb +0 -89
  21. data/lib/gigo/rchardet/euckrfreq.rb +0 -598
  22. data/lib/gigo/rchardet/euckrprober.rb +0 -43
  23. data/lib/gigo/rchardet/euctwfreq.rb +0 -431
  24. data/lib/gigo/rchardet/euctwprober.rb +0 -43
  25. data/lib/gigo/rchardet/gb2312freq.rb +0 -475
  26. data/lib/gigo/rchardet/gb2312prober.rb +0 -43
  27. data/lib/gigo/rchardet/hebrewprober.rb +0 -291
  28. data/lib/gigo/rchardet/jisfreq.rb +0 -571
  29. data/lib/gigo/rchardet/jpcntx.rb +0 -230
  30. data/lib/gigo/rchardet/langbulgarianmodel.rb +0 -230
  31. data/lib/gigo/rchardet/langcyrillicmodel.rb +0 -331
  32. data/lib/gigo/rchardet/langgreekmodel.rb +0 -228
  33. data/lib/gigo/rchardet/langhebrewmodel.rb +0 -203
  34. data/lib/gigo/rchardet/langhungarianmodel.rb +0 -227
  35. data/lib/gigo/rchardet/langthaimodel.rb +0 -202
  36. data/lib/gigo/rchardet/latin1prober.rb +0 -148
  37. data/lib/gigo/rchardet/mbcharsetprober.rb +0 -91
  38. data/lib/gigo/rchardet/mbcsgroupprober.rb +0 -48
  39. data/lib/gigo/rchardet/mbcssm.rb +0 -543
  40. data/lib/gigo/rchardet/sbcharsetprober.rb +0 -125
  41. data/lib/gigo/rchardet/sbcsgroupprober.rb +0 -59
  42. data/lib/gigo/rchardet/sjisprober.rb +0 -89
  43. data/lib/gigo/rchardet/universaldetector.rb +0 -169
  44. data/lib/gigo/rchardet/utf8prober.rb +0 -87
  45. data/lib/gigo/transcoders/rchardet.rb +0 -22
@@ -1,148 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- #
14
- # This library is free software; you can redistribute it and/or
15
- # modify it under the terms of the GNU Lesser General Public
16
- # License as published by the Free Software Foundation; either
17
- # version 2.1 of the License, or (at your option) any later version.
18
- #
19
- # This library is distributed in the hope that it will be useful,
20
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
- # Lesser General Public License for more details.
23
- #
24
- # You should have received a copy of the GNU Lesser General Public
25
- # License along with this library; if not, write to the Free Software
26
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
- # 02110-1301 USA
28
- ######################### END LICENSE BLOCK #########################
29
- module GIGO
30
- module CharDet
31
- FREQ_CAT_NUM = 4
32
-
33
- UDF = 0 # undefined
34
- OTH = 1 # other
35
- ASC = 2 # ascii capital letter
36
- ASS = 3 # ascii small letter
37
- ACV = 4 # accent capital vowel
38
- ACO = 5 # accent capital other
39
- ASV = 6 # accent small vowel
40
- ASO = 7 # accent small other
41
- CLASS_NUM = 8 # total classes
42
-
43
- Latin1_CharToClass = [
44
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
45
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
46
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
47
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
48
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
49
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
50
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
51
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
52
- OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
53
- ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
54
- ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
55
- ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
56
- OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
57
- ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
58
- ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
59
- ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
60
- OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
61
- OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
62
- UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
63
- OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
64
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
65
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
66
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
67
- OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
68
- ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
69
- ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
70
- ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
71
- ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
72
- ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
73
- ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
74
- ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
75
- ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
76
- ]
77
-
78
- # 0 : illegal
79
- # 1 : very unlikely
80
- # 2 : normal
81
- # 3 : very likely
82
- Latin1ClassModel = [
83
- # UDF OTH ASC ASS ACV ACO ASV ASO
84
- 0, 0, 0, 0, 0, 0, 0, 0, # UDF
85
- 0, 3, 3, 3, 3, 3, 3, 3, # OTH
86
- 0, 3, 3, 3, 3, 3, 3, 3, # ASC
87
- 0, 3, 3, 3, 1, 1, 3, 3, # ASS
88
- 0, 3, 3, 3, 1, 2, 1, 2, # ACV
89
- 0, 3, 3, 3, 3, 3, 3, 3, # ACO
90
- 0, 3, 1, 3, 1, 1, 1, 3, # ASV
91
- 0, 3, 1, 3, 1, 1, 3, 3, # ASO
92
- ]
93
-
94
- class Latin1Prober < CharSetProber
95
- def initialize
96
- super
97
- reset()
98
- end
99
-
100
- def reset
101
- @_mLastCharClass = OTH
102
- @_mFreqCounter = [0] * FREQ_CAT_NUM
103
- super
104
- end
105
-
106
- def get_charset_name
107
- return "windows-1252"
108
- end
109
-
110
- def feed(aBuf)
111
- aBuf = filter_with_english_letters(aBuf)
112
- for c in aBuf.split('')
113
- char = c.respond_to?(:bytes) ? c.bytes.first : c[0]
114
- charClass = Latin1_CharToClass[char]
115
- freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
- if freq == 0
117
- @_mState = ENotMe
118
- break
119
- end
120
- @_mFreqCounter[freq] += 1
121
- @_mLastCharClass = charClass
122
- end
123
-
124
- return get_state()
125
- end
126
-
127
- def get_confidence
128
- if get_state() == ENotMe
129
- return 0.01
130
- end
131
-
132
- total = @_mFreqCounter.inject{|a,b| a+b}
133
- if total < 0.01
134
- confidence = 0.0
135
- else
136
- confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
137
- end
138
- if confidence < 0.0
139
- confidence = 0.0
140
- end
141
- # lower the confidence of latin1 so that other, more accurate detectors
142
- # can take priority.
143
- confidence = confidence * 0.5
144
- return confidence
145
- end
146
- end
147
- end
148
- end
@@ -1,91 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- # Proofpoint, Inc.
14
- #
15
- # This library is free software; you can redistribute it and/or
16
- # modify it under the terms of the GNU Lesser General Public
17
- # License as published by the Free Software Foundation; either
18
- # version 2.1 of the License, or (at your option) any later version.
19
- #
20
- # This library is distributed in the hope that it will be useful,
21
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- # Lesser General Public License for more details.
24
- #
25
- # You should have received a copy of the GNU Lesser General Public
26
- # License along with this library; if not, write to the Free Software
27
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
- # 02110-1301 USA
29
- ######################### END LICENSE BLOCK #########################
30
-
31
- module GIGO
32
- module CharDet
33
- class MultiByteCharSetProber < CharSetProber
34
- def initialize
35
- super
36
- @_mDistributionAnalyzer = nil
37
- @_mCodingSM = nil
38
- @_mLastChar = "\x00\x00"
39
- end
40
-
41
- def reset
42
- super
43
- if @_mCodingSM
44
- @_mCodingSM.reset()
45
- end
46
- if @_mDistributionAnalyzer
47
- @_mDistributionAnalyzer.reset()
48
- end
49
- @_mLastChar = "\x00\x00"
50
- end
51
-
52
- def get_charset_name
53
- end
54
-
55
- def feed(aBuf)
56
- aLen = aBuf.length
57
- for i in (0...aLen)
58
- codingState = @_mCodingSM.next_state(aBuf[i..i])
59
- if codingState == EError
60
- $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
61
- @_mState = ENotMe
62
- break
63
- elsif codingState == EItsMe
64
- @_mState = EFoundIt
65
- break
66
- elsif codingState == EStart
67
- charLen = @_mCodingSM.get_current_charlen()
68
- if i == 0
69
- @_mLastChar[1] = aBuf[0..0]
70
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
71
- else
72
- @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
73
- end
74
- end
75
- end
76
- @_mLastChar[0] = aBuf[aLen-1..aLen-1]
77
-
78
- if get_state() == EDetecting
79
- if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
80
- @_mState = EFoundIt
81
- end
82
- end
83
- return get_state()
84
- end
85
-
86
- def get_confidence
87
- return @_mDistributionAnalyzer.get_confidence()
88
- end
89
- end
90
- end
91
- end
@@ -1,48 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- # Proofpoint, Inc.
14
- #
15
- # This library is free software; you can redistribute it and/or
16
- # modify it under the terms of the GNU Lesser General Public
17
- # License as published by the Free Software Foundation; either
18
- # version 2.1 of the License, or (at your option) any later version.
19
- #
20
- # This library is distributed in the hope that it will be useful,
21
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- # Lesser General Public License for more details.
24
- #
25
- # You should have received a copy of the GNU Lesser General Public
26
- # License along with this library; if not, write to the Free Software
27
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
- # 02110-1301 USA
29
- ######################### END LICENSE BLOCK #########################
30
- module GIGO
31
- module CharDet
32
- class MBCSGroupProber < CharSetGroupProber
33
- def initialize
34
- super
35
- @_mProbers = [
36
- UTF8Prober.new,
37
- SJISProber.new,
38
- EUCJPProber.new,
39
- GB2312Prober.new,
40
- EUCKRProber.new,
41
- Big5Prober.new,
42
- EUCTWProber.new
43
- ]
44
- reset()
45
- end
46
- end
47
- end
48
- end
@@ -1,543 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is mozilla.org code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 1998
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- #
13
- # This library is free software; you can redistribute it and/or
14
- # modify it under the terms of the GNU Lesser General Public
15
- # License as published by the Free Software Foundation; either
16
- # version 2.1 of the License, or (at your option) any later version.
17
- #
18
- # This library is distributed in the hope that it will be useful,
19
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
- # Lesser General Public License for more details.
22
- #
23
- # You should have received a copy of the GNU Lesser General Public
24
- # License along with this library; if not, write to the Free Software
25
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
- # 02110-1301 USA
27
- ######################### END LICENSE BLOCK #########################
28
- module GIGO
29
- module CharDet
30
- # BIG5
31
-
32
- BIG5_cls = [
33
- 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
34
- 1,1,1,1,1,1,0,0, # 08 - 0f
35
- 1,1,1,1,1,1,1,1, # 10 - 17
36
- 1,1,1,0,1,1,1,1, # 18 - 1f
37
- 1,1,1,1,1,1,1,1, # 20 - 27
38
- 1,1,1,1,1,1,1,1, # 28 - 2f
39
- 1,1,1,1,1,1,1,1, # 30 - 37
40
- 1,1,1,1,1,1,1,1, # 38 - 3f
41
- 2,2,2,2,2,2,2,2, # 40 - 47
42
- 2,2,2,2,2,2,2,2, # 48 - 4f
43
- 2,2,2,2,2,2,2,2, # 50 - 57
44
- 2,2,2,2,2,2,2,2, # 58 - 5f
45
- 2,2,2,2,2,2,2,2, # 60 - 67
46
- 2,2,2,2,2,2,2,2, # 68 - 6f
47
- 2,2,2,2,2,2,2,2, # 70 - 77
48
- 2,2,2,2,2,2,2,1, # 78 - 7f
49
- 4,4,4,4,4,4,4,4, # 80 - 87
50
- 4,4,4,4,4,4,4,4, # 88 - 8f
51
- 4,4,4,4,4,4,4,4, # 90 - 97
52
- 4,4,4,4,4,4,4,4, # 98 - 9f
53
- 4,3,3,3,3,3,3,3, # a0 - a7
54
- 3,3,3,3,3,3,3,3, # a8 - af
55
- 3,3,3,3,3,3,3,3, # b0 - b7
56
- 3,3,3,3,3,3,3,3, # b8 - bf
57
- 3,3,3,3,3,3,3,3, # c0 - c7
58
- 3,3,3,3,3,3,3,3, # c8 - cf
59
- 3,3,3,3,3,3,3,3, # d0 - d7
60
- 3,3,3,3,3,3,3,3, # d8 - df
61
- 3,3,3,3,3,3,3,3, # e0 - e7
62
- 3,3,3,3,3,3,3,3, # e8 - ef
63
- 3,3,3,3,3,3,3,3, # f0 - f7
64
- 3,3,3,3,3,3,3,0 # f8 - ff
65
- ]
66
-
67
- BIG5_st = [
68
- EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
69
- EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,#08-0f
70
- EError,EStart,EStart,EStart,EStart,EStart,EStart,EStart #10-17
71
- ]
72
-
73
- Big5CharLenTable = [0, 1, 1, 2, 0]
74
-
75
- Big5SMModel = {'classTable' => BIG5_cls,
76
- 'classFactor' => 5,
77
- 'stateTable' => BIG5_st,
78
- 'charLenTable' => Big5CharLenTable,
79
- 'name' => 'Big5'
80
- }
81
-
82
- # EUC-JP
83
-
84
- EUCJP_cls = [
85
- 4,4,4,4,4,4,4,4, # 00 - 07
86
- 4,4,4,4,4,4,5,5, # 08 - 0f
87
- 4,4,4,4,4,4,4,4, # 10 - 17
88
- 4,4,4,5,4,4,4,4, # 18 - 1f
89
- 4,4,4,4,4,4,4,4, # 20 - 27
90
- 4,4,4,4,4,4,4,4, # 28 - 2f
91
- 4,4,4,4,4,4,4,4, # 30 - 37
92
- 4,4,4,4,4,4,4,4, # 38 - 3f
93
- 4,4,4,4,4,4,4,4, # 40 - 47
94
- 4,4,4,4,4,4,4,4, # 48 - 4f
95
- 4,4,4,4,4,4,4,4, # 50 - 57
96
- 4,4,4,4,4,4,4,4, # 58 - 5f
97
- 4,4,4,4,4,4,4,4, # 60 - 67
98
- 4,4,4,4,4,4,4,4, # 68 - 6f
99
- 4,4,4,4,4,4,4,4, # 70 - 77
100
- 4,4,4,4,4,4,4,4, # 78 - 7f
101
- 5,5,5,5,5,5,5,5, # 80 - 87
102
- 5,5,5,5,5,5,1,3, # 88 - 8f
103
- 5,5,5,5,5,5,5,5, # 90 - 97
104
- 5,5,5,5,5,5,5,5, # 98 - 9f
105
- 5,2,2,2,2,2,2,2, # a0 - a7
106
- 2,2,2,2,2,2,2,2, # a8 - af
107
- 2,2,2,2,2,2,2,2, # b0 - b7
108
- 2,2,2,2,2,2,2,2, # b8 - bf
109
- 2,2,2,2,2,2,2,2, # c0 - c7
110
- 2,2,2,2,2,2,2,2, # c8 - cf
111
- 2,2,2,2,2,2,2,2, # d0 - d7
112
- 2,2,2,2,2,2,2,2, # d8 - df
113
- 0,0,0,0,0,0,0,0, # e0 - e7
114
- 0,0,0,0,0,0,0,0, # e8 - ef
115
- 0,0,0,0,0,0,0,0, # f0 - f7
116
- 0,0,0,0,0,0,0,5 # f8 - ff
117
- ]
118
-
119
- EUCJP_st = [
120
- 3, 4, 3, 5,EStart,EError,EError,EError,#00-07
121
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
122
- EItsMe,EItsMe,EStart,EError,EStart,EError,EError,EError,#10-17
123
- EError,EError,EStart,EError,EError,EError, 3,EError,#18-1f
124
- 3,EError,EError,EError,EStart,EStart,EStart,EStart #20-27
125
- ]
126
-
127
- EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
128
-
129
- EUCJPSMModel = {'classTable' => EUCJP_cls,
130
- 'classFactor' => 6,
131
- 'stateTable' => EUCJP_st,
132
- 'charLenTable' => EUCJPCharLenTable,
133
- 'name' => 'EUC-JP'
134
- }
135
-
136
- # EUC-KR
137
-
138
- EUCKR_cls = [
139
- 1,1,1,1,1,1,1,1, # 00 - 07
140
- 1,1,1,1,1,1,0,0, # 08 - 0f
141
- 1,1,1,1,1,1,1,1, # 10 - 17
142
- 1,1,1,0,1,1,1,1, # 18 - 1f
143
- 1,1,1,1,1,1,1,1, # 20 - 27
144
- 1,1,1,1,1,1,1,1, # 28 - 2f
145
- 1,1,1,1,1,1,1,1, # 30 - 37
146
- 1,1,1,1,1,1,1,1, # 38 - 3f
147
- 1,1,1,1,1,1,1,1, # 40 - 47
148
- 1,1,1,1,1,1,1,1, # 48 - 4f
149
- 1,1,1,1,1,1,1,1, # 50 - 57
150
- 1,1,1,1,1,1,1,1, # 58 - 5f
151
- 1,1,1,1,1,1,1,1, # 60 - 67
152
- 1,1,1,1,1,1,1,1, # 68 - 6f
153
- 1,1,1,1,1,1,1,1, # 70 - 77
154
- 1,1,1,1,1,1,1,1, # 78 - 7f
155
- 0,0,0,0,0,0,0,0, # 80 - 87
156
- 0,0,0,0,0,0,0,0, # 88 - 8f
157
- 0,0,0,0,0,0,0,0, # 90 - 97
158
- 0,0,0,0,0,0,0,0, # 98 - 9f
159
- 0,2,2,2,2,2,2,2, # a0 - a7
160
- 2,2,2,2,2,3,3,3, # a8 - af
161
- 2,2,2,2,2,2,2,2, # b0 - b7
162
- 2,2,2,2,2,2,2,2, # b8 - bf
163
- 2,2,2,2,2,2,2,2, # c0 - c7
164
- 2,3,2,2,2,2,2,2, # c8 - cf
165
- 2,2,2,2,2,2,2,2, # d0 - d7
166
- 2,2,2,2,2,2,2,2, # d8 - df
167
- 2,2,2,2,2,2,2,2, # e0 - e7
168
- 2,2,2,2,2,2,2,2, # e8 - ef
169
- 2,2,2,2,2,2,2,2, # f0 - f7
170
- 2,2,2,2,2,2,2,0 # f8 - ff
171
- ]
172
-
173
- EUCKR_st = [
174
- EError,EStart, 3,EError,EError,EError,EError,EError,#00-07
175
- EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,EStart#08-0f
176
- ]
177
-
178
- EUCKRCharLenTable = [0, 1, 2, 0]
179
-
180
- EUCKRSMModel = {'classTable' => EUCKR_cls,
181
- 'classFactor' => 4,
182
- 'stateTable' => EUCKR_st,
183
- 'charLenTable' => EUCKRCharLenTable,
184
- 'name' => 'EUC-KR'
185
- }
186
-
187
- # EUC-TW
188
-
189
- EUCTW_cls = [
190
- 2,2,2,2,2,2,2,2, # 00 - 07
191
- 2,2,2,2,2,2,0,0, # 08 - 0f
192
- 2,2,2,2,2,2,2,2, # 10 - 17
193
- 2,2,2,0,2,2,2,2, # 18 - 1f
194
- 2,2,2,2,2,2,2,2, # 20 - 27
195
- 2,2,2,2,2,2,2,2, # 28 - 2f
196
- 2,2,2,2,2,2,2,2, # 30 - 37
197
- 2,2,2,2,2,2,2,2, # 38 - 3f
198
- 2,2,2,2,2,2,2,2, # 40 - 47
199
- 2,2,2,2,2,2,2,2, # 48 - 4f
200
- 2,2,2,2,2,2,2,2, # 50 - 57
201
- 2,2,2,2,2,2,2,2, # 58 - 5f
202
- 2,2,2,2,2,2,2,2, # 60 - 67
203
- 2,2,2,2,2,2,2,2, # 68 - 6f
204
- 2,2,2,2,2,2,2,2, # 70 - 77
205
- 2,2,2,2,2,2,2,2, # 78 - 7f
206
- 0,0,0,0,0,0,0,0, # 80 - 87
207
- 0,0,0,0,0,0,6,0, # 88 - 8f
208
- 0,0,0,0,0,0,0,0, # 90 - 97
209
- 0,0,0,0,0,0,0,0, # 98 - 9f
210
- 0,3,4,4,4,4,4,4, # a0 - a7
211
- 5,5,1,1,1,1,1,1, # a8 - af
212
- 1,1,1,1,1,1,1,1, # b0 - b7
213
- 1,1,1,1,1,1,1,1, # b8 - bf
214
- 1,1,3,1,3,3,3,3, # c0 - c7
215
- 3,3,3,3,3,3,3,3, # c8 - cf
216
- 3,3,3,3,3,3,3,3, # d0 - d7
217
- 3,3,3,3,3,3,3,3, # d8 - df
218
- 3,3,3,3,3,3,3,3, # e0 - e7
219
- 3,3,3,3,3,3,3,3, # e8 - ef
220
- 3,3,3,3,3,3,3,3, # f0 - f7
221
- 3,3,3,3,3,3,3,0 # f8 - ff
222
- ]
223
-
224
- EUCTW_st = [
225
- EError,EError,EStart, 3, 3, 3, 4,EError,#00-07
226
- EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
227
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EStart,EError,#10-17
228
- EStart,EStart,EStart,EError,EError,EError,EError,EError,#18-1f
229
- 5,EError,EError,EError,EStart,EError,EStart,EStart,#20-27
230
- EStart,EError,EStart,EStart,EStart,EStart,EStart,EStart #28-2f
231
- ]
232
-
233
- EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
234
-
235
- EUCTWSMModel = {'classTable' => EUCTW_cls,
236
- 'classFactor' => 7,
237
- 'stateTable' => EUCTW_st,
238
- 'charLenTable' => EUCTWCharLenTable,
239
- 'name' => 'x-euc-tw'
240
- }
241
-
242
- # GB2312
243
-
244
- GB2312_cls = [
245
- 1,1,1,1,1,1,1,1, # 00 - 07
246
- 1,1,1,1,1,1,0,0, # 08 - 0f
247
- 1,1,1,1,1,1,1,1, # 10 - 17
248
- 1,1,1,0,1,1,1,1, # 18 - 1f
249
- 1,1,1,1,1,1,1,1, # 20 - 27
250
- 1,1,1,1,1,1,1,1, # 28 - 2f
251
- 3,3,3,3,3,3,3,3, # 30 - 37
252
- 3,3,1,1,1,1,1,1, # 38 - 3f
253
- 2,2,2,2,2,2,2,2, # 40 - 47
254
- 2,2,2,2,2,2,2,2, # 48 - 4f
255
- 2,2,2,2,2,2,2,2, # 50 - 57
256
- 2,2,2,2,2,2,2,2, # 58 - 5f
257
- 2,2,2,2,2,2,2,2, # 60 - 67
258
- 2,2,2,2,2,2,2,2, # 68 - 6f
259
- 2,2,2,2,2,2,2,2, # 70 - 77
260
- 2,2,2,2,2,2,2,4, # 78 - 7f
261
- 5,6,6,6,6,6,6,6, # 80 - 87
262
- 6,6,6,6,6,6,6,6, # 88 - 8f
263
- 6,6,6,6,6,6,6,6, # 90 - 97
264
- 6,6,6,6,6,6,6,6, # 98 - 9f
265
- 6,6,6,6,6,6,6,6, # a0 - a7
266
- 6,6,6,6,6,6,6,6, # a8 - af
267
- 6,6,6,6,6,6,6,6, # b0 - b7
268
- 6,6,6,6,6,6,6,6, # b8 - bf
269
- 6,6,6,6,6,6,6,6, # c0 - c7
270
- 6,6,6,6,6,6,6,6, # c8 - cf
271
- 6,6,6,6,6,6,6,6, # d0 - d7
272
- 6,6,6,6,6,6,6,6, # d8 - df
273
- 6,6,6,6,6,6,6,6, # e0 - e7
274
- 6,6,6,6,6,6,6,6, # e8 - ef
275
- 6,6,6,6,6,6,6,6, # f0 - f7
276
- 6,6,6,6,6,6,6,0 # f8 - ff
277
- ]
278
-
279
- GB2312_st = [
280
- EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
281
- EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
282
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,#10-17
283
- 4,EError,EStart,EStart,EError,EError,EError,EError,#18-1f
284
- EError,EError, 5,EError,EError,EError,EItsMe,EError,#20-27
285
- EError,EError,EStart,EStart,EStart,EStart,EStart,EStart#28-2f
286
- ]
287
-
288
- # To be accurate, the length of class 6 can be either 2 or 4.
289
- # But it is not necessary to discriminate between the two since
290
- # it is used for frequency analysis only, and we are validating
291
- # each code range there as well. So it is safe to set it to be
292
- # 2 here.
293
- GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
294
-
295
- GB2312SMModel = {'classTable' => GB2312_cls,
296
- 'classFactor' => 7,
297
- 'stateTable' => GB2312_st,
298
- 'charLenTable' => GB2312CharLenTable,
299
- 'name' => 'GB2312'
300
- }
301
-
302
- # Shift_JIS
303
-
304
- SJIS_cls = [
305
- 1,1,1,1,1,1,1,1, # 00 - 07
306
- 1,1,1,1,1,1,0,0, # 08 - 0f
307
- 1,1,1,1,1,1,1,1, # 10 - 17
308
- 1,1,1,0,1,1,1,1, # 18 - 1f
309
- 1,1,1,1,1,1,1,1, # 20 - 27
310
- 1,1,1,1,1,1,1,1, # 28 - 2f
311
- 1,1,1,1,1,1,1,1, # 30 - 37
312
- 1,1,1,1,1,1,1,1, # 38 - 3f
313
- 2,2,2,2,2,2,2,2, # 40 - 47
314
- 2,2,2,2,2,2,2,2, # 48 - 4f
315
- 2,2,2,2,2,2,2,2, # 50 - 57
316
- 2,2,2,2,2,2,2,2, # 58 - 5f
317
- 2,2,2,2,2,2,2,2, # 60 - 67
318
- 2,2,2,2,2,2,2,2, # 68 - 6f
319
- 2,2,2,2,2,2,2,2, # 70 - 77
320
- 2,2,2,2,2,2,2,1, # 78 - 7f
321
- 3,3,3,3,3,3,3,3, # 80 - 87
322
- 3,3,3,3,3,3,3,3, # 88 - 8f
323
- 3,3,3,3,3,3,3,3, # 90 - 97
324
- 3,3,3,3,3,3,3,3, # 98 - 9f
325
- #0xa0 is illegal in sjis encoding, but some pages do
326
- #contain such a byte. We need to be more forgiving of errors.
327
- 2,2,2,2,2,2,2,2, # a0 - a7
328
- 2,2,2,2,2,2,2,2, # a8 - af
329
- 2,2,2,2,2,2,2,2, # b0 - b7
330
- 2,2,2,2,2,2,2,2, # b8 - bf
331
- 2,2,2,2,2,2,2,2, # c0 - c7
332
- 2,2,2,2,2,2,2,2, # c8 - cf
333
- 2,2,2,2,2,2,2,2, # d0 - d7
334
- 2,2,2,2,2,2,2,2, # d8 - df
335
- 3,3,3,3,3,3,3,3, # e0 - e7
336
- 3,3,3,3,3,4,4,4, # e8 - ef
337
- 4,4,4,4,4,4,4,4, # f0 - f7
338
- 4,4,4,4,4,0,0,0 # f8 - ff
339
- ]
340
-
341
- SJIS_st = [
342
- EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
343
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
344
- EItsMe,EItsMe,EError,EError,EStart,EStart,EStart,EStart#10-17
345
- ]
346
-
347
- SJISCharLenTable = [0, 1, 1, 2, 0, 0]
348
-
349
- SJISSMModel = {'classTable' => SJIS_cls,
350
- 'classFactor' => 6,
351
- 'stateTable' => SJIS_st,
352
- 'charLenTable' => SJISCharLenTable,
353
- 'name' => 'Shift_JIS'
354
- }
355
-
356
- # UCS2-BE
357
-
358
- UCS2BE_cls = [
359
- 0,0,0,0,0,0,0,0, # 00 - 07
360
- 0,0,1,0,0,2,0,0, # 08 - 0f
361
- 0,0,0,0,0,0,0,0, # 10 - 17
362
- 0,0,0,3,0,0,0,0, # 18 - 1f
363
- 0,0,0,0,0,0,0,0, # 20 - 27
364
- 0,3,3,3,3,3,0,0, # 28 - 2f
365
- 0,0,0,0,0,0,0,0, # 30 - 37
366
- 0,0,0,0,0,0,0,0, # 38 - 3f
367
- 0,0,0,0,0,0,0,0, # 40 - 47
368
- 0,0,0,0,0,0,0,0, # 48 - 4f
369
- 0,0,0,0,0,0,0,0, # 50 - 57
370
- 0,0,0,0,0,0,0,0, # 58 - 5f
371
- 0,0,0,0,0,0,0,0, # 60 - 67
372
- 0,0,0,0,0,0,0,0, # 68 - 6f
373
- 0,0,0,0,0,0,0,0, # 70 - 77
374
- 0,0,0,0,0,0,0,0, # 78 - 7f
375
- 0,0,0,0,0,0,0,0, # 80 - 87
376
- 0,0,0,0,0,0,0,0, # 88 - 8f
377
- 0,0,0,0,0,0,0,0, # 90 - 97
378
- 0,0,0,0,0,0,0,0, # 98 - 9f
379
- 0,0,0,0,0,0,0,0, # a0 - a7
380
- 0,0,0,0,0,0,0,0, # a8 - af
381
- 0,0,0,0,0,0,0,0, # b0 - b7
382
- 0,0,0,0,0,0,0,0, # b8 - bf
383
- 0,0,0,0,0,0,0,0, # c0 - c7
384
- 0,0,0,0,0,0,0,0, # c8 - cf
385
- 0,0,0,0,0,0,0,0, # d0 - d7
386
- 0,0,0,0,0,0,0,0, # d8 - df
387
- 0,0,0,0,0,0,0,0, # e0 - e7
388
- 0,0,0,0,0,0,0,0, # e8 - ef
389
- 0,0,0,0,0,0,0,0, # f0 - f7
390
- 0,0,0,0,0,0,4,5 # f8 - ff
391
- ]
392
-
393
- UCS2BE_st = [
394
- 5, 7, 7,EError, 4, 3,EError,EError,#00-07
395
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
396
- EItsMe,EItsMe, 6, 6, 6, 6,EError,EError,#10-17
397
- 6, 6, 6, 6, 6,EItsMe, 6, 6,#18-1f
398
- 6, 6, 6, 6, 5, 7, 7,EError,#20-27
399
- 5, 8, 6, 6,EError, 6, 6, 6,#28-2f
400
- 6, 6, 6, 6,EError,EError,EStart,EStart#30-37
401
- ]
402
-
403
- UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
404
-
405
- UCS2BESMModel = {'classTable' => UCS2BE_cls,
406
- 'classFactor' => 6,
407
- 'stateTable' => UCS2BE_st,
408
- 'charLenTable' => UCS2BECharLenTable,
409
- 'name' => 'UTF-16BE'
410
- }
411
-
412
- # UCS2-LE
413
-
414
- UCS2LE_cls = [
415
- 0,0,0,0,0,0,0,0, # 00 - 07
416
- 0,0,1,0,0,2,0,0, # 08 - 0f
417
- 0,0,0,0,0,0,0,0, # 10 - 17
418
- 0,0,0,3,0,0,0,0, # 18 - 1f
419
- 0,0,0,0,0,0,0,0, # 20 - 27
420
- 0,3,3,3,3,3,0,0, # 28 - 2f
421
- 0,0,0,0,0,0,0,0, # 30 - 37
422
- 0,0,0,0,0,0,0,0, # 38 - 3f
423
- 0,0,0,0,0,0,0,0, # 40 - 47
424
- 0,0,0,0,0,0,0,0, # 48 - 4f
425
- 0,0,0,0,0,0,0,0, # 50 - 57
426
- 0,0,0,0,0,0,0,0, # 58 - 5f
427
- 0,0,0,0,0,0,0,0, # 60 - 67
428
- 0,0,0,0,0,0,0,0, # 68 - 6f
429
- 0,0,0,0,0,0,0,0, # 70 - 77
430
- 0,0,0,0,0,0,0,0, # 78 - 7f
431
- 0,0,0,0,0,0,0,0, # 80 - 87
432
- 0,0,0,0,0,0,0,0, # 88 - 8f
433
- 0,0,0,0,0,0,0,0, # 90 - 97
434
- 0,0,0,0,0,0,0,0, # 98 - 9f
435
- 0,0,0,0,0,0,0,0, # a0 - a7
436
- 0,0,0,0,0,0,0,0, # a8 - af
437
- 0,0,0,0,0,0,0,0, # b0 - b7
438
- 0,0,0,0,0,0,0,0, # b8 - bf
439
- 0,0,0,0,0,0,0,0, # c0 - c7
440
- 0,0,0,0,0,0,0,0, # c8 - cf
441
- 0,0,0,0,0,0,0,0, # d0 - d7
442
- 0,0,0,0,0,0,0,0, # d8 - df
443
- 0,0,0,0,0,0,0,0, # e0 - e7
444
- 0,0,0,0,0,0,0,0, # e8 - ef
445
- 0,0,0,0,0,0,0,0, # f0 - f7
446
- 0,0,0,0,0,0,4,5 # f8 - ff
447
- ]
448
-
449
- UCS2LE_st = [
450
- 6, 6, 7, 6, 4, 3,EError,EError,#00-07
451
- EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
452
- EItsMe,EItsMe, 5, 5, 5,EError,EItsMe,EError,#10-17
453
- 5, 5, 5,EError, 5,EError, 6, 6,#18-1f
454
- 7, 6, 8, 8, 5, 5, 5,EError,#20-27
455
- 5, 5, 5,EError,EError,EError, 5, 5,#28-2f
456
- 5, 5, 5,EError, 5,EError,EStart,EStart#30-37
457
- ]
458
-
459
- UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]
460
-
461
- UCS2LESMModel = {'classTable' => UCS2LE_cls,
462
- 'classFactor' => 6,
463
- 'stateTable' => UCS2LE_st,
464
- 'charLenTable' => UCS2LECharLenTable,
465
- 'name' => 'UTF-16LE'
466
- }
467
-
468
- # UTF-8
469
-
470
- UTF8_cls = [
471
- 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
472
- 1,1,1,1,1,1,0,0, # 08 - 0f
473
- 1,1,1,1,1,1,1,1, # 10 - 17
474
- 1,1,1,0,1,1,1,1, # 18 - 1f
475
- 1,1,1,1,1,1,1,1, # 20 - 27
476
- 1,1,1,1,1,1,1,1, # 28 - 2f
477
- 1,1,1,1,1,1,1,1, # 30 - 37
478
- 1,1,1,1,1,1,1,1, # 38 - 3f
479
- 1,1,1,1,1,1,1,1, # 40 - 47
480
- 1,1,1,1,1,1,1,1, # 48 - 4f
481
- 1,1,1,1,1,1,1,1, # 50 - 57
482
- 1,1,1,1,1,1,1,1, # 58 - 5f
483
- 1,1,1,1,1,1,1,1, # 60 - 67
484
- 1,1,1,1,1,1,1,1, # 68 - 6f
485
- 1,1,1,1,1,1,1,1, # 70 - 77
486
- 1,1,1,1,1,1,1,1, # 78 - 7f
487
- 2,2,2,2,3,3,3,3, # 80 - 87
488
- 4,4,4,4,4,4,4,4, # 88 - 8f
489
- 4,4,4,4,4,4,4,4, # 90 - 97
490
- 4,4,4,4,4,4,4,4, # 98 - 9f
491
- 5,5,5,5,5,5,5,5, # a0 - a7
492
- 5,5,5,5,5,5,5,5, # a8 - af
493
- 5,5,5,5,5,5,5,5, # b0 - b7
494
- 5,5,5,5,5,5,5,5, # b8 - bf
495
- 0,0,6,6,6,6,6,6, # c0 - c7
496
- 6,6,6,6,6,6,6,6, # c8 - cf
497
- 6,6,6,6,6,6,6,6, # d0 - d7
498
- 6,6,6,6,6,6,6,6, # d8 - df
499
- 7,8,8,8,8,8,8,8, # e0 - e7
500
- 8,8,8,8,8,9,8,8, # e8 - ef
501
- 10,11,11,11,11,11,11,11, # f0 - f7
502
- 12,13,13,13,14,15,0,0 # f8 - ff
503
- ]
504
-
505
- UTF8_st = [
506
- EError,EStart,EError,EError,EError,EError, 12, 10,#00-07
507
- 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
508
- EError,EError,EError,EError,EError,EError,EError,EError,#10-17
509
- EError,EError,EError,EError,EError,EError,EError,EError,#18-1f
510
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#20-27
511
- EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#28-2f
512
- EError,EError, 5, 5, 5, 5,EError,EError,#30-37
513
- EError,EError,EError,EError,EError,EError,EError,EError,#38-3f
514
- EError,EError,EError, 5, 5, 5,EError,EError,#40-47
515
- EError,EError,EError,EError,EError,EError,EError,EError,#48-4f
516
- EError,EError, 7, 7, 7, 7,EError,EError,#50-57
517
- EError,EError,EError,EError,EError,EError,EError,EError,#58-5f
518
- EError,EError,EError,EError, 7, 7,EError,EError,#60-67
519
- EError,EError,EError,EError,EError,EError,EError,EError,#68-6f
520
- EError,EError, 9, 9, 9, 9,EError,EError,#70-77
521
- EError,EError,EError,EError,EError,EError,EError,EError,#78-7f
522
- EError,EError,EError,EError,EError, 9,EError,EError,#80-87
523
- EError,EError,EError,EError,EError,EError,EError,EError,#88-8f
524
- EError,EError, 12, 12, 12, 12,EError,EError,#90-97
525
- EError,EError,EError,EError,EError,EError,EError,EError,#98-9f
526
- EError,EError,EError,EError,EError, 12,EError,EError,#a0-a7
527
- EError,EError,EError,EError,EError,EError,EError,EError,#a8-af
528
- EError,EError, 12, 12, 12,EError,EError,EError,#b0-b7
529
- EError,EError,EError,EError,EError,EError,EError,EError,#b8-bf
530
- EError,EError,EStart,EStart,EStart,EStart,EError,EError,#c0-c7
531
- EError,EError,EError,EError,EError,EError,EError,EError#c8-cf
532
- ]
533
-
534
- UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]
535
-
536
- UTF8SMModel = {'classTable' => UTF8_cls,
537
- 'classFactor' => 16,
538
- 'stateTable' => UTF8_st,
539
- 'charLenTable' => UTF8CharLenTable,
540
- 'name' => 'UTF-8'
541
- }
542
- end
543
- end