gigo 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/README.md +2 -8
  2. data/gemfiles/activesupport30.gemfile.lock +1 -1
  3. data/gemfiles/activesupport31.gemfile.lock +1 -1
  4. data/gemfiles/activesupport32.gemfile.lock +1 -1
  5. data/gemfiles/activesupport40.gemfile.lock +2 -2
  6. data/lib/gigo.rb +0 -1
  7. data/lib/gigo/version.rb +1 -1
  8. data/test/cases/gigo_test.rb +0 -1
  9. metadata +3 -39
  10. data/lib/gigo/rchardet.rb +0 -67
  11. data/lib/gigo/rchardet/big5freq.rb +0 -927
  12. data/lib/gigo/rchardet/big5prober.rb +0 -43
  13. data/lib/gigo/rchardet/chardistribution.rb +0 -238
  14. data/lib/gigo/rchardet/charsetgroupprober.rb +0 -113
  15. data/lib/gigo/rchardet/charsetprober.rb +0 -76
  16. data/lib/gigo/rchardet/codingstatemachine.rb +0 -66
  17. data/lib/gigo/rchardet/constants.rb +0 -43
  18. data/lib/gigo/rchardet/escprober.rb +0 -90
  19. data/lib/gigo/rchardet/escsm.rb +0 -245
  20. data/lib/gigo/rchardet/eucjpprober.rb +0 -89
  21. data/lib/gigo/rchardet/euckrfreq.rb +0 -598
  22. data/lib/gigo/rchardet/euckrprober.rb +0 -43
  23. data/lib/gigo/rchardet/euctwfreq.rb +0 -431
  24. data/lib/gigo/rchardet/euctwprober.rb +0 -43
  25. data/lib/gigo/rchardet/gb2312freq.rb +0 -475
  26. data/lib/gigo/rchardet/gb2312prober.rb +0 -43
  27. data/lib/gigo/rchardet/hebrewprober.rb +0 -291
  28. data/lib/gigo/rchardet/jisfreq.rb +0 -571
  29. data/lib/gigo/rchardet/jpcntx.rb +0 -230
  30. data/lib/gigo/rchardet/langbulgarianmodel.rb +0 -230
  31. data/lib/gigo/rchardet/langcyrillicmodel.rb +0 -331
  32. data/lib/gigo/rchardet/langgreekmodel.rb +0 -228
  33. data/lib/gigo/rchardet/langhebrewmodel.rb +0 -203
  34. data/lib/gigo/rchardet/langhungarianmodel.rb +0 -227
  35. data/lib/gigo/rchardet/langthaimodel.rb +0 -202
  36. data/lib/gigo/rchardet/latin1prober.rb +0 -148
  37. data/lib/gigo/rchardet/mbcharsetprober.rb +0 -91
  38. data/lib/gigo/rchardet/mbcsgroupprober.rb +0 -48
  39. data/lib/gigo/rchardet/mbcssm.rb +0 -543
  40. data/lib/gigo/rchardet/sbcharsetprober.rb +0 -125
  41. data/lib/gigo/rchardet/sbcsgroupprober.rb +0 -59
  42. data/lib/gigo/rchardet/sjisprober.rb +0 -89
  43. data/lib/gigo/rchardet/universaldetector.rb +0 -169
  44. data/lib/gigo/rchardet/utf8prober.rb +0 -87
  45. data/lib/gigo/transcoders/rchardet.rb +0 -22
@@ -1,66 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is mozilla.org code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 1998
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- #
13
- # This library is free software; you can redistribute it and/or
14
- # modify it under the terms of the GNU Lesser General Public
15
- # License as published by the Free Software Foundation; either
16
- # version 2.1 of the License, or (at your option) any later version.
17
- #
18
- # This library is distributed in the hope that it will be useful,
19
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
- # Lesser General Public License for more details.
22
- #
23
- # You should have received a copy of the GNU Lesser General Public
24
- # License along with this library; if not, write to the Free Software
25
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
- # 02110-1301 USA
27
- ######################### END LICENSE BLOCK #########################
28
module GIGO
  module CharDet
    # Table-driven, byte-at-a-time state machine shared by the charset probers.
    #
    # The model handed to #initialize is a Hash read through the string keys
    # 'classTable', 'classFactor', 'stateTable', 'charLenTable' and 'name'.
    class CodingStateMachine
      # Per-machine on/off flag. EscCharSetProber assigns and reads it
      # (`codingSM.active = true`, `next unless codingSM.active`); without this
      # accessor those calls raise NoMethodError.
      attr_accessor :active

      # sm - the model Hash describing this machine (see class comment).
      def initialize(sm)
        @_mModel = sm
        @_mCurrentBytePos = 0
        @_mCurrentCharLen = 0
        # A freshly built machine is usable until a prober switches it off.
        @active = true
        reset
      end

      # Puts the machine back into the EStart state.
      def reset
        @_mCurrentState = EStart
      end

      # Advances the machine by one byte and returns the new state.
      #
      # c - normally a one-byte String; only its first byte is looked at.
      #     Objects that do not respond to #bytes fall back to `c[0]`.
      def next_state(c)
        byte = c.respond_to?(:bytes) ? c.bytes.first : c[0]
        byte_class = @_mModel['classTable'][byte]

        # At the start of a character the byte class also determines the
        # length recorded for the character being read.
        if @_mCurrentState == EStart
          @_mCurrentBytePos = 0
          @_mCurrentCharLen = @_mModel['charLenTable'][byte_class]
        end

        # The state table is a flattened matrix: row = current state,
        # column = byte class, row width = classFactor.
        index = @_mCurrentState * @_mModel['classFactor'] + byte_class
        @_mCurrentState = @_mModel['stateTable'][index]
        @_mCurrentBytePos += 1
        @_mCurrentState
      end

      # Returns the character length recorded when the current character began.
      def get_current_charlen
        @_mCurrentCharLen
      end

      # Returns the model's 'name' entry.
      def get_coding_state_machine
        @_mModel['name']
      end
    end
  end
end
@@ -1,43 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is Mozilla Universal charset detector code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 2001
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- # Shy Shalom - original C code
13
- #
14
- # This library is free software; you can redistribute it and/or
15
- # modify it under the terms of the GNU Lesser General Public
16
- # License as published by the Free Software Foundation; either
17
- # version 2.1 of the License, or (at your option) any later version.
18
- #
19
- # This library is distributed in the hope that it will be useful,
20
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
- # Lesser General Public License for more details.
23
- #
24
- # You should have received a copy of the GNU Lesser General Public
25
- # License along with this library; if not, write to the Free Software
26
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
- # 02110-1301 USA
28
- ######################### END LICENSE BLOCK #########################
29
- module GIGO
30
- module CharDet
31
- $debug = false # global switch: EUCJPProber#feed writes a diagnostic line to $stderr when this is truthy
32
-
33
- EDetecting = 0 # prober state: EUCJPProber#feed compares get_state against it before the confidence shortcut
34
- EFoundIt = 1 # prober state: stored in @_mState once a prober has identified its charset
35
- ENotMe = 2 # prober state: stored in @_mState when a prober rules its charset out
36
-
37
- EStart = 0 # state-machine state: CodingStateMachine#reset puts the machine here
38
- EError = 1 # state-machine state: on this result probers deactivate the machine or switch to ENotMe
39
- EItsMe = 2 # state-machine state: on this result probers switch to EFoundIt
40
-
41
- SHORTCUT_THRESHOLD = 0.95 # EUCJPProber#feed jumps to EFoundIt once confidence exceeds this (and enough data was seen)
42
- end
43
- end
@@ -1,90 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is mozilla.org code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 1998
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- #
13
- # This library is free software; you can redistribute it and/or
14
- # modify it under the terms of the GNU Lesser General Public
15
- # License as published by the Free Software Foundation; either
16
- # version 2.1 of the License, or (at your option) any later version.
17
- #
18
- # This library is distributed in the hope that it will be useful,
19
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
- # Lesser General Public License for more details.
22
- #
23
- # You should have received a copy of the GNU Lesser General Public
24
- # License along with this library; if not, write to the Free Software
25
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
- # 02110-1301 USA
27
- ######################### END LICENSE BLOCK #########################
28
module GIGO
  module CharDet
    # Prober that runs one CodingStateMachine per escape-sequence model
    # (HZ, ISO-2022-CN, ISO-2022-JP, ISO-2022-KR) and reports the name of the
    # first machine that reaches EItsMe.
    class EscCharSetProber < CharSetProber
      def initialize
        super()
        models = [HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel]
        @_mCodingSM = models.map { |model| CodingStateMachine.new(model) }
        reset
      end

      # Re-arms every state machine and forgets any detected charset.
      def reset
        super
        @_mCodingSM.each do |machine|
          next unless machine
          machine.active = true
          machine.reset
        end
        @_mActiveSM = @_mCodingSM.length
        @_mDetectedCharset = nil
      end

      # Name taken from the matching model, or nil while nothing matched.
      def get_charset_name
        @_mDetectedCharset
      end

      # 0.99 once a charset has been detected, 0.00 otherwise.
      def get_confidence
        @_mDetectedCharset ? 0.99 : 0.00
      end

      # Feeds each element of aBuf.split('') to every machine that is still
      # active and returns get_state.
      def feed(aBuf)
        aBuf.split('').each do |char|
          @_mCodingSM.each do |machine|
            next unless machine && machine.active

            case machine.next_state(char)
            when EError
              # This machine can no longer match; stop once none are left.
              machine.active = false
              @_mActiveSM -= 1
              if @_mActiveSM <= 0
                @_mState = ENotMe
                return get_state
              end
            when EItsMe
              @_mState = EFoundIt
              @_mDetectedCharset = machine.get_coding_state_machine
              return get_state
            end
          end
        end

        get_state
      end
    end
  end
end
@@ -1,245 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is mozilla.org code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 1998
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Mark Pilgrim - port to Python
11
- #
12
- # This library is free software; you can redistribute it and/or
13
- # modify it under the terms of the GNU Lesser General Public
14
- # License as published by the Free Software Foundation; either
15
- # version 2.1 of the License, or (at your option) any later version.
16
- #
17
- # This library is distributed in the hope that it will be useful,
18
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
- # Lesser General Public License for more details.
21
- #
22
- # You should have received a copy of the GNU Lesser General Public
23
- # License along with this library; if not, write to the Free Software
24
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25
- # 02110-1301 USA
26
- ######################### END LICENSE BLOCK #########################
27
module GIGO
  module CharDet
    # State-machine models for the escape-sequence encodings. Each model is a
    # Hash with these entries:
    #   'classTable'   - 256 entries mapping a byte value to a byte class
    #   'classFactor'  - row width of the flattened state table
    #   'stateTable'   - next state, indexed by state * classFactor + class
    #   'charLenTable' - value recorded per byte class at the start of a character
    #   'name'         - charset name reported when the machine matches
    HZ_cls = [
      1,0,0,0,0,0,0,0, # 00 - 07
      0,0,0,0,0,0,0,0, # 08 - 0f
      0,0,0,0,0,0,0,0, # 10 - 17
      0,0,0,1,0,0,0,0, # 18 - 1f
      0,0,0,0,0,0,0,0, # 20 - 27
      0,0,0,0,0,0,0,0, # 28 - 2f
      0,0,0,0,0,0,0,0, # 30 - 37
      0,0,0,0,0,0,0,0, # 38 - 3f
      0,0,0,0,0,0,0,0, # 40 - 47
      0,0,0,0,0,0,0,0, # 48 - 4f
      0,0,0,0,0,0,0,0, # 50 - 57
      0,0,0,0,0,0,0,0, # 58 - 5f
      0,0,0,0,0,0,0,0, # 60 - 67
      0,0,0,0,0,0,0,0, # 68 - 6f
      0,0,0,0,0,0,0,0, # 70 - 77
      0,0,0,4,0,5,2,0, # 78 - 7f
      1,1,1,1,1,1,1,1, # 80 - 87
      1,1,1,1,1,1,1,1, # 88 - 8f
      1,1,1,1,1,1,1,1, # 90 - 97
      1,1,1,1,1,1,1,1, # 98 - 9f
      1,1,1,1,1,1,1,1, # a0 - a7
      1,1,1,1,1,1,1,1, # a8 - af
      1,1,1,1,1,1,1,1, # b0 - b7
      1,1,1,1,1,1,1,1, # b8 - bf
      1,1,1,1,1,1,1,1, # c0 - c7
      1,1,1,1,1,1,1,1, # c8 - cf
      1,1,1,1,1,1,1,1, # d0 - d7
      1,1,1,1,1,1,1,1, # d8 - df
      1,1,1,1,1,1,1,1, # e0 - e7
      1,1,1,1,1,1,1,1, # e8 - ef
      1,1,1,1,1,1,1,1, # f0 - f7
      1,1,1,1,1,1,1,1, # f8 - ff
    ]

    HZ_st = [
      EStart,EError,     3,EStart,EStart,EStart,EError,EError,# 00-07
      EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
      EItsMe,EItsMe,EError,EError,EStart,EStart,     4,EError,# 10-17
           5,EError,     6,EError,     5,     5,     4,EError,# 18-1f
           4,EError,     4,     4,     4,EError,     4,EError,# 20-27
           4,EItsMe,EStart,EStart,EStart,EStart,EStart,EStart,# 28-2f
    ]

    HZCharLenTable = [0, 0, 0, 0, 0, 0]

    HZSMModel = {'classTable' => HZ_cls,
                 'classFactor' => 6,
                 'stateTable' => HZ_st,
                 'charLenTable' => HZCharLenTable,
                 'name' => "HZ-GB-2312"
                }

    ISO2022CN_cls = [
      2,0,0,0,0,0,0,0, # 00 - 07
      0,0,0,0,0,0,0,0, # 08 - 0f
      0,0,0,0,0,0,0,0, # 10 - 17
      0,0,0,1,0,0,0,0, # 18 - 1f
      0,0,0,0,0,0,0,0, # 20 - 27
      0,3,0,0,0,0,0,0, # 28 - 2f
      0,0,0,0,0,0,0,0, # 30 - 37
      0,0,0,0,0,0,0,0, # 38 - 3f
      0,0,0,4,0,0,0,0, # 40 - 47
      0,0,0,0,0,0,0,0, # 48 - 4f
      0,0,0,0,0,0,0,0, # 50 - 57
      0,0,0,0,0,0,0,0, # 58 - 5f
      0,0,0,0,0,0,0,0, # 60 - 67
      0,0,0,0,0,0,0,0, # 68 - 6f
      0,0,0,0,0,0,0,0, # 70 - 77
      0,0,0,0,0,0,0,0, # 78 - 7f
      2,2,2,2,2,2,2,2, # 80 - 87
      2,2,2,2,2,2,2,2, # 88 - 8f
      2,2,2,2,2,2,2,2, # 90 - 97
      2,2,2,2,2,2,2,2, # 98 - 9f
      2,2,2,2,2,2,2,2, # a0 - a7
      2,2,2,2,2,2,2,2, # a8 - af
      2,2,2,2,2,2,2,2, # b0 - b7
      2,2,2,2,2,2,2,2, # b8 - bf
      2,2,2,2,2,2,2,2, # c0 - c7
      2,2,2,2,2,2,2,2, # c8 - cf
      2,2,2,2,2,2,2,2, # d0 - d7
      2,2,2,2,2,2,2,2, # d8 - df
      2,2,2,2,2,2,2,2, # e0 - e7
      2,2,2,2,2,2,2,2, # e8 - ef
      2,2,2,2,2,2,2,2, # f0 - f7
      2,2,2,2,2,2,2,2, # f8 - ff
    ]

    ISO2022CN_st = [
      EStart,     3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
      EStart,EError,EError,EError,EError,EError,EError,EError,# 08-0f
      EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,# 10-17
      EItsMe,EItsMe,EItsMe,EError,EError,EError,     4,EError,# 18-1f
      EError,EError,EError,EItsMe,EError,EError,EError,EError,# 20-27
           5,     6,EError,EError,EError,EError,EError,EError,# 28-2f
      EError,EError,EError,EItsMe,EError,EError,EError,EError,# 30-37
      EError,EError,EError,EError,EError,EItsMe,EError,EStart,# 38-3f
    ]

    ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0]

    ISO2022CNSMModel = {'classTable' => ISO2022CN_cls,
                        'classFactor' => 9,
                        'stateTable' => ISO2022CN_st,
                        'charLenTable' => ISO2022CNCharLenTable,
                        'name' => "ISO-2022-CN"
                       }

    ISO2022JP_cls = [
      2,0,0,0,0,0,0,0, # 00 - 07
      0,0,0,0,0,0,2,2, # 08 - 0f
      0,0,0,0,0,0,0,0, # 10 - 17
      0,0,0,1,0,0,0,0, # 18 - 1f
      0,0,0,0,7,0,0,0, # 20 - 27
      3,0,0,0,0,0,0,0, # 28 - 2f
      0,0,0,0,0,0,0,0, # 30 - 37
      0,0,0,0,0,0,0,0, # 38 - 3f
      6,0,4,0,8,0,0,0, # 40 - 47
      0,9,5,0,0,0,0,0, # 48 - 4f
      0,0,0,0,0,0,0,0, # 50 - 57
      0,0,0,0,0,0,0,0, # 58 - 5f
      0,0,0,0,0,0,0,0, # 60 - 67
      0,0,0,0,0,0,0,0, # 68 - 6f
      0,0,0,0,0,0,0,0, # 70 - 77
      0,0,0,0,0,0,0,0, # 78 - 7f
      2,2,2,2,2,2,2,2, # 80 - 87
      2,2,2,2,2,2,2,2, # 88 - 8f
      2,2,2,2,2,2,2,2, # 90 - 97
      2,2,2,2,2,2,2,2, # 98 - 9f
      2,2,2,2,2,2,2,2, # a0 - a7
      2,2,2,2,2,2,2,2, # a8 - af
      2,2,2,2,2,2,2,2, # b0 - b7
      2,2,2,2,2,2,2,2, # b8 - bf
      2,2,2,2,2,2,2,2, # c0 - c7
      2,2,2,2,2,2,2,2, # c8 - cf
      2,2,2,2,2,2,2,2, # d0 - d7
      2,2,2,2,2,2,2,2, # d8 - df
      2,2,2,2,2,2,2,2, # e0 - e7
      2,2,2,2,2,2,2,2, # e8 - ef
      2,2,2,2,2,2,2,2, # f0 - f7
      2,2,2,2,2,2,2,2, # f8 - ff
    ]

    ISO2022JP_st = [
      EStart,     3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
      EStart,EStart,EError,EError,EError,EError,EError,EError,# 08-0f
      EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 10-17
      EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,# 18-1f
      EError,     5,EError,EError,EError,     4,EError,EError,# 20-27
      EError,EError,EError,     6,EItsMe,EError,EItsMe,EError,# 28-2f
      EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,# 30-37
      EError,EError,EError,EItsMe,EError,EError,EError,EError,# 38-3f
      EError,EError,EError,EError,EItsMe,EError,EStart,EStart,# 40-47
    ]

    # One entry per byte class. ISO2022JP_cls uses classes 0..9 (classFactor
    # is 10), so the table needs ten entries; with only eight, looking up
    # class 8 or 9 returned nil instead of 0.
    ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    ISO2022JPSMModel = {'classTable' => ISO2022JP_cls,
                        'classFactor' => 10,
                        'stateTable' => ISO2022JP_st,
                        'charLenTable' => ISO2022JPCharLenTable,
                        'name' => "ISO-2022-JP"
                       }

    ISO2022KR_cls = [
      2,0,0,0,0,0,0,0, # 00 - 07
      0,0,0,0,0,0,0,0, # 08 - 0f
      0,0,0,0,0,0,0,0, # 10 - 17
      0,0,0,1,0,0,0,0, # 18 - 1f
      0,0,0,0,3,0,0,0, # 20 - 27
      0,4,0,0,0,0,0,0, # 28 - 2f
      0,0,0,0,0,0,0,0, # 30 - 37
      0,0,0,0,0,0,0,0, # 38 - 3f
      0,0,0,5,0,0,0,0, # 40 - 47
      0,0,0,0,0,0,0,0, # 48 - 4f
      0,0,0,0,0,0,0,0, # 50 - 57
      0,0,0,0,0,0,0,0, # 58 - 5f
      0,0,0,0,0,0,0,0, # 60 - 67
      0,0,0,0,0,0,0,0, # 68 - 6f
      0,0,0,0,0,0,0,0, # 70 - 77
      0,0,0,0,0,0,0,0, # 78 - 7f
      2,2,2,2,2,2,2,2, # 80 - 87
      2,2,2,2,2,2,2,2, # 88 - 8f
      2,2,2,2,2,2,2,2, # 90 - 97
      2,2,2,2,2,2,2,2, # 98 - 9f
      2,2,2,2,2,2,2,2, # a0 - a7
      2,2,2,2,2,2,2,2, # a8 - af
      2,2,2,2,2,2,2,2, # b0 - b7
      2,2,2,2,2,2,2,2, # b8 - bf
      2,2,2,2,2,2,2,2, # c0 - c7
      2,2,2,2,2,2,2,2, # c8 - cf
      2,2,2,2,2,2,2,2, # d0 - d7
      2,2,2,2,2,2,2,2, # d8 - df
      2,2,2,2,2,2,2,2, # e0 - e7
      2,2,2,2,2,2,2,2, # e8 - ef
      2,2,2,2,2,2,2,2, # f0 - f7
      2,2,2,2,2,2,2,2, # f8 - ff
    ]

    ISO2022KR_st = [
      EStart,     3,EError,EStart,EStart,EStart,EError,EError,# 00-07
      EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
      EItsMe,EItsMe,EError,EError,EError,     4,EError,EError,# 10-17
      EError,EError,EError,EError,     5,EError,EError,EError,# 18-1f
      EError,EError,EError,EItsMe,EStart,EStart,EStart,EStart,# 20-27
    ]

    ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0]

    ISO2022KRSMModel = {'classTable' => ISO2022KR_cls,
                        'classFactor' => 6,
                        'stateTable' => ISO2022KR_st,
                        'charLenTable' => ISO2022KRCharLenTable,
                        'name' => "ISO-2022-KR"
                       }
  end
end
@@ -1,89 +0,0 @@
1
- ######################## BEGIN LICENSE BLOCK ########################
2
- # The Original Code is mozilla.org code.
3
- #
4
- # The Initial Developer of the Original Code is
5
- # Netscape Communications Corporation.
6
- # Portions created by the Initial Developer are Copyright (C) 1998
7
- # the Initial Developer. All Rights Reserved.
8
- #
9
- # Contributor(s):
10
- # Jeff Hodges - port to Ruby
11
- # Mark Pilgrim - port to Python
12
- #
13
- # This library is free software; you can redistribute it and/or
14
- # modify it under the terms of the GNU Lesser General Public
15
- # License as published by the Free Software Foundation; either
16
- # version 2.1 of the License, or (at your option) any later version.
17
- #
18
- # This library is distributed in the hope that it will be useful,
19
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
- # Lesser General Public License for more details.
22
- #
23
- # You should have received a copy of the GNU Lesser General Public
24
- # License along with this library; if not, write to the Free Software
25
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
- # 02110-1301 USA
27
- ######################### END LICENSE BLOCK #########################
28
module GIGO
  module CharDet
    # EUC-JP prober: runs the EUC-JP coding state machine over the input and
    # feeds each completed character to a context analyzer and a distribution
    # analyzer; confidence is the larger of the two analyzers' values.
    class EUCJPProber < MultiByteCharSetProber
      def initialize
        super
        @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
        @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new
        @_mContextAnalyzer = EUCJPContextAnalysis.new
        reset
      end

      # Resets the inherited state and, additionally, the context analyzer.
      def reset
        super
        @_mContextAnalyzer.reset
      end

      def get_charset_name
        "EUC-JP"
      end

      # Consumes aBuf one position at a time and returns get_state.
      def feed(aBuf)
        aLen = aBuf.length
        (0...aLen).each do |pos|
          case @_mCodingSM.next_state(aBuf[pos..pos])
          when EError
            $stderr << "#{get_charset_name} prober hit error at byte #{pos}\n" if $debug
            @_mState = ENotMe
            break
          when EItsMe
            @_mState = EFoundIt
            break
          when EStart
            # Back in EStart: a character just ended at this position.
            char_len = @_mCodingSM.get_current_charlen
            if pos.zero?
              # The character may have begun in the previous buffer, so pair
              # the remembered byte with the first byte of this one.
              @_mLastChar[1] = aBuf[0..0]
              @_mContextAnalyzer.feed(@_mLastChar, char_len)
              @_mDistributionAnalyzer.feed(@_mLastChar, char_len)
            else
              @_mContextAnalyzer.feed(aBuf[(pos - 1)..pos], char_len)
              @_mDistributionAnalyzer.feed(aBuf[(pos - 1)..pos], char_len)
            end
          end
        end

        # Remember the final byte for the next call.
        @_mLastChar[0] = aBuf[(aLen - 1)..(aLen - 1)]

        if get_state == EDetecting && @_mContextAnalyzer.got_enough_data && (get_confidence > SHORTCUT_THRESHOLD)
          @_mState = EFoundIt
        end

        get_state
      end

      # Larger of the context and distribution analyzers' confidences.
      def get_confidence
        [@_mContextAnalyzer.get_confidence, @_mDistributionAnalyzer.get_confidence].max
      end
    end
  end
end