rchardet 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
@@ -31,36 +31,37 @@ module CharDet
|
|
31
31
|
attr_accessor :active
|
32
32
|
|
33
33
|
def initialize(sm)
|
34
|
-
@
|
35
|
-
@
|
36
|
-
@
|
34
|
+
@model = sm
|
35
|
+
@currentBytePos = 0
|
36
|
+
@currentCharLen = 0
|
37
37
|
reset()
|
38
38
|
end
|
39
39
|
|
40
40
|
def reset
|
41
|
-
@
|
41
|
+
@currentState = EStart
|
42
42
|
end
|
43
43
|
|
44
44
|
def next_state(c)
|
45
45
|
# for each byte we get its class
|
46
46
|
# if it is first byte, we also get byte length
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
b = c.bytes.first
|
48
|
+
byteCls = @model['classTable'][b]
|
49
|
+
if @currentState == EStart
|
50
|
+
@currentBytePos = 0
|
51
|
+
@currentCharLen = @model['charLenTable'][byteCls]
|
51
52
|
end
|
52
53
|
# from byte's class and stateTable, we get its next state
|
53
|
-
@
|
54
|
-
@
|
55
|
-
return @
|
54
|
+
@currentState = @model['stateTable'][@currentState * @model['classFactor'] + byteCls]
|
55
|
+
@currentBytePos += 1
|
56
|
+
return @currentState
|
56
57
|
end
|
57
58
|
|
58
59
|
def get_current_charlen
|
59
|
-
return @
|
60
|
+
return @currentCharLen
|
60
61
|
end
|
61
62
|
|
62
63
|
def get_coding_state_machine
|
63
|
-
return @
|
64
|
+
return @model['name']
|
64
65
|
end
|
65
66
|
end
|
66
67
|
end
|
data/lib/rchardet/constants.rb
CHANGED
File without changes
|
data/lib/rchardet/escprober.rb
CHANGED
@@ -30,58 +30,58 @@ module CharDet
|
|
30
30
|
class EscCharSetProber < CharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
33
|
+
@codingSM = [
|
34
|
+
CodingStateMachine.new(HZSMModel),
|
35
|
+
CodingStateMachine.new(ISO2022CNSMModel),
|
36
|
+
CodingStateMachine.new(ISO2022JPSMModel),
|
37
|
+
CodingStateMachine.new(ISO2022KRSMModel)
|
38
|
+
]
|
39
39
|
reset()
|
40
40
|
end
|
41
41
|
|
42
42
|
def reset
|
43
43
|
super()
|
44
|
-
for codingSM in @
|
45
|
-
|
46
|
-
|
47
|
-
|
44
|
+
for codingSM in @codingSM
|
45
|
+
next if !codingSM
|
46
|
+
codingSM.active = true
|
47
|
+
codingSM.reset()
|
48
48
|
end
|
49
|
-
@
|
50
|
-
@
|
49
|
+
@activeSM = @codingSM.length
|
50
|
+
@detectedCharset = nil
|
51
51
|
end
|
52
52
|
|
53
53
|
def get_charset_name
|
54
|
-
return @
|
54
|
+
return @detectedCharset
|
55
55
|
end
|
56
56
|
|
57
57
|
def get_confidence
|
58
|
-
if @
|
59
|
-
|
58
|
+
if @detectedCharset
|
59
|
+
return 0.99
|
60
60
|
else
|
61
|
-
|
61
|
+
return 0.00
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
65
65
|
def feed(aBuf)
|
66
66
|
aBuf.each_byte do |b|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
67
|
+
c = b.chr
|
68
|
+
for codingSM in @codingSM
|
69
|
+
next unless codingSM
|
70
|
+
next unless codingSM.active
|
71
|
+
codingState = codingSM.next_state(c)
|
72
|
+
if codingState == EError
|
73
|
+
codingSM.active = false
|
74
|
+
@activeSM -= 1
|
75
|
+
if @activeSM <= 0
|
76
|
+
@state = ENotMe
|
77
|
+
return get_state()
|
78
|
+
end
|
79
|
+
elsif codingState == EItsMe
|
80
|
+
@state = EFoundIt
|
81
|
+
@detectedCharset = codingSM.get_coding_state_machine()
|
82
|
+
return get_state()
|
83
|
+
end
|
84
|
+
end
|
85
85
|
end
|
86
86
|
|
87
87
|
return get_state()
|
data/lib/rchardet/escsm.rb
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
9
|
# Contributor(s):
|
10
|
+
# Jeff Hodges - port to Ruby
|
10
11
|
# Mark Pilgrim - port to Python
|
11
12
|
#
|
12
13
|
# This library is free software; you can redistribute it and/or
|
@@ -59,7 +60,7 @@ module CharDet
|
|
59
60
|
1,1,1,1,1,1,1,1, # e8 - ef
|
60
61
|
1,1,1,1,1,1,1,1, # f0 - f7
|
61
62
|
1,1,1,1,1,1,1,1, # f8 - ff
|
62
|
-
]
|
63
|
+
].freeze
|
63
64
|
|
64
65
|
HZ_st = [
|
65
66
|
EStart,EError, 3,EStart,EStart,EStart,EError,EError,# 00-07
|
@@ -68,16 +69,16 @@ module CharDet
|
|
68
69
|
5,EError, 6,EError, 5, 5, 4,EError,# 18-1f
|
69
70
|
4,EError, 4, 4, 4,EError, 4,EError,# 20-27
|
70
71
|
4,EItsMe,EStart,EStart,EStart,EStart,EStart,EStart,# 28-2f
|
71
|
-
]
|
72
|
+
].freeze
|
72
73
|
|
73
|
-
HZCharLenTable = [0, 0, 0, 0, 0, 0]
|
74
|
+
HZCharLenTable = [0, 0, 0, 0, 0, 0].freeze
|
74
75
|
|
75
76
|
HZSMModel = {'classTable' => HZ_cls,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
}
|
77
|
+
'classFactor' => 6,
|
78
|
+
'stateTable' => HZ_st,
|
79
|
+
'charLenTable' => HZCharLenTable,
|
80
|
+
'name' => "HZ-GB-2312"
|
81
|
+
}.freeze
|
81
82
|
|
82
83
|
ISO2022CN_cls = [
|
83
84
|
2,0,0,0,0,0,0,0, # 00 - 07
|
@@ -112,7 +113,7 @@ ISO2022CN_cls = [
|
|
112
113
|
2,2,2,2,2,2,2,2, # e8 - ef
|
113
114
|
2,2,2,2,2,2,2,2, # f0 - f7
|
114
115
|
2,2,2,2,2,2,2,2, # f8 - ff
|
115
|
-
]
|
116
|
+
].freeze
|
116
117
|
|
117
118
|
ISO2022CN_st = [
|
118
119
|
EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
|
@@ -123,16 +124,16 @@ EError,EError,EError,EItsMe,EError,EError,EError,EError,# 20-27
|
|
123
124
|
5, 6,EError,EError,EError,EError,EError,EError,# 28-2f
|
124
125
|
EError,EError,EError,EItsMe,EError,EError,EError,EError,# 30-37
|
125
126
|
EError,EError,EError,EError,EError,EItsMe,EError,EStart,# 38-3f
|
126
|
-
]
|
127
|
+
].freeze
|
127
128
|
|
128
|
-
ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0]
|
129
|
+
ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
|
129
130
|
|
130
131
|
ISO2022CNSMModel = {'classTable' => ISO2022CN_cls,
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
}
|
132
|
+
'classFactor' => 9,
|
133
|
+
'stateTable' => ISO2022CN_st,
|
134
|
+
'charLenTable' => ISO2022CNCharLenTable,
|
135
|
+
'name' => "ISO-2022-CN"
|
136
|
+
}.freeze
|
136
137
|
|
137
138
|
ISO2022JP_cls = [
|
138
139
|
2,0,0,0,0,0,0,0, # 00 - 07
|
@@ -167,7 +168,7 @@ ISO2022JP_cls = [
|
|
167
168
|
2,2,2,2,2,2,2,2, # e8 - ef
|
168
169
|
2,2,2,2,2,2,2,2, # f0 - f7
|
169
170
|
2,2,2,2,2,2,2,2, # f8 - ff
|
170
|
-
]
|
171
|
+
].freeze
|
171
172
|
|
172
173
|
ISO2022JP_st = [
|
173
174
|
EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
|
@@ -179,16 +180,16 @@ EError,EError,EError, 6,EItsMe,EError,EItsMe,EError,# 28-2f
|
|
179
180
|
EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,# 30-37
|
180
181
|
EError,EError,EError,EItsMe,EError,EError,EError,EError,# 38-3f
|
181
182
|
EError,EError,EError,EError,EItsMe,EError,EStart,EStart,# 40-47
|
182
|
-
]
|
183
|
+
].freeze
|
183
184
|
|
184
|
-
ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
185
|
+
ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
|
185
186
|
|
186
187
|
ISO2022JPSMModel = {'classTable' => ISO2022JP_cls,
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
}
|
188
|
+
'classFactor' => 10,
|
189
|
+
'stateTable' => ISO2022JP_st,
|
190
|
+
'charLenTable' => ISO2022JPCharLenTable,
|
191
|
+
'name' => "ISO-2022-JP"
|
192
|
+
}.freeze
|
192
193
|
|
193
194
|
ISO2022KR_cls = [
|
194
195
|
2,0,0,0,0,0,0,0, # 00 - 07
|
@@ -223,7 +224,7 @@ ISO2022KR_cls = [
|
|
223
224
|
2,2,2,2,2,2,2,2, # e8 - ef
|
224
225
|
2,2,2,2,2,2,2,2, # f0 - f7
|
225
226
|
2,2,2,2,2,2,2,2, # f8 - ff
|
226
|
-
]
|
227
|
+
].freeze
|
227
228
|
|
228
229
|
ISO2022KR_st = [
|
229
230
|
EStart, 3,EError,EStart,EStart,EStart,EError,EError,# 00-07
|
@@ -231,14 +232,14 @@ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
|
|
231
232
|
EItsMe,EItsMe,EError,EError,EError, 4,EError,EError,# 10-17
|
232
233
|
EError,EError,EError,EError, 5,EError,EError,EError,# 18-1f
|
233
234
|
EError,EError,EError,EItsMe,EStart,EStart,EStart,EStart,# 20-27
|
234
|
-
]
|
235
|
+
].freeze
|
235
236
|
|
236
|
-
ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0]
|
237
|
+
ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0].freeze
|
237
238
|
|
238
239
|
ISO2022KRSMModel = {'classTable' => ISO2022KR_cls,
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
}
|
240
|
+
'classFactor' => 6,
|
241
|
+
'stateTable' => ISO2022KR_st,
|
242
|
+
'charLenTable' => ISO2022KRCharLenTable,
|
243
|
+
'name' => "ISO-2022-KR"
|
244
|
+
}.freeze
|
244
245
|
end
|
data/lib/rchardet/eucjpprober.rb
CHANGED
@@ -30,15 +30,15 @@ module CharDet
|
|
30
30
|
class EUCJPProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
35
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(EUCJPSMModel)
|
34
|
+
@distributionAnalyzer = EUCJPDistributionAnalysis.new()
|
35
|
+
@contextAnalyzer = EUCJPContextAnalysis.new()
|
36
36
|
reset
|
37
37
|
end
|
38
38
|
|
39
39
|
def reset
|
40
40
|
super()
|
41
|
-
@
|
41
|
+
@contextAnalyzer.reset()
|
42
42
|
end
|
43
43
|
|
44
44
|
def get_charset_name
|
@@ -48,40 +48,40 @@ module CharDet
|
|
48
48
|
def feed(aBuf)
|
49
49
|
aLen = aBuf.length
|
50
50
|
for i in (0...aLen)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
51
|
+
codingState = @codingSM.next_state(aBuf[i, 1])
|
52
|
+
if codingState == EError
|
53
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
54
|
+
@state = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@state = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
charLen = @codingSM.get_current_charlen()
|
61
|
+
if i == 0
|
62
|
+
@lastChar[1] = aBuf[0, 1]
|
63
|
+
@contextAnalyzer.feed(@lastChar, charLen)
|
64
|
+
@distributionAnalyzer.feed(@lastChar, charLen)
|
65
|
+
else
|
66
|
+
@contextAnalyzer.feed(aBuf[i-1, 2], charLen)
|
67
|
+
@distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
|
68
|
+
end
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
|
-
@
|
72
|
+
@lastChar[0] = aBuf[aLen-1, 1]
|
73
73
|
|
74
74
|
if get_state() == EDetecting
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
if @contextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
76
|
+
@state = EFoundIt
|
77
|
+
end
|
78
78
|
end
|
79
79
|
|
80
80
|
return get_state()
|
81
81
|
end
|
82
82
|
|
83
83
|
def get_confidence
|
84
|
-
l = [@
|
84
|
+
l = [@contextAnalyzer.get_confidence,@distributionAnalyzer.get_confidence]
|
85
85
|
return l.max
|
86
86
|
end
|
87
87
|
end
|
data/lib/rchardet/euckrfreq.rb
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
9
|
# Contributor(s):
|
10
|
+
# Jeff Hodges - port to Ruby
|
10
11
|
# Mark Pilgrim - port to Python
|
11
12
|
#
|
12
13
|
# This library is free software; you can redistribute it and/or
|
@@ -592,5 +593,5 @@ EUCKRCharToFreqOrder = [
|
|
592
593
|
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
593
594
|
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
594
595
|
8736,8737,8738,8739,8740,8741
|
595
|
-
]
|
596
|
+
].freeze
|
596
597
|
end
|
data/lib/rchardet/euckrprober.rb
CHANGED
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class EUCKRProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(EUCKRSMModel)
|
34
|
+
@distributionAnalyzer = EUCKRDistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
data/lib/rchardet/euctwfreq.rb
CHANGED
@@ -426,5 +426,6 @@ EUCTWCharToFreqOrder = [
|
|
426
426
|
8678,8679,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689,8690,8691,8692,8693, # 8694
|
427
427
|
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
|
428
428
|
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
|
429
|
-
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741
|
429
|
+
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741 # 8742
|
430
|
+
].freeze
|
430
431
|
end
|
data/lib/rchardet/euctwprober.rb
CHANGED
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class EUCTWProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(EUCTWSMModel)
|
34
|
+
@distributionAnalyzer = EUCTWDistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
data/lib/rchardet/gb2312freq.rb
CHANGED
@@ -469,6 +469,6 @@ GB2312CharToFreqOrder = [
|
|
469
469
|
6271,3875,5768,6094,5034,5506,4376,5769,6761,2120,6476,5253,5770,6762,5771,5970,
|
470
470
|
3990,5971,5557,5558,5772,6477,6095,2787,4641,5972,5121,6096,6097,6272,6763,3703,
|
471
471
|
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
|
472
|
-
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767
|
473
|
-
|
472
|
+
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767
|
473
|
+
].freeze
|
474
474
|
end
|
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class GB2312Prober < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(GB2312SMModel)
|
34
|
+
@distributionAnalyzer = GB2312DistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
@@ -150,25 +150,25 @@ module CharDet
|
|
150
150
|
class HebrewProber < CharSetProber
|
151
151
|
def initialize
|
152
152
|
super()
|
153
|
-
@
|
154
|
-
@
|
153
|
+
@logicalProber = nil
|
154
|
+
@visualProber = nil
|
155
155
|
reset()
|
156
156
|
end
|
157
157
|
|
158
158
|
def reset
|
159
|
-
@
|
160
|
-
@
|
159
|
+
@finalCharLogicalScore = 0
|
160
|
+
@finalCharVisualScore = 0
|
161
161
|
# The two last characters seen in the previous buffer,
|
162
162
|
# mPrev and mBeforePrev are initialized to space in order to simulate a word
|
163
163
|
# delimiter at the beginning of the data
|
164
|
-
@
|
165
|
-
@
|
164
|
+
@prev = ' '
|
165
|
+
@beforePrev = ' '
|
166
166
|
# These probers are owned by the group prober.
|
167
167
|
end
|
168
168
|
|
169
169
|
def set_model_probers(logicalProber, visualProber)
|
170
|
-
@
|
171
|
-
@
|
170
|
+
@logicalProber = logicalProber
|
171
|
+
@visualProber = visualProber
|
172
172
|
end
|
173
173
|
|
174
174
|
def is_final(c)
|
@@ -215,34 +215,34 @@ module CharDet
|
|
215
215
|
# so the word boundary detection works properly. [MAP]
|
216
216
|
|
217
217
|
if get_state() == ENotMe
|
218
|
-
|
219
|
-
|
218
|
+
# Both model probers say it's not them. No reason to continue.
|
219
|
+
return ENotMe
|
220
220
|
end
|
221
221
|
|
222
222
|
aBuf = filter_high_bit_only(aBuf)
|
223
223
|
|
224
224
|
for cur in aBuf.split(' ')
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
225
|
+
if cur == ' '
|
226
|
+
# We stand on a space - a word just ended
|
227
|
+
if @beforePrev != ' '
|
228
|
+
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
229
|
+
if is_final(@prev)
|
230
|
+
# case (1) [-2:not space][-1:final letter][cur:space]
|
231
|
+
@finalCharLogicalScore += 1
|
232
|
+
elsif is_non_final(@prev)
|
233
|
+
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
234
|
+
@finalCharVisualScore += 1
|
235
|
+
end
|
236
|
+
end
|
237
|
+
else
|
238
|
+
# Not standing on a space
|
239
|
+
if (@beforePrev == ' ') and (is_final(@prev)) and (cur != ' ')
|
240
|
+
# case (3) [-2:space][-1:final letter][cur:not space]
|
241
|
+
@finalCharVisualScore += 1
|
242
|
+
end
|
243
|
+
end
|
244
|
+
@beforePrev = @prev
|
245
|
+
@prev = cur
|
246
246
|
end
|
247
247
|
|
248
248
|
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
@@ -252,26 +252,26 @@ module CharDet
|
|
252
252
|
def get_charset_name
|
253
253
|
# Make the decision: is it Logical or Visual?
|
254
254
|
# If the final letter score distance is dominant enough, rely on it.
|
255
|
-
finalsub = @
|
255
|
+
finalsub = @finalCharLogicalScore - @finalCharVisualScore
|
256
256
|
if finalsub >= MIN_FINAL_CHAR_DISTANCE
|
257
|
-
|
257
|
+
return LOGICAL_HEBREW_NAME
|
258
258
|
end
|
259
259
|
if finalsub <= -MIN_FINAL_CHAR_DISTANCE
|
260
|
-
|
260
|
+
return VISUAL_HEBREW_NAME
|
261
261
|
end
|
262
262
|
|
263
263
|
# It's not dominant enough, try to rely on the model scores instead.
|
264
|
-
modelsub = @
|
264
|
+
modelsub = @logicalProber.get_confidence() - @visualProber.get_confidence()
|
265
265
|
if modelsub > MIN_MODEL_DISTANCE
|
266
|
-
|
266
|
+
return LOGICAL_HEBREW_NAME
|
267
267
|
end
|
268
268
|
if modelsub < -MIN_MODEL_DISTANCE
|
269
|
-
|
269
|
+
return VISUAL_HEBREW_NAME
|
270
270
|
end
|
271
271
|
|
272
272
|
# Still no good, back to final letter distance, maybe it'll save the day.
|
273
273
|
if finalsub < 0.0
|
274
|
-
|
274
|
+
return VISUAL_HEBREW_NAME
|
275
275
|
end
|
276
276
|
|
277
277
|
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
@@ -280,8 +280,8 @@ module CharDet
|
|
280
280
|
|
281
281
|
def get_state
|
282
282
|
# Remain active as long as any of the model probers are active.
|
283
|
-
if (@
|
284
|
-
|
283
|
+
if (@logicalProber.get_state() == ENotMe) and (@visualProber.get_state() == ENotMe)
|
284
|
+
return ENotMe
|
285
285
|
end
|
286
286
|
return EDetecting
|
287
287
|
end
|