rchardet 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rchardet.rb +1 -3
- data/lib/rchardet/big5freq.rb +2 -2
- data/lib/rchardet/big5prober.rb +2 -2
- data/lib/rchardet/chardistribution.rb +74 -69
- data/lib/rchardet/charsetgroupprober.rb +50 -52
- data/lib/rchardet/charsetprober.rb +2 -7
- data/lib/rchardet/codingstatemachine.rb +14 -13
- data/lib/rchardet/constants.rb +0 -0
- data/lib/rchardet/escprober.rb +34 -34
- data/lib/rchardet/escsm.rb +33 -32
- data/lib/rchardet/eucjpprober.rb +28 -28
- data/lib/rchardet/euckrfreq.rb +2 -1
- data/lib/rchardet/euckrprober.rb +2 -2
- data/lib/rchardet/euctwfreq.rb +2 -1
- data/lib/rchardet/euctwprober.rb +2 -2
- data/lib/rchardet/gb2312freq.rb +2 -2
- data/lib/rchardet/gb2312prober.rb +2 -2
- data/lib/rchardet/hebrewprober.rb +40 -40
- data/lib/rchardet/jisfreq.rb +2 -1
- data/lib/rchardet/jpcntx.rb +131 -130
- data/lib/rchardet/langbulgarianmodel.rb +6 -6
- data/lib/rchardet/langcyrillicmodel.rb +13 -13
- data/lib/rchardet/langgreekmodel.rb +5 -5
- data/lib/rchardet/langhebrewmodel.rb +3 -3
- data/lib/rchardet/langhungarianmodel.rb +5 -5
- data/lib/rchardet/langthaimodel.rb +3 -3
- data/lib/rchardet/latin1prober.rb +18 -18
- data/lib/rchardet/mbcharsetprober.rb +30 -30
- data/lib/rchardet/mbcsgroupprober.rb +9 -9
- data/lib/rchardet/mbcssm.rb +72 -72
- data/lib/rchardet/sbcharsetprober.rb +48 -50
- data/lib/rchardet/sbcsgroupprober.rb +16 -16
- data/lib/rchardet/sjisprober.rb +28 -28
- data/lib/rchardet/universaldetector.rb +92 -90
- data/lib/rchardet/utf8prober.rb +25 -25
- data/lib/rchardet/version.rb +3 -0
- metadata +30 -47
- data/COPYING +0 -504
- data/README +0 -12
@@ -31,36 +31,37 @@ module CharDet
|
|
31
31
|
attr_accessor :active
|
32
32
|
|
33
33
|
def initialize(sm)
|
34
|
-
@
|
35
|
-
@
|
36
|
-
@
|
34
|
+
@model = sm
|
35
|
+
@currentBytePos = 0
|
36
|
+
@currentCharLen = 0
|
37
37
|
reset()
|
38
38
|
end
|
39
39
|
|
40
40
|
def reset
|
41
|
-
@
|
41
|
+
@currentState = EStart
|
42
42
|
end
|
43
43
|
|
44
44
|
def next_state(c)
|
45
45
|
# for each byte we get its class
|
46
46
|
# if it is first byte, we also get byte length
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
b = c.bytes.first
|
48
|
+
byteCls = @model['classTable'][b]
|
49
|
+
if @currentState == EStart
|
50
|
+
@currentBytePos = 0
|
51
|
+
@currentCharLen = @model['charLenTable'][byteCls]
|
51
52
|
end
|
52
53
|
# from byte's class and stateTable, we get its next state
|
53
|
-
@
|
54
|
-
@
|
55
|
-
return @
|
54
|
+
@currentState = @model['stateTable'][@currentState * @model['classFactor'] + byteCls]
|
55
|
+
@currentBytePos += 1
|
56
|
+
return @currentState
|
56
57
|
end
|
57
58
|
|
58
59
|
def get_current_charlen
|
59
|
-
return @
|
60
|
+
return @currentCharLen
|
60
61
|
end
|
61
62
|
|
62
63
|
def get_coding_state_machine
|
63
|
-
return @
|
64
|
+
return @model['name']
|
64
65
|
end
|
65
66
|
end
|
66
67
|
end
|
data/lib/rchardet/constants.rb
CHANGED
File without changes
|
data/lib/rchardet/escprober.rb
CHANGED
@@ -30,58 +30,58 @@ module CharDet
|
|
30
30
|
class EscCharSetProber < CharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
33
|
+
@codingSM = [
|
34
|
+
CodingStateMachine.new(HZSMModel),
|
35
|
+
CodingStateMachine.new(ISO2022CNSMModel),
|
36
|
+
CodingStateMachine.new(ISO2022JPSMModel),
|
37
|
+
CodingStateMachine.new(ISO2022KRSMModel)
|
38
|
+
]
|
39
39
|
reset()
|
40
40
|
end
|
41
41
|
|
42
42
|
def reset
|
43
43
|
super()
|
44
|
-
for codingSM in @
|
45
|
-
|
46
|
-
|
47
|
-
|
44
|
+
for codingSM in @codingSM
|
45
|
+
next if !codingSM
|
46
|
+
codingSM.active = true
|
47
|
+
codingSM.reset()
|
48
48
|
end
|
49
|
-
@
|
50
|
-
@
|
49
|
+
@activeSM = @codingSM.length
|
50
|
+
@detectedCharset = nil
|
51
51
|
end
|
52
52
|
|
53
53
|
def get_charset_name
|
54
|
-
return @
|
54
|
+
return @detectedCharset
|
55
55
|
end
|
56
56
|
|
57
57
|
def get_confidence
|
58
|
-
if @
|
59
|
-
|
58
|
+
if @detectedCharset
|
59
|
+
return 0.99
|
60
60
|
else
|
61
|
-
|
61
|
+
return 0.00
|
62
62
|
end
|
63
63
|
end
|
64
64
|
|
65
65
|
def feed(aBuf)
|
66
66
|
aBuf.each_byte do |b|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
67
|
+
c = b.chr
|
68
|
+
for codingSM in @codingSM
|
69
|
+
next unless codingSM
|
70
|
+
next unless codingSM.active
|
71
|
+
codingState = codingSM.next_state(c)
|
72
|
+
if codingState == EError
|
73
|
+
codingSM.active = false
|
74
|
+
@activeSM -= 1
|
75
|
+
if @activeSM <= 0
|
76
|
+
@state = ENotMe
|
77
|
+
return get_state()
|
78
|
+
end
|
79
|
+
elsif codingState == EItsMe
|
80
|
+
@state = EFoundIt
|
81
|
+
@detectedCharset = codingSM.get_coding_state_machine()
|
82
|
+
return get_state()
|
83
|
+
end
|
84
|
+
end
|
85
85
|
end
|
86
86
|
|
87
87
|
return get_state()
|
data/lib/rchardet/escsm.rb
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
9
|
# Contributor(s):
|
10
|
+
# Jeff Hodges - port to Ruby
|
10
11
|
# Mark Pilgrim - port to Python
|
11
12
|
#
|
12
13
|
# This library is free software; you can redistribute it and/or
|
@@ -59,7 +60,7 @@ module CharDet
|
|
59
60
|
1,1,1,1,1,1,1,1, # e8 - ef
|
60
61
|
1,1,1,1,1,1,1,1, # f0 - f7
|
61
62
|
1,1,1,1,1,1,1,1, # f8 - ff
|
62
|
-
]
|
63
|
+
].freeze
|
63
64
|
|
64
65
|
HZ_st = [
|
65
66
|
EStart,EError, 3,EStart,EStart,EStart,EError,EError,# 00-07
|
@@ -68,16 +69,16 @@ module CharDet
|
|
68
69
|
5,EError, 6,EError, 5, 5, 4,EError,# 18-1f
|
69
70
|
4,EError, 4, 4, 4,EError, 4,EError,# 20-27
|
70
71
|
4,EItsMe,EStart,EStart,EStart,EStart,EStart,EStart,# 28-2f
|
71
|
-
]
|
72
|
+
].freeze
|
72
73
|
|
73
|
-
HZCharLenTable = [0, 0, 0, 0, 0, 0]
|
74
|
+
HZCharLenTable = [0, 0, 0, 0, 0, 0].freeze
|
74
75
|
|
75
76
|
HZSMModel = {'classTable' => HZ_cls,
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
}
|
77
|
+
'classFactor' => 6,
|
78
|
+
'stateTable' => HZ_st,
|
79
|
+
'charLenTable' => HZCharLenTable,
|
80
|
+
'name' => "HZ-GB-2312"
|
81
|
+
}.freeze
|
81
82
|
|
82
83
|
ISO2022CN_cls = [
|
83
84
|
2,0,0,0,0,0,0,0, # 00 - 07
|
@@ -112,7 +113,7 @@ ISO2022CN_cls = [
|
|
112
113
|
2,2,2,2,2,2,2,2, # e8 - ef
|
113
114
|
2,2,2,2,2,2,2,2, # f0 - f7
|
114
115
|
2,2,2,2,2,2,2,2, # f8 - ff
|
115
|
-
]
|
116
|
+
].freeze
|
116
117
|
|
117
118
|
ISO2022CN_st = [
|
118
119
|
EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
|
@@ -123,16 +124,16 @@ EError,EError,EError,EItsMe,EError,EError,EError,EError,# 20-27
|
|
123
124
|
5, 6,EError,EError,EError,EError,EError,EError,# 28-2f
|
124
125
|
EError,EError,EError,EItsMe,EError,EError,EError,EError,# 30-37
|
125
126
|
EError,EError,EError,EError,EError,EItsMe,EError,EStart,# 38-3f
|
126
|
-
]
|
127
|
+
].freeze
|
127
128
|
|
128
|
-
ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0]
|
129
|
+
ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
|
129
130
|
|
130
131
|
ISO2022CNSMModel = {'classTable' => ISO2022CN_cls,
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
}
|
132
|
+
'classFactor' => 9,
|
133
|
+
'stateTable' => ISO2022CN_st,
|
134
|
+
'charLenTable' => ISO2022CNCharLenTable,
|
135
|
+
'name' => "ISO-2022-CN"
|
136
|
+
}.freeze
|
136
137
|
|
137
138
|
ISO2022JP_cls = [
|
138
139
|
2,0,0,0,0,0,0,0, # 00 - 07
|
@@ -167,7 +168,7 @@ ISO2022JP_cls = [
|
|
167
168
|
2,2,2,2,2,2,2,2, # e8 - ef
|
168
169
|
2,2,2,2,2,2,2,2, # f0 - f7
|
169
170
|
2,2,2,2,2,2,2,2, # f8 - ff
|
170
|
-
]
|
171
|
+
].freeze
|
171
172
|
|
172
173
|
ISO2022JP_st = [
|
173
174
|
EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
|
@@ -179,16 +180,16 @@ EError,EError,EError, 6,EItsMe,EError,EItsMe,EError,# 28-2f
|
|
179
180
|
EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,# 30-37
|
180
181
|
EError,EError,EError,EItsMe,EError,EError,EError,EError,# 38-3f
|
181
182
|
EError,EError,EError,EError,EItsMe,EError,EStart,EStart,# 40-47
|
182
|
-
]
|
183
|
+
].freeze
|
183
184
|
|
184
|
-
ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
185
|
+
ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
|
185
186
|
|
186
187
|
ISO2022JPSMModel = {'classTable' => ISO2022JP_cls,
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
}
|
188
|
+
'classFactor' => 10,
|
189
|
+
'stateTable' => ISO2022JP_st,
|
190
|
+
'charLenTable' => ISO2022JPCharLenTable,
|
191
|
+
'name' => "ISO-2022-JP"
|
192
|
+
}.freeze
|
192
193
|
|
193
194
|
ISO2022KR_cls = [
|
194
195
|
2,0,0,0,0,0,0,0, # 00 - 07
|
@@ -223,7 +224,7 @@ ISO2022KR_cls = [
|
|
223
224
|
2,2,2,2,2,2,2,2, # e8 - ef
|
224
225
|
2,2,2,2,2,2,2,2, # f0 - f7
|
225
226
|
2,2,2,2,2,2,2,2, # f8 - ff
|
226
|
-
]
|
227
|
+
].freeze
|
227
228
|
|
228
229
|
ISO2022KR_st = [
|
229
230
|
EStart, 3,EError,EStart,EStart,EStart,EError,EError,# 00-07
|
@@ -231,14 +232,14 @@ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
|
|
231
232
|
EItsMe,EItsMe,EError,EError,EError, 4,EError,EError,# 10-17
|
232
233
|
EError,EError,EError,EError, 5,EError,EError,EError,# 18-1f
|
233
234
|
EError,EError,EError,EItsMe,EStart,EStart,EStart,EStart,# 20-27
|
234
|
-
]
|
235
|
+
].freeze
|
235
236
|
|
236
|
-
ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0]
|
237
|
+
ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0].freeze
|
237
238
|
|
238
239
|
ISO2022KRSMModel = {'classTable' => ISO2022KR_cls,
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
}
|
240
|
+
'classFactor' => 6,
|
241
|
+
'stateTable' => ISO2022KR_st,
|
242
|
+
'charLenTable' => ISO2022KRCharLenTable,
|
243
|
+
'name' => "ISO-2022-KR"
|
244
|
+
}.freeze
|
244
245
|
end
|
data/lib/rchardet/eucjpprober.rb
CHANGED
@@ -30,15 +30,15 @@ module CharDet
|
|
30
30
|
class EUCJPProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
35
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(EUCJPSMModel)
|
34
|
+
@distributionAnalyzer = EUCJPDistributionAnalysis.new()
|
35
|
+
@contextAnalyzer = EUCJPContextAnalysis.new()
|
36
36
|
reset
|
37
37
|
end
|
38
38
|
|
39
39
|
def reset
|
40
40
|
super()
|
41
|
-
@
|
41
|
+
@contextAnalyzer.reset()
|
42
42
|
end
|
43
43
|
|
44
44
|
def get_charset_name
|
@@ -48,40 +48,40 @@ module CharDet
|
|
48
48
|
def feed(aBuf)
|
49
49
|
aLen = aBuf.length
|
50
50
|
for i in (0...aLen)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
51
|
+
codingState = @codingSM.next_state(aBuf[i, 1])
|
52
|
+
if codingState == EError
|
53
|
+
$stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
|
54
|
+
@state = ENotMe
|
55
|
+
break
|
56
|
+
elsif codingState == EItsMe
|
57
|
+
@state = EFoundIt
|
58
|
+
break
|
59
|
+
elsif codingState == EStart
|
60
|
+
charLen = @codingSM.get_current_charlen()
|
61
|
+
if i == 0
|
62
|
+
@lastChar[1] = aBuf[0, 1]
|
63
|
+
@contextAnalyzer.feed(@lastChar, charLen)
|
64
|
+
@distributionAnalyzer.feed(@lastChar, charLen)
|
65
|
+
else
|
66
|
+
@contextAnalyzer.feed(aBuf[i-1, 2], charLen)
|
67
|
+
@distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
|
68
|
+
end
|
69
|
+
end
|
70
70
|
end
|
71
71
|
|
72
|
-
@
|
72
|
+
@lastChar[0] = aBuf[aLen-1, 1]
|
73
73
|
|
74
74
|
if get_state() == EDetecting
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
if @contextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
|
76
|
+
@state = EFoundIt
|
77
|
+
end
|
78
78
|
end
|
79
79
|
|
80
80
|
return get_state()
|
81
81
|
end
|
82
82
|
|
83
83
|
def get_confidence
|
84
|
-
l = [@
|
84
|
+
l = [@contextAnalyzer.get_confidence,@distributionAnalyzer.get_confidence]
|
85
85
|
return l.max
|
86
86
|
end
|
87
87
|
end
|
data/lib/rchardet/euckrfreq.rb
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
# the Initial Developer. All Rights Reserved.
|
8
8
|
#
|
9
9
|
# Contributor(s):
|
10
|
+
# Jeff Hodges - port to Ruby
|
10
11
|
# Mark Pilgrim - port to Python
|
11
12
|
#
|
12
13
|
# This library is free software; you can redistribute it and/or
|
@@ -592,5 +593,5 @@ EUCKRCharToFreqOrder = [
|
|
592
593
|
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
593
594
|
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
594
595
|
8736,8737,8738,8739,8740,8741
|
595
|
-
]
|
596
|
+
].freeze
|
596
597
|
end
|
data/lib/rchardet/euckrprober.rb
CHANGED
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class EUCKRProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(EUCKRSMModel)
|
34
|
+
@distributionAnalyzer = EUCKRDistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
data/lib/rchardet/euctwfreq.rb
CHANGED
@@ -426,5 +426,6 @@ EUCTWCharToFreqOrder = [
|
|
426
426
|
8678,8679,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689,8690,8691,8692,8693, # 8694
|
427
427
|
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
|
428
428
|
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
|
429
|
-
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741
|
429
|
+
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741 # 8742
|
430
|
+
].freeze
|
430
431
|
end
|
data/lib/rchardet/euctwprober.rb
CHANGED
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class EUCTWProber < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super()
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(EUCTWSMModel)
|
34
|
+
@distributionAnalyzer = EUCTWDistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
data/lib/rchardet/gb2312freq.rb
CHANGED
@@ -469,6 +469,6 @@ GB2312CharToFreqOrder = [
|
|
469
469
|
6271,3875,5768,6094,5034,5506,4376,5769,6761,2120,6476,5253,5770,6762,5771,5970,
|
470
470
|
3990,5971,5557,5558,5772,6477,6095,2787,4641,5972,5121,6096,6097,6272,6763,3703,
|
471
471
|
5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
|
472
|
-
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767
|
473
|
-
|
472
|
+
4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767
|
473
|
+
].freeze
|
474
474
|
end
|
@@ -30,8 +30,8 @@ module CharDet
|
|
30
30
|
class GB2312Prober < MultiByteCharSetProber
|
31
31
|
def initialize
|
32
32
|
super
|
33
|
-
@
|
34
|
-
@
|
33
|
+
@codingSM = CodingStateMachine.new(GB2312SMModel)
|
34
|
+
@distributionAnalyzer = GB2312DistributionAnalysis.new()
|
35
35
|
reset()
|
36
36
|
end
|
37
37
|
|
@@ -150,25 +150,25 @@ module CharDet
|
|
150
150
|
class HebrewProber < CharSetProber
|
151
151
|
def initialize
|
152
152
|
super()
|
153
|
-
@
|
154
|
-
@
|
153
|
+
@logicalProber = nil
|
154
|
+
@visualProber = nil
|
155
155
|
reset()
|
156
156
|
end
|
157
157
|
|
158
158
|
def reset
|
159
|
-
@
|
160
|
-
@
|
159
|
+
@finalCharLogicalScore = 0
|
160
|
+
@finalCharVisualScore = 0
|
161
161
|
# The two last characters seen in the previous buffer,
|
162
162
|
# mPrev and mBeforePrev are initialized to space in order to simulate a word
|
163
163
|
# delimiter at the beginning of the data
|
164
|
-
@
|
165
|
-
@
|
164
|
+
@prev = ' '
|
165
|
+
@beforePrev = ' '
|
166
166
|
# These probers are owned by the group prober.
|
167
167
|
end
|
168
168
|
|
169
169
|
def set_model_probers(logicalProber, visualProber)
|
170
|
-
@
|
171
|
-
@
|
170
|
+
@logicalProber = logicalProber
|
171
|
+
@visualProber = visualProber
|
172
172
|
end
|
173
173
|
|
174
174
|
def is_final(c)
|
@@ -215,34 +215,34 @@ module CharDet
|
|
215
215
|
# so the word boundary detection works properly. [MAP]
|
216
216
|
|
217
217
|
if get_state() == ENotMe
|
218
|
-
|
219
|
-
|
218
|
+
# Both model probers say it's not them. No reason to continue.
|
219
|
+
return ENotMe
|
220
220
|
end
|
221
221
|
|
222
222
|
aBuf = filter_high_bit_only(aBuf)
|
223
223
|
|
224
224
|
for cur in aBuf.split(' ')
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
225
|
+
if cur == ' '
|
226
|
+
# We stand on a space - a word just ended
|
227
|
+
if @beforePrev != ' '
|
228
|
+
# next-to-last char was not a space so self._mPrev is not a 1 letter word
|
229
|
+
if is_final(@prev)
|
230
|
+
# case (1) [-2:not space][-1:final letter][cur:space]
|
231
|
+
@finalCharLogicalScore += 1
|
232
|
+
elsif is_non_final(@prev)
|
233
|
+
# case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
234
|
+
@finalCharVisualScore += 1
|
235
|
+
end
|
236
|
+
end
|
237
|
+
else
|
238
|
+
# Not standing on a space
|
239
|
+
if (@beforePrev == ' ') and (is_final(@prev)) and (cur != ' ')
|
240
|
+
# case (3) [-2:space][-1:final letter][cur:not space]
|
241
|
+
@finalCharVisualScore += 1
|
242
|
+
end
|
243
|
+
end
|
244
|
+
@beforePrev = @prev
|
245
|
+
@prev = cur
|
246
246
|
end
|
247
247
|
|
248
248
|
# Forever detecting, till the end or until both model probers return eNotMe (handled above)
|
@@ -252,26 +252,26 @@ module CharDet
|
|
252
252
|
def get_charset_name
|
253
253
|
# Make the decision: is it Logical or Visual?
|
254
254
|
# If the final letter score distance is dominant enough, rely on it.
|
255
|
-
finalsub = @
|
255
|
+
finalsub = @finalCharLogicalScore - @finalCharVisualScore
|
256
256
|
if finalsub >= MIN_FINAL_CHAR_DISTANCE
|
257
|
-
|
257
|
+
return LOGICAL_HEBREW_NAME
|
258
258
|
end
|
259
259
|
if finalsub <= -MIN_FINAL_CHAR_DISTANCE
|
260
|
-
|
260
|
+
return VISUAL_HEBREW_NAME
|
261
261
|
end
|
262
262
|
|
263
263
|
# It's not dominant enough, try to rely on the model scores instead.
|
264
|
-
modelsub = @
|
264
|
+
modelsub = @logicalProber.get_confidence() - @visualProber.get_confidence()
|
265
265
|
if modelsub > MIN_MODEL_DISTANCE
|
266
|
-
|
266
|
+
return LOGICAL_HEBREW_NAME
|
267
267
|
end
|
268
268
|
if modelsub < -MIN_MODEL_DISTANCE
|
269
|
-
|
269
|
+
return VISUAL_HEBREW_NAME
|
270
270
|
end
|
271
271
|
|
272
272
|
# Still no good, back to final letter distance, maybe it'll save the day.
|
273
273
|
if finalsub < 0.0
|
274
|
-
|
274
|
+
return VISUAL_HEBREW_NAME
|
275
275
|
end
|
276
276
|
|
277
277
|
# (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
@@ -280,8 +280,8 @@ module CharDet
|
|
280
280
|
|
281
281
|
def get_state
|
282
282
|
# Remain active as long as any of the model probers are active.
|
283
|
-
if (@
|
284
|
-
|
283
|
+
if (@logicalProber.get_state() == ENotMe) and (@visualProber.get_state() == ENotMe)
|
284
|
+
return ENotMe
|
285
285
|
end
|
286
286
|
return EDetecting
|
287
287
|
end
|