rchardet 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/lib/rchardet.rb +1 -3
  2. data/lib/rchardet/big5freq.rb +2 -2
  3. data/lib/rchardet/big5prober.rb +2 -2
  4. data/lib/rchardet/chardistribution.rb +74 -69
  5. data/lib/rchardet/charsetgroupprober.rb +50 -52
  6. data/lib/rchardet/charsetprober.rb +2 -7
  7. data/lib/rchardet/codingstatemachine.rb +14 -13
  8. data/lib/rchardet/constants.rb +0 -0
  9. data/lib/rchardet/escprober.rb +34 -34
  10. data/lib/rchardet/escsm.rb +33 -32
  11. data/lib/rchardet/eucjpprober.rb +28 -28
  12. data/lib/rchardet/euckrfreq.rb +2 -1
  13. data/lib/rchardet/euckrprober.rb +2 -2
  14. data/lib/rchardet/euctwfreq.rb +2 -1
  15. data/lib/rchardet/euctwprober.rb +2 -2
  16. data/lib/rchardet/gb2312freq.rb +2 -2
  17. data/lib/rchardet/gb2312prober.rb +2 -2
  18. data/lib/rchardet/hebrewprober.rb +40 -40
  19. data/lib/rchardet/jisfreq.rb +2 -1
  20. data/lib/rchardet/jpcntx.rb +131 -130
  21. data/lib/rchardet/langbulgarianmodel.rb +6 -6
  22. data/lib/rchardet/langcyrillicmodel.rb +13 -13
  23. data/lib/rchardet/langgreekmodel.rb +5 -5
  24. data/lib/rchardet/langhebrewmodel.rb +3 -3
  25. data/lib/rchardet/langhungarianmodel.rb +5 -5
  26. data/lib/rchardet/langthaimodel.rb +3 -3
  27. data/lib/rchardet/latin1prober.rb +18 -18
  28. data/lib/rchardet/mbcharsetprober.rb +30 -30
  29. data/lib/rchardet/mbcsgroupprober.rb +9 -9
  30. data/lib/rchardet/mbcssm.rb +72 -72
  31. data/lib/rchardet/sbcharsetprober.rb +48 -50
  32. data/lib/rchardet/sbcsgroupprober.rb +16 -16
  33. data/lib/rchardet/sjisprober.rb +28 -28
  34. data/lib/rchardet/universaldetector.rb +92 -90
  35. data/lib/rchardet/utf8prober.rb +25 -25
  36. data/lib/rchardet/version.rb +3 -0
  37. metadata +30 -47
  38. data/COPYING +0 -504
  39. data/README +0 -12
@@ -31,36 +31,37 @@ module CharDet
31
31
  attr_accessor :active
32
32
 
33
33
  def initialize(sm)
34
- @_mModel = sm
35
- @_mCurrentBytePos = 0
36
- @_mCurrentCharLen = 0
34
+ @model = sm
35
+ @currentBytePos = 0
36
+ @currentCharLen = 0
37
37
  reset()
38
38
  end
39
39
 
40
40
  def reset
41
- @_mCurrentState = EStart
41
+ @currentState = EStart
42
42
  end
43
43
 
44
44
  def next_state(c)
45
45
  # for each byte we get its class
46
46
  # if it is first byte, we also get byte length
47
- byteCls = @_mModel['classTable'][c[0]]
48
- if @_mCurrentState == EStart
49
- @_mCurrentBytePos = 0
50
- @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
47
+ b = c.bytes.first
48
+ byteCls = @model['classTable'][b]
49
+ if @currentState == EStart
50
+ @currentBytePos = 0
51
+ @currentCharLen = @model['charLenTable'][byteCls]
51
52
  end
52
53
  # from byte's class and stateTable, we get its next state
53
- @_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
54
- @_mCurrentBytePos += 1
55
- return @_mCurrentState
54
+ @currentState = @model['stateTable'][@currentState * @model['classFactor'] + byteCls]
55
+ @currentBytePos += 1
56
+ return @currentState
56
57
  end
57
58
 
58
59
  def get_current_charlen
59
- return @_mCurrentCharLen
60
+ return @currentCharLen
60
61
  end
61
62
 
62
63
  def get_coding_state_machine
63
- return @_mModel['name']
64
+ return @model['name']
64
65
  end
65
66
  end
66
67
  end
File without changes
@@ -30,58 +30,58 @@ module CharDet
30
30
  class EscCharSetProber < CharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = [
34
- CodingStateMachine.new(HZSMModel),
35
- CodingStateMachine.new(ISO2022CNSMModel),
36
- CodingStateMachine.new(ISO2022JPSMModel),
37
- CodingStateMachine.new(ISO2022KRSMModel)
38
- ]
33
+ @codingSM = [
34
+ CodingStateMachine.new(HZSMModel),
35
+ CodingStateMachine.new(ISO2022CNSMModel),
36
+ CodingStateMachine.new(ISO2022JPSMModel),
37
+ CodingStateMachine.new(ISO2022KRSMModel)
38
+ ]
39
39
  reset()
40
40
  end
41
41
 
42
42
  def reset
43
43
  super()
44
- for codingSM in @_mCodingSM
45
- next if not codingSM
46
- codingSM.active = true
47
- codingSM.reset()
44
+ for codingSM in @codingSM
45
+ next if !codingSM
46
+ codingSM.active = true
47
+ codingSM.reset()
48
48
  end
49
- @_mActiveSM = @_mCodingSM.length
50
- @_mDetectedCharset = nil
49
+ @activeSM = @codingSM.length
50
+ @detectedCharset = nil
51
51
  end
52
52
 
53
53
  def get_charset_name
54
- return @_mDetectedCharset
54
+ return @detectedCharset
55
55
  end
56
56
 
57
57
  def get_confidence
58
- if @_mDetectedCharset
59
- return 0.99
58
+ if @detectedCharset
59
+ return 0.99
60
60
  else
61
- return 0.00
61
+ return 0.00
62
62
  end
63
63
  end
64
64
 
65
65
  def feed(aBuf)
66
66
  aBuf.each_byte do |b|
67
- c = b.chr
68
- for codingSM in @_mCodingSM
69
- next unless codingSM
70
- next unless codingSM.active
71
- codingState = codingSM.next_state(c)
72
- if codingState == EError
73
- codingSM.active = false
74
- @_mActiveSM -= 1
75
- if @_mActiveSM <= 0
76
- @_mState = ENotMe
77
- return get_state()
78
- end
79
- elsif codingState == EItsMe
80
- @_mState = EFoundIt
81
- @_mDetectedCharset = codingSM.get_coding_state_machine()
82
- return get_state()
83
- end
84
- end
67
+ c = b.chr
68
+ for codingSM in @codingSM
69
+ next unless codingSM
70
+ next unless codingSM.active
71
+ codingState = codingSM.next_state(c)
72
+ if codingState == EError
73
+ codingSM.active = false
74
+ @activeSM -= 1
75
+ if @activeSM <= 0
76
+ @state = ENotMe
77
+ return get_state()
78
+ end
79
+ elsif codingState == EItsMe
80
+ @state = EFoundIt
81
+ @detectedCharset = codingSM.get_coding_state_machine()
82
+ return get_state()
83
+ end
84
+ end
85
85
  end
86
86
 
87
87
  return get_state()
@@ -7,6 +7,7 @@
7
7
  # the Initial Developer. All Rights Reserved.
8
8
  #
9
9
  # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
10
11
  # Mark Pilgrim - port to Python
11
12
  #
12
13
  # This library is free software; you can redistribute it and/or
@@ -59,7 +60,7 @@ module CharDet
59
60
  1,1,1,1,1,1,1,1, # e8 - ef
60
61
  1,1,1,1,1,1,1,1, # f0 - f7
61
62
  1,1,1,1,1,1,1,1, # f8 - ff
62
- ]
63
+ ].freeze
63
64
 
64
65
  HZ_st = [
65
66
  EStart,EError, 3,EStart,EStart,EStart,EError,EError,# 00-07
@@ -68,16 +69,16 @@ module CharDet
68
69
  5,EError, 6,EError, 5, 5, 4,EError,# 18-1f
69
70
  4,EError, 4, 4, 4,EError, 4,EError,# 20-27
70
71
  4,EItsMe,EStart,EStart,EStart,EStart,EStart,EStart,# 28-2f
71
- ]
72
+ ].freeze
72
73
 
73
- HZCharLenTable = [0, 0, 0, 0, 0, 0]
74
+ HZCharLenTable = [0, 0, 0, 0, 0, 0].freeze
74
75
 
75
76
  HZSMModel = {'classTable' => HZ_cls,
76
- 'classFactor' => 6,
77
- 'stateTable' => HZ_st,
78
- 'charLenTable' => HZCharLenTable,
79
- 'name' => "HZ-GB-2312"
80
- }
77
+ 'classFactor' => 6,
78
+ 'stateTable' => HZ_st,
79
+ 'charLenTable' => HZCharLenTable,
80
+ 'name' => "HZ-GB-2312"
81
+ }.freeze
81
82
 
82
83
  ISO2022CN_cls = [
83
84
  2,0,0,0,0,0,0,0, # 00 - 07
@@ -112,7 +113,7 @@ ISO2022CN_cls = [
112
113
  2,2,2,2,2,2,2,2, # e8 - ef
113
114
  2,2,2,2,2,2,2,2, # f0 - f7
114
115
  2,2,2,2,2,2,2,2, # f8 - ff
115
- ]
116
+ ].freeze
116
117
 
117
118
  ISO2022CN_st = [
118
119
  EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
@@ -123,16 +124,16 @@ EError,EError,EError,EItsMe,EError,EError,EError,EError,# 20-27
123
124
  5, 6,EError,EError,EError,EError,EError,EError,# 28-2f
124
125
  EError,EError,EError,EItsMe,EError,EError,EError,EError,# 30-37
125
126
  EError,EError,EError,EError,EError,EItsMe,EError,EStart,# 38-3f
126
- ]
127
+ ].freeze
127
128
 
128
- ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0]
129
+ ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
129
130
 
130
131
  ISO2022CNSMModel = {'classTable' => ISO2022CN_cls,
131
- 'classFactor' => 9,
132
- 'stateTable' => ISO2022CN_st,
133
- 'charLenTable' => ISO2022CNCharLenTable,
134
- 'name' => "ISO-2022-CN"
135
- }
132
+ 'classFactor' => 9,
133
+ 'stateTable' => ISO2022CN_st,
134
+ 'charLenTable' => ISO2022CNCharLenTable,
135
+ 'name' => "ISO-2022-CN"
136
+ }.freeze
136
137
 
137
138
  ISO2022JP_cls = [
138
139
  2,0,0,0,0,0,0,0, # 00 - 07
@@ -167,7 +168,7 @@ ISO2022JP_cls = [
167
168
  2,2,2,2,2,2,2,2, # e8 - ef
168
169
  2,2,2,2,2,2,2,2, # f0 - f7
169
170
  2,2,2,2,2,2,2,2, # f8 - ff
170
- ]
171
+ ].freeze
171
172
 
172
173
  ISO2022JP_st = [
173
174
  EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
@@ -179,16 +180,16 @@ EError,EError,EError, 6,EItsMe,EError,EItsMe,EError,# 28-2f
179
180
  EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,# 30-37
180
181
  EError,EError,EError,EItsMe,EError,EError,EError,EError,# 38-3f
181
182
  EError,EError,EError,EError,EItsMe,EError,EStart,EStart,# 40-47
182
- ]
183
+ ].freeze
183
184
 
184
- ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
185
+ ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
185
186
 
186
187
  ISO2022JPSMModel = {'classTable' => ISO2022JP_cls,
187
- 'classFactor' => 10,
188
- 'stateTable' => ISO2022JP_st,
189
- 'charLenTable' => ISO2022JPCharLenTable,
190
- 'name' => "ISO-2022-JP"
191
- }
188
+ 'classFactor' => 10,
189
+ 'stateTable' => ISO2022JP_st,
190
+ 'charLenTable' => ISO2022JPCharLenTable,
191
+ 'name' => "ISO-2022-JP"
192
+ }.freeze
192
193
 
193
194
  ISO2022KR_cls = [
194
195
  2,0,0,0,0,0,0,0, # 00 - 07
@@ -223,7 +224,7 @@ ISO2022KR_cls = [
223
224
  2,2,2,2,2,2,2,2, # e8 - ef
224
225
  2,2,2,2,2,2,2,2, # f0 - f7
225
226
  2,2,2,2,2,2,2,2, # f8 - ff
226
- ]
227
+ ].freeze
227
228
 
228
229
  ISO2022KR_st = [
229
230
  EStart, 3,EError,EStart,EStart,EStart,EError,EError,# 00-07
@@ -231,14 +232,14 @@ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
231
232
  EItsMe,EItsMe,EError,EError,EError, 4,EError,EError,# 10-17
232
233
  EError,EError,EError,EError, 5,EError,EError,EError,# 18-1f
233
234
  EError,EError,EError,EItsMe,EStart,EStart,EStart,EStart,# 20-27
234
- ]
235
+ ].freeze
235
236
 
236
- ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0]
237
+ ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0].freeze
237
238
 
238
239
  ISO2022KRSMModel = {'classTable' => ISO2022KR_cls,
239
- 'classFactor' => 6,
240
- 'stateTable' => ISO2022KR_st,
241
- 'charLenTable' => ISO2022KRCharLenTable,
242
- 'name' => "ISO-2022-KR"
243
- }
240
+ 'classFactor' => 6,
241
+ 'stateTable' => ISO2022KR_st,
242
+ 'charLenTable' => ISO2022KRCharLenTable,
243
+ 'name' => "ISO-2022-KR"
244
+ }.freeze
244
245
  end
@@ -30,15 +30,15 @@ module CharDet
30
30
  class EUCJPProber < MultiByteCharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
34
- @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new()
35
- @_mContextAnalyzer = EUCJPContextAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(EUCJPSMModel)
34
+ @distributionAnalyzer = EUCJPDistributionAnalysis.new()
35
+ @contextAnalyzer = EUCJPContextAnalysis.new()
36
36
  reset
37
37
  end
38
38
 
39
39
  def reset
40
40
  super()
41
- @_mContextAnalyzer.reset()
41
+ @contextAnalyzer.reset()
42
42
  end
43
43
 
44
44
  def get_charset_name
@@ -48,40 +48,40 @@ module CharDet
48
48
  def feed(aBuf)
49
49
  aLen = aBuf.length
50
50
  for i in (0...aLen)
51
- codingState = @_mCodingSM.next_state(aBuf[i..i])
52
- if codingState == EError
53
- $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
54
- @_mState = ENotMe
55
- break
56
- elsif codingState == EItsMe
57
- @_mState = EFoundIt
58
- break
59
- elsif codingState == EStart
60
- charLen = @_mCodingSM.get_current_charlen()
61
- if i == 0
62
- @_mLastChar[1] = aBuf[0..0]
63
- @_mContextAnalyzer.feed(@_mLastChar, charLen)
64
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
65
- else
66
- @_mContextAnalyzer.feed(aBuf[i-1...i+1], charLen)
67
- @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
68
- end
69
- end
51
+ codingState = @codingSM.next_state(aBuf[i, 1])
52
+ if codingState == EError
53
+ $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
54
+ @state = ENotMe
55
+ break
56
+ elsif codingState == EItsMe
57
+ @state = EFoundIt
58
+ break
59
+ elsif codingState == EStart
60
+ charLen = @codingSM.get_current_charlen()
61
+ if i == 0
62
+ @lastChar[1] = aBuf[0, 1]
63
+ @contextAnalyzer.feed(@lastChar, charLen)
64
+ @distributionAnalyzer.feed(@lastChar, charLen)
65
+ else
66
+ @contextAnalyzer.feed(aBuf[i-1, 2], charLen)
67
+ @distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
68
+ end
69
+ end
70
70
  end
71
71
 
72
- @_mLastChar[0] = aBuf[aLen-1..aLen-1]
72
+ @lastChar[0] = aBuf[aLen-1, 1]
73
73
 
74
74
  if get_state() == EDetecting
75
- if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
76
- @_mState = EFoundIt
77
- end
75
+ if @contextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
76
+ @state = EFoundIt
77
+ end
78
78
  end
79
79
 
80
80
  return get_state()
81
81
  end
82
82
 
83
83
  def get_confidence
84
- l = [@_mContextAnalyzer.get_confidence,@_mDistributionAnalyzer.get_confidence]
84
+ l = [@contextAnalyzer.get_confidence,@distributionAnalyzer.get_confidence]
85
85
  return l.max
86
86
  end
87
87
  end
@@ -7,6 +7,7 @@
7
7
  # the Initial Developer. All Rights Reserved.
8
8
  #
9
9
  # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
10
11
  # Mark Pilgrim - port to Python
11
12
  #
12
13
  # This library is free software; you can redistribute it and/or
@@ -592,5 +593,5 @@ EUCKRCharToFreqOrder = [
592
593
  8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
593
594
  8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
594
595
  8736,8737,8738,8739,8740,8741
595
- ]
596
+ ].freeze
596
597
  end
@@ -30,8 +30,8 @@ module CharDet
30
30
  class EUCKRProber < MultiByteCharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = CodingStateMachine.new(EUCKRSMModel)
34
- @_mDistributionAnalyzer = EUCKRDistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(EUCKRSMModel)
34
+ @distributionAnalyzer = EUCKRDistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -426,5 +426,6 @@ EUCTWCharToFreqOrder = [
426
426
  8678,8679,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689,8690,8691,8692,8693, # 8694
427
427
  8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
428
428
  8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
429
- 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741] # 8742
429
+ 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741 # 8742
430
+ ].freeze
430
431
  end
@@ -30,8 +30,8 @@ module CharDet
30
30
  class EUCTWProber < MultiByteCharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = CodingStateMachine.new(EUCTWSMModel)
34
- @_mDistributionAnalyzer = EUCTWDistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(EUCTWSMModel)
34
+ @distributionAnalyzer = EUCTWDistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -469,6 +469,6 @@ GB2312CharToFreqOrder = [
469
469
  6271,3875,5768,6094,5034,5506,4376,5769,6761,2120,6476,5253,5770,6762,5771,5970,
470
470
  3990,5971,5557,5558,5772,6477,6095,2787,4641,5972,5121,6096,6097,6272,6763,3703,
471
471
  5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
472
- 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767]
473
-
472
+ 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767
473
+ ].freeze
474
474
  end
@@ -30,8 +30,8 @@ module CharDet
30
30
  class GB2312Prober < MultiByteCharSetProber
31
31
  def initialize
32
32
  super
33
- @_mCodingSM = CodingStateMachine.new(GB2312SMModel)
34
- @_mDistributionAnalyzer = GB2312DistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(GB2312SMModel)
34
+ @distributionAnalyzer = GB2312DistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -150,25 +150,25 @@ module CharDet
150
150
  class HebrewProber < CharSetProber
151
151
  def initialize
152
152
  super()
153
- @_mLogicalProber = nil
154
- @_mVisualProber = nil
153
+ @logicalProber = nil
154
+ @visualProber = nil
155
155
  reset()
156
156
  end
157
157
 
158
158
  def reset
159
- @_mFinalCharLogicalScore = 0
160
- @_mFinalCharVisualScore = 0
159
+ @finalCharLogicalScore = 0
160
+ @finalCharVisualScore = 0
161
161
  # The two last characters seen in the previous buffer,
162
162
  # mPrev and mBeforePrev are initialized to space in order to simulate a word
163
163
  # delimiter at the beginning of the data
164
- @_mPrev = ' '
165
- @_mBeforePrev = ' '
164
+ @prev = ' '
165
+ @beforePrev = ' '
166
166
  # These probers are owned by the group prober.
167
167
  end
168
168
 
169
169
  def set_model_probers(logicalProber, visualProber)
170
- @_mLogicalProber = logicalProber
171
- @_mVisualProber = visualProber
170
+ @logicalProber = logicalProber
171
+ @visualProber = visualProber
172
172
  end
173
173
 
174
174
  def is_final(c)
@@ -215,34 +215,34 @@ module CharDet
215
215
  # so the word boundary detection works properly. [MAP]
216
216
 
217
217
  if get_state() == ENotMe
218
- # Both model probers say it's not them. No reason to continue.
219
- return ENotMe
218
+ # Both model probers say it's not them. No reason to continue.
219
+ return ENotMe
220
220
  end
221
221
 
222
222
  aBuf = filter_high_bit_only(aBuf)
223
223
 
224
224
  for cur in aBuf.split(' ')
225
- if cur == ' '
226
- # We stand on a space - a word just ended
227
- if @_mBeforePrev != ' '
228
- # next-to-last char was not a space so self._mPrev is not a 1 letter word
229
- if is_final(@_mPrev)
230
- # case (1) [-2:not space][-1:final letter][cur:space]
231
- @_mFinalCharLogicalScore += 1
232
- elsif is_non_final(@_mPrev)
233
- # case (2) [-2:not space][-1:Non-Final letter][cur:space]
234
- @_mFinalCharVisualScore += 1
235
- end
236
- end
237
- else
238
- # Not standing on a space
239
- if (@_mBeforePrev == ' ') and (is_final(@_mPrev)) and (cur != ' ')
240
- # case (3) [-2:space][-1:final letter][cur:not space]
241
- @_mFinalCharVisualScore += 1
242
- end
243
- end
244
- @_mBeforePrev = @_mPrev
245
- @_mPrev = cur
225
+ if cur == ' '
226
+ # We stand on a space - a word just ended
227
+ if @beforePrev != ' '
228
+ # next-to-last char was not a space so self._mPrev is not a 1 letter word
229
+ if is_final(@prev)
230
+ # case (1) [-2:not space][-1:final letter][cur:space]
231
+ @finalCharLogicalScore += 1
232
+ elsif is_non_final(@prev)
233
+ # case (2) [-2:not space][-1:Non-Final letter][cur:space]
234
+ @finalCharVisualScore += 1
235
+ end
236
+ end
237
+ else
238
+ # Not standing on a space
239
+ if (@beforePrev == ' ') and (is_final(@prev)) and (cur != ' ')
240
+ # case (3) [-2:space][-1:final letter][cur:not space]
241
+ @finalCharVisualScore += 1
242
+ end
243
+ end
244
+ @beforePrev = @prev
245
+ @prev = cur
246
246
  end
247
247
 
248
248
  # Forever detecting, till the end or until both model probers return eNotMe (handled above)
@@ -252,26 +252,26 @@ module CharDet
252
252
  def get_charset_name
253
253
  # Make the decision: is it Logical or Visual?
254
254
  # If the final letter score distance is dominant enough, rely on it.
255
- finalsub = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
255
+ finalsub = @finalCharLogicalScore - @finalCharVisualScore
256
256
  if finalsub >= MIN_FINAL_CHAR_DISTANCE
257
- return LOGICAL_HEBREW_NAME
257
+ return LOGICAL_HEBREW_NAME
258
258
  end
259
259
  if finalsub <= -MIN_FINAL_CHAR_DISTANCE
260
- return VISUAL_HEBREW_NAME
260
+ return VISUAL_HEBREW_NAME
261
261
  end
262
262
 
263
263
  # It's not dominant enough, try to rely on the model scores instead.
264
- modelsub = @_mLogicalProber.get_confidence() - @_mVisualProber.get_confidence()
264
+ modelsub = @logicalProber.get_confidence() - @visualProber.get_confidence()
265
265
  if modelsub > MIN_MODEL_DISTANCE
266
- return LOGICAL_HEBREW_NAME
266
+ return LOGICAL_HEBREW_NAME
267
267
  end
268
268
  if modelsub < -MIN_MODEL_DISTANCE
269
- return VISUAL_HEBREW_NAME
269
+ return VISUAL_HEBREW_NAME
270
270
  end
271
271
 
272
272
  # Still no good, back to final letter distance, maybe it'll save the day.
273
273
  if finalsub < 0.0
274
- return VISUAL_HEBREW_NAME
274
+ return VISUAL_HEBREW_NAME
275
275
  end
276
276
 
277
277
  # (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
@@ -280,8 +280,8 @@ module CharDet
280
280
 
281
281
  def get_state
282
282
  # Remain active as long as any of the model probers are active.
283
- if (@_mLogicalProber.get_state() == ENotMe) and (@_mVisualProber.get_state() == ENotMe)
284
- return ENotMe
283
+ if (@logicalProber.get_state() == ENotMe) and (@visualProber.get_state() == ENotMe)
284
+ return ENotMe
285
285
  end
286
286
  return EDetecting
287
287
  end