rchardet 1.3.1 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/lib/rchardet.rb +1 -3
  2. data/lib/rchardet/big5freq.rb +2 -2
  3. data/lib/rchardet/big5prober.rb +2 -2
  4. data/lib/rchardet/chardistribution.rb +74 -69
  5. data/lib/rchardet/charsetgroupprober.rb +50 -52
  6. data/lib/rchardet/charsetprober.rb +2 -7
  7. data/lib/rchardet/codingstatemachine.rb +14 -13
  8. data/lib/rchardet/constants.rb +0 -0
  9. data/lib/rchardet/escprober.rb +34 -34
  10. data/lib/rchardet/escsm.rb +33 -32
  11. data/lib/rchardet/eucjpprober.rb +28 -28
  12. data/lib/rchardet/euckrfreq.rb +2 -1
  13. data/lib/rchardet/euckrprober.rb +2 -2
  14. data/lib/rchardet/euctwfreq.rb +2 -1
  15. data/lib/rchardet/euctwprober.rb +2 -2
  16. data/lib/rchardet/gb2312freq.rb +2 -2
  17. data/lib/rchardet/gb2312prober.rb +2 -2
  18. data/lib/rchardet/hebrewprober.rb +40 -40
  19. data/lib/rchardet/jisfreq.rb +2 -1
  20. data/lib/rchardet/jpcntx.rb +131 -130
  21. data/lib/rchardet/langbulgarianmodel.rb +6 -6
  22. data/lib/rchardet/langcyrillicmodel.rb +13 -13
  23. data/lib/rchardet/langgreekmodel.rb +5 -5
  24. data/lib/rchardet/langhebrewmodel.rb +3 -3
  25. data/lib/rchardet/langhungarianmodel.rb +5 -5
  26. data/lib/rchardet/langthaimodel.rb +3 -3
  27. data/lib/rchardet/latin1prober.rb +18 -18
  28. data/lib/rchardet/mbcharsetprober.rb +30 -30
  29. data/lib/rchardet/mbcsgroupprober.rb +9 -9
  30. data/lib/rchardet/mbcssm.rb +72 -72
  31. data/lib/rchardet/sbcharsetprober.rb +48 -50
  32. data/lib/rchardet/sbcsgroupprober.rb +16 -16
  33. data/lib/rchardet/sjisprober.rb +28 -28
  34. data/lib/rchardet/universaldetector.rb +92 -90
  35. data/lib/rchardet/utf8prober.rb +25 -25
  36. data/lib/rchardet/version.rb +3 -0
  37. metadata +30 -47
  38. data/COPYING +0 -504
  39. data/README +0 -12
@@ -31,36 +31,37 @@ module CharDet
31
31
  attr_accessor :active
32
32
 
33
33
  def initialize(sm)
34
- @_mModel = sm
35
- @_mCurrentBytePos = 0
36
- @_mCurrentCharLen = 0
34
+ @model = sm
35
+ @currentBytePos = 0
36
+ @currentCharLen = 0
37
37
  reset()
38
38
  end
39
39
 
40
40
  def reset
41
- @_mCurrentState = EStart
41
+ @currentState = EStart
42
42
  end
43
43
 
44
44
  def next_state(c)
45
45
  # for each byte we get its class
46
46
  # if it is first byte, we also get byte length
47
- byteCls = @_mModel['classTable'][c[0]]
48
- if @_mCurrentState == EStart
49
- @_mCurrentBytePos = 0
50
- @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
47
+ b = c.bytes.first
48
+ byteCls = @model['classTable'][b]
49
+ if @currentState == EStart
50
+ @currentBytePos = 0
51
+ @currentCharLen = @model['charLenTable'][byteCls]
51
52
  end
52
53
  # from byte's class and stateTable, we get its next state
53
- @_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
54
- @_mCurrentBytePos += 1
55
- return @_mCurrentState
54
+ @currentState = @model['stateTable'][@currentState * @model['classFactor'] + byteCls]
55
+ @currentBytePos += 1
56
+ return @currentState
56
57
  end
57
58
 
58
59
  def get_current_charlen
59
- return @_mCurrentCharLen
60
+ return @currentCharLen
60
61
  end
61
62
 
62
63
  def get_coding_state_machine
63
- return @_mModel['name']
64
+ return @model['name']
64
65
  end
65
66
  end
66
67
  end
File without changes
@@ -30,58 +30,58 @@ module CharDet
30
30
  class EscCharSetProber < CharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = [
34
- CodingStateMachine.new(HZSMModel),
35
- CodingStateMachine.new(ISO2022CNSMModel),
36
- CodingStateMachine.new(ISO2022JPSMModel),
37
- CodingStateMachine.new(ISO2022KRSMModel)
38
- ]
33
+ @codingSM = [
34
+ CodingStateMachine.new(HZSMModel),
35
+ CodingStateMachine.new(ISO2022CNSMModel),
36
+ CodingStateMachine.new(ISO2022JPSMModel),
37
+ CodingStateMachine.new(ISO2022KRSMModel)
38
+ ]
39
39
  reset()
40
40
  end
41
41
 
42
42
  def reset
43
43
  super()
44
- for codingSM in @_mCodingSM
45
- next if not codingSM
46
- codingSM.active = true
47
- codingSM.reset()
44
+ for codingSM in @codingSM
45
+ next if !codingSM
46
+ codingSM.active = true
47
+ codingSM.reset()
48
48
  end
49
- @_mActiveSM = @_mCodingSM.length
50
- @_mDetectedCharset = nil
49
+ @activeSM = @codingSM.length
50
+ @detectedCharset = nil
51
51
  end
52
52
 
53
53
  def get_charset_name
54
- return @_mDetectedCharset
54
+ return @detectedCharset
55
55
  end
56
56
 
57
57
  def get_confidence
58
- if @_mDetectedCharset
59
- return 0.99
58
+ if @detectedCharset
59
+ return 0.99
60
60
  else
61
- return 0.00
61
+ return 0.00
62
62
  end
63
63
  end
64
64
 
65
65
  def feed(aBuf)
66
66
  aBuf.each_byte do |b|
67
- c = b.chr
68
- for codingSM in @_mCodingSM
69
- next unless codingSM
70
- next unless codingSM.active
71
- codingState = codingSM.next_state(c)
72
- if codingState == EError
73
- codingSM.active = false
74
- @_mActiveSM -= 1
75
- if @_mActiveSM <= 0
76
- @_mState = ENotMe
77
- return get_state()
78
- end
79
- elsif codingState == EItsMe
80
- @_mState = EFoundIt
81
- @_mDetectedCharset = codingSM.get_coding_state_machine()
82
- return get_state()
83
- end
84
- end
67
+ c = b.chr
68
+ for codingSM in @codingSM
69
+ next unless codingSM
70
+ next unless codingSM.active
71
+ codingState = codingSM.next_state(c)
72
+ if codingState == EError
73
+ codingSM.active = false
74
+ @activeSM -= 1
75
+ if @activeSM <= 0
76
+ @state = ENotMe
77
+ return get_state()
78
+ end
79
+ elsif codingState == EItsMe
80
+ @state = EFoundIt
81
+ @detectedCharset = codingSM.get_coding_state_machine()
82
+ return get_state()
83
+ end
84
+ end
85
85
  end
86
86
 
87
87
  return get_state()
@@ -7,6 +7,7 @@
7
7
  # the Initial Developer. All Rights Reserved.
8
8
  #
9
9
  # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
10
11
  # Mark Pilgrim - port to Python
11
12
  #
12
13
  # This library is free software; you can redistribute it and/or
@@ -59,7 +60,7 @@ module CharDet
59
60
  1,1,1,1,1,1,1,1, # e8 - ef
60
61
  1,1,1,1,1,1,1,1, # f0 - f7
61
62
  1,1,1,1,1,1,1,1, # f8 - ff
62
- ]
63
+ ].freeze
63
64
 
64
65
  HZ_st = [
65
66
  EStart,EError, 3,EStart,EStart,EStart,EError,EError,# 00-07
@@ -68,16 +69,16 @@ module CharDet
68
69
  5,EError, 6,EError, 5, 5, 4,EError,# 18-1f
69
70
  4,EError, 4, 4, 4,EError, 4,EError,# 20-27
70
71
  4,EItsMe,EStart,EStart,EStart,EStart,EStart,EStart,# 28-2f
71
- ]
72
+ ].freeze
72
73
 
73
- HZCharLenTable = [0, 0, 0, 0, 0, 0]
74
+ HZCharLenTable = [0, 0, 0, 0, 0, 0].freeze
74
75
 
75
76
  HZSMModel = {'classTable' => HZ_cls,
76
- 'classFactor' => 6,
77
- 'stateTable' => HZ_st,
78
- 'charLenTable' => HZCharLenTable,
79
- 'name' => "HZ-GB-2312"
80
- }
77
+ 'classFactor' => 6,
78
+ 'stateTable' => HZ_st,
79
+ 'charLenTable' => HZCharLenTable,
80
+ 'name' => "HZ-GB-2312"
81
+ }.freeze
81
82
 
82
83
  ISO2022CN_cls = [
83
84
  2,0,0,0,0,0,0,0, # 00 - 07
@@ -112,7 +113,7 @@ ISO2022CN_cls = [
112
113
  2,2,2,2,2,2,2,2, # e8 - ef
113
114
  2,2,2,2,2,2,2,2, # f0 - f7
114
115
  2,2,2,2,2,2,2,2, # f8 - ff
115
- ]
116
+ ].freeze
116
117
 
117
118
  ISO2022CN_st = [
118
119
  EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
@@ -123,16 +124,16 @@ EError,EError,EError,EItsMe,EError,EError,EError,EError,# 20-27
123
124
  5, 6,EError,EError,EError,EError,EError,EError,# 28-2f
124
125
  EError,EError,EError,EItsMe,EError,EError,EError,EError,# 30-37
125
126
  EError,EError,EError,EError,EError,EItsMe,EError,EStart,# 38-3f
126
- ]
127
+ ].freeze
127
128
 
128
- ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0]
129
+ ISO2022CNCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
129
130
 
130
131
  ISO2022CNSMModel = {'classTable' => ISO2022CN_cls,
131
- 'classFactor' => 9,
132
- 'stateTable' => ISO2022CN_st,
133
- 'charLenTable' => ISO2022CNCharLenTable,
134
- 'name' => "ISO-2022-CN"
135
- }
132
+ 'classFactor' => 9,
133
+ 'stateTable' => ISO2022CN_st,
134
+ 'charLenTable' => ISO2022CNCharLenTable,
135
+ 'name' => "ISO-2022-CN"
136
+ }.freeze
136
137
 
137
138
  ISO2022JP_cls = [
138
139
  2,0,0,0,0,0,0,0, # 00 - 07
@@ -167,7 +168,7 @@ ISO2022JP_cls = [
167
168
  2,2,2,2,2,2,2,2, # e8 - ef
168
169
  2,2,2,2,2,2,2,2, # f0 - f7
169
170
  2,2,2,2,2,2,2,2, # f8 - ff
170
- ]
171
+ ].freeze
171
172
 
172
173
  ISO2022JP_st = [
173
174
  EStart, 3,EError,EStart,EStart,EStart,EStart,EStart,# 00-07
@@ -179,16 +180,16 @@ EError,EError,EError, 6,EItsMe,EError,EItsMe,EError,# 28-2f
179
180
  EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,# 30-37
180
181
  EError,EError,EError,EItsMe,EError,EError,EError,EError,# 38-3f
181
182
  EError,EError,EError,EError,EItsMe,EError,EStart,EStart,# 40-47
182
- ]
183
+ ].freeze
183
184
 
184
- ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
185
+ ISO2022JPCharLenTable = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0].freeze
185
186
 
186
187
  ISO2022JPSMModel = {'classTable' => ISO2022JP_cls,
187
- 'classFactor' => 10,
188
- 'stateTable' => ISO2022JP_st,
189
- 'charLenTable' => ISO2022JPCharLenTable,
190
- 'name' => "ISO-2022-JP"
191
- }
188
+ 'classFactor' => 10,
189
+ 'stateTable' => ISO2022JP_st,
190
+ 'charLenTable' => ISO2022JPCharLenTable,
191
+ 'name' => "ISO-2022-JP"
192
+ }.freeze
192
193
 
193
194
  ISO2022KR_cls = [
194
195
  2,0,0,0,0,0,0,0, # 00 - 07
@@ -223,7 +224,7 @@ ISO2022KR_cls = [
223
224
  2,2,2,2,2,2,2,2, # e8 - ef
224
225
  2,2,2,2,2,2,2,2, # f0 - f7
225
226
  2,2,2,2,2,2,2,2, # f8 - ff
226
- ]
227
+ ].freeze
227
228
 
228
229
  ISO2022KR_st = [
229
230
  EStart, 3,EError,EStart,EStart,EStart,EError,EError,# 00-07
@@ -231,14 +232,14 @@ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,# 08-0f
231
232
  EItsMe,EItsMe,EError,EError,EError, 4,EError,EError,# 10-17
232
233
  EError,EError,EError,EError, 5,EError,EError,EError,# 18-1f
233
234
  EError,EError,EError,EItsMe,EStart,EStart,EStart,EStart,# 20-27
234
- ]
235
+ ].freeze
235
236
 
236
- ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0]
237
+ ISO2022KRCharLenTable = [0, 0, 0, 0, 0, 0].freeze
237
238
 
238
239
  ISO2022KRSMModel = {'classTable' => ISO2022KR_cls,
239
- 'classFactor' => 6,
240
- 'stateTable' => ISO2022KR_st,
241
- 'charLenTable' => ISO2022KRCharLenTable,
242
- 'name' => "ISO-2022-KR"
243
- }
240
+ 'classFactor' => 6,
241
+ 'stateTable' => ISO2022KR_st,
242
+ 'charLenTable' => ISO2022KRCharLenTable,
243
+ 'name' => "ISO-2022-KR"
244
+ }.freeze
244
245
  end
@@ -30,15 +30,15 @@ module CharDet
30
30
  class EUCJPProber < MultiByteCharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
34
- @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new()
35
- @_mContextAnalyzer = EUCJPContextAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(EUCJPSMModel)
34
+ @distributionAnalyzer = EUCJPDistributionAnalysis.new()
35
+ @contextAnalyzer = EUCJPContextAnalysis.new()
36
36
  reset
37
37
  end
38
38
 
39
39
  def reset
40
40
  super()
41
- @_mContextAnalyzer.reset()
41
+ @contextAnalyzer.reset()
42
42
  end
43
43
 
44
44
  def get_charset_name
@@ -48,40 +48,40 @@ module CharDet
48
48
  def feed(aBuf)
49
49
  aLen = aBuf.length
50
50
  for i in (0...aLen)
51
- codingState = @_mCodingSM.next_state(aBuf[i..i])
52
- if codingState == EError
53
- $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
54
- @_mState = ENotMe
55
- break
56
- elsif codingState == EItsMe
57
- @_mState = EFoundIt
58
- break
59
- elsif codingState == EStart
60
- charLen = @_mCodingSM.get_current_charlen()
61
- if i == 0
62
- @_mLastChar[1] = aBuf[0..0]
63
- @_mContextAnalyzer.feed(@_mLastChar, charLen)
64
- @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
65
- else
66
- @_mContextAnalyzer.feed(aBuf[i-1...i+1], charLen)
67
- @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
68
- end
69
- end
51
+ codingState = @codingSM.next_state(aBuf[i, 1])
52
+ if codingState == EError
53
+ $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
54
+ @state = ENotMe
55
+ break
56
+ elsif codingState == EItsMe
57
+ @state = EFoundIt
58
+ break
59
+ elsif codingState == EStart
60
+ charLen = @codingSM.get_current_charlen()
61
+ if i == 0
62
+ @lastChar[1] = aBuf[0, 1]
63
+ @contextAnalyzer.feed(@lastChar, charLen)
64
+ @distributionAnalyzer.feed(@lastChar, charLen)
65
+ else
66
+ @contextAnalyzer.feed(aBuf[i-1, 2], charLen)
67
+ @distributionAnalyzer.feed(aBuf[i-1, 2], charLen)
68
+ end
69
+ end
70
70
  end
71
71
 
72
- @_mLastChar[0] = aBuf[aLen-1..aLen-1]
72
+ @lastChar[0] = aBuf[aLen-1, 1]
73
73
 
74
74
  if get_state() == EDetecting
75
- if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
76
- @_mState = EFoundIt
77
- end
75
+ if @contextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
76
+ @state = EFoundIt
77
+ end
78
78
  end
79
79
 
80
80
  return get_state()
81
81
  end
82
82
 
83
83
  def get_confidence
84
- l = [@_mContextAnalyzer.get_confidence,@_mDistributionAnalyzer.get_confidence]
84
+ l = [@contextAnalyzer.get_confidence,@distributionAnalyzer.get_confidence]
85
85
  return l.max
86
86
  end
87
87
  end
@@ -7,6 +7,7 @@
7
7
  # the Initial Developer. All Rights Reserved.
8
8
  #
9
9
  # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
10
11
  # Mark Pilgrim - port to Python
11
12
  #
12
13
  # This library is free software; you can redistribute it and/or
@@ -592,5 +593,5 @@ EUCKRCharToFreqOrder = [
592
593
  8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
593
594
  8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
594
595
  8736,8737,8738,8739,8740,8741
595
- ]
596
+ ].freeze
596
597
  end
@@ -30,8 +30,8 @@ module CharDet
30
30
  class EUCKRProber < MultiByteCharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = CodingStateMachine.new(EUCKRSMModel)
34
- @_mDistributionAnalyzer = EUCKRDistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(EUCKRSMModel)
34
+ @distributionAnalyzer = EUCKRDistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -426,5 +426,6 @@ EUCTWCharToFreqOrder = [
426
426
  8678,8679,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689,8690,8691,8692,8693, # 8694
427
427
  8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, # 8710
428
428
  8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, # 8726
429
- 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741] # 8742
429
+ 8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741 # 8742
430
+ ].freeze
430
431
  end
@@ -30,8 +30,8 @@ module CharDet
30
30
  class EUCTWProber < MultiByteCharSetProber
31
31
  def initialize
32
32
  super()
33
- @_mCodingSM = CodingStateMachine.new(EUCTWSMModel)
34
- @_mDistributionAnalyzer = EUCTWDistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(EUCTWSMModel)
34
+ @distributionAnalyzer = EUCTWDistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -469,6 +469,6 @@ GB2312CharToFreqOrder = [
469
469
  6271,3875,5768,6094,5034,5506,4376,5769,6761,2120,6476,5253,5770,6762,5771,5970,
470
470
  3990,5971,5557,5558,5772,6477,6095,2787,4641,5972,5121,6096,6097,6272,6763,3703,
471
471
  5867,5507,6273,4206,6274,4789,6098,6764,3619,3646,3833,3804,2394,3788,4936,3978,
472
- 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767]
473
-
472
+ 4866,4899,6099,6100,5559,6478,6765,3599,5868,6101,5869,5870,6275,6766,4527,6767
473
+ ].freeze
474
474
  end
@@ -30,8 +30,8 @@ module CharDet
30
30
  class GB2312Prober < MultiByteCharSetProber
31
31
  def initialize
32
32
  super
33
- @_mCodingSM = CodingStateMachine.new(GB2312SMModel)
34
- @_mDistributionAnalyzer = GB2312DistributionAnalysis.new()
33
+ @codingSM = CodingStateMachine.new(GB2312SMModel)
34
+ @distributionAnalyzer = GB2312DistributionAnalysis.new()
35
35
  reset()
36
36
  end
37
37
 
@@ -150,25 +150,25 @@ module CharDet
150
150
  class HebrewProber < CharSetProber
151
151
  def initialize
152
152
  super()
153
- @_mLogicalProber = nil
154
- @_mVisualProber = nil
153
+ @logicalProber = nil
154
+ @visualProber = nil
155
155
  reset()
156
156
  end
157
157
 
158
158
  def reset
159
- @_mFinalCharLogicalScore = 0
160
- @_mFinalCharVisualScore = 0
159
+ @finalCharLogicalScore = 0
160
+ @finalCharVisualScore = 0
161
161
  # The two last characters seen in the previous buffer,
162
162
  # mPrev and mBeforePrev are initialized to space in order to simulate a word
163
163
  # delimiter at the beginning of the data
164
- @_mPrev = ' '
165
- @_mBeforePrev = ' '
164
+ @prev = ' '
165
+ @beforePrev = ' '
166
166
  # These probers are owned by the group prober.
167
167
  end
168
168
 
169
169
  def set_model_probers(logicalProber, visualProber)
170
- @_mLogicalProber = logicalProber
171
- @_mVisualProber = visualProber
170
+ @logicalProber = logicalProber
171
+ @visualProber = visualProber
172
172
  end
173
173
 
174
174
  def is_final(c)
@@ -215,34 +215,34 @@ module CharDet
215
215
  # so the word boundary detection works properly. [MAP]
216
216
 
217
217
  if get_state() == ENotMe
218
- # Both model probers say it's not them. No reason to continue.
219
- return ENotMe
218
+ # Both model probers say it's not them. No reason to continue.
219
+ return ENotMe
220
220
  end
221
221
 
222
222
  aBuf = filter_high_bit_only(aBuf)
223
223
 
224
224
  for cur in aBuf.split(' ')
225
- if cur == ' '
226
- # We stand on a space - a word just ended
227
- if @_mBeforePrev != ' '
228
- # next-to-last char was not a space so self._mPrev is not a 1 letter word
229
- if is_final(@_mPrev)
230
- # case (1) [-2:not space][-1:final letter][cur:space]
231
- @_mFinalCharLogicalScore += 1
232
- elsif is_non_final(@_mPrev)
233
- # case (2) [-2:not space][-1:Non-Final letter][cur:space]
234
- @_mFinalCharVisualScore += 1
235
- end
236
- end
237
- else
238
- # Not standing on a space
239
- if (@_mBeforePrev == ' ') and (is_final(@_mPrev)) and (cur != ' ')
240
- # case (3) [-2:space][-1:final letter][cur:not space]
241
- @_mFinalCharVisualScore += 1
242
- end
243
- end
244
- @_mBeforePrev = @_mPrev
245
- @_mPrev = cur
225
+ if cur == ' '
226
+ # We stand on a space - a word just ended
227
+ if @beforePrev != ' '
228
+ # next-to-last char was not a space so self._mPrev is not a 1 letter word
229
+ if is_final(@prev)
230
+ # case (1) [-2:not space][-1:final letter][cur:space]
231
+ @finalCharLogicalScore += 1
232
+ elsif is_non_final(@prev)
233
+ # case (2) [-2:not space][-1:Non-Final letter][cur:space]
234
+ @finalCharVisualScore += 1
235
+ end
236
+ end
237
+ else
238
+ # Not standing on a space
239
+ if (@beforePrev == ' ') and (is_final(@prev)) and (cur != ' ')
240
+ # case (3) [-2:space][-1:final letter][cur:not space]
241
+ @finalCharVisualScore += 1
242
+ end
243
+ end
244
+ @beforePrev = @prev
245
+ @prev = cur
246
246
  end
247
247
 
248
248
  # Forever detecting, till the end or until both model probers return eNotMe (handled above)
@@ -252,26 +252,26 @@ module CharDet
252
252
  def get_charset_name
253
253
  # Make the decision: is it Logical or Visual?
254
254
  # If the final letter score distance is dominant enough, rely on it.
255
- finalsub = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
255
+ finalsub = @finalCharLogicalScore - @finalCharVisualScore
256
256
  if finalsub >= MIN_FINAL_CHAR_DISTANCE
257
- return LOGICAL_HEBREW_NAME
257
+ return LOGICAL_HEBREW_NAME
258
258
  end
259
259
  if finalsub <= -MIN_FINAL_CHAR_DISTANCE
260
- return VISUAL_HEBREW_NAME
260
+ return VISUAL_HEBREW_NAME
261
261
  end
262
262
 
263
263
  # It's not dominant enough, try to rely on the model scores instead.
264
- modelsub = @_mLogicalProber.get_confidence() - @_mVisualProber.get_confidence()
264
+ modelsub = @logicalProber.get_confidence() - @visualProber.get_confidence()
265
265
  if modelsub > MIN_MODEL_DISTANCE
266
- return LOGICAL_HEBREW_NAME
266
+ return LOGICAL_HEBREW_NAME
267
267
  end
268
268
  if modelsub < -MIN_MODEL_DISTANCE
269
- return VISUAL_HEBREW_NAME
269
+ return VISUAL_HEBREW_NAME
270
270
  end
271
271
 
272
272
  # Still no good, back to final letter distance, maybe it'll save the day.
273
273
  if finalsub < 0.0
274
- return VISUAL_HEBREW_NAME
274
+ return VISUAL_HEBREW_NAME
275
275
  end
276
276
 
277
277
  # (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
@@ -280,8 +280,8 @@ module CharDet
280
280
 
281
281
  def get_state
282
282
  # Remain active as long as any of the model probers are active.
283
- if (@_mLogicalProber.get_state() == ENotMe) and (@_mVisualProber.get_state() == ENotMe)
284
- return ENotMe
283
+ if (@logicalProber.get_state() == ENotMe) and (@visualProber.get_state() == ENotMe)
284
+ return ENotMe
285
285
  end
286
286
  return EDetecting
287
287
  end