chardet2 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +504 -0
- data/README.markdown +29 -0
- data/lib/Big5Freq.rb +913 -0
- data/lib/Big5Prober.rb +48 -0
- data/lib/CharDistributionAnalysis.rb +245 -0
- data/lib/CharSetGroupProber.rb +114 -0
- data/lib/CharSetProber.rb +70 -0
- data/lib/CodingStateMachine.rb +74 -0
- data/lib/ESCSM.rb +242 -0
- data/lib/EUCJPProber.rb +97 -0
- data/lib/EUCKRFreq.rb +600 -0
- data/lib/EUCKRProber.rb +48 -0
- data/lib/EUCTWFreq.rb +432 -0
- data/lib/EUCTWProber.rb +48 -0
- data/lib/EscCharSetProber.rb +94 -0
- data/lib/GB2312Freq.rb +475 -0
- data/lib/GB2312Prober.rb +48 -0
- data/lib/HebrewProber.rb +292 -0
- data/lib/JISFreq.rb +573 -0
- data/lib/JapaneseContextAnalysis.rb +234 -0
- data/lib/LangBulgarianModel.rb +231 -0
- data/lib/LangCyrillicModel.rb +332 -0
- data/lib/LangGreekModel.rb +229 -0
- data/lib/LangHebrewModel.rb +202 -0
- data/lib/LangHungarianModel.rb +228 -0
- data/lib/LangThaiModel.rb +203 -0
- data/lib/Latin1Prober.rb +155 -0
- data/lib/MBCSGroupProber.rb +57 -0
- data/lib/MBCSSM.rb +513 -0
- data/lib/MultiByteCharSetProber.rb +94 -0
- data/lib/SBCSGroupProber.rb +71 -0
- data/lib/SJISProber.rb +99 -0
- data/lib/SingleByteCharSetProber.rb +131 -0
- data/lib/UTF8Prober.rb +91 -0
- data/lib/UniversalDetector.rb +209 -0
- metadata +83 -0
data/lib/Latin1Prober.rb
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require 'UniversalDetector'
|
30
|
+
require 'CharSetProber'
|
31
|
+
|
32
|
+
module Enumerable
  # Left fold over the collection.
  #
  # Starts from +res+ and, for every element produced by +each+, replaces the
  # running value with whatever the block returns for (running value, element).
  #
  # res   - the initial accumulator value.
  # block - called as yield(accumulator, element); its result becomes the
  #         next accumulator.
  #
  # Returns the final accumulator, or +res+ unchanged when the collection
  # yields no elements.
  def reduceBlock(res)
    acc = res
    each { |item| acc = yield(acc, item) }
    acc
  end
end
|
38
|
+
|
39
|
+
module UniversalDetector
  # Number of frequency categories used by Latin1ClassModel (values 0..3).
  FREQ_CAT_NUM = 4

  # Character classes assigned to each byte value by Latin1_CharToClass.
  UDF = 0 # undefined
  OTH = 1 # other
  ASC = 2 # ascii capital letter
  ASS = 3 # ascii small letter
  ACV = 4 # accent capital vowel
  ACO = 5 # accent capital other
  ASV = 6 # accent small vowel
  ASO = 7 # accent small other
  CLASS_NUM = 8 # total classes

  # Maps a byte value (0x00..0xFF) to one of the character classes above.
  Latin1_CharToClass = [
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   # F8 - FF
  ].freeze

  # Likelihood of one character class following another, stored as a flat
  # CLASS_NUM x CLASS_NUM table indexed by
  # (previous class * CLASS_NUM) + current class.
  #
  # 0 : illegal
  # 1 : very unlikely
  # 2 : normal
  # 3 : very likely
  Latin1ClassModel = [
    # UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  # ASO
  ].freeze

  # Prober that scores a byte stream against the class-transition model above
  # and reports its charset as "windows-1252".
  class Latin1Prober < CharSetProber
    def initialize
      super
      reset
    end

    # Clears the per-stream state (last seen class and the per-category
    # transition counters) and then defers to the parent's reset.
    def reset
      @_mLastCharClass = OTH
      @_mFreqCounter = [0] * FREQ_CAT_NUM
      super
    end

    # Returns the charset name this prober stands for.
    def get_charset_name
      "windows-1252"
    end

    # Consumes a chunk of input and updates the transition counters.
    #
    # aBuf - the input chunk; it is first passed through
    #        filter_with_english_letters (inherited; not visible in this file).
    #        A String is read byte by byte; any other collection is walked
    #        with #each and each element is taken as a byte value (Integer) or
    #        as a string whose first byte is used.
    #
    # As soon as an illegal transition (model value 0) is seen the state is
    # set to :NotMe and the rest of the chunk is ignored.
    #
    # Returns the value of get_state.
    def feed(aBuf)
      aBuf = filter_with_english_letters(aBuf)
      each_input_byte(aBuf) do |byte|
        charClass = Latin1_CharToClass[byte]
        freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
        if freq == 0
          @_mState = :NotMe
          break
        end
        @_mFreqCounter[freq] += 1
        @_mLastCharClass = charClass
      end

      get_state
    end

    # Returns a confidence value for the data fed so far:
    # 0.01 once the state is :NotMe, 0.0 when nothing has been counted,
    # otherwise half of max(0, (very_likely - 20 * very_unlikely) / total).
    def get_confidence
      return 0.01 if get_state == :NotMe

      total = @_mFreqCounter.reduce(0, :+)
      return 0.0 if total.zero?

      # Float arithmetic on purpose: Integer / Integer truncates in Ruby and
      # would collapse the "very likely" ratio to 0 for almost every input.
      confidence = (@_mFreqCounter[3] - (@_mFreqCounter[1] * 20.0)) / total
      confidence = 0.0 if confidence < 0.0

      # lower the confidence of latin1 so that other more accurate detector
      # can take priority.
      confidence * 0.5
    end

    private

    # Yields every byte value (Integer) contained in +buf+.
    # Strings are walked with each_byte; other collections element by element,
    # converting one-character strings to their first byte. Empty elements
    # are skipped because they carry no byte to classify.
    def each_input_byte(buf, &block)
      if buf.respond_to?(:each_byte)
        buf.each_byte(&block)
      else
        buf.each do |element|
          byte = element.is_a?(Integer) ? element : String(element).unpack('C').first
          next if byte.nil?
          yield byte
        end
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require 'UniversalDetector'
|
30
|
+
require 'CharSetGroupProber'
|
31
|
+
require 'UTF8Prober'
|
32
|
+
require 'SJISProber'
|
33
|
+
require 'EUCJPProber'
|
34
|
+
require 'GB2312Prober'
|
35
|
+
require 'EUCKRProber'
|
36
|
+
require 'Big5Prober'
|
37
|
+
require 'EUCTWProber'
|
38
|
+
|
39
|
+
module UniversalDetector
  # Group prober whose member list consists of the multi-byte charset probers
  # required at the top of this file.
  class MBCSGroupProber < CharSetGroupProber

    # The prober instances created in #initialize.
    attr_reader :mProbers

    # Runs the parent initializer, instantiates one prober per supported
    # multi-byte encoding (in the fixed order below) and then calls reset
    # (inherited; presumably it reinitializes the freshly built probers --
    # confirm against CharSetGroupProber).
    def initialize
      super
      @mProbers = [
        UTF8Prober.new,
        SJISProber.new,
        EUCJPProber.new,
        GB2312Prober.new,
        EUCKRProber.new,
        Big5Prober.new,
        EUCTWProber.new
      ]
      reset
    end
  end
end
|