chardet 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +504 -0
- data/README +12 -0
- data/lib/Big5Freq.rb +913 -0
- data/lib/Big5Prober.rb +48 -0
- data/lib/CharDistributionAnalysis.rb +245 -0
- data/lib/CharSetGroupProber.rb +114 -0
- data/lib/CharSetProber.rb +70 -0
- data/lib/CodingStateMachine.rb +74 -0
- data/lib/ESCSM.rb +242 -0
- data/lib/EUCJPProber.rb +97 -0
- data/lib/EUCKRFreq.rb +600 -0
- data/lib/EUCKRProber.rb +48 -0
- data/lib/EUCTWFreq.rb +432 -0
- data/lib/EUCTWProber.rb +48 -0
- data/lib/EscCharSetProber.rb +94 -0
- data/lib/GB2312Freq.rb +475 -0
- data/lib/GB2312Prober.rb +48 -0
- data/lib/HebrewProber.rb +292 -0
- data/lib/JISFreq.rb +573 -0
- data/lib/JapaneseContextAnalysis.rb +234 -0
- data/lib/LangBulgarianModel.rb +231 -0
- data/lib/LangCyrillicModel.rb +332 -0
- data/lib/LangGreekModel.rb +229 -0
- data/lib/LangHebrewModel.rb +202 -0
- data/lib/LangHungarianModel.rb +228 -0
- data/lib/LangThaiModel.rb +203 -0
- data/lib/Latin1Prober.rb +160 -0
- data/lib/MBCSGroupProber.rb +57 -0
- data/lib/MBCSSM.rb +513 -0
- data/lib/MultiByteCharSetProber.rb +94 -0
- data/lib/SBCSGroupProber.rb +71 -0
- data/lib/SJISProber.rb +99 -0
- data/lib/SingleByteCharSetProber.rb +131 -0
- data/lib/UTF8Prober.rb +91 -0
- data/lib/UniversalDetector.rb +209 -0
- data/python-docs/css/chardet.css +299 -0
- data/python-docs/faq.html +107 -0
- data/python-docs/how-it-works.html +113 -0
- data/python-docs/images/caution.png +0 -0
- data/python-docs/images/important.png +0 -0
- data/python-docs/images/note.png +0 -0
- data/python-docs/images/permalink.gif +0 -0
- data/python-docs/images/tip.png +0 -0
- data/python-docs/images/warning.png +0 -0
- data/python-docs/index.html +73 -0
- data/python-docs/license.html +62 -0
- data/python-docs/supported-encodings.html +86 -0
- data/python-docs/usage.html +107 -0
- metadata +86 -0
@@ -0,0 +1,209 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require "EscCharSetProber"
|
30
|
+
require "MBCSGroupProber"
|
31
|
+
require "SBCSGroupProber"
|
32
|
+
require "Latin1Prober"
|
33
|
+
require "singleton"
|
34
|
+
|
35
|
+
# Character-encoding detector (per the license header: a Ruby port of the
# Python port of the mozilla.org charset detection code).
#
#   UniversalDetector.encoding(bytes)  # => "UTF-8", "ascii", ... or nil
#   UniversalDetector.chardet(bytes)   # => {"encoding" => ..., "confidence" => ...}
module UniversalDetector

  class << self
    # Convenience wrapper: returns only the "encoding" entry of the hash
    # produced by #chardet (nil when no encoding could be determined).
    def encoding(data)
      chardet(data)['encoding']
    end

    # Runs one complete detection pass (reset, feed, close) over +data+ and
    # returns the result hash {"encoding" => String or nil, "confidence" => Float}.
    #
    # The pass runs on the shared Detector singleton, so every call reuses
    # (and first resets) the same detector object.
    def chardet(data)
      detector = UniversalDetector::Detector.instance
      detector.reset
      detector.feed(data)
      detector.close
      detector.result
    end
  end

  # When truthy, Detector#close prints diagnostic output.
  DEBUG = nil

  # Integer state constants. The Detector below tracks its own state with
  # symbols (:PureAscii, :FoundIt, ...) and does not read most of these; they
  # are kept because other files of the library may reference them.
  Detecting = 0
  Detectiong = Detecting # original (misspelled) name, kept for compatibility
  FoundIt = 1
  NotMe = 2

  Start = 0
  Error = 1
  ItsMe = 2

  # A prober's confidence must exceed this for Detector#close to report it.
  MINIMUM_THRESHOLD = 0.20
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  # Not referenced in this file.
  SHORTCUT_THRESHOLD = 0.95

  # Incremental detector: call #reset, then #feed one or more chunks, then
  # #close, and read the verdict from #result.
  class Detector

    include Singleton

    # Hash with the keys "encoding" (String or nil) and "confidence" (Float).
    attr_reader :result

    # Byte-order marks checked at the very start of the input. Order matters:
    # the 4-byte UTF-32/UCS-4 marks must be tested before the 2-byte UTF-16
    # marks they begin with. The marks are built with pack("C*") so that they
    # are plain byte strings regardless of the source file's encoding.
    BOM_TABLE = [
      [[0xEF, 0xBB, 0xBF],       "UTF-8"],                  # EF BB BF
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],               # FF FE 00 00
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],               # 00 00 FE FF
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"], # unusual octet order
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"], # unusual octet order
      [[0xFF, 0xFE],             "UTF-16LE"],               # FF FE
      [[0xFE, 0xFF],             "UTF-16BE"]                # FE FF
    ].map { |octets, name| [octets.pack("C*"), name] }

    def initialize
      # The /n flag makes this a byte-oriented pattern; without it the
      # \x80-\xFF escapes are rejected in a UTF-8 encoded source file.
      @_highBitDetector = /[\x80-\xFF]/n
      # ESC, or the two-character sequence "~{".
      @_escDetector = /\033|~\{/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the previous verdict and all per-input state, and resets every
    # prober that has already been created, so the detector can be reused.
    def reset
      @result = {"encoding" => nil, "confidence" => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each do |prober|
        prober.reset if prober
      end
    end

    # Feeds one chunk of input to the detector.
    #
    # data:: String of raw bytes; nil and empty chunks are ignored.
    #
    # Does nothing once a verdict has been reached. The first non-empty chunk
    # is checked for a byte-order mark. After that the chunk is classified as
    # pure ASCII, ASCII with escape sequences, or containing high bytes, and
    # handed to the matching prober(s). The return value is not meaningful.
    def feed(data)
      return if @done || data.nil? || data.empty?

      # All sniffing is done on a binary copy so comparisons and regexp
      # matches are byte-wise and cannot fail on encoding mismatches or on
      # byte sequences that are invalid in data's declared encoding.
      bytes = binary_copy(data)

      unless @_mGotData
        # If the data starts with a BOM, we know it is UTF.
        bom_encoding = detect_bom(bytes)
        @result = {"encoding" => bom_encoding, "confidence" => 1.0} if bom_encoding
      end
      @_mGotData = true

      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      if @_mInputState == :PureAscii
        if bytes =~ @_highBitDetector
          @_mInputState = :Highbyte
        elsif (@_mLastChar + bytes) =~ @_escDetector
          # The previous chunk's last byte is prepended so that a "~{" split
          # across two chunks is still noticed.
          @_mInputState = :EscAscii
        end
      end

      # [-1, 1] yields a one-byte String on every Ruby version.
      @_mLastChar = bytes[-1, 1]

      case @_mInputState
      when :EscAscii then feed_escape_prober(data)
      when :Highbyte then feed_charset_probers(data)
      end
    end

    # Finishes detection and settles #result.
    #
    # Pure-ASCII input is reported as "ascii" with confidence 1.0. For input
    # with high bytes the prober with the highest confidence wins, provided
    # that confidence exceeds MINIMUM_THRESHOLD. In every other case #result
    # is left as it was.
    #
    # Returns the result hash when this call settled it, nil otherwise.
    def close
      return if @done

      unless @_mGotData
        p("no data received!\n") if DEBUG
        return
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        best_prober = nil
        best_confidence = 0.0
        @_mCharSetProbers.each do |prober|
          next unless prober
          confidence = prober.get_confidence
          # Strict comparison: on a tie the earlier prober is kept.
          if confidence > best_confidence
            best_confidence = confidence
            best_prober = prober
          end
        end

        if best_prober && best_confidence > MINIMUM_THRESHOLD
          @result = {"encoding" => best_prober.get_charset_name,
                     "confidence" => best_prober.get_confidence}
          return @result
        end
      end

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.each do |prober|
          next unless prober
          p("%s confidence = %s\n" % [prober.get_charset_name, prober.get_confidence])
        end
      end
      nil
    end

    private

    # Returns a copy of +data+ tagged as raw bytes. String#b is used where
    # available, force_encoding otherwise; on Rubies with neither, strings
    # carry no encoding and +data+ is returned unchanged.
    def binary_copy(data)
      if data.respond_to?(:b)
        data.b
      elsif data.respond_to?(:force_encoding)
        data.dup.force_encoding("BINARY")
      else
        data
      end
    end

    # Returns the encoding name of the byte-order mark that +bytes+ starts
    # with, or nil when it starts with none of the marks in BOM_TABLE.
    def detect_bom(bytes)
      BOM_TABLE.each do |mark, name|
        return name if bytes[0, mark.length] == mark
      end
      nil
    end

    # True when a prober's feed result signals a definite match. The symbol
    # form is what the original high-byte branch compared against; the integer
    # module constant is accepted too.
    # NOTE(review): confirm which of the two forms the prober classes return.
    def found_it?(state)
      state == :FoundIt || state == FoundIt
    end

    # Hands +data+ to the escape-sequence prober (created on first use) and
    # records its verdict if it reports a definite match.
    def feed_escape_prober(data)
      @_mEscCharSetProber ||= EscCharSetProber.new
      return unless found_it?(@_mEscCharSetProber.feed(data))

      @result = {"encoding" => @_mEscCharSetProber.get_charset_name,
                 "confidence" => @_mEscCharSetProber.get_confidence}
      @done = true
    end

    # Hands +data+ to every charset prober (the list is built on first use)
    # and stops at the first one that reports a definite match.
    def feed_charset_probers(data)
      if @_mCharSetProbers.empty?
        @_mCharSetProbers = MBCSGroupProber.new.mProbers +
                            SBCSGroupProber.new.mProbers +
                            [Latin1Prober.new]
      end

      @_mCharSetProbers.each do |prober|
        next unless prober
        next unless found_it?(prober.feed(data))

        @result = {"encoding" => prober.get_charset_name,
                   "confidence" => prober.get_confidence}
        @done = true
        break
      end
    end
  end # class

end # module
|
@@ -0,0 +1,299 @@
|
|
1
|
+
html {
|
2
|
+
margin: 0;
|
3
|
+
padding: 0;
|
4
|
+
}
|
5
|
+
|
6
|
+
body {
|
7
|
+
background-color: #fff;
|
8
|
+
color: #333;
|
9
|
+
font-family: 'Lucida Grande', Verdana, Geneva, Lucida, Helvetica, sans-serif;
|
10
|
+
font-size: 100%;
|
11
|
+
margin: 10px;
|
12
|
+
padding: 0;
|
13
|
+
}
|
14
|
+
|
15
|
+
a:link, a:visited {
|
16
|
+
background-color: transparent;
|
17
|
+
color: #333;
|
18
|
+
text-decoration: none !important;
|
19
|
+
border-bottom: 1px dotted #333 !important;
|
20
|
+
text-decoration: underline;
|
21
|
+
border-bottom: 0;
|
22
|
+
}
|
23
|
+
|
24
|
+
a:hover {
|
25
|
+
background-color: transparent;
|
26
|
+
color: #993344;
|
27
|
+
text-decoration: none !important;
|
28
|
+
text-decoration: underline;
|
29
|
+
border-bottom: 1px dotted #993344 !important;
|
30
|
+
border-bottom: 0;
|
31
|
+
}
|
32
|
+
|
33
|
+
h1 {
|
34
|
+
margin: 8px 0 0 0;
|
35
|
+
padding: 0;
|
36
|
+
font-variant: small-caps;
|
37
|
+
letter-spacing: 0.1em;
|
38
|
+
font-family: "Book Antiqua", Georgia, Palatino, Times, "Times New Roman", serif;
|
39
|
+
}
|
40
|
+
|
41
|
+
h1 a:link, h1 a:visited, h1 a:hover {
|
42
|
+
background-color: transparent ! important;
|
43
|
+
color: #333 ! important;
|
44
|
+
text-decoration: none ! important;
|
45
|
+
border-bottom: 0px ! important;
|
46
|
+
}
|
47
|
+
|
48
|
+
#intro {
|
49
|
+
width: 730px;
|
50
|
+
}
|
51
|
+
|
52
|
+
#intro ul {
|
53
|
+
margin-left: 0;
|
54
|
+
padding-left: 0;
|
55
|
+
display: inline;
|
56
|
+
}
|
57
|
+
|
58
|
+
#intro ul li {
|
59
|
+
display: inline;
|
60
|
+
font-size: small;
|
61
|
+
}
|
62
|
+
|
63
|
+
#intro ul li.li1 {
|
64
|
+
}
|
65
|
+
|
66
|
+
#intro p {
|
67
|
+
font-size: small;
|
68
|
+
font-weight: normal;
|
69
|
+
margin: 1.2em 0 0 0;
|
70
|
+
padding: 0;
|
71
|
+
}
|
72
|
+
|
73
|
+
.z {
|
74
|
+
float:left;
|
75
|
+
background: url(/img/shadowAlpha.png) no-repeat bottom right !important;
|
76
|
+
background: url(/img/shadow.gif) no-repeat bottom right;
|
77
|
+
margin: 15px 0 0 10px !important;
|
78
|
+
margin: 15px 0 0 5px;
|
79
|
+
}
|
80
|
+
|
81
|
+
.z .sectionInner {
|
82
|
+
background: none !important;
|
83
|
+
background: url(/img/shadow2.gif) no-repeat left top;
|
84
|
+
padding: 0 !important;
|
85
|
+
padding: 0 6px 6px 0;
|
86
|
+
}
|
87
|
+
|
88
|
+
.z .sectionInner .sectionInner2 {
|
89
|
+
background-color: #fff;
|
90
|
+
border: 1px solid #a9a9a9;
|
91
|
+
padding: 4px;
|
92
|
+
margin: -6px 6px 6px -6px !important;
|
93
|
+
margin: 0;
|
94
|
+
}
|
95
|
+
|
96
|
+
.s {
|
97
|
+
margin-left: 1em;
|
98
|
+
margin-right: 1em;
|
99
|
+
margin-bottom: 1em;
|
100
|
+
}
|
101
|
+
|
102
|
+
#main {
|
103
|
+
clear: left;
|
104
|
+
margin-left: 11px;
|
105
|
+
margin-bottom: 2em;
|
106
|
+
font-size: small;
|
107
|
+
}
|
108
|
+
|
109
|
+
#mainInner {
|
110
|
+
margin-left: 1em;
|
111
|
+
margin-bottom: 2em;
|
112
|
+
padding-top: 1em;
|
113
|
+
}
|
114
|
+
|
115
|
+
.footernavigation {
|
116
|
+
clear: both;
|
117
|
+
font-size: small;
|
118
|
+
padding-bottom: 1em;
|
119
|
+
margin-bottom: 0;
|
120
|
+
}
|
121
|
+
|
122
|
+
.example, .section, .appendix {
|
123
|
+
line-height: 150%;
|
124
|
+
}
|
125
|
+
|
126
|
+
#breadcrumb {
|
127
|
+
width: 100%;
|
128
|
+
margin: 0 0 1em 0;
|
129
|
+
padding: 0;
|
130
|
+
line-height: 140%;
|
131
|
+
font-size: small;
|
132
|
+
}
|
133
|
+
|
134
|
+
#breadcrumb #thispage {
|
135
|
+
font-weight: bold;
|
136
|
+
}
|
137
|
+
|
138
|
+
/* ----- Python code syntax coloring ----- */
|
139
|
+
.computeroutput, .traceback, .pykeyword, .pystring, .pycomment, .pyfunction, .pyclass {
|
140
|
+
background-color: white;
|
141
|
+
}
|
142
|
+
|
143
|
+
.pykeyword, .pyfunction, .pyclass {
|
144
|
+
font-weight: bold;
|
145
|
+
}
|
146
|
+
|
147
|
+
.computeroutput {
|
148
|
+
color: teal;
|
149
|
+
}
|
150
|
+
|
151
|
+
.traceback {
|
152
|
+
color: red;
|
153
|
+
}
|
154
|
+
|
155
|
+
.pykeyword {
|
156
|
+
color: navy;
|
157
|
+
}
|
158
|
+
|
159
|
+
.pystring {
|
160
|
+
color: olive;
|
161
|
+
}
|
162
|
+
|
163
|
+
.pycomment {
|
164
|
+
color: green;
|
165
|
+
font-style: italic;
|
166
|
+
}
|
167
|
+
|
168
|
+
.pyfunction {
|
169
|
+
color: teal;
|
170
|
+
}
|
171
|
+
|
172
|
+
.pyclass {
|
173
|
+
color: blue;
|
174
|
+
}
|
175
|
+
|
176
|
+
/* ----- standard stuff ----- */
|
177
|
+
.skip {
|
178
|
+
display: none;
|
179
|
+
}
|
180
|
+
|
181
|
+
samp, code, tt, pre {
|
182
|
+
font-weight: normal;
|
183
|
+
font-family: monospace;
|
184
|
+
font-size: small;
|
185
|
+
}
|
186
|
+
|
187
|
+
img {
|
188
|
+
border: 0;
|
189
|
+
}
|
190
|
+
|
191
|
+
acronym, abbr {
|
192
|
+
/* border-bottom: 1px dotted #333;*/
|
193
|
+
border-bottom: 0;
|
194
|
+
font-style: normal;
|
195
|
+
cursor: help;
|
196
|
+
}
|
197
|
+
|
198
|
+
hr {
|
199
|
+
clear: both;
|
200
|
+
margin-top: 2em !important;
|
201
|
+
margin-top: 1em;
|
202
|
+
height: 1px;
|
203
|
+
background-color: #cecbc6;
|
204
|
+
color: #cecbc6;
|
205
|
+
}
|
206
|
+
|
207
|
+
#footer {
|
208
|
+
text-align: center;
|
209
|
+
font-size: x-small;
|
210
|
+
}
|
211
|
+
|
212
|
+
body.docs .example {
|
213
|
+
border-left: 4px double #ddd !important;
|
214
|
+
border-left: 1px solid #ccc;
|
215
|
+
margin-left: 2em;
|
216
|
+
padding-left: 2em;
|
217
|
+
}
|
218
|
+
|
219
|
+
body.docs .example h3 {
|
220
|
+
font-size: 100%;
|
221
|
+
}
|
222
|
+
|
223
|
+
body.docs .example a.skip:link,
|
224
|
+
body.docs .example a.skip:visited,
|
225
|
+
body.docs .example a.skip:hover,
|
226
|
+
body.docs .section h3.title a.skip:link,
|
227
|
+
body.docs .section h3.title a.skip:visited,
|
228
|
+
body.docs .section h3.title a.skip:hover,
|
229
|
+
body.docs .appendix h3.title a.skip:link,
|
230
|
+
body.docs .appendix h3.title a.skip:visited,
|
231
|
+
body.docs .appendix h3.title a.skip:hover {
|
232
|
+
display: block;
|
233
|
+
float: left;
|
234
|
+
vertical-align: bottom;
|
235
|
+
text-decoration: none;
|
236
|
+
border-bottom: 0 ! important;
|
237
|
+
margin-right: 6px;
|
238
|
+
}
|
239
|
+
|
240
|
+
.reference-from h3,
|
241
|
+
.seealso h3,
|
242
|
+
.furtherreading h3 {
|
243
|
+
margin-top: -1.2em;
|
244
|
+
margin-left: -15px;
|
245
|
+
font-size: small;
|
246
|
+
width: 8em;
|
247
|
+
border: 1px solid #a9a9a9;
|
248
|
+
padding: 3px 3px 3px 13px;
|
249
|
+
background: white;
|
250
|
+
position: relative;
|
251
|
+
}
|
252
|
+
|
253
|
+
.reference-from, .seealso, .furtherreading {
|
254
|
+
width: 680px;
|
255
|
+
margin-top: 3em;
|
256
|
+
margin-bottom: 3em;
|
257
|
+
border: 1px solid #a9a9a9;
|
258
|
+
}
|
259
|
+
|
260
|
+
table.tip, table.note, table.warning, table.caution, table.important {
|
261
|
+
margin-bottom: 1em;
|
262
|
+
}
|
263
|
+
|
264
|
+
.table h3 {
|
265
|
+
display: none;
|
266
|
+
}
|
267
|
+
|
268
|
+
.table table td {
|
269
|
+
padding: 5px 1em 5px 1em;
|
270
|
+
}
|
271
|
+
|
272
|
+
div.download {
|
273
|
+
width: 708px;
|
274
|
+
margin-top: 3em;
|
275
|
+
margin-bottom: 3em;
|
276
|
+
border: 1px solid #a9a9a9;
|
277
|
+
}
|
278
|
+
|
279
|
+
div.download h3 {
|
280
|
+
margin-top: -1.2em;
|
281
|
+
margin-left: -15px;
|
282
|
+
font-size: small;
|
283
|
+
width: 10em;
|
284
|
+
border: 1px solid #a9a9a9;
|
285
|
+
padding: 3px 3px 3px 13px;
|
286
|
+
background-color: #fff;
|
287
|
+
color: #222;
|
288
|
+
position: relative;
|
289
|
+
}
|
290
|
+
|
291
|
+
div.download p {
|
292
|
+
margin-left: 1em;
|
293
|
+
}
|
294
|
+
|
295
|
+
div.download ul {
|
296
|
+
list-style: none;
|
297
|
+
padding-left: 1em;
|
298
|
+
margin-left: 0;
|
299
|
+
}
|