chardet 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +504 -0
- data/README +12 -0
- data/lib/Big5Freq.rb +913 -0
- data/lib/Big5Prober.rb +48 -0
- data/lib/CharDistributionAnalysis.rb +245 -0
- data/lib/CharSetGroupProber.rb +114 -0
- data/lib/CharSetProber.rb +70 -0
- data/lib/CodingStateMachine.rb +74 -0
- data/lib/ESCSM.rb +242 -0
- data/lib/EUCJPProber.rb +97 -0
- data/lib/EUCKRFreq.rb +600 -0
- data/lib/EUCKRProber.rb +48 -0
- data/lib/EUCTWFreq.rb +432 -0
- data/lib/EUCTWProber.rb +48 -0
- data/lib/EscCharSetProber.rb +94 -0
- data/lib/GB2312Freq.rb +475 -0
- data/lib/GB2312Prober.rb +48 -0
- data/lib/HebrewProber.rb +292 -0
- data/lib/JISFreq.rb +573 -0
- data/lib/JapaneseContextAnalysis.rb +234 -0
- data/lib/LangBulgarianModel.rb +231 -0
- data/lib/LangCyrillicModel.rb +332 -0
- data/lib/LangGreekModel.rb +229 -0
- data/lib/LangHebrewModel.rb +202 -0
- data/lib/LangHungarianModel.rb +228 -0
- data/lib/LangThaiModel.rb +203 -0
- data/lib/Latin1Prober.rb +160 -0
- data/lib/MBCSGroupProber.rb +57 -0
- data/lib/MBCSSM.rb +513 -0
- data/lib/MultiByteCharSetProber.rb +94 -0
- data/lib/SBCSGroupProber.rb +71 -0
- data/lib/SJISProber.rb +99 -0
- data/lib/SingleByteCharSetProber.rb +131 -0
- data/lib/UTF8Prober.rb +91 -0
- data/lib/UniversalDetector.rb +209 -0
- data/python-docs/css/chardet.css +299 -0
- data/python-docs/faq.html +107 -0
- data/python-docs/how-it-works.html +113 -0
- data/python-docs/images/caution.png +0 -0
- data/python-docs/images/important.png +0 -0
- data/python-docs/images/note.png +0 -0
- data/python-docs/images/permalink.gif +0 -0
- data/python-docs/images/tip.png +0 -0
- data/python-docs/images/warning.png +0 -0
- data/python-docs/index.html +73 -0
- data/python-docs/license.html +62 -0
- data/python-docs/supported-encodings.html +86 -0
- data/python-docs/usage.html +107 -0
- metadata +86 -0
@@ -0,0 +1,209 @@
|
|
1
|
+
######################## BEGIN LICENSE BLOCK ########################
|
2
|
+
# The Original Code is mozilla.org code.
|
3
|
+
#
|
4
|
+
# The Initial Developer of the Original Code is
|
5
|
+
# Netscape Communications Corporation.
|
6
|
+
# Portions created by the Initial Developer are Copyright (C) 1998
|
7
|
+
# the Initial Developer. All Rights Reserved.
|
8
|
+
#
|
9
|
+
# Contributor(s):
|
10
|
+
# Hui (zhengzhengzheng@gmail.com) - port to Ruby
|
11
|
+
# Mark Pilgrim - first port to Python
|
12
|
+
#
|
13
|
+
# This library is free software; you can redistribute it and/or
|
14
|
+
# modify it under the terms of the GNU Lesser General Public
|
15
|
+
# License as published by the Free Software Foundation; either
|
16
|
+
# version 2.1 of the License, or (at your option) any later version.
|
17
|
+
#
|
18
|
+
# This library is distributed in the hope that it will be useful,
|
19
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
20
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
21
|
+
# Lesser General Public License for more details.
|
22
|
+
#
|
23
|
+
# You should have received a copy of the GNU Lesser General Public
|
24
|
+
# License along with this library; if not, write to the Free Software
|
25
|
+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
26
|
+
# 02110-1301 USA
|
27
|
+
######################### END LICENSE BLOCK #########################
|
28
|
+
|
29
|
+
require "EscCharSetProber"
|
30
|
+
require "MBCSGroupProber"
|
31
|
+
require "SBCSGroupProber"
|
32
|
+
require "Latin1Prober"
|
33
|
+
require "singleton"
|
34
|
+
|
35
|
+
# Character-encoding detection entry points plus the Detector that drives
# the individual charset probers.
#
# Typical use:
#
#   UniversalDetector.encoding(bytes)  # => "UTF-8", "ascii", ... or nil
#   UniversalDetector.chardet(bytes)   # => {"encoding" => ..., "confidence" => ...}
module UniversalDetector

  class << self
    # Returns only the "encoding" entry of the hash produced by #chardet
    # (nil when nothing was detected).
    def encoding(data)
      chardet(data)['encoding']
    end

    # Runs a complete reset / feed / close cycle on the shared Detector
    # singleton and returns its result hash
    # ({"encoding" => String or nil, "confidence" => Float}).
    #
    # The Detector is a process-wide singleton that is reset on every call,
    # so its state is shared between all callers.
    def chardet(data)
      detector = UniversalDetector::Detector.instance
      detector.reset
      detector.feed(data)
      detector.close
      detector.result
    end
  end

  # When truthy, Detector#close prints diagnostic output with Kernel#p.
  DEBUG = nil

  # Integer state codes. The Detector class in this file does not read them
  # (it compares against symbols such as :FoundIt and :PureAscii); they are
  # kept, including the historical spelling "Detectiong", so that any other
  # code referring to them keeps working.
  Detectiong = 0
  FoundIt = 1
  NotMe = 2

  Start = 0
  Error = 1
  ItsMe = 2

  # Detector#close only reports a prober whose confidence is strictly above
  # this value.
  MINIMUM_THRESHOLD = 0.20
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  # Not referenced in this file.
  SHORTCUT_THRESHOLD = 0.95

  # Incremental detector: call #reset, then #feed one or more times, then
  # #close, and read #result.
  class Detector

    include Singleton

    # Hash with the keys "encoding" (String or nil) and "confidence" (Float).
    attr_reader :result

    # Byte-order marks as [byte signature, encoding name] pairs. The order
    # matters: the 4-byte UTF-32/UCS-4 signatures must be tried before the
    # 2-byte UTF-16 signatures that are prefixes of them.
    BOM_ENCODINGS = [
      [[0xEF, 0xBB, 0xBF],       "UTF-8"],                  # EF BB BF
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],               # FF FE 00 00
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],               # 00 00 FE FF
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"], # unusual octet order (3412)
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"], # unusual octet order (2143)
      [[0xFF, 0xFE],             "UTF-16LE"],               # FF FE
      [[0xFE, 0xFF],             "UTF-16BE"]                # FE FF
    ].freeze

    def initialize
      # The "n" flag makes this a byte-oriented pattern; without it Ruby 1.9+
      # rejects the literal in a UTF-8 source file.
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /\033|~\{/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-run state, and resets every prober that
    # has already been created.
    def reset
      @result = {"encoding" => nil, "confidence" => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of data into the detector.
    #
    # data:: String of raw bytes; nil and "" are ignored.
    #
    # On the first non-empty chunk a byte-order mark, if present, settles the
    # result immediately. Otherwise the chunk moves the input state from
    # :PureAscii to :Highbyte (a byte >= 0x80 was seen) or :EscAscii (ESC or
    # "~{" was seen) and is handed to the matching prober(s). Once a result
    # has been found, further calls are no-ops until #reset.
    def feed(data)
      return if @done || data.nil? || data.empty?

      unless @_mGotData
        # If the data starts with a BOM, we know it is UTF.
        bom_name = bom_encoding(data)
        @result = {"encoding" => bom_name, "confidence" => 1.0} if bom_name
      end
      @_mGotData = true
      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      # Inspect the bytes through a binary view so the regexp matches do not
      # depend on (or fail because of) the encoding tag of the caller's string.
      raw = binary_copy(data)
      if @_mInputState == :PureAscii
        if raw =~ @_highBitDetector
          @_mInputState = :Highbyte
        elsif (@_mLastChar + raw) =~ @_escDetector
          # The previous chunk's last character is prepended so that a "~{"
          # split across two chunks is still seen.
          @_mInputState = :EscAscii
        end
      end

      # [-1, 1] always yields a one-character String; plain [-1] returns an
      # Integer on Ruby 1.8, which broke the concatenation above.
      @_mLastChar = raw[-1, 1]

      case @_mInputState
      when :EscAscii then feed_escape_prober(data)
      when :Highbyte then feed_charset_probers(data)
      end
    end

    # Finishes a detection run and fills in #result when it can.
    #
    # Returns the result hash when the input was pure ASCII or a prober
    # exceeded MINIMUM_THRESHOLD; returns nil otherwise (already done, no
    # data received, or no prober was confident enough).
    def close
      return if @done
      unless @_mGotData
        p("no data received!\n") if DEBUG
        return
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        best_prober = nil
        best_confidence = 0.0
        @_mCharSetProbers.each do |prober|
          next unless prober
          confidence = prober.get_confidence
          if confidence > best_confidence
            best_confidence = confidence
            best_prober = prober
          end
        end
        if best_prober && best_confidence > MINIMUM_THRESHOLD
          @result = {"encoding" => best_prober.get_charset_name,
                     "confidence" => best_prober.get_confidence}
          return @result
        end
      end

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.each do |prober|
          next unless prober
          p("%s confidence = %s\n" % [prober.get_charset_name, prober.get_confidence])
        end
      end
      nil
    end

    private

    # Returns the encoding name for the byte-order mark at the start of
    # +data+, or nil when there is none. Works on byte values, so it is
    # independent of the string's encoding tag.
    def bom_encoding(data)
      head = data.unpack("C4")
      BOM_ENCODINGS.each do |signature, name|
        return name if head[0, signature.length] == signature
      end
      nil
    end

    # Returns +data+ re-tagged as binary on Rubies that have string
    # encodings; on older Rubies the string is returned unchanged.
    def binary_copy(data)
      return data unless data.respond_to?(:force_encoding)
      data.dup.force_encoding("BINARY")
    end

    # Lazily creates the escape-sequence prober and feeds it the chunk.
    def feed_escape_prober(data)
      @_mEscCharSetProber ||= EscCharSetProber.new
      # NOTE(review): the original compared against the undefined
      # `constants.eFoundIt`; :FoundIt matches what the other probers are
      # checked against in feed_charset_probers - confirm that
      # EscCharSetProber#feed returns that symbol.
      if @_mEscCharSetProber.feed(data) == :FoundIt
        @result = {"encoding" => @_mEscCharSetProber.get_charset_name,
                   "confidence" => @_mEscCharSetProber.get_confidence}
        @done = true
      end
    end

    # Lazily creates the multi-byte, single-byte and Latin-1 probers and
    # feeds them the chunk in order, stopping at the first one that reports
    # :FoundIt.
    def feed_charset_probers(data)
      if @_mCharSetProbers.empty?
        @_mCharSetProbers = MBCSGroupProber.new.mProbers +
                            SBCSGroupProber.new.mProbers +
                            [Latin1Prober.new]
      end
      hit = @_mCharSetProbers.find { |prober| prober.feed(data) == :FoundIt }
      if hit
        @result = {"encoding" => hit.get_charset_name,
                   "confidence" => hit.get_confidence}
        @done = true
      end
    end
  end #class

end #module
|
@@ -0,0 +1,299 @@
|
|
1
|
+
/* ----- page layout ----- */

/*
 * Several rules below declare the same property twice: first with
 * !important, then again without it. The order of those pairs is kept
 * exactly as it was.
 * NOTE(review): presumably a workaround for old browsers that ignore
 * !important - confirm before removing either declaration.
 */

html {
  margin: 0;
  padding: 0;
}

body {
  background-color: #fff;
  color: #333;
  font-family: 'Lucida Grande', Verdana, Geneva, Lucida, Helvetica, sans-serif;
  font-size: 100%;
  margin: 10px;
  padding: 0;
}

/* Links: dotted bottom border instead of an underline. */
a:link,
a:visited {
  background-color: transparent;
  color: #333;
  text-decoration: none !important;
  border-bottom: 1px dotted #333 !important;
  text-decoration: underline;
  border-bottom: 0;
}

a:hover {
  background-color: transparent;
  color: #934;
  text-decoration: none !important;
  text-decoration: underline;
  border-bottom: 1px dotted #934 !important;
  border-bottom: 0;
}

h1 {
  margin: 8px 0 0;
  padding: 0;
  font-variant: small-caps;
  letter-spacing: 0.1em;
  font-family: "Book Antiqua", Georgia, Palatino, Times, "Times New Roman", serif;
}

/* Links inside the main heading are not decorated at all. */
h1 a:link,
h1 a:visited,
h1 a:hover {
  background-color: transparent !important;
  color: #333 !important;
  text-decoration: none !important;
  border-bottom: 0 !important;
}

#intro { width: 730px; }

#intro ul {
  margin-left: 0;
  padding-left: 0;
  display: inline;
}

#intro ul li {
  display: inline;
  font-size: small;
}

/* Intentionally empty in the original stylesheet. */
#intro ul li.li1 {
}

#intro p {
  font-size: small;
  font-weight: normal;
  margin: 1.2em 0 0;
  padding: 0;
}

/* Floated box with a shadow image behind it. */
.z {
  float: left;
  background: url(/img/shadowAlpha.png) no-repeat bottom right !important;
  background: url(/img/shadow.gif) no-repeat bottom right;
  margin: 15px 0 0 10px !important;
  margin: 15px 0 0 5px;
}

.z .sectionInner {
  background: none !important;
  background: url(/img/shadow2.gif) no-repeat left top;
  padding: 0 !important;
  padding: 0 6px 6px 0;
}

.z .sectionInner .sectionInner2 {
  background-color: #fff;
  border: 1px solid #a9a9a9;
  padding: 4px;
  margin: -6px 6px 6px -6px !important;
  margin: 0;
}

.s {
  margin-left: 1em;
  margin-right: 1em;
  margin-bottom: 1em;
}

#main {
  clear: left;
  margin-left: 11px;
  margin-bottom: 2em;
  font-size: small;
}

#mainInner {
  margin-left: 1em;
  margin-bottom: 2em;
  padding-top: 1em;
}

.footernavigation {
  clear: both;
  font-size: small;
  padding-bottom: 1em;
  margin-bottom: 0;
}

.example,
.section,
.appendix { line-height: 150%; }

#breadcrumb {
  width: 100%;
  margin: 0 0 1em;
  padding: 0;
  line-height: 140%;
  font-size: small;
}

#breadcrumb #thispage { font-weight: bold; }
|
137
|
+
|
138
|
+
/* ----- Python code syntax coloring ----- */

/* Every highlighted span sits on a white background. */
.computeroutput,
.traceback,
.pykeyword,
.pystring,
.pycomment,
.pyfunction,
.pyclass { background-color: white; }

/* Keywords and function/class names are bold. */
.pykeyword,
.pyfunction,
.pyclass { font-weight: bold; }

/* One foreground colour per token class. */
.computeroutput { color: teal; }

.traceback { color: red; }

.pykeyword { color: navy; }

.pystring { color: olive; }

.pycomment {
  color: green;
  font-style: italic;
}

.pyfunction { color: teal; }

.pyclass { color: blue; }
|
175
|
+
|
176
|
+
/* ----- standard stuff ----- */

/*
 * Where a property is declared twice (once with !important, once without),
 * the original order of the two declarations is preserved.
 * NOTE(review): presumably an old-browser workaround - confirm before
 * removing either declaration.
 */

.skip { display: none; }

samp,
code,
tt,
pre {
  font-weight: normal;
  font-family: monospace;
  font-size: small;
}

img { border: 0; }

acronym,
abbr {
  /* border-bottom: 1px dotted #333;*/
  border-bottom: 0;
  font-style: normal;
  cursor: help;
}

hr {
  clear: both;
  margin-top: 2em !important;
  margin-top: 1em;
  height: 1px;
  background-color: #cecbc6;
  color: #cecbc6;
}

#footer {
  text-align: center;
  font-size: x-small;
}

/* Examples in the documentation pages get a rule down the left side. */
body.docs .example {
  border-left: 4px double #ddd !important;
  border-left: 1px solid #ccc;
  margin-left: 2em;
  padding-left: 2em;
}

body.docs .example h3 { font-size: 100%; }

/* Inside the docs, ".skip" links in examples and section/appendix titles
   are shown again (overriding the global ".skip" rule) and floated left. */
body.docs .example a.skip:link,
body.docs .example a.skip:visited,
body.docs .example a.skip:hover,
body.docs .section h3.title a.skip:link,
body.docs .section h3.title a.skip:visited,
body.docs .section h3.title a.skip:hover,
body.docs .appendix h3.title a.skip:link,
body.docs .appendix h3.title a.skip:visited,
body.docs .appendix h3.title a.skip:hover {
  display: block;
  float: left;
  vertical-align: bottom;
  text-decoration: none;
  border-bottom: 0 !important;
  margin-right: 6px;
}

/* Boxed asides: the heading is pulled up and left with negative margins. */
.reference-from h3,
.seealso h3,
.furtherreading h3 {
  margin-top: -1.2em;
  margin-left: -15px;
  font-size: small;
  width: 8em;
  border: 1px solid #a9a9a9;
  padding: 3px 3px 3px 13px;
  background: white;
  position: relative;
}

.reference-from,
.seealso,
.furtherreading {
  width: 680px;
  margin-top: 3em;
  margin-bottom: 3em;
  border: 1px solid #a9a9a9;
}

table.tip,
table.note,
table.warning,
table.caution,
table.important { margin-bottom: 1em; }

.table h3 { display: none; }

.table table td { padding: 5px 1em; }

/* Download box: same boxed-heading treatment as the asides above. */
div.download {
  width: 708px;
  margin-top: 3em;
  margin-bottom: 3em;
  border: 1px solid #a9a9a9;
}

div.download h3 {
  margin-top: -1.2em;
  margin-left: -15px;
  font-size: small;
  width: 10em;
  border: 1px solid #a9a9a9;
  padding: 3px 3px 3px 13px;
  background-color: #fff;
  color: #222;
  position: relative;
}

div.download p { margin-left: 1em; }

div.download ul {
  list-style: none;
  padding-left: 1em;
  margin-left: 0;
}
|