chardet 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,209 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require "EscCharSetProber"
30
+ require "MBCSGroupProber"
31
+ require "SBCSGroupProber"
32
+ require "Latin1Prober"
33
+ require "singleton"
34
+
35
# Character-set detection entry point.
#
# UniversalDetector.chardet(data) runs one complete detection pass and returns
# a hash of the form {"encoding" => String or nil, "confidence" => Float};
# UniversalDetector.encoding(data) returns just the encoding name.
module UniversalDetector

  class << self
    # Returns the detected encoding name for +data+, or nil when nothing
    # could be determined.
    def encoding(data)
      chardet(data)['encoding']
    end

    # Feeds +data+ through the shared Detector and returns its result hash.
    # The same Detector instance is reused for every call (it is reset
    # first), so concurrent callers would share its state.
    def chardet(data)
      detector = UniversalDetector::Detector.instance
      detector.reset
      detector.feed(data)
      detector.close
      detector.result
    end
  end

  # When truthy, Detector#close prints diagnostic output with Kernel#p.
  DEBUG = nil

  # Numeric constants kept unchanged for code elsewhere in the library.
  # "Detectiong" is misspelled in the original and is retained as-is so that
  # existing references keep resolving. Detector itself tracks its input
  # state with the symbols :PureAscii, :EscAscii and :Highbyte.
  Detectiong = 0
  FoundIt = 1
  NotMe = 2

  Start = 0
  Error = 1
  ItsMe = 2

  # Minimum confidence a prober must exceed for Detector#close to accept it.
  MINIMUM_THRESHOLD = 0.20
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  SHORTCUT_THRESHOLD = 0.95

  # Stateful detector. Typical use: reset, feed (one or more times), close,
  # then read #result.
  class Detector

    include Singleton

    # Result hash: {"encoding" => String or nil, "confidence" => Float}.
    attr_reader :result

    # Byte-order marks and the encoding each one identifies. The four-byte
    # marks are listed before the two-byte UTF-16 marks because FF FE and
    # FE FF are prefixes of some of them. Built with pack("C*") so the
    # strings are raw bytes regardless of the source file's encoding.
    BOMS = [
      [[0xEF, 0xBB, 0xBF].pack("C*"),       "UTF-8"],                  # EF BB BF
      [[0xFF, 0xFE, 0x00, 0x00].pack("C*"), "UTF-32LE"],               # FF FE 00 00
      [[0x00, 0x00, 0xFE, 0xFF].pack("C*"), "UTF-32BE"],               # 00 00 FE FF
      [[0xFE, 0xFF, 0x00, 0x00].pack("C*"), "X-ISO-10646-UCS-4-3412"], # unusual octet order
      [[0x00, 0x00, 0xFF, 0xFE].pack("C*"), "X-ISO-10646-UCS-4-2143"], # unusual octet order
      [[0xFF, 0xFE].pack("C*"),             "UTF-16LE"],               # FF FE
      [[0xFE, 0xFF].pack("C*"),             "UTF-16BE"]                # FE FF
    ].freeze

    def initialize
      # The /n flag makes this a byte-oriented pattern; without it the
      # \x80-\xFF range is rejected in a UTF-8 source file.
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /\033|~\{/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-run state, and resets any probers that
    # have already been created.
    def reset
      @result = {"encoding"=> nil, "confidence"=> 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Consumes one chunk of input. Does nothing once detection is finished
    # or when +data+ is nil or empty.
    #
    # On the first chunk a leading byte-order mark decides the encoding
    # outright. Otherwise the chunk moves the input state from :PureAscii to
    # :Highbyte (a byte >= 0x80 was seen) or :EscAscii (ESC or "~{" was
    # seen) and is handed to the matching prober(s).
    def feed(data)
      return if @done || data.nil? || data.empty?

      # Work on raw bytes so slicing, comparison and regexp matching do not
      # depend on the encoding the caller's String happens to be tagged with.
      data = binary_view(data)

      unless @_mGotData
        # If the data starts with BOM, we know it is UTF
        _bom, name = BOMS.find { |bytes, _name| data[0, bytes.length] == bytes }
        @result = {"encoding"=> name, "confidence"=> 1.0} if name
      end
      @_mGotData = true
      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      if @_mInputState == :PureAscii
        if data =~ @_highBitDetector
          @_mInputState = :Highbyte
        elsif (@_mLastChar + data) =~ @_escDetector
          # The previous chunk's last byte is prepended so that a "~{"
          # split across two chunks is still seen.
          @_mInputState = :EscAscii
        end
      end

      # [-1, 1] always yields a String (plain [-1] is an Integer on Ruby 1.8).
      @_mLastChar = data[-1, 1]

      case @_mInputState
      when :EscAscii then feed_escape_prober(data)
      when :Highbyte then feed_charset_probers(data)
      end
    end #feed

    # Finishes detection and returns the result hash.
    #
    # Pure-ASCII input is reported as "ascii" with confidence 1.0. For
    # high-byte input the prober with the highest confidence is used,
    # provided it exceeds MINIMUM_THRESHOLD. In every other case the result
    # is left as it was.
    def close
      return @result if @done
      unless @_mGotData
        p("no data received!\n") if DEBUG
        return @result
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        best_prober = nil
        best_confidence = 0.0
        @_mCharSetProbers.compact.each do |prober|
          confidence = prober.get_confidence
          # Strict comparison: on a tie the earlier prober is kept.
          if confidence > best_confidence
            best_confidence = confidence
            best_prober = prober
          end
        end
        if best_prober && best_confidence > MINIMUM_THRESHOLD
          @result = {"encoding" => best_prober.get_charset_name,
                     "confidence" => best_prober.get_confidence}
          return @result
        end
      end #if

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.compact.each do |prober|
          p("%s confidence = %s\n" % [prober.get_charset_name, prober.get_confidence])
        end
      end
      @result
    end #close

    private

    # Returns +data+ as a byte string. On Rubies with String#force_encoding a
    # binary-tagged copy is returned, so the caller's object is never
    # mutated; on older Rubies strings are already bytes.
    def binary_view(data)
      return data unless data.respond_to?(:force_encoding)
      data.dup.force_encoding("BINARY")
    end

    # Hands +data+ to the escape-sequence prober, creating it on first use.
    # NOTE(review): assumes EscCharSetProber#feed reports success as
    # :FoundIt, the value the high-byte branch tests for -- confirm against
    # EscCharSetProber. The original compared against the undefined
    # `constants.eFoundIt`, which raised NameError.
    def feed_escape_prober(data)
      @_mEscCharSetProber ||= EscCharSetProber.new
      return unless @_mEscCharSetProber.feed(data) == :FoundIt
      @result = {"encoding"=> @_mEscCharSetProber.get_charset_name,
                 "confidence"=> @_mEscCharSetProber.get_confidence}
      @done = true
    end

    # Hands +data+ to each multi-byte, single-byte and Latin-1 prober in
    # turn, creating them on first use, and stops at the first prober that
    # reports :FoundIt.
    def feed_charset_probers(data)
      if @_mCharSetProbers.empty?
        @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
      end
      @_mCharSetProbers.each do |prober|
        next unless prober.feed(data) == :FoundIt
        @result = {"encoding"=> prober.get_charset_name, "confidence"=> prober.get_confidence}
        @done = true
        break
      end
    end
  end #class

end #module
@@ -0,0 +1,299 @@
1
/* ----- page skeleton ----- */
html {
  margin: 0;
  padding: 0;
}

body {
  background-color: #fff;
  color: #333;
  font-family: 'Lucida Grande', Verdana, Geneva, Lucida, Helvetica, sans-serif;
  font-size: 100%;
  margin: 10px;
  padding: 0;
}

/* ----- links -----
   NOTE(review): several rules in this file declare a property with
   !important and then redeclare the same property without it. This looks
   like the old IE6 !important override hack (standards browsers honour the
   !important line, IE6 the later one) -- confirm before removing either
   line. Declaration order is preserved for that reason. */
a:link,
a:visited {
  background-color: transparent;
  color: #333;
  text-decoration: none !important;
  border-bottom: 1px dotted #333 !important;
  text-decoration: underline;
  border-bottom: 0;
}

a:hover {
  background-color: transparent;
  color: #993344;
  text-decoration: none !important;
  text-decoration: underline;
  border-bottom: 1px dotted #993344 !important;
  border-bottom: 0;
}

/* ----- main heading ----- */
h1 {
  margin: 8px 0 0 0;
  padding: 0;
  font-variant: small-caps;
  letter-spacing: 0.1em;
  font-family: "Book Antiqua", Georgia, Palatino, Times, "Times New Roman", serif;
}

/* Links inside the heading drop the link colour change and dotted border. */
h1 a:link,
h1 a:visited,
h1 a:hover {
  background-color: transparent !important;
  color: #333 !important;
  text-decoration: none !important;
  border-bottom: 0 !important;
}

/* ----- intro block ----- */
#intro {
  width: 730px;
}

#intro ul {
  margin-left: 0;
  padding-left: 0;
  display: inline;
}

#intro ul li {
  display: inline;
  font-size: small;
}

/* Empty in the original; kept so the selector stays defined. */
#intro ul li.li1 {
}

#intro p {
  font-size: small;
  font-weight: normal;
  margin: 1.2em 0 0 0;
  padding: 0;
}

/* ----- floated box with image-based shadow ----- */
.z {
  float: left;
  background: url(/img/shadowAlpha.png) no-repeat bottom right !important;
  background: url(/img/shadow.gif) no-repeat bottom right;
  margin: 15px 0 0 10px !important;
  margin: 15px 0 0 5px;
}

.z .sectionInner {
  background: none !important;
  background: url(/img/shadow2.gif) no-repeat left top;
  padding: 0 !important;
  padding: 0 6px 6px 0;
}

.z .sectionInner .sectionInner2 {
  background-color: #fff;
  border: 1px solid #a9a9a9;
  padding: 4px;
  margin: -6px 6px 6px -6px !important;
  margin: 0;
}

/* ----- content containers ----- */
.s {
  margin-left: 1em;
  margin-right: 1em;
  margin-bottom: 1em;
}

#main {
  clear: left;
  margin-left: 11px;
  margin-bottom: 2em;
  font-size: small;
}

#mainInner {
  margin-left: 1em;
  margin-bottom: 2em;
  padding-top: 1em;
}

.footernavigation {
  clear: both;
  font-size: small;
  padding-bottom: 1em;
  margin-bottom: 0;
}

.example,
.section,
.appendix {
  line-height: 150%;
}

/* ----- breadcrumb trail ----- */
#breadcrumb {
  width: 100%;
  margin: 0 0 1em 0;
  padding: 0;
  line-height: 140%;
  font-size: small;
}

#breadcrumb #thispage {
  font-weight: bold;
}
137
+
138
/* ----- Python code syntax coloring ----- */

/* Every highlighted span sits on a white background. */
.computeroutput,
.traceback,
.pykeyword,
.pystring,
.pycomment,
.pyfunction,
.pyclass {
  background-color: white;
}

/* Keywords and definition names are bold. */
.pykeyword,
.pyfunction,
.pyclass {
  font-weight: bold;
}

/* One colour per token class. */
.computeroutput {
  color: teal;
}

.traceback {
  color: red;
}

.pykeyword {
  color: navy;
}

.pystring {
  color: olive;
}

.pycomment {
  color: green;
  font-style: italic;
}

.pyfunction {
  color: teal;
}

.pyclass {
  color: blue;
}
175
+
176
/* ----- standard stuff ----- */
.skip {
  display: none;
}

samp,
code,
tt,
pre {
  font-weight: normal;
  font-family: monospace;
  font-size: small;
}

img {
  border: 0;
}

acronym,
abbr {
  /* border-bottom: 1px dotted #333;*/
  border-bottom: 0;
  font-style: normal;
  cursor: help;
}

/* NOTE(review): an !important declaration followed by a plain redeclaration
   of the same property (here and in `body.docs .example`) looks like the old
   IE6 !important override hack -- confirm before removing either line. */
hr {
  clear: both;
  margin-top: 2em !important;
  margin-top: 1em;
  height: 1px;
  background-color: #cecbc6;
  color: #cecbc6;
}

#footer {
  text-align: center;
  font-size: x-small;
}

/* ----- documentation pages: examples ----- */
body.docs .example {
  border-left: 4px double #ddd !important;
  border-left: 1px solid #ccc;
  margin-left: 2em;
  padding-left: 2em;
}

body.docs .example h3 {
  font-size: 100%;
}

/* Inside examples and section/appendix titles, .skip links are shown again
   (overriding the display: none above) as left-floated blocks. */
body.docs .example a.skip:link,
body.docs .example a.skip:visited,
body.docs .example a.skip:hover,
body.docs .section h3.title a.skip:link,
body.docs .section h3.title a.skip:visited,
body.docs .section h3.title a.skip:hover,
body.docs .appendix h3.title a.skip:link,
body.docs .appendix h3.title a.skip:visited,
body.docs .appendix h3.title a.skip:hover {
  display: block;
  float: left;
  vertical-align: bottom;
  text-decoration: none;
  border-bottom: 0 !important;
  margin-right: 6px;
}

/* ----- bordered side boxes: the h3 is pulled up and left over the border ----- */
.reference-from h3,
.seealso h3,
.furtherreading h3 {
  margin-top: -1.2em;
  margin-left: -15px;
  font-size: small;
  width: 8em;
  border: 1px solid #a9a9a9;
  padding: 3px 3px 3px 13px;
  background: white;
  position: relative;
}

.reference-from,
.seealso,
.furtherreading {
  width: 680px;
  margin-top: 3em;
  margin-bottom: 3em;
  border: 1px solid #a9a9a9;
}

/* ----- admonition and data tables ----- */
table.tip,
table.note,
table.warning,
table.caution,
table.important {
  margin-bottom: 1em;
}

.table h3 {
  display: none;
}

.table table td {
  padding: 5px 1em 5px 1em;
}

/* ----- download box: same bordered-box pattern as the side boxes ----- */
div.download {
  width: 708px;
  margin-top: 3em;
  margin-bottom: 3em;
  border: 1px solid #a9a9a9;
}

div.download h3 {
  margin-top: -1.2em;
  margin-left: -15px;
  font-size: small;
  width: 10em;
  border: 1px solid #a9a9a9;
  padding: 3px 3px 3px 13px;
  background-color: #fff;
  color: #222;
  position: relative;
}

div.download p {
  margin-left: 1em;
}

div.download ul {
  list-style: none;
  padding-left: 1em;
  margin-left: 0;
}