chardet 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,209 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require "EscCharSetProber"
30
+ require "MBCSGroupProber"
31
+ require "SBCSGroupProber"
32
+ require "Latin1Prober"
33
+ require "singleton"
34
+
35
# Character-encoding auto-detection (Ruby port of the mozilla.org universal
# charset detector, by way of the Python chardet port).
#
# Typical use:
#
#   UniversalDetector.encoding(bytes)  # => "UTF-8", "ascii", ... or nil
#   UniversalDetector.chardet(bytes)   # => {"encoding" => ..., "confidence" => ...}
module UniversalDetector

  class << self
    # Returns just the detected encoding name for +data+, or nil when no
    # encoding was determined.
    def encoding(data)
      chardet(data)['encoding']
    end

    # Runs a complete reset/feed/close cycle over +data+ and returns the
    # result hash with the keys "encoding" and "confidence".
    #
    # The shared Detector singleton is reset and reused on every call.
    # NOTE(review): because that state is shared, concurrent callers would
    # presumably interfere with each other - confirm before using from threads.
    def chardet(data)
      detector = UniversalDetector::Detector.instance
      detector.reset
      detector.feed(data)
      detector.close
      detector.result
    end
  end

  # Set to a truthy value to print diagnostics from Detector#close.
  DEBUG = nil

  # Prober states. This file itself compares prober results against the
  # symbol :FoundIt rather than these integers.
  Detecting = 0
  Detectiong = 0 # original (misspelled) name, kept so existing references still resolve
  FoundIt = 1
  NotMe = 2

  # State-machine states.
  Start = 0
  Error = 1
  ItsMe = 2

  # A prober must report more than this confidence in Detector#close to win.
  MINIMUM_THRESHOLD = 0.20

  # Input states. Detector tracks its state with the symbols :PureAscii,
  # :EscAscii and :Highbyte rather than these integers.
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  SHORTCUT_THRESHOLD = 0.95

  # Incremental detector: call #reset, then #feed one or more times, then
  # #close, and read #result.
  class Detector

    include Singleton

    # Byte-order marks as [leading bytes, encoding name]. Order matters: the
    # four-byte marks must be tried before the two-byte UTF-16 marks that are
    # prefixes of them.
    BOM_TABLE = [
      [[0xEF, 0xBB, 0xBF],       "UTF-8"],
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"],
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"],
      [[0xFF, 0xFE],             "UTF-16LE"],
      [[0xFE, 0xFF],             "UTF-16BE"]
    ].freeze

    # ESC, or the two-character sequence "~{", switches the detector to the
    # escape-sequence prober.
    ESC_SEQUENCE = /\033|~\{/

    # Hash with the keys "encoding" (String or nil) and "confidence" (Float).
    attr_reader :result

    def initialize
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-input state, and resets every prober
    # that has already been created, so the instance can be reused.
    def reset
      @result = {"encoding" => nil, "confidence" => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of raw bytes to the detector.
    #
    # Does nothing once detection has finished, or when +data+ is nil or
    # empty. The first non-empty chunk is checked for a byte-order mark; a
    # match ends detection with confidence 1.0. Otherwise the chunk is
    # passed to the escape-sequence prober or to the multi-byte/single-byte
    # probers, depending on the input state.
    def feed(data)
      return if @done || data.nil? || data.empty?

      unless @_mGotData
        # If the data starts with a BOM, we know it is UTF.
        bom_encoding = detect_bom(data)
        @result = {"encoding" => bom_encoding, "confidence" => 1.0} if bom_encoding
      end
      @_mGotData = true
      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      if @_mInputState == :PureAscii
        if high_byte?(data)
          @_mInputState = :Highbyte
        elsif (@_mLastChar + data) =~ ESC_SEQUENCE
          # The previous chunk's last character is prepended so that a "~{"
          # split across two chunks is still seen.
          @_mInputState = :EscAscii
        end
      end

      # Slice with a length so this is a one-character String on every Ruby
      # version (a bare data[-1] is an Integer on Ruby 1.8).
      @_mLastChar = data[-1, 1]

      if @_mInputState == :EscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new
        # NOTE(review): assumes EscCharSetProber#feed reports success with the
        # symbol :FoundIt, as the probers in the :Highbyte branch do - confirm.
        if @_mEscCharSetProber.feed(data) == :FoundIt
          @result = {"encoding" => @_mEscCharSetProber.get_charset_name,
                     "confidence" => @_mEscCharSetProber.get_confidence}
          @done = true
        end
      elsif @_mInputState == :Highbyte
        if @_mCharSetProbers.empty?
          @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
        end
        @_mCharSetProbers.each do |prober|
          next unless prober.feed(data) == :FoundIt
          @result = {"encoding" => prober.get_charset_name,
                     "confidence" => prober.get_confidence}
          @done = true
          break
        end
      end
    end #feed

    # Finishes detection after the last #feed.
    #
    # Pure-ASCII input is reported as "ascii" with confidence 1.0. For
    # high-byte input the prober with the highest confidence wins, provided
    # that confidence exceeds MINIMUM_THRESHOLD. Returns the result hash in
    # those two cases; otherwise #result is left unchanged.
    def close
      return if @done
      unless @_mGotData
        p("no data received!\n") if DEBUG
        return
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        best_prober = nil
        best_confidence = 0.0
        @_mCharSetProbers.each do |prober|
          next unless prober
          confidence = prober.get_confidence
          if confidence > best_confidence
            best_confidence = confidence
            best_prober = prober
          end
        end
        if best_prober && best_confidence > MINIMUM_THRESHOLD
          @result = {"encoding" => best_prober.get_charset_name,
                     "confidence" => best_prober.get_confidence}
          return @result
        end
      end #if

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.each do |prober|
          next unless prober
          p("%s confidence = %s\n" % [prober.get_charset_name, prober.get_confidence])
        end
      end
    end #close

    private

    # Returns the encoding name for the byte-order mark at the start of
    # +data+, or nil when there is none. Works on raw byte values so the
    # outcome does not depend on the String's encoding tag.
    def detect_bom(data)
      head = data.unpack("C4")
      BOM_TABLE.each do |signature, name|
        return name if head[0, signature.length] == signature
      end
      nil
    end

    # True when +data+ contains at least one byte in the range 0x80..0xFF.
    def high_byte?(data)
      data.each_byte { |byte| return true if byte > 0x7F }
      false
    end
  end #class

end #module
@@ -0,0 +1,299 @@
1
+ html { /* root element: remove default margin and padding */
2
+ margin: 0;
3
+ padding: 0;
4
+ }
5
+
6
+ body { /* page defaults: dark grey text on white, sans-serif stack, 10px outer margin */
7
+ background-color: #fff;
8
+ color: #333;
9
+ font-family: 'Lucida Grande', Verdana, Geneva, Lucida, Helvetica, sans-serif;
10
+ font-size: 100%;
11
+ margin: 10px;
12
+ padding: 0;
13
+ }
14
+
15
+ a:link, a:visited { /* links: text colour with a dotted bottom border in place of an underline */
16
+ background-color: transparent;
17
+ color: #333;
18
+ text-decoration: none !important;
19
+ border-bottom: 1px dotted #333 !important;
20
+ text-decoration: underline; /* NOTE(review): same property repeated without !important - looks like a fallback for old browsers that mishandle !important; confirm before deduplicating */
21
+ border-bottom: 0; /* paired fallback for the border-bottom declaration above */
22
+ }
23
+
24
+ a:hover { /* hovered links: same treatment in #993344 */
25
+ background-color: transparent;
26
+ color: #993344;
27
+ text-decoration: none !important;
28
+ text-decoration: underline; /* plain repeat of the !important declaration above; see note on a:link */
29
+ border-bottom: 1px dotted #993344 !important;
30
+ border-bottom: 0; /* plain repeat of the !important declaration above */
31
+ }
32
+
33
+ h1 { /* page title: small caps, letter-spaced, serif stack */
33
+ margin: 8px 0 0 0;
34
+ padding: 0;
35
+ font-variant: small-caps;
36
+ letter-spacing: 0.1em;
37
+ font-family: "Book Antiqua", Georgia, Palatino, Times, "Times New Roman", serif;
38
+ }
39
+
40
+ h1 a:link, h1 a:visited, h1 a:hover { /* links inside the title: no underline, border or colour change, in every link state */
41
+ background-color: transparent ! important;
42
+ color: #333 ! important;
43
+ text-decoration: none ! important;
44
+ border-bottom: 0px ! important;
45
+ }
47
+
48
+ #intro { /* introduction block: fixed 730px width */
48
+ width: 730px;
49
+ }
50
+
51
+ #intro ul { /* list rendered inline with no left indent */
52
+ margin-left: 0;
53
+ padding-left: 0;
54
+ display: inline;
55
+ }
56
+
57
+ #intro ul li { /* list items flow inline in small text */
58
+ display: inline;
59
+ font-size: small;
60
+ }
61
+
62
+ #intro ul li.li1 { /* empty rule: declares nothing, so it has no effect as written */
63
+ }
64
+
65
+ #intro p { /* intro paragraphs: small, normal weight, top margin only */
66
+ font-size: small;
67
+ font-weight: normal;
68
+ margin: 1.2em 0 0 0;
69
+ padding: 0;
70
+ }
72
+
73
+ .z { /* left-floated box with a shadow image anchored bottom right */
73
+ float:left;
74
+ background: url(/img/shadowAlpha.png) no-repeat bottom right !important; /* NOTE(review): root-relative /img/ URL; the package file list shows no shadow images under python-docs/images - confirm these resolve */
75
+ background: url(/img/shadow.gif) no-repeat bottom right; /* plain repeat of the !important declaration above (GIF instead of PNG); presumably a fallback for old browsers - confirm */
76
+ margin: 15px 0 0 10px !important;
77
+ margin: 15px 0 0 5px; /* plain repeat with a 5px left margin instead of 10px */
78
+ }
79
+
80
+ .z .sectionInner { /* inner wrapper: the !important values give no background and no padding; the plain repeats use a second shadow image and 6px padding */
81
+ background: none !important;
82
+ background: url(/img/shadow2.gif) no-repeat left top;
83
+ padding: 0 !important;
84
+ padding: 0 6px 6px 0;
85
+ }
86
+
87
+ .z .sectionInner .sectionInner2 { /* content panel: white, grey border, shifted up and left by 6px via negative margins */
88
+ background-color: #fff;
89
+ border: 1px solid #a9a9a9;
90
+ padding: 4px;
91
+ margin: -6px 6px 6px -6px !important;
92
+ margin: 0; /* plain repeat without the offset */
93
+ }
95
+
96
+ .s { /* 1em margin on the left, right and bottom */
96
+ margin-left: 1em;
97
+ margin-right: 1em;
98
+ margin-bottom: 1em;
99
+ }
100
+
101
+ #main { /* main column: clears left floats, small text */
102
+ clear: left;
103
+ margin-left: 11px;
104
+ margin-bottom: 2em;
105
+ font-size: small;
106
+ }
107
+
108
+ #mainInner { /* inner spacing for the main column */
109
+ margin-left: 1em;
110
+ margin-bottom: 2em;
111
+ padding-top: 1em;
112
+ }
113
+
114
+ .footernavigation { /* footer navigation: clears floats on both sides */
115
+ clear: both;
116
+ font-size: small;
117
+ padding-bottom: 1em;
118
+ margin-bottom: 0;
119
+ }
120
+
121
+ .example, .section, .appendix { /* content blocks use 150% line height */
122
+ line-height: 150%;
123
+ }
124
+
125
+ #breadcrumb { /* breadcrumb trail: full width, small text */
126
+ width: 100%;
127
+ margin: 0 0 1em 0;
128
+ padding: 0;
129
+ line-height: 140%;
130
+ font-size: small;
131
+ }
132
+
133
+ #breadcrumb #thispage { /* entry with id "thispage" inside the breadcrumb is bold */
134
+ font-weight: bold;
135
+ }
137
+
138
+ /* ----- Python code syntax coloring ----- */
139
+ .computeroutput, .traceback, .pykeyword, .pystring, .pycomment, .pyfunction, .pyclass { /* all highlighted spans share a white background */
140
+ background-color: white;
141
+ }
142
+
143
+ .pykeyword, .pyfunction, .pyclass { /* keywords and function/class names are bold */
144
+ font-weight: bold;
145
+ }
146
+
147
+ .computeroutput { /* teal */
148
+ color: teal;
149
+ }
150
+
151
+ .traceback { /* red */
152
+ color: red;
153
+ }
154
+
155
+ .pykeyword { /* navy */
156
+ color: navy;
157
+ }
158
+
159
+ .pystring { /* olive */
160
+ color: olive;
161
+ }
162
+
163
+ .pycomment { /* green italics */
164
+ color: green;
165
+ font-style: italic;
166
+ }
167
+
168
+ .pyfunction { /* teal, same colour as .computeroutput */
169
+ color: teal;
170
+ }
171
+
172
+ .pyclass { /* blue */
173
+ color: blue;
174
+ }
175
+
176
+ /* ----- standard stuff ----- */
177
+ .skip { /* hidden by default; the body.docs a.skip rules further down set display: block for anchors in examples and titles */
178
+ display: none;
179
+ }
180
+
181
+ samp, code, tt, pre { /* code-like elements: small monospace, normal weight */
182
+ font-weight: normal;
183
+ font-family: monospace;
184
+ font-size: small;
185
+ }
186
+
187
+ img { /* no border on images */
188
+ border: 0;
189
+ }
190
+
191
+ acronym, abbr { /* abbreviations: help cursor, no bottom border (the dotted border is commented out) */
192
+ /* border-bottom: 1px dotted #333;*/
193
+ border-bottom: 0;
194
+ font-style: normal;
195
+ cursor: help;
196
+ }
197
+
198
+ hr { /* 1px rule in #cecbc6 that clears floats */
199
+ clear: both;
200
+ margin-top: 2em !important;
201
+ margin-top: 1em; /* plain repeat of the !important declaration above with a smaller value; presumably a fallback for old browsers - confirm */
202
+ height: 1px;
203
+ background-color: #cecbc6;
204
+ color: #cecbc6;
205
+ }
206
+
207
+ #footer { /* centred, extra-small footer text */
208
+ text-align: center;
209
+ font-size: x-small;
210
+ }
211
+
212
+ body.docs .example { /* examples on docs pages: indented, with a rule down the left edge */
213
+ border-left: 4px double #ddd !important;
214
+ border-left: 1px solid #ccc; /* plain repeat of the !important declaration above with a thinner solid line */
215
+ margin-left: 2em;
216
+ padding-left: 2em;
217
+ }
218
+
219
+ body.docs .example h3 { /* example headings at body text size */
220
+ font-size: 100%;
221
+ }
222
+
223
+ body.docs .example a.skip:link,
224
+ body.docs .example a.skip:visited,
225
+ body.docs .example a.skip:hover,
226
+ body.docs .section h3.title a.skip:link,
227
+ body.docs .section h3.title a.skip:visited,
228
+ body.docs .section h3.title a.skip:hover,
229
+ body.docs .appendix h3.title a.skip:link,
230
+ body.docs .appendix h3.title a.skip:visited,
231
+ body.docs .appendix h3.title a.skip:hover { /* a.skip anchors in examples and section/appendix titles: shown as left-floated blocks with no underline or border */
232
+ display: block;
233
+ float: left;
234
+ vertical-align: bottom;
235
+ text-decoration: none;
236
+ border-bottom: 0 ! important;
237
+ margin-right: 6px;
238
+ }
239
+
240
+ .reference-from h3,
241
+ .seealso h3,
242
+ .furtherreading h3 { /* box headings: negative top/left margins pull the bordered label up and left from its normal position */
243
+ margin-top: -1.2em;
244
+ margin-left: -15px;
245
+ font-size: small;
246
+ width: 8em;
247
+ border: 1px solid #a9a9a9;
248
+ padding: 3px 3px 3px 13px;
249
+ background: white;
250
+ position: relative;
251
+ }
252
+
253
+ .reference-from, .seealso, .furtherreading { /* the boxes themselves: 680px wide with a grey border */
254
+ width: 680px;
255
+ margin-top: 3em;
256
+ margin-bottom: 3em;
257
+ border: 1px solid #a9a9a9;
258
+ }
259
+
260
+ table.tip, table.note, table.warning, table.caution, table.important { /* admonition tables: space below */
261
+ margin-bottom: 1em;
262
+ }
263
+
264
+ .table h3 { /* headings inside .table wrappers are hidden */
265
+ display: none;
266
+ }
267
+
268
+ .table table td { /* cell padding for tables inside .table wrappers */
269
+ padding: 5px 1em 5px 1em;
270
+ }
271
+
272
+ div.download { /* download box: 708px wide with a grey border */
273
+ width: 708px;
274
+ margin-top: 3em;
275
+ margin-bottom: 3em;
276
+ border: 1px solid #a9a9a9;
277
+ }
278
+
279
+ div.download h3 { /* box heading: same negative-margin label treatment as the .seealso headings, but 10em wide */
280
+ margin-top: -1.2em;
281
+ margin-left: -15px;
282
+ font-size: small;
283
+ width: 10em;
284
+ border: 1px solid #a9a9a9;
285
+ padding: 3px 3px 3px 13px;
286
+ background-color: #fff;
287
+ color: #222;
288
+ position: relative;
289
+ }
290
+
291
+ div.download p { /* paragraphs indented 1em */
292
+ margin-left: 1em;
293
+ }
294
+
295
+ div.download ul { /* list without bullets, 1em left padding */
296
+ list-style: none;
297
+ padding-left: 1em;
298
+ margin-left: 0;
299
+ }