chardet 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +504 -0
- data/README +12 -0
- data/lib/Big5Freq.rb +913 -0
- data/lib/Big5Prober.rb +48 -0
- data/lib/CharDistributionAnalysis.rb +245 -0
- data/lib/CharSetGroupProber.rb +114 -0
- data/lib/CharSetProber.rb +70 -0
- data/lib/CodingStateMachine.rb +74 -0
- data/lib/ESCSM.rb +242 -0
- data/lib/EUCJPProber.rb +97 -0
- data/lib/EUCKRFreq.rb +600 -0
- data/lib/EUCKRProber.rb +48 -0
- data/lib/EUCTWFreq.rb +432 -0
- data/lib/EUCTWProber.rb +48 -0
- data/lib/EscCharSetProber.rb +94 -0
- data/lib/GB2312Freq.rb +475 -0
- data/lib/GB2312Prober.rb +48 -0
- data/lib/HebrewProber.rb +292 -0
- data/lib/JISFreq.rb +573 -0
- data/lib/JapaneseContextAnalysis.rb +234 -0
- data/lib/LangBulgarianModel.rb +231 -0
- data/lib/LangCyrillicModel.rb +332 -0
- data/lib/LangGreekModel.rb +229 -0
- data/lib/LangHebrewModel.rb +202 -0
- data/lib/LangHungarianModel.rb +228 -0
- data/lib/LangThaiModel.rb +203 -0
- data/lib/Latin1Prober.rb +160 -0
- data/lib/MBCSGroupProber.rb +57 -0
- data/lib/MBCSSM.rb +513 -0
- data/lib/MultiByteCharSetProber.rb +94 -0
- data/lib/SBCSGroupProber.rb +71 -0
- data/lib/SJISProber.rb +99 -0
- data/lib/SingleByteCharSetProber.rb +131 -0
- data/lib/UTF8Prober.rb +91 -0
- data/lib/UniversalDetector.rb +209 -0
- data/python-docs/css/chardet.css +299 -0
- data/python-docs/faq.html +107 -0
- data/python-docs/how-it-works.html +113 -0
- data/python-docs/images/caution.png +0 -0
- data/python-docs/images/important.png +0 -0
- data/python-docs/images/note.png +0 -0
- data/python-docs/images/permalink.gif +0 -0
- data/python-docs/images/tip.png +0 -0
- data/python-docs/images/warning.png +0 -0
- data/python-docs/index.html +73 -0
- data/python-docs/license.html +62 -0
- data/python-docs/supported-encodings.html +86 -0
- data/python-docs/usage.html +107 -0
- metadata +86 -0
@@ -0,0 +1,107 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
5
|
+
<title>Usage [Universal Encoding Detector]</title>
|
6
|
+
<link rel="stylesheet" href="css/chardet.css" type="text/css">
|
7
|
+
<link rev="made" href="mailto:mark@diveintomark.org">
|
8
|
+
<meta name="generator" content="DocBook XSL Stylesheets V1.65.1">
|
9
|
+
<meta name="keywords" content="character, set, encoding, detection, Python, XML, feed">
|
10
|
+
<link rel="start" href="index.html" title="Documentation">
|
11
|
+
<link rel="up" href="index.html" title="Documentation">
|
12
|
+
<link rel="prev" href="supported-encodings.html" title="Supported encodings">
|
13
|
+
<link rel="next" href="how-it-works.html" title="How it works">
|
14
|
+
</head>
|
15
|
+
<body id="chardet-feedparser-org" class="docs">
|
16
|
+
<div class="z" id="intro"><div class="sectionInner"><div class="sectionInner2">
|
17
|
+
<div class="s" id="pageHeader">
|
18
|
+
<h1><a href="/">Universal Encoding Detector</a></h1>
|
19
|
+
<p>Character encoding auto-detection in Python. As smart as your browser. Open source.</p>
|
20
|
+
</div>
|
21
|
+
<div class="s" id="quickSummary"><ul>
|
22
|
+
<li class="li1">
|
23
|
+
<a href="http://chardet.feedparser.org/download/">Download</a> ·</li>
|
24
|
+
<li class="li2">
|
25
|
+
<a href="index.html">Documentation</a> ·</li>
|
26
|
+
<li class="li3"><a href="faq.html" title="Frequently Asked Questions">FAQ</a></li>
|
27
|
+
</ul></div>
|
28
|
+
</div></div></div>
|
29
|
+
<div id="main"><div id="mainInner">
|
30
|
+
<p id="breadcrumb">You are here: <a href="index.html">Documentation</a> → <span class="thispage">Usage</span></p>
|
31
|
+
<div class="section" lang="en">
|
32
|
+
<div class="titlepage">
|
33
|
+
<div><div><h2 class="title">
|
34
|
+
<a name="usage" class="skip" href="#usage" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Usage</h2></div></div>
|
35
|
+
<div></div>
|
36
|
+
</div>
|
37
|
+
<div class="section" lang="en">
|
38
|
+
<div class="titlepage">
|
39
|
+
<div><div><h3 class="title">
|
40
|
+
<a name="usage.basic" class="skip" href="#usage.basic" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Basic usage</h3></div></div>
|
41
|
+
<div></div>
|
42
|
+
</div>
|
43
|
+
<p>The easiest way to use the <span class="application">Universal Encoding Detector</span> library is with the <tt class="function">detect</tt> function.</p>
|
44
|
+
<div class="example">
|
45
|
+
<a name="example.basic.detect" class="skip" href="#example.basic.detect" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Using the <tt class="function">detect</tt> function</h3>
|
46
|
+
<p>The <tt class="function">detect</tt> function takes one argument, a non-Unicode string. It returns a dictionary containing the auto-detected character encoding and a confidence level from <tt class="constant">0</tt> to <tt class="constant">1</tt>.</p>
|
47
|
+
<pre class="screen"><tt class="prompt">>>> </tt><span class="userinput"><font color='navy'><b>import</b></font> urllib</span>
|
48
|
+
<tt class="prompt">>>> </tt><span class="userinput">rawdata = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>).read()</span>
|
49
|
+
<tt class="prompt">>>> </tt><span class="userinput"><font color='navy'><b>import</b></font> chardet</span>
|
50
|
+
<tt class="prompt">>>> </tt><span class="userinput">chardet.detect(rawdata)</span>
|
51
|
+
<span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
|
52
|
+
</div>
|
53
|
+
</div>
|
54
|
+
<div class="section" lang="en">
|
55
|
+
<div class="titlepage">
|
56
|
+
<div><div><h3 class="title">
|
57
|
+
<a name="usage.advanced" class="skip" href="#usage.advanced" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Advanced usage</h3></div></div>
|
58
|
+
<div></div>
|
59
|
+
</div>
|
60
|
+
<p>If you're dealing with a large amount of text, you can call the <span class="application">Universal Encoding Detector</span> library incrementally, and it will stop as soon as it is confident enough to report its results.</p>
|
61
|
+
<p>Create a <tt class="classname">UniversalDetector</tt> object, then call its <tt class="methodname">feed</tt> method repeatedly with each block of text. If the detector reaches a minimum threshold of confidence, it will set <tt class="varname">detector.done</tt> to <tt class="constant">True</tt>.</p>
|
62
|
+
<p>Once you've exhausted the source text, call <tt class="methodname">detector.close()</tt>, which will do some final calculations in case the detector didn't hit its minimum confidence threshold earlier. Then <tt class="varname">detector.result</tt> will be a dictionary containing the auto-detected character encoding and confidence level (the same as <a href="usage.html#example.basic.detect" title="Example: Using the detect function">the <tt class="function">chardet.detect</tt> function returns</a>).</p>
|
63
|
+
<div class="example">
|
64
|
+
<a name="example.multiline" class="skip" href="#example.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encoding incrementally</h3>
|
65
|
+
<pre class="programlisting python"><font color='navy'><b>import</b></font> urllib
|
66
|
+
<font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
|
67
|
+
|
68
|
+
usock = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>)
|
69
|
+
detector = UniversalDetector()
|
70
|
+
<font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> usock.readlines():
|
71
|
+
    detector.feed(line)
|
71
|
+
    <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
|
73
|
+
detector.close()
|
74
|
+
usock.close()
|
75
|
+
<font color='navy'><b>print</b></font> detector.result</pre>
|
76
|
+
<pre class="screen"><span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
|
77
|
+
</div>
|
78
|
+
<p>If you want to detect the encoding of multiple texts (such as separate files), you can re-use a single <tt class="classname">UniversalDetector</tt> object. Just call <tt class="methodname">detector.reset()</tt> at the start of each file, call <tt class="methodname">detector.feed</tt> as many times as you like, and then call <tt class="methodname">detector.close()</tt> and check the <tt class="varname">detector.result</tt> dictionary for the file's results.</p>
|
79
|
+
<div class="example">
|
80
|
+
<a name="advanced.multifile.multiline" class="skip" href="#advanced.multifile.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encodings of multiple files</h3>
|
81
|
+
<pre class="programlisting python"><font color='navy'><b>import</b></font> glob
|
82
|
+
<font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
|
83
|
+
|
84
|
+
detector = UniversalDetector()
|
85
|
+
<font color='navy'><b>for</b></font> filename <font color='navy'><b>in</b></font> glob.glob(<font color='olive'>'*.xml'</font>):
|
86
|
+
    <font color='navy'><b>print</b></font> filename.ljust(60),
|
87
|
+
    detector.reset()
|
88
|
+
    <font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> file(filename, <font color='olive'>'rb'</font>):
|
89
|
+
        detector.feed(line)
|
90
|
+
        <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
|
91
|
+
    detector.close()
|
92
|
+
    <font color='navy'><b>print</b></font> detector.result
|
93
|
+
</pre>
|
94
|
+
</div>
|
95
|
+
</div>
|
96
|
+
</div>
|
97
|
+
<div class="footernavigation">
|
98
|
+
<div style="float: left">← <a class="NavigationArrow" href="supported-encodings.html">Supported encodings</a>
|
99
|
+
</div>
|
100
|
+
<div style="text-align: right">
|
101
|
+
<a class="NavigationArrow" href="how-it-works.html">How it works</a> →</div>
|
102
|
+
</div>
|
103
|
+
<hr>
|
104
|
+
<div id="footer"><p class="copyright">Copyright © 2006 Mark Pilgrim · <a href="mailto:mark@diveintomark.org">mark@diveintomark.org</a> · <a href="license.html">Terms of use</a></p></div>
|
105
|
+
</div></div>
|
106
|
+
</body>
|
107
|
+
</html>
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.10
|
3
|
+
specification_version: 1
|
4
|
+
name: chardet
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
  version: 0.9.0
|
7
|
+
date: 2006-03-28
|
8
|
+
summary: "Character encoding auto-detection in Ruby. Based on Mark Pilgrim's Python port."
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: zhengzhengzheng@gmail.com
|
12
|
+
homepage: http://blog.vava.cn/
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire: UniversalDetector
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
  requirements:
|
21
|
+
    -
|
22
|
+
      - ">"
|
23
|
+
      - !ruby/object:Gem::Version
|
24
|
+
        version: 0.0.0
|
25
|
+
  version:
|
26
|
+
platform: ruby
|
27
|
+
authors:
|
28
|
+
- Hui
|
29
|
+
files:
|
30
|
+
- lib/Big5Freq.rb
|
31
|
+
- lib/Big5Prober.rb
|
32
|
+
- lib/CharDistributionAnalysis.rb
|
33
|
+
- lib/CharSetGroupProber.rb
|
34
|
+
- lib/CharSetProber.rb
|
35
|
+
- lib/CodingStateMachine.rb
|
36
|
+
- lib/EscCharSetProber.rb
|
37
|
+
- lib/ESCSM.rb
|
38
|
+
- lib/EUCJPProber.rb
|
39
|
+
- lib/EUCKRFreq.rb
|
40
|
+
- lib/EUCKRProber.rb
|
41
|
+
- lib/EUCTWFreq.rb
|
42
|
+
- lib/EUCTWProber.rb
|
43
|
+
- lib/GB2312Freq.rb
|
44
|
+
- lib/GB2312Prober.rb
|
45
|
+
- lib/HebrewProber.rb
|
46
|
+
- lib/JapaneseContextAnalysis.rb
|
47
|
+
- lib/JISFreq.rb
|
48
|
+
- lib/LangBulgarianModel.rb
|
49
|
+
- lib/LangCyrillicModel.rb
|
50
|
+
- lib/LangGreekModel.rb
|
51
|
+
- lib/LangHebrewModel.rb
|
52
|
+
- lib/LangHungarianModel.rb
|
53
|
+
- lib/LangThaiModel.rb
|
54
|
+
- lib/Latin1Prober.rb
|
55
|
+
- lib/MBCSGroupProber.rb
|
56
|
+
- lib/MBCSSM.rb
|
57
|
+
- lib/MultiByteCharSetProber.rb
|
58
|
+
- lib/SBCSGroupProber.rb
|
59
|
+
- lib/SingleByteCharSetProber.rb
|
60
|
+
- lib/SJISProber.rb
|
61
|
+
- lib/UniversalDetector.rb
|
62
|
+
- lib/UTF8Prober.rb
|
63
|
+
- python-docs/css
|
64
|
+
- python-docs/faq.html
|
65
|
+
- python-docs/how-it-works.html
|
66
|
+
- python-docs/images
|
67
|
+
- python-docs/index.html
|
68
|
+
- python-docs/license.html
|
69
|
+
- python-docs/supported-encodings.html
|
70
|
+
- python-docs/usage.html
|
71
|
+
- python-docs/css/chardet.css
|
72
|
+
- python-docs/images/caution.png
|
73
|
+
- python-docs/images/important.png
|
74
|
+
- python-docs/images/note.png
|
75
|
+
- python-docs/images/permalink.gif
|
76
|
+
- python-docs/images/tip.png
|
77
|
+
- python-docs/images/warning.png
|
78
|
+
- COPYING
|
79
|
+
- README
|
80
|
+
test_files: []
|
81
|
+
rdoc_options: []
|
82
|
+
extra_rdoc_files: []
|
83
|
+
executables: []
|
84
|
+
extensions: []
|
85
|
+
requirements: []
|
86
|
+
dependencies: []
|