chardet 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +504 -0
- data/README +12 -0
- data/lib/Big5Freq.rb +913 -0
- data/lib/Big5Prober.rb +48 -0
- data/lib/CharDistributionAnalysis.rb +245 -0
- data/lib/CharSetGroupProber.rb +114 -0
- data/lib/CharSetProber.rb +70 -0
- data/lib/CodingStateMachine.rb +74 -0
- data/lib/ESCSM.rb +242 -0
- data/lib/EUCJPProber.rb +97 -0
- data/lib/EUCKRFreq.rb +600 -0
- data/lib/EUCKRProber.rb +48 -0
- data/lib/EUCTWFreq.rb +432 -0
- data/lib/EUCTWProber.rb +48 -0
- data/lib/EscCharSetProber.rb +94 -0
- data/lib/GB2312Freq.rb +475 -0
- data/lib/GB2312Prober.rb +48 -0
- data/lib/HebrewProber.rb +292 -0
- data/lib/JISFreq.rb +573 -0
- data/lib/JapaneseContextAnalysis.rb +234 -0
- data/lib/LangBulgarianModel.rb +231 -0
- data/lib/LangCyrillicModel.rb +332 -0
- data/lib/LangGreekModel.rb +229 -0
- data/lib/LangHebrewModel.rb +202 -0
- data/lib/LangHungarianModel.rb +228 -0
- data/lib/LangThaiModel.rb +203 -0
- data/lib/Latin1Prober.rb +160 -0
- data/lib/MBCSGroupProber.rb +57 -0
- data/lib/MBCSSM.rb +513 -0
- data/lib/MultiByteCharSetProber.rb +94 -0
- data/lib/SBCSGroupProber.rb +71 -0
- data/lib/SJISProber.rb +99 -0
- data/lib/SingleByteCharSetProber.rb +131 -0
- data/lib/UTF8Prober.rb +91 -0
- data/lib/UniversalDetector.rb +209 -0
- data/python-docs/css/chardet.css +299 -0
- data/python-docs/faq.html +107 -0
- data/python-docs/how-it-works.html +113 -0
- data/python-docs/images/caution.png +0 -0
- data/python-docs/images/important.png +0 -0
- data/python-docs/images/note.png +0 -0
- data/python-docs/images/permalink.gif +0 -0
- data/python-docs/images/tip.png +0 -0
- data/python-docs/images/warning.png +0 -0
- data/python-docs/index.html +73 -0
- data/python-docs/license.html +62 -0
- data/python-docs/supported-encodings.html +86 -0
- data/python-docs/usage.html +107 -0
- metadata +86 -0
@@ -0,0 +1,107 @@
|
|
1
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
5
|
+
<title>Usage [Universal Encoding Detector]</title>
|
6
|
+
<link rel="stylesheet" href="css/chardet.css" type="text/css">
|
7
|
+
<link rev="made" href="mailto:mark@diveintomark.org">
|
8
|
+
<meta name="generator" content="DocBook XSL Stylesheets V1.65.1">
|
9
|
+
<meta name="keywords" content="character, set, encoding, detection, Python, XML, feed">
|
10
|
+
<link rel="start" href="index.html" title="Documentation">
|
11
|
+
<link rel="up" href="index.html" title="Documentation">
|
12
|
+
<link rel="prev" href="supported-encodings.html" title="Supported encodings">
|
13
|
+
<link rel="next" href="how-it-works.html" title="How it works">
|
14
|
+
</head>
|
15
|
+
<body id="chardet-feedparser-org" class="docs">
|
16
|
+
<div class="z" id="intro"><div class="sectionInner"><div class="sectionInner2">
|
17
|
+
<div class="s" id="pageHeader">
|
18
|
+
<h1><a href="/">Universal Encoding Detector</a></h1>
|
19
|
+
<p>Character encoding auto-detection in Python. As smart as your browser. Open source.</p>
|
20
|
+
</div>
|
21
|
+
<div class="s" id="quickSummary"><ul>
|
22
|
+
<li class="li1">
|
23
|
+
<a href="http://chardet.feedparser.org/download/">Download</a> ·</li>
|
24
|
+
<li class="li2">
|
25
|
+
<a href="index.html">Documentation</a> ·</li>
|
26
|
+
<li class="li3"><a href="faq.html" title="Frequently Asked Questions">FAQ</a></li>
|
27
|
+
</ul></div>
|
28
|
+
</div></div></div>
|
29
|
+
<div id="main"><div id="mainInner">
|
30
|
+
<p id="breadcrumb">You are here: <a href="index.html">Documentation</a> → <span class="thispage">Usage</span></p>
|
31
|
+
<div class="section" lang="en">
|
32
|
+
<div class="titlepage">
|
33
|
+
<div><div><h2 class="title">
|
34
|
+
<a name="usage" class="skip" href="#usage" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Usage</h2></div></div>
|
35
|
+
<div></div>
|
36
|
+
</div>
|
37
|
+
<div class="section" lang="en">
|
38
|
+
<div class="titlepage">
|
39
|
+
<div><div><h3 class="title">
|
40
|
+
<a name="usage.basic" class="skip" href="#usage.basic" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Basic usage</h3></div></div>
|
41
|
+
<div></div>
|
42
|
+
</div>
|
43
|
+
<p>The easiest way to use the <span class="application">Universal Encoding Detector</span> library is with the <tt class="function">detect</tt> function.</p>
|
44
|
+
<div class="example">
|
45
|
+
<a name="example.basic.detect" class="skip" href="#example.basic.detect" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Using the <tt class="function">detect</tt> function</h3>
|
46
|
+
<p>The <tt class="function">detect</tt> function takes one argument, a non-Unicode string. It returns a dictionary containing the auto-detected character encoding and a confidence level from <tt class="constant">0</tt> to <tt class="constant">1</tt>.</p>
|
47
|
+
<pre class="screen"><tt class="prompt">>>> </tt><span class="userinput"><font color='navy'><b>import</b></font> urllib</span>
|
48
|
+
<tt class="prompt">>>> </tt><span class="userinput">rawdata = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>).read()</span>
|
49
|
+
<tt class="prompt">>>> </tt><span class="userinput"><font color='navy'><b>import</b></font> chardet</span>
|
50
|
+
<tt class="prompt">>>> </tt><span class="userinput">chardet.detect(rawdata)</span>
|
51
|
+
<span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
|
52
|
+
</div>
|
53
|
+
</div>
|
54
|
+
<div class="section" lang="en">
|
55
|
+
<div class="titlepage">
|
56
|
+
<div><div><h3 class="title">
|
57
|
+
<a name="usage.advanced" class="skip" href="#usage.advanced" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Advanced usage</h3></div></div>
|
58
|
+
<div></div>
|
59
|
+
</div>
|
60
|
+
<p>If you're dealing with a large amount of text, you can call the <span class="application">Universal Encoding Detector</span> library incrementally, and it will stop as soon as it is confident enough to report its results.</p>
|
61
|
+
<p>Create a <tt class="classname">UniversalDetector</tt> object, then call its <tt class="methodname">feed</tt> method repeatedly with each block of text. If the detector reaches a minimum threshold of confidence, it will set <tt class="varname">detector.done</tt> to <tt class="constant">True</tt>.</p>
|
62
|
+
<p>Once you've exhausted the source text, call <tt class="methodname">detector.close()</tt>, which will do some final calculations in case the detector didn't hit its minimum confidence threshold earlier. Then <tt class="varname">detector.result</tt> will be a dictionary containing the auto-detected character encoding and confidence level (the same as <a href="usage.html#example.basic.detect" title="Example: Using the detect function">the <tt class="function">chardet.detect</tt> function returns</a>).</p>
|
63
|
+
<div class="example">
|
64
|
+
<a name="example.multiline" class="skip" href="#example.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encoding incrementally</h3>
|
65
|
+
<pre class="programlisting python"><font color='navy'><b>import</b></font> urllib
|
66
|
+
<font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
|
67
|
+
|
68
|
+
usock = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>)
|
69
|
+
detector = UniversalDetector()
|
70
|
+
<font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> usock.readlines():
|
71
|
+
    detector.feed(line)
|
72
|
+
    <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
|
73
|
+
detector.close()
|
74
|
+
usock.close()
|
75
|
+
<font color='navy'><b>print</b></font> detector.result</pre>
|
76
|
+
<pre class="screen"><span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
|
77
|
+
</div>
|
78
|
+
<p>If you want to detect the encoding of multiple texts (such as separate files), you can re-use a single <tt class="classname">UniversalDetector</tt> object. Just call <tt class="methodname">detector.reset()</tt> at the start of each file, call <tt class="methodname">detector.feed</tt> as many times as you like, and then call <tt class="methodname">detector.close()</tt> and check the <tt class="varname">detector.result</tt> dictionary for the file's results.</p>
|
79
|
+
<div class="example">
|
80
|
+
<a name="advanced.multifile.multiline" class="skip" href="#advanced.multifile.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encodings of multiple files</h3>
|
81
|
+
<pre class="programlisting python"><font color='navy'><b>import</b></font> glob
|
82
|
+
<font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
|
83
|
+
|
84
|
+
detector = UniversalDetector()
|
85
|
+
<font color='navy'><b>for</b></font> filename <font color='navy'><b>in</b></font> glob.glob(<font color='olive'>'*.xml'</font>):
|
86
|
+
    <font color='navy'><b>print</b></font> filename.ljust(60),
|
87
|
+
    detector.reset()
|
88
|
+
    <font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> file(filename, <font color='olive'>'rb'</font>):
|
89
|
+
        detector.feed(line)
|
90
|
+
        <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
|
91
|
+
    detector.close()
|
92
|
+
    <font color='navy'><b>print</b></font> detector.result
|
93
|
+
</pre>
|
94
|
+
</div>
|
95
|
+
</div>
|
96
|
+
</div>
|
97
|
+
<div class="footernavigation">
|
98
|
+
<div style="float: left">← <a class="NavigationArrow" href="supported-encodings.html">Supported encodings</a>
|
99
|
+
</div>
|
100
|
+
<div style="text-align: right">
|
101
|
+
<a class="NavigationArrow" href="how-it-works.html">How it works</a> →</div>
|
102
|
+
</div>
|
103
|
+
<hr>
|
104
|
+
<div id="footer"><p class="copyright">Copyright © 2006 Mark Pilgrim · <a href="mailto:mark@diveintomark.org">mark@diveintomark.org</a> · <a href="license.html">Terms of use</a></p></div>
|
105
|
+
</div></div>
|
106
|
+
</body>
|
107
|
+
</html>
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.10
|
3
|
+
specification_version: 1
|
4
|
+
name: chardet
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.9.0
|
7
|
+
date: 2006-03-28
|
8
|
+
summary: "Character encoding auto-detection in Ruby. Based on Mark Pilgrim's Python port."
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: zhengzhengzheng@gmail.com
|
12
|
+
homepage: http://blog.vava.cn/
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire: UniversalDetector
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
-
|
22
|
+
- ">"
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.0.0
|
25
|
+
version:
|
26
|
+
platform: ruby
|
27
|
+
authors:
|
28
|
+
- Hui
|
29
|
+
files:
|
30
|
+
- lib/Big5Freq.rb
|
31
|
+
- lib/Big5Prober.rb
|
32
|
+
- lib/CharDistributionAnalysis.rb
|
33
|
+
- lib/CharSetGroupProber.rb
|
34
|
+
- lib/CharSetProber.rb
|
35
|
+
- lib/CodingStateMachine.rb
|
36
|
+
- lib/EscCharSetProber.rb
|
37
|
+
- lib/ESCSM.rb
|
38
|
+
- lib/EUCJPProber.rb
|
39
|
+
- lib/EUCKRFreq.rb
|
40
|
+
- lib/EUCKRProber.rb
|
41
|
+
- lib/EUCTWFreq.rb
|
42
|
+
- lib/EUCTWProber.rb
|
43
|
+
- lib/GB2312Freq.rb
|
44
|
+
- lib/GB2312Prober.rb
|
45
|
+
- lib/HebrewProber.rb
|
46
|
+
- lib/JapaneseContextAnalysis.rb
|
47
|
+
- lib/JISFreq.rb
|
48
|
+
- lib/LangBulgarianModel.rb
|
49
|
+
- lib/LangCyrillicModel.rb
|
50
|
+
- lib/LangGreekModel.rb
|
51
|
+
- lib/LangHebrewModel.rb
|
52
|
+
- lib/LangHungarianModel.rb
|
53
|
+
- lib/LangThaiModel.rb
|
54
|
+
- lib/Latin1Prober.rb
|
55
|
+
- lib/MBCSGroupProber.rb
|
56
|
+
- lib/MBCSSM.rb
|
57
|
+
- lib/MultiByteCharSetProber.rb
|
58
|
+
- lib/SBCSGroupProber.rb
|
59
|
+
- lib/SingleByteCharSetProber.rb
|
60
|
+
- lib/SJISProber.rb
|
61
|
+
- lib/UniversalDetector.rb
|
62
|
+
- lib/UTF8Prober.rb
|
63
|
+
- python-docs/css
|
64
|
+
- python-docs/faq.html
|
65
|
+
- python-docs/how-it-works.html
|
66
|
+
- python-docs/images
|
67
|
+
- python-docs/index.html
|
68
|
+
- python-docs/license.html
|
69
|
+
- python-docs/supported-encodings.html
|
70
|
+
- python-docs/usage.html
|
71
|
+
- python-docs/css/chardet.css
|
72
|
+
- python-docs/images/caution.png
|
73
|
+
- python-docs/images/important.png
|
74
|
+
- python-docs/images/note.png
|
75
|
+
- python-docs/images/permalink.gif
|
76
|
+
- python-docs/images/tip.png
|
77
|
+
- python-docs/images/warning.png
|
78
|
+
- COPYING
|
79
|
+
- README
|
80
|
+
test_files: []
|
81
|
+
rdoc_options: []
|
82
|
+
extra_rdoc_files: []
|
83
|
+
executables: []
|
84
|
+
extensions: []
|
85
|
+
requirements: []
|
86
|
+
dependencies: []
|