chardet 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,107 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
2
+ <html lang="en">
3
+ <head>
4
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
5
+ <title>Usage [Universal Encoding Detector]</title>
6
+ <link rel="stylesheet" href="css/chardet.css" type="text/css">
7
+ <link rev="made" href="mailto:mark@diveintomark.org">
8
+ <meta name="generator" content="DocBook XSL Stylesheets V1.65.1">
9
+ <meta name="keywords" content="character, set, encoding, detection, Python, XML, feed">
10
+ <link rel="start" href="index.html" title="Documentation">
11
+ <link rel="up" href="index.html" title="Documentation">
12
+ <link rel="prev" href="supported-encodings.html" title="Supported encodings">
13
+ <link rel="next" href="how-it-works.html" title="How it works">
14
+ </head>
15
+ <body id="chardet-feedparser-org" class="docs">
16
+ <div class="z" id="intro"><div class="sectionInner"><div class="sectionInner2">
17
+ <div class="s" id="pageHeader">
18
+ <h1><a href="/">Universal Encoding Detector</a></h1>
19
+ <p>Character encoding auto-detection in Python. As smart as your browser. Open source.</p>
20
+ </div>
21
+ <div class="s" id="quickSummary"><ul>
22
+ <li class="li1">
23
+ <a href="http://chardet.feedparser.org/download/">Download</a> ·</li>
24
+ <li class="li2">
25
+ <a href="index.html">Documentation</a> ·</li>
26
+ <li class="li3"><a href="faq.html" title="Frequently Asked Questions">FAQ</a></li>
27
+ </ul></div>
28
+ </div></div></div>
29
+ <div id="main"><div id="mainInner">
30
+ <p id="breadcrumb">You are here: <a href="index.html">Documentation</a> → <span class="thispage">Usage</span></p>
31
+ <div class="section" lang="en">
32
+ <div class="titlepage">
33
+ <div><div><h2 class="title">
34
+ <a name="usage" class="skip" href="#usage" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Usage</h2></div></div>
35
+ <div></div>
36
+ </div>
37
+ <div class="section" lang="en">
38
+ <div class="titlepage">
39
+ <div><div><h3 class="title">
40
+ <a name="usage.basic" class="skip" href="#usage.basic" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Basic usage</h3></div></div>
41
+ <div></div>
42
+ </div>
43
+ <p>The easiest way to use the <span class="application">Universal Encoding Detector</span> library is with the <tt class="function">detect</tt> function.</p>
44
+ <div class="example">
45
+ <a name="example.basic.detect" class="skip" href="#example.basic.detect" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Using the <tt class="function">detect</tt> function</h3>
46
+ <p>The <tt class="function">detect</tt> function takes one argument, a non-Unicode string. It returns a dictionary containing the auto-detected character encoding and a confidence level from <tt class="constant">0</tt> to <tt class="constant">1</tt>.</p>
47
+ <pre class="screen"><tt class="prompt">&gt;&gt;&gt; </tt><span class="userinput"><font color='navy'><b>import</b></font> urllib</span>
48
+ <tt class="prompt">&gt;&gt;&gt; </tt><span class="userinput">rawdata = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>).read()</span>
49
+ <tt class="prompt">&gt;&gt;&gt; </tt><span class="userinput"><font color='navy'><b>import</b></font> chardet</span>
50
+ <tt class="prompt">&gt;&gt;&gt; </tt><span class="userinput">chardet.detect(rawdata)</span>
51
+ <span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
52
+ </div>
53
+ </div>
54
+ <div class="section" lang="en">
55
+ <div class="titlepage">
56
+ <div><div><h3 class="title">
57
+ <a name="usage.advanced" class="skip" href="#usage.advanced" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Advanced usage</h3></div></div>
58
+ <div></div>
59
+ </div>
60
+ <p>If you're dealing with a large amount of text, you can call the <span class="application">Universal Encoding Detector</span> library incrementally, and it will stop as soon as it is confident enough to report its results.</p>
61
+ <p>Create a <tt class="classname">UniversalDetector</tt> object, then call its <tt class="methodname">feed</tt> method repeatedly with each block of text. If the detector reaches a minimum threshold of confidence, it will set <tt class="varname">detector.done</tt> to <tt class="constant">True</tt>.</p>
62
+ <p>Once you've exhausted the source text, call <tt class="methodname">detector.close()</tt>, which will do some final calculations in case the detector didn't hit its minimum confidence threshold earlier. Then <tt class="varname">detector.result</tt> will be a dictionary containing the auto-detected character encoding and confidence level (the same as <a href="usage.html#example.basic.detect" title="Example: Using the detect function">the <tt class="function">chardet.detect</tt> function returns</a>).</p>
63
+ <div class="example">
64
+ <a name="example.multiline" class="skip" href="#example.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encoding incrementally</h3>
65
+ <pre class="programlisting python"><font color='navy'><b>import</b></font> urllib
66
+ <font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
67
+
68
+ usock = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>)
69
+ detector = UniversalDetector()
70
+ <font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> usock.readlines():
71
+     detector.feed(line)
72
+     <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
73
+ detector.close()
74
+ usock.close()
75
+ <font color='navy'><b>print</b></font> detector.result</pre>
76
+ <pre class="screen"><span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
77
+ </div>
78
+ <p>If you want to detect the encoding of multiple texts (such as separate files), you can re-use a single <tt class="classname">UniversalDetector</tt> object. Just call <tt class="methodname">detector.reset()</tt> at the start of each file, call <tt class="methodname">detector.feed</tt> as many times as you like, and then call <tt class="methodname">detector.close()</tt> and check the <tt class="varname">detector.result</tt> dictionary for the file's results.</p>
79
+ <div class="example">
80
+ <a name="advanced.multifile.multiline" class="skip" href="#advanced.multifile.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encodings of multiple files</h3>
81
+ <pre class="programlisting python"><font color='navy'><b>import</b></font> glob
82
+ <font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
83
+
84
+ detector = UniversalDetector()
85
+ <font color='navy'><b>for</b></font> filename <font color='navy'><b>in</b></font> glob.glob(<font color='olive'>'*.xml'</font>):
86
+     <font color='navy'><b>print</b></font> filename.ljust(60),
87
+     detector.reset()
88
+     <font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> file(filename, <font color='olive'>'rb'</font>):
89
+         detector.feed(line)
90
+         <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
91
+     detector.close()
92
+     <font color='navy'><b>print</b></font> detector.result
93
+ </pre>
94
+ </div>
95
+ </div>
96
+ </div>
97
+ <div class="footernavigation">
98
+ <div style="float: left">← <a class="NavigationArrow" href="supported-encodings.html">Supported encodings</a>
99
+ </div>
100
+ <div style="text-align: right">
101
+ <a class="NavigationArrow" href="how-it-works.html">How it works</a> →</div>
102
+ </div>
103
+ <hr>
104
+ <div id="footer"><p class="copyright">Copyright © 2006 Mark Pilgrim · <a href="mailto:mark@diveintomark.org">mark@diveintomark.org</a> · <a href="license.html">Terms of use</a></p></div>
105
+ </div></div>
106
+ </body>
107
+ </html>
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: chardet
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.9.0
7
+ date: 2006-03-28
8
+ summary: "Character encoding auto-detection in Ruby. Based on Mark Pilgrim's Python port."
9
+ require_paths:
10
+ - lib
11
+ email: zhengzhengzheng@gmail.com
12
+ homepage: http://blog.vava.cn/
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: UniversalDetector
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ authors:
28
+ - Hui
29
+ files:
30
+ - lib/Big5Freq.rb
31
+ - lib/Big5Prober.rb
32
+ - lib/CharDistributionAnalysis.rb
33
+ - lib/CharSetGroupProber.rb
34
+ - lib/CharSetProber.rb
35
+ - lib/CodingStateMachine.rb
36
+ - lib/EscCharSetProber.rb
37
+ - lib/ESCSM.rb
38
+ - lib/EUCJPProber.rb
39
+ - lib/EUCKRFreq.rb
40
+ - lib/EUCKRProber.rb
41
+ - lib/EUCTWFreq.rb
42
+ - lib/EUCTWProber.rb
43
+ - lib/GB2312Freq.rb
44
+ - lib/GB2312Prober.rb
45
+ - lib/HebrewProber.rb
46
+ - lib/JapaneseContextAnalysis.rb
47
+ - lib/JISFreq.rb
48
+ - lib/LangBulgarianModel.rb
49
+ - lib/LangCyrillicModel.rb
50
+ - lib/LangGreekModel.rb
51
+ - lib/LangHebrewModel.rb
52
+ - lib/LangHungarianModel.rb
53
+ - lib/LangThaiModel.rb
54
+ - lib/Latin1Prober.rb
55
+ - lib/MBCSGroupProber.rb
56
+ - lib/MBCSSM.rb
57
+ - lib/MultiByteCharSetProber.rb
58
+ - lib/SBCSGroupProber.rb
59
+ - lib/SingleByteCharSetProber.rb
60
+ - lib/SJISProber.rb
61
+ - lib/UniversalDetector.rb
62
+ - lib/UTF8Prober.rb
63
+ - python-docs/css
64
+ - python-docs/faq.html
65
+ - python-docs/how-it-works.html
66
+ - python-docs/images
67
+ - python-docs/index.html
68
+ - python-docs/license.html
69
+ - python-docs/supported-encodings.html
70
+ - python-docs/usage.html
71
+ - python-docs/css/chardet.css
72
+ - python-docs/images/caution.png
73
+ - python-docs/images/important.png
74
+ - python-docs/images/note.png
75
+ - python-docs/images/permalink.gif
76
+ - python-docs/images/tip.png
77
+ - python-docs/images/warning.png
78
+ - COPYING
79
+ - README
80
+ test_files: []
81
+ rdoc_options: []
82
+ extra_rdoc_files: []
83
+ executables: []
84
+ extensions: []
85
+ requirements: []
86
+ dependencies: []