chardet 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,94 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module UniversalDetector

  # Base prober for multi-byte encodings.
  #
  # The prober pushes every element of the input through a coding state
  # machine (@_mCodingSM) and hands each completed character to a
  # distribution analyzer (@_mDistributionAnalyzer). Both collaborators are
  # nil here; subclasses are expected to assign them and to override
  # get_charset_name.
  class MultiByteCharSetProber < CharSetProber
    def initialize
      super
      @_mDistributionAnalyzer = nil
      @_mCodingSM = nil
      # Double quotes are required: a single-quoted '\x00' is the four
      # characters backslash, x, 0, 0 rather than a NUL byte.
      # NOTE(review): feed stores aBuf[index] in these slots, so the element
      # type afterwards is whatever String#[] yields on the running Ruby
      # version -- confirm against the analyzers.
      @_mLastChar = ["\x00", "\x00"]
    end

    # Resets the inherited state, both collaborators (when assigned) and the
    # two-slot buffer used for a character that straddles two feed calls.
    def reset
      super
      @_mCodingSM.reset if @_mCodingSM
      @_mDistributionAnalyzer.reset if @_mDistributionAnalyzer
      @_mLastChar = ["\x00", "\x00"]
    end

    # Placeholder that returns nil; concrete probers override it.
    def get_charset_name
    end

    # Consumes one chunk of input.
    #
    # aBuf:: chunk to analyse; must respond to length and [].
    #
    # Returns the prober state as reported by get_state.
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        case @_mCodingSM.next_state(aBuf[i])
        when :Error
          # puts with interpolation: the former p(... + '\n') printed a quoted
          # string ending in a literal backslash-n and raised when
          # get_charset_name returned nil.
          puts "#{get_charset_name} prober hit error at byte #{i}" if UniversalDetector::DEBUG
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          charLen = @_mCodingSM.get_current_charlen
          if i == 0
            # The character may have started in the previous chunk: pair the
            # remembered last element with the first element of this chunk.
            @_mLastChar[1] = aBuf[0]
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            # Exactly two elements ending at i. The former inclusive range
            # (i-1)..(i+1) also passed the element after the character.
            @_mDistributionAnalyzer.feed(aBuf[i - 1, 2], charLen)
          end
        end
      end

      # Remember the last element for the next call; an empty chunk must not
      # replace it with nil.
      @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

      if get_state == :Detecting &&
         @_mDistributionAnalyzer.got_enough_data && get_confidence > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end

      get_state
    end

    # Returns the confidence reported by the distribution analyzer.
    def get_confidence
      @_mDistributionAnalyzer.get_confidence
    end
  end
end
@@ -0,0 +1,71 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'SingleByteCharSetProber'
32
+ require 'LangCyrillicModel'
33
+ require 'LangGreekModel'
34
+ require 'LangHebrewModel'
35
+ require 'LangHungarianModel'
36
+ require 'LangBulgarianModel'
37
+ require 'LangThaiModel'
38
+ require 'HebrewProber'
39
+
40
module UniversalDetector
  # Group prober bundling every single-byte charset prober: one
  # SingleByteCharSetProber per language model, followed by the Hebrew
  # prober and its two Win1255 model probers.
  class SBCSGroupProber < CharSetGroupProber

    attr_reader :mProbers

    def initialize
      super

      # Models that need nothing beyond a plain single-byte prober, in the
      # order in which the probers are stored.
      plainModels = [
        Win1251CyrillicModel,
        Koi8rModel,
        Latin5CyrillicModel,
        MacCyrillicModel,
        Ibm866Model,
        Ibm855Model,
        Latin7GreekModel,
        Win1253GreekModel,
        Latin5BulgarianModel,
        Win1251BulgarianModel,
        Latin2HungarianModel,
        Win1250HungarianModel,
        TIS620ThaiModel,
      ]
      @mProbers = plainModels.map { |model| SingleByteCharSetProber.new(model) }

      # Both Win1255 probers use the Hebrew prober as their name prober, and
      # the Hebrew prober in turn is told about the two of them.
      hebrew = HebrewProber.new
      logical = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrew)
      visual = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrew)
      hebrew.set_model_probers(logical, visual)
      @mProbers += [hebrew, logical, visual]

      reset
    end
  end
end
@@ -0,0 +1,99 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'JapaneseContextAnalysis'
33
+ require 'CharDistributionAnalysis'
34
+ require 'MBCSSM'
35
+
36
module UniversalDetector
  # Prober for Shift_JIS. On top of the state machine and distribution
  # analyzer used by MultiByteCharSetProber it also feeds a Japanese context
  # analyzer, and reports the higher of the two analyzers' confidences.
  class SJISProber < MultiByteCharSetProber
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new
      @_mContextAnalyzer = SJISContextAnalysis.new
      reset
    end

    # Resets the inherited state and the context analyzer.
    def reset
      super
      @_mContextAnalyzer.reset
    end

    def get_charset_name
      "SHIFT_JIS"
    end

    # Consumes one chunk of input.
    #
    # aBuf:: chunk to analyse; must respond to length and [].
    #
    # Returns the prober state as reported by get_state.
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        case @_mCodingSM.next_state(aBuf[i])
        when :Error
          # puts with interpolation: the former p(... + '\n') printed a quoted
          # string ending in a literal backslash-n.
          puts "#{get_charset_name} prober hit error at byte #{i}" if DEBUG
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          charLen = @_mCodingSM.get_current_charlen
          if i == 0
            # The character may have started in the previous chunk: pair the
            # remembered last element with the first element of this chunk.
            @_mLastChar[1] = aBuf[0]
            @_mContextAnalyzer.feed(@_mLastChar[(2 - charLen)..-1], charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            # Two-element windows. The former inclusive ranges
            # (a .. a + 2) passed three elements to each analyzer.
            @_mContextAnalyzer.feed(aBuf[i + 1 - charLen, 2], charLen)
            @_mDistributionAnalyzer.feed(aBuf[i - 1, 2], charLen)
          end
        end
      end

      # Remember the last element for the next call; an empty chunk must not
      # replace it with nil.
      @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

      if get_state == :Detecting &&
         @_mContextAnalyzer.got_enough_data && get_confidence > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end

      get_state
    end

    # Returns the larger of the context and distribution confidences.
    def get_confidence
      contxtCf = @_mContextAnalyzer.get_confidence
      distribCf = @_mDistributionAnalyzer.get_confidence
      [contxtCf, distribCf].max
    end
  end
end
@@ -0,0 +1,131 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module UniversalDetector

  SAMPLE_SIZE = 64
  SB_ENOUGH_REL_THRESHOLD = 1024
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  SYMBOL_CAT_ORDER = 250
  NUMBER_OF_SEQ_CAT = 4
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

  # Prober for a single-byte charset described by a language model.
  #
  # The model is indexed with the keys 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'. The
  # prober maps each input element to an "order", counts pairs of consecutive
  # orders that both fall below SAMPLE_SIZE by the category the precedence
  # matrix assigns them, and derives a confidence from those counters.
  class SingleByteCharSetProber < CharSetProber
    # model::      language model (see the class comment for the keys read).
    # reversed::   true to swap the two orders of every pair in the
    #              precedence-matrix lookup.
    # nameProber:: optional prober whose get_charset_name is reported instead
    #              of the model's 'charsetName'.
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset
    end

    # Resets the inherited state and clears all counters.
    def reset
      super
      @_mLastOrder = 255 # order of the previous character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters whose order is below SAMPLE_SIZE
    end

    # Returns the name prober's charset name when one was supplied, otherwise
    # the model's 'charsetName'.
    def get_charset_name
      return @_mNameProber.get_charset_name if @_mNameProber

      @_mModel['charsetName']
    end

    # Consumes one chunk of input and updates the counters.
    #
    # aBuf:: chunk to analyse; passed through
    #        filter_without_english_letters first unless the model sets
    #        'keepEnglishLetter'.
    #
    # Returns the prober state as reported by get_state.
    def feed(aBuf)
      aBuf = filter_without_english_letters(aBuf) unless @_mModel['keepEnglishLetter']
      aLen = aBuf.length
      # 0 is truthy in Ruby, so the former `unless aLen` guard could never
      # fire; test for an empty chunk explicitly.
      return get_state if aLen.zero?

      charToOrder = @_mModel['charToOrderMap']
      precedence = @_mModel['precedenceMatrix']
      (0...aLen).each do |i|
        # NOTE(review): the value of aBuf[i] is used directly as the lookup
        # key; with a String buffer that is an Integer only on Ruby 1.8 --
        # confirm what callers pass.
        order = charToOrder[aBuf[i]]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            pairIndex = if @_mReversed
                          (order * SAMPLE_SIZE) + @_mLastOrder
                        else
                          (@_mLastOrder * SAMPLE_SIZE) + order
                        end
            @_mSeqCounters[precedence[pairIndex]] += 1
          end
        end
        @_mLastOrder = order
      end

      if get_state == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence
        if cf > POSITIVE_SHORTCUT_THRESHOLD
          # puts instead of p(... '\n'): p printed the quoted string with a
          # literal backslash-n at the end.
          puts('%s confidence = %s, we have a winner' % [@_mModel['charsetName'], cf]) if DEBUG
          @_mState = :FoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
          if DEBUG
            puts('%s confidence = %s, below negative shortcut threshhold %s' %
                 [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
          end
          @_mState = :NotMe
        end
      end

      get_state
    end

    # Returns 0.01 while no pair has been counted. Otherwise returns the share
    # of pairs in POSITIVE_CAT, divided by the model's
    # 'mTypicalPositiveRatio' and scaled by @_mFreqChar / @_mTotalChar;
    # results of 1.0 or more are reported as 0.99.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      r
    end
  end
end
@@ -0,0 +1,91 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'MBCSSM'
33
+
34
module UniversalDetector
  ONE_CHAR_PROB = 0.5

  # Prober for UTF-8. It runs the UTF-8 coding state machine over the input
  # and counts how many completed characters were at least two bytes long;
  # the confidence grows with that count.
  class UTF8Prober < CharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset
    end

    # Resets the inherited state, the state machine and the multi-byte
    # character counter.
    def reset
      super
      @_mCodingSM.reset
      @_mNumOfMBChar = 0
    end

    def get_charset_name
      "utf-8"
    end

    # Consumes one chunk of input.
    #
    # aBuf:: chunk to analyse; must respond to length and [].
    #
    # Returns the prober state as reported by get_state.
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        case @_mCodingSM.next_state(aBuf[i])
        when :Error
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          # The original condition ended in a colon (`if ... >= 2:`), which
          # only Ruby 1.8 accepted in place of `then`; later versions reject
          # it as a syntax error.
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen >= 2
        end
      end

      if get_state == :Detecting && get_confidence > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end
      get_state
    end

    # With fewer than six multi-byte characters seen, returns
    # 1.0 - 0.99 * ONE_CHAR_PROB ** count; from six onwards returns 0.99.
    def get_confidence
      unlike = 0.99
      return unlike unless @_mNumOfMBChar < 6

      @_mNumOfMBChar.times { unlike *= ONE_CHAR_PROB }
      1.0 - unlike
    end
  end
end