chardet 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49):
  1. data/COPYING +504 -0
  2. data/README +12 -0
  3. data/lib/Big5Freq.rb +913 -0
  4. data/lib/Big5Prober.rb +48 -0
  5. data/lib/CharDistributionAnalysis.rb +245 -0
  6. data/lib/CharSetGroupProber.rb +114 -0
  7. data/lib/CharSetProber.rb +70 -0
  8. data/lib/CodingStateMachine.rb +74 -0
  9. data/lib/ESCSM.rb +242 -0
  10. data/lib/EUCJPProber.rb +97 -0
  11. data/lib/EUCKRFreq.rb +600 -0
  12. data/lib/EUCKRProber.rb +48 -0
  13. data/lib/EUCTWFreq.rb +432 -0
  14. data/lib/EUCTWProber.rb +48 -0
  15. data/lib/EscCharSetProber.rb +94 -0
  16. data/lib/GB2312Freq.rb +475 -0
  17. data/lib/GB2312Prober.rb +48 -0
  18. data/lib/HebrewProber.rb +292 -0
  19. data/lib/JISFreq.rb +573 -0
  20. data/lib/JapaneseContextAnalysis.rb +234 -0
  21. data/lib/LangBulgarianModel.rb +231 -0
  22. data/lib/LangCyrillicModel.rb +332 -0
  23. data/lib/LangGreekModel.rb +229 -0
  24. data/lib/LangHebrewModel.rb +202 -0
  25. data/lib/LangHungarianModel.rb +228 -0
  26. data/lib/LangThaiModel.rb +203 -0
  27. data/lib/Latin1Prober.rb +160 -0
  28. data/lib/MBCSGroupProber.rb +57 -0
  29. data/lib/MBCSSM.rb +513 -0
  30. data/lib/MultiByteCharSetProber.rb +94 -0
  31. data/lib/SBCSGroupProber.rb +71 -0
  32. data/lib/SJISProber.rb +99 -0
  33. data/lib/SingleByteCharSetProber.rb +131 -0
  34. data/lib/UTF8Prober.rb +91 -0
  35. data/lib/UniversalDetector.rb +209 -0
  36. data/python-docs/css/chardet.css +299 -0
  37. data/python-docs/faq.html +107 -0
  38. data/python-docs/how-it-works.html +113 -0
  39. data/python-docs/images/caution.png +0 -0
  40. data/python-docs/images/important.png +0 -0
  41. data/python-docs/images/note.png +0 -0
  42. data/python-docs/images/permalink.gif +0 -0
  43. data/python-docs/images/tip.png +0 -0
  44. data/python-docs/images/warning.png +0 -0
  45. data/python-docs/index.html +73 -0
  46. data/python-docs/license.html +62 -0
  47. data/python-docs/supported-encodings.html +86 -0
  48. data/python-docs/usage.html +107 -0
  49. metadata +86 -0
@@ -0,0 +1,94 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module UniversalDetector

  # Shared behaviour for probers that recognise multi-byte encodings.
  #
  # This class leaves @_mCodingSM and @_mDistributionAnalyzer set to nil;
  # #feed and #get_confidence call methods on them unconditionally, so a
  # concrete prober has to assign both before it is fed any data.
  class MultiByteCharSetProber < CharSetProber
    def initialize
      super
      @_mDistributionAnalyzer = nil
      @_mCodingSM = nil
      # Double quotes are required here: '\x00' in single quotes is the
      # four-character string backslash-x-0-0, not a NUL byte.
      @_mLastChar = ["\x00", "\x00"]
    end

    # Restores the initial state of this prober and of whichever helpers
    # (state machine, distribution analyzer) have been assigned.
    def reset
      super
      @_mCodingSM.reset() if @_mCodingSM
      @_mDistributionAnalyzer.reset() if @_mDistributionAnalyzer
      @_mLastChar = ["\x00", "\x00"]
    end

    # Returns nil in this base class; concrete probers override it with the
    # name of the charset they detect.
    def get_charset_name
      nil
    end

    # Runs every element of aBuf through the coding state machine and hands
    # each completed character to the distribution analyzer.
    #
    # aBuf:: the chunk of data to examine (indexed with [] and sliced with a
    #        range).
    # Returns the prober state after the chunk (the value of get_state).
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i])
        if codingState == :Error
          if UniversalDetector::DEBUG
            # Interpolation also copes with get_charset_name returning nil.
            puts "#{get_charset_name()} prober hit error at byte #{i}"
          end
          @_mState = :NotMe
          break
        elsif codingState == :ItsMe
          @_mState = :FoundIt
          break
        elsif codingState == :Start
          charLen = @_mCodingSM.get_current_charlen()
          if i == 0
            # The character may have started in the previous chunk, whose
            # last element was saved in @_mLastChar[0].
            @_mLastChar[1] = aBuf[0]
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            # Exclusive range: exactly the two elements at i-1 and i. The
            # inclusive form also passed the element at i+1.
            @_mDistributionAnalyzer.feed(aBuf[(i - 1)...(i + 1)], charLen)
          end
        end
      end

      # Remember the last element for the next chunk; an empty chunk must not
      # overwrite it with nil.
      @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

      if get_state() == :Detecting
        if @_mDistributionAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
          @_mState = :FoundIt
        end
      end

      get_state()
    end

    # Confidence as reported by the distribution analyzer.
    def get_confidence
      @_mDistributionAnalyzer.get_confidence()
    end
  end
end
@@ -0,0 +1,71 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'SingleByteCharSetProber'
32
+ require 'LangCyrillicModel'
33
+ require 'LangGreekModel'
34
+ require 'LangHebrewModel'
35
+ require 'LangHungarianModel'
36
+ require 'LangBulgarianModel'
37
+ require 'LangThaiModel'
38
+ require 'HebrewProber'
39
+
40
module UniversalDetector
  # Group prober that bundles one SingleByteCharSetProber per single-byte
  # language model, plus the HebrewProber and its two model probers.
  class SBCSGroupProber < CharSetGroupProber

    attr_reader :mProbers

    # Builds the prober list and then calls reset().
    def initialize
      super

      # Language models that each get a plain single-byte prober, in order.
      plainModels = [
        Win1251CyrillicModel,
        Koi8rModel,
        Latin5CyrillicModel,
        MacCyrillicModel,
        Ibm866Model,
        Ibm855Model,
        Latin7GreekModel,
        Win1253GreekModel,
        Latin5BulgarianModel,
        Win1251BulgarianModel,
        Latin2HungarianModel,
        Win1250HungarianModel,
        TIS620ThaiModel
      ]
      @mProbers = plainModels.map { |langModel| SingleByteCharSetProber.new(langModel) }

      # Both Hebrew model probers use Win1255HebrewModel and differ only in
      # the reversed flag; each gets the HebrewProber as its name prober,
      # and the HebrewProber is given both of them.
      nameProber = HebrewProber.new
      logicalProber = SingleByteCharSetProber.new(Win1255HebrewModel, false, nameProber)
      visualProber = SingleByteCharSetProber.new(Win1255HebrewModel, true, nameProber)
      nameProber.set_model_probers(logicalProber, visualProber)
      @mProbers.concat([nameProber, logicalProber, visualProber])

      reset()
    end
  end
end
@@ -0,0 +1,99 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'JapaneseContextAnalysis'
33
+ require 'CharDistributionAnalysis'
34
+ require 'MBCSSM'
35
+
36
module UniversalDetector
  # Prober for Shift_JIS. It combines the SJIS coding state machine with a
  # character-distribution analyzer and a context analyzer.
  class SJISProber < MultiByteCharSetProber
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new
      @_mContextAnalyzer = SJISContextAnalysis.new
      reset()
    end

    # Resets the inherited state and the context analyzer.
    def reset
      super
      @_mContextAnalyzer.reset()
    end

    def get_charset_name
      "SHIFT_JIS"
    end

    # Runs every element of aBuf through the state machine and passes each
    # completed character to both analyzers.
    #
    # aBuf:: the chunk of data to examine.
    # Returns the prober state after the chunk (the value of get_state).
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i])
        if codingState == :Error
          if DEBUG
            puts "#{get_charset_name()} prober hit error at byte #{i}"
          end
          @_mState = :NotMe
          break
        elsif codingState == :ItsMe
          @_mState = :FoundIt
          break
        elsif codingState == :Start
          charLen = @_mCodingSM.get_current_charlen()
          if i == 0
            # The character may have started in the previous chunk, whose
            # last element was saved in @_mLastChar[0].
            @_mLastChar[1] = aBuf[0]
            @_mContextAnalyzer.feed(@_mLastChar[(2 - charLen)..-1], charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            # Exclusive ranges give two-element windows; the inclusive form
            # passed one extra trailing element to each analyzer.
            @_mContextAnalyzer.feed(aBuf[(i + 1 - charLen)...(i + 3 - charLen)], charLen)
            @_mDistributionAnalyzer.feed(aBuf[(i - 1)...(i + 1)], charLen)
          end
        end
      end

      # Remember the last element for the next chunk; an empty chunk must not
      # overwrite it with nil.
      @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

      if get_state() == :Detecting
        if @_mContextAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
          @_mState = :FoundIt
        end
      end

      get_state()
    end

    # The larger of the context-analysis and distribution-analysis
    # confidences.
    def get_confidence
      contxtCf = @_mContextAnalyzer.get_confidence()
      distribCf = @_mDistributionAnalyzer.get_confidence()
      [contxtCf, distribCf].max
    end
  end
end
@@ -0,0 +1,131 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module UniversalDetector

  # Orders below SAMPLE_SIZE take part in the sequence statistics.
  SAMPLE_SIZE = 64
  # Number of counted sequences required before a shortcut decision is made.
  SB_ENOUGH_REL_THRESHOLD = 1024
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  # Orders below this value are counted in @_mTotalChar.
  SYMBOL_CAT_ORDER = 250
  NUMBER_OF_SEQ_CAT = 4
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

  # Prober for one single-byte charset, driven by a language model.
  #
  # The model is read through the keys 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
  class SingleByteCharSetProber < CharSetProber
    # model::      the language model described above.
    # reversed::   true to swap the two orders of every pair in the
    #              precedence-matrix lookup.
    # nameProber:: optional auxiliary prober that decides the charset name.
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset()
    end

    # Clears all counters collected so far.
    def reset
      super
      @_mLastOrder = 255 # char order of last character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Charset name from the auxiliary name prober when one was given,
    # otherwise from the model.
    def get_charset_name
      return @_mNameProber.get_charset_name() if @_mNameProber
      @_mModel['charsetName']
    end

    # Updates the character and sequence counters from aBuf and, once more
    # than SB_ENOUGH_REL_THRESHOLD sequences have been counted, may switch
    # the state to :FoundIt or :NotMe.
    #
    # aBuf:: the chunk of data to examine.
    # Returns the prober state after the chunk (the value of get_state).
    def feed(aBuf)
      unless @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
      end
      aLen = aBuf.length
      # 0 is truthy in Ruby, so `unless aLen` never fired; compare explicitly.
      return get_state() if aLen == 0

      charToOrder = @_mModel['charToOrderMap']
      precedence = @_mModel['precedenceMatrix']
      (0...aLen).each do |i|
        # NOTE(review): this lookup uses aBuf[i] directly as the index. On
        # Ruby 1.9+ String#[] returns a one-character String, not a byte
        # value -- confirm what callers pass and how charToOrderMap is keyed.
        order = charToOrder[aBuf[i]]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            # For a reversed prober the two orders swap roles in the lookup.
            seqIndex = if @_mReversed
                         (order * SAMPLE_SIZE) + @_mLastOrder
                       else
                         (@_mLastOrder * SAMPLE_SIZE) + order
                       end
            @_mSeqCounters[precedence[seqIndex]] += 1
          end
        end
        @_mLastOrder = order
      end

      if get_state() == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
          if DEBUG
            puts('%s confidence = %s, we have a winner' % [@_mModel['charsetName'], cf])
          end
          @_mState = :FoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
          if DEBUG
            puts('%s confidence = %s, below negative shortcut threshhold %s' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
          end
          @_mState = :NotMe
        end
      end

      get_state()
    end

    # Returns 0.01 while no sequence has been counted. Otherwise: the share
    # of POSITIVE_CAT sequences, divided by the model's typical positive
    # ratio, scaled by @_mFreqChar / @_mTotalChar; results of 1.0 or more
    # are reported as 0.99.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      r
    end
  end
end
@@ -0,0 +1,91 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'MBCSSM'
33
+
34
module UniversalDetector
  # Factor applied to the "not UTF-8" likelihood for each multi-byte
  # character seen (see UTF8Prober#get_confidence).
  ONE_CHAR_PROB = 0.5

  # Prober for UTF-8: runs the input through the UTF-8 coding state machine
  # and counts the multi-byte characters it completes.
  class UTF8Prober < CharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset()
    end

    # Resets the inherited state, the state machine and the multi-byte
    # character count.
    def reset
      super
      @_mCodingSM.reset()
      @_mNumOfMBChar = 0
    end

    def get_charset_name
      "utf-8"
    end

    # Runs every element of aBuf through the state machine.
    #
    # aBuf:: the chunk of data to examine.
    # Returns the prober state after the chunk (the value of get_state).
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i])
        if codingState == :Error
          @_mState = :NotMe
          break
        elsif codingState == :ItsMe
          @_mState = :FoundIt
          break
        elsif codingState == :Start
          # The original `if ... >= 2:` had a trailing colon, which Ruby 1.9
          # and later reject as a syntax error.
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen() >= 2
        end
      end

      if get_state() == :Detecting && get_confidence() > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end
      get_state()
    end

    # With fewer than 6 multi-byte characters seen, returns
    # 1.0 - 0.99 * ONE_CHAR_PROB**count (computed by repeated
    # multiplication); from 6 characters on, returns 0.99.
    def get_confidence
      unlike = 0.99
      return unlike unless @_mNumOfMBChar < 6

      @_mNumOfMBChar.times { unlike = unlike * ONE_CHAR_PROB }
      1.0 - unlike
    end
  end
end