chardet2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/SJISProber.rb ADDED
@@ -0,0 +1,99 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'JapaneseContextAnalysis'
33
+ require 'CharDistributionAnalysis'
34
+ require 'MBCSSM'
35
+
36
module UniversalDetector
  # Prober for the Shift_JIS charset.
  #
  # Every byte is pushed through a CodingStateMachine built from SJISSMModel.
  # Each time the machine reports a completed character, the bytes of that
  # character are handed to a context analyzer and a distribution analyzer;
  # the prober's confidence is the larger of the two analyzers' confidences.
  class SJISProber < MultiByteCharSetProber
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new
      @_mContextAnalyzer = SJISContextAnalysis.new
      reset
    end

    # Resets the inherited prober state and the context analyzer.
    def reset
      super
      @_mContextAnalyzer.reset
    end

    # Name reported for this prober.
    def get_charset_name
      "SHIFT_JIS"
    end

    # Consumes one chunk of input and returns the prober state afterwards.
    #
    # aBuf:: the chunk to analyse; it is indexed position by position and each
    #        element is passed to the state machine.
    #
    # @_mLastChar is not initialised in this class (presumably by the
    # superclass -- confirm). It is used as a two-slot buffer so that a
    # character split across two chunks can still be analysed.
    def feed(aBuf)
      aLen = aBuf.length
      aLen.times do |i|
        codingState = @_mCodingSM.next_state(aBuf[i])
        if codingState == :Error
          p("#{get_charset_name} prober hit error at byte #{i}\n") if DEBUG
          @_mState = :NotMe
          break
        elsif codingState == :ItsMe
          @_mState = :FoundIt
          break
        elsif codingState == :Start
          charLen = @_mCodingSM.get_current_charlen
          if i == 0
            # The character may have started in the previous chunk: combine
            # the byte saved from that chunk with the first byte of this one.
            @_mLastChar[1] = aBuf[0]
            @_mContextAnalyzer.feed(@_mLastChar[(2 - charLen)..@_mLastChar.length], charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            # Exclusive ranges: exactly two bytes are passed, matching the
            # half-open slices of the Python implementation this was ported
            # from (inclusive ranges leaked a third byte to the analyzers).
            @_mContextAnalyzer.feed(aBuf[(i + 1 - charLen)...(i + 3 - charLen)], charLen)
            @_mDistributionAnalyzer.feed(aBuf[(i - 1)...(i + 1)], charLen)
          end
        end
      end

      # Remember the final byte for the next chunk. An empty chunk has no
      # final byte, so the previously saved one is left untouched.
      @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

      if get_state == :Detecting
        if @_mContextAnalyzer.got_enough_data && (get_confidence > SHORTCUT_THRESHOLD)
          @_mState = :FoundIt
        end
      end

      get_state
    end

    # Returns the higher of the context and distribution confidences.
    def get_confidence
      contxtCf = @_mContextAnalyzer.get_confidence
      distribCf = @_mDistributionAnalyzer.get_confidence
      [contxtCf, distribCf].max
    end
  end
end
@@ -0,0 +1,131 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module UniversalDetector

  # Orders below this value are counted as "frequent" characters; it is also
  # the row width used when indexing the model's precedence matrix.
  SAMPLE_SIZE = 64
  # Number of character sequences that must be seen before feed() tries to
  # reach an early verdict.
  SB_ENOUGH_REL_THRESHOLD = 1024
  # Confidence above which feed() declares :FoundIt early.
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  # Confidence below which feed() declares :NotMe early.
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  # Orders below this value are included in the total character count.
  SYMBOL_CAT_ORDER = 250
  # Number of sequence-category counters kept per prober.
  NUMBER_OF_SEQ_CAT = 4
  # Index of the counter that get_confidence() reads.
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

  # Prober driven by a language model hash. The keys read here are
  # 'charsetName', 'keepEnglishLetter', 'charToOrderMap', 'precedenceMatrix'
  # and 'mTypicalPositiveRatio'.
  class SingleByteCharSetProber < CharSetProber
    # model::      the language model hash described above
    # reversed::   when true, every character pair is looked up in reverse order
    # nameProber:: optional auxiliary prober consulted for the charset name
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset
    end

    # Resets the inherited state and clears all counters.
    def reset
      super
      @_mLastOrder = 255 # char order of last character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Returns the name prober's charset name when one was supplied, otherwise
    # the model's own 'charsetName'.
    def get_charset_name
      if @_mNameProber
        @_mNameProber.get_charset_name
      else
        @_mModel['charsetName']
      end
    end

    # Consumes one chunk of input, updates the counters and returns the
    # prober state afterwards.
    def feed(aBuf)
      unless @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
      end
      aLen = aBuf.length
      # 0 is truthy in Ruby, so the length has to be compared explicitly for
      # the "nothing to analyse" early return to ever fire.
      return get_state if aLen == 0

      aLen.times do |i|
        c = aBuf[i]
        order = @_mModel['charToOrderMap'][c]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            if @_mReversed
              # reverse the order of the letters in the lookup
              @_mSeqCounters[@_mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
            else
              @_mSeqCounters[@_mModel['precedenceMatrix'][(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
            end
          end
        end
        @_mLastOrder = order
      end

      if get_state == :Detecting
        if @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
          cf = get_confidence
          if cf > POSITIVE_SHORTCUT_THRESHOLD
            if DEBUG
              p("%s confidence = %s, we have a winner\n" % [@_mModel['charsetName'], cf])
            end
            @_mState = :FoundIt
          elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
            if DEBUG
              p("%s confidence = %s, below negative shortcut threshhold %s\n" % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
            end
            @_mState = :NotMe
          end
        end
      end

      get_state
    end

    # Returns 0.01 until at least one sequence has been counted. After that:
    # (positive sequences / total sequences / model's typical positive ratio)
    # scaled by (frequent characters / total characters), capped at 0.99.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      r
    end
  end
end
data/lib/UTF8Prober.rb ADDED
@@ -0,0 +1,91 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'MBCSSM'
33
+
34
module UniversalDetector
  # Factor applied once per multi-byte character when computing confidence.
  ONE_CHAR_PROB = 0.5

  # Prober that validates input against the UTF-8 state machine model and
  # counts how many multi-byte characters it has seen.
  class UTF8Prober < CharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset
    end

    # Resets the inherited state, the state machine and the multi-byte counter.
    def reset
      super
      @_mCodingSM.reset
      @_mNumOfMBChar = 0
    end

    # Name reported for this prober.
    def get_charset_name
      "utf-8"
    end

    # Runs every element of aBuf through the state machine and returns the
    # prober state afterwards. Scanning stops at the first :Error or :ItsMe.
    def feed(aBuf)
      aBuf.length.times do |idx|
        case @_mCodingSM.next_state(aBuf[idx])
        when :Error
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          # A character just completed; count it only if it was multi-byte.
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen >= 2
        end
      end

      if get_state == :Detecting && get_confidence > SHORTCUT_THRESHOLD
        @_mState = :FoundIt
      end
      get_state
    end

    # With fewer than six multi-byte characters the result is
    # 1.0 - 0.99 * ONE_CHAR_PROB**count; from six onwards it is a flat 0.99.
    def get_confidence
      return 0.99 unless @_mNumOfMBChar < 6

      remaining = 0.99
      @_mNumOfMBChar.times { remaining *= ONE_CHAR_PROB }
      1.0 - remaining
    end
  end
end
@@ -0,0 +1,209 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require "EscCharSetProber"
30
+ require "MBCSGroupProber"
31
+ require "SBCSGroupProber"
32
+ require "Latin1Prober"
33
+ require "singleton"
34
+
35
module UniversalDetector

  class << self
    # Returns just the detected encoding name for +data+ (nil when nothing
    # was detected).
    def encoding(data)
      chardet(data)['encoding']
    end

    # Runs one full detection over +data+ using the shared Detector instance
    # and returns its result hash: {"encoding" => ..., "confidence" => ...}.
    def chardet(data)
      detector = UniversalDetector::Detector.instance
      detector.reset
      detector.feed(data)
      detector.close
      detector.result
    end
  end

  DEBUG = nil

  # Integer codes; the code visible in this file compares against the symbols
  # (:FoundIt, :PureAscii, ...) rather than these constants. "Detectiong" is
  # misspelled but kept under its published name so existing references to it
  # keep working.
  Detectiong = 0
  FoundIt = 1
  NotMe = 2

  Start = 0
  Error = 1
  ItsMe = 2

  # Lowest prober confidence that Detector#close will accept as a result.
  MINIMUM_THRESHOLD = 0.20
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  SHORTCUT_THRESHOLD = 0.95

  # Top-level detector. It recognises byte-order marks itself and otherwise
  # classifies the input as pure ASCII, escape-sequence based, or high-byte,
  # delegating the latter two cases to prober objects.
  class Detector

    include Singleton

    # Byte-order marks and the encoding each one implies. Order matters: the
    # four-byte marks must be tested before the two-byte marks they start with.
    BOM_TABLE = [
      ["\xEF\xBB\xBF",     "UTF-8"],
      ["\xFF\xFE\x00\x00", "UTF-32LE"],
      ["\x00\x00\xFE\xFF", "UTF-32BE"],
      ["\xFE\xFF\x00\x00", "X-ISO-10646-UCS-4-3412"],
      ["\x00\x00\xFF\xFE", "X-ISO-10646-UCS-4-2143"],
      ["\xFF\xFE",         "UTF-16LE"],
      ["\xFE\xFF",         "UTF-16BE"]
    ].map { |bytes, name| [bytes.dup.force_encoding(Encoding::BINARY).freeze, name] }.freeze

    attr_reader :result

    def initialize
      @_highBitDetector = /[\x80-\xFF]/n # any byte with the high bit set
      @_escDetector = /\033|~\{/n        # ESC, or the two characters "~{"
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-run state, and resets any probers that
    # have already been created.
    def reset
      @result = {"encoding"=> nil, "confidence"=> 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of input. Does nothing once a result has been reached
    # or when +data+ is empty.
    def feed(data)
      return if @done || data.empty?

      # Work on a binary copy: the BOM table and the high-bit regexp are
      # byte-oriented, and comparing them against a string tagged with another
      # encoding either never matches or raises Encoding::CompatibilityError.
      data = data.dup.force_encoding(Encoding::BINARY)

      unless @_mGotData
        # If the data starts with a BOM, the encoding is known immediately.
        _bom, bom_encoding = BOM_TABLE.find { |bom, _name| data.start_with?(bom) }
        @result = {"encoding"=> bom_encoding, "confidence"=> 1.0} if bom_encoding
      end
      @_mGotData = true
      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      if @_mInputState == :PureAscii
        if data =~ @_highBitDetector
          @_mInputState = :Highbyte
        elsif (@_mLastChar + data) =~ @_escDetector
          # The previous chunk's last character is prepended so that a "~{"
          # split across two chunks is still seen.
          @_mInputState = :EscAscii
        end
      end

      @_mLastChar = data[-1]
      if @_mInputState == :EscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new
        # Compared against :FoundIt, the same symbol the high-byte branch
        # below checks for.
        if @_mEscCharSetProber.feed(data) == :FoundIt
          @result = {"encoding"=> @_mEscCharSetProber.get_charset_name, "confidence"=> @_mEscCharSetProber.get_confidence}
          @done = true
        end
      elsif @_mInputState == :Highbyte
        if @_mCharSetProbers.empty?
          @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
        end
        @_mCharSetProbers.each do |prober|
          if prober.feed(data) == :FoundIt
            @result = {"encoding"=> prober.get_charset_name, "confidence"=> prober.get_confidence}
            @done = true
            break
          end
        end
      end
    end

    # Finishes detection. When no result was reached during feed, this picks
    # "ascii" for pure-ASCII input, or the most confident prober for
    # high-byte input provided it exceeds MINIMUM_THRESHOLD. Returns the
    # result hash in those two cases; otherwise the return value is not
    # meaningful and callers should read #result.
    def close
      return if @done
      unless @_mGotData
        p("no data received!\n") if DEBUG
        return
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        maxProberConfidence = 0.0
        maxProber = nil
        @_mCharSetProbers.each do |prober|
          next unless prober
          proberConfidence = prober.get_confidence
          # Strict comparison: the first prober with the highest confidence wins.
          if proberConfidence > maxProberConfidence
            maxProberConfidence = proberConfidence
            maxProber = prober
          end
        end
        if maxProber && (maxProberConfidence > MINIMUM_THRESHOLD)
          @result = {"encoding" => maxProber.get_charset_name,
                     "confidence" => maxProberConfidence}
          return @result
        end
      end

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.each do |prober|
          next unless prober
          p("%s confidence = %s\n" % [prober.get_charset_name, prober.get_confidence])
        end
      end
    end
  end

end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chardet2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jan Xie
9
+ - Felipe Tanus
10
+ - Hui
11
+ autorequire: UniversalDetector
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2013-05-17 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description:
17
+ email:
18
+ - jan.h.xie@gmail.com
19
+ executables: []
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - lib/MBCSSM.rb
24
+ - lib/MultiByteCharSetProber.rb
25
+ - lib/JapaneseContextAnalysis.rb
26
+ - lib/LangCyrillicModel.rb
27
+ - lib/EUCKRFreq.rb
28
+ - lib/GB2312Freq.rb
29
+ - lib/EUCKRProber.rb
30
+ - lib/CodingStateMachine.rb
31
+ - lib/LangHungarianModel.rb
32
+ - lib/HebrewProber.rb
33
+ - lib/Big5Prober.rb
34
+ - lib/CharSetGroupProber.rb
35
+ - lib/SingleByteCharSetProber.rb
36
+ - lib/EUCTWFreq.rb
37
+ - lib/MBCSGroupProber.rb
38
+ - lib/SBCSGroupProber.rb
39
+ - lib/LangBulgarianModel.rb
40
+ - lib/SJISProber.rb
41
+ - lib/Big5Freq.rb
42
+ - lib/UniversalDetector.rb
43
+ - lib/CharDistributionAnalysis.rb
44
+ - lib/UTF8Prober.rb
45
+ - lib/Latin1Prober.rb
46
+ - lib/ESCSM.rb
47
+ - lib/EscCharSetProber.rb
48
+ - lib/JISFreq.rb
49
+ - lib/EUCJPProber.rb
50
+ - lib/EUCTWProber.rb
51
+ - lib/LangGreekModel.rb
52
+ - lib/LangHebrewModel.rb
53
+ - lib/GB2312Prober.rb
54
+ - lib/LangThaiModel.rb
55
+ - lib/CharSetProber.rb
56
+ - COPYING
57
+ - README.markdown
58
+ homepage: https://github.com/janx/chardet
59
+ licenses: []
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements: []
77
+ rubyforge_project:
78
+ rubygems_version: 1.8.23
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Based
82
+ on Mark Pilgrim's Python port and Hui's ruby port.
83
+ test_files: []