chardet2 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/SJISProber.rb ADDED
@@ -0,0 +1,99 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'MultiByteCharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'JapaneseContextAnalysis'
33
+ require 'CharDistributionAnalysis'
34
+ require 'MBCSSM'
35
+
36
module UniversalDetector
  # Prober that reports the charset name "SHIFT_JIS".
  #
  # It drives three collaborators: a CodingStateMachine built from
  # SJISSMModel, an SJISDistributionAnalysis and an SJISContextAnalysis.
  # The reported confidence is the larger of the two analyzers' confidences.
  class SJISProber < MultiByteCharSetProber
    # Builds the state machine and both analyzers, then resets the prober.
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new
      @_mContextAnalyzer = SJISContextAnalysis.new
      reset
    end

    # Resets the inherited state and, additionally, the context analyzer.
    def reset
      super
      @_mContextAnalyzer.reset
    end

    # Name of the charset this prober recognises.
    def get_charset_name
      "SHIFT_JIS"
    end

    # Feeds a chunk of input through the state machine, one element at a time.
    #
    # * An :Error state marks the prober :NotMe and stops scanning.
    # * An :ItsMe state marks the prober :FoundIt and stops scanning.
    # * A :Start state means a character was completed; it is forwarded to
    #   the context analyzer first and the distribution analyzer second.
    #
    # Returns the prober state after the chunk has been processed.
    def feed(aBuf)
      buf_len = aBuf.length

      (0...buf_len).each do |idx|
        case @_mCodingSM.next_state(aBuf[idx])
        when :Error
          if DEBUG
            p(get_charset_name() + ' prober hit error at byte ' + idx.to_s + '\n')
          end
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          char_len = @_mCodingSM.get_current_charlen
          if idx.zero?
            # The character may have started in the previous chunk, whose
            # last element was saved in @_mLastChar[0] by the preceding call.
            # NOTE(review): @_mLastChar is presumably initialised by
            # MultiByteCharSetProber -- confirm.
            @_mLastChar[1] = aBuf[0]
            tail = @_mLastChar[(2 - char_len)..@_mLastChar.length]
            @_mContextAnalyzer.feed(tail, char_len)
            @_mDistributionAnalyzer.feed(@_mLastChar, char_len)
          else
            context_window = aBuf[(idx + 1 - char_len)..(idx + 3 - char_len)]
            distribution_window = aBuf[(idx - 1)..(idx + 1)]
            @_mContextAnalyzer.feed(context_window, char_len)
            @_mDistributionAnalyzer.feed(distribution_window, char_len)
          end
        end
      end

      # Remember the final element so the next chunk can complete a
      # character that straddles the chunk boundary.
      @_mLastChar[0] = aBuf[buf_len - 1]

      # Shortcut: declare success once the context analyzer has seen enough
      # data and the confidence exceeds SHORTCUT_THRESHOLD.
      if get_state() == :Detecting &&
         @_mContextAnalyzer.got_enough_data &&
         (get_confidence() > SHORTCUT_THRESHOLD)
        @_mState = :FoundIt
      end

      get_state()
    end

    # Larger of the context-analysis and distribution-analysis confidences.
    def get_confidence
      [@_mContextAnalyzer.get_confidence, @_mDistributionAnalyzer.get_confidence].max
    end
  end
end
@@ -0,0 +1,131 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module UniversalDetector

  # Character orders below this value take part in the sequence statistics.
  SAMPLE_SIZE = 64
  # Number of counted sequences after which feed applies the shortcut thresholds.
  SB_ENOUGH_REL_THRESHOLD = 1024
  # Confidence above which feed declares :FoundIt early.
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  # Confidence below which feed declares :NotMe early.
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  # Character orders below this value are counted in @_mTotalChar.
  SYMBOL_CAT_ORDER = 250
  # Number of sequence categories (size of @_mSeqCounters).
  NUMBER_OF_SEQ_CAT = 4
  # Index of the sequence counter used by get_confidence.
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

  # Prober for a single-byte charset described by a language model.
  #
  # The model is indexed with the keys 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
  class SingleByteCharSetProber < CharSetProber
    # model      - the language model described above.
    # reversed   - true if every pair must be reversed in the model lookup.
    # nameProber - optional auxiliary prober that decides the charset name.
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset()
    end

    # Resets the inherited state and clears all sequence/character counters.
    def reset
      super
      @_mLastOrder = 255 # char order of last character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Charset name, delegated to the name prober when one was supplied,
    # otherwise taken from the model.
    def get_charset_name
      if @_mNameProber
        @_mNameProber.get_charset_name()
      else
        @_mModel['charsetName']
      end
    end

    # Updates the sequence statistics with a chunk of input and returns the
    # prober state.
    #
    # Unless the model asks to keep English letters, the chunk is first
    # passed through filter_without_english_letters. Once more than
    # SB_ENOUGH_REL_THRESHOLD sequences have been counted, a confidence
    # above POSITIVE_SHORTCUT_THRESHOLD sets :FoundIt and one below
    # NEGATIVE_SHORTCUT_THRESHOLD sets :NotMe.
    def feed(aBuf)
      unless @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
      end
      aLen = aBuf.length
      # 0 is truthy in Ruby, so the emptiness check must be explicit
      # (the former `unless aLen` could never fire).
      return get_state() if aLen == 0

      char_to_order = @_mModel['charToOrderMap']
      precedence = @_mModel['precedenceMatrix']

      aLen.times do |i|
        order = char_to_order[aBuf[i]]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            if @_mReversed
              # reverse the order of the letters in the lookup
              @_mSeqCounters[precedence[(order * SAMPLE_SIZE) + @_mLastOrder]] += 1
            else
              @_mSeqCounters[precedence[(@_mLastOrder * SAMPLE_SIZE) + order]] += 1
            end
          end
        end
        @_mLastOrder = order
      end

      if get_state() == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
          if DEBUG
            p('%s confidence = %s, we have a winner\n' % [@_mModel['charsetName'], cf])
          end
          @_mState = :FoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
          if DEBUG
            p('%s confidence = %s, below negative shortcut threshhold %s\n' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
          end
          @_mState = :NotMe
        end
      end

      get_state()
    end

    # Confidence estimate.
    #
    # Returns 0.01 while no sequence has been counted. Otherwise it is the
    # share of POSITIVE_CAT sequences, divided by the model's typical
    # positive ratio and scaled by the fraction of counted characters that
    # fell inside the sampling range. Values of 1.0 or more are replaced
    # by 0.99.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      r
    end
  end
end
data/lib/UTF8Prober.rb ADDED
@@ -0,0 +1,91 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+ require 'CodingStateMachine'
32
+ require 'MBCSSM'
33
+
34
module UniversalDetector
  # Factor applied to the "unlikeliness" for every multi-byte character seen.
  ONE_CHAR_PROB = 0.5

  # Prober that reports the charset name "utf-8". It runs the input through
  # a CodingStateMachine built from UTF8SMModel and counts the completed
  # characters whose length is two or more.
  class UTF8Prober < CharSetProber
    # Creates the state machine and resets the prober.
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset
    end

    # Resets the inherited state, the state machine and the multi-byte
    # character counter.
    def reset
      super
      @_mCodingSM.reset
      @_mNumOfMBChar = 0
    end

    # Name of the charset this prober recognises.
    def get_charset_name
      "utf-8"
    end

    # Feeds a chunk of input through the state machine.
    #
    # :Error marks the prober :NotMe, :ItsMe marks it :FoundIt; both stop
    # the scan. Every time the machine returns to :Start after a character
    # of length >= 2, the multi-byte counter is incremented. While still
    # :Detecting, a confidence above SHORTCUT_THRESHOLD sets :FoundIt.
    #
    # Returns the resulting prober state.
    def feed(aBuf)
      (0...aBuf.length).each do |pos|
        case @_mCodingSM.next_state(aBuf[pos])
        when :Error
          @_mState = :NotMe
          break
        when :ItsMe
          @_mState = :FoundIt
          break
        when :Start
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen >= 2
        end
      end

      if get_state() == :Detecting && (get_confidence() > SHORTCUT_THRESHOLD)
        @_mState = :FoundIt
      end
      get_state()
    end

    # Confidence estimate: with fewer than six multi-byte characters it is
    # 1.0 - 0.99 * ONE_CHAR_PROB**count; from six onwards it is a flat 0.99.
    def get_confidence
      return 0.99 unless @_mNumOfMBChar < 6

      unlike = 0.99
      @_mNumOfMBChar.times { unlike *= ONE_CHAR_PROB }
      1.0 - unlike
    end
  end
end
@@ -0,0 +1,209 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require "EscCharSetProber"
30
+ require "MBCSGroupProber"
31
+ require "SBCSGroupProber"
32
+ require "Latin1Prober"
33
+ require "singleton"
34
+
35
module UniversalDetector

  class << self
    # Returns only the detected encoding name for +data+ (nil when nothing
    # was detected).
    def encoding(data)
      chardet(data)['encoding']
    end

    # Runs the shared Detector over +data+ and returns its result hash,
    # {"encoding" => String or nil, "confidence" => Float}.
    #
    # NOTE(review): this uses the single Detector instance, so concurrent
    # calls share state.
    def chardet(data)
      u = UniversalDetector::Detector.instance
      u.reset()
      u.feed(data)
      u.close()
      u.result
    end
  end

  DEBUG = nil

  # Integer state constants. The code in this module compares against
  # symbols (:FoundIt, :NotMe, ...) instead. "Detectiong" is a historical
  # misspelling kept so that existing references keep resolving.
  Detectiong = 0
  FoundIt = 1
  NotMe = 2

  Start = 0
  Error = 1
  ItsMe = 2

  # A prober's confidence must exceed this value for Detector#close to
  # report its charset.
  MINIMUM_THRESHOLD = 0.20
  PureAscii = 0
  EscAscii = 1
  Highbyte = 2

  SHORTCUT_THRESHOLD = 0.95

  # Top-level detector: checks for a byte-order mark, classifies the input
  # as pure ASCII, escape-based or high-byte, and delegates to the matching
  # probers.
  class Detector

    include Singleton

    # Byte-order marks as binary strings, paired with the encoding they
    # announce. Order matters: the 4-byte marks must be tried before the
    # 2-byte marks that are their prefixes.
    BOM_ENCODINGS = [
      [[0xEF, 0xBB, 0xBF],       "UTF-8"],
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"],
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"],
      [[0xFF, 0xFE],             "UTF-16LE"],
      [[0xFE, 0xFF],             "UTF-16BE"]
    ].map { |bom, name| [bom.pack("C*"), name] }.freeze

    # Result hash of the last detection run.
    attr_reader :result

    def initialize
      @_highBitDetector = /[\x80-\xFF]/n
      @_escDetector = /\033|~\{/n
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset
    end

    # Clears the result and all per-run state, and resets every prober that
    # has already been created.
    def reset
      @result = {"encoding"=> nil, "confidence"=> 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = :PureAscii
      @_mLastChar = ""
      @_mEscCharSetProber.reset if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset }
    end

    # Feeds one chunk of input. Does nothing when detection has finished or
    # the chunk is empty.
    #
    # On the first chunk a byte-order mark decides the encoding outright.
    # Otherwise the chunk is handed to the escape prober or to the
    # multi-byte/single-byte/Latin-1 probers, depending on what kind of
    # bytes have been seen so far.
    def feed(data)
      return if @done || data.empty?

      # Byte-level checks run on a binary copy: slicing and regexp matching
      # on the caller's string would work on characters and can raise
      # Encoding::CompatibilityError for non-ASCII UTF-8 input.
      bytes = data.dup.force_encoding(Encoding::BINARY)

      unless @_mGotData
        # If the data starts with a BOM, we know it is UTF.
        BOM_ENCODINGS.each do |bom, name|
          if bytes.start_with?(bom)
            @result = {"encoding"=> name, "confidence"=> 1.0}
            break
          end
        end
      end
      @_mGotData = true
      if @result["encoding"] && @result["confidence"] > 0.0
        @done = true
        return
      end

      if @_mInputState == :PureAscii
        if bytes =~ @_highBitDetector
          @_mInputState = :Highbyte
        elsif (@_mLastChar + bytes) =~ @_escDetector
          # The previous chunk's last byte is prepended so that a "~{"
          # split across two chunks is still recognised.
          @_mInputState = :EscAscii
        end
      end

      @_mLastChar = bytes[-1, 1]
      if @_mInputState == :EscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new
        if @_mEscCharSetProber.feed(data) == :FoundIt
          @result = {"encoding"=> @_mEscCharSetProber.get_charset_name() ,"confidence"=> @_mEscCharSetProber.get_confidence()}
          @done = true
        end
      elsif @_mInputState == :Highbyte
        if @_mCharSetProbers.empty?
          @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
        end
        @_mCharSetProbers.each do |prober|
          if prober.feed(data) == :FoundIt
            @result = {"encoding"=> prober.get_charset_name(), "confidence"=> prober.get_confidence()}
            @done = true
            break
          end
        end
      end
    end #feed

    # Finishes a detection run. If no prober has already decided, pure
    # ASCII input is reported as "ascii"; for high-byte input the prober
    # with the highest confidence wins, provided it exceeds
    # MINIMUM_THRESHOLD. Otherwise the result is left unchanged.
    def close
      return if @done
      unless @_mGotData
        if DEBUG
          p("no data received!\n")
        end
        return
      end
      @done = true

      if @_mInputState == :PureAscii
        @result = {"encoding" => "ascii", "confidence" => 1.0}
        return @result
      end

      if @_mInputState == :Highbyte
        maxProberConfidence = 0.0
        maxProber = nil
        @_mCharSetProbers.each do |prober|
          next unless prober
          proberConfidence = prober.get_confidence()
          if proberConfidence > maxProberConfidence
            maxProberConfidence = proberConfidence
            maxProber = prober
          end
        end
        if maxProber && (maxProberConfidence > MINIMUM_THRESHOLD)
          @result = {"encoding" => maxProber.get_charset_name(),
                     "confidence" => maxProber.get_confidence()}
          return @result
        end
      end #if

      if DEBUG
        p("no probers hit minimum threshhold\n")
        @_mCharSetProbers.each do |prober|
          next unless prober
          p("%s confidence = %s\n" % [prober.get_charset_name(), prober.get_confidence()])
        end
      end
    end #close
  end #class

end #module
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chardet2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jan Xie
9
+ - Felipe Tanus
10
+ - Hui
11
+ autorequire: UniversalDetector
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2013-05-17 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description:
17
+ email:
18
+ - jan.h.xie@gmail.com
19
+ executables: []
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - lib/MBCSSM.rb
24
+ - lib/MultiByteCharSetProber.rb
25
+ - lib/JapaneseContextAnalysis.rb
26
+ - lib/LangCyrillicModel.rb
27
+ - lib/EUCKRFreq.rb
28
+ - lib/GB2312Freq.rb
29
+ - lib/EUCKRProber.rb
30
+ - lib/CodingStateMachine.rb
31
+ - lib/LangHungarianModel.rb
32
+ - lib/HebrewProber.rb
33
+ - lib/Big5Prober.rb
34
+ - lib/CharSetGroupProber.rb
35
+ - lib/SingleByteCharSetProber.rb
36
+ - lib/EUCTWFreq.rb
37
+ - lib/MBCSGroupProber.rb
38
+ - lib/SBCSGroupProber.rb
39
+ - lib/LangBulgarianModel.rb
40
+ - lib/SJISProber.rb
41
+ - lib/Big5Freq.rb
42
+ - lib/UniversalDetector.rb
43
+ - lib/CharDistributionAnalysis.rb
44
+ - lib/UTF8Prober.rb
45
+ - lib/Latin1Prober.rb
46
+ - lib/ESCSM.rb
47
+ - lib/EscCharSetProber.rb
48
+ - lib/JISFreq.rb
49
+ - lib/EUCJPProber.rb
50
+ - lib/EUCTWProber.rb
51
+ - lib/LangGreekModel.rb
52
+ - lib/LangHebrewModel.rb
53
+ - lib/GB2312Prober.rb
54
+ - lib/LangThaiModel.rb
55
+ - lib/CharSetProber.rb
56
+ - COPYING
57
+ - README.markdown
58
+ homepage: https://github.com/janx/chardet
59
+ licenses: []
60
+ post_install_message:
61
+ rdoc_options: []
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements: []
77
+ rubyforge_project:
78
+ rubygems_version: 1.8.23
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Character encoding auto-detection in Ruby, compatible with 1.9/2.0. Based
82
+ on Mark Pilgrim's Python port and Hui's ruby port.
83
+ test_files: []