chardet2 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,155 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module Enumerable
  # Folds the collection into a single value.
  #
  # Starting from +res+, every element is handed to the block together with
  # the running value; whatever the block returns becomes the new running
  # value. The final running value is returned (+res+ itself when the
  # collection is empty).
  def reduceBlock(res)
    accumulator = res
    each do |element|
      accumulator = yield(accumulator, element)
    end
    accumulator
  end
end
38
+
39
+ module UniversalDetector
40
# Number of frequency categories; used as the size of the per-prober
# frequency counter, which is indexed by the values stored in
# Latin1ClassModel (0..3).
FREQ_CAT_NUM = 4

# Character classes assigned to byte values by Latin1_CharToClass.
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes

# Lookup table from a byte value (0x00 - 0xFF) to its character class.
# Frozen because it is shared, read-only data.
Latin1_CharToClass = [
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
].freeze

# Likelihood of one character class following another, stored as a flat
# CLASS_NUM x CLASS_NUM matrix: row = previous class, column = current
# class, looked up as (previous * CLASS_NUM) + current.
#
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = [
  # UDF OTH ASC ASS ACV ACO ASV ASO
  0, 0, 0, 0, 0, 0, 0, 0, # UDF
  0, 3, 3, 3, 3, 3, 3, 3, # OTH
  0, 3, 3, 3, 3, 3, 3, 3, # ASC
  0, 3, 3, 3, 1, 1, 3, 3, # ASS
  0, 3, 3, 3, 1, 2, 1, 2, # ACV
  0, 3, 3, 3, 3, 3, 3, 3, # ACO
  0, 3, 1, 3, 1, 1, 1, 3, # ASV
  0, 3, 1, 3, 1, 1, 3, 3, # ASO
].freeze
102
+
103
# Prober that scores input as windows-1252 text by classifying each byte
# (Latin1_CharToClass) and counting how likely each transition between
# consecutive character classes is (Latin1ClassModel).
class Latin1Prober < CharSetProber
  def initialize
    super
    reset
  end

  # Clears the transition statistics, then lets the superclass reset its
  # own state.
  def reset
    @_mLastCharClass = OTH
    @_mFreqCounter = [0] * FREQ_CAT_NUM
    super
  end

  # Returns the name of the charset this prober reports.
  def get_charset_name
    "windows-1252"
  end

  # Feeds a chunk of input to the prober and returns the current state.
  #
  # Every byte is mapped to a character class; the transition from the
  # previous class is looked up in Latin1ClassModel and tallied. An illegal
  # transition (frequency 0) marks the prober as :NotMe and stops
  # processing the rest of the chunk.
  #
  # aBuf is passed through filter_with_english_letters first (defined
  # outside this class; its return type is not visible here, so both a
  # String and a collection of bytes/strings are accepted).
  def feed(aBuf)
    each_input_byte(filter_with_english_letters(aBuf)) do |byte|
      char_class = Latin1_CharToClass[byte]
      freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + char_class]
      if freq == 0
        @_mState = :NotMe
        break
      end
      @_mFreqCounter[freq] += 1
      @_mLastCharClass = char_class
    end

    get_state
  end

  # Returns the confidence (a Float) that the input is windows-1252.
  #
  # 0.01 is returned once the prober has ruled itself out. Otherwise the
  # share of "very likely" transitions is reduced by 20 times the share of
  # "very unlikely" ones, clamped at 0.0, and finally halved.
  def get_confidence
    return 0.01 if get_state == :NotMe

    total = @_mFreqCounter.reduce(0, :+)
    confidence =
      if total < 0.01
        0.0
      else
        # Float arithmetic on purpose: Integer / Integer would truncate the
        # "very likely" ratio to 0 or 1.
        (@_mFreqCounter[3] - (@_mFreqCounter[1] * 20.0)) / total
      end
    confidence = 0.0 if confidence < 0.0

    # lower the confidence of latin1 so that other more accurate detector
    # can take priority.
    confidence * 0.5
  end

  private

  # Yields each byte value (Integer) of buf. buf may be a String (or any
  # object responding to each_byte) or a collection whose elements are
  # Integers (taken as byte values) or strings (every byte is yielded).
  def each_input_byte(buf)
    if buf.respond_to?(:each_byte)
      buf.each_byte { |byte| yield byte }
    else
      buf.each do |item|
        if item.is_a?(Integer)
          yield item
        else
          item.to_s.each_byte { |byte| yield byte }
        end
      end
    end
  end
end
155
+ end
@@ -0,0 +1,57 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'UTF8Prober'
32
+ require 'SJISProber'
33
+ require 'EUCJPProber'
34
+ require 'GB2312Prober'
35
+ require 'EUCKRProber'
36
+ require 'Big5Prober'
37
+ require 'EUCTWProber'
38
+
39
module UniversalDetector
  # Group prober whose members are the multi-byte charset probers listed in
  # #initialize.
  class MBCSGroupProber < CharSetGroupProber

    # The member probers, in the order they were created.
    attr_reader :mProbers

    # Runs the superclass initializer, creates one instance of every member
    # prober class (in the listed order), then resets the group.
    def initialize
      super
      prober_classes = [
        UTF8Prober,
        SJISProber,
        EUCJPProber,
        GB2312Prober,
        EUCKRProber,
        Big5Prober,
        EUCTWProber
      ]
      @mProbers = prober_classes.map { |prober_class| prober_class.new }
      reset
    end
  end
end