chardet2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetProber'
31
+
32
module Enumerable
  # Left fold over the receiver with an explicit seed value.
  #
  # The block is called once per element as (accumulator, element) and its
  # result becomes the accumulator for the next element. When the receiver
  # yields no elements the seed is returned untouched.
  #
  # res - the initial accumulator value.
  #
  # Returns the final accumulator.
  def reduceBlock(res)
    accumulator = res
    each do |element|
      accumulator = yield(accumulator, element)
    end
    accumulator
  end
end
38
+
39
+ module UniversalDetector
40
# Number of frequency categories; matches the four Latin1ClassModel values
# (0..3) that are used as indexes into the per-category counter.
FREQ_CAT_NUM = 4

# Character class identifiers used by both lookup tables below.
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
CLASS_NUM = 8 # total classes

# Maps every byte value (0x00..0xFF) to one of the character classes above.
# Frozen so the shared table cannot be modified by accident at runtime.
Latin1_CharToClass = [
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
].freeze

# Likelihood of one character class following another, stored row-major:
# the entry for (previous, current) lives at previous * CLASS_NUM + current.
#
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = [
  # UDF OTH ASC ASS ACV ACO ASV ASO
  0, 0, 0, 0, 0, 0, 0, 0, # UDF
  0, 3, 3, 3, 3, 3, 3, 3, # OTH
  0, 3, 3, 3, 3, 3, 3, 3, # ASC
  0, 3, 3, 3, 1, 1, 3, 3, # ASS
  0, 3, 3, 3, 1, 2, 1, 2, # ACV
  0, 3, 3, 3, 3, 3, 3, 3, # ACO
  0, 3, 1, 3, 1, 1, 1, 3, # ASV
  0, 3, 1, 3, 1, 1, 3, 3, # ASO
].freeze
102
+
103
# Prober that scores a buffer as single-byte Latin text and reports it as
# "windows-1252".
#
# Every byte is mapped to a character class through Latin1_CharToClass, and
# the transition from the previous class to the current one is looked up in
# Latin1ClassModel (0 = illegal, 1 = very unlikely, 2 = normal,
# 3 = very likely). The per-category counts drive get_confidence.
class Latin1Prober < CharSetProber
  def initialize
    super
    reset
  end

  # Restores the initial scoring state (previous class OTH, all counters
  # zero) and then runs the inherited reset.
  def reset
    @_mLastCharClass = OTH
    @_mFreqCounter = [0] * FREQ_CAT_NUM
    super
  end

  # Returns the name of the charset this prober reports.
  def get_charset_name
    "windows-1252"
  end

  # Feeds a chunk of input into the class-transition model.
  #
  # aBuf - the raw input; it is first passed through the inherited
  #        filter_with_english_letters.
  #
  # Processing stops at the first illegal transition, at which point
  # @_mState is set to :NotMe.
  #
  # Returns whatever the inherited get_state reports afterwards.
  def feed(aBuf)
    aBuf = filter_with_english_letters(aBuf)
    latin1_byte_values(aBuf).each do |byte|
      charClass = Latin1_CharToClass[byte]
      freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
      if freq == 0
        # NOTE(review): presumably get_state reads @_mState; confirm in
        # CharSetProber.
        @_mState = :NotMe
        break
      end
      @_mFreqCounter[freq] += 1
      @_mLastCharClass = charClass
    end

    get_state
  end

  # Computes how confident the prober is that the input is Latin text.
  #
  # Returns 0.01 when the state is :NotMe. Otherwise returns a Float that is
  # never negative: the share of "very likely" transitions minus twenty
  # times the share of "very unlikely" ones, halved at the end.
  def get_confidence
    return 0.01 if get_state == :NotMe

    total = @_mFreqCounter.reduce(0, :+)
    return 0.0 if total.zero?

    # The 20.0 literal forces float arithmetic for the whole expression.
    # Dividing the two Integer counters directly would truncate the positive
    # term to 0 or 1.
    confidence = (@_mFreqCounter[3] - (@_mFreqCounter[1] * 20.0)) / total
    confidence = 0.0 if confidence < 0.0

    # lower the confidence of latin1 so that other more accurate detector
    # can take priority.
    confidence * 0.5
  end

  private

  # Normalizes a buffer into an array of byte values usable as indexes into
  # Latin1_CharToClass.
  #
  # Objects responding to #bytes (Strings) contribute every byte. Any other
  # enumerable is walked element by element: Integers are taken as-is and
  # everything else contributes the first byte of its string form.
  # NOTE(review): the exact return type of filter_with_english_letters is
  # not visible here; both shapes are handled on that assumption.
  def latin1_byte_values(buf)
    return buf.bytes.to_a if buf.respond_to?(:bytes)

    buf.map { |item| item.is_a?(Integer) ? item : item.to_s.getbyte(0) }.compact
  end
end
155
+ end
@@ -0,0 +1,57 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Hui (zhengzhengzheng@gmail.com) - port to Ruby
11
+ # Mark Pilgrim - first port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ require 'UniversalDetector'
30
+ require 'CharSetGroupProber'
31
+ require 'UTF8Prober'
32
+ require 'SJISProber'
33
+ require 'EUCJPProber'
34
+ require 'GB2312Prober'
35
+ require 'EUCKRProber'
36
+ require 'Big5Prober'
37
+ require 'EUCTWProber'
38
+
39
+ module UniversalDetector
40
# Group prober that bundles one instance of each multi-byte charset prober.
class MBCSGroupProber < CharSetGroupProber
  # The prober instances created by the constructor, in creation order.
  attr_reader :mProbers

  # Runs the inherited constructor, instantiates every member prober and
  # then calls reset.
  def initialize
    super
    prober_classes = [
      UTF8Prober,
      SJISProber,
      EUCJPProber,
      GB2312Prober,
      EUCKRProber,
      Big5Prober,
      EUCTWProber
    ]
    @mProbers = prober_classes.map { |prober_class| prober_class.new }
    reset
  end
end
57
+ end