tmail 1.2.3.1 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. data/README +10 -0
  2. data/ext/tmailscanner/tmail/tmailscanner.c +39 -8
  3. data/lib/tmail.rb +1 -0
  4. data/lib/tmail/address.rb +6 -40
  5. data/lib/tmail/attachments.rb +41 -22
  6. data/lib/tmail/encode.rb +9 -0
  7. data/lib/tmail/header.rb +5 -3
  8. data/lib/tmail/interface.rb +40 -3
  9. data/lib/tmail/mail.rb +3 -3
  10. data/lib/tmail/mailbox.rb +3 -2
  11. data/lib/tmail/net.rb +3 -1
  12. data/lib/tmail/parser.rb +204 -620
  13. data/lib/tmail/parser.y +38 -3
  14. data/lib/tmail/quoting.rb +38 -1
  15. data/lib/tmail/utils.rb +28 -4
  16. data/lib/tmail/vendor/rchardet-1.3/COPYING +504 -0
  17. data/lib/tmail/vendor/rchardet-1.3/README +12 -0
  18. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet.rb +67 -0
  19. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5freq.rb +927 -0
  20. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5prober.rb +42 -0
  21. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +237 -0
  22. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +112 -0
  23. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetprober.rb +75 -0
  24. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +64 -0
  25. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/constants.rb +42 -0
  26. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +90 -0
  27. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escsm.rb +244 -0
  28. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +88 -0
  29. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrfreq.rb +596 -0
  30. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrprober.rb +42 -0
  31. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwfreq.rb +430 -0
  32. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwprober.rb +42 -0
  33. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312freq.rb +474 -0
  34. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312prober.rb +42 -0
  35. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +289 -0
  36. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jisfreq.rb +570 -0
  37. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +229 -0
  38. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langbulgarianmodel.rb +229 -0
  39. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langcyrillicmodel.rb +330 -0
  40. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langgreekmodel.rb +227 -0
  41. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhebrewmodel.rb +202 -0
  42. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhungarianmodel.rb +226 -0
  43. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langthaimodel.rb +201 -0
  44. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +147 -0
  45. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +89 -0
  46. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +47 -0
  47. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +542 -0
  48. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +124 -0
  49. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +58 -0
  50. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +88 -0
  51. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +166 -0
  52. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +87 -0
  53. data/lib/tmail/version.rb +1 -1
  54. data/setup.rb +2 -2
  55. data/test/fixtures/apple_unquoted_content_type +44 -0
  56. data/test/fixtures/inline_attachment.txt +2095 -0
  57. data/test/fixtures/iso_8859_1_email_without_encoding_and_message_id.txt +16 -0
  58. data/test/fixtures/mailbox.zip +0 -0
  59. data/test/fixtures/marked_as_iso_8859_1_but_it_is_utf_8.txt +33 -0
  60. data/test/fixtures/marked_as_utf_8_but_it_is_iso_8859_1.txt +56 -0
  61. data/test/fixtures/raw_email_bad_time +62 -0
  62. data/test/fixtures/raw_email_double_at_in_header +14 -0
  63. data/test/fixtures/raw_email_only_attachment +17 -0
  64. data/test/fixtures/raw_email_string_in_date_field +17 -0
  65. data/test/fixtures/raw_email_trailing_dot +21 -0
  66. data/test/fixtures/raw_email_with_quoted_attachment_filename +60 -0
  67. data/test/fixtures/raw_email_with_wrong_splitted_multibyte_encoded_word_subject +15 -0
  68. data/test/fixtures/the_only_part_is_a_word_document.txt +425 -0
  69. data/test/fixtures/unquoted_filename_in_attachment +177 -0
  70. data/test/test_address.rb +114 -92
  71. data/test/test_attachments.rb +84 -1
  72. data/test/test_encode.rb +54 -0
  73. data/test/test_header.rb +60 -2
  74. data/test/test_mail.rb +22 -15
  75. data/test/test_mbox.rb +12 -3
  76. data/test/test_port.rb +13 -9
  77. data/test/test_quote.rb +9 -0
  78. data/tmail.gemspec +34 -0
  79. metadata +68 -167
  80. data/MANIFEST +0 -191
  81. data/log/BugTrackingLog.txt +0 -1231
  82. data/log/Changelog.txt +0 -534
  83. data/log/Fixme.txt +0 -6
  84. data/log/Testlog.txt +0 -2340
  85. data/log/Todo.txt +0 -30
  86. data/log/fixme.rdoc +0 -6
  87. data/meta/MANIFEST +0 -128
  88. data/meta/VERSION +0 -1
  89. data/meta/project.yaml +0 -30
  90. data/meta/unixname +0 -1
  91. data/sample/bench_base64.rb +0 -48
  92. data/sample/data/multipart +0 -23
  93. data/sample/data/normal +0 -29
  94. data/sample/data/sendtest +0 -5
  95. data/sample/data/simple +0 -14
  96. data/sample/data/test +0 -27
  97. data/sample/extract-attachements.rb +0 -33
  98. data/sample/from-check.rb +0 -26
  99. data/sample/multipart.rb +0 -26
  100. data/sample/parse-bench.rb +0 -68
  101. data/sample/parse-test.rb +0 -19
  102. data/sample/sendmail.rb +0 -94
  103. data/site/contributing/index.html +0 -183
  104. data/site/css/clean.css +0 -27
  105. data/site/css/layout.css +0 -31
  106. data/site/css/style.css +0 -60
  107. data/site/download/index.html +0 -61
  108. data/site/img/envelope.jpg +0 -0
  109. data/site/img/mailman.gif +0 -0
  110. data/site/img/stamp-sm.jpg +0 -0
  111. data/site/img/stamp.jpg +0 -0
  112. data/site/img/stampborder.jpg +0 -0
  113. data/site/img/tfire.jpg +0 -0
  114. data/site/img/tmail.png +0 -0
  115. data/site/index.html +0 -270
  116. data/site/js/jquery.js +0 -31
  117. data/site/log/Changelog.xsl +0 -33
  118. data/site/log/changelog.xml +0 -1677
  119. data/site/outdated/BUGS +0 -3
  120. data/site/outdated/DEPENDS +0 -1
  121. data/site/outdated/Incompatibilities +0 -89
  122. data/site/outdated/Incompatibilities.ja +0 -102
  123. data/site/outdated/NEWS +0 -9
  124. data/site/outdated/README.ja +0 -73
  125. data/site/outdated/doc.ja/address.html +0 -275
  126. data/site/outdated/doc.ja/basics.html +0 -405
  127. data/site/outdated/doc.ja/config.html +0 -49
  128. data/site/outdated/doc.ja/details.html +0 -146
  129. data/site/outdated/doc.ja/index.html +0 -39
  130. data/site/outdated/doc.ja/mail.html +0 -793
  131. data/site/outdated/doc.ja/mailbox.html +0 -265
  132. data/site/outdated/doc.ja/port.html +0 -95
  133. data/site/outdated/doc.ja/tmail.html +0 -58
  134. data/site/outdated/doc.ja/usage.html +0 -202
  135. data/site/outdated/rdd/address.rrd.m +0 -229
  136. data/site/outdated/rdd/basics.rd.m +0 -275
  137. data/site/outdated/rdd/config.rrd.m +0 -26
  138. data/site/outdated/rdd/details.rd.m +0 -117
  139. data/site/outdated/rdd/index.rhtml.m +0 -54
  140. data/site/outdated/rdd/mail.rrd.m +0 -701
  141. data/site/outdated/rdd/mailbox.rrd.m +0 -228
  142. data/site/outdated/rdd/port.rrd.m +0 -69
  143. data/site/outdated/rdd/tmail.rrd.m +0 -33
  144. data/site/outdated/rdd/usage.rd.m +0 -247
  145. data/site/quickstart/index.html +0 -69
  146. data/site/quickstart/quickstart.html +0 -52
  147. data/site/quickstart/usage.html +0 -193
  148. data/site/reference/address.html +0 -247
  149. data/site/reference/config.html +0 -30
  150. data/site/reference/index.html +0 -101
  151. data/site/reference/mail.html +0 -726
  152. data/site/reference/mailbox.html +0 -245
  153. data/site/reference/port.html +0 -75
  154. data/site/reference/tmail.html +0 -35
  155. data/work/script/make +0 -26
  156. data/work/script/rdoc +0 -39
  157. data/work/script/setup +0 -1616
  158. data/work/script/test +0 -30
@@ -0,0 +1,42 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Communicator client code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Multi-byte prober specialised for the Big5 encoding.
  class Big5Prober < MultiByteCharSetProber
    # Installs the Big5 coding state machine and the Big5 character
    # distribution analyser, then calls +reset+.
    def initialize
      super
      @_mCodingSM = CodingStateMachine.new(Big5SMModel)
      @_mDistributionAnalyzer = Big5DistributionAnalysis.new
      reset
    end

    # Name of the charset this prober reports.
    def get_charset_name
      "Big5"
    end
  end
end
@@ -0,0 +1,237 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Communicator client code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # got_enough_data reports true once more than this many two-byte
  # characters have been counted.
  ENOUGH_DATA_THRESHOLD = 1024
  # Upper bound returned by get_confidence (never claims 100% certainty).
  SURE_YES = 0.99
  # Confidence returned when no usable character has been seen.
  SURE_NO = 0.01

  # Base class for character-distribution analysis of two-byte encodings.
  #
  # Subclasses supply a frequency table, its size and a typical distribution
  # ratio, and override get_order to map a two-byte character to an index
  # ("order") into that table.
  class CharDistributionAnalysis
    def initialize
      @_mCharToFreqOrder = nil # Mapping table to get frequency order from char order (see get_order)
      @_mTableSize = nil # Size of above table
      @_mTypicalDistributionRatio = nil # Per-language constant used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
      reset
    end

    # Reset the analyser, clearing all counters.
    def reset
      @_mDone = false # If this flag is set to true, detection is done and a conclusion has been made
      @_mTotalChars = 0 # Total characters encountered
      @_mFreqChars = 0 # The number of characters whose frequency order is less than 512
    end

    # Feed one character of known byte length.
    #
    # aStr     - the bytes of the character (String).
    # aCharLen - length of the character in bytes; only 2-byte characters
    #            take part in the distribution analysis.
    def feed(aStr, aCharLen)
      order = aCharLen == 2 ? get_order(aStr) : -1
      return unless order >= 0

      @_mTotalChars += 1
      # Only orders inside the table can be looked up; an entry below 512
      # marks a frequently used character.
      if order < @_mTableSize && @_mCharToFreqOrder[order] < 512
        @_mFreqChars += 1
      end
    end

    # Return a confidence in SURE_NO..SURE_YES based on the data seen so far.
    def get_confidence
      # No character in our consideration range was received: negative answer.
      return SURE_NO if @_mTotalChars <= 0

      if @_mTotalChars != @_mFreqChars
        # to_f keeps this a floating-point ratio even if a subclass supplies
        # an Integer distribution ratio.
        r = @_mFreqChars.to_f / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
        return r if r < SURE_YES
      end

      # normalize confidence (we don't want to be 100% sure)
      SURE_YES
    end

    # It is not necessary to receive all data to draw a conclusion; for
    # charset detection a certain amount of data is enough.
    def got_enough_data
      @_mTotalChars > ENOUGH_DATA_THRESHOLD
    end

    # We do not handle characters based on the original encoding string, but
    # convert this encoding string to a number, here called order.
    # This allows multiple encodings of a language to share one frequency
    # table. The base class knows no encoding and always answers -1.
    def get_order(aStr)
      -1
    end

    private

    # Returns the first two bytes of aStr as Integers ([lead, trail]); an
    # entry is nil when the byte is missing.
    #
    # String#unpack is used because String#[] with an Integer index returns
    # a byte only on Ruby 1.8 (a one-character String on later versions).
    # An Array of string pieces is joined first, as SJIS callers may pass one.
    def byte_pair(aStr)
      aStr = aStr[0..1].join if aStr.is_a?(Array)
      aStr.unpack('C2')
    end
  end

  class EUCTWDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = EUCTWCharToFreqOrder
      @_mTableSize = EUCTW_TABLE_SIZE
      @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-TW encoding, we are interested
    # first byte range: 0xc4 -- 0xfe
    # second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless lead && trail && lead >= 0xC4

      94 * (lead - 0xC4) + trail - 0xA1
    end
  end

  class EUCKRDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = EUCKRCharToFreqOrder
      @_mTableSize = EUCKR_TABLE_SIZE
      @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-KR encoding, we are interested
    # first byte range: 0xb0 -- 0xfe
    # second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless lead && trail && lead >= 0xB0

      94 * (lead - 0xB0) + trail - 0xA1
    end
  end

  class GB2312DistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = GB2312CharToFreqOrder
      @_mTableSize = GB2312_TABLE_SIZE
      @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
    end

    # for GB2312 encoding, we are interested
    # first byte range: 0xb0 -- 0xfe
    # second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless lead && trail && lead >= 0xB0 && trail >= 0xA1

      94 * (lead - 0xB0) + trail - 0xA1
    end
  end

  class Big5DistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = Big5CharToFreqOrder
      @_mTableSize = BIG5_TABLE_SIZE
      @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
    end

    # for big5 encoding, we are interested
    # first byte range: 0xa4 -- 0xfe
    # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless lead && trail && lead >= 0xA4

      row = 157 * (lead - 0xA4)
      # Each row holds the 63 low trail bytes (0x40..0x7e) first, then the
      # high ones starting at 0xa1.
      if trail >= 0xA1
        row + trail - 0xA1 + 63
      else
        row + trail - 0x40
      end
    end
  end

  class SJISDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = JISCharToFreqOrder
      @_mTableSize = JIS_TABLE_SIZE
      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
    end

    # for sjis encoding, we are interested
    # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
    # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless lead && trail

      if lead >= 0x81 && lead <= 0x9F
        order = 188 * (lead - 0x81)
      elsif lead >= 0xE0 && lead <= 0xEF
        order = 188 * (lead - 0xE0 + 31)
      else
        return -1
      end

      order += trail - 0x40
      # 0x7f is never a trail byte, so trail bytes above it sit one slot
      # lower in the 188-entry row. (The previous `order =- 1` assigned -1
      # and threw these characters away instead of decrementing.)
      order -= 1 if trail > 0x7F
      order
    end
  end

  class EUCJPDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = JISCharToFreqOrder
      @_mTableSize = JIS_TABLE_SIZE
      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-JP encoding, we are interested
    # first byte range: 0xa0 -- 0xfe
    # second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless lead && trail && lead >= 0xA0

      # A 0xa0 lead byte yields a negative order, which feed ignores.
      94 * (lead - 0xA1) + trail - 0xA1
    end
  end
end
@@ -0,0 +1,112 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Communicator client code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # A prober that delegates to a list of child probers (@_mProbers) and
  # reports the answer of the most confident active child.
  class CharSetGroupProber < CharSetProber
    attr_accessor :_mProbers

    def initialize
      super
      @_mActiveNum = 0
      @_mProbers = []
      @_mBestGuessProber = nil
    end

    # Resets this prober and every child, re-activating all children and
    # forgetting the current best guess.
    def reset
      super
      @_mActiveNum = 0

      @_mProbers.each do |prober|
        next unless prober

        prober.reset
        prober.active = true
        @_mActiveNum += 1
      end
      @_mBestGuessProber = nil
    end

    # Returns the charset name of the best-guess child, computing the best
    # guess first if needed; nil when no child qualifies.
    def get_charset_name
      unless @_mBestGuessProber
        get_confidence
        return nil unless @_mBestGuessProber
      end
      @_mBestGuessProber.get_charset_name
    end

    # Feeds aBuf to every active child and returns this prober's state.
    #
    # A child answering EFoundIt becomes the best guess and moves the whole
    # group to EFoundIt; a child answering ENotMe is deactivated, and when
    # no active child is left the group itself becomes ENotMe.
    def feed(aBuf)
      @_mProbers.each do |prober|
        next unless prober
        next unless prober.active

        st = prober.feed(aBuf)
        next unless st

        if st == EFoundIt
          @_mBestGuessProber = prober
          # Record the definite match; without this the group kept reporting
          # its previous state and the EFoundIt branch of get_confidence
          # could never be taken.
          @_mState = EFoundIt
          return get_state
        elsif st == ENotMe
          prober.active = false
          @_mActiveNum -= 1
          if @_mActiveNum <= 0
            @_mState = ENotMe
            return get_state
          end
        end
      end
      get_state
    end

    # Returns 0.99 / 0.01 when the group state is already decided
    # (EFoundIt / ENotMe); otherwise the highest confidence among the active
    # children, remembering that child as the best guess. Returns 0.0 when
    # no active child reports a confidence above zero.
    def get_confidence()
      st = get_state
      if st == EFoundIt
        return 0.99
      elsif st == ENotMe
        return 0.01
      end

      bestConf = 0.0
      @_mBestGuessProber = nil
      @_mProbers.each do |prober|
        next unless prober

        unless prober.active
          $stderr << "#{prober.get_charset_name()} not active\n" if $debug
          next
        end
        cf = prober.get_confidence
        $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
        if bestConf < cf
          bestConf = cf
          @_mBestGuessProber = prober
        end
      end
      return 0.0 unless @_mBestGuessProber

      bestConf
    end
  end
end
@@ -0,0 +1,75 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Base class for all charset probers. Holds the detection state and
  # provides buffer filters shared by the concrete probers.
  class CharSetProber
    attr_accessor :active

    def initialize
    end

    # Puts the prober back into the EDetecting state.
    def reset
      @_mState = EDetecting
    end

    # Concrete probers return the name of their charset; the base has none.
    def get_charset_name
      nil
    end

    # Concrete probers consume aBuf here; the base implementation does nothing.
    def feed(aBuf)
    end

    # Current detection state (nil until reset has been called).
    def get_state
      @_mState
    end

    def get_confidence
      0.0
    end

    # Returns a copy of aBuf in which every run of 7-bit bytes
    # (0x00..0x7F) is collapsed into a single space.
    #
    # DO NOT USE `gsub!` here: aBuf is later handed to other probers, so it
    # must not be modified in place.
    def filter_high_bit_only(aBuf)
      replace_byte_runs(aBuf, /[\x00-\x7F]+/)
    end

    # Returns a copy of aBuf in which every run of ASCII letters is
    # collapsed into a single space. aBuf itself is left untouched.
    def filter_without_english_letters(aBuf)
      replace_byte_runs(aBuf, /[A-Za-z]+/)
    end

    def filter_with_english_letters(aBuf)
      # TODO
      aBuf
    end

    private

    # Replaces each match of pattern in aBuf with one space and returns the
    # result as a new String.
    #
    # The buffer has an unknown encoding, so on Rubies with encoding-aware
    # strings the match runs on a binary-tagged copy; matching directly would
    # raise ArgumentError for bytes that are invalid in the tagged encoding.
    # The result is re-tagged with aBuf's encoding. Rubies without
    # String#force_encoding already treat strings as bytes.
    def replace_byte_runs(aBuf, pattern)
      return aBuf.gsub(pattern, ' ') unless aBuf.respond_to?(:force_encoding)

      raw = aBuf.dup.force_encoding(Encoding::BINARY)
      raw.gsub(pattern, ' ').force_encoding(aBuf.encoding)
    end
  end
end