tmail_es 1.2.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGES +83 -0
  3. data/LICENSE +21 -0
  4. data/NOTES +7 -0
  5. data/README +182 -0
  6. data/Rakefile +2 -0
  7. data/ext/Makefile +20 -0
  8. data/ext/tmailscanner/tmail/MANIFEST +4 -0
  9. data/ext/tmailscanner/tmail/depend +1 -0
  10. data/ext/tmailscanner/tmail/extconf.rb +33 -0
  11. data/ext/tmailscanner/tmail/tmailscanner.c +614 -0
  12. data/lib/tmail/Makefile +18 -0
  13. data/lib/tmail/address.rb +392 -0
  14. data/lib/tmail/attachments.rb +65 -0
  15. data/lib/tmail/base64.rb +46 -0
  16. data/lib/tmail/compat.rb +41 -0
  17. data/lib/tmail/config.rb +67 -0
  18. data/lib/tmail/core_extensions.rb +63 -0
  19. data/lib/tmail/encode.rb +590 -0
  20. data/lib/tmail/header.rb +962 -0
  21. data/lib/tmail/index.rb +9 -0
  22. data/lib/tmail/interface.rb +1162 -0
  23. data/lib/tmail/loader.rb +3 -0
  24. data/lib/tmail/mail.rb +578 -0
  25. data/lib/tmail/mailbox.rb +496 -0
  26. data/lib/tmail/main.rb +6 -0
  27. data/lib/tmail/mbox.rb +3 -0
  28. data/lib/tmail/net.rb +250 -0
  29. data/lib/tmail/obsolete.rb +132 -0
  30. data/lib/tmail/parser.rb +1060 -0
  31. data/lib/tmail/parser.y +416 -0
  32. data/lib/tmail/port.rb +379 -0
  33. data/lib/tmail/quoting.rb +164 -0
  34. data/lib/tmail/require_arch.rb +58 -0
  35. data/lib/tmail/scanner.rb +49 -0
  36. data/lib/tmail/scanner_r.rb +261 -0
  37. data/lib/tmail/stringio.rb +280 -0
  38. data/lib/tmail/utils.rb +361 -0
  39. data/lib/tmail/vendor/rchardet-1.3/COPYING +504 -0
  40. data/lib/tmail/vendor/rchardet-1.3/README +12 -0
  41. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5freq.rb +927 -0
  42. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5prober.rb +42 -0
  43. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +238 -0
  44. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +112 -0
  45. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetprober.rb +75 -0
  46. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +64 -0
  47. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/constants.rb +42 -0
  48. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +89 -0
  49. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escsm.rb +244 -0
  50. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +88 -0
  51. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrfreq.rb +596 -0
  52. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrprober.rb +42 -0
  53. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwfreq.rb +430 -0
  54. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwprober.rb +42 -0
  55. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312freq.rb +474 -0
  56. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312prober.rb +42 -0
  57. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +289 -0
  58. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jisfreq.rb +570 -0
  59. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +229 -0
  60. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langbulgarianmodel.rb +229 -0
  61. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langcyrillicmodel.rb +330 -0
  62. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langgreekmodel.rb +227 -0
  63. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhebrewmodel.rb +202 -0
  64. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhungarianmodel.rb +226 -0
  65. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langthaimodel.rb +201 -0
  66. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +147 -0
  67. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +89 -0
  68. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +45 -0
  69. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +542 -0
  70. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +124 -0
  71. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +56 -0
  72. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +88 -0
  73. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +167 -0
  74. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +87 -0
  75. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet.rb +67 -0
  76. data/lib/tmail/version.rb +40 -0
  77. data/lib/tmail.rb +6 -0
  78. data/setup.rb +1482 -0
  79. data/test/extctrl.rb +6 -0
  80. data/test/fixtures/apple_unquoted_content_type +44 -0
  81. data/test/fixtures/inline_attachment.txt +2095 -0
  82. data/test/fixtures/iso_8859_1_email_without_encoding_and_message_id.txt +16 -0
  83. data/test/fixtures/mailbox +414 -0
  84. data/test/fixtures/mailbox.zip +0 -0
  85. data/test/fixtures/mailbox_without_any_from_or_sender +10 -0
  86. data/test/fixtures/mailbox_without_from +11 -0
  87. data/test/fixtures/mailbox_without_return_path +12 -0
  88. data/test/fixtures/marked_as_iso_8859_1_but_it_is_utf_8.txt +33 -0
  89. data/test/fixtures/marked_as_utf_8_but_it_is_iso_8859_1.txt +56 -0
  90. data/test/fixtures/raw_attack_email_with_zero_length_whitespace +29 -0
  91. data/test/fixtures/raw_base64_decoded_string +0 -0
  92. data/test/fixtures/raw_base64_email +83 -0
  93. data/test/fixtures/raw_base64_encoded_string +1 -0
  94. data/test/fixtures/raw_email +14 -0
  95. data/test/fixtures/raw_email10 +20 -0
  96. data/test/fixtures/raw_email11 +34 -0
  97. data/test/fixtures/raw_email12 +32 -0
  98. data/test/fixtures/raw_email13 +29 -0
  99. data/test/fixtures/raw_email2 +114 -0
  100. data/test/fixtures/raw_email3 +70 -0
  101. data/test/fixtures/raw_email4 +59 -0
  102. data/test/fixtures/raw_email5 +19 -0
  103. data/test/fixtures/raw_email6 +20 -0
  104. data/test/fixtures/raw_email7 +66 -0
  105. data/test/fixtures/raw_email8 +47 -0
  106. data/test/fixtures/raw_email9 +28 -0
  107. data/test/fixtures/raw_email_bad_time +62 -0
  108. data/test/fixtures/raw_email_double_at_in_header +14 -0
  109. data/test/fixtures/raw_email_multiple_from +30 -0
  110. data/test/fixtures/raw_email_only_attachment +17 -0
  111. data/test/fixtures/raw_email_quoted_with_0d0a +14 -0
  112. data/test/fixtures/raw_email_reply +32 -0
  113. data/test/fixtures/raw_email_simple +11 -0
  114. data/test/fixtures/raw_email_string_in_date_field +17 -0
  115. data/test/fixtures/raw_email_trailing_dot +21 -0
  116. data/test/fixtures/raw_email_with_bad_date +48 -0
  117. data/test/fixtures/raw_email_with_illegal_boundary +58 -0
  118. data/test/fixtures/raw_email_with_mimepart_without_content_type +94 -0
  119. data/test/fixtures/raw_email_with_multipart_mixed_quoted_boundary +50 -0
  120. data/test/fixtures/raw_email_with_nested_attachment +100 -0
  121. data/test/fixtures/raw_email_with_partially_quoted_subject +14 -0
  122. data/test/fixtures/raw_email_with_quoted_attachment_filename +60 -0
  123. data/test/fixtures/raw_email_with_quoted_illegal_boundary +58 -0
  124. data/test/fixtures/raw_email_with_wrong_splitted_multibyte_encoded_word_subject +15 -0
  125. data/test/fixtures/the_only_part_is_a_word_document.txt +425 -0
  126. data/test/fixtures/unquoted_filename_in_attachment +177 -0
  127. data/test/kcode.rb +14 -0
  128. data/test/temp_test_one.rb +46 -0
  129. data/test/test_address.rb +1216 -0
  130. data/test/test_attachments.rb +133 -0
  131. data/test/test_base64.rb +64 -0
  132. data/test/test_encode.rb +139 -0
  133. data/test/test_header.rb +1021 -0
  134. data/test/test_helper.rb +9 -0
  135. data/test/test_mail.rb +756 -0
  136. data/test/test_mbox.rb +184 -0
  137. data/test/test_port.rb +440 -0
  138. data/test/test_quote.rb +107 -0
  139. data/test/test_scanner.rb +209 -0
  140. data/test/test_utils.rb +36 -0
  141. data/tmail_es.gemspec +35 -0
  142. metadata +257 -0
@@ -0,0 +1,42 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Communicator client code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober for the Big5 charset. It only supplies the Big5-specific
  # collaborators; everything else comes from MultiByteCharSetProber.
  class Big5Prober < MultiByteCharSetProber
    # Installs the Big5 distribution analyser and the coding state machine
    # built from Big5SMModel, then resets the prober.
    def initialize
      super
      @_mDistributionAnalyzer = Big5DistributionAnalysis.new
      @_mCodingSM = CodingStateMachine.new(Big5SMModel)
      reset
    end

    # @return [String] the name of the charset this prober reports
    def get_charset_name
      'Big5'
    end
  end
end
@@ -0,0 +1,238 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Communicator client code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s)
10
+
11
+ # Jeff Hodges
12
+ # Mark Pilgrim - port to Python
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # got_enough_data reports true once more than this many characters were counted.
  ENOUGH_DATA_THRESHOLD = 1024
  # Highest confidence ever reported (we don't want to be 100% sure).
  SURE_YES = 0.99
  # Confidence reported when no character in the consideration range was seen.
  SURE_NO = 0.01

  # Base class for character-distribution analysers.
  #
  # An analyser is fed two-byte characters, maps each to an "order" through
  # get_order, and counts how many of them fall among the 512 most frequent
  # entries of its frequency table. Subclasses provide the table, its size,
  # the typical distribution ratio and an encoding-specific get_order.
  class CharDistributionAnalysis
    def initialize
      # Mapping table to get frequency order from char order (get from get_order)
      @_mCharToFreqOrder = nil
      # Size of the above table
      @_mTableSize = nil
      # Constant which varies from language to language, used in calculating
      # confidence. See
      # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
      # for further detail.
      @_mTypicalDistributionRatio = nil
      reset
    end

    # Reset the analyser, clearing all counters.
    def reset
      # If this flag is set to true, detection is done and a conclusion has been made
      @_mDone = false
      # Total characters encountered
      @_mTotalChars = 0
      # The number of characters whose frequency order is less than 512
      @_mFreqChars = 0
    end

    # Feed one character of known byte length.
    #
    # aStr     - the character's bytes (a String, or an Array of one-byte Strings)
    # aCharLen - byte length of the character; only 2-byte characters are analysed
    def feed(aStr, aCharLen)
      order = aCharLen == 2 ? get_order(aStr) : -1
      return if order < 0

      @_mTotalChars += 1
      # Orders beyond the table still count towards the total, never as frequent.
      return unless order < @_mTableSize

      @_mFreqChars += 1 if 512 > @_mCharToFreqOrder[order]
    end

    # Confidence based on the data seen so far.
    #
    # Returns SURE_NO when nothing usable was fed, otherwise the ratio of
    # frequent to non-frequent characters scaled by the typical distribution
    # ratio, capped at SURE_YES.
    def get_confidence
      return SURE_NO if @_mTotalChars <= 0

      if @_mTotalChars != @_mFreqChars
        r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
        return r if r < SURE_YES
      end

      # normalize confidence (we don't want to be 100% sure)
      SURE_YES
    end

    # It is not necessary to receive all data to draw a conclusion; for charset
    # detection a certain amount of data is enough.
    def got_enough_data
      @_mTotalChars > ENOUGH_DATA_THRESHOLD
    end

    # We do not handle characters based on the original encoding string, but
    # convert this encoding string to a number, here called order. This allows
    # multiple encodings of a language to share one frequency table.
    # The base implementation knows no encoding and always answers -1.
    def get_order(aStr)
      -1
    end

    private

    # Returns the first two bytes of aStr as Integers ([lead, trail]).
    #
    # Accepts a String or an Array of one-byte Strings. Bytes are extracted with
    # unpack so the result does not depend on whether String#[] yields an
    # Integer or a String. A missing byte comes back as nil.
    def byte_pair(aStr)
      aStr = aStr[0..1].join if aStr.is_a?(Array)
      aStr.unpack('C2')
    end
  end

  class EUCTWDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = EUCTWCharToFreqOrder
      @_mTableSize = EUCTW_TABLE_SIZE
      @_mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-TW encoding, we are interested
    #   first  byte range: 0xc4 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless trail && lead >= 0xC4

      94 * (lead - 0xC4) + trail - 0xA1
    end
  end

  class EUCKRDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = EUCKRCharToFreqOrder
      @_mTableSize = EUCKR_TABLE_SIZE
      @_mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-KR encoding, we are interested
    #   first  byte range: 0xb0 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless trail && lead >= 0xB0

      94 * (lead - 0xB0) + trail - 0xA1
    end
  end

  class GB2312DistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = GB2312CharToFreqOrder
      @_mTableSize = GB2312_TABLE_SIZE
      @_mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
    end

    # for GB2312 encoding, we are interested
    #   first  byte range: 0xb0 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless trail && lead >= 0xB0 && trail >= 0xA1

      94 * (lead - 0xB0) + trail - 0xA1
    end
  end

  class Big5DistributionAnalysis < CharDistributionAnalysis
    def initialize
      super
      @_mCharToFreqOrder = Big5CharToFreqOrder
      @_mTableSize = BIG5_TABLE_SIZE
      @_mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
    end

    # for big5 encoding, we are interested
    #   first  byte range: 0xa4 -- 0xfe
    #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless trail && lead >= 0xA4

      row = 157 * (lead - 0xA4)
      if trail >= 0xA1
        # the upper trail range follows the 63 slots of 0x40 -- 0x7e
        row + trail - 0xA1 + 63
      else
        row + trail - 0x40
      end
    end
  end

  class SJISDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = JISCharToFreqOrder
      @_mTableSize = JIS_TABLE_SIZE
      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
    end

    # for sjis encoding, we are interested
    #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
    #   second byte range: 0x40 -- 0x7e , 0x81 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless trail

      if lead >= 0x81 && lead <= 0x9F
        order = 188 * (lead - 0x81)
      elsif lead >= 0xE0 && lead <= 0xEF
        order = 188 * (lead - 0xE0 + 31)
      else
        return -1
      end

      order += trail - 0x40
      # 0x7f is never a trail byte, so everything above it sits one slot lower.
      # (The previous `order =- 1` assigned -1 and threw these characters away.)
      order -= 1 if trail > 0x7F
      order
    end
  end

  class EUCJPDistributionAnalysis < CharDistributionAnalysis
    def initialize
      super()
      @_mCharToFreqOrder = JISCharToFreqOrder
      @_mTableSize = JIS_TABLE_SIZE
      @_mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
    end

    # for euc-JP encoding, we are interested
    #   first  byte range: 0xa0 -- 0xfe
    #   second byte range: 0xa1 -- 0xfe
    # no validation needed here. State machine has done that
    def get_order(aStr)
      lead, trail = byte_pair(aStr)
      return -1 unless trail && lead >= 0xA0

      # A 0xa0 lead byte yields a negative order, which feed ignores.
      94 * (lead - 0xA1) + trail - 0xA1
    end
  end
end
@@ -0,0 +1,112 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Communicator client code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # A prober that delegates to a list of child probers and reports the
  # answer of whichever child is the best guess.
  class CharSetGroupProber < CharSetProber
    attr_accessor :_mProbers

    def initialize
      super
      @_mActiveNum = 0
      @_mProbers = []
      @_mBestGuessProber = nil
    end

    # Resets this prober and every child, re-activating all children.
    def reset
      super
      @_mActiveNum = 0

      @_mProbers.each do |prober|
        next unless prober

        prober.reset
        prober.active = true
        @_mActiveNum += 1
      end
      @_mBestGuessProber = nil
    end

    # Charset name of the best-guess child, or nil when no child qualifies.
    def get_charset_name
      unless @_mBestGuessProber
        # get_confidence selects the best guess as a side effect
        get_confidence
        return nil unless @_mBestGuessProber
      end
      @_mBestGuessProber.get_charset_name
    end

    # Feeds aBuf to every active child.
    #
    # A child answering EFoundIt becomes the best guess and ends the scan; a
    # child answering ENotMe is deactivated, and once no child is left active
    # the whole group answers ENotMe.
    #
    # Returns this prober's state.
    def feed(aBuf)
      @_mProbers.each do |prober|
        next unless prober && prober.active

        st = prober.feed(aBuf)
        next unless st

        if st == EFoundIt
          @_mBestGuessProber = prober
          # Record the result: without this the group kept reporting its old
          # state and the EFoundIt branch of get_confidence was unreachable.
          @_mState = EFoundIt
          return get_state
        elsif st == ENotMe
          prober.active = false
          @_mActiveNum -= 1
          if @_mActiveNum <= 0
            @_mState = ENotMe
            return get_state
          end
        end
      end
      get_state
    end

    # Confidence of the group.
    #
    # 0.99 once a child has found the charset, 0.01 once every child has given
    # up; otherwise the highest confidence among the active children (which
    # also becomes the best guess), or 0.0 when no child scores above zero.
    def get_confidence
      st = get_state
      if st == EFoundIt
        return 0.99
      elsif st == ENotMe
        return 0.01
      end

      bestConf = 0.0
      @_mBestGuessProber = nil
      @_mProbers.each do |prober|
        next unless prober

        unless prober.active
          $stderr << "#{prober.get_charset_name()} not active\n" if $debug
          next
        end
        cf = prober.get_confidence
        $stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
        if bestConf < cf
          bestConf = cf
          @_mBestGuessProber = prober
        end
      end
      return 0.0 unless @_mBestGuessProber

      bestConf
    end
  end
end
@@ -0,0 +1,75 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Common base of the charset probers: holds the detection state and a few
  # buffer-filtering helpers. Every query method here returns a neutral
  # default that concrete probers override.
  class CharSetProber
    attr_accessor :active

    def initialize; end

    # Puts the prober back into the detecting state.
    def reset
      @_mState = EDetecting
    end

    # The base prober has no charset to report.
    def get_charset_name
      nil
    end

    # No-op in the base class.
    def feed(aBuf); end

    # Current detection state (nil until reset has been called).
    def get_state
      @_mState
    end

    # The base prober has no opinion.
    def get_confidence
      0.0
    end

    # Returns a copy of aBuf with every run of bytes 0x00-0x7F collapsed to a
    # single space.
    #
    # DO NOT USE `gsub!` here: the same buffer is handed to other probers
    # afterwards, and an in-place substitution would strip the data out from
    # under them. `gsub` builds a new string and leaves aBuf alone.
    def filter_high_bit_only(aBuf)
      aBuf.gsub(/([\x00-\x7F])+/, ' ')
    end

    # Returns a copy of aBuf with every run of ASCII letters collapsed to a
    # single space.
    def filter_without_english_letters(aBuf)
      aBuf.gsub(/([A-Za-z])+/, ' ')
    end

    # TODO: not implemented; hands the buffer back untouched.
    def filter_with_english_letters(aBuf)
      aBuf
    end
  end
end
@@ -0,0 +1,64 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Table-driven state machine that validates a byte stream against one
  # encoding model.
  #
  # The model is a Hash with the keys 'classTable' (byte value -> byte class),
  # 'charLenTable' (byte class -> character length), 'stateTable' (flattened
  # state/class transition table), 'classFactor' (row width of that table) and
  # 'name'.
  class CodingStateMachine
    # Flag owned by the prober driving this machine (EscCharSetProber reads
    # and writes it to switch individual machines off).
    attr_accessor :active

    # sm - the encoding model Hash described above
    def initialize(sm)
      @_mModel = sm
      @_mCurrentBytePos = 0
      @_mCurrentCharLen = 0
      reset
    end

    # Returns the machine to its start state.
    def reset
      @_mCurrentState = EStart
    end

    # Advances the machine by one byte and returns the new state.
    #
    # c - a String whose first byte is consumed
    def next_state(c)
      # for each byte we get its class
      # if it is first byte, we also get byte length
      byteCls = @_mModel['classTable'][first_byte(c)]
      if @_mCurrentState == EStart
        @_mCurrentBytePos = 0
        @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
      end
      # from byte's class and stateTable, we get its next state
      @_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
      @_mCurrentBytePos += 1
      @_mCurrentState
    end

    # Byte length of the character currently being read.
    def get_current_charlen
      @_mCurrentCharLen
    end

    # Name of the encoding model this machine runs.
    def get_coding_state_machine
      @_mModel['name']
    end

    private

    # Numeric value of the first byte of c.
    #
    # String#[] only yields an Integer on Ruby 1.8; unpack gives the byte value
    # on every version. Non-String arguments keep the previous `c[0]` lookup.
    def first_byte(c)
      c.is_a?(String) ? c.unpack('C').first : c[0]
    end
  end
end
@@ -0,0 +1,42 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # When true, CharSetGroupProber#get_confidence writes per-prober
  # diagnostics to $stderr.
  $debug = false

  # Prober states (what a prober's get_state / feed reports).
  EDetecting, EFoundIt, ENotMe = 0, 1, 2

  # Coding state machine states (what CodingStateMachine#next_state reports).
  EStart, EError, EItsMe = 0, 1, 2

  # NOTE(review): not referenced in the files visible here; presumably the
  # confidence above which a prober may stop early -- confirm against callers.
  SHORTCUT_THRESHOLD = 0.95
end
@@ -0,0 +1,89 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober that runs four coding state machines (HZ, ISO-2022-CN,
  # ISO-2022-JP, ISO-2022-KR models) side by side over the input and reports
  # the first one that recognises it.
  class EscCharSetProber < CharSetProber
    def initialize
      super()
      models = [HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel]
      @_mCodingSM = models.map { |model| CodingStateMachine.new(model) }
      reset
    end

    # Re-activates and resets every state machine and forgets any detection.
    def reset
      super()
      @_mCodingSM.each do |sm|
        next unless sm

        sm.active = true
        sm.reset
      end
      @_mActiveSM = @_mCodingSM.length
      @_mDetectedCharset = nil
    end

    # Name of the detected charset, or nil while nothing has been detected.
    def get_charset_name
      @_mDetectedCharset
    end

    # 0.99 once a charset has been detected, 0.00 otherwise.
    def get_confidence
      @_mDetectedCharset ? 0.99 : 0.00
    end

    # Pushes aBuf byte by byte through every active state machine.
    #
    # A machine that hits EError is switched off; when none is left the prober
    # answers ENotMe. The first machine reaching EItsMe settles the charset
    # and the prober answers EFoundIt.
    #
    # Returns this prober's state.
    def feed(aBuf)
      aBuf.each_byte do |byte|
        ch = byte.chr
        @_mCodingSM.each do |sm|
          next unless sm && sm.active

          case sm.next_state(ch)
          when EError
            sm.active = false
            @_mActiveSM -= 1
            if @_mActiveSM <= 0
              @_mState = ENotMe
              return get_state
            end
          when EItsMe
            @_mState = EFoundIt
            @_mDetectedCharset = sm.get_coding_state_machine
            return get_state
          end
        end
      end
      get_state
    end
  end
end