tmail_es 1.2.7.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGES +83 -0
  3. data/LICENSE +21 -0
  4. data/NOTES +7 -0
  5. data/README +182 -0
  6. data/Rakefile +2 -0
  7. data/ext/Makefile +20 -0
  8. data/ext/tmailscanner/tmail/MANIFEST +4 -0
  9. data/ext/tmailscanner/tmail/depend +1 -0
  10. data/ext/tmailscanner/tmail/extconf.rb +33 -0
  11. data/ext/tmailscanner/tmail/tmailscanner.c +614 -0
  12. data/lib/tmail/Makefile +18 -0
  13. data/lib/tmail/address.rb +392 -0
  14. data/lib/tmail/attachments.rb +65 -0
  15. data/lib/tmail/base64.rb +46 -0
  16. data/lib/tmail/compat.rb +41 -0
  17. data/lib/tmail/config.rb +67 -0
  18. data/lib/tmail/core_extensions.rb +63 -0
  19. data/lib/tmail/encode.rb +590 -0
  20. data/lib/tmail/header.rb +962 -0
  21. data/lib/tmail/index.rb +9 -0
  22. data/lib/tmail/interface.rb +1162 -0
  23. data/lib/tmail/loader.rb +3 -0
  24. data/lib/tmail/mail.rb +578 -0
  25. data/lib/tmail/mailbox.rb +496 -0
  26. data/lib/tmail/main.rb +6 -0
  27. data/lib/tmail/mbox.rb +3 -0
  28. data/lib/tmail/net.rb +250 -0
  29. data/lib/tmail/obsolete.rb +132 -0
  30. data/lib/tmail/parser.rb +1060 -0
  31. data/lib/tmail/parser.y +416 -0
  32. data/lib/tmail/port.rb +379 -0
  33. data/lib/tmail/quoting.rb +164 -0
  34. data/lib/tmail/require_arch.rb +58 -0
  35. data/lib/tmail/scanner.rb +49 -0
  36. data/lib/tmail/scanner_r.rb +261 -0
  37. data/lib/tmail/stringio.rb +280 -0
  38. data/lib/tmail/utils.rb +361 -0
  39. data/lib/tmail/vendor/rchardet-1.3/COPYING +504 -0
  40. data/lib/tmail/vendor/rchardet-1.3/README +12 -0
  41. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5freq.rb +927 -0
  42. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5prober.rb +42 -0
  43. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +238 -0
  44. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +112 -0
  45. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetprober.rb +75 -0
  46. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +64 -0
  47. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/constants.rb +42 -0
  48. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +89 -0
  49. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escsm.rb +244 -0
  50. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +88 -0
  51. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrfreq.rb +596 -0
  52. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrprober.rb +42 -0
  53. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwfreq.rb +430 -0
  54. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwprober.rb +42 -0
  55. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312freq.rb +474 -0
  56. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312prober.rb +42 -0
  57. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +289 -0
  58. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jisfreq.rb +570 -0
  59. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +229 -0
  60. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langbulgarianmodel.rb +229 -0
  61. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langcyrillicmodel.rb +330 -0
  62. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langgreekmodel.rb +227 -0
  63. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhebrewmodel.rb +202 -0
  64. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhungarianmodel.rb +226 -0
  65. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langthaimodel.rb +201 -0
  66. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +147 -0
  67. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +89 -0
  68. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +45 -0
  69. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +542 -0
  70. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +124 -0
  71. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +56 -0
  72. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +88 -0
  73. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +167 -0
  74. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +87 -0
  75. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet.rb +67 -0
  76. data/lib/tmail/version.rb +40 -0
  77. data/lib/tmail.rb +6 -0
  78. data/setup.rb +1482 -0
  79. data/test/extctrl.rb +6 -0
  80. data/test/fixtures/apple_unquoted_content_type +44 -0
  81. data/test/fixtures/inline_attachment.txt +2095 -0
  82. data/test/fixtures/iso_8859_1_email_without_encoding_and_message_id.txt +16 -0
  83. data/test/fixtures/mailbox +414 -0
  84. data/test/fixtures/mailbox.zip +0 -0
  85. data/test/fixtures/mailbox_without_any_from_or_sender +10 -0
  86. data/test/fixtures/mailbox_without_from +11 -0
  87. data/test/fixtures/mailbox_without_return_path +12 -0
  88. data/test/fixtures/marked_as_iso_8859_1_but_it_is_utf_8.txt +33 -0
  89. data/test/fixtures/marked_as_utf_8_but_it_is_iso_8859_1.txt +56 -0
  90. data/test/fixtures/raw_attack_email_with_zero_length_whitespace +29 -0
  91. data/test/fixtures/raw_base64_decoded_string +0 -0
  92. data/test/fixtures/raw_base64_email +83 -0
  93. data/test/fixtures/raw_base64_encoded_string +1 -0
  94. data/test/fixtures/raw_email +14 -0
  95. data/test/fixtures/raw_email10 +20 -0
  96. data/test/fixtures/raw_email11 +34 -0
  97. data/test/fixtures/raw_email12 +32 -0
  98. data/test/fixtures/raw_email13 +29 -0
  99. data/test/fixtures/raw_email2 +114 -0
  100. data/test/fixtures/raw_email3 +70 -0
  101. data/test/fixtures/raw_email4 +59 -0
  102. data/test/fixtures/raw_email5 +19 -0
  103. data/test/fixtures/raw_email6 +20 -0
  104. data/test/fixtures/raw_email7 +66 -0
  105. data/test/fixtures/raw_email8 +47 -0
  106. data/test/fixtures/raw_email9 +28 -0
  107. data/test/fixtures/raw_email_bad_time +62 -0
  108. data/test/fixtures/raw_email_double_at_in_header +14 -0
  109. data/test/fixtures/raw_email_multiple_from +30 -0
  110. data/test/fixtures/raw_email_only_attachment +17 -0
  111. data/test/fixtures/raw_email_quoted_with_0d0a +14 -0
  112. data/test/fixtures/raw_email_reply +32 -0
  113. data/test/fixtures/raw_email_simple +11 -0
  114. data/test/fixtures/raw_email_string_in_date_field +17 -0
  115. data/test/fixtures/raw_email_trailing_dot +21 -0
  116. data/test/fixtures/raw_email_with_bad_date +48 -0
  117. data/test/fixtures/raw_email_with_illegal_boundary +58 -0
  118. data/test/fixtures/raw_email_with_mimepart_without_content_type +94 -0
  119. data/test/fixtures/raw_email_with_multipart_mixed_quoted_boundary +50 -0
  120. data/test/fixtures/raw_email_with_nested_attachment +100 -0
  121. data/test/fixtures/raw_email_with_partially_quoted_subject +14 -0
  122. data/test/fixtures/raw_email_with_quoted_attachment_filename +60 -0
  123. data/test/fixtures/raw_email_with_quoted_illegal_boundary +58 -0
  124. data/test/fixtures/raw_email_with_wrong_splitted_multibyte_encoded_word_subject +15 -0
  125. data/test/fixtures/the_only_part_is_a_word_document.txt +425 -0
  126. data/test/fixtures/unquoted_filename_in_attachment +177 -0
  127. data/test/kcode.rb +14 -0
  128. data/test/temp_test_one.rb +46 -0
  129. data/test/test_address.rb +1216 -0
  130. data/test/test_attachments.rb +133 -0
  131. data/test/test_base64.rb +64 -0
  132. data/test/test_encode.rb +139 -0
  133. data/test/test_header.rb +1021 -0
  134. data/test/test_helper.rb +9 -0
  135. data/test/test_mail.rb +756 -0
  136. data/test/test_mbox.rb +184 -0
  137. data/test/test_port.rb +440 -0
  138. data/test/test_quote.rb +107 -0
  139. data/test/test_scanner.rb +209 -0
  140. data/test/test_utils.rb +36 -0
  141. data/tmail_es.gemspec +35 -0
  142. metadata +257 -0
@@ -0,0 +1,542 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Coding state machine models for the multi-byte charset probers.
  #
  # Every model below is a Hash with the same five string keys:
  #   'classTable'   - 256 integers, one per byte value 0x00..0xff (the row
  #                    comments give the byte range covered by each row);
  #                    each entry is the class number assigned to that byte.
  #   'classFactor'  - integer stored alongside the tables; presumably the
  #                    number of byte classes / the row width used to index
  #                    'stateTable' (the consumer is not in this file - verify).
  #   'stateTable'   - flat list of EError / EStart / EItsMe markers and bare
  #                    integers (presumably numbers of intermediate states).
  #   'charLenTable' - one integer per byte class.
  #   'name'         - charset name string.
  #
  # EError, EStart and EItsMe are constants defined outside this file.

  # ---------------------------------------------------------------- BIG5

  BIG5_cls = [
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f  (0x00 is allowed as a legal value)
    1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 40 - 4f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 50 - 5f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 60 - 6f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,1, # 70 - 7f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 80 - 8f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 90 - 9f
    4,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # a0 - af
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # b0 - bf
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # c0 - cf
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # d0 - df
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # e0 - ef
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,0  # f0 - ff
  ]

  BIG5_st = [
    EError, EStart, EStart, 3,      EError, EError, EError, EError, # 00-07
    EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EError, # 08-0f
    EError, EStart, EStart, EStart, EStart, EStart, EStart, EStart  # 10-17
  ]

  Big5CharLenTable = [0, 1, 1, 2, 0]

  Big5SMModel = {
    'classTable'   => BIG5_cls,
    'classFactor'  => 5,
    'stateTable'   => BIG5_st,
    'charLenTable' => Big5CharLenTable,
    'name'         => 'Big5'
  }

  # -------------------------------------------------------------- EUC-JP

  EUCJP_cls = [
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,5,5, # 00 - 0f
    4,4,4,4,4,4,4,4, 4,4,4,5,4,4,4,4, # 10 - 1f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 20 - 2f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 30 - 3f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 50 - 5f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 60 - 6f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 70 - 7f
    5,5,5,5,5,5,5,5, 5,5,5,5,5,5,1,3, # 80 - 8f
    5,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 90 - 9f
    5,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # a0 - af
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # b0 - bf
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # c0 - cf
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # e0 - ef
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,5  # f0 - ff
  ]

  EUCJP_st = [
    3,      4,      3,      5,      EStart, EError, EError, EError, # 00-07
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, EStart, EError, EStart, EError, EError, EError, # 10-17
    EError, EError, EStart, EError, EError, EError, 3,      EError, # 18-1f
    3,      EError, EError, EError, EStart, EStart, EStart, EStart  # 20-27
  ]

  EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]

  EUCJPSMModel = {
    'classTable'   => EUCJP_cls,
    'classFactor'  => 6,
    'stateTable'   => EUCJP_st,
    'charLenTable' => EUCJPCharLenTable,
    'name'         => 'EUC-JP'
  }

  # -------------------------------------------------------------- EUC-KR

  EUCKR_cls = [
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
    1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 40 - 4f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 50 - 5f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 60 - 6f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 70 - 7f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 80 - 8f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 90 - 9f
    0,2,2,2,2,2,2,2, 2,2,2,2,2,3,3,3, # a0 - af
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # b0 - bf
    2,2,2,2,2,2,2,2, 2,3,2,2,2,2,2,2, # c0 - cf
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0  # f0 - ff
  ]

  EUCKR_st = [
    EError, EStart, 3,      EError, EError, EError, EError, EError, # 00-07
    EItsMe, EItsMe, EItsMe, EItsMe, EError, EError, EStart, EStart  # 08-0f
  ]

  EUCKRCharLenTable = [0, 1, 2, 0]

  EUCKRSMModel = {
    'classTable'   => EUCKR_cls,
    'classFactor'  => 4,
    'stateTable'   => EUCKR_st,
    'charLenTable' => EUCKRCharLenTable,
    'name'         => 'EUC-KR'
  }

  # -------------------------------------------------------------- EUC-TW

  EUCTW_cls = [
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,0,0, # 00 - 0f
    2,2,2,2,2,2,2,2, 2,2,2,0,2,2,2,2, # 10 - 1f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 20 - 2f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 30 - 3f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 40 - 4f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 50 - 5f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 60 - 6f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 70 - 7f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,6,0, # 80 - 8f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 90 - 9f
    0,3,4,4,4,4,4,4, 5,5,1,1,1,1,1,1, # a0 - af
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # b0 - bf
    1,1,3,1,3,3,3,3, 3,3,3,3,3,3,3,3, # c0 - cf
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # d0 - df
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # e0 - ef
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,0  # f0 - ff
  ]

  EUCTW_st = [
    EError, EError, EStart, 3,      3,      3,      4,      EError, # 00-07
    EError, EError, EError, EError, EError, EError, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EError, EStart, EError, # 10-17
    EStart, EStart, EStart, EError, EError, EError, EError, EError, # 18-1f
    5,      EError, EError, EError, EStart, EError, EStart, EStart, # 20-27
    EStart, EError, EStart, EStart, EStart, EStart, EStart, EStart  # 28-2f
  ]

  EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]

  EUCTWSMModel = {
    'classTable'   => EUCTW_cls,
    'classFactor'  => 7,
    'stateTable'   => EUCTW_st,
    'charLenTable' => EUCTWCharLenTable,
    'name'         => 'x-euc-tw'
  }

  # -------------------------------------------------------------- GB2312

  GB2312_cls = [
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
    1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
    3,3,3,3,3,3,3,3, 3,3,1,1,1,1,1,1, # 30 - 3f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 40 - 4f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 50 - 5f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 60 - 6f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,4, # 70 - 7f
    5,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # a0 - af
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # b0 - bf
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # c0 - cf
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # d0 - df
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # e0 - ef
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,0  # f0 - ff
  ]

  GB2312_st = [
    EError, EStart, EStart, EStart, EStart, EStart, 3,      EError, # 00-07
    EError, EError, EError, EError, EError, EError, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EError, EError, EStart, # 10-17
    4,      EError, EStart, EStart, EError, EError, EError, EError, # 18-1f
    EError, EError, 5,      EError, EError, EError, EItsMe, EError, # 20-27
    EError, EError, EStart, EStart, EStart, EStart, EStart, EStart  # 28-2f
  ]

  # Strictly speaking, a class 6 character can be either 2 or 4 bytes long.
  # The two cases need not be told apart here: the length is used for
  # frequency analysis only, and each code range is validated there as
  # well, so recording a length of 2 is safe.
  GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]

  GB2312SMModel = {
    'classTable'   => GB2312_cls,
    'classFactor'  => 7,
    'stateTable'   => GB2312_st,
    'charLenTable' => GB2312CharLenTable,
    'name'         => 'GB2312'
  }

  # ----------------------------------------------------------- Shift_JIS

  SJIS_cls = [
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
    1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 40 - 4f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 50 - 5f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # 60 - 6f
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,1, # 70 - 7f
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # 80 - 8f
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, # 90 - 9f
    # 0xa0 is illegal in the Shift_JIS encoding, but some pages do contain
    # that byte, so the table is deliberately forgiving about it.
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # a0 - af
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # b0 - bf
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # c0 - cf
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
    3,3,3,3,3,3,3,3, 3,3,3,3,3,4,4,4, # e0 - ef
    4,4,4,4,4,4,4,4, 4,4,4,4,4,0,0,0  # f0 - ff
  ]

  SJIS_st = [
    EError, EStart, EStart, 3,      EError, EError, EError, EError, # 00-07
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, EError, EError, EStart, EStart, EStart, EStart  # 10-17
  ]

  SJISCharLenTable = [0, 1, 1, 2, 0, 0]

  SJISSMModel = {
    'classTable'   => SJIS_cls,
    'classFactor'  => 6,
    'stateTable'   => SJIS_st,
    'charLenTable' => SJISCharLenTable,
    'name'         => 'Shift_JIS'
  }

  # ------------------------------------------------------------- UCS2-BE

  UCS2BE_cls = [
    0,0,0,0,0,0,0,0, 0,0,1,0,0,2,0,0, # 00 - 0f
    0,0,0,0,0,0,0,0, 0,0,0,3,0,0,0,0, # 10 - 1f
    0,0,0,0,0,0,0,0, 0,3,3,3,3,3,0,0, # 20 - 2f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 30 - 3f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 40 - 4f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 50 - 5f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 60 - 6f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 70 - 7f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 80 - 8f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 90 - 9f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # a0 - af
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # b0 - bf
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # c0 - cf
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # d0 - df
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # e0 - ef
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,4,5  # f0 - ff
  ]

  UCS2BE_st = [
    5,      7,      7,      EError, 4,      3,      EError, EError, # 00-07
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, 6,      6,      6,      6,      EError, EError, # 10-17
    6,      6,      6,      6,      6,      EItsMe, 6,      6,      # 18-1f
    6,      6,      6,      6,      5,      7,      7,      EError, # 20-27
    5,      8,      6,      6,      EError, 6,      6,      6,      # 28-2f
    6,      6,      6,      6,      EError, EError, EStart, EStart  # 30-37
  ]

  UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]

  UCS2BESMModel = {
    'classTable'   => UCS2BE_cls,
    'classFactor'  => 6,
    'stateTable'   => UCS2BE_st,
    'charLenTable' => UCS2BECharLenTable,
    'name'         => 'UTF-16BE'
  }

  # ------------------------------------------------------------- UCS2-LE

  UCS2LE_cls = [
    0,0,0,0,0,0,0,0, 0,0,1,0,0,2,0,0, # 00 - 0f
    0,0,0,0,0,0,0,0, 0,0,0,3,0,0,0,0, # 10 - 1f
    0,0,0,0,0,0,0,0, 0,3,3,3,3,3,0,0, # 20 - 2f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 30 - 3f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 40 - 4f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 50 - 5f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 60 - 6f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 70 - 7f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 80 - 8f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # 90 - 9f
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # a0 - af
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # b0 - bf
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # c0 - cf
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # d0 - df
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, # e0 - ef
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,4,5  # f0 - ff
  ]

  UCS2LE_st = [
    6,      6,      7,      6,      4,      3,      EError, EError, # 00-07
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, 5,      5,      5,      EError, EItsMe, EError, # 10-17
    5,      5,      5,      EError, 5,      EError, 6,      6,      # 18-1f
    7,      6,      8,      8,      5,      5,      5,      EError, # 20-27
    5,      5,      5,      EError, EError, EError, 5,      5,      # 28-2f
    5,      5,      5,      EError, 5,      EError, EStart, EStart  # 30-37
  ]

  UCS2LECharLenTable = [2, 2, 2, 2, 2, 2]

  UCS2LESMModel = {
    'classTable'   => UCS2LE_cls,
    'classFactor'  => 6,
    'stateTable'   => UCS2LE_st,
    'charLenTable' => UCS2LECharLenTable,
    'name'         => 'UTF-16LE'
  }

  # --------------------------------------------------------------- UTF-8

  UTF8_cls = [
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f  (0x00 is allowed as a legal value)
    1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 40 - 4f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 50 - 5f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 60 - 6f
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 70 - 7f
    2,2,2,2,3,3,3,3, 4,4,4,4,4,4,4,4, # 80 - 8f
    4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 90 - 9f
    5,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # a0 - af
    5,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # b0 - bf
    0,0,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # c0 - cf
    6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # d0 - df
    7,8,8,8,8,8,8,8, 8,8,8,8,8,9,8,8, # e0 - ef
    10,11,11,11,11,11,11,11, 12,13,13,13,14,15,0,0 # f0 - ff
  ]

  UTF8_st = [
    EError, EStart, EError, EError, EError, EError, 12,     10,     # 00-07
    9,      11,     8,      7,      6,      5,      4,      3,      # 08-0f
    EError, EError, EError, EError, EError, EError, EError, EError, # 10-17
    EError, EError, EError, EError, EError, EError, EError, EError, # 18-1f
    EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, # 20-27
    EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, # 28-2f
    EError, EError, 5,      5,      5,      5,      EError, EError, # 30-37
    EError, EError, EError, EError, EError, EError, EError, EError, # 38-3f
    EError, EError, EError, 5,      5,      5,      EError, EError, # 40-47
    EError, EError, EError, EError, EError, EError, EError, EError, # 48-4f
    EError, EError, 7,      7,      7,      7,      EError, EError, # 50-57
    EError, EError, EError, EError, EError, EError, EError, EError, # 58-5f
    EError, EError, EError, EError, 7,      7,      EError, EError, # 60-67
    EError, EError, EError, EError, EError, EError, EError, EError, # 68-6f
    EError, EError, 9,      9,      9,      9,      EError, EError, # 70-77
    EError, EError, EError, EError, EError, EError, EError, EError, # 78-7f
    EError, EError, EError, EError, EError, 9,      EError, EError, # 80-87
    EError, EError, EError, EError, EError, EError, EError, EError, # 88-8f
    EError, EError, 12,     12,     12,     12,     EError, EError, # 90-97
    EError, EError, EError, EError, EError, EError, EError, EError, # 98-9f
    EError, EError, EError, EError, EError, 12,     EError, EError, # a0-a7
    EError, EError, EError, EError, EError, EError, EError, EError, # a8-af
    EError, EError, 12,     12,     12,     EError, EError, EError, # b0-b7
    EError, EError, EError, EError, EError, EError, EError, EError, # b8-bf
    EError, EError, EStart, EStart, EStart, EStart, EError, EError, # c0-c7
    EError, EError, EError, EError, EError, EError, EError, EError  # c8-cf
  ]

  UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6]

  UTF8SMModel = {
    'classTable'   => UTF8_cls,
    'classFactor'  => 16,
    'stateTable'   => UTF8_st,
    'charLenTable' => UTF8CharLenTable,
    'name'         => 'UTF-8'
  }
end
@@ -0,0 +1,124 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Character orders below SAMPLE_SIZE take part in the pair (sequence)
  # statistics; it is also the row width used to index 'precedenceMatrix'.
  SAMPLE_SIZE = 64
  # feed() only considers a shortcut verdict once more than this many
  # sequences have been counted.
  SB_ENOUGH_REL_THRESHOLD = 1024
  # Confidence above / below which feed() settles on EFoundIt / ENotMe.
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  # Character orders below this value are counted in the total-character tally.
  SYMBOL_CAT_ORDER = 250
  # Number of sequence categories, i.e. the size of the counter array.
  NUMBER_OF_SEQ_CAT = 4
  # Index of the counter that get_confidence() reads.
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
  #NEGATIVE_CAT = 0

  # Prober for a single-byte charset, driven by a language model Hash.
  #
  # The model keys read by this class are 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
  # get_state, filter_without_english_letters and @_mState come from the
  # CharSetProber superclass, which is defined outside this file.
  class SingleByteCharSetProber < CharSetProber
    # model      - language model Hash (see the class comment for its keys).
    # reversed   - true if every character pair must be reversed before the
    #              precedence-matrix lookup.
    # nameProber - optional auxiliary prober; when given, it decides the
    #              charset name reported by get_charset_name.
    def initialize(model, reversed=false, nameProber=nil)
      # Explicit empty parens: a bare `super` would forward all three arguments.
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset()
    end

    # Clears all statistics so the prober can be fed a fresh stream.
    def reset
      super()
      @_mLastOrder = 255 # order of the previous character; 255 is >= SAMPLE_SIZE, so no pair is counted first
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Returns the charset name, delegating to the name prober when one was
    # supplied and falling back to the model's 'charsetName' otherwise.
    def get_charset_name
      return @_mNameProber.get_charset_name() if @_mNameProber

      @_mModel['charsetName']
    end

    # Feeds a chunk of bytes into the statistics and returns the prober state.
    #
    # aBuf - String of raw bytes. Unless the model sets 'keepEnglishLetter',
    #        it is first passed through filter_without_english_letters.
    #
    # Once more than SB_ENOUGH_REL_THRESHOLD sequences have been seen while
    # still detecting, the state is switched to EFoundIt or ENotMe when the
    # confidence crosses the positive / negative shortcut threshold.
    def feed(aBuf)
      aBuf = filter_without_english_letters(aBuf) unless @_mModel['keepEnglishLetter']

      # 0 is truthy in Ruby, so the ported `if not aLen` test never fired;
      # check for an empty buffer explicitly.
      return get_state() if aBuf.empty?

      order_map = @_mModel['charToOrderMap']
      precedence = @_mModel['precedenceMatrix']

      aBuf.each_byte do |byte|
        # Look the order up by the byte's integer value. The previous
        # `byte.chr[0]` yields a String on Ruby >= 1.9, which is not a
        # valid integer index; on 1.8 it equals `byte`, so this is the
        # same lookup there.
        order = order_map[byte]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            pair_index = if @_mReversed
                           # reverse the order of the letters in the lookup
                           (order * SAMPLE_SIZE) + @_mLastOrder
                         else
                           (@_mLastOrder * SAMPLE_SIZE) + order
                         end
            @_mSeqCounters[precedence[pair_index]] += 1
          end
        end
        @_mLastOrder = order
      end

      if get_state() == EDetecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
          $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
          @_mState = EFoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
          $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
          @_mState = ENotMe
        end
      end

      get_state()
    end

    # Returns the current confidence as a Float.
    #
    # With no sequences counted yet the result is 0.01. Otherwise it is the
    # share of POSITIVE_CAT sequences, divided by the model's
    # 'mTypicalPositiveRatio' and scaled by the fraction of counted characters
    # that fell inside the sampling range; results of 1.0 or more are
    # reported as 0.99.
    def get_confidence
      return 0.01 unless @_mTotalSeqs > 0

      r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
      # @_mTotalChar cannot be zero here: every counted sequence implies a
      # character with order < SAMPLE_SIZE, which is also < SYMBOL_CAT_ORDER.
      r = r * @_mFreqChar / @_mTotalChar
      r = 0.99 if r >= 1.0
      r
    end
  end
end