tmail 1.2.3.1 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. data/README +10 -0
  2. data/ext/tmailscanner/tmail/tmailscanner.c +39 -8
  3. data/lib/tmail.rb +1 -0
  4. data/lib/tmail/address.rb +6 -40
  5. data/lib/tmail/attachments.rb +41 -22
  6. data/lib/tmail/encode.rb +9 -0
  7. data/lib/tmail/header.rb +5 -3
  8. data/lib/tmail/interface.rb +40 -3
  9. data/lib/tmail/mail.rb +3 -3
  10. data/lib/tmail/mailbox.rb +3 -2
  11. data/lib/tmail/net.rb +3 -1
  12. data/lib/tmail/parser.rb +204 -620
  13. data/lib/tmail/parser.y +38 -3
  14. data/lib/tmail/quoting.rb +38 -1
  15. data/lib/tmail/utils.rb +28 -4
  16. data/lib/tmail/vendor/rchardet-1.3/COPYING +504 -0
  17. data/lib/tmail/vendor/rchardet-1.3/README +12 -0
  18. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet.rb +67 -0
  19. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5freq.rb +927 -0
  20. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5prober.rb +42 -0
  21. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +237 -0
  22. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +112 -0
  23. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetprober.rb +75 -0
  24. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +64 -0
  25. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/constants.rb +42 -0
  26. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +90 -0
  27. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escsm.rb +244 -0
  28. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +88 -0
  29. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrfreq.rb +596 -0
  30. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrprober.rb +42 -0
  31. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwfreq.rb +430 -0
  32. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwprober.rb +42 -0
  33. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312freq.rb +474 -0
  34. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312prober.rb +42 -0
  35. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +289 -0
  36. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jisfreq.rb +570 -0
  37. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +229 -0
  38. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langbulgarianmodel.rb +229 -0
  39. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langcyrillicmodel.rb +330 -0
  40. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langgreekmodel.rb +227 -0
  41. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhebrewmodel.rb +202 -0
  42. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhungarianmodel.rb +226 -0
  43. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langthaimodel.rb +201 -0
  44. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +147 -0
  45. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +89 -0
  46. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +47 -0
  47. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +542 -0
  48. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +124 -0
  49. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +58 -0
  50. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +88 -0
  51. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +166 -0
  52. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +87 -0
  53. data/lib/tmail/version.rb +1 -1
  54. data/setup.rb +2 -2
  55. data/test/fixtures/apple_unquoted_content_type +44 -0
  56. data/test/fixtures/inline_attachment.txt +2095 -0
  57. data/test/fixtures/iso_8859_1_email_without_encoding_and_message_id.txt +16 -0
  58. data/test/fixtures/mailbox.zip +0 -0
  59. data/test/fixtures/marked_as_iso_8859_1_but_it_is_utf_8.txt +33 -0
  60. data/test/fixtures/marked_as_utf_8_but_it_is_iso_8859_1.txt +56 -0
  61. data/test/fixtures/raw_email_bad_time +62 -0
  62. data/test/fixtures/raw_email_double_at_in_header +14 -0
  63. data/test/fixtures/raw_email_only_attachment +17 -0
  64. data/test/fixtures/raw_email_string_in_date_field +17 -0
  65. data/test/fixtures/raw_email_trailing_dot +21 -0
  66. data/test/fixtures/raw_email_with_quoted_attachment_filename +60 -0
  67. data/test/fixtures/raw_email_with_wrong_splitted_multibyte_encoded_word_subject +15 -0
  68. data/test/fixtures/the_only_part_is_a_word_document.txt +425 -0
  69. data/test/fixtures/unquoted_filename_in_attachment +177 -0
  70. data/test/test_address.rb +114 -92
  71. data/test/test_attachments.rb +84 -1
  72. data/test/test_encode.rb +54 -0
  73. data/test/test_header.rb +60 -2
  74. data/test/test_mail.rb +22 -15
  75. data/test/test_mbox.rb +12 -3
  76. data/test/test_port.rb +13 -9
  77. data/test/test_quote.rb +9 -0
  78. data/tmail.gemspec +34 -0
  79. metadata +68 -167
  80. data/MANIFEST +0 -191
  81. data/log/BugTrackingLog.txt +0 -1231
  82. data/log/Changelog.txt +0 -534
  83. data/log/Fixme.txt +0 -6
  84. data/log/Testlog.txt +0 -2340
  85. data/log/Todo.txt +0 -30
  86. data/log/fixme.rdoc +0 -6
  87. data/meta/MANIFEST +0 -128
  88. data/meta/VERSION +0 -1
  89. data/meta/project.yaml +0 -30
  90. data/meta/unixname +0 -1
  91. data/sample/bench_base64.rb +0 -48
  92. data/sample/data/multipart +0 -23
  93. data/sample/data/normal +0 -29
  94. data/sample/data/sendtest +0 -5
  95. data/sample/data/simple +0 -14
  96. data/sample/data/test +0 -27
  97. data/sample/extract-attachements.rb +0 -33
  98. data/sample/from-check.rb +0 -26
  99. data/sample/multipart.rb +0 -26
  100. data/sample/parse-bench.rb +0 -68
  101. data/sample/parse-test.rb +0 -19
  102. data/sample/sendmail.rb +0 -94
  103. data/site/contributing/index.html +0 -183
  104. data/site/css/clean.css +0 -27
  105. data/site/css/layout.css +0 -31
  106. data/site/css/style.css +0 -60
  107. data/site/download/index.html +0 -61
  108. data/site/img/envelope.jpg +0 -0
  109. data/site/img/mailman.gif +0 -0
  110. data/site/img/stamp-sm.jpg +0 -0
  111. data/site/img/stamp.jpg +0 -0
  112. data/site/img/stampborder.jpg +0 -0
  113. data/site/img/tfire.jpg +0 -0
  114. data/site/img/tmail.png +0 -0
  115. data/site/index.html +0 -270
  116. data/site/js/jquery.js +0 -31
  117. data/site/log/Changelog.xsl +0 -33
  118. data/site/log/changelog.xml +0 -1677
  119. data/site/outdated/BUGS +0 -3
  120. data/site/outdated/DEPENDS +0 -1
  121. data/site/outdated/Incompatibilities +0 -89
  122. data/site/outdated/Incompatibilities.ja +0 -102
  123. data/site/outdated/NEWS +0 -9
  124. data/site/outdated/README.ja +0 -73
  125. data/site/outdated/doc.ja/address.html +0 -275
  126. data/site/outdated/doc.ja/basics.html +0 -405
  127. data/site/outdated/doc.ja/config.html +0 -49
  128. data/site/outdated/doc.ja/details.html +0 -146
  129. data/site/outdated/doc.ja/index.html +0 -39
  130. data/site/outdated/doc.ja/mail.html +0 -793
  131. data/site/outdated/doc.ja/mailbox.html +0 -265
  132. data/site/outdated/doc.ja/port.html +0 -95
  133. data/site/outdated/doc.ja/tmail.html +0 -58
  134. data/site/outdated/doc.ja/usage.html +0 -202
  135. data/site/outdated/rdd/address.rrd.m +0 -229
  136. data/site/outdated/rdd/basics.rd.m +0 -275
  137. data/site/outdated/rdd/config.rrd.m +0 -26
  138. data/site/outdated/rdd/details.rd.m +0 -117
  139. data/site/outdated/rdd/index.rhtml.m +0 -54
  140. data/site/outdated/rdd/mail.rrd.m +0 -701
  141. data/site/outdated/rdd/mailbox.rrd.m +0 -228
  142. data/site/outdated/rdd/port.rrd.m +0 -69
  143. data/site/outdated/rdd/tmail.rrd.m +0 -33
  144. data/site/outdated/rdd/usage.rd.m +0 -247
  145. data/site/quickstart/index.html +0 -69
  146. data/site/quickstart/quickstart.html +0 -52
  147. data/site/quickstart/usage.html +0 -193
  148. data/site/reference/address.html +0 -247
  149. data/site/reference/config.html +0 -30
  150. data/site/reference/index.html +0 -101
  151. data/site/reference/mail.html +0 -726
  152. data/site/reference/mailbox.html +0 -245
  153. data/site/reference/port.html +0 -75
  154. data/site/reference/tmail.html +0 -35
  155. data/work/script/make +0 -26
  156. data/work/script/rdoc +0 -39
  157. data/work/script/setup +0 -1616
  158. data/work/script/test +0 -30
@@ -0,0 +1,64 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Table-driven state machine that consumes an encoding one byte at a time.
  #
  # The model passed to the constructor is a Hash with these keys:
  #   'classTable'   - maps a byte value to its byte class
  #   'classFactor'  - row width used when indexing 'stateTable'
  #   'stateTable'   - flat table indexed by state * classFactor + byte class
  #   'charLenTable' - maps a byte class to a character length
  #   'name'         - value returned by #get_coding_state_machine
  class CodingStateMachine
    # On/off flag for this machine. EscCharSetProber assigns and reads it on
    # every machine it owns, so the accessor must be defined here; it is nil
    # until a caller sets it.
    attr_accessor :active

    # sm:: the model Hash described above.
    def initialize(sm)
      @_mModel = sm
      @_mCurrentBytePos = 0
      @_mCurrentCharLen = 0
      reset()
    end

    # Puts the machine back into the EStart state.
    def reset
      @_mCurrentState = EStart
    end

    # Feeds one byte to the machine and returns the resulting state.
    #
    # c:: a String whose first byte is consumed, or an Integer byte value.
    def next_state(c)
      # for each byte we get its class
      # if it is first byte, we also get byte length
      byteCls = @_mModel['classTable'][byte_value(c)]
      if @_mCurrentState == EStart
        @_mCurrentBytePos = 0
        @_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
      end
      # from byte's class and stateTable, we get its next state
      @_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
      @_mCurrentBytePos += 1
      return @_mCurrentState
    end

    # Returns the character length recorded the last time a byte was fed
    # while the machine was in EStart (0 before any byte has been fed).
    def get_current_charlen
      return @_mCurrentCharLen
    end

    # Returns the model's 'name' entry.
    def get_coding_state_machine
      return @_mModel['name']
    end

    private

    # Returns the numeric value of the first byte of +c+.
    #
    # String#[] with an Integer index yields an Integer only on Ruby 1.8;
    # later versions yield a one-character String, which cannot index the
    # class table. unpack('C') gives the byte value on every version.
    def byte_value(c)
      c.is_a?(Integer) ? c : c.unpack('C').first
    end
  end
end
@@ -0,0 +1,42 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Global diagnostics switch; probers check it before writing to $stderr.
  $debug = false

  # Values stored in a prober's @_mState.
  EDetecting, EFoundIt, ENotMe = 0, 1, 2

  # Values a coding state machine can be in; any other integer found in a
  # state table is a model-specific intermediate state.
  EStart, EError, EItsMe = 0, 1, 2

  # Confidence a prober must exceed before it reports EFoundIt early.
  SHORTCUT_THRESHOLD = 0.95
end
@@ -0,0 +1,90 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober for escape-sequence based encodings.
  #
  # It drives one CodingStateMachine per model (HZSMModel, ISO2022CNSMModel,
  # ISO2022JPSMModel, ISO2022KRSMModel) over the same input. A machine that
  # reports EError is switched off; the first machine that reports EItsMe
  # decides the detected charset.
  #
  # Requires CodingStateMachine to respond to #active and #active=.
  class EscCharSetProber < CharSetProber
    def initialize
      super()
      @_mCodingSM = [
        CodingStateMachine.new(HZSMModel),
        CodingStateMachine.new(ISO2022CNSMModel),
        CodingStateMachine.new(ISO2022JPSMModel),
        CodingStateMachine.new(ISO2022KRSMModel)
      ]
      reset()
    end

    # Re-enables and resets every state machine and forgets any previously
    # detected charset.
    def reset
      super()
      # The original "for ... in ...:" form (trailing colon) only parses on
      # Ruby 1.8; #each is valid on every version.
      @_mCodingSM.each do |codingSM|
        next if not codingSM
        codingSM.active = true
        codingSM.reset()
      end
      @_mActiveSM = @_mCodingSM.length
      @_mDetectedCharset = nil
    end

    # Returns the name of the detected charset, or nil if none was found yet.
    def get_charset_name
      return @_mDetectedCharset
    end

    # Returns 0.99 once a charset has been detected, 0.00 otherwise.
    def get_confidence
      if @_mDetectedCharset
        return 0.99
      else
        return 0.00
      end
    end

    # Feeds every byte of +aBuf+ to each still-active state machine.
    #
    # Returns the prober state from get_state(): ENotMe as soon as the last
    # active machine fails, EFoundIt as soon as one machine reports EItsMe,
    # otherwise whatever the state was before the call.
    def feed(aBuf)
      aBuf.each_byte do |b|
        c = b.chr
        @_mCodingSM.each do |codingSM|
          next unless codingSM
          next unless codingSM.active
          codingState = codingSM.next_state(c)
          if codingState == EError
            codingSM.active = false
            @_mActiveSM -= 1
            if @_mActiveSM <= 0
              @_mState = ENotMe
              return get_state()
            end
          elsif codingState == EItsMe
            @_mState = EFoundIt
            @_mDetectedCharset = codingSM.get_coding_state_machine()
            return get_state()
          end
        end
      end

      return get_state()
    end
  end
end
@@ -0,0 +1,244 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Mark Pilgrim - port to Python
11
+ #
12
+ # This library is free software; you can redistribute it and/or
13
+ # modify it under the terms of the GNU Lesser General Public
14
+ # License as published by the Free Software Foundation; either
15
+ # version 2.1 of the License, or (at your option) any later version.
16
+ #
17
+ # This library is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
+ # Lesser General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public
23
+ # License along with this library; if not, write to the Free Software
24
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25
+ # 02110-1301 USA
26
+ ######################### END LICENSE BLOCK #########################
27
+
28
module CharDet
  # Builds a 256-entry byte-class table: bytes 0x00-0x7f start as class 0,
  # bytes 0x80-0xff start as +high+, then each byte listed in +overrides+
  # (byte value => class) is replaced.
  byte_classes = lambda do |high, overrides|
    table = Array.new(0x80, 0) + Array.new(0x80, high)
    overrides.each { |byte, cls| table[byte] = cls }
    table
  end

  # ---- HZ-GB-2312 ---------------------------------------------------------

  HZ_cls = byte_classes.call(1, {
    0x00 => 1,
    0x1b => 1, # ESC
    0x7b => 4, # {
    0x7d => 5, # }
    0x7e => 2  # ~
  })

  # Flat state table, eight entries per source row.
  HZ_st = [
    EStart, EError, 3,      EStart, EStart, EStart, EError, EError, # 00-07
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, EError, EError, EStart, EStart, 4,      EError, # 10-17
    5,      EError, 6,      EError, 5,      5,      4,      EError, # 18-1f
    4,      EError, 4,      4,      4,      EError, 4,      EError, # 20-27
    4,      EItsMe, EStart, EStart, EStart, EStart, EStart, EStart  # 28-2f
  ]

  HZCharLenTable = Array.new(6, 0)

  HZSMModel = {
    'classTable'   => HZ_cls,
    'classFactor'  => 6,
    'stateTable'   => HZ_st,
    'charLenTable' => HZCharLenTable,
    'name'         => "HZ-GB-2312"
  }

  # ---- ISO-2022-CN --------------------------------------------------------

  ISO2022CN_cls = byte_classes.call(2, {
    0x00 => 2,
    0x1b => 1, # ESC
    0x29 => 3, # )
    0x43 => 4  # C
  })

  # Flat state table, eight entries per source row.
  ISO2022CN_st = [
    EStart, 3,      EError, EStart, EStart, EStart, EStart, EStart, # 00-07
    EStart, EError, EError, EError, EError, EError, EError, EError, # 08-0f
    EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, # 10-17
    EItsMe, EItsMe, EItsMe, EError, EError, EError, 4,      EError, # 18-1f
    EError, EError, EError, EItsMe, EError, EError, EError, EError, # 20-27
    5,      6,      EError, EError, EError, EError, EError, EError, # 28-2f
    EError, EError, EError, EItsMe, EError, EError, EError, EError, # 30-37
    EError, EError, EError, EError, EError, EItsMe, EError, EStart  # 38-3f
  ]

  ISO2022CNCharLenTable = Array.new(9, 0)

  ISO2022CNSMModel = {
    'classTable'   => ISO2022CN_cls,
    'classFactor'  => 9,
    'stateTable'   => ISO2022CN_st,
    'charLenTable' => ISO2022CNCharLenTable,
    'name'         => "ISO-2022-CN"
  }

  # ---- ISO-2022-JP --------------------------------------------------------

  ISO2022JP_cls = byte_classes.call(2, {
    0x00 => 2,
    0x0e => 2, # SO
    0x0f => 2, # SI
    0x1b => 1, # ESC
    0x24 => 7, # $
    0x28 => 3, # (
    0x40 => 6, # @
    0x42 => 4, # B
    0x44 => 8, # D
    0x49 => 9, # I
    0x4a => 5  # J
  })

  # Flat state table, eight entries per source row.
  ISO2022JP_st = [
    EStart, 3,      EError, EStart, EStart, EStart, EStart, EStart, # 00-07
    EStart, EStart, EError, EError, EError, EError, EError, EError, # 08-0f
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 10-17
    EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EItsMe, EError, EError, # 18-1f
    EError, 5,      EError, EError, EError, 4,      EError, EError, # 20-27
    EError, EError, EError, 6,      EItsMe, EError, EItsMe, EError, # 28-2f
    EError, EError, EError, EError, EError, EError, EItsMe, EItsMe, # 30-37
    EError, EError, EError, EItsMe, EError, EError, EError, EError, # 38-3f
    EError, EError, EError, EError, EItsMe, EError, EStart, EStart  # 40-47
  ]

  ISO2022JPCharLenTable = Array.new(10, 0)

  ISO2022JPSMModel = {
    'classTable'   => ISO2022JP_cls,
    'classFactor'  => 10,
    'stateTable'   => ISO2022JP_st,
    'charLenTable' => ISO2022JPCharLenTable,
    'name'         => "ISO-2022-JP"
  }

  # ---- ISO-2022-KR --------------------------------------------------------

  ISO2022KR_cls = byte_classes.call(2, {
    0x00 => 2,
    0x1b => 1, # ESC
    0x24 => 3, # $
    0x29 => 4, # )
    0x43 => 5  # C
  })

  # Flat state table, eight entries per source row.
  ISO2022KR_st = [
    EStart, 3,      EError, EStart, EStart, EStart, EError, EError, # 00-07
    EError, EError, EError, EError, EItsMe, EItsMe, EItsMe, EItsMe, # 08-0f
    EItsMe, EItsMe, EError, EError, EError, 4,      EError, EError, # 10-17
    EError, EError, EError, EError, 5,      EError, EError, EError, # 18-1f
    EError, EError, EError, EItsMe, EStart, EStart, EStart, EStart  # 20-27
  ]

  ISO2022KRCharLenTable = Array.new(6, 0)

  ISO2022KRSMModel = {
    'classTable'   => ISO2022KR_cls,
    'classFactor'  => 6,
    'stateTable'   => ISO2022KR_st,
    'charLenTable' => ISO2022KRCharLenTable,
    'name'         => "ISO-2022-KR"
  }
end
@@ -0,0 +1,88 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober for EUC-JP. Combines a coding state machine built from
  # EUCJPSMModel with a distribution analyzer and a context analyzer.
  class EUCJPProber < MultiByteCharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(EUCJPSMModel)
      @_mDistributionAnalyzer = EUCJPDistributionAnalysis.new()
      @_mContextAnalyzer = EUCJPContextAnalysis.new()
      reset
    end

    # Resets the inherited prober state and the context analyzer.
    def reset
      super()
      @_mContextAnalyzer.reset()
    end

    # Always "EUC-JP".
    def get_charset_name
      return "EUC-JP"
    end

    # Feeds +aBuf+ to the state machine one byte at a time and returns the
    # prober state from get_state().
    #
    # Stops at the first EError (state becomes ENotMe) or EItsMe (state
    # becomes EFoundIt). Each time the machine returns to EStart, the byte
    # pair ending at the current position is handed to both analyzers.
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i..i])
        if codingState == EError
          $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
          @_mState = ENotMe
          break
        elsif codingState == EItsMe
          @_mState = EFoundIt
          break
        elsif codingState == EStart
          # (the original line ended in ":" here, which only parses on 1.8)
          charLen = @_mCodingSM.get_current_charlen()
          if i == 0
            # Pair the first byte of this buffer with the byte saved in
            # @_mLastChar[0] at the end of the previous feed call.
            @_mLastChar[1] = aBuf[0..0]
            @_mContextAnalyzer.feed(@_mLastChar, charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            @_mContextAnalyzer.feed(aBuf[i-1...i+1], charLen)
            @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
          end
        end
      end

      # Remember the final byte for the next feed call.
      @_mLastChar[0] = aBuf[aLen-1..aLen-1]

      if get_state() == EDetecting
        if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
          @_mState = EFoundIt
        end
      end

      return get_state()
    end

    # Returns the larger of the context and distribution confidences.
    def get_confidence
      l = [@_mContextAnalyzer.get_confidence,@_mDistributionAnalyzer.get_confidence]
      return l.max
    end
  end
end