tmail 1.2.3.1 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. data/README +10 -0
  2. data/ext/tmailscanner/tmail/tmailscanner.c +39 -8
  3. data/lib/tmail.rb +1 -0
  4. data/lib/tmail/address.rb +6 -40
  5. data/lib/tmail/attachments.rb +41 -22
  6. data/lib/tmail/encode.rb +9 -0
  7. data/lib/tmail/header.rb +5 -3
  8. data/lib/tmail/interface.rb +40 -3
  9. data/lib/tmail/mail.rb +3 -3
  10. data/lib/tmail/mailbox.rb +3 -2
  11. data/lib/tmail/net.rb +3 -1
  12. data/lib/tmail/parser.rb +204 -620
  13. data/lib/tmail/parser.y +38 -3
  14. data/lib/tmail/quoting.rb +38 -1
  15. data/lib/tmail/utils.rb +28 -4
  16. data/lib/tmail/vendor/rchardet-1.3/COPYING +504 -0
  17. data/lib/tmail/vendor/rchardet-1.3/README +12 -0
  18. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet.rb +67 -0
  19. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5freq.rb +927 -0
  20. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5prober.rb +42 -0
  21. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +237 -0
  22. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +112 -0
  23. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetprober.rb +75 -0
  24. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +64 -0
  25. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/constants.rb +42 -0
  26. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +90 -0
  27. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escsm.rb +244 -0
  28. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +88 -0
  29. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrfreq.rb +596 -0
  30. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrprober.rb +42 -0
  31. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwfreq.rb +430 -0
  32. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwprober.rb +42 -0
  33. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312freq.rb +474 -0
  34. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312prober.rb +42 -0
  35. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +289 -0
  36. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jisfreq.rb +570 -0
  37. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +229 -0
  38. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langbulgarianmodel.rb +229 -0
  39. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langcyrillicmodel.rb +330 -0
  40. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langgreekmodel.rb +227 -0
  41. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhebrewmodel.rb +202 -0
  42. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhungarianmodel.rb +226 -0
  43. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langthaimodel.rb +201 -0
  44. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +147 -0
  45. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +89 -0
  46. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +47 -0
  47. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +542 -0
  48. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +124 -0
  49. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +58 -0
  50. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +88 -0
  51. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +166 -0
  52. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +87 -0
  53. data/lib/tmail/version.rb +1 -1
  54. data/setup.rb +2 -2
  55. data/test/fixtures/apple_unquoted_content_type +44 -0
  56. data/test/fixtures/inline_attachment.txt +2095 -0
  57. data/test/fixtures/iso_8859_1_email_without_encoding_and_message_id.txt +16 -0
  58. data/test/fixtures/mailbox.zip +0 -0
  59. data/test/fixtures/marked_as_iso_8859_1_but_it_is_utf_8.txt +33 -0
  60. data/test/fixtures/marked_as_utf_8_but_it_is_iso_8859_1.txt +56 -0
  61. data/test/fixtures/raw_email_bad_time +62 -0
  62. data/test/fixtures/raw_email_double_at_in_header +14 -0
  63. data/test/fixtures/raw_email_only_attachment +17 -0
  64. data/test/fixtures/raw_email_string_in_date_field +17 -0
  65. data/test/fixtures/raw_email_trailing_dot +21 -0
  66. data/test/fixtures/raw_email_with_quoted_attachment_filename +60 -0
  67. data/test/fixtures/raw_email_with_wrong_splitted_multibyte_encoded_word_subject +15 -0
  68. data/test/fixtures/the_only_part_is_a_word_document.txt +425 -0
  69. data/test/fixtures/unquoted_filename_in_attachment +177 -0
  70. data/test/test_address.rb +114 -92
  71. data/test/test_attachments.rb +84 -1
  72. data/test/test_encode.rb +54 -0
  73. data/test/test_header.rb +60 -2
  74. data/test/test_mail.rb +22 -15
  75. data/test/test_mbox.rb +12 -3
  76. data/test/test_port.rb +13 -9
  77. data/test/test_quote.rb +9 -0
  78. data/tmail.gemspec +34 -0
  79. metadata +68 -167
  80. data/MANIFEST +0 -191
  81. data/log/BugTrackingLog.txt +0 -1231
  82. data/log/Changelog.txt +0 -534
  83. data/log/Fixme.txt +0 -6
  84. data/log/Testlog.txt +0 -2340
  85. data/log/Todo.txt +0 -30
  86. data/log/fixme.rdoc +0 -6
  87. data/meta/MANIFEST +0 -128
  88. data/meta/VERSION +0 -1
  89. data/meta/project.yaml +0 -30
  90. data/meta/unixname +0 -1
  91. data/sample/bench_base64.rb +0 -48
  92. data/sample/data/multipart +0 -23
  93. data/sample/data/normal +0 -29
  94. data/sample/data/sendtest +0 -5
  95. data/sample/data/simple +0 -14
  96. data/sample/data/test +0 -27
  97. data/sample/extract-attachements.rb +0 -33
  98. data/sample/from-check.rb +0 -26
  99. data/sample/multipart.rb +0 -26
  100. data/sample/parse-bench.rb +0 -68
  101. data/sample/parse-test.rb +0 -19
  102. data/sample/sendmail.rb +0 -94
  103. data/site/contributing/index.html +0 -183
  104. data/site/css/clean.css +0 -27
  105. data/site/css/layout.css +0 -31
  106. data/site/css/style.css +0 -60
  107. data/site/download/index.html +0 -61
  108. data/site/img/envelope.jpg +0 -0
  109. data/site/img/mailman.gif +0 -0
  110. data/site/img/stamp-sm.jpg +0 -0
  111. data/site/img/stamp.jpg +0 -0
  112. data/site/img/stampborder.jpg +0 -0
  113. data/site/img/tfire.jpg +0 -0
  114. data/site/img/tmail.png +0 -0
  115. data/site/index.html +0 -270
  116. data/site/js/jquery.js +0 -31
  117. data/site/log/Changelog.xsl +0 -33
  118. data/site/log/changelog.xml +0 -1677
  119. data/site/outdated/BUGS +0 -3
  120. data/site/outdated/DEPENDS +0 -1
  121. data/site/outdated/Incompatibilities +0 -89
  122. data/site/outdated/Incompatibilities.ja +0 -102
  123. data/site/outdated/NEWS +0 -9
  124. data/site/outdated/README.ja +0 -73
  125. data/site/outdated/doc.ja/address.html +0 -275
  126. data/site/outdated/doc.ja/basics.html +0 -405
  127. data/site/outdated/doc.ja/config.html +0 -49
  128. data/site/outdated/doc.ja/details.html +0 -146
  129. data/site/outdated/doc.ja/index.html +0 -39
  130. data/site/outdated/doc.ja/mail.html +0 -793
  131. data/site/outdated/doc.ja/mailbox.html +0 -265
  132. data/site/outdated/doc.ja/port.html +0 -95
  133. data/site/outdated/doc.ja/tmail.html +0 -58
  134. data/site/outdated/doc.ja/usage.html +0 -202
  135. data/site/outdated/rdd/address.rrd.m +0 -229
  136. data/site/outdated/rdd/basics.rd.m +0 -275
  137. data/site/outdated/rdd/config.rrd.m +0 -26
  138. data/site/outdated/rdd/details.rd.m +0 -117
  139. data/site/outdated/rdd/index.rhtml.m +0 -54
  140. data/site/outdated/rdd/mail.rrd.m +0 -701
  141. data/site/outdated/rdd/mailbox.rrd.m +0 -228
  142. data/site/outdated/rdd/port.rrd.m +0 -69
  143. data/site/outdated/rdd/tmail.rrd.m +0 -33
  144. data/site/outdated/rdd/usage.rd.m +0 -247
  145. data/site/quickstart/index.html +0 -69
  146. data/site/quickstart/quickstart.html +0 -52
  147. data/site/quickstart/usage.html +0 -193
  148. data/site/reference/address.html +0 -247
  149. data/site/reference/config.html +0 -30
  150. data/site/reference/index.html +0 -101
  151. data/site/reference/mail.html +0 -726
  152. data/site/reference/mailbox.html +0 -245
  153. data/site/reference/port.html +0 -75
  154. data/site/reference/tmail.html +0 -35
  155. data/work/script/make +0 -26
  156. data/work/script/rdoc +0 -39
  157. data/work/script/setup +0 -1616
  158. data/work/script/test +0 -30
@@ -0,0 +1,124 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  SAMPLE_SIZE = 64
  SB_ENOUGH_REL_THRESHOLD = 1024
  POSITIVE_SHORTCUT_THRESHOLD = 0.95
  NEGATIVE_SHORTCUT_THRESHOLD = 0.05
  SYMBOL_CAT_ORDER = 250
  NUMBER_OF_SEQ_CAT = 4
  POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
  #NEGATIVE_CAT = 0

  # Prober for a single-byte charset, driven by a model hash.
  #
  # The model is read through these keys: 'charsetName', 'keepEnglishLetter',
  # 'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
  class SingleByteCharSetProber < CharSetProber
    # model      - the model hash described above.
    # reversed   - true if every character pair must be reversed before the
    #              precedence-matrix lookup.
    # nameProber - optional auxiliary prober that decides the charset name.
    def initialize(model, reversed=false, nameProber=nil)
      super()
      @_mModel = model
      @_mReversed = reversed
      @_mNameProber = nameProber
      reset()
    end

    # Clears all counters gathered from previously fed data.
    def reset
      super()
      @_mLastOrder = 255 # char order of last character
      @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
      @_mTotalSeqs = 0
      @_mTotalChar = 0
      @_mFreqChar = 0 # characters that fall in our sampling range
    end

    # Returns the name reported by the auxiliary name prober when one was
    # given, otherwise the model's own 'charsetName'.
    def get_charset_name
      if @_mNameProber
        return @_mNameProber.get_charset_name()
      else
        return @_mModel['charsetName']
      end
    end

    # Feeds a buffer into the prober and returns the resulting state.
    #
    # Every byte is mapped to its "order" through the model; pairs of
    # consecutive bytes whose orders are both below SAMPLE_SIZE are counted
    # per precedence category. Once more than SB_ENOUGH_REL_THRESHOLD pairs
    # have been seen, a very high or very low confidence settles the state.
    def feed(aBuf)
      unless @_mModel['keepEnglishLetter']
        aBuf = filter_without_english_letters(aBuf)
      end
      # 0 is truthy in Ruby, so the length must be compared explicitly.
      return get_state() if aBuf.length == 0

      char_to_order = @_mModel['charToOrderMap']
      precedence = @_mModel['precedenceMatrix']
      aBuf.each_byte do |byte|
        # Look up by the integer byte value. Going through byte.chr[0] only
        # yields an Integer on Ruby 1.8; later versions return a String.
        order = char_to_order[byte]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
          @_mFreqChar += 1
          if @_mLastOrder < SAMPLE_SIZE
            @_mTotalSeqs += 1
            pair_index = if @_mReversed
              # reverse the order of the letters in the lookup
              (order * SAMPLE_SIZE) + @_mLastOrder
            else
              (@_mLastOrder * SAMPLE_SIZE) + order
            end
            @_mSeqCounters[precedence[pair_index]] += 1
          end
        end
        @_mLastOrder = order
      end

      if get_state() == EDetecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
          $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
          @_mState = EFoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
          $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
          @_mState = ENotMe
        end
      end

      return get_state()
    end

    # Returns the confidence for this charset.
    #
    # It is 0.01 until at least one pair has been counted. After that it is
    # the share of POSITIVE_CAT pairs, divided by the model's typical
    # positive ratio and scaled by the fraction of counted characters that
    # fall inside the sampling range. Values of 1.0 or more become 0.99.
    def get_confidence
      r = 0.01
      if @_mTotalSeqs > 0
        r = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs / @_mModel['mTypicalPositiveRatio']
        r = r * @_mFreqChar / @_mTotalChar
        r = 0.99 if r >= 1.0
      end
      return r
    end
  end
end
@@ -0,0 +1,58 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  # Group prober that bundles one SingleByteCharSetProber per single-byte
  # model, plus the Hebrew prober and its logical/visual helper probers.
  class SBCSGroupProber < CharSetGroupProber
    def initialize
      super

      # One plain prober per model, in this fixed order.
      plain_models = [
        Win1251CyrillicModel,
        Koi8rModel,
        Latin5CyrillicModel,
        MacCyrillicModel,
        Ibm866Model,
        Ibm855Model,
        Latin7GreekModel,
        Win1253GreekModel,
        Latin5BulgarianModel,
        Win1251BulgarianModel,
        Latin2HungarianModel,
        Win1250HungarianModel,
        TIS620ThaiModel,
      ]
      @_mProbers = plain_models.map { |model| SingleByteCharSetProber.new(model) }

      # The same Hebrew model is probed twice, once as-is ("logical") and
      # once with reversed pairs ("visual"). Both use the HebrewProber as
      # their name prober, and it is told about the two of them.
      hebrew = HebrewProber.new
      logical = SingleByteCharSetProber.new(Win1255HebrewModel, false, hebrew)
      visual = SingleByteCharSetProber.new(Win1255HebrewModel, true, hebrew)
      hebrew.set_model_probers(logical, visual)
      @_mProbers = @_mProbers + [hebrew, logical, visual]

      reset
    end
  end
end
@@ -0,0 +1,88 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  # Prober for Shift_JIS. It combines a coding state machine with a
  # character-distribution analyzer and a context analyzer.
  class SJISProber < MultiByteCharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(SJISSMModel)
      @_mDistributionAnalyzer = SJISDistributionAnalysis.new()
      @_mContextAnalyzer = SJISContextAnalysis.new()
      reset()
    end

    # Resets the inherited state and the context analyzer.
    def reset
      super()
      @_mContextAnalyzer.reset()
    end

    def get_charset_name
      return "SHIFT_JIS"
    end

    # Feeds a buffer through the state machine one element at a time and
    # returns the resulting prober state.
    #
    # EError marks the prober as ENotMe and EItsMe marks it as EFoundIt;
    # both stop processing. Each time the machine returns to EStart, the
    # character that just ended is handed to both analyzers.
    def feed(aBuf)
      aLen = aBuf.length
      (0...aLen).each do |i|
        codingState = @_mCodingSM.next_state(aBuf[i..i])
        if codingState == EError
          $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
          @_mState = ENotMe
          break
        elsif codingState == EItsMe
          @_mState = EFoundIt
          break
        elsif codingState == EStart
          charLen = @_mCodingSM.get_current_charlen()
          if i == 0
            # The character may have started in the previous buffer, so it
            # is completed using the element remembered in @_mLastChar.
            @_mLastChar[1] = aBuf[0..0]
            @_mContextAnalyzer.feed(@_mLastChar[2 - charLen..-1], charLen)
            @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
          else
            @_mContextAnalyzer.feed(aBuf[i + 1 - charLen ... i + 3 - charLen], charLen)
            @_mDistributionAnalyzer.feed(aBuf[i - 1 ... i + 1], charLen)
          end
        end
      end

      # Remember the final element for the next call. An empty buffer has
      # none, and slicing it would yield nil.
      @_mLastChar[0] = aBuf[aLen - 1..aLen - 1] if aLen > 0

      # No trailing colon after the condition: the "if cond:" form is only
      # accepted by Ruby 1.8.
      if get_state() == EDetecting
        if @_mContextAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
          @_mState = EFoundIt
        end
      end

      return get_state()
    end

    # Returns the larger of the context and distribution confidences.
    def get_confidence
      l = [@_mContextAnalyzer.get_confidence(), @_mDistributionAnalyzer.get_confidence()]
      return l.max
    end
  end
end
@@ -0,0 +1,166 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
module CharDet
  MINIMUM_THRESHOLD = 0.20
  EPureAscii = 0
  EEscAscii = 1
  EHighbyte = 2

  # Entry point of the detector: feed it one or more buffers, call #close,
  # then read #result, a hash with 'encoding' and 'confidence' keys.
  class UniversalDetector
    attr_accessor :result

    # Byte-order marks as byte values, checked in this order so that the
    # four-byte marks win over the two-byte marks they start with.
    BOM_ENCODINGS = [
      [[0xEF, 0xBB, 0xBF],       "UTF-8"],
      [[0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"],
      [[0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"],
      [[0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"],
      [[0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"],
      [[0xFF, 0xFE],             "UTF-16LE"],
      [[0xFE, 0xFF],             "UTF-16BE"]
    ].freeze

    def initialize
      @_escDetector = /(\033|\~\{)/
      @_mEscCharSetProber = nil
      @_mCharSetProbers = []
      reset()
    end

    # Returns the detector to its initial state and resets any probers
    # that were already created.
    def reset
      @result = {'encoding' => nil, 'confidence' => 0.0}
      @done = false
      @_mStart = true
      @_mGotData = false
      @_mInputState = EPureAscii
      @_mLastChar = ''
      @_mEscCharSetProber.reset() if @_mEscCharSetProber
      @_mCharSetProbers.each { |prober| prober.reset() }
    end

    # Feeds one buffer of raw data into the detector. Always returns nil;
    # the outcome is read from #result.
    #
    # The first non-empty buffer is checked for a byte-order mark, which
    # settles the result immediately. Otherwise the input stays "pure
    # ASCII" until a byte >= 0x80 or an escape sequence (ESC or "~{") is
    # seen, and from then on the data goes to the matching prober(s).
    def feed(aBuf)
      return if @done
      # An empty buffer carries no information. It must not count as
      # received data, nor overwrite the remembered last character.
      return if aBuf.empty?

      unless @_mGotData
        bom_encoding = detect_bom(aBuf)
        @result = {'encoding' => bom_encoding, 'confidence' => 1.0} if bom_encoding
      end

      @_mGotData = true
      if @result['encoding'] and (@result['confidence'] > 0.0)
        @done = true
        return
      end

      if @_mInputState == EPureAscii
        if high_byte?(aBuf)
          @_mInputState = EHighbyte
        elsif @_escDetector =~ (@_mLastChar + aBuf)
          # The previous buffer's last character is prepended so that a
          # "~{" split across two buffers is still recognised.
          @_mInputState = EEscAscii
        end
      end

      @_mLastChar = aBuf[-1..-1]

      if @_mInputState == EEscAscii
        @_mEscCharSetProber ||= EscCharSetProber.new()
        if @_mEscCharSetProber.feed(aBuf) == EFoundIt
          @result = {'encoding' => @_mEscCharSetProber.get_charset_name(),
                     'confidence' => @_mEscCharSetProber.get_confidence()}
          @done = true
        end
      elsif @_mInputState == EHighbyte
        if not @_mCharSetProbers or @_mCharSetProbers.empty?
          @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
        end
        @_mCharSetProbers.each do |prober|
          if prober.feed(aBuf) == EFoundIt
            @result = {'encoding' => prober.get_charset_name(),
                       'confidence' => prober.get_confidence()}
            @done = true
            break
          end
        end
      end

      nil
    end

    # Finishes detection. Returns the result hash when a decision is made
    # here, and nil otherwise (already done, no data received, or no
    # prober above MINIMUM_THRESHOLD).
    def close
      return if @done
      if not @_mGotData
        $stderr << "no data received!\n" if $debug
        return
      end
      @done = true

      if @_mInputState == EPureAscii
        @result = {'encoding' => 'ascii', 'confidence' => 1.0}
        return @result
      end

      if @_mInputState == EHighbyte
        # Ask each prober for its confidence once and keep the first
        # prober with the highest value.
        scored = @_mCharSetProbers.map { |prober| [prober.get_confidence, prober] }
        confidence, maxProber = scored.max_by { |score, _prober| score }
        if maxProber and confidence > MINIMUM_THRESHOLD
          @result = {'encoding' => maxProber.get_charset_name(),
                     'confidence' => confidence}
          return @result
        end
      end

      if $debug
        $stderr << "no probers hit minimum threshhold\n"
        # Only a group prober exposes its children, and there may be no
        # probers at all if the input never reached the high-byte state.
        group = @_mCharSetProbers[0]
        children = group.respond_to?(:_mProbers) ? group._mProbers : []
        children.each do |prober|
          next if not prober
          $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
        end
      end
      nil
    end

    private

    # Returns the encoding name for the byte-order mark that aBuf starts
    # with, or nil. The comparison uses byte values, so the string's own
    # encoding does not matter.
    def detect_bom(aBuf)
      head = aBuf.unpack('C4') # missing bytes come back as nil
      BOM_ENCODINGS.each do |bom, name|
        return name if head[0, bom.length] == bom
      end
      nil
    end

    # True when aBuf contains at least one byte with the high bit set.
    def high_byte?(aBuf)
      aBuf.each_byte { |byte| return true if byte >= 0x80 }
      false
    end
  end
end
@@ -0,0 +1,87 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
module CharDet
  ONE_CHAR_PROB = 0.5

  # Prober for UTF-8. It runs the UTF-8 coding state machine over the
  # input and counts how many multi-byte characters were completed.
  class UTF8Prober < CharSetProber
    def initialize
      super()
      @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
      reset()
    end

    # Resets the inherited state, the state machine and the multi-byte
    # character counter.
    def reset
      super()
      @_mCodingSM.reset()
      @_mNumOfMBChar = 0
    end

    def get_charset_name
      return "utf-8"
    end

    # Feeds a buffer byte by byte through the state machine and returns
    # the resulting prober state.
    #
    # EError marks the prober as ENotMe and EItsMe marks it as EFoundIt;
    # both stop processing. Each return to EStart with a character length
    # of two or more counts as one multi-byte character.
    def feed(aBuf)
      aBuf.each_byte do |b|
        codingState = @_mCodingSM.next_state(b.chr)
        if codingState == EError
          @_mState = ENotMe
          break
        elsif codingState == EItsMe
          @_mState = EFoundIt
          break
        elsif codingState == EStart
          @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen() >= 2
        end
      end

      # No trailing colon after the condition: the "if cond:" form is only
      # accepted by Ruby 1.8.
      if get_state() == EDetecting
        @_mState = EFoundIt if get_confidence() > SHORTCUT_THRESHOLD
      end

      return get_state()
    end

    # Returns the confidence that the input is UTF-8.
    #
    # With fewer than six multi-byte characters seen, it is
    # 1.0 - 0.99 * ONE_CHAR_PROB ** count. Otherwise it is 0.99.
    def get_confidence
      unlike = 0.99
      return unlike unless @_mNumOfMBChar < 6

      @_mNumOfMBChar.times { unlike = unlike * ONE_CHAR_PROB }
      return 1.0 - unlike
    end
  end
end