tmail 1.2.3.1 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. data/README +10 -0
  2. data/ext/tmailscanner/tmail/tmailscanner.c +39 -8
  3. data/lib/tmail.rb +1 -0
  4. data/lib/tmail/address.rb +6 -40
  5. data/lib/tmail/attachments.rb +41 -22
  6. data/lib/tmail/encode.rb +9 -0
  7. data/lib/tmail/header.rb +5 -3
  8. data/lib/tmail/interface.rb +40 -3
  9. data/lib/tmail/mail.rb +3 -3
  10. data/lib/tmail/mailbox.rb +3 -2
  11. data/lib/tmail/net.rb +3 -1
  12. data/lib/tmail/parser.rb +204 -620
  13. data/lib/tmail/parser.y +38 -3
  14. data/lib/tmail/quoting.rb +38 -1
  15. data/lib/tmail/utils.rb +28 -4
  16. data/lib/tmail/vendor/rchardet-1.3/COPYING +504 -0
  17. data/lib/tmail/vendor/rchardet-1.3/README +12 -0
  18. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet.rb +67 -0
  19. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5freq.rb +927 -0
  20. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/big5prober.rb +42 -0
  21. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb +237 -0
  22. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetgroupprober.rb +112 -0
  23. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/charsetprober.rb +75 -0
  24. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/codingstatemachine.rb +64 -0
  25. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/constants.rb +42 -0
  26. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escprober.rb +90 -0
  27. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/escsm.rb +244 -0
  28. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/eucjpprober.rb +88 -0
  29. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrfreq.rb +596 -0
  30. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euckrprober.rb +42 -0
  31. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwfreq.rb +430 -0
  32. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/euctwprober.rb +42 -0
  33. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312freq.rb +474 -0
  34. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/gb2312prober.rb +42 -0
  35. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/hebrewprober.rb +289 -0
  36. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jisfreq.rb +570 -0
  37. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/jpcntx.rb +229 -0
  38. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langbulgarianmodel.rb +229 -0
  39. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langcyrillicmodel.rb +330 -0
  40. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langgreekmodel.rb +227 -0
  41. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhebrewmodel.rb +202 -0
  42. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langhungarianmodel.rb +226 -0
  43. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/langthaimodel.rb +201 -0
  44. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/latin1prober.rb +147 -0
  45. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcharsetprober.rb +89 -0
  46. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcsgroupprober.rb +47 -0
  47. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/mbcssm.rb +542 -0
  48. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcharsetprober.rb +124 -0
  49. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sbcsgroupprober.rb +58 -0
  50. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/sjisprober.rb +88 -0
  51. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb +166 -0
  52. data/lib/tmail/vendor/rchardet-1.3/lib/rchardet/utf8prober.rb +87 -0
  53. data/lib/tmail/version.rb +1 -1
  54. data/setup.rb +2 -2
  55. data/test/fixtures/apple_unquoted_content_type +44 -0
  56. data/test/fixtures/inline_attachment.txt +2095 -0
  57. data/test/fixtures/iso_8859_1_email_without_encoding_and_message_id.txt +16 -0
  58. data/test/fixtures/mailbox.zip +0 -0
  59. data/test/fixtures/marked_as_iso_8859_1_but_it_is_utf_8.txt +33 -0
  60. data/test/fixtures/marked_as_utf_8_but_it_is_iso_8859_1.txt +56 -0
  61. data/test/fixtures/raw_email_bad_time +62 -0
  62. data/test/fixtures/raw_email_double_at_in_header +14 -0
  63. data/test/fixtures/raw_email_only_attachment +17 -0
  64. data/test/fixtures/raw_email_string_in_date_field +17 -0
  65. data/test/fixtures/raw_email_trailing_dot +21 -0
  66. data/test/fixtures/raw_email_with_quoted_attachment_filename +60 -0
  67. data/test/fixtures/raw_email_with_wrong_splitted_multibyte_encoded_word_subject +15 -0
  68. data/test/fixtures/the_only_part_is_a_word_document.txt +425 -0
  69. data/test/fixtures/unquoted_filename_in_attachment +177 -0
  70. data/test/test_address.rb +114 -92
  71. data/test/test_attachments.rb +84 -1
  72. data/test/test_encode.rb +54 -0
  73. data/test/test_header.rb +60 -2
  74. data/test/test_mail.rb +22 -15
  75. data/test/test_mbox.rb +12 -3
  76. data/test/test_port.rb +13 -9
  77. data/test/test_quote.rb +9 -0
  78. data/tmail.gemspec +34 -0
  79. metadata +68 -167
  80. data/MANIFEST +0 -191
  81. data/log/BugTrackingLog.txt +0 -1231
  82. data/log/Changelog.txt +0 -534
  83. data/log/Fixme.txt +0 -6
  84. data/log/Testlog.txt +0 -2340
  85. data/log/Todo.txt +0 -30
  86. data/log/fixme.rdoc +0 -6
  87. data/meta/MANIFEST +0 -128
  88. data/meta/VERSION +0 -1
  89. data/meta/project.yaml +0 -30
  90. data/meta/unixname +0 -1
  91. data/sample/bench_base64.rb +0 -48
  92. data/sample/data/multipart +0 -23
  93. data/sample/data/normal +0 -29
  94. data/sample/data/sendtest +0 -5
  95. data/sample/data/simple +0 -14
  96. data/sample/data/test +0 -27
  97. data/sample/extract-attachements.rb +0 -33
  98. data/sample/from-check.rb +0 -26
  99. data/sample/multipart.rb +0 -26
  100. data/sample/parse-bench.rb +0 -68
  101. data/sample/parse-test.rb +0 -19
  102. data/sample/sendmail.rb +0 -94
  103. data/site/contributing/index.html +0 -183
  104. data/site/css/clean.css +0 -27
  105. data/site/css/layout.css +0 -31
  106. data/site/css/style.css +0 -60
  107. data/site/download/index.html +0 -61
  108. data/site/img/envelope.jpg +0 -0
  109. data/site/img/mailman.gif +0 -0
  110. data/site/img/stamp-sm.jpg +0 -0
  111. data/site/img/stamp.jpg +0 -0
  112. data/site/img/stampborder.jpg +0 -0
  113. data/site/img/tfire.jpg +0 -0
  114. data/site/img/tmail.png +0 -0
  115. data/site/index.html +0 -270
  116. data/site/js/jquery.js +0 -31
  117. data/site/log/Changelog.xsl +0 -33
  118. data/site/log/changelog.xml +0 -1677
  119. data/site/outdated/BUGS +0 -3
  120. data/site/outdated/DEPENDS +0 -1
  121. data/site/outdated/Incompatibilities +0 -89
  122. data/site/outdated/Incompatibilities.ja +0 -102
  123. data/site/outdated/NEWS +0 -9
  124. data/site/outdated/README.ja +0 -73
  125. data/site/outdated/doc.ja/address.html +0 -275
  126. data/site/outdated/doc.ja/basics.html +0 -405
  127. data/site/outdated/doc.ja/config.html +0 -49
  128. data/site/outdated/doc.ja/details.html +0 -146
  129. data/site/outdated/doc.ja/index.html +0 -39
  130. data/site/outdated/doc.ja/mail.html +0 -793
  131. data/site/outdated/doc.ja/mailbox.html +0 -265
  132. data/site/outdated/doc.ja/port.html +0 -95
  133. data/site/outdated/doc.ja/tmail.html +0 -58
  134. data/site/outdated/doc.ja/usage.html +0 -202
  135. data/site/outdated/rdd/address.rrd.m +0 -229
  136. data/site/outdated/rdd/basics.rd.m +0 -275
  137. data/site/outdated/rdd/config.rrd.m +0 -26
  138. data/site/outdated/rdd/details.rd.m +0 -117
  139. data/site/outdated/rdd/index.rhtml.m +0 -54
  140. data/site/outdated/rdd/mail.rrd.m +0 -701
  141. data/site/outdated/rdd/mailbox.rrd.m +0 -228
  142. data/site/outdated/rdd/port.rrd.m +0 -69
  143. data/site/outdated/rdd/tmail.rrd.m +0 -33
  144. data/site/outdated/rdd/usage.rd.m +0 -247
  145. data/site/quickstart/index.html +0 -69
  146. data/site/quickstart/quickstart.html +0 -52
  147. data/site/quickstart/usage.html +0 -193
  148. data/site/reference/address.html +0 -247
  149. data/site/reference/config.html +0 -30
  150. data/site/reference/index.html +0 -101
  151. data/site/reference/mail.html +0 -726
  152. data/site/reference/mailbox.html +0 -245
  153. data/site/reference/port.html +0 -75
  154. data/site/reference/tmail.html +0 -35
  155. data/work/script/make +0 -26
  156. data/work/script/rdoc +0 -39
  157. data/work/script/setup +0 -1616
  158. data/work/script/test +0 -30
@@ -0,0 +1,147 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ #
14
+ # This library is free software; you can redistribute it and/or
15
+ # modify it under the terms of the GNU Lesser General Public
16
+ # License as published by the Free Software Foundation; either
17
+ # version 2.1 of the License, or (at your option) any later version.
18
+ #
19
+ # This library is distributed in the hope that it will be useful,
20
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ # Lesser General Public License for more details.
23
+ #
24
+ # You should have received a copy of the GNU Lesser General Public
25
+ # License along with this library; if not, write to the Free Software
26
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ # 02110-1301 USA
28
+ ######################### END LICENSE BLOCK #########################
29
+
30
+ module CharDet
31
+ FREQ_CAT_NUM = 4
32
+
33
+ UDF = 0 # undefined
34
+ OTH = 1 # other
35
+ ASC = 2 # ascii capital letter
36
+ ASS = 3 # ascii small letter
37
+ ACV = 4 # accent capital vowel
38
+ ACO = 5 # accent capital other
39
+ ASV = 6 # accent small vowel
40
+ ASO = 7 # accent small other
41
+ CLASS_NUM = 8 # total classes
42
+
43
+ Latin1_CharToClass = [
44
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
45
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
46
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
47
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
48
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
49
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
50
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
51
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
52
+ OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
53
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
54
+ ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
55
+ ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
56
+ OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
57
+ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
58
+ ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
59
+ ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
60
+ OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87
61
+ OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F
62
+ UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97
63
+ OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F
64
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7
65
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF
66
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
67
+ OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF
68
+ ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7
69
+ ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF
70
+ ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7
71
+ ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF
72
+ ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7
73
+ ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF
74
+ ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7
75
+ ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF
76
+ ]
77
+
78
+ # 0 : illegal
79
+ # 1 : very unlikely
80
+ # 2 : normal
81
+ # 3 : very likely
82
+ Latin1ClassModel = [
83
+ # UDF OTH ASC ASS ACV ACO ASV ASO
84
+ 0, 0, 0, 0, 0, 0, 0, 0, # UDF
85
+ 0, 3, 3, 3, 3, 3, 3, 3, # OTH
86
+ 0, 3, 3, 3, 3, 3, 3, 3, # ASC
87
+ 0, 3, 3, 3, 1, 1, 3, 3, # ASS
88
+ 0, 3, 3, 3, 1, 2, 1, 2, # ACV
89
+ 0, 3, 3, 3, 3, 3, 3, 3, # ACO
90
+ 0, 3, 1, 3, 1, 1, 1, 3, # ASV
91
+ 0, 3, 1, 3, 1, 1, 3, 3, # ASO
92
+ ]
93
+
94
+ class Latin1Prober < CharSetProber
95
+ def initialize
96
+ super
97
+ reset()
98
+ end
99
+
100
+ def reset
101
+ @_mLastCharClass = OTH
102
+ @_mFreqCounter = [0] * FREQ_CAT_NUM
103
+ super
104
+ end
105
+
106
+ def get_charset_name
107
+ return "windows-1252"
108
+ end
109
+
110
+ def feed(aBuf)
111
+ aBuf = filter_with_english_letters(aBuf)
112
+ aBuf.each_byte do |b|
113
+ c = b.chr
114
+ charClass = Latin1_CharToClass[c[0]]
115
+ freq = Latin1ClassModel[(@_mLastCharClass * CLASS_NUM) + charClass]
116
+ if freq == 0
117
+ @_mState = ENotMe
118
+ break
119
+ end
120
+ @_mFreqCounter[freq] += 1
121
+ @_mLastCharClass = charClass
122
+ end
123
+
124
+ return get_state()
125
+ end
126
+
127
+ def get_confidence
128
+ if get_state() == ENotMe
129
+ return 0.01
130
+ end
131
+
132
+ total = @_mFreqCounter.inject{|a,b| a+b}
133
+ if total < 0.01
134
+ confidence = 0.0
135
+ else
136
+ confidence = (@_mFreqCounter[3] / total) - (@_mFreqCounter[1] * 20.0 / total)
137
+ end
138
+ if confidence < 0.0
139
+ confidence = 0.0
140
+ end
141
+ # lower the confidence of latin1 so that other more accurate detector
142
+ # can take priority.
143
+ confidence = confidence * 0.5
144
+ return confidence
145
+ end
146
+ end
147
+ end
@@ -0,0 +1,89 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ # Proofpoint, Inc.
14
+ #
15
+ # This library is free software; you can redistribute it and/or
16
+ # modify it under the terms of the GNU Lesser General Public
17
+ # License as published by the Free Software Foundation; either
18
+ # version 2.1 of the License, or (at your option) any later version.
19
+ #
20
+ # This library is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
+ # Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public
26
+ # License along with this library; if not, write to the Free Software
27
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
+ # 02110-1301 USA
29
+ ######################### END LICENSE BLOCK #########################
30
+
31
+ module CharDet
32
+ class MultiByteCharSetProber < CharSetProber
33
+ def initialize
34
+ super
35
+ @_mDistributionAnalyzer = nil
36
+ @_mCodingSM = nil
37
+ @_mLastChar = "\x00\x00"
38
+ end
39
+
40
+ def reset
41
+ super
42
+ if @_mCodingSM
43
+ @_mCodingSM.reset()
44
+ end
45
+ if @_mDistributionAnalyzer
46
+ @_mDistributionAnalyzer.reset()
47
+ end
48
+ @_mLastChar = "\x00\x00"
49
+ end
50
+
51
+ def get_charset_name
52
+ end
53
+
54
+ def feed(aBuf)
55
+ aLen = aBuf.length
56
+ for i in (0...aLen)
57
+ codingState = @_mCodingSM.next_state(aBuf[i..i])
58
+ if codingState == EError
59
+ $stderr << "#{get_charset_name} prober hit error at byte #{i}\n" if $debug
60
+ @_mState = ENotMe
61
+ break
62
+ elsif codingState == EItsMe
63
+ @_mState = EFoundIt
64
+ break
65
+ elsif codingState == EStart
66
+ charLen = @_mCodingSM.get_current_charlen()
67
+ if i == 0
68
+ @_mLastChar[1] = aBuf[0..0]
69
+ @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
70
+ else
71
+ @_mDistributionAnalyzer.feed(aBuf[i-1...i+1], charLen)
72
+ end
73
+ end
74
+ end
75
+ @_mLastChar[0] = aBuf[aLen-1..aLen-1]
76
+
77
+ if get_state() == EDetecting
78
+ if @_mDistributionAnalyzer.got_enough_data() and (get_confidence() > SHORTCUT_THRESHOLD)
79
+ @_mState = EFoundIt
80
+ end
81
+ end
82
+ return get_state()
83
+ end
84
+
85
+ def get_confidence
86
+ return @_mDistributionAnalyzer.get_confidence()
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,47 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is Mozilla Universal charset detector code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 2001
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ # Shy Shalom - original C code
13
+ # Proofpoint, Inc.
14
+ #
15
+ # This library is free software; you can redistribute it and/or
16
+ # modify it under the terms of the GNU Lesser General Public
17
+ # License as published by the Free Software Foundation; either
18
+ # version 2.1 of the License, or (at your option) any later version.
19
+ #
20
+ # This library is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
+ # Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public
26
+ # License along with this library; if not, write to the Free Software
27
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28
+ # 02110-1301 USA
29
+ ######################### END LICENSE BLOCK #########################
30
+
31
+ module CharDet
32
+ class MBCSGroupProber < CharSetGroupProber # Group prober whose members are the seven multi-byte probers listed below.
33
+ def initialize # Builds one instance of each member prober, then calls reset().
34
+ super
35
+ @_mProbers = [
36
+ UTF8Prober.new,
37
+ SJISProber.new,
38
+ EUCJPProber.new,
39
+ GB2312Prober.new,
40
+ EUCKRProber.new,
41
+ Big5Prober.new,
42
+ EUCTWProber.new
43
+ ]
44
+ reset()
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,542 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # The Original Code is mozilla.org code.
3
+ #
4
+ # The Initial Developer of the Original Code is
5
+ # Netscape Communications Corporation.
6
+ # Portions created by the Initial Developer are Copyright (C) 1998
7
+ # the Initial Developer. All Rights Reserved.
8
+ #
9
+ # Contributor(s):
10
+ # Jeff Hodges - port to Ruby
11
+ # Mark Pilgrim - port to Python
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
+ # 02110-1301 USA
27
+ ######################### END LICENSE BLOCK #########################
28
+
29
+ module CharDet
30
+ # BIG5
31
+
32
+ BIG5_cls = [
33
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
34
+ 1,1,1,1,1,1,0,0, # 08 - 0f
35
+ 1,1,1,1,1,1,1,1, # 10 - 17
36
+ 1,1,1,0,1,1,1,1, # 18 - 1f
37
+ 1,1,1,1,1,1,1,1, # 20 - 27
38
+ 1,1,1,1,1,1,1,1, # 28 - 2f
39
+ 1,1,1,1,1,1,1,1, # 30 - 37
40
+ 1,1,1,1,1,1,1,1, # 38 - 3f
41
+ 2,2,2,2,2,2,2,2, # 40 - 47
42
+ 2,2,2,2,2,2,2,2, # 48 - 4f
43
+ 2,2,2,2,2,2,2,2, # 50 - 57
44
+ 2,2,2,2,2,2,2,2, # 58 - 5f
45
+ 2,2,2,2,2,2,2,2, # 60 - 67
46
+ 2,2,2,2,2,2,2,2, # 68 - 6f
47
+ 2,2,2,2,2,2,2,2, # 70 - 77
48
+ 2,2,2,2,2,2,2,1, # 78 - 7f
49
+ 4,4,4,4,4,4,4,4, # 80 - 87
50
+ 4,4,4,4,4,4,4,4, # 88 - 8f
51
+ 4,4,4,4,4,4,4,4, # 90 - 97
52
+ 4,4,4,4,4,4,4,4, # 98 - 9f
53
+ 4,3,3,3,3,3,3,3, # a0 - a7
54
+ 3,3,3,3,3,3,3,3, # a8 - af
55
+ 3,3,3,3,3,3,3,3, # b0 - b7
56
+ 3,3,3,3,3,3,3,3, # b8 - bf
57
+ 3,3,3,3,3,3,3,3, # c0 - c7
58
+ 3,3,3,3,3,3,3,3, # c8 - cf
59
+ 3,3,3,3,3,3,3,3, # d0 - d7
60
+ 3,3,3,3,3,3,3,3, # d8 - df
61
+ 3,3,3,3,3,3,3,3, # e0 - e7
62
+ 3,3,3,3,3,3,3,3, # e8 - ef
63
+ 3,3,3,3,3,3,3,3, # f0 - f7
64
+ 3,3,3,3,3,3,3,0 # f8 - ff
65
+ ]
66
+
67
+ BIG5_st = [
68
+ EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
69
+ EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,#08-0f
70
+ EError,EStart,EStart,EStart,EStart,EStart,EStart,EStart #10-17
71
+ ]
72
+
73
+ Big5CharLenTable = [0, 1, 1, 2, 0]
74
+
75
+ Big5SMModel = {'classTable' => BIG5_cls,
76
+ 'classFactor' => 5,
77
+ 'stateTable' => BIG5_st,
78
+ 'charLenTable' => Big5CharLenTable,
79
+ 'name' => 'Big5'
80
+ }
81
+
82
+ # EUC-JP
83
+
84
+ EUCJP_cls = [
85
+ 4,4,4,4,4,4,4,4, # 00 - 07
86
+ 4,4,4,4,4,4,5,5, # 08 - 0f
87
+ 4,4,4,4,4,4,4,4, # 10 - 17
88
+ 4,4,4,5,4,4,4,4, # 18 - 1f
89
+ 4,4,4,4,4,4,4,4, # 20 - 27
90
+ 4,4,4,4,4,4,4,4, # 28 - 2f
91
+ 4,4,4,4,4,4,4,4, # 30 - 37
92
+ 4,4,4,4,4,4,4,4, # 38 - 3f
93
+ 4,4,4,4,4,4,4,4, # 40 - 47
94
+ 4,4,4,4,4,4,4,4, # 48 - 4f
95
+ 4,4,4,4,4,4,4,4, # 50 - 57
96
+ 4,4,4,4,4,4,4,4, # 58 - 5f
97
+ 4,4,4,4,4,4,4,4, # 60 - 67
98
+ 4,4,4,4,4,4,4,4, # 68 - 6f
99
+ 4,4,4,4,4,4,4,4, # 70 - 77
100
+ 4,4,4,4,4,4,4,4, # 78 - 7f
101
+ 5,5,5,5,5,5,5,5, # 80 - 87
102
+ 5,5,5,5,5,5,1,3, # 88 - 8f
103
+ 5,5,5,5,5,5,5,5, # 90 - 97
104
+ 5,5,5,5,5,5,5,5, # 98 - 9f
105
+ 5,2,2,2,2,2,2,2, # a0 - a7
106
+ 2,2,2,2,2,2,2,2, # a8 - af
107
+ 2,2,2,2,2,2,2,2, # b0 - b7
108
+ 2,2,2,2,2,2,2,2, # b8 - bf
109
+ 2,2,2,2,2,2,2,2, # c0 - c7
110
+ 2,2,2,2,2,2,2,2, # c8 - cf
111
+ 2,2,2,2,2,2,2,2, # d0 - d7
112
+ 2,2,2,2,2,2,2,2, # d8 - df
113
+ 0,0,0,0,0,0,0,0, # e0 - e7
114
+ 0,0,0,0,0,0,0,0, # e8 - ef
115
+ 0,0,0,0,0,0,0,0, # f0 - f7
116
+ 0,0,0,0,0,0,0,5 # f8 - ff
117
+ ]
118
+
119
+ EUCJP_st = [
120
+ 3, 4, 3, 5,EStart,EError,EError,EError,#00-07
121
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
122
+ EItsMe,EItsMe,EStart,EError,EStart,EError,EError,EError,#10-17
123
+ EError,EError,EStart,EError,EError,EError, 3,EError,#18-1f
124
+ 3,EError,EError,EError,EStart,EStart,EStart,EStart #20-27
125
+ ]
126
+
127
+ EUCJPCharLenTable = [2, 2, 2, 3, 1, 0]
128
+
129
+ EUCJPSMModel = {'classTable' => EUCJP_cls,
130
+ 'classFactor' => 6,
131
+ 'stateTable' => EUCJP_st,
132
+ 'charLenTable' => EUCJPCharLenTable,
133
+ 'name' => 'EUC-JP'
134
+ }
135
+
136
+ # EUC-KR
137
+
138
+ EUCKR_cls = [
139
+ 1,1,1,1,1,1,1,1, # 00 - 07
140
+ 1,1,1,1,1,1,0,0, # 08 - 0f
141
+ 1,1,1,1,1,1,1,1, # 10 - 17
142
+ 1,1,1,0,1,1,1,1, # 18 - 1f
143
+ 1,1,1,1,1,1,1,1, # 20 - 27
144
+ 1,1,1,1,1,1,1,1, # 28 - 2f
145
+ 1,1,1,1,1,1,1,1, # 30 - 37
146
+ 1,1,1,1,1,1,1,1, # 38 - 3f
147
+ 1,1,1,1,1,1,1,1, # 40 - 47
148
+ 1,1,1,1,1,1,1,1, # 48 - 4f
149
+ 1,1,1,1,1,1,1,1, # 50 - 57
150
+ 1,1,1,1,1,1,1,1, # 58 - 5f
151
+ 1,1,1,1,1,1,1,1, # 60 - 67
152
+ 1,1,1,1,1,1,1,1, # 68 - 6f
153
+ 1,1,1,1,1,1,1,1, # 70 - 77
154
+ 1,1,1,1,1,1,1,1, # 78 - 7f
155
+ 0,0,0,0,0,0,0,0, # 80 - 87
156
+ 0,0,0,0,0,0,0,0, # 88 - 8f
157
+ 0,0,0,0,0,0,0,0, # 90 - 97
158
+ 0,0,0,0,0,0,0,0, # 98 - 9f
159
+ 0,2,2,2,2,2,2,2, # a0 - a7
160
+ 2,2,2,2,2,3,3,3, # a8 - af
161
+ 2,2,2,2,2,2,2,2, # b0 - b7
162
+ 2,2,2,2,2,2,2,2, # b8 - bf
163
+ 2,2,2,2,2,2,2,2, # c0 - c7
164
+ 2,3,2,2,2,2,2,2, # c8 - cf
165
+ 2,2,2,2,2,2,2,2, # d0 - d7
166
+ 2,2,2,2,2,2,2,2, # d8 - df
167
+ 2,2,2,2,2,2,2,2, # e0 - e7
168
+ 2,2,2,2,2,2,2,2, # e8 - ef
169
+ 2,2,2,2,2,2,2,2, # f0 - f7
170
+ 2,2,2,2,2,2,2,0 # f8 - ff
171
+ ]
172
+
173
+ EUCKR_st = [
174
+ EError,EStart, 3,EError,EError,EError,EError,EError,#00-07
175
+ EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,EStart#08-0f
176
+ ]
177
+
178
+ EUCKRCharLenTable = [0, 1, 2, 0]
179
+
180
+ EUCKRSMModel = {'classTable' => EUCKR_cls,
181
+ 'classFactor' => 4,
182
+ 'stateTable' => EUCKR_st,
183
+ 'charLenTable' => EUCKRCharLenTable,
184
+ 'name' => 'EUC-KR'
185
+ }
186
+
187
+ # EUC-TW
188
+
189
+ EUCTW_cls = [
190
+ 2,2,2,2,2,2,2,2, # 00 - 07
191
+ 2,2,2,2,2,2,0,0, # 08 - 0f
192
+ 2,2,2,2,2,2,2,2, # 10 - 17
193
+ 2,2,2,0,2,2,2,2, # 18 - 1f
194
+ 2,2,2,2,2,2,2,2, # 20 - 27
195
+ 2,2,2,2,2,2,2,2, # 28 - 2f
196
+ 2,2,2,2,2,2,2,2, # 30 - 37
197
+ 2,2,2,2,2,2,2,2, # 38 - 3f
198
+ 2,2,2,2,2,2,2,2, # 40 - 47
199
+ 2,2,2,2,2,2,2,2, # 48 - 4f
200
+ 2,2,2,2,2,2,2,2, # 50 - 57
201
+ 2,2,2,2,2,2,2,2, # 58 - 5f
202
+ 2,2,2,2,2,2,2,2, # 60 - 67
203
+ 2,2,2,2,2,2,2,2, # 68 - 6f
204
+ 2,2,2,2,2,2,2,2, # 70 - 77
205
+ 2,2,2,2,2,2,2,2, # 78 - 7f
206
+ 0,0,0,0,0,0,0,0, # 80 - 87
207
+ 0,0,0,0,0,0,6,0, # 88 - 8f
208
+ 0,0,0,0,0,0,0,0, # 90 - 97
209
+ 0,0,0,0,0,0,0,0, # 98 - 9f
210
+ 0,3,4,4,4,4,4,4, # a0 - a7
211
+ 5,5,1,1,1,1,1,1, # a8 - af
212
+ 1,1,1,1,1,1,1,1, # b0 - b7
213
+ 1,1,1,1,1,1,1,1, # b8 - bf
214
+ 1,1,3,1,3,3,3,3, # c0 - c7
215
+ 3,3,3,3,3,3,3,3, # c8 - cf
216
+ 3,3,3,3,3,3,3,3, # d0 - d7
217
+ 3,3,3,3,3,3,3,3, # d8 - df
218
+ 3,3,3,3,3,3,3,3, # e0 - e7
219
+ 3,3,3,3,3,3,3,3, # e8 - ef
220
+ 3,3,3,3,3,3,3,3, # f0 - f7
221
+ 3,3,3,3,3,3,3,0 # f8 - ff
222
+ ]
223
+
224
+ EUCTW_st = [
225
+ EError,EError,EStart, 3, 3, 3, 4,EError,#00-07
226
+ EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
227
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EStart,EError,#10-17
228
+ EStart,EStart,EStart,EError,EError,EError,EError,EError,#18-1f
229
+ 5,EError,EError,EError,EStart,EError,EStart,EStart,#20-27
230
+ EStart,EError,EStart,EStart,EStart,EStart,EStart,EStart #28-2f
231
+ ]
232
+
233
+ EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3]
234
+
235
+ EUCTWSMModel = {'classTable' => EUCTW_cls,
236
+ 'classFactor' => 7,
237
+ 'stateTable' => EUCTW_st,
238
+ 'charLenTable' => EUCTWCharLenTable,
239
+ 'name' => 'x-euc-tw'
240
+ }
241
+
242
+ # GB2312
243
+
244
+ GB2312_cls = [
245
+ 1,1,1,1,1,1,1,1, # 00 - 07
246
+ 1,1,1,1,1,1,0,0, # 08 - 0f
247
+ 1,1,1,1,1,1,1,1, # 10 - 17
248
+ 1,1,1,0,1,1,1,1, # 18 - 1f
249
+ 1,1,1,1,1,1,1,1, # 20 - 27
250
+ 1,1,1,1,1,1,1,1, # 28 - 2f
251
+ 3,3,3,3,3,3,3,3, # 30 - 37
252
+ 3,3,1,1,1,1,1,1, # 38 - 3f
253
+ 2,2,2,2,2,2,2,2, # 40 - 47
254
+ 2,2,2,2,2,2,2,2, # 48 - 4f
255
+ 2,2,2,2,2,2,2,2, # 50 - 57
256
+ 2,2,2,2,2,2,2,2, # 58 - 5f
257
+ 2,2,2,2,2,2,2,2, # 60 - 67
258
+ 2,2,2,2,2,2,2,2, # 68 - 6f
259
+ 2,2,2,2,2,2,2,2, # 70 - 77
260
+ 2,2,2,2,2,2,2,4, # 78 - 7f
261
+ 5,6,6,6,6,6,6,6, # 80 - 87
262
+ 6,6,6,6,6,6,6,6, # 88 - 8f
263
+ 6,6,6,6,6,6,6,6, # 90 - 97
264
+ 6,6,6,6,6,6,6,6, # 98 - 9f
265
+ 6,6,6,6,6,6,6,6, # a0 - a7
266
+ 6,6,6,6,6,6,6,6, # a8 - af
267
+ 6,6,6,6,6,6,6,6, # b0 - b7
268
+ 6,6,6,6,6,6,6,6, # b8 - bf
269
+ 6,6,6,6,6,6,6,6, # c0 - c7
270
+ 6,6,6,6,6,6,6,6, # c8 - cf
271
+ 6,6,6,6,6,6,6,6, # d0 - d7
272
+ 6,6,6,6,6,6,6,6, # d8 - df
273
+ 6,6,6,6,6,6,6,6, # e0 - e7
274
+ 6,6,6,6,6,6,6,6, # e8 - ef
275
+ 6,6,6,6,6,6,6,6, # f0 - f7
276
+ 6,6,6,6,6,6,6,0 # f8 - ff
277
+ ]
278
+
279
+ GB2312_st = [
280
+ EError,EStart,EStart,EStart,EStart,EStart, 3,EError,#00-07
281
+ EError,EError,EError,EError,EError,EError,EItsMe,EItsMe,#08-0f
282
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EError,EError,EStart,#10-17
283
+ 4,EError,EStart,EStart,EError,EError,EError,EError,#18-1f
284
+ EError,EError, 5,EError,EError,EError,EItsMe,EError,#20-27
285
+ EError,EError,EStart,EStart,EStart,EStart,EStart,EStart#28-2f
286
+ ]
287
+
288
+ # To be accurate, the length of class 6 can be either 2 or 4.
289
+ # But it is not necessary to discriminate between the two since
290
+ # it is used for frequency analysis only, and we are validating
291
+ # each code range there as well. So it is safe to set it to be
292
+ # 2 here.
293
+ GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2]
294
+
295
+ GB2312SMModel = {'classTable' => GB2312_cls,
296
+ 'classFactor' => 7,
297
+ 'stateTable' => GB2312_st,
298
+ 'charLenTable' => GB2312CharLenTable,
299
+ 'name' => 'GB2312'
300
+ }
301
+
302
+ # Shift_JIS
303
+
304
+ SJIS_cls = [
305
+ 1,1,1,1,1,1,1,1, # 00 - 07
306
+ 1,1,1,1,1,1,0,0, # 08 - 0f
307
+ 1,1,1,1,1,1,1,1, # 10 - 17
308
+ 1,1,1,0,1,1,1,1, # 18 - 1f
309
+ 1,1,1,1,1,1,1,1, # 20 - 27
310
+ 1,1,1,1,1,1,1,1, # 28 - 2f
311
+ 1,1,1,1,1,1,1,1, # 30 - 37
312
+ 1,1,1,1,1,1,1,1, # 38 - 3f
313
+ 2,2,2,2,2,2,2,2, # 40 - 47
314
+ 2,2,2,2,2,2,2,2, # 48 - 4f
315
+ 2,2,2,2,2,2,2,2, # 50 - 57
316
+ 2,2,2,2,2,2,2,2, # 58 - 5f
317
+ 2,2,2,2,2,2,2,2, # 60 - 67
318
+ 2,2,2,2,2,2,2,2, # 68 - 6f
319
+ 2,2,2,2,2,2,2,2, # 70 - 77
320
+ 2,2,2,2,2,2,2,1, # 78 - 7f
321
+ 3,3,3,3,3,3,3,3, # 80 - 87
322
+ 3,3,3,3,3,3,3,3, # 88 - 8f
323
+ 3,3,3,3,3,3,3,3, # 90 - 97
324
+ 3,3,3,3,3,3,3,3, # 98 - 9f
325
+ #0xa0 is illegal in sjis encoding, but some pages do
326
+ #contain such bytes. We need to be more forgiving of errors.
327
+ 2,2,2,2,2,2,2,2, # a0 - a7
328
+ 2,2,2,2,2,2,2,2, # a8 - af
329
+ 2,2,2,2,2,2,2,2, # b0 - b7
330
+ 2,2,2,2,2,2,2,2, # b8 - bf
331
+ 2,2,2,2,2,2,2,2, # c0 - c7
332
+ 2,2,2,2,2,2,2,2, # c8 - cf
333
+ 2,2,2,2,2,2,2,2, # d0 - d7
334
+ 2,2,2,2,2,2,2,2, # d8 - df
335
+ 3,3,3,3,3,3,3,3, # e0 - e7
336
+ 3,3,3,3,3,4,4,4, # e8 - ef
337
+ 4,4,4,4,4,4,4,4, # f0 - f7
338
+ 4,4,4,4,4,0,0,0 # f8 - ff
339
+ ]
340
+
341
+ SJIS_st = [
342
+ EError,EStart,EStart, 3,EError,EError,EError,EError,#00-07
343
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
344
+ EItsMe,EItsMe,EError,EError,EStart,EStart,EStart,EStart#10-17
345
+ ]
346
+
347
+ SJISCharLenTable = [0, 1, 1, 2, 0, 0]
348
+
349
+ SJISSMModel = {'classTable' => SJIS_cls,
350
+ 'classFactor' => 6,
351
+ 'stateTable' => SJIS_st,
352
+ 'charLenTable' => SJISCharLenTable,
353
+ 'name' => 'Shift_JIS'
354
+ }
355
+
356
+ # UCS2-BE
357
+
358
+ UCS2BE_cls = [
359
+ 0,0,0,0,0,0,0,0, # 00 - 07
360
+ 0,0,1,0,0,2,0,0, # 08 - 0f
361
+ 0,0,0,0,0,0,0,0, # 10 - 17
362
+ 0,0,0,3,0,0,0,0, # 18 - 1f
363
+ 0,0,0,0,0,0,0,0, # 20 - 27
364
+ 0,3,3,3,3,3,0,0, # 28 - 2f
365
+ 0,0,0,0,0,0,0,0, # 30 - 37
366
+ 0,0,0,0,0,0,0,0, # 38 - 3f
367
+ 0,0,0,0,0,0,0,0, # 40 - 47
368
+ 0,0,0,0,0,0,0,0, # 48 - 4f
369
+ 0,0,0,0,0,0,0,0, # 50 - 57
370
+ 0,0,0,0,0,0,0,0, # 58 - 5f
371
+ 0,0,0,0,0,0,0,0, # 60 - 67
372
+ 0,0,0,0,0,0,0,0, # 68 - 6f
373
+ 0,0,0,0,0,0,0,0, # 70 - 77
374
+ 0,0,0,0,0,0,0,0, # 78 - 7f
375
+ 0,0,0,0,0,0,0,0, # 80 - 87
376
+ 0,0,0,0,0,0,0,0, # 88 - 8f
377
+ 0,0,0,0,0,0,0,0, # 90 - 97
378
+ 0,0,0,0,0,0,0,0, # 98 - 9f
379
+ 0,0,0,0,0,0,0,0, # a0 - a7
380
+ 0,0,0,0,0,0,0,0, # a8 - af
381
+ 0,0,0,0,0,0,0,0, # b0 - b7
382
+ 0,0,0,0,0,0,0,0, # b8 - bf
383
+ 0,0,0,0,0,0,0,0, # c0 - c7
384
+ 0,0,0,0,0,0,0,0, # c8 - cf
385
+ 0,0,0,0,0,0,0,0, # d0 - d7
386
+ 0,0,0,0,0,0,0,0, # d8 - df
387
+ 0,0,0,0,0,0,0,0, # e0 - e7
388
+ 0,0,0,0,0,0,0,0, # e8 - ef
389
+ 0,0,0,0,0,0,0,0, # f0 - f7
390
+ 0,0,0,0,0,0,4,5 # f8 - ff
391
+ ]
392
+
393
+ UCS2BE_st = [
394
+ 5, 7, 7,EError, 4, 3,EError,EError,#00-07
395
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
396
+ EItsMe,EItsMe, 6, 6, 6, 6,EError,EError,#10-17
397
+ 6, 6, 6, 6, 6,EItsMe, 6, 6,#18-1f
398
+ 6, 6, 6, 6, 5, 7, 7,EError,#20-27
399
+ 5, 8, 6, 6,EError, 6, 6, 6,#28-2f
400
+ 6, 6, 6, 6,EError,EError,EStart,EStart#30-37
401
+ ]
402
+
403
+ UCS2BECharLenTable = [2, 2, 2, 0, 2, 2]
404
+
405
+ UCS2BESMModel = {'classTable' => UCS2BE_cls,
406
+ 'classFactor' => 6,
407
+ 'stateTable' => UCS2BE_st,
408
+ 'charLenTable' => UCS2BECharLenTable,
409
+ 'name' => 'UTF-16BE'
410
+ }
411
+
412
+ # UCS2-LE
413
+
414
+ UCS2LE_cls = [ # byte value -> character class; 256 entries (8 per row, row comment = byte range). Same contents as UCS2BE_cls: 0x0a->1, 0x0d->2, 0x1b and 0x29-0x2d->3, 0xfe->4, 0xff->5; every other byte is class 0.
415
+ 0,0,0,0,0,0,0,0, # 00 - 07
416
+ 0,0,1,0,0,2,0,0, # 08 - 0f
417
+ 0,0,0,0,0,0,0,0, # 10 - 17
418
+ 0,0,0,3,0,0,0,0, # 18 - 1f
419
+ 0,0,0,0,0,0,0,0, # 20 - 27
420
+ 0,3,3,3,3,3,0,0, # 28 - 2f
421
+ 0,0,0,0,0,0,0,0, # 30 - 37
422
+ 0,0,0,0,0,0,0,0, # 38 - 3f
423
+ 0,0,0,0,0,0,0,0, # 40 - 47
424
+ 0,0,0,0,0,0,0,0, # 48 - 4f
425
+ 0,0,0,0,0,0,0,0, # 50 - 57
426
+ 0,0,0,0,0,0,0,0, # 58 - 5f
427
+ 0,0,0,0,0,0,0,0, # 60 - 67
428
+ 0,0,0,0,0,0,0,0, # 68 - 6f
429
+ 0,0,0,0,0,0,0,0, # 70 - 77
430
+ 0,0,0,0,0,0,0,0, # 78 - 7f
431
+ 0,0,0,0,0,0,0,0, # 80 - 87
432
+ 0,0,0,0,0,0,0,0, # 88 - 8f
433
+ 0,0,0,0,0,0,0,0, # 90 - 97
434
+ 0,0,0,0,0,0,0,0, # 98 - 9f
435
+ 0,0,0,0,0,0,0,0, # a0 - a7
436
+ 0,0,0,0,0,0,0,0, # a8 - af
437
+ 0,0,0,0,0,0,0,0, # b0 - b7
438
+ 0,0,0,0,0,0,0,0, # b8 - bf
439
+ 0,0,0,0,0,0,0,0, # c0 - c7
440
+ 0,0,0,0,0,0,0,0, # c8 - cf
441
+ 0,0,0,0,0,0,0,0, # d0 - d7
442
+ 0,0,0,0,0,0,0,0, # d8 - df
443
+ 0,0,0,0,0,0,0,0, # e0 - e7
444
+ 0,0,0,0,0,0,0,0, # e8 - ef
445
+ 0,0,0,0,0,0,0,0, # f0 - f7
446
+ 0,0,0,0,0,0,4,5 # f8 - ff
447
+ ]
448
+
449
+ UCS2LE_st = [ # flat state-transition table for UCS2-LE: 56 entries, 8 per row (row comment = flat index range in hex). Entries are either a numeric next state or one of EError/EItsMe/EStart, which are defined outside this chunk. NOTE(review): presumably indexed as state * classFactor (6) + byte class, so rows do not line up with states - confirm in the coding state machine.
450
+ 6, 6, 7, 6, 4, 3,EError,EError,#00-07
451
+ EError,EError,EError,EError,EItsMe,EItsMe,EItsMe,EItsMe,#08-0f
452
+ EItsMe,EItsMe, 5, 5, 5,EError,EItsMe,EError,#10-17
453
+ 5, 5, 5,EError, 5,EError, 6, 6,#18-1f
454
+ 7, 6, 8, 8, 5, 5, 5,EError,#20-27
455
+ 5, 5, 5,EError,EError,EError, 5, 5,#28-2f
456
+ 5, 5, 5,EError, 5,EError,EStart,EStart#30-37
457
+ ]
458
+
459
+ UCS2LECharLenTable = [2, 2, 2, 2, 2, 2] # one entry per character class 0..5 of UCS2LE_cls; unlike the BE table, the class-3 entry is 2 rather than 0. NOTE(review): presumably the character length in bytes associated with each class - confirm against the state machine that consumes 'charLenTable'.
460
+
461
+ UCS2LESMModel = {'classTable' => UCS2LE_cls, # model hash bundling the UCS2-LE tables under string keys; this entry is the byte -> class lookup
462
+ 'classFactor' => 6, # matches the six classes (0..5) used in UCS2LE_cls
463
+ 'stateTable' => UCS2LE_st, # flat transition table
464
+ 'charLenTable' => UCS2LECharLenTable, # per-class length table (6 entries)
465
+ 'name' => 'UTF-16LE' # charset name carried by this model
466
+ }
467
+
468
+ # UTF-8
469
+
470
+ UTF8_cls = [ # byte value -> character class (0..15); 256 entries (8 per row, row comment = byte range). Class 0: 0x0e, 0x0f, 0x1b, 0xc0, 0xc1, 0xfe, 0xff; class 1: all other bytes below 0x80; 0x80-0x83->2, 0x84-0x87->3, 0x88-0x9f->4, 0xa0-0xbf->5, 0xc2-0xdf->6, 0xe0->7, 0xe1-0xec/0xee/0xef->8, 0xed->9, 0xf0->10, 0xf1-0xf7->11, 0xf8->12, 0xf9-0xfb->13, 0xfc->14, 0xfd->15.
471
+ 1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
472
+ 1,1,1,1,1,1,0,0, # 08 - 0f
473
+ 1,1,1,1,1,1,1,1, # 10 - 17
474
+ 1,1,1,0,1,1,1,1, # 18 - 1f
475
+ 1,1,1,1,1,1,1,1, # 20 - 27
476
+ 1,1,1,1,1,1,1,1, # 28 - 2f
477
+ 1,1,1,1,1,1,1,1, # 30 - 37
478
+ 1,1,1,1,1,1,1,1, # 38 - 3f
479
+ 1,1,1,1,1,1,1,1, # 40 - 47
480
+ 1,1,1,1,1,1,1,1, # 48 - 4f
481
+ 1,1,1,1,1,1,1,1, # 50 - 57
482
+ 1,1,1,1,1,1,1,1, # 58 - 5f
483
+ 1,1,1,1,1,1,1,1, # 60 - 67
484
+ 1,1,1,1,1,1,1,1, # 68 - 6f
485
+ 1,1,1,1,1,1,1,1, # 70 - 77
486
+ 1,1,1,1,1,1,1,1, # 78 - 7f
487
+ 2,2,2,2,3,3,3,3, # 80 - 87
488
+ 4,4,4,4,4,4,4,4, # 88 - 8f
489
+ 4,4,4,4,4,4,4,4, # 90 - 97
490
+ 4,4,4,4,4,4,4,4, # 98 - 9f
491
+ 5,5,5,5,5,5,5,5, # a0 - a7
492
+ 5,5,5,5,5,5,5,5, # a8 - af
493
+ 5,5,5,5,5,5,5,5, # b0 - b7
494
+ 5,5,5,5,5,5,5,5, # b8 - bf
495
+ 0,0,6,6,6,6,6,6, # c0 - c7
496
+ 6,6,6,6,6,6,6,6, # c8 - cf
497
+ 6,6,6,6,6,6,6,6, # d0 - d7
498
+ 6,6,6,6,6,6,6,6, # d8 - df
499
+ 7,8,8,8,8,8,8,8, # e0 - e7
500
+ 8,8,8,8,8,9,8,8, # e8 - ef
501
+ 10,11,11,11,11,11,11,11, # f0 - f7
502
+ 12,13,13,13,14,15,0,0 # f8 - ff
503
+ ]
504
+
505
+ UTF8_st = [ # flat state-transition table for UTF-8: 208 entries, 8 per row (row comment = flat index range in hex). Entries are either a numeric next state or one of EError/EItsMe/EStart, which are defined outside this chunk. NOTE(review): presumably indexed as state * classFactor (16) + byte class, i.e. two rows per state - confirm in the coding state machine.
506
+ EError,EStart,EError,EError,EError,EError, 12, 10,#00-07
507
+ 9, 11, 8, 7, 6, 5, 4, 3,#08-0f
508
+ EError,EError,EError,EError,EError,EError,EError,EError,#10-17
509
+ EError,EError,EError,EError,EError,EError,EError,EError,#18-1f
510
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#20-27
511
+ EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,EItsMe,#28-2f
512
+ EError,EError, 5, 5, 5, 5,EError,EError,#30-37
513
+ EError,EError,EError,EError,EError,EError,EError,EError,#38-3f
514
+ EError,EError,EError, 5, 5, 5,EError,EError,#40-47
515
+ EError,EError,EError,EError,EError,EError,EError,EError,#48-4f
516
+ EError,EError, 7, 7, 7, 7,EError,EError,#50-57
517
+ EError,EError,EError,EError,EError,EError,EError,EError,#58-5f
518
+ EError,EError,EError,EError, 7, 7,EError,EError,#60-67
519
+ EError,EError,EError,EError,EError,EError,EError,EError,#68-6f
520
+ EError,EError, 9, 9, 9, 9,EError,EError,#70-77
521
+ EError,EError,EError,EError,EError,EError,EError,EError,#78-7f
522
+ EError,EError,EError,EError,EError, 9,EError,EError,#80-87
523
+ EError,EError,EError,EError,EError,EError,EError,EError,#88-8f
524
+ EError,EError, 12, 12, 12, 12,EError,EError,#90-97
525
+ EError,EError,EError,EError,EError,EError,EError,EError,#98-9f
526
+ EError,EError,EError,EError,EError, 12,EError,EError,#a0-a7
527
+ EError,EError,EError,EError,EError,EError,EError,EError,#a8-af
528
+ EError,EError, 12, 12, 12,EError,EError,EError,#b0-b7
529
+ EError,EError,EError,EError,EError,EError,EError,EError,#b8-bf
530
+ EError,EError,EStart,EStart,EStart,EStart,EError,EError,#c0-c7
531
+ EError,EError,EError,EError,EError,EError,EError,EError#c8-cf
532
+ ]
533
+
534
+ UTF8CharLenTable = [0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6] # one entry per character class 0..15 of UTF8_cls. NOTE(review): presumably the total byte length of a sequence whose lead byte has that class (0 for classes that cannot start a character) - confirm against the state machine that consumes 'charLenTable'.
535
+
536
+ UTF8SMModel = {'classTable' => UTF8_cls, # model hash bundling the UTF-8 tables under string keys; this entry is the byte -> class lookup
537
+ 'classFactor' => 16, # matches the sixteen classes (0..15) used in UTF8_cls
538
+ 'stateTable' => UTF8_st, # flat transition table
539
+ 'charLenTable' => UTF8CharLenTable, # per-class length table (16 entries)
540
+ 'name' => 'UTF-8' # charset name carried by this model
541
+ }
542
+ end