cchardet 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,175 @@
1
+ /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
+ /* ***** BEGIN LICENSE BLOCK *****
3
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
+ *
5
+ * The contents of this file are subject to the Mozilla Public License Version
6
+ * 1.1 (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ * http://www.mozilla.org/MPL/
9
+ *
10
+ * Software distributed under the License is distributed on an "AS IS" basis,
11
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
+ * for the specific language governing rights and limitations under the
13
+ * License.
14
+ *
15
+ * The Original Code is Mozilla Communicator client code.
16
+ *
17
+ * The Initial Developer of the Original Code is
18
+ * Netscape Communications Corporation.
19
+ * Portions created by the Initial Developer are Copyright (C) 1998
20
+ * the Initial Developer. All Rights Reserved.
21
+ *
22
+ * Contributor(s):
23
+ *
24
+ * Alternatively, the contents of this file may be used under the terms of
25
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
26
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
+ * in which case the provisions of the GPL or the LGPL are applicable instead
28
+ * of those above. If you wish to allow use of your version of this file only
29
+ * under the terms of either the GPL or the LGPL, and not to allow others to
30
+ * use your version of this file under the terms of the MPL, indicate your
31
+ * decision by deleting the provisions above and replace them with the notice
32
+ * and other provisions required by the GPL or the LGPL. If you do not delete
33
+ * the provisions above, a recipient may use your version of this file under
34
+ * the terms of any one of the MPL, the GPL or the LGPL.
35
+ *
36
+ * ***** END LICENSE BLOCK ***** */
37
+
38
+ #include "../nsSBCharSetProber.h"
39
+
40
+ /********* Language model for: Turkish *********/
41
+
42
+ /**
43
+ * Generated by BuildLangModel.py
44
+ * On: 2015-12-04 02:24:44.730727
45
+ **/
46
+
47
+ /* Character Mapping Table:
48
+ * ILL: illegal character.
49
+ * CTR: control character specific to the charset.
50
+ * RET: carriage/return.
51
+ * SYM: symbol (punctuation) that does not belong to word.
52
+ * NUM: 0 - 9.
53
+ *
54
+ * Other characters are ordered by probabilities
55
+ * (0 is the most common character in the language).
56
+ *
57
+ * Orders are generic to a language. So the codepoint with order X in
58
+ * CHARSET1 maps to the same character as the codepoint with the same
59
+ * order X in CHARSET2 for the same language.
60
+ * As such, it is possible to get missing order. For instance the
61
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
62
+ * even though they are both used for French. Same for the euro sign.
63
+ */
64
+ static const unsigned char Iso_8859_3_CharToOrderMap[] =
65
+ {
66
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
67
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
68
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
69
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
70
+ SYM, 0, 15, 21, 7, 1, 26, 22, 19, 6, 28, 9, 5, 11, 3, 14, /* 4X */
71
+ 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */
72
+ SYM, 0, 15, 21, 7, 1, 26, 22, 19, 2, 28, 9, 5, 11, 3, 14, /* 6X */
73
+ 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */
74
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
75
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
76
+ SYM, 48,SYM,SYM,SYM,ILL, 49,SYM,SYM, 2, 17, 25, 50,SYM,ILL, 51, /* AX */
77
+ SYM, 52,SYM,SYM,SYM,SYM, 53,SYM,SYM, 6, 17, 25, 54,SYM,ILL, 55, /* BX */
78
+ 41, 36, 30,ILL, 39, 56, 57, 24, 42, 33, 58, 45, 59, 37, 31, 60, /* CX */
79
+ ILL, 47, 61, 38, 62, 63, 27,SYM, 64, 65, 40, 35, 16, 66, 67, 68, /* DX */
80
+ 41, 36, 30,ILL, 39, 69, 70, 24, 42, 33, 71, 45, 72, 37, 31, 73, /* EX */
81
+ ILL, 47, 74, 38, 75, 76, 27,SYM, 77, 78, 40, 35, 16, 79, 80,SYM, /* FX */
82
+ };
83
+ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
84
+
85
+ static const unsigned char Iso_8859_9_CharToOrderMap[] =
86
+ {
87
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
88
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
89
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
90
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
91
+ SYM, 0, 15, 21, 7, 1, 26, 22, 19, 6, 28, 9, 5, 11, 3, 14, /* 4X */
92
+ 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,SYM, /* 5X */
93
+ SYM, 0, 15, 21, 7, 1, 26, 22, 19, 2, 28, 9, 5, 11, 3, 14, /* 6X */
94
+ 23, 34, 4, 10, 8, 12, 20, 29, 32, 13, 18,SYM,SYM,SYM,SYM,CTR, /* 7X */
95
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
96
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
97
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
98
+ SYM,SYM,SYM,SYM,SYM, 81,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
99
+ 41, 36, 30, 44, 39, 82, 46, 24, 42, 33, 83, 45, 84, 37, 31, 85, /* CX */
100
+ 25, 47, 86, 38, 87, 88, 27,SYM, 43, 89, 40, 35, 16, 2, 17, 90, /* DX */
101
+ 41, 36, 30, 44, 39, 91, 46, 24, 42, 33, 92, 45, 93, 37, 31, 94, /* EX */
102
+ 25, 47, 95, 38, 96, 97, 27,SYM, 43, 98, 40, 35, 16, 6, 17, 99, /* FX */
103
+ };
104
+ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
105
+
106
+
107
+ /* Model Table:
108
+ * Total sequences: 935
109
+ * First 512 sequences: 0.991865243864388
110
+ * Next 512 sequences (512-1024): 0.008134756135611957
111
+ * Rest: 2.949029909160572e-17
112
+ * Negative sequences: TODO
113
+ */
114
+ static const PRUint8 TurkishLangModel[] =
115
+ {
116
+ 3,2,3,3,3,3,2,3,3,3,3,3,3,3,2,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,2,0,
117
+ 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,2,0,3,0,2,0,
118
+ 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,2,0,2,0,
119
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,3,2,2,2,2,2,2,0,0,0,
120
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,2,2,2,2,
121
+ 3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,2,3,0,3,2,2,2,2,3,0,2,2,2,
122
+ 3,2,0,3,3,3,3,3,3,3,3,3,2,3,2,3,0,3,3,2,3,3,2,3,2,3,2,0,0,0,0,0,2,0,0,0,
123
+ 3,3,3,2,3,3,3,3,2,2,2,2,3,3,3,2,3,0,2,2,2,2,2,2,0,0,0,3,2,3,2,2,0,0,0,0,
124
+ 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,2,2,2,3,0,2,3,2,2,3,2,2,0,0,0,
125
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,3,2,2,2,2,3,0,2,3,2,2,3,0,0,0,0,2,
126
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,2,3,3,0,2,3,0,2,2,0,0,2,2,2,
127
+ 3,3,3,2,3,3,3,3,2,2,3,3,3,3,3,3,3,2,3,3,0,3,2,3,2,0,2,2,0,2,3,2,2,2,2,2,
128
+ 3,3,3,3,3,3,0,3,3,3,3,3,2,3,2,3,0,3,3,3,3,3,3,3,3,3,2,0,2,2,0,0,2,2,0,0,
129
+ 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,2,2,2,3,2,2,0,2,3,0,2,2,0,0,2,0,2,
130
+ 3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,3,0,2,2,2,0,0,
131
+ 3,3,3,3,3,3,3,3,0,2,2,3,3,3,3,3,3,0,2,2,2,2,0,2,0,0,0,3,2,2,2,0,0,2,0,0,
132
+ 2,2,2,3,3,3,0,3,3,3,3,3,0,3,2,3,0,3,3,3,3,3,2,3,3,3,3,0,2,0,0,0,0,0,0,0,
133
+ 3,3,3,0,2,3,3,2,3,3,2,3,3,2,2,3,3,2,0,2,2,2,2,2,3,0,2,2,0,0,2,2,0,0,0,0,
134
+ 3,3,3,2,2,3,3,3,2,2,0,3,3,3,3,2,3,0,2,2,0,3,3,0,0,0,0,2,0,0,2,2,0,0,0,0,
135
+ 3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,2,2,2,2,2,0,2,3,0,2,0,0,2,3,2,0,2,0,2,
136
+ 3,3,3,2,3,3,2,2,0,2,3,2,3,3,3,2,2,2,2,2,3,2,2,0,0,0,2,0,0,0,2,2,0,0,0,0,
137
+ 3,3,3,2,3,3,3,2,3,3,2,2,3,2,3,2,3,0,2,3,0,2,0,0,0,0,0,2,0,0,2,0,0,2,2,2,
138
+ 3,3,3,2,3,3,3,2,2,2,2,0,3,2,3,0,3,0,2,3,2,0,2,2,0,0,2,3,2,2,2,0,0,2,0,0,
139
+ 3,3,3,0,3,3,3,2,3,2,3,3,3,2,3,2,2,0,2,3,0,2,2,3,2,0,2,0,0,2,2,0,2,2,0,0,
140
+ 3,3,3,0,2,3,3,2,3,2,0,3,3,2,3,2,3,2,0,0,0,0,2,2,0,0,0,3,0,0,0,0,0,0,0,0,
141
+ 3,3,3,0,3,3,3,3,0,0,0,3,3,0,0,2,3,2,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
142
+ 3,3,3,2,3,3,3,2,3,2,2,0,3,3,3,2,2,0,0,2,0,2,2,0,2,0,2,2,2,0,2,2,0,0,0,0,
143
+ 0,0,0,3,3,3,0,3,3,3,3,3,0,3,0,2,0,2,3,2,2,0,0,2,3,3,2,0,2,0,0,0,0,0,0,0,
144
+ 3,3,3,0,0,2,2,2,0,2,0,0,3,0,3,0,2,0,0,0,0,2,2,2,0,0,0,2,0,0,2,0,0,0,0,0,
145
+ 3,3,3,2,2,2,0,0,0,2,2,2,2,2,3,2,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,
146
+ 0,0,2,3,3,3,0,3,2,2,2,2,0,2,0,2,0,2,2,3,2,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,
147
+ 0,0,0,2,0,2,0,2,2,0,0,2,0,2,0,0,0,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
148
+ 3,2,2,0,0,0,2,0,2,0,0,0,0,2,2,0,0,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,
149
+ 2,0,2,2,2,2,0,2,2,0,2,2,2,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
150
+ 2,0,2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
151
+ 0,0,0,0,2,2,0,2,0,0,2,2,0,0,0,2,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
152
+ };
153
+
154
+
155
+ const SequenceModel Iso_8859_3TurkishModel =
156
+ {
157
+ Iso_8859_3_CharToOrderMap,
158
+ TurkishLangModel,
159
+ 36,
160
+ (float)0.991865243864388,
161
+ PR_FALSE,
162
+ "ISO-8859-3",
163
+ "tr"
164
+ };
165
+
166
+ const SequenceModel Iso_8859_9TurkishModel =
167
+ {
168
+ Iso_8859_9_CharToOrderMap,
169
+ TurkishLangModel,
170
+ 36,
171
+ (float)0.991865243864388,
172
+ PR_FALSE,
173
+ "ISO-8859-9",
174
+ "tr"
175
+ };
@@ -0,0 +1,249 @@
1
+ /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
+ /* ***** BEGIN LICENSE BLOCK *****
3
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
+ *
5
+ * The contents of this file are subject to the Mozilla Public License Version
6
+ * 1.1 (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ * http://www.mozilla.org/MPL/
9
+ *
10
+ * Software distributed under the License is distributed on an "AS IS" basis,
11
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
+ * for the specific language governing rights and limitations under the
13
+ * License.
14
+ *
15
+ * The Original Code is Mozilla Communicator client code.
16
+ *
17
+ * The Initial Developer of the Original Code is
18
+ * Netscape Communications Corporation.
19
+ * Portions created by the Initial Developer are Copyright (C) 1998
20
+ * the Initial Developer. All Rights Reserved.
21
+ *
22
+ * Contributor(s):
23
+ *
24
+ * Alternatively, the contents of this file may be used under the terms of
25
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
26
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
+ * in which case the provisions of the GPL or the LGPL are applicable instead
28
+ * of those above. If you wish to allow use of your version of this file only
29
+ * under the terms of either the GPL or the LGPL, and not to allow others to
30
+ * use your version of this file under the terms of the MPL, indicate your
31
+ * decision by deleting the provisions above and replace them with the notice
32
+ * and other provisions required by the GPL or the LGPL. If you do not delete
33
+ * the provisions above, a recipient may use your version of this file under
34
+ * the terms of any one of the MPL, the GPL or the LGPL.
35
+ *
36
+ * ***** END LICENSE BLOCK ***** */
37
+
38
+ #include "../nsSBCharSetProber.h"
39
+
40
+ /********* Language model for: Vietnamese *********/
41
+
42
+ /**
43
+ * Generated by BuildLangModel.py
44
+ * On: 2016-02-13 03:42:06.561440
45
+ **/
46
+
47
+ /* Character Mapping Table:
48
+ * ILL: illegal character.
49
+ * CTR: control character specific to the charset.
50
+ * RET: carriage/return.
51
+ * SYM: symbol (punctuation) that does not belong to word.
52
+ * NUM: 0 - 9.
53
+ *
54
+ * Other characters are ordered by probabilities
55
+ * (0 is the most common character in the language).
56
+ *
57
+ * Orders are generic to a language. So the codepoint with order X in
58
+ * CHARSET1 maps to the same character as the codepoint with the same
59
+ * order X in CHARSET2 for the same language.
60
+ * As such, it is possible to get missing order. For instance the
61
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
62
+ * even though they are both used for French. Same for the euro sign.
63
+ */
64
+ static const unsigned char Windows_1258_CharToOrderMap[] =
65
+ {
66
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
67
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
68
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
69
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
70
+ SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 4X */
71
+ 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,SYM, /* 5X */
72
+ SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 6X */
73
+ 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,CTR, /* 7X */
74
+ SYM,ILL,SYM,101,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,100,ILL,ILL,ILL, /* 8X */
75
+ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,100,ILL,ILL,102, /* 9X */
76
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
77
+ SYM,SYM,SYM,SYM,SYM,103,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
78
+ 12, 15, 25, 51, 97,104, 98, 91, 90, 62, 27,105,SYM, 47,106,107, /* CX */
79
+ 10,108,SYM, 33, 29, 46, 93,SYM, 94, 58, 67,109, 96, 18,SYM, 99, /* DX */
80
+ 12, 15, 25, 51, 97,110, 98, 91, 90, 62, 27,111,SYM, 47,112,113, /* EX */
81
+ 10,114,SYM, 33, 29, 46, 93,SYM, 94, 58, 67,115, 96, 18,116,117, /* FX */
82
+ };
83
+ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
84
+
85
+ static const unsigned char Viscii_CharToOrderMap[] =
86
+ {
87
+ CTR,CTR, 88,CTR,CTR, 95, 77,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
88
+ CTR,CTR,CTR,CTR, 80,CTR,CTR,CTR,CTR, 79,CTR,CTR,CTR,CTR, 92,CTR, /* 1X */
89
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
90
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
91
+ SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 4X */
92
+ 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,SYM, /* 5X */
93
+ SYM, 6, 17, 3, 22, 21, 66, 5, 1, 4, 75, 24, 14, 8, 0, 9, /* 6X */
94
+ 16, 36, 11, 19, 2, 7, 13, 69, 54, 20, 82,SYM,SYM,SYM,SYM,CTR, /* 7X */
95
+ 30, 57, 71, 65, 41, 43, 78, 49, 83, 89, 23, 45, 39, 74, 28, 32, /* 8X */
96
+ 53, 60, 84, 31, 37, 40, 38, 59, 42, 81, 44, 73, 35, 72, 48, 76, /* 9X */
97
+ 86, 57, 71, 65, 41, 43, 78, 49, 83, 89, 23, 45, 39, 74, 28, 32, /* AX */
98
+ 53, 60, 84, 87, 46, 31, 38, 59, 42, 56, 52, 55, 70, 46, 40, 18, /* BX */
99
+ 12, 15, 25, 61, 34, 51, 88, 95, 90, 62, 27, 85, 50, 47, 64, 76, /* CX */
100
+ 10, 52, 63, 33, 29, 30, 80, 55, 70, 58, 67, 79, 92, 68, 87, 18, /* DX */
101
+ 12, 15, 25, 61, 34, 51, 26, 77, 90, 62, 27, 85, 50, 47, 64, 73, /* EX */
102
+ 10, 56, 63, 33, 29, 86, 81, 44, 48, 58, 67, 72, 35, 68, 37, 26, /* FX */
103
+ };
104
+ /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
105
+
106
+
107
+ /* Model Table:
108
+ * Total sequences: 1494
109
+ * First 512 sequences: 0.9321889118082535
110
+ * Next 512 sequences (512-1024): 0.06092051479986333
111
+ * Rest: 0.0068905733918831966
112
+ * Negative sequences: TODO
113
+ */
114
+ static const PRUint8 VietnameseLangModel[] =
115
+ {
116
+ 3,3,3,3,3,3,3,2,2,3,0,2,3,1,1,1,1,2,3,3,2,3,3,3,2,1,2,
117
+ 3,0,3,2,2,2,3,1,0,1,1,2,0,0,1,0,1,0,2,2,1,0,0,0,3,0,0,2,
118
+ 2,1,2,0,3,0,3,3,2,3,0,2,3,0,2,3,0,0,3,1,3,3,1,3,1,3,3,
119
+ 3,3,3,3,3,3,3,3,3,0,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,0,
120
+ 2,3,2,2,3,1,3,3,1,3,1,3,3,2,2,3,2,0,3,2,2,3,1,3,0,3,0,
121
+ 3,1,3,3,3,3,2,3,2,0,0,2,1,2,2,2,2,0,0,1,3,2,3,2,2,2,2,0,
122
+ 2,3,2,2,3,0,3,3,2,3,0,2,2,1,2,3,1,1,2,2,2,3,1,0,2,2,0,
123
+ 0,0,3,2,3,2,3,3,3,1,1,2,0,0,2,0,3,0,0,2,0,2,2,0,2,3,1,1,
124
+ 3,1,3,3,3,3,3,2,3,3,1,3,2,2,3,3,2,2,0,3,1,3,3,3,2,0,3,
125
+ 3,3,1,0,0,3,1,3,0,2,0,2,3,3,2,0,0,2,3,0,0,0,1,0,1,0,0,2,
126
+ 2,3,2,2,3,1,3,3,1,3,0,3,3,0,2,2,0,1,3,2,2,3,1,1,1,2,3,
127
+ 0,0,3,3,1,2,2,0,1,0,2,2,0,0,1,1,3,3,0,0,0,1,1,2,1,0,3,0,
128
+ 3,2,3,3,3,2,2,3,3,3,0,3,0,2,3,0,2,3,0,3,3,2,3,0,2,0,0,
129
+ 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,
130
+ 3,1,3,2,3,2,3,1,3,2,0,3,1,2,3,2,2,2,0,3,3,3,2,2,2,3,0,
131
+ 2,1,3,1,3,3,0,2,0,0,0,1,0,1,3,0,3,0,0,2,2,0,3,0,2,0,3,1,
132
+ 2,1,0,2,3,0,3,3,2,3,0,0,3,0,2,3,2,2,3,2,2,3,2,0,0,1,0,
133
+ 0,2,3,3,3,2,2,1,0,0,0,2,0,3,3,0,1,2,2,0,0,3,2,2,1,2,1,1,
134
+ 3,2,3,2,3,2,3,3,3,2,0,3,3,2,3,3,2,3,0,3,2,2,3,0,2,0,0,
135
+ 0,0,0,3,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,
136
+ 0,0,0,0,3,0,3,2,0,3,0,1,3,0,0,3,0,1,3,0,0,1,0,3,0,3,0,
137
+ 2,3,3,3,3,3,3,3,2,0,1,3,3,1,3,3,3,3,3,2,2,0,1,2,2,3,3,0,
138
+ 3,2,3,2,3,2,3,3,2,3,0,3,2,2,3,2,1,2,3,3,3,3,3,0,2,1,2,
139
+ 3,1,2,2,3,2,0,2,0,0,2,2,1,0,3,3,2,3,0,1,2,2,2,3,3,1,2,0,
140
+ 3,0,0,0,3,0,0,2,3,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,
141
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
142
+ 2,0,0,0,3,0,3,3,0,2,0,1,3,0,1,1,0,0,2,1,1,3,1,1,0,2,1,
143
+ 2,1,2,1,0,1,0,0,0,0,2,1,0,3,2,3,3,1,3,0,3,2,3,3,3,0,0,0,
144
+ 0,2,2,1,3,2,3,3,2,3,0,0,3,2,3,2,2,2,3,2,2,3,2,1,1,2,1,
145
+ 3,2,2,3,3,2,1,0,0,0,3,2,0,3,2,3,2,1,0,1,2,2,3,0,2,0,0,1,
146
+ 3,0,3,3,3,1,0,2,3,3,0,1,0,0,1,0,3,0,0,1,3,0,0,0,0,0,0,
147
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
148
+ 0,3,2,0,3,0,3,2,1,3,0,3,0,0,2,0,2,1,0,2,2,3,1,0,0,0,0,
149
+ 2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
150
+ 2,1,0,2,3,1,3,3,0,3,0,3,3,0,3,3,0,3,1,2,2,3,1,1,1,0,0,
151
+ 2,1,0,2,3,3,2,3,0,0,0,1,0,2,2,3,2,0,1,0,2,1,2,3,0,2,3,0,
152
+ 3,0,1,1,2,0,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
153
+ 0,0,0,0,0,0,0,0,0,0,3,3,0,3,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
154
+ 1,3,3,3,3,1,3,3,2,3,0,1,2,0,2,3,2,2,2,3,2,3,2,0,2,2,0,
155
+ 0,0,2,1,0,3,2,2,0,1,1,1,1,1,1,0,0,0,0,2,0,1,0,0,1,2,1,0,
156
+ 2,0,1,2,1,0,2,2,1,2,0,2,0,0,1,1,2,1,0,2,0,2,1,3,1,0,0,
157
+ 3,2,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,
158
+ 3,2,3,2,2,2,3,2,3,3,0,3,0,2,3,1,2,2,0,3,2,3,3,0,2,0,0,
159
+ 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
160
+ 1,1,1,2,3,1,3,3,0,3,0,3,3,1,2,1,0,0,3,2,2,3,2,0,1,3,1,
161
+ 1,0,0,3,1,1,1,0,0,0,0,1,0,0,3,3,2,1,0,1,0,3,2,1,1,2,1,0,
162
+ 3,0,3,2,0,0,0,3,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
163
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
164
+ 1,3,1,0,3,1,3,2,0,2,0,2,0,1,2,0,0,1,0,2,2,2,0,3,1,0,0,
165
+ 2,0,1,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,3,0,0,2,0,0,0,1,
166
+ 3,0,1,1,0,0,0,3,3,0,0,0,0,0,1,0,1,0,0,0,3,0,0,0,0,0,0,
167
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
168
+ 3,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
169
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
170
+ 3,0,2,1,0,0,0,3,3,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,
171
+ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172
+ 3,0,3,3,0,0,0,3,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
173
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
174
+ 3,0,2,2,3,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,
175
+ 0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
176
+ 3,0,3,3,3,0,0,0,2,3,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,
177
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
178
+ 3,0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
179
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
180
+ 3,0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
181
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
182
+ 3,0,1,2,3,0,3,0,2,0,0,1,0,1,0,0,2,0,0,0,0,0,0,0,1,0,0,
183
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
184
+ 3,0,0,0,3,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
185
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
186
+ 2,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
187
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
188
+ 0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
189
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
190
+ 3,0,2,3,3,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
191
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192
+ 3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
193
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194
+ 3,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
195
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196
+ 3,0,2,3,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
197
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198
+ 3,0,3,0,0,0,0,3,3,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,
199
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200
+ 3,0,1,3,0,0,3,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
201
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202
+ 3,0,0,0,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
203
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204
+ 3,0,1,3,3,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
205
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206
+ 3,0,0,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
207
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208
+ 3,0,1,1,3,0,0,2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
209
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210
+ 3,0,2,3,0,0,2,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,
211
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212
+ 3,0,1,3,1,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,
213
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214
+ 3,0,3,1,0,0,0,2,2,0,0,0,0,0,0,0,3,0,0,0,2,0,0,0,0,0,0,
215
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216
+ 3,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
217
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218
+ 3,0,1,1,1,0,0,0,3,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,
219
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220
+ 3,0,2,3,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
221
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222
+ 3,0,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
223
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224
+ 1,1,2,1,2,0,3,3,0,1,0,0,0,2,0,3,1,2,2,0,1,3,0,2,0,2,0,
225
+ 2,0,2,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,1,2,0,0,1,1,2,0,2,
226
+ };
227
+
228
+
229
+ const SequenceModel Windows_1258VietnameseModel =
230
+ {
231
+ Windows_1258_CharToOrderMap,
232
+ VietnameseLangModel,
233
+ 55,
234
+ (float)0.9321889118082535,
235
+ PR_FALSE,
236
+ "WINDOWS-1258",
237
+ "vi"
238
+ };
239
+
240
+ const SequenceModel VisciiVietnameseModel =
241
+ {
242
+ Viscii_CharToOrderMap,
243
+ VietnameseLangModel,
244
+ 55,
245
+ (float)0.9321889118082535,
246
+ PR_FALSE,
247
+ "VISCII",
248
+ "vi"
249
+ };
@@ -0,0 +1,88 @@
1
+ /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
+ /* ***** BEGIN LICENSE BLOCK *****
3
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
+ *
5
+ * The contents of this file are subject to the Mozilla Public License Version
6
+ * 1.1 (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ * http://www.mozilla.org/MPL/
9
+ *
10
+ * Software distributed under the License is distributed on an "AS IS" basis,
11
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
+ * for the specific language governing rights and limitations under the
13
+ * License.
14
+ *
15
+ * The Original Code is mozilla.org code.
16
+ *
17
+ * The Initial Developer of the Original Code is
18
+ * Netscape Communications Corporation.
19
+ * Portions created by the Initial Developer are Copyright (C) 1998
20
+ * the Initial Developer. All Rights Reserved.
21
+ *
22
+ * Contributor(s):
23
+ *
24
+ * Alternatively, the contents of this file may be used under the terms of
25
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
26
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
+ * in which case the provisions of the GPL or the LGPL are applicable instead
28
+ * of those above. If you wish to allow use of your version of this file only
29
+ * under the terms of either the GPL or the LGPL, and not to allow others to
30
+ * use your version of this file under the terms of the MPL, indicate your
31
+ * decision by deleting the provisions above and replace them with the notice
32
+ * and other provisions required by the GPL or the LGPL. If you do not delete
33
+ * the provisions above, a recipient may use your version of this file under
34
+ * the terms of any one of the MPL, the GPL or the LGPL.
35
+ *
36
+ * ***** END LICENSE BLOCK ***** */
37
+
38
+ #include "nsBig5Prober.h"
39
+
40
+ void nsBig5Prober::Reset(void)
41
+ {
42
+ mCodingSM->Reset();
43
+ mState = eDetecting;
44
+ mDistributionAnalyser.Reset(mIsPreferredLanguage);
45
+ }
46
+
47
+ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
48
+ {
49
+ PRUint32 codingState;
50
+
51
+ for (PRUint32 i = 0; i < aLen; i++)
52
+ {
53
+ codingState = mCodingSM->NextState(aBuf[i]);
54
+ if (codingState == eItsMe)
55
+ {
56
+ mState = eFoundIt;
57
+ break;
58
+ }
59
+ if (codingState == eStart)
60
+ {
61
+ PRUint32 charLen = mCodingSM->GetCurrentCharLen();
62
+
63
+ if (i == 0)
64
+ {
65
+ mLastChar[1] = aBuf[0];
66
+ mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
67
+ }
68
+ else
69
+ mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
70
+ }
71
+ }
72
+
73
+ mLastChar[0] = aBuf[aLen-1];
74
+
75
+ if (mState == eDetecting)
76
+ if (mDistributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
77
+ mState = eFoundIt;
78
+
79
+ return mState;
80
+ }
81
+
82
+ float nsBig5Prober::GetConfidence(void)
83
+ {
84
+ float distribCf = mDistributionAnalyser.GetConfidence();
85
+
86
+ return (float)distribCf;
87
+ }
88
+