cchardet 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,160 @@
1
+ set(
2
+ UCHARDET_HEADERS
3
+ uchardet.h
4
+ ) # Header list consumed by the install(FILES ...) rule later in this script.
5
+ # Source list passed to every add_library() call in this script (the main target and the optional static one).
6
+ set(
7
+ UCHARDET_SOURCES
8
+ CharDistribution.cpp
9
+ JpCntx.cpp
10
+ LangModels/LangArabicModel.cpp
11
+ LangModels/LangBulgarianModel.cpp
12
+ LangModels/LangCroatianModel.cpp
13
+ LangModels/LangCzechModel.cpp
14
+ LangModels/LangEsperantoModel.cpp
15
+ LangModels/LangEstonianModel.cpp
16
+ LangModels/LangFinnishModel.cpp
17
+ LangModels/LangFrenchModel.cpp
18
+ LangModels/LangDanishModel.cpp
19
+ LangModels/LangGermanModel.cpp
20
+ LangModels/LangGreekModel.cpp
21
+ LangModels/LangHungarianModel.cpp
22
+ LangModels/LangHebrewModel.cpp
23
+ LangModels/LangIrishModel.cpp
24
+ LangModels/LangItalianModel.cpp
25
+ LangModels/LangLithuanianModel.cpp
26
+ LangModels/LangLatvianModel.cpp
27
+ LangModels/LangMalteseModel.cpp
28
+ LangModels/LangPolishModel.cpp
29
+ LangModels/LangPortugueseModel.cpp
30
+ LangModels/LangRomanianModel.cpp
31
+ LangModels/LangRussianModel.cpp
32
+ LangModels/LangSlovakModel.cpp
33
+ LangModels/LangSloveneModel.cpp
34
+ LangModels/LangSwedishModel.cpp
35
+ LangModels/LangSpanishModel.cpp
36
+ LangModels/LangThaiModel.cpp
37
+ LangModels/LangTurkishModel.cpp
38
+ LangModels/LangVietnameseModel.cpp
39
+ nsHebrewProber.cpp
40
+ nsCharSetProber.cpp
41
+ nsBig5Prober.cpp
42
+ nsEUCJPProber.cpp
43
+ nsEUCKRProber.cpp
44
+ nsEUCTWProber.cpp
45
+ nsEscCharsetProber.cpp
46
+ nsEscSM.cpp
47
+ nsGB2312Prober.cpp
48
+ nsMBCSGroupProber.cpp
49
+ nsMBCSSM.cpp
50
+ nsSBCSGroupProber.cpp
51
+ nsSBCharSetProber.cpp
52
+ nsSJISProber.cpp
53
+ nsUTF8Prober.cpp
54
+ nsLatin1Prober.cpp
55
+ nsUniversalDetector.cpp
56
+ uchardet.cpp
57
+ )
58
+ # Name of the main library target: set once for this directory and once with PARENT_SCOPE so the including directory sees the same value.
59
+ set (UCHARDET_LIBRARY libuchardet)
60
+ set (UCHARDET_LIBRARY libuchardet PARENT_SCOPE)
61
+ # The extra static target is only named when BUILD_STATIC and BUILD_SHARED_LIBS are both enabled; later blocks test this variable.
62
+ if (BUILD_STATIC AND BUILD_SHARED_LIBS)
63
+ set (UCHARDET_STATIC_LIBRARY libuchardet_static)
64
+ endif ()
65
+ # Directory-wide flags: a VERSION macro carrying ${UCHARDET_VERSION} as a string, plus -Wall.
66
+ add_definitions(
67
+ -DVERSION="${UCHARDET_VERSION}"
68
+ -Wall
69
+ )
70
+ # Debug builds additionally get -O0 and -g3. NOTE(review): these (and -Wall above) are compiler options rather than definitions; add_compile_options() looks like the intended command - confirm before changing.
71
+ if (CMAKE_BUILD_TYPE MATCHES Debug)
72
+ add_definitions(
73
+ -O0
74
+ -g3
75
+ )
76
+ endif (CMAKE_BUILD_TYPE MATCHES Debug)
77
+ # Main library target. No STATIC/SHARED keyword is given here, so the library type is left to CMake's default selection.
78
+ add_library(
79
+ ${UCHARDET_LIBRARY}
80
+ ${UCHARDET_SOURCES}
81
+ )
82
+ target_compile_definitions("${UCHARDET_LIBRARY}" PRIVATE BUILDING_UCHARDET) # defined only while compiling the library itself
83
+ if(BUILD_SHARED_LIBS)
84
+ target_compile_definitions("${UCHARDET_LIBRARY}" PUBLIC UCHARDET_SHARED) # PUBLIC: also propagated to targets that link against the library
85
+ endif()
86
+ # Optional second target built explicitly as STATIC from the same sources (only when UCHARDET_STATIC_LIBRARY was set).
87
+ if (UCHARDET_STATIC_LIBRARY)
88
+ add_library(
89
+ ${UCHARDET_STATIC_LIBRARY}
90
+ STATIC
91
+ ${UCHARDET_SOURCES}
92
+ )
93
+ target_compile_definitions("${UCHARDET_STATIC_LIBRARY}" PRIVATE BUILDING_UCHARDET)
94
+ endif (UCHARDET_STATIC_LIBRARY)
95
+ # Main target: link as C++, name the output file after ${PACKAGE_NAME}, and stamp it with VERSION / SOVERSION.
96
+ set_target_properties(
97
+ ${UCHARDET_LIBRARY}
98
+ PROPERTIES
99
+ LINKER_LANGUAGE
100
+ CXX
101
+ OUTPUT_NAME
102
+ ${PACKAGE_NAME}
103
+ VERSION
104
+ ${UCHARDET_VERSION}
105
+ SOVERSION
106
+ ${UCHARDET_VERSION_MAJOR}
107
+ )
108
+ # Static target: same linker language and output name, but no VERSION / SOVERSION properties are set.
109
+ if (UCHARDET_STATIC_LIBRARY)
110
+ set_target_properties(
111
+ ${UCHARDET_STATIC_LIBRARY}
112
+ PROPERTIES
113
+ LINKER_LANGUAGE
114
+ CXX
115
+ OUTPUT_NAME
116
+ ${PACKAGE_NAME}
117
+ )
118
+ endif (UCHARDET_STATIC_LIBRARY)
119
+ # Install the main target. Non-Windows uses LIBRARY + ARCHIVE destinations; the WIN32 branch swaps LIBRARY for RUNTIME (into ${CMAKE_INSTALL_BINDIR}).
120
+ if (NOT WIN32)
121
+ install(
122
+ TARGETS
123
+ ${UCHARDET_LIBRARY}
124
+ LIBRARY DESTINATION
125
+ ${CMAKE_INSTALL_LIBDIR}
126
+ ARCHIVE DESTINATION
127
+ ${CMAKE_INSTALL_LIBDIR}
128
+ )
129
+ else (NOT WIN32)
130
+ install(
131
+ TARGETS
132
+ ${UCHARDET_LIBRARY}
133
+ RUNTIME DESTINATION
134
+ ${CMAKE_INSTALL_BINDIR}
135
+ ARCHIVE DESTINATION
136
+ ${CMAKE_INSTALL_LIBDIR}
137
+ )
138
+ endif (NOT WIN32)
139
+ # The optional static target only needs an ARCHIVE destination.
140
+ if (UCHARDET_STATIC_LIBRARY)
141
+ install(
142
+ TARGETS
143
+ ${UCHARDET_STATIC_LIBRARY}
144
+ ARCHIVE DESTINATION
145
+ ${CMAKE_INSTALL_LIBDIR}
146
+ )
147
+ endif (UCHARDET_STATIC_LIBRARY)
148
+ # Headers listed in UCHARDET_HEADERS go into a ${PACKAGE_NAME} subdirectory of the include directory.
149
+ install(
150
+ FILES
151
+ ${UCHARDET_HEADERS}
152
+ DESTINATION
153
+ ${CMAKE_INSTALL_INCLUDEDIR}/${PACKAGE_NAME}
154
+ )
155
+ # Pull in symbols.cmake from this directory (its contents are not visible here).
156
+ include(symbols.cmake)
157
+ # The tools/ subdirectory is only added to the build when BUILD_BINARY is enabled.
158
+ if (BUILD_BINARY)
159
+ add_subdirectory(tools)
160
+ endif (BUILD_BINARY)
@@ -0,0 +1,109 @@
1
+ /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
+ /* ***** BEGIN LICENSE BLOCK *****
3
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
+ *
5
+ * The contents of this file are subject to the Mozilla Public License Version
6
+ * 1.1 (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ * http://www.mozilla.org/MPL/
9
+ *
10
+ * Software distributed under the License is distributed on an "AS IS" basis,
11
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
+ * for the specific language governing rights and limitations under the
13
+ * License.
14
+ *
15
+ * The Original Code is Mozilla Communicator client code.
16
+ *
17
+ * The Initial Developer of the Original Code is
18
+ * Netscape Communications Corporation.
19
+ * Portions created by the Initial Developer are Copyright (C) 1998
20
+ * the Initial Developer. All Rights Reserved.
21
+ *
22
+ * Contributor(s):
23
+ *
24
+ * Alternatively, the contents of this file may be used under the terms of
25
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
26
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
+ * in which case the provisions of the GPL or the LGPL are applicable instead
28
+ * of those above. If you wish to allow use of your version of this file only
29
+ * under the terms of either the GPL or the LGPL, and not to allow others to
30
+ * use your version of this file under the terms of the MPL, indicate your
31
+ * decision by deleting the provisions above and replace them with the notice
32
+ * and other provisions required by the GPL or the LGPL. If you do not delete
33
+ * the provisions above, a recipient may use your version of this file under
34
+ * the terms of any one of the MPL, the GPL or the LGPL.
35
+ *
36
+ * ***** END LICENSE BLOCK ***** */
37
+
38
+ #include "CharDistribution.h"
39
+
40
+ #include "JISFreq.tab"
41
+ #include "Big5Freq.tab"
42
+ #include "EUCKRFreq.tab"
43
+ #include "EUCTWFreq.tab"
44
+ #include "GB2312Freq.tab"
45
+
46
+ #define SURE_YES 0.99f
47
+ #define SURE_NO 0.01f
48
+
49
+ // Return a confidence value based on the characters counted so far: SURE_NO, the computed ratio r (when below SURE_YES), or SURE_YES.
50
+ float CharDistributionAnalysis::GetConfidence(void)
51
+ {
52
+ // If we didn't receive any character in our consideration range, or the
53
+ // number of frequent characters does not exceed the minimum threshold
54
+ // (note the <=: exactly mDataThreshold is still too few), return the negative answer.
55
+ if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
56
+ return SURE_NO;
57
+ // When every counted character was a frequent one, the denominator below would be zero, so that case skips straight to SURE_YES.
58
+ if (mTotalChars != mFreqChars) {
59
+ float r = mFreqChars / ((mTotalChars - mFreqChars) * mTypicalDistributionRatio);
60
+ // r = frequent / (non-frequent * typical ratio); it is only returned when it is below the SURE_YES cap.
61
+ if (r < SURE_YES)
62
+ return r;
63
+ }
64
+ // Cap the confidence at SURE_YES (we don't want to be 100% sure).
65
+ return SURE_YES;
66
+ }
67
+ // EUC-TW analyser: point the base-class fields at the EUC-TW frequency table, its size and its typical distribution ratio.
68
+ EUCTWDistributionAnalysis::EUCTWDistributionAnalysis()
69
+ {
70
+ mCharToFreqOrder = EUCTWCharToFreqOrder; // presumably defined in the included EUCTWFreq.tab - confirm
71
+ mTableSize = EUCTW_TABLE_SIZE;
72
+ mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO;
73
+ }
74
+ // EUC-KR analyser: point the base-class fields at the EUC-KR frequency table, its size and its typical distribution ratio.
75
+ EUCKRDistributionAnalysis::EUCKRDistributionAnalysis()
76
+ {
77
+ mCharToFreqOrder = EUCKRCharToFreqOrder; // presumably defined in the included EUCKRFreq.tab - confirm
78
+ mTableSize = EUCKR_TABLE_SIZE;
79
+ mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
80
+ }
81
+ // GB2312 analyser: point the base-class fields at the GB2312 frequency table, its size and its typical distribution ratio.
82
+ GB2312DistributionAnalysis::GB2312DistributionAnalysis()
83
+ {
84
+ mCharToFreqOrder = GB2312CharToFreqOrder; // presumably defined in the included GB2312Freq.tab - confirm
85
+ mTableSize = GB2312_TABLE_SIZE;
86
+ mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
87
+ }
88
+ // Big5 analyser: point the base-class fields at the Big5 frequency table, its size and its typical distribution ratio.
89
+ Big5DistributionAnalysis::Big5DistributionAnalysis()
90
+ {
91
+ mCharToFreqOrder = Big5CharToFreqOrder; // presumably defined in the included Big5Freq.tab - confirm
92
+ mTableSize = BIG5_TABLE_SIZE;
93
+ mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
94
+ }
95
+ // Shift_JIS analyser: uses the JIS frequency table, size and ratio (the same JIS_* values the EUC-JP constructor in this file assigns).
96
+ SJISDistributionAnalysis::SJISDistributionAnalysis()
97
+ {
98
+ mCharToFreqOrder = JISCharToFreqOrder; // presumably defined in the included JISFreq.tab - confirm
99
+ mTableSize = JIS_TABLE_SIZE;
100
+ mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
101
+ }
102
+ // EUC-JP analyser: uses the JIS frequency table, size and ratio (the same JIS_* values the Shift_JIS constructor in this file assigns).
103
+ EUCJPDistributionAnalysis::EUCJPDistributionAnalysis()
104
+ {
105
+ mCharToFreqOrder = JISCharToFreqOrder; // presumably defined in the included JISFreq.tab - confirm
106
+ mTableSize = JIS_TABLE_SIZE;
107
+ mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO;
108
+ }
109
+
@@ -0,0 +1,242 @@
1
+ /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
+ /* ***** BEGIN LICENSE BLOCK *****
3
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
+ *
5
+ * The contents of this file are subject to the Mozilla Public License Version
6
+ * 1.1 (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ * http://www.mozilla.org/MPL/
9
+ *
10
+ * Software distributed under the License is distributed on an "AS IS" basis,
11
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
+ * for the specific language governing rights and limitations under the
13
+ * License.
14
+ *
15
+ * The Original Code is Mozilla Communicator client code.
16
+ *
17
+ * The Initial Developer of the Original Code is
18
+ * Netscape Communications Corporation.
19
+ * Portions created by the Initial Developer are Copyright (C) 1998
20
+ * the Initial Developer. All Rights Reserved.
21
+ *
22
+ * Contributor(s):
23
+ *
24
+ * Alternatively, the contents of this file may be used under the terms of
25
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
26
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
+ * in which case the provisions of the GPL or the LGPL are applicable instead
28
+ * of those above. If you wish to allow use of your version of this file only
29
+ * under the terms of either the GPL or the LGPL, and not to allow others to
30
+ * use your version of this file under the terms of the MPL, indicate your
31
+ * decision by deleting the provisions above and replace them with the notice
32
+ * and other provisions required by the GPL or the LGPL. If you do not delete
33
+ * the provisions above, a recipient may use your version of this file under
34
+ * the terms of any one of the MPL, the GPL or the LGPL.
35
+ *
36
+ * ***** END LICENSE BLOCK ***** */
37
+
38
+ #ifndef CharDistribution_h__
39
+ #define CharDistribution_h__
40
+
41
+ #include "nscore.h"
42
+
43
+ #define ENOUGH_DATA_THRESHOLD 1024
44
+
45
+ #define MINIMUM_DATA_THRESHOLD 4
46
+ // Base class for two-byte character frequency analysis. Subclasses override GetOrder() and fill in the frequency table, its size and the typical distribution ratio.
47
+ class CharDistributionAnalysis
48
+ {
49
+ public:
50
+ CharDistributionAnalysis() {Reset(PR_FALSE);}
51
+
52
+ // Feed a block of data and do distribution analysis. The body is empty here, so this is currently a no-op.
53
+ void HandleData(const char* aBuf, PRUint32 aLen) {}
54
+
55
+ // Feed one character of known byte length (aCharLen); aStr points at its first byte.
56
+ void HandleOneChar(const char* aStr, PRUint32 aCharLen)
57
+ {
58
+ PRInt32 order;
59
+
60
+ // Only 2-byte characters take part in the distribution analysis; any other length is treated as "no order" (-1).
61
+ order = (aCharLen == 2) ? GetOrder(aStr) : -1;
62
+
63
+ if (order >= 0)
64
+ {
65
+ mTotalChars++;
66
+ // The order is non-negative; it counts as frequent only if it indexes inside the table and its frequency order is below 512.
67
+ if ((PRUint32)order < mTableSize)
68
+ {
69
+ if (512 > mCharToFreqOrder[order])
70
+ mFreqChars++;
71
+ }
72
+ }
73
+ }
74
+
75
+ // Return confidence based on the data handled so far.
76
+ float GetConfidence(void);
77
+
78
+ // Reset the analyser: clears the counters and the done flag, and sets the data threshold to 0 for a preferred language or MINIMUM_DATA_THRESHOLD otherwise.
79
+ void Reset(PRBool aIsPreferredLanguage)
80
+ {
81
+ mDone = PR_FALSE;
82
+ mTotalChars = 0;
83
+ mFreqChars = 0;
84
+ mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
85
+ }
86
+
87
+ // This function is reserved for future extension, so that a caller could control
88
+ // the analyser's behavior. It currently has an empty body (the name's spelling is part of the existing interface).
89
+ void SetOpion(){}
90
+
91
+ // It is not necessary to receive all data to draw a conclusion. For charset detection
92
+ // a certain amount of data is enough: true once more than ENOUGH_DATA_THRESHOLD characters have been counted.
93
+ PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
94
+
95
+ protected:
96
+ // We do not handle a character based on its original encoding bytes, but
97
+ // convert those bytes to a number, here called the order.
98
+ // This allows multiple encodings of a language to share one frequency table. The base implementation returns -1 (no order).
99
+ virtual PRInt32 GetOrder(const char* str) {return -1;}
100
+
101
+ // If this flag is set to PR_TRUE, detection is done and a conclusion has been made. In this class it is only ever assigned PR_FALSE (in Reset).
102
+ PRBool mDone;
103
+
104
+ // The number of counted characters whose frequency order is less than 512.
105
+ PRUint32 mFreqChars;
106
+
107
+ // Total characters encountered, i.e. those for which GetOrder() produced a non-negative order.
108
+ PRUint32 mTotalChars;
109
+
110
+ // Number of hi-byte characters needed to trigger detection (assigned in Reset).
111
+ PRUint32 mDataThreshold;
112
+
113
+ // Mapping table from character order (as returned by GetOrder()) to frequency order.
114
+ const PRInt16 *mCharToFreqOrder;
115
+
116
+ // Number of entries in the table above; HandleOneChar bounds-checks the order against it.
117
+ PRUint32 mTableSize;
118
+
119
+ // A constant that varies from language to language and is used in
120
+ // calculating confidence. See the original author's paper for further detail.
121
+ float mTypicalDistributionRatio;
122
+ };
123
+
124
+ // Distribution analyser for EUC-TW: supplies the byte-pair to order mapping used by the base class.
125
+ class EUCTWDistributionAnalysis: public CharDistributionAnalysis
126
+ {
127
+ public:
128
+ EUCTWDistributionAnalysis();
129
+ protected:
130
+ // Maps a 2-byte EUC-TW character to order = 94 * (lead - 0xc4) + (trail - 0xa1); 94 is the number of trail values in 0xa1..0xfe.
131
+ // For EUC-TW encoding, we are interested in
132
+ // first byte range: 0xc4 -- 0xfe
133
+ // second byte range: 0xa1 -- 0xfe
134
+ // No validation is done here; the original comment states the state machine has already done it. NOTE(review): unlike the GB2312 analyser, the trail byte is not checked against 0xa1 - confirm the state machine guarantees it.
135
+ PRInt32 GetOrder(const char* str)
136
+ { if ((unsigned char)*str >= (unsigned char)0xc4)
137
+ return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
138
+ else
139
+ return -1;
140
+ }
141
+ };
142
+
143
+ // Distribution analyser for EUC-KR: order = 94 * (lead - 0xb0) + (trail - 0xa1) for lead bytes >= 0xb0, otherwise -1.
144
+ class EUCKRDistributionAnalysis : public CharDistributionAnalysis
145
+ {
146
+ public:
147
+ EUCKRDistributionAnalysis();
148
+ protected:
149
+ // For EUC-KR encoding, we are interested in
150
+ // first byte range: 0xb0 -- 0xfe
151
+ // second byte range: 0xa1 -- 0xfe
152
+ // No validation is done here; the original comment states the state machine has already done it. NOTE(review): unlike the GB2312 analyser, the trail byte is not checked against 0xa1 - confirm the state machine guarantees it.
153
+ PRInt32 GetOrder(const char* str)
154
+ { if ((unsigned char)*str >= (unsigned char)0xb0)
155
+ return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
156
+ else
157
+ return -1;
158
+ }
159
+ };
160
+ // Distribution analyser for GB2312: order = 94 * (lead - 0xb0) + (trail - 0xa1); both bytes are range-checked before the mapping is applied.
161
+ class GB2312DistributionAnalysis : public CharDistributionAnalysis
162
+ {
163
+ public:
164
+ GB2312DistributionAnalysis();
165
+ protected:
166
+ // For GB2312 encoding, we are interested in
167
+ // first byte range: 0xb0 -- 0xfe
168
+ // second byte range: 0xa1 -- 0xfe
169
+ // Only the lower bounds (lead >= 0xb0, trail >= 0xa1) are checked here; anything below returns -1. Further validation is left to the state machine per the original comment.
170
+ PRInt32 GetOrder(const char* str)
171
+ { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
172
+ return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
173
+ else
174
+ return -1;
175
+ }
176
+ };
177
+
178
+ // Distribution analyser for Big5: each lead byte from 0xa4 covers 157 orders (63 for trail 0x40..0x7e, then 94 for trail 0xa1..0xfe).
179
+ class Big5DistributionAnalysis : public CharDistributionAnalysis
180
+ {
181
+ public:
182
+ Big5DistributionAnalysis();
183
+ protected:
184
+ // For Big5 encoding, we are interested in
185
+ // first byte range: 0xa4 -- 0xfe
186
+ // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
187
+ // No validation is done here; the original comment states the state machine has already done it.
188
+ PRInt32 GetOrder(const char* str)
189
+ { if ((unsigned char)*str >= (unsigned char)0xa4)
190
+ if ((unsigned char)str[1] >= (unsigned char)0xa1) // high trail range: offset by the 63 low-range slots
191
+ return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
192
+ else // low trail range, counted from 0x40 (this else pairs with the inner if)
193
+ return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
194
+ else // lead byte below 0xa4: not in the range of interest
195
+ return -1;
196
+ }
197
+ };
198
+ // Distribution analyser for Shift_JIS: 188 orders per lead byte; lead bytes 0x81..0x9f come first (31 values), then 0xe0..0xef.
199
+ class SJISDistributionAnalysis : public CharDistributionAnalysis
200
+ {
201
+ public:
202
+ SJISDistributionAnalysis();
203
+ protected:
204
+ // For Shift_JIS encoding, we are interested in
205
+ // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xef (the code below rejects lead bytes above 0xef)
206
+ // second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
207
+ // No validation is done here; the original comment states the state machine has already done it.
208
+ PRInt32 GetOrder(const char* str)
209
+ {
210
+ PRInt32 order;
211
+ if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
212
+ order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
213
+ else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
214
+ order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); // +31 skips the rows used by lead bytes 0x81..0x9f
215
+ else
216
+ return -1;
217
+ order += (unsigned char)*(str+1) - 0x40; // trail bytes are counted from 0x40
218
+ if ((unsigned char)str[1] > (unsigned char)0x7f) // trail bytes above 0x7f are shifted down by one, closing the gap at 0x7f
219
+ order--;
220
+ return order;
221
+ }
222
+ };
223
+ // Distribution analyser for EUC-JP: order = 94 * (lead - 0xa1) + (trail - 0xa1) for lead bytes passing the guard, otherwise -1.
224
+ class EUCJPDistributionAnalysis : public CharDistributionAnalysis
225
+ {
226
+ public:
227
+ EUCJPDistributionAnalysis();
228
+ protected:
229
+ // For EUC-JP encoding, we are interested in
230
+ // first byte range: 0xa0 -- 0xfe
231
+ // second byte range: 0xa1 -- 0xfe
232
+ // No validation is done here; the original comment states the state machine has already done it. NOTE(review): the guard admits lead byte 0xa0 while the formula subtracts 0xa1, so with a trail byte in 0xa1..0xfe a 0xa0 lead yields a negative order other than -1; HandleOneChar ignores any negative order.
233
+ PRInt32 GetOrder(const char* str)
234
+ { if ((unsigned char)*str >= (unsigned char)0xa0)
235
+ return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
236
+ else
237
+ return -1;
238
+ }
239
+ };
240
+
241
+ #endif //CharDistribution_h__
242
+