cchardet 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,142 @@
1
+ = Logs of language model for Arabic (ar) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2015-12-13 18:31:12.817808
5
+ - Maximum depth: 2
6
+ - Max number of pages: 50
7
+
8
+ == Parsed pages ==
9
+
10
+ الصفحة_الرئيسية (revision 17217037)
11
+ 11 ديسمبر (revision 17699159)
12
+ 12 ديسمبر (revision 17710194)
13
+ 13 ديسمبر (revision 17722318)
14
+ 1437 هـ (revision 17278274)
15
+ 14 ديسمبر (revision 17432010)
16
+ 15 ديسمبر (revision 17206233)
17
+ 1645 (revision 17168144)
18
+ 1954 (revision 17409780)
19
+ 1955 (revision 16826533)
20
+ 1972 (revision 17004868)
21
+ 1988 (revision 17671285)
22
+ 2003 (revision 17656994)
23
+ 2011 (revision 17589601)
24
+ 2015 (revision 17678287)
25
+ 216 ق.م (revision 17586752)
26
+ 25 يناير (revision 17325864)
27
+ 2 ربيع الأول (revision 17722146)
28
+ 6 (عدد) (revision 16972178)
29
+ آريز (revision 17466671)
30
+ آلهة اليونان (revision 17722617)
31
+ أثينا (revision 17642941)
32
+ أثينا (ميثولوجيا) (revision 17662932)
33
+ أزمة المهاجرين إلى أوروبا (revision 17718437)
34
+ أوروبا (revision 17713457)
35
+ إس سي إي سانتا مونيكا ستوديو (revision 17035439)
36
+ إسبارطة (revision 16733170)
37
+ إسماعيل الصفوي (revision 17194218)
38
+ إله الحرب (لعبة فيديو) (revision 17630201)
39
+ إمارة دبي (revision 17602037)
40
+ إيطاليا (revision 17586853)
41
+ اتفاق باريس (revision 17718086)
42
+ الأزمة الليبية (revision 17630232)
43
+ الإمارات العربية المتحدة (revision 17722077)
44
+ الإنتخابات البلدية السعودية 2015 (revision 17722004)
45
+ الاتحاد الأوروبي لكرة القدم (revision 17596822)
46
+ الاحتجاجات اللبنانية 2015 (revision 17315127)
47
+ الانتفاضة الفلسطينية (2015) (revision 17710414)
48
+ التمرد العراقي (revision 17708640)
49
+ الجمعية العامة للأمم المتحدة (revision 17304227)
50
+ الجمهورية الرومانية (revision 16472557)
51
+ الجيش اللبناني (revision 17516533)
52
+ الحرب الأهلية السورية (revision 17675300)
53
+ الحرب الأهلية اليمنية (2015) (revision 17686236)
54
+ الحرب في شمال غرب باكستان (revision 17490838)
55
+ الدولة الصفوية (revision 17031046)
56
+ الرياض (revision 17580586)
57
+ السعودية (revision 17711339)
58
+ السلطة الوطنية الفلسطينية (revision 17438123)
59
+ العراق (revision 17704602)
60
+ العلاقات الخارجية في تركيا (revision 17647409)
61
+
62
+ == End of Parsed pages ==
63
+
64
+ - Wikipedia parsing ended at: 2015-12-13 18:33:58.846891
65
+
66
+ 95 characters appeared 727795 times.
67
+
68
+ First 64 characters:
69
+ [ 0] Char ا: 14.933875610577156 %
70
+ [ 1] Char ل: 11.460782225764122 %
71
+ [ 2] Char ي: 8.30302489025069 %
72
+ [ 3] Char م: 6.3702003998378665 %
73
+ [ 4] Char و: 5.952637762007158 %
74
+ [ 5] Char ر: 4.9419135883043985 %
75
+ [ 6] Char ن: 4.900967992360486 %
76
+ [ 7] Char ت: 4.229625100474721 %
77
+ [ 8] Char ة: 3.6022506337636284 %
78
+ [ 9] Char ب: 3.5434428650925054 %
79
+ [10] Char ع: 3.3116468236247845 %
80
+ [11] Char د: 3.1756195082406444 %
81
+ [12] Char س: 2.5401383631379715 %
82
+ [13] Char ف: 2.3899587109007343 %
83
+ [14] Char ق: 2.010868445097864 %
84
+ [15] Char أ: 1.8763525443291036 %
85
+ [16] Char ه: 1.8663222473361318 %
86
+ [17] Char ك: 1.8573911609725264 %
87
+ [18] Char ح: 1.8431014227907585 %
88
+ [19] Char ج: 1.3270220323030524 %
89
+ [20] Char ط: 1.0305099650313618 %
90
+ [21] Char ش: 0.9638703206260004 %
91
+ [22] Char إ: 0.8946200509758929 %
92
+ [23] Char ص: 0.8509264284585631 %
93
+ [24] Char ى: 0.7726076711161797 %
94
+ [25] Char خ: 0.717097534333157 %
95
+ [26] Char ز: 0.6687322666410184 %
96
+ [27] Char ث: 0.6549921337739336 %
97
+ [28] Char ض: 0.5409490309771295 %
98
+ [29] Char غ: 0.4574090231452538 %
99
+ [30] Char ذ: 0.44765352880962356 %
100
+ [31] Char ئ: 0.39269299734128427 %
101
+ [32] Char ء: 0.295138053984982 %
102
+ [33] Char ظ: 0.2397653185306302 %
103
+ [34] Char آ: 0.12324899181775088 %
104
+ [35] Char ؤ: 0.08491402111858422 %
105
+ [36] Char ـ: 0.047678261048784344 %
106
+ [37] Char a: 0.03311372020967443 %
107
+ [38] Char e: 0.029403884335561525 %
108
+ [39] Char i: 0.027205463076827956 %
109
+ [40] Char o: 0.02432003517474014 %
110
+ [41] Char t: 0.02349562720271505 %
111
+ [42] Char r: 0.02294602188803166 %
112
+ [43] Char n: 0.020472797971956388 %
113
+ [44] Char s: 0.01799957405588112 %
114
+ [45] Char l: 0.012915724895059736 %
115
+ [46] Char h: 0.011816514265692949 %
116
+ [47] Char d: 0.011129507622338709 %
117
+ [48] Char پ: 0.010717303636326163 %
118
+ [49] Char c: 0.009480691678288529 %
119
+ [50] Char u: 0.007969277062909199 %
120
+ [51] Char m: 0.007694474405567502 %
121
+ [52] Char A: 0.006870066433542411 %
122
+ [53] Char گ: 0.006595263776200715 %
123
+ [54] Char f: 0.006183059790188171 %
124
+ [55] Char S: 0.005770855804175626 %
125
+ [56] Char y: 0.0054960531468339294 %
126
+ [57] Char T: 0.0049464478321505365 %
127
+ [58] Char b: 0.0048090465034796885 %
128
+ [59] Char G: 0.0046716451748088405 %
129
+ [60] Char I: 0.004396842517467144 %
130
+ [61] Char C: 0.0042594411887962955 %
131
+ [62] Char p: 0.0039846385314545995 %
132
+ [63] Char k: 0.003709835874112903 %
133
+
134
+ The first 64 characters have an accumulated ratio of 0.999523217389512.
135
+
136
+ 1479 sequences found.
137
+
138
+ First 512 (typical positive ratio): 0.9696025116913417
139
+ Next 512 (512-1024): 1.3740132867084825e-06
140
+ Rest: 0.0012305764497782395
141
+
142
+ - Processing end: 2015-12-13 18:33:59.193909
@@ -0,0 +1,157 @@
1
+ = Logs of language model for Croatian (hr) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-25 23:41:35.999066
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Fizika čvrstog stanja (revision 4776646)
11
+ Agregatno stanje (revision 4663090)
12
+ Alnico (revision 3915185)
13
+ Aluminij (revision 4772363)
14
+ Amorfna tvar (revision 4659679)
15
+ Antimon (revision 4420072)
16
+ Antoine Henri Becquerel (revision 4634966)
17
+ Apsolutna nula (revision 4706683)
18
+ Arsen (revision 4540773)
19
+ Arthur Holly Compton (revision 4736068)
20
+ Atom (revision 4778162)
21
+ Atomska jezgra (revision 4540956)
22
+ Bell Labs (revision 4769518)
23
+ Bor (element) (revision 4602837)
24
+ Brian Josephson (revision 4403761)
25
+ Cink (revision 4537854)
26
+ Coulombov zakon (revision 4710338)
27
+ Dijamant (revision 4625335)
28
+ Dimenzija (revision 4669110)
29
+ Dinastija Han (revision 4541686)
30
+ Dislokacija (revision 4668021)
31
+ EV (revision 4538157)
32
+ Eksponencijalna funkcija (revision 4160157)
33
+ Električna struja (revision 4280621)
34
+ Električna vodljivost (revision 4460160)
35
+ Električni izolator (revision 4649046)
36
+ Električni luk (revision 4646980)
37
+ Električni naboj (revision 4727496)
38
+ Električni otpor (revision 4593314)
39
+ Električni vodič (revision 4333008)
40
+ Električno polje (revision 4705679)
41
+ Elektrolit (revision 4486319)
42
+ Elektromagnetsko zračenje (revision 4537368)
43
+ Elektron (revision 4630705)
44
+ Elektronika (revision 4090016)
45
+ Elektronska konfiguracija (revision 4420620)
46
+ Elektronski mikroskop (revision 4413214)
47
+ Elektrotehnika (revision 4596912)
48
+ Energetika (revision 4586277)
49
+ Energija (revision 4719089)
50
+ Fermi-Diracova statistika (revision 3934172)
51
+ Feromagnetizam (revision 4760511)
52
+ Fizika (revision 4769955)
53
+ Fizika kondenzirane tvari (revision 4769955)
54
+ Fizikalna veličina (revision 4621676)
55
+ Fosfor (revision 4602427)
56
+ Fotodioda (revision 3939069)
57
+ Fotoelektrični učinak (revision 4704417)
58
+ Foton (revision 4537522)
59
+ Fotonaponski sustavi (revision 4418887)
60
+ Francuski jezik (revision 4771366)
61
+ Galij (revision 4537855)
62
+ Genitiv (revision 4625199)
63
+ Germanij (revision 4537856)
64
+ Helij (revision 4747001)
65
+ Henri (revision 3922500)
66
+ Indij (revision 4537867)
67
+ Integrirani krug (revision 4447159)
68
+ Ion (revision 4549144)
69
+ Ioniziranje (revision 4566703)
70
+ Izolator (revision 4649046)
71
+ John Bardeen (revision 4403736)
72
+ Kadmij (revision 3921860)
73
+ Kelvin (revision 4624351)
74
+ Keramika (revision 4599177)
75
+ Kinetička energija (revision 4719090)
76
+ Klasična mehanika (revision 4637127)
77
+ Kompas (revision 4702880)
78
+ Kondenzacija (revision 4477825)
79
+ Kondenzirana tvar (revision 4776646)
80
+ Konstrukcija (revision 4680450)
81
+ Kovalentna veza (revision 4641419)
82
+ Kristal (revision 4720329)
83
+ Kristalna rešetka (revision 4479184)
84
+ Kristalografija (revision 4105956)
85
+ Krutine (revision 4625162)
86
+ Kubični kristalni sustav (revision 4344344)
87
+ Kubični metar (revision 4616551)
88
+ Kvantna mehanika (revision 4541215)
89
+ Latinski jezik (revision 4760544)
90
+ Luminiscencija (revision 4708222)
91
+ Magnet (revision 4603344)
92
+ Magnetizam (revision 4760040)
93
+ Magnetska permeabilnost (revision 4675996)
94
+ Magnetska vodljivost (revision 4736934)
95
+ Magnetski moment (revision 4410235)
96
+ Magnetsko polje (revision 4678057)
97
+ Materijal (revision 4669230)
98
+ Mehanika (revision 4698699)
99
+ Metal (revision 4671710)
100
+ Metan (revision 4422418)
101
+ Metar (revision 4655527)
102
+ Mjerna veličina (revision 4621676)
103
+ Molekula (revision 4539232)
104
+ Molekule (revision 4539232)
105
+ Napon (revision 4585417)
106
+ Niskotemperaturna fizika (revision 4657522)
107
+ Njemački jezik (revision 4731246)
108
+ Optika (revision 4768098)
109
+
110
+ == End of Parsed pages ==
111
+
112
+ - Wikipedia parsing ended at: 2016-09-25 23:50:27.589690
113
+
114
+ 49 characters appeared 500582 times.
115
+
116
+ First 31 characters:
117
+ [ 0] Char a: 10.808019465342342 %
118
+ [ 1] Char i: 10.18554402675286 %
119
+ [ 2] Char e: 9.571259054460608 %
120
+ [ 3] Char o: 8.468143081453189 %
121
+ [ 4] Char n: 6.952906816465634 %
122
+ [ 5] Char t: 5.369549843981606 %
123
+ [ 6] Char r: 5.331993559496746 %
124
+ [ 7] Char j: 5.102860270644969 %
125
+ [ 8] Char s: 4.717109284792501 %
126
+ [ 9] Char k: 4.013927788054705 %
127
+ [10] Char l: 3.854713113935379 %
128
+ [11] Char u: 3.786792173909569 %
129
+ [12] Char m: 3.730058212240951 %
130
+ [13] Char v: 3.0989927724129114 %
131
+ [14] Char p: 2.67308852495695 %
132
+ [15] Char d: 2.6135578186990345 %
133
+ [16] Char z: 1.8931963194841206 %
134
+ [17] Char g: 1.5665765049482403 %
135
+ [18] Char č: 1.161048539500022 %
136
+ [19] Char b: 1.1440683044935693 %
137
+ [20] Char c: 1.007627122029957 %
138
+ [21] Char h: 0.8006680224219008 %
139
+ [22] Char f: 0.5159993767254915 %
140
+ [23] Char š: 0.422907735395999 %
141
+ [24] Char ž: 0.3611795869607777 %
142
+ [25] Char ć: 0.34959307366225717 %
143
+ [26] Char đ: 0.2195444502598975 %
144
+ [27] Char y: 0.11306838839590717 %
145
+ [28] Char w: 0.07291512679241363 %
146
+ [29] Char x: 0.04534721584076135 %
147
+ [30] Char q: 0.02477116636235422 %
148
+
149
+ The first 31 characters have an accumulated ratio of 0.9997702674087363.
150
+
151
+ 712 sequences found.
152
+
153
+ First 512 (typical positive ratio): 0.9989731099787131
154
+ Next 512 (512-1024): 1.9976747066414694e-06
155
+ Rest: 3.7513395167998453e-17
156
+
157
+ - Processing end: 2016-09-25 23:50:27.987029
@@ -0,0 +1,161 @@
1
+ = Logs of language model for Czech (cs) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-21 03:20:56.824516
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Sociální fobie (revision 13567590)
11
+ Adaptace (revision 13991192)
12
+ Agorafobie (revision 13013445)
13
+ Alkoholismus (revision 13822064)
14
+ Alprazolam (revision 14082425)
15
+ Antidepresivum (revision 14113423)
16
+ Asertivita (revision 14111958)
17
+ Atenolol (revision 12051880)
18
+ Automatické negativní myšlenky (revision 13567590)
19
+ Benzodiazepin (revision 13947546)
20
+ Beta-blokátory (revision 13428762)
21
+ Blud (revision 13888988)
22
+ Bohatství (revision 13556478)
23
+ Bupropion (revision 13686045)
24
+ Citaloparam (revision 13567590)
25
+ Clonazepan (revision 13567590)
26
+ Crohnova nemoc (revision 13745254)
27
+ Deprese (psychologie) (revision 13695735)
28
+ Diagnostický a statický manuál mentálních poruch (revision 13567590)
29
+ Diagnostický a statistický manuál mentálních poruch (revision 13714660)
30
+ Diagnóza (medicína) (revision 13052239)
31
+ Dichotomické myšlení (revision 13567590)
32
+ Digital object identifier (revision 14138049)
33
+ Dopamin (revision 13714274)
34
+ Dystymie (revision 13567267)
35
+ Důkaz kruhem (revision 13190761)
36
+ Elektivní mutismus (revision 9940891)
37
+ Emoce (revision 14110033)
38
+ Escitalopram (revision 12954987)
39
+ Evoluce (revision 13951488)
40
+ Expozice (psychologie) (revision 14119474)
41
+ Extraverze a introverze (revision 13872996)
42
+ Fluoxetin (revision 12955006)
43
+ Fluvoxamin (revision 12955006)
44
+ Gen (revision 13907182)
45
+ Generalizovaná úzkostná porucha (revision 14006709)
46
+ Halucinaci (revision 12188143)
47
+ Hněv (revision 14057864)
48
+ Inteligence (revision 14009781)
49
+ International Standard Serial Number (revision 12869806)
50
+ Interpersonální psychoterapie (revision 13567590)
51
+ Iracionalita (revision 4765977)
52
+ Ján Praško Pavlov (revision 14086840)
53
+ Klinické testování (revision 13530979)
54
+ Kognitivní omyl (revision 13107294)
55
+ Kognitivní psychologie (revision 11629465)
56
+ Kognitivní restrukturalizace (revision 13567360)
57
+ Kognitivně behaviorální terapie (revision 13980494)
58
+ Komorbidita (revision 11351714)
59
+ Lymská borelióza (revision 14068446)
60
+ Malé sebevědomí (revision 13567590)
61
+ Medical Subject Headings (revision 12239331)
62
+ Meditace (revision 13180783)
63
+ Mentální černý filtr (revision 13567590)
64
+ Mezinárodní klasifikace nemocí (revision 12531067)
65
+ Michael Liebowitz (revision 13567590)
66
+ Moclobemid (revision 13567590)
67
+ Moritova terapie (revision 11960292)
68
+ Musturbace (revision 13567590)
69
+ Nervozita (revision 13847097)
70
+ Noradrenalin (revision 14054165)
71
+ Obsedantně kompulzivní porucha (revision 13950365)
72
+ Panická ataka (revision 13253537)
73
+ Panická porucha (revision 13253537)
74
+ Paranoia (revision 14027052)
75
+ Paroxetin (revision 12955006)
76
+ Pohlavnost (revision 13564689)
77
+ Porucha (revision 11039108)
78
+ Pravděpodobnost (revision 13596041)
79
+ Predestinace (revision 12467403)
80
+ Profese (revision 13975485)
81
+ Propanolol (revision 12972658)
82
+ Psychiatr (revision 12767960)
83
+ Psychické trauma (revision 11227535)
84
+ Psychoaktivní droga (revision 13939232)
85
+ Psychodynamická léčba (revision 13567590)
86
+ Psychofarmaka (revision 9928215)
87
+ Psycholog (revision 12358728)
88
+ Psychoterapie (revision 13874178)
89
+ Puberta (revision 12540014)
90
+ RIMA (revision 10234728)
91
+ Remise (revision 9896748)
92
+ Richard Heimberg (revision 13567590)
93
+ Rámování myšlenek (revision 13567590)
94
+ Schizofrenie (revision 13977456)
95
+ Sebevražda (revision 14053884)
96
+ Selektivní abstrakce (revision 13567590)
97
+ Selektivní inhibitor zpětného vychytávání serotoninu (revision 12955027)
98
+ Serotonin (revision 13975104)
99
+ Sertralin (revision 12955006)
100
+ Skupinová terapie (revision 11964235)
101
+ Sociální chování (revision 13507313)
102
+ Sociální dovednost (revision 12226347)
103
+
104
+ == End of Parsed pages ==
105
+
106
+ - Wikipedia parsing ended at: 2016-09-21 03:28:11.731386
107
+
108
+ 47 characters appeared 594800 times.
109
+
110
+ First 41 characters:
111
+ [ 0] Char o: 8.323806321452588 %
112
+ [ 1] Char e: 8.040013449899126 %
113
+ [ 2] Char n: 6.895595158036315 %
114
+ [ 3] Char a: 6.263113651647613 %
115
+ [ 4] Char i: 5.650470746469401 %
116
+ [ 5] Char t: 5.40383322125084 %
117
+ [ 6] Char s: 4.588937457969065 %
118
+ [ 7] Char v: 3.8685272360457295 %
119
+ [ 8] Char p: 3.6914929388029587 %
120
+ [ 9] Char r: 3.6302958977807664 %
121
+ [10] Char l: 3.6017148621385338 %
122
+ [11] Char í: 3.5733019502353733 %
123
+ [12] Char k: 3.301950235373235 %
124
+ [13] Char u: 3.1782111634162744 %
125
+ [14] Char c: 3.1383658372562206 %
126
+ [15] Char d: 3.120208473436449 %
127
+ [16] Char m: 2.758406186953598 %
128
+ [17] Char h: 2.2747141896435776 %
129
+ [18] Char á: 2.156186953597848 %
130
+ [19] Char z: 2.0260591795561536 %
131
+ [20] Char y: 1.9894082044384667 %
132
+ [21] Char j: 1.8979488903833224 %
133
+ [22] Char b: 1.8189307330195021 %
134
+ [23] Char ě: 1.277236045729657 %
135
+ [24] Char é: 1.2291526563550772 %
136
+ [25] Char č: 0.9502353732347008 %
137
+ [26] Char ž: 0.9214862138533961 %
138
+ [27] Char ř: 0.8955951580363146 %
139
+ [28] Char ý: 0.7646267652992602 %
140
+ [29] Char š: 0.6605581708137189 %
141
+ [30] Char f: 0.6260928043039677 %
142
+ [31] Char ů: 0.5016812373907196 %
143
+ [32] Char g: 0.47041022192333554 %
144
+ [33] Char ú: 0.19502353732347008 %
145
+ [34] Char x: 0.13685272360457296 %
146
+ [35] Char ň: 0.05447209145931405 %
147
+ [36] Char w: 0.04488903833221251 %
148
+ [37] Char ó: 0.03429724277067922 %
149
+ [38] Char ť: 0.02269670477471419 %
150
+ [39] Char ď: 0.012104909213180902 %
151
+ [40] Char q: 0.007229320780094149 %
152
+
153
+ The first 41 characters have an accumulated ratio of 0.9999613315400132.
154
+
155
+ 1025 sequences found.
156
+
157
+ First 512 (typical positive ratio): 0.9786035192432675
158
+ Next 512 (512-1024): 1.6812373907195695e-06
159
+ Rest: 2.0246480655940202e-06
160
+
161
+ - Processing end: 2016-09-21 03:28:12.235582