cchardet 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,142 @@
1
+ = Logs of language model for Arabic (ar) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2015-12-13 18:31:12.817808
5
+ - Maximum depth: 2
6
+ - Max number of pages: 50
7
+
8
+ == Parsed pages ==
9
+
10
+ الصفحة_الرئيسية (revision 17217037)
11
+ 11 ديسمبر (revision 17699159)
12
+ 12 ديسمبر (revision 17710194)
13
+ 13 ديسمبر (revision 17722318)
14
+ 1437 هـ (revision 17278274)
15
+ 14 ديسمبر (revision 17432010)
16
+ 15 ديسمبر (revision 17206233)
17
+ 1645 (revision 17168144)
18
+ 1954 (revision 17409780)
19
+ 1955 (revision 16826533)
20
+ 1972 (revision 17004868)
21
+ 1988 (revision 17671285)
22
+ 2003 (revision 17656994)
23
+ 2011 (revision 17589601)
24
+ 2015 (revision 17678287)
25
+ 216 ق.م (revision 17586752)
26
+ 25 يناير (revision 17325864)
27
+ 2 ربيع الأول (revision 17722146)
28
+ 6 (عدد) (revision 16972178)
29
+ آريز (revision 17466671)
30
+ آلهة اليونان (revision 17722617)
31
+ أثينا (revision 17642941)
32
+ أثينا (ميثولوجيا) (revision 17662932)
33
+ أزمة المهاجرين إلى أوروبا (revision 17718437)
34
+ أوروبا (revision 17713457)
35
+ إس سي إي سانتا مونيكا ستوديو (revision 17035439)
36
+ إسبارطة (revision 16733170)
37
+ إسماعيل الصفوي (revision 17194218)
38
+ إله الحرب (لعبة فيديو) (revision 17630201)
39
+ إمارة دبي (revision 17602037)
40
+ إيطاليا (revision 17586853)
41
+ اتفاق باريس (revision 17718086)
42
+ الأزمة الليبية (revision 17630232)
43
+ الإمارات العربية المتحدة (revision 17722077)
44
+ الإنتخابات البلدية السعودية 2015 (revision 17722004)
45
+ الاتحاد الأوروبي لكرة القدم (revision 17596822)
46
+ الاحتجاجات اللبنانية 2015 (revision 17315127)
47
+ الانتفاضة الفلسطينية (2015) (revision 17710414)
48
+ التمرد العراقي (revision 17708640)
49
+ الجمعية العامة للأمم المتحدة (revision 17304227)
50
+ الجمهورية الرومانية (revision 16472557)
51
+ الجيش اللبناني (revision 17516533)
52
+ الحرب الأهلية السورية (revision 17675300)
53
+ الحرب الأهلية اليمنية (2015) (revision 17686236)
54
+ الحرب في شمال غرب باكستان (revision 17490838)
55
+ الدولة الصفوية (revision 17031046)
56
+ الرياض (revision 17580586)
57
+ السعودية (revision 17711339)
58
+ السلطة الوطنية الفلسطينية (revision 17438123)
59
+ العراق (revision 17704602)
60
+ العلاقات الخارجية في تركيا (revision 17647409)
61
+
62
+ == End of Parsed pages ==
63
+
64
+ - Wikipedia parsing ended at: 2015-12-13 18:33:58.846891
65
+
66
+ 95 characters appeared 727795 times.
67
+
68
+ First 64 characters:
69
+ [ 0] Char ا: 14.933875610577156 %
70
+ [ 1] Char ل: 11.460782225764122 %
71
+ [ 2] Char ي: 8.30302489025069 %
72
+ [ 3] Char م: 6.3702003998378665 %
73
+ [ 4] Char و: 5.952637762007158 %
74
+ [ 5] Char ر: 4.9419135883043985 %
75
+ [ 6] Char ن: 4.900967992360486 %
76
+ [ 7] Char ت: 4.229625100474721 %
77
+ [ 8] Char ة: 3.6022506337636284 %
78
+ [ 9] Char ب: 3.5434428650925054 %
79
+ [10] Char ع: 3.3116468236247845 %
80
+ [11] Char د: 3.1756195082406444 %
81
+ [12] Char س: 2.5401383631379715 %
82
+ [13] Char ف: 2.3899587109007343 %
83
+ [14] Char ق: 2.010868445097864 %
84
+ [15] Char أ: 1.8763525443291036 %
85
+ [16] Char ه: 1.8663222473361318 %
86
+ [17] Char ك: 1.8573911609725264 %
87
+ [18] Char ح: 1.8431014227907585 %
88
+ [19] Char ج: 1.3270220323030524 %
89
+ [20] Char ط: 1.0305099650313618 %
90
+ [21] Char ش: 0.9638703206260004 %
91
+ [22] Char إ: 0.8946200509758929 %
92
+ [23] Char ص: 0.8509264284585631 %
93
+ [24] Char ى: 0.7726076711161797 %
94
+ [25] Char خ: 0.717097534333157 %
95
+ [26] Char ز: 0.6687322666410184 %
96
+ [27] Char ث: 0.6549921337739336 %
97
+ [28] Char ض: 0.5409490309771295 %
98
+ [29] Char غ: 0.4574090231452538 %
99
+ [30] Char ذ: 0.44765352880962356 %
100
+ [31] Char ئ: 0.39269299734128427 %
101
+ [32] Char ء: 0.295138053984982 %
102
+ [33] Char ظ: 0.2397653185306302 %
103
+ [34] Char آ: 0.12324899181775088 %
104
+ [35] Char ؤ: 0.08491402111858422 %
105
+ [36] Char ـ: 0.047678261048784344 %
106
+ [37] Char a: 0.03311372020967443 %
107
+ [38] Char e: 0.029403884335561525 %
108
+ [39] Char i: 0.027205463076827956 %
109
+ [40] Char o: 0.02432003517474014 %
110
+ [41] Char t: 0.02349562720271505 %
111
+ [42] Char r: 0.02294602188803166 %
112
+ [43] Char n: 0.020472797971956388 %
113
+ [44] Char s: 0.01799957405588112 %
114
+ [45] Char l: 0.012915724895059736 %
115
+ [46] Char h: 0.011816514265692949 %
116
+ [47] Char d: 0.011129507622338709 %
117
+ [48] Char پ: 0.010717303636326163 %
118
+ [49] Char c: 0.009480691678288529 %
119
+ [50] Char u: 0.007969277062909199 %
120
+ [51] Char m: 0.007694474405567502 %
121
+ [52] Char A: 0.006870066433542411 %
122
+ [53] Char گ: 0.006595263776200715 %
123
+ [54] Char f: 0.006183059790188171 %
124
+ [55] Char S: 0.005770855804175626 %
125
+ [56] Char y: 0.0054960531468339294 %
126
+ [57] Char T: 0.0049464478321505365 %
127
+ [58] Char b: 0.0048090465034796885 %
128
+ [59] Char G: 0.0046716451748088405 %
129
+ [60] Char I: 0.004396842517467144 %
130
+ [61] Char C: 0.0042594411887962955 %
131
+ [62] Char p: 0.0039846385314545995 %
132
+ [63] Char k: 0.003709835874112903 %
133
+
134
+ The first 64 characters have an accumulated ratio of 0.999523217389512.
135
+
136
+ 1479 sequences found.
137
+
138
+ First 512 (typical positive ratio): 0.9696025116913417
139
+ Next 512 (512-1024): 1.3740132867084825e-06
140
+ Rest: 0.0012305764497782395
141
+
142
+ - Processing end: 2015-12-13 18:33:59.193909
@@ -0,0 +1,157 @@
1
+ = Logs of language model for Croatian (hr) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-25 23:41:35.999066
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Fizika čvrstog stanja (revision 4776646)
11
+ Agregatno stanje (revision 4663090)
12
+ Alnico (revision 3915185)
13
+ Aluminij (revision 4772363)
14
+ Amorfna tvar (revision 4659679)
15
+ Antimon (revision 4420072)
16
+ Antoine Henri Becquerel (revision 4634966)
17
+ Apsolutna nula (revision 4706683)
18
+ Arsen (revision 4540773)
19
+ Arthur Holly Compton (revision 4736068)
20
+ Atom (revision 4778162)
21
+ Atomska jezgra (revision 4540956)
22
+ Bell Labs (revision 4769518)
23
+ Bor (element) (revision 4602837)
24
+ Brian Josephson (revision 4403761)
25
+ Cink (revision 4537854)
26
+ Coulombov zakon (revision 4710338)
27
+ Dijamant (revision 4625335)
28
+ Dimenzija (revision 4669110)
29
+ Dinastija Han (revision 4541686)
30
+ Dislokacija (revision 4668021)
31
+ EV (revision 4538157)
32
+ Eksponencijalna funkcija (revision 4160157)
33
+ Električna struja (revision 4280621)
34
+ Električna vodljivost (revision 4460160)
35
+ Električni izolator (revision 4649046)
36
+ Električni luk (revision 4646980)
37
+ Električni naboj (revision 4727496)
38
+ Električni otpor (revision 4593314)
39
+ Električni vodič (revision 4333008)
40
+ Električno polje (revision 4705679)
41
+ Elektrolit (revision 4486319)
42
+ Elektromagnetsko zračenje (revision 4537368)
43
+ Elektron (revision 4630705)
44
+ Elektronika (revision 4090016)
45
+ Elektronska konfiguracija (revision 4420620)
46
+ Elektronski mikroskop (revision 4413214)
47
+ Elektrotehnika (revision 4596912)
48
+ Energetika (revision 4586277)
49
+ Energija (revision 4719089)
50
+ Fermi-Diracova statistika (revision 3934172)
51
+ Feromagnetizam (revision 4760511)
52
+ Fizika (revision 4769955)
53
+ Fizika kondenzirane tvari (revision 4769955)
54
+ Fizikalna veličina (revision 4621676)
55
+ Fosfor (revision 4602427)
56
+ Fotodioda (revision 3939069)
57
+ Fotoelektrični učinak (revision 4704417)
58
+ Foton (revision 4537522)
59
+ Fotonaponski sustavi (revision 4418887)
60
+ Francuski jezik (revision 4771366)
61
+ Galij (revision 4537855)
62
+ Genitiv (revision 4625199)
63
+ Germanij (revision 4537856)
64
+ Helij (revision 4747001)
65
+ Henri (revision 3922500)
66
+ Indij (revision 4537867)
67
+ Integrirani krug (revision 4447159)
68
+ Ion (revision 4549144)
69
+ Ioniziranje (revision 4566703)
70
+ Izolator (revision 4649046)
71
+ John Bardeen (revision 4403736)
72
+ Kadmij (revision 3921860)
73
+ Kelvin (revision 4624351)
74
+ Keramika (revision 4599177)
75
+ Kinetička energija (revision 4719090)
76
+ Klasična mehanika (revision 4637127)
77
+ Kompas (revision 4702880)
78
+ Kondenzacija (revision 4477825)
79
+ Kondenzirana tvar (revision 4776646)
80
+ Konstrukcija (revision 4680450)
81
+ Kovalentna veza (revision 4641419)
82
+ Kristal (revision 4720329)
83
+ Kristalna rešetka (revision 4479184)
84
+ Kristalografija (revision 4105956)
85
+ Krutine (revision 4625162)
86
+ Kubični kristalni sustav (revision 4344344)
87
+ Kubični metar (revision 4616551)
88
+ Kvantna mehanika (revision 4541215)
89
+ Latinski jezik (revision 4760544)
90
+ Luminiscencija (revision 4708222)
91
+ Magnet (revision 4603344)
92
+ Magnetizam (revision 4760040)
93
+ Magnetska permeabilnost (revision 4675996)
94
+ Magnetska vodljivost (revision 4736934)
95
+ Magnetski moment (revision 4410235)
96
+ Magnetsko polje (revision 4678057)
97
+ Materijal (revision 4669230)
98
+ Mehanika (revision 4698699)
99
+ Metal (revision 4671710)
100
+ Metan (revision 4422418)
101
+ Metar (revision 4655527)
102
+ Mjerna veličina (revision 4621676)
103
+ Molekula (revision 4539232)
104
+ Molekule (revision 4539232)
105
+ Napon (revision 4585417)
106
+ Niskotemperaturna fizika (revision 4657522)
107
+ Njemački jezik (revision 4731246)
108
+ Optika (revision 4768098)
109
+
110
+ == End of Parsed pages ==
111
+
112
+ - Wikipedia parsing ended at: 2016-09-25 23:50:27.589690
113
+
114
+ 49 characters appeared 500582 times.
115
+
116
+ First 31 characters:
117
+ [ 0] Char a: 10.808019465342342 %
118
+ [ 1] Char i: 10.18554402675286 %
119
+ [ 2] Char e: 9.571259054460608 %
120
+ [ 3] Char o: 8.468143081453189 %
121
+ [ 4] Char n: 6.952906816465634 %
122
+ [ 5] Char t: 5.369549843981606 %
123
+ [ 6] Char r: 5.331993559496746 %
124
+ [ 7] Char j: 5.102860270644969 %
125
+ [ 8] Char s: 4.717109284792501 %
126
+ [ 9] Char k: 4.013927788054705 %
127
+ [10] Char l: 3.854713113935379 %
128
+ [11] Char u: 3.786792173909569 %
129
+ [12] Char m: 3.730058212240951 %
130
+ [13] Char v: 3.0989927724129114 %
131
+ [14] Char p: 2.67308852495695 %
132
+ [15] Char d: 2.6135578186990345 %
133
+ [16] Char z: 1.8931963194841206 %
134
+ [17] Char g: 1.5665765049482403 %
135
+ [18] Char č: 1.161048539500022 %
136
+ [19] Char b: 1.1440683044935693 %
137
+ [20] Char c: 1.007627122029957 %
138
+ [21] Char h: 0.8006680224219008 %
139
+ [22] Char f: 0.5159993767254915 %
140
+ [23] Char š: 0.422907735395999 %
141
+ [24] Char ž: 0.3611795869607777 %
142
+ [25] Char ć: 0.34959307366225717 %
143
+ [26] Char đ: 0.2195444502598975 %
144
+ [27] Char y: 0.11306838839590717 %
145
+ [28] Char w: 0.07291512679241363 %
146
+ [29] Char x: 0.04534721584076135 %
147
+ [30] Char q: 0.02477116636235422 %
148
+
149
+ The first 31 characters have an accumulated ratio of 0.9997702674087363.
150
+
151
+ 712 sequences found.
152
+
153
+ First 512 (typical positive ratio): 0.9989731099787131
154
+ Next 512 (512-1024): 1.9976747066414694e-06
155
+ Rest: 3.7513395167998453e-17
156
+
157
+ - Processing end: 2016-09-25 23:50:27.987029
@@ -0,0 +1,161 @@
1
+ = Logs of language model for Czech (cs) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-21 03:20:56.824516
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Sociální fobie (revision 13567590)
11
+ Adaptace (revision 13991192)
12
+ Agorafobie (revision 13013445)
13
+ Alkoholismus (revision 13822064)
14
+ Alprazolam (revision 14082425)
15
+ Antidepresivum (revision 14113423)
16
+ Asertivita (revision 14111958)
17
+ Atenolol (revision 12051880)
18
+ Automatické negativní myšlenky (revision 13567590)
19
+ Benzodiazepin (revision 13947546)
20
+ Beta-blokátory (revision 13428762)
21
+ Blud (revision 13888988)
22
+ Bohatství (revision 13556478)
23
+ Bupropion (revision 13686045)
24
+ Citaloparam (revision 13567590)
25
+ Clonazepan (revision 13567590)
26
+ Crohnova nemoc (revision 13745254)
27
+ Deprese (psychologie) (revision 13695735)
28
+ Diagnostický a statický manuál mentálních poruch (revision 13567590)
29
+ Diagnostický a statistický manuál mentálních poruch (revision 13714660)
30
+ Diagnóza (medicína) (revision 13052239)
31
+ Dichotomické myšlení (revision 13567590)
32
+ Digital object identifier (revision 14138049)
33
+ Dopamin (revision 13714274)
34
+ Dystymie (revision 13567267)
35
+ Důkaz kruhem (revision 13190761)
36
+ Elektivní mutismus (revision 9940891)
37
+ Emoce (revision 14110033)
38
+ Escitalopram (revision 12954987)
39
+ Evoluce (revision 13951488)
40
+ Expozice (psychologie) (revision 14119474)
41
+ Extraverze a introverze (revision 13872996)
42
+ Fluoxetin (revision 12955006)
43
+ Fluvoxamin (revision 12955006)
44
+ Gen (revision 13907182)
45
+ Generalizovaná úzkostná porucha (revision 14006709)
46
+ Halucinaci (revision 12188143)
47
+ Hněv (revision 14057864)
48
+ Inteligence (revision 14009781)
49
+ International Standard Serial Number (revision 12869806)
50
+ Interpersonální psychoterapie (revision 13567590)
51
+ Iracionalita (revision 4765977)
52
+ Ján Praško Pavlov (revision 14086840)
53
+ Klinické testování (revision 13530979)
54
+ Kognitivní omyl (revision 13107294)
55
+ Kognitivní psychologie (revision 11629465)
56
+ Kognitivní restrukturalizace (revision 13567360)
57
+ Kognitivně behaviorální terapie (revision 13980494)
58
+ Komorbidita (revision 11351714)
59
+ Lymská borelióza (revision 14068446)
60
+ Malé sebevědomí (revision 13567590)
61
+ Medical Subject Headings (revision 12239331)
62
+ Meditace (revision 13180783)
63
+ Mentální černý filtr (revision 13567590)
64
+ Mezinárodní klasifikace nemocí (revision 12531067)
65
+ Michael Liebowitz (revision 13567590)
66
+ Moclobemid (revision 13567590)
67
+ Moritova terapie (revision 11960292)
68
+ Musturbace (revision 13567590)
69
+ Nervozita (revision 13847097)
70
+ Noradrenalin (revision 14054165)
71
+ Obsedantně kompulzivní porucha (revision 13950365)
72
+ Panická ataka (revision 13253537)
73
+ Panická porucha (revision 13253537)
74
+ Paranoia (revision 14027052)
75
+ Paroxetin (revision 12955006)
76
+ Pohlavnost (revision 13564689)
77
+ Porucha (revision 11039108)
78
+ Pravděpodobnost (revision 13596041)
79
+ Predestinace (revision 12467403)
80
+ Profese (revision 13975485)
81
+ Propanolol (revision 12972658)
82
+ Psychiatr (revision 12767960)
83
+ Psychické trauma (revision 11227535)
84
+ Psychoaktivní droga (revision 13939232)
85
+ Psychodynamická léčba (revision 13567590)
86
+ Psychofarmaka (revision 9928215)
87
+ Psycholog (revision 12358728)
88
+ Psychoterapie (revision 13874178)
89
+ Puberta (revision 12540014)
90
+ RIMA (revision 10234728)
91
+ Remise (revision 9896748)
92
+ Richard Heimberg (revision 13567590)
93
+ Rámování myšlenek (revision 13567590)
94
+ Schizofrenie (revision 13977456)
95
+ Sebevražda (revision 14053884)
96
+ Selektivní abstrakce (revision 13567590)
97
+ Selektivní inhibitor zpětného vychytávání serotoninu (revision 12955027)
98
+ Serotonin (revision 13975104)
99
+ Sertralin (revision 12955006)
100
+ Skupinová terapie (revision 11964235)
101
+ Sociální chování (revision 13507313)
102
+ Sociální dovednost (revision 12226347)
103
+
104
+ == End of Parsed pages ==
105
+
106
+ - Wikipedia parsing ended at: 2016-09-21 03:28:11.731386
107
+
108
+ 47 characters appeared 594800 times.
109
+
110
+ First 41 characters:
111
+ [ 0] Char o: 8.323806321452588 %
112
+ [ 1] Char e: 8.040013449899126 %
113
+ [ 2] Char n: 6.895595158036315 %
114
+ [ 3] Char a: 6.263113651647613 %
115
+ [ 4] Char i: 5.650470746469401 %
116
+ [ 5] Char t: 5.40383322125084 %
117
+ [ 6] Char s: 4.588937457969065 %
118
+ [ 7] Char v: 3.8685272360457295 %
119
+ [ 8] Char p: 3.6914929388029587 %
120
+ [ 9] Char r: 3.6302958977807664 %
121
+ [10] Char l: 3.6017148621385338 %
122
+ [11] Char í: 3.5733019502353733 %
123
+ [12] Char k: 3.301950235373235 %
124
+ [13] Char u: 3.1782111634162744 %
125
+ [14] Char c: 3.1383658372562206 %
126
+ [15] Char d: 3.120208473436449 %
127
+ [16] Char m: 2.758406186953598 %
128
+ [17] Char h: 2.2747141896435776 %
129
+ [18] Char á: 2.156186953597848 %
130
+ [19] Char z: 2.0260591795561536 %
131
+ [20] Char y: 1.9894082044384667 %
132
+ [21] Char j: 1.8979488903833224 %
133
+ [22] Char b: 1.8189307330195021 %
134
+ [23] Char ě: 1.277236045729657 %
135
+ [24] Char é: 1.2291526563550772 %
136
+ [25] Char č: 0.9502353732347008 %
137
+ [26] Char ž: 0.9214862138533961 %
138
+ [27] Char ř: 0.8955951580363146 %
139
+ [28] Char ý: 0.7646267652992602 %
140
+ [29] Char š: 0.6605581708137189 %
141
+ [30] Char f: 0.6260928043039677 %
142
+ [31] Char ů: 0.5016812373907196 %
143
+ [32] Char g: 0.47041022192333554 %
144
+ [33] Char ú: 0.19502353732347008 %
145
+ [34] Char x: 0.13685272360457296 %
146
+ [35] Char ň: 0.05447209145931405 %
147
+ [36] Char w: 0.04488903833221251 %
148
+ [37] Char ó: 0.03429724277067922 %
149
+ [38] Char ť: 0.02269670477471419 %
150
+ [39] Char ď: 0.012104909213180902 %
151
+ [40] Char q: 0.007229320780094149 %
152
+
153
+ The first 41 characters have an accumulated ratio of 0.9999613315400132.
154
+
155
+ 1025 sequences found.
156
+
157
+ First 512 (typical positive ratio): 0.9786035192432675
158
+ Next 512 (512-1024): 1.6812373907195695e-06
159
+ Rest: 2.0246480655940202e-06
160
+
161
+ - Processing end: 2016-09-21 03:28:12.235582