cchardet 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,166 @@
1
+ = Logs of language model for Portuguese (pt) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-20 23:44:39.722451
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Papagaio-das-mascarenhas (revision 46763149)
11
+ Albinismo (revision 46498446)
12
+ Alfred Newton (revision 43617011)
13
+ Alphonse Milne-Edwards (revision 39740747)
14
+ Animalia (revision 46727732)
15
+ Asa (revision 46338820)
16
+ August von Pelzeln (revision 34726241)
17
+ Aves (revision 46728980)
18
+ Bico (revision 45311553)
19
+ Carl Wilhelm Hahn (revision 45025566)
20
+ Carlos Lineu (revision 46625396)
21
+ Carolus Linnaeus (revision 46625396)
22
+ Cauda (revision 43275401)
23
+ Charles Lucien Bonaparte (revision 45529712)
24
+ Chordata (revision 46640101)
25
+ Cladograma (revision 46700307)
26
+ Classe (biologia) (revision 46701409)
27
+ Classificação científica (revision 46306288)
28
+ Coleção Leverian (revision 45026647)
29
+ Comores (revision 46181501)
30
+ Coracopsinae (revision 36946101)
31
+ Coracopsis nigra (revision 44338845)
32
+ Coracopsis vasa (revision 42905822)
33
+ Cylindraspis indica (revision 42905410)
34
+ Cúlmen (revision 45311553)
35
+ Digital object identifier (revision 42172651)
36
+ Eclectus roratus (revision 44380798)
37
+ Edward Newton (revision 39261469)
38
+ Endemismo (revision 45260961)
39
+ Epíteto específico (revision 35101647)
40
+ Espécie (revision 45685675)
41
+ Esquilo-vermelho (revision 43489595)
42
+ Estado de conservação (revision 46662839)
43
+ Extinção (revision 46526607)
44
+ Família (biologia) (revision 46636004)
45
+ Filo (revision 46704246)
46
+ França (revision 46740839)
47
+ François-Nicolas Martinet (revision 43679514)
48
+ François Levaillant (revision 40142351)
49
+ Fredrik Hasselqvist (revision 44381122)
50
+ Fregilupus varius (revision 46555765)
51
+ Fumigação (revision 42458244)
52
+ George Robert Gray (revision 39047844)
53
+ Georges-Louis Leclerc, conde de Buffon (revision 45622418)
54
+ Género (biologia) (revision 45296588)
55
+ Hermann Schlegel (revision 43137605)
56
+ Herpetologista (revision 46207704)
57
+ Histoire Naturelle (revision 44293456)
58
+ Holótipo (revision 44029660)
59
+ Ilha da Reunião (revision 45458206)
60
+ Ilha vulcânica (revision 37924535)
61
+ Ilhas Mascarenhas (revision 45858660)
62
+ Ilhas Molucas (revision 45476933)
63
+ International Standard Book Number (revision 46326494)
64
+ Jacques Barraband (revision 45007769)
65
+ Jean Feuilley (revision 43140791)
66
+ Johann Georg Wagler (revision 34585234)
67
+ John Gerrard Keulemans (revision 39664498)
68
+ Julian Hume (revision 41876605)
69
+ Leiolopisma (revision 43997173)
70
+ Lionel Walter Rothschild (revision 46022922)
71
+ Lista Vermelha da IUCN (revision 46569884)
72
+ Lista Vermelha da União Internacional para a Conservação da Natureza e dos Recursos Naturais (revision 46569884)
73
+ Lista Vermelha de Espécies Ameaçadas da IUCN (revision 46569884)
74
+ Lista de aves extintas (revision 45507420)
75
+ Londres (revision 46310311)
76
+ Língua inglesa (revision 46609785)
77
+ Madagascar (revision 46617630)
78
+ Mascarenotus grucheti (revision 43145662)
79
+ Mathurin Jacques Brisson (revision 36018826)
80
+ Maurício (revision 46723599)
81
+ Maximiliano I José da Baviera (revision 46372080)
82
+ Melanina (revision 46762903)
83
+ Museu Nacional de História Natural (França) (revision 43731807)
84
+ Naturhistorisches Museum (revision 46694247)
85
+ Nesoenas duboisi (revision 43995805)
86
+ Nome científico (revision 46671641)
87
+ Nomenclatura binomial (revision 46671641)
88
+ Nycticorax duboisi (revision 43816214)
89
+ Nível do mar (revision 46414695)
90
+ Ordem (biologia) (revision 46360024)
91
+ Otto Finsch (revision 42362273)
92
+ Papagaio (revision 46738207)
93
+ Papagaio-cinzento (revision 46673943)
94
+ Papagaio-cinzento-de-maurício (revision 46664408)
95
+ Pedro Mascarenhas (c. 1484-1555) (revision 45541977)
96
+ Periquito-de-maurício (revision 43010883)
97
+ Periquito-de-reunião (revision 43048764)
98
+ Peter Mundy (revision 43563846)
99
+ Piton des Neiges (revision 45632497)
100
+ Pleistoceno (revision 45916874)
101
+ Plumagem (revision 34951058)
102
+ Ponto quente (revision 45375495)
103
+ Porphyrio coerulescens (revision 43672493)
104
+ Praslin (revision 40728143)
105
+ Psitacídeos (revision 46598835)
106
+ Psittaciformes (revision 46598835)
107
+ Psittacula (revision 42856453)
108
+ Psittaculinae (revision 46760737)
109
+ Psittaculini (revision 43015966)
110
+ Psittrichasiidae (revision 44385977)
111
+
112
+ == End of Parsed pages ==
113
+
114
+ - Wikipedia parsing ended at: 2016-09-20 23:47:27.346826
115
+
116
+ 51 characters appeared 558324 times.
117
+
118
+ First 38 characters:
119
+ [ 0] Char a: 11.864795351802895 %
120
+ [ 1] Char e: 11.44604208309154 %
121
+ [ 2] Char o: 9.868284365350585 %
122
+ [ 3] Char s: 8.346587286235232 %
123
+ [ 4] Char i: 7.118089138206489 %
124
+ [ 5] Char r: 6.394136737808154 %
125
+ [ 6] Char n: 5.568272186042513 %
126
+ [ 7] Char d: 5.243192125002687 %
127
+ [ 8] Char t: 4.80061756256224 %
128
+ [ 9] Char m: 4.498105042949971 %
129
+ [10] Char c: 3.9747530107965985 %
130
+ [11] Char u: 3.7229279056605127 %
131
+ [12] Char l: 3.207814817202914 %
132
+ [13] Char p: 2.77562848811801 %
133
+ [14] Char g: 1.3850380782484721 %
134
+ [15] Char v: 1.3210967108703908 %
135
+ [16] Char f: 1.122466524813549 %
136
+ [17] Char b: 0.9702251739133549 %
137
+ [18] Char h: 0.9130898904578704 %
138
+ [19] Char é: 0.7026386112723079 %
139
+ [20] Char ã: 0.7022803963290133 %
140
+ [21] Char q: 0.5903382265494588 %
141
+ [22] Char ç: 0.5856814322866293 %
142
+ [23] Char í: 0.41391736697688086 %
143
+ [24] Char x: 0.3913498255493226 %
144
+ [25] Char á: 0.34567742027926435 %
145
+ [26] Char z: 0.3170202248156984 %
146
+ [27] Char ó: 0.22925756370852768 %
147
+ [28] Char j: 0.20454073262120204 %
148
+ [29] Char ê: 0.20239144296143458 %
149
+ [30] Char õ: 0.16155493942585308 %
150
+ [31] Char y: 0.15080849112701586 %
151
+ [32] Char w: 0.09241945537000021 %
152
+ [33] Char ú: 0.08794176857881804 %
153
+ [34] Char k: 0.08364318925928313 %
154
+ [35] Char â: 0.07898639499645367 %
155
+ [36] Char à: 0.06859816164091102 %
156
+ [37] Char ô: 0.031164700066627977 %
157
+
158
+ The first 38 characters have an accumulated ratio of 0.9998137282294869.
159
+
160
+ 891 sequences found.
161
+
162
+ First 512 (typical positive ratio): 0.9953179582313172
163
+ Next 512 (512-1024): 1.7910747164728723e-06
164
+ Rest: 2.42861286636753e-17
165
+
166
+ - Processing end: 2016-09-20 23:47:27.489355
@@ -0,0 +1,153 @@
1
+ = Logs of language model for Romanian (ro) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-28 18:53:56.086095
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ The Loving Kind (revision 10166481)
11
+ 12 ianuarie (revision 10711676)
12
+ 13 decembrie (revision 9938353)
13
+ 2007 (revision 10716321)
14
+ 2008 (revision 10752084)
15
+ 2009 (revision 10654003)
16
+ 21 noiembrie (revision 10447643)
17
+ 25 ianuarie (revision 10228199)
18
+ 31 ianuarie (revision 10718063)
19
+ 4 Music (revision 9701591)
20
+ Billboard (revision 10505294)
21
+ Biology (revision 10112430)
22
+ Bulgaria (revision 10481051)
23
+ CD (revision 10477531)
24
+ Call The Shots (revision 10101027)
25
+ Call the Shots (revision 10101027)
26
+ Can't Speak French (revision 9721506)
27
+ Casă de discuri (revision 10611348)
28
+ Channel 4 (revision 7953101)
29
+ Chemistry (revision 10112479)
30
+ Cheryl Cole (revision 10475016)
31
+ Chitară (revision 10468266)
32
+ Croația (revision 10737746)
33
+ Dance (revision 10231736)
34
+ Descărcare digitală (revision 10100743)
35
+ Digital Spy (revision 9044016)
36
+ Discografia Girls Aloud (revision 10172788)
37
+ Estonia (revision 10749810)
38
+ Europa (revision 10752724)
39
+ Fascination Records (revision 9655292)
40
+ Fiona Phillips (revision 5384082)
41
+ Gen muzical (revision 10534645)
42
+ Girls A Live (revision 10112444)
43
+ Girls Aloud (revision 10112446)
44
+ Good Morning Television (revision 10166481)
45
+ Heat World (revision 10166481)
46
+ I'll Stand By You (cântec de Girls Aloud) (revision 10112432)
47
+ ITunes (revision 10744174)
48
+ I Think We're Alone Now (revision 10112427)
49
+ Irlanda (revision 10573806)
50
+ Jump (cântec de Girls Aloud) (revision 10112438)
51
+ Lady GaGa (revision 10753010)
52
+ Life Got Cold (revision 10112437)
53
+ Limba engleză (revision 10756676)
54
+ Long Hot Summer (revision 10112429)
55
+ Love Machine (revision 10112433)
56
+ MSN Search (revision 10653298)
57
+ MTV (revision 10170766)
58
+ Mixed Up (revision 10112443)
59
+ Muzică electronică (revision 10608432)
60
+ Muzică pop (revision 10740529)
61
+ Nadine Coyle (revision 10316187)
62
+ Neil Tennant (revision 10499980)
63
+ No Good Advice (revision 10112436)
64
+ Out Of Control (revision 10112484)
65
+ Out of Control (revision 10112484)
66
+ Pet Shop Boys (revision 10612741)
67
+ Poker Face (revision 10496402)
68
+ PopJustice (revision 10625677)
69
+ Regatul Unit (revision 10752338)
70
+ Regatul Unit al Marii Britanii și Irlandei de Nord (revision 10752338)
71
+ Regatul Unit al Marii Britanii și al Irlandei de Nord (revision 10752338)
72
+ Republica Irlanda (revision 10573806)
73
+ Romanian Top 100 (revision 10736281)
74
+ România (revision 10732435)
75
+ Sarah Harding (revision 10633651)
76
+ Sarah Hearding (revision 10112425)
77
+ See the Day (revision 10112431)
78
+ Sexy! No No No... (revision 10112425)
79
+ Slant Magazine (revision 7697473)
80
+ Slovenia (revision 10521499)
81
+ Something Kinda Ooooh (revision 10112426)
82
+ Sound of the Underground (album) (revision 10112476)
83
+ Sound of the Underground (cântec) (revision 10112434)
84
+ Tangled Up (revision 10112482)
85
+ The Guardian (revision 9752334)
86
+ The Paul O'Grady Show (revision 10101027)
87
+ The Promise (revision 10166482)
88
+ The Show (revision 10112441)
89
+ The Sound of Girls Aloud (revision 10112480)
90
+ Tonalitate (revision 9966362)
91
+ Turneul Out of Control (revision 10112446)
92
+ UK Mix (revision 9721468)
93
+ UK Singles Chart (revision 10226705)
94
+ Ungaria (revision 10737745)
95
+ Uniunea Europeană (revision 10751590)
96
+ Untouchable (revision 10112410)
97
+ Wake Me Up (revision 10112439)
98
+ What Will The Neighbours Say? (revision 10112478)
99
+ Whole Lotta History (revision 10475020)
100
+ Wideboys (revision 10166481)
101
+ Wikimedia Commons (revision 9703907)
102
+ Xenomania (revision 10112484)
103
+
104
+ == End of Parsed pages ==
105
+
106
+ - Wikipedia parsing ended at: 2016-09-28 18:58:13.756622
107
+
108
+ 60 characters appeared 883554 times.
109
+
110
+ First 33 characters:
111
+ [ 0] Char e: 11.67014127036944 %
112
+ [ 1] Char i: 10.97567324690964 %
113
+ [ 2] Char a: 10.080198833348046 %
114
+ [ 3] Char r: 7.490657050955572 %
115
+ [ 4] Char n: 7.18246988865423 %
116
+ [ 5] Char t: 6.516296683620921 %
117
+ [ 6] Char l: 5.595130574928075 %
118
+ [ 7] Char u: 5.551217016730161 %
119
+ [ 8] Char o: 4.922732509840938 %
120
+ [ 9] Char c: 4.495707110148333 %
121
+ [10] Char s: 3.8308920563994957 %
122
+ [11] Char d: 3.590499279048027 %
123
+ [12] Char m: 2.971408651876399 %
124
+ [13] Char p: 2.902369294915761 %
125
+ [14] Char ă: 2.1349006399156134 %
126
+ [15] Char g: 1.2248261000459508 %
127
+ [16] Char f: 1.1199089133205216 %
128
+ [17] Char b: 1.0781457613230203 %
129
+ [18] Char ț: 1.0323081554721047 %
130
+ [19] Char ș: 0.9732285745975912 %
131
+ [20] Char î: 0.97017273420753 %
132
+ [21] Char v: 0.9693804792915882 %
133
+ [22] Char z: 0.7369102510995367 %
134
+ [23] Char h: 0.533413916976212 %
135
+ [24] Char â: 0.4986678799484808 %
136
+ [25] Char x: 0.22081276300033725 %
137
+ [26] Char j: 0.20055367300696958 %
138
+ [27] Char k: 0.1901411798260208 %
139
+ [28] Char y: 0.15471606715605385 %
140
+ [29] Char w: 0.11827234102273318 %
141
+ [30] Char á: 0.016297815413658927 %
142
+ [31] Char é: 0.013355154297303842 %
143
+ [32] Char q: 0.00520624659047438 %
144
+
145
+ The first 33 characters have an accumulated ratio of 0.9996661211425673.
146
+
147
+ 981 sequences found.
148
+
149
+ First 512 (typical positive ratio): 0.997762564143313
150
+ Next 512 (512-1024): 1.1317927370596478e-06
151
+ Rest: 3.0357660829594124e-18
152
+
153
+ - Processing end: 2016-09-28 18:58:13.862425
@@ -0,0 +1,158 @@
1
+ = Logs of language model for Slovak (sk) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-21 13:26:28.712674
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Dôkaz (matematika) (revision 6358810)
11
+ 1825 (revision 6122752)
12
+ 1839 (revision 6165808)
13
+ 1847 (revision 5941780)
14
+ 1852 (revision 5941777)
15
+ 1878 (revision 6221358)
16
+ 1955 (revision 6226609)
17
+ 1976 (revision 6310709)
18
+ 1983 (revision 6356952)
19
+ 1993 (revision 6348358)
20
+ 1995 (revision 6277350)
21
+ 2012 (revision 6291145)
22
+ Adrien-Marie Legendre (revision 6060342)
23
+ Algebra (revision 6319238)
24
+ Algebraická geometria (revision 5964212)
25
+ Algebraická rovnica (revision 5288111)
26
+ Algebrické číslo (revision 6106622)
27
+ Algoritmus (revision 6286937)
28
+ Andrew Wiles (revision 5791970)
29
+ Arabi (revision 6044956)
30
+ Arabčina (revision 6322514)
31
+ Aristoteles (revision 6359959)
32
+ Arthur Cayley (revision 6332355)
33
+ Axióma (revision 6338092)
34
+ Babylonia (revision 6168813)
35
+ Bernard Bolzano (revision 6261374)
36
+ Boh (revision 6282272)
37
+ Bolzanova veta (revision 6345299)
38
+ Bytie (revision 5274918)
39
+ Byzantská ríša (revision 6359782)
40
+ Caroline Blundenová (revision 6358810)
41
+ Cauchyho postupnosť (revision 6215169)
42
+ Celé číslo (revision 6302805)
43
+ Charles Hermite (revision 5751036)
44
+ Daniel Marcus (revision 5657431)
45
+ David Hilbert (revision 5968866)
46
+ Dedukcia (revision 6338099)
47
+ Definícia (revision 6106684)
48
+ Derivácia (funkcia) (revision 5970574)
49
+ Desiatková číselná sústava (revision 5924486)
50
+ Diofantická rovnica (revision 6327292)
51
+ Dynastia Chan (revision 6342042)
52
+ Dôkaz (logika) (revision 5495754)
53
+ Dôkaz sporom (revision 5940134)
54
+ Dôkaz výpočtom (revision 6358810)
55
+ Energia (revision 6277761)
56
+ Eric Weisstein (revision 6054413)
57
+ Ernst Kummer (revision 6001344)
58
+ Európa (revision 6295124)
59
+ Experiment (revision 6354302)
60
+ Fenomén (filozofia) (revision 5420897)
61
+ Filozofia (revision 6296369)
62
+ Formula (logika) (revision 3916562)
63
+ Formálny dôkaz (revision 6358810)
64
+ Formálny jazyk (revision 5623029)
65
+ Gabriel Cramer (revision 5923903)
66
+ Galoisova teória (revision 6353573)
67
+ Gentzenovský kalkul (revision 6358810)
68
+ Geometria (revision 5970028)
69
+ Geometrický dôkaz (revision 6358810)
70
+ Georg Ferdinand Cantor (revision 6186696)
71
+ Giordano Bruno (revision 6312876)
72
+ Gottlob Frege (revision 5968855)
73
+ Gödelova veta o neúplnosti (revision 5323549)
74
+ Hardvér (revision 6214401)
75
+ Henri Poincaré (revision 6315506)
76
+ Hilbertovský kalkul (revision 6358810)
77
+ Hmotnosť (revision 5979540)
78
+ Hypotéza (revision 5983410)
79
+ Idea (revision 5960449)
80
+ India (revision 6362189)
81
+ Intuícia (revision 5837951)
82
+ Jazyk (lingvistika) (revision 6073293)
83
+ John Taylor (revision 6355518)
84
+ Kardinálne číslo (revision 6090126)
85
+ Kenneth Appel (revision 5968422)
86
+ Klasická mechanika (revision 6295646)
87
+ Konečná množina (revision 5276494)
88
+ Konfucianizmus (revision 5968816)
89
+ Kresťanstvo (revision 6289571)
90
+ Langlandsov program (revision 6088475)
91
+ Latinčina (revision 6121105)
92
+ Leonhard Euler (revision 6339382)
93
+ Lineárna algebra (revision 5473535)
94
+ Logická axióma (revision 5495754)
95
+ Logický kalkul (revision 1608550)
96
+
97
+ == End of Parsed pages ==
98
+
99
+ - Wikipedia parsing ended at: 2016-09-21 13:33:10.330458
100
+
101
+ 62 characters appeared 550293 times.
102
+
103
+ First 45 characters:
104
+ [ 0] Char o: 8.867094438780795 %
105
+ [ 1] Char a: 8.59705647718579 %
106
+ [ 2] Char e: 8.562347694773512 %
107
+ [ 3] Char n: 6.0867574183207855 %
108
+ [ 4] Char i: 5.828531346028389 %
109
+ [ 5] Char t: 5.366595613609477 %
110
+ [ 6] Char r: 4.977711873492848 %
111
+ [ 7] Char k: 4.264273759615332 %
112
+ [ 8] Char s: 4.257731790155426 %
113
+ [ 9] Char v: 4.117079446767449 %
114
+ [10] Char l: 3.5979014815743615 %
115
+ [11] Char d: 3.416361829061972 %
116
+ [12] Char m: 3.2513588215732345 %
117
+ [13] Char p: 2.878466562358598 %
118
+ [14] Char u: 2.5987973679476206 %
119
+ [15] Char c: 2.419438371921867 %
120
+ [16] Char z: 2.127412124086623 %
121
+ [17] Char h: 2.0687161203213558 %
122
+ [18] Char j: 2.0312815173007834 %
123
+ [19] Char y: 1.6700194260148686 %
124
+ [20] Char b: 1.6574806512167153 %
125
+ [21] Char á: 1.6422160558102683 %
126
+ [22] Char ý: 1.2564215790497062 %
127
+ [23] Char í: 1.1326693234331529 %
128
+ [24] Char č: 0.9473135220691523 %
129
+ [25] Char é: 0.8913433389121795 %
130
+ [26] Char ž: 0.7668641978000811 %
131
+ [27] Char ú: 0.6949025337411161 %
132
+ [28] Char š: 0.6785476100913513 %
133
+ [29] Char f: 0.6514711253822963 %
134
+ [30] Char g: 0.6096752093884531 %
135
+ [31] Char ť: 0.46375294615777407 %
136
+ [32] Char ô: 0.4172322744428877 %
137
+ [33] Char ľ: 0.36053520579036985 %
138
+ [34] Char x: 0.23114958758334195 %
139
+ [35] Char ó: 0.2251527822450949 %
140
+ [36] Char ň: 0.09304134342977287 %
141
+ [37] Char w: 0.09013380144759246 %
142
+ [38] Char ä: 0.0694175648245571 %
143
+ [39] Char ď: 0.06560141597294532 %
144
+ [40] Char q: 0.01726353051919614 %
145
+ [41] Char ě: 0.009994675563745132 %
146
+ [42] Char ĺ: 0.009267790068200032 %
147
+ [43] Char ö: 0.008904347320427481 %
148
+ [44] Char ŕ: 0.00599680533824708 %
149
+
150
+ The first 45 characters have an accumulated ratio of 0.9998128269848972.
151
+
152
+ 1181 sequences found.
153
+
154
+ First 512 (typical positive ratio): 0.9733303573968434
155
+ Next 512 (512-1024): 1.8172137388627513e-06
156
+ Rest: 0.0003522983638913346
157
+
158
+ - Processing end: 2016-09-21 13:33:10.831531