cchardet 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,158 @@
1
+ = Logs of language model for Danish (da) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-02-19 17:53:58.564190
5
+ - Maximum depth: 4
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Forside (revision 2692411)
11
+ 16. februar (revision 6877446)
12
+ 17. februar (revision 8454583)
13
+ 1878 (revision 8280505)
14
+ 19. februar (revision 8206479)
15
+ 1922 (revision 8455105)
16
+ 1926 (revision 8425271)
17
+ 1942 (revision 8443554)
18
+ 1945 (revision 8448461)
19
+ 1948 (revision 8454392)
20
+ 1985 (revision 8409096)
21
+ 2. verdenskrig (revision 8433181)
22
+ 23. oktober (revision 6877825)
23
+ 26. oktober (revision 7849938)
24
+ 3C 273 (revision 8443798)
25
+ A-bus (revision 8427319)
26
+ Aktuelle begivenheder (revision 8440596)
27
+ B-52 Stratofortress (revision 8422571)
28
+ Borgerkrigen i Syrien (revision 8447763)
29
+ Boutros Boutros-Ghali (revision 8453935)
30
+ Brasilien (revision 8452750)
31
+ Cusco (region) (revision 7693764)
32
+ Danmark (revision 8451178)
33
+ Danmark i Eurovision Song Contest (revision 8453514)
34
+ Dansk (sprog) (revision 8455750)
35
+ Dansk Melodi Grand Prix 2016 (revision 8452164)
36
+ Dobbeltmordet på Peter Bangs Vej (revision 8334648)
37
+ Encyklopædi (revision 8446641)
38
+ Eritrea-sagen (revision 8452285)
39
+ Eurovision Song Contest 2014 (revision 8445804)
40
+ Eurovision Song Contest 2016 (revision 8453588)
41
+ Flygtningekrisen i Europa 2015 (revision 8452286)
42
+ Fonograf (revision 8177165)
43
+ Formel 1 (revision 8450846)
44
+ Formel 1 2016 (revision 8456463)
45
+ Frederik 6. (revision 8438503)
46
+ Første observation af gravitationsbølger (revision 8451269)
47
+ Grammofon (revision 8375093)
48
+ Guadalcanal (revision 7796248)
49
+ Harper Lee (revision 8456583)
50
+ Hartkorn (revision 8437552)
51
+ IC4 (revision 8446402)
52
+ IC4-sagen (revision 8434463)
53
+ Islamisk Stat (revision 8439228)
54
+ Jonathan Leunbach (revision 8452603)
55
+ Juliane Marie af Braunschweig-Wolfenbüttel (revision 8437957)
56
+ Kaliumklorid (revision 8452216)
57
+ Kejserriget Japan (revision 8044942)
58
+ Kevin Magnussen (revision 8455302)
59
+ København (revision 8427847)
60
+ LIGO (revision 8451266)
61
+ Latinamerika (revision 7692181)
62
+ Leonid Hurwicz (revision 8445727)
63
+ Lighthouse X (revision 8452940)
64
+ Linkoban (revision 8455879)
65
+ Machu Picchu (revision 8406907)
66
+ Matador (tv-serie) (revision 8454648)
67
+ Middelaldercentret (revision 8449194)
68
+ Nobelprisen (revision 8409809)
69
+ Nykøbing Falster (revision 8452825)
70
+ Nyligt afdøde (revision 8456580)
71
+ Overvågning (revision 8455039)
72
+ Panorama (foto) (revision 8448393)
73
+ Peru (revision 8437485)
74
+ Peter Lauritsen (revision 8456097)
75
+ Professor (revision 8415451)
76
+ Renault F1 (revision 8450843)
77
+ S-bus (revision 8455589)
78
+ Salomonøerne (revision 8238961)
79
+ Slaget om Belgien (1940) (revision 8430013)
80
+ Slaget om Guadalcanal (revision 7762887)
81
+ Slaget om Henderson Field (revision 8445480)
82
+ Slaget om Iwo Jima (revision 8145239)
83
+ Soldiers of Love (Lighthouse X-sang) (revision 8452929)
84
+ Solen (revision 8276478)
85
+ Stillehavskrigen (revision 8430649)
86
+ Stockholm (revision 8358042)
87
+ Søslaget ved Guadalcanal (revision 7772812)
88
+ Thomas Edison (revision 8282441)
89
+ Togulykken ved Bad Aibling (revision 8455364)
90
+ Topografi (revision 6886168)
91
+ USA (revision 8448088)
92
+ United States Army (revision 8401635)
93
+ United States Marine Corps (revision 8401667)
94
+ Vestallierede (revision 6961443)
95
+ Wikimedia (revision 8263252)
96
+ Wikipedia (revision 8267051)
97
+ Zikavirus (revision 8454832)
98
+ 1. februar (revision 8404985)
99
+ 10. februar (revision 6877431)
100
+ 11. februar (revision 6877433)
101
+ 12. februar (revision 6877437)
102
+ 13. februar (revision 6877438)
103
+ 14. februar (revision 6877441)
104
+ 1497 (revision 7369489)
105
+ 15. februar (revision 7329463)
106
+ 1560 (revision 7874693)
107
+ 1568 (revision 7369703)
108
+ 1620 (revision 7423903)
109
+ 1688 (revision 7367090)
110
+ 18. februar (revision 6877450)
111
+
112
+ == End of Parsed pages ==
113
+
114
+ - Wikipedia parsing ended at: 2016-02-19 17:56:42.162636
115
+
116
+ 53 characters appeared 1301488 times.
117
+
118
+ First 30 characters:
119
+ [ 0] Char e: 15.272749345364689 %
120
+ [ 1] Char r: 8.48482659847805 %
121
+ [ 2] Char n: 7.695652975670924 %
122
+ [ 3] Char t: 6.977014002434137 %
123
+ [ 4] Char a: 6.780469739252302 %
124
+ [ 5] Char i: 6.164636170291236 %
125
+ [ 6] Char s: 6.0942551909814 %
126
+ [ 7] Char d: 5.953493232361728 %
127
+ [ 8] Char l: 5.076650725938311 %
128
+ [ 9] Char o: 4.883026197706011 %
129
+ [10] Char g: 4.012253666572415 %
130
+ [11] Char k: 3.232607599916403 %
131
+ [12] Char m: 3.0863135119186653 %
132
+ [13] Char f: 2.701600014752345 %
133
+ [14] Char v: 2.13970470722742 %
134
+ [15] Char b: 1.982423195603801 %
135
+ [16] Char u: 1.8339777239590376 %
136
+ [17] Char p: 1.5789619266562582 %
137
+ [18] Char h: 1.3433085821767086 %
138
+ [19] Char ø: 0.8730775850411222 %
139
+ [20] Char y: 0.859938777768216 %
140
+ [21] Char å: 0.7699648402443973 %
141
+ [22] Char æ: 0.7208671920140639 %
142
+ [23] Char j: 0.644108896893402 %
143
+ [24] Char c: 0.5698093259407694 %
144
+ [25] Char w: 0.11087309295206717 %
145
+ [26] Char z: 0.05309307500338075 %
146
+ [27] Char x: 0.032424424965885205 %
147
+ [28] Char é: 0.032193919575132464 %
148
+ [29] Char q: 0.012139950579644223 %
149
+
150
+ The first 30 characters have an accumulated ratio of 0.9997241618823994.
151
+
152
+ 964 sequences found.
153
+
154
+ First 512 (typical positive ratio): 0.9968082796759031
155
+ Next 512 (512-1024): 7.68351302509128e-07
156
+ Rest: 3.903127820947816e-17
157
+
158
+ - Processing end: 2016-02-19 17:56:42.304278
@@ -0,0 +1,110 @@
1
+ = Logs of language model for Esperanto (eo) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2015-12-04 01:22:51.466573
5
+ - Maximum depth: 3
6
+ - Max number of pages: 50
7
+
8
+ == Parsed pages ==
9
+
10
+ Vikipedio:Ĉefpaĝo (revision 5524911)
11
+ 10-a de novembro (revision 5792999)
12
+ 12-a de novembro (revision 5793854)
13
+ 13-a de novembro (revision 5795088)
14
+ 18-a de novembro (revision 5796972)
15
+ 2-a de novembro (revision 5772615)
16
+ 20-a de novembro (revision 5799664)
17
+ 2015 (revision 5791963)
18
+ 22-a de novembro (revision 5799355)
19
+ 24-a de novembro (revision 5800563)
20
+ 4-a de decembro (revision 5806422)
21
+ 4-a de novembro (revision 5789811)
22
+ 5-a de novembro (revision 5789774)
23
+ 6-a de novembro (revision 5790336)
24
+ 7-a de novembro (revision 5791066)
25
+ 8-a de novembro (revision 5791337)
26
+ 9-a de novembro (revision 5791916)
27
+ A Night at the Opera (Queen) (revision 5184272)
28
+ Abdelhamid Abaaoud (revision 5800134)
29
+ André Glucksmann (revision 5792591)
30
+ Anglio (revision 5693468)
31
+ Argentino (revision 5804665)
32
+ Atencoj de novembro 2015 en Parizo (revision 5800135)
33
+ Aung San Suu Kyi (revision 5791362)
34
+ Austin FX4 (revision 5583207)
35
+ Azilo (revision 5751210)
36
+ Aŭstrio (revision 5804014)
37
+ Bahio (revision 5773065)
38
+ Bamako (revision 5798202)
39
+ Bataclan (revision 5795605)
40
+ Bejruto (revision 5774306)
41
+ Birmo (revision 5790386)
42
+ Blonda (revision 5441229)
43
+ Bohemian rhapsody (revision 5654078)
44
+ Cayetano Redondo (revision 5591025)
45
+ Ciro la 2-a (revision 5774667)
46
+ DJ Abdel (revision 5628860)
47
+ Daniela Mercury (revision 5764721)
48
+ Decembro de 2015 (revision 5626904)
49
+ Dilatkoeficiento (revision 5806460)
50
+ Eksproprietigo (revision 5586845)
51
+ Elektroniko (revision 5788966)
52
+ Elle s'appelait Sarah (filmo) (revision 5475154)
53
+ Esperanto (revision 5804190)
54
+ Federaciero (revision 5696168)
55
+ Fondaĵo Vikimedio (revision 5772681)
56
+ Francio (revision 5759775)
57
+ François Hollande (revision 5627721)
58
+
59
+ == End of Parsed pages ==
60
+
61
+ - Wikipedia parsing ended at: 2015-12-04 01:27:38.176708
62
+
63
+ 56 characters appeared 342524 times.
64
+
65
+ First 35 characters:
66
+ [ 0] Char a: 12.557952143499435 %
67
+ [ 1] Char o: 9.84719318938235 %
68
+ [ 2] Char e: 9.10242785906973 %
69
+ [ 3] Char i: 8.362333734278474 %
70
+ [ 4] Char n: 7.6359612757062285 %
71
+ [ 5] Char r: 6.630192336887342 %
72
+ [ 6] Char t: 5.70821314710794 %
73
+ [ 7] Char l: 5.610409781504361 %
74
+ [ 8] Char s: 5.004320865107262 %
75
+ [ 9] Char k: 3.8855671427403626 %
76
+ [10] Char d: 3.7194473963868226 %
77
+ [11] Char j: 3.28531723324497 %
78
+ [12] Char u: 2.8465158645817517 %
79
+ [13] Char m: 2.787833845219605 %
80
+ [14] Char p: 2.6582078920017285 %
81
+ [15] Char g: 1.6825098387266293 %
82
+ [16] Char v: 1.4048650605505015 %
83
+ [17] Char c: 1.3823848839789328 %
84
+ [18] Char b: 1.1406499982482978 %
85
+ [19] Char f: 1.077296773364786 %
86
+ [20] Char z: 0.7342551178895493 %
87
+ [21] Char h: 0.6735294461118053 %
88
+ [22] Char ĝ: 0.53572888323154 %
89
+ [23] Char ŭ: 0.4268314045147202 %
90
+ [24] Char ĉ: 0.33545094650301877 %
91
+ [25] Char y: 0.17079095187490512 %
92
+ [26] Char ŝ: 0.15327393116978666 %
93
+ [27] Char w: 0.1442234704721421 %
94
+ [28] Char ĵ: 0.1039343228503696 %
95
+ [29] Char á: 0.0814541462788009 %
96
+ [30] Char ó: 0.05430276418586727 %
97
+ [31] Char é: 0.053718863495696656 %
98
+ [32] Char q: 0.04350060141771087 %
99
+ [33] Char x: 0.040873048311943105 %
100
+ [34] Char ĥ: 0.03824549520617533 %
101
+
102
+ The first 35 characters have an accumulated ratio of 0.9991971365510156.
103
+
104
+ 989 sequences found.
105
+
106
+ First 512 (typical positive ratio): 0.9942980632768038
107
+ Next 512 (512-1024): 0.0015327393116978665
108
+ Rest: -5.0306980803327406e-17
109
+
110
+ - Processing end: 2015-12-04 01:27:38.307198
@@ -0,0 +1,159 @@
1
+ = Logs of language model for Estonian (et) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-26 23:45:22.351942
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Harilik pohl (revision 4248853)
11
+ A-vitamiin (revision 4330862)
12
+ Aasta keskmine sademete hulk (revision 4266801)
13
+ Aasta keskmine õhutemperatuur (revision 3902142)
14
+ Ahm (revision 4343671)
15
+ Ain Raal (revision 4464651)
16
+ Alalehed (revision 2892741)
17
+ Alamliik (revision 3522810)
18
+ Alaska (revision 4216575)
19
+ Aleksander Heintalu (revision 4445156)
20
+ Aleuudid (revision 4335893)
21
+ Ameerika jänes (revision 4325220)
22
+ Ameerika valgejänes (revision 4355263)
23
+ Anneli Sihvart (revision 4211078)
24
+ Arbutiin (revision 4451788)
25
+ Baribal (revision 4268462)
26
+ Bensoehape (revision 3810308)
27
+ Binaarne nomenklatuur (revision 3970950)
28
+ C-vitamiin (revision 4444353)
29
+ Droog (revision 4352968)
30
+ E-vitamiin (revision 4336726)
31
+ Eesti (revision 4474984)
32
+ Eesti Entsüklopeediakirjastus (revision 4012421)
33
+ Eesti köök (revision 4314947)
34
+ Ellips (revision 4272113)
35
+ Emakakael (botaanika) (revision 3521516)
36
+ Euraasia (revision 3710768)
37
+ Fenoloogia (revision 3512905)
38
+ Folaadid (revision 4266628)
39
+ Fosfor (revision 4270122)
40
+ Fotosüntees (revision 4380600)
41
+ Fruktoos (revision 4285660)
42
+ Glükoos (revision 4047315)
43
+ Gneiss (revision 4333338)
44
+ Graniit (revision 4435351)
45
+ Gröönimaa (revision 4331557)
46
+ Halljänes (revision 4051603)
47
+ Haned (revision 4127680)
48
+ Happeline keskkond (revision 2966453)
49
+ Heilongjiang (revision 4342364)
50
+ Hendrik Relve (revision 4342591)
51
+ Hiina (revision 4448121)
52
+ Holland (revision 4307885)
53
+ Hunt (revision 4427752)
54
+ Hõimkond (revision 3489569)
55
+ Hüdrofiilsus (revision 4309797)
56
+ Ida-Euroopa (revision 4337624)
57
+ Ida-sinilind (revision 4248853)
58
+ Ida-vöötorav (revision 3520679)
59
+ Igihaljus (revision 3536500)
60
+ Ilves (revision 4404632)
61
+ Imetaja (revision 4289188)
62
+ Indiaanlased (revision 4479868)
63
+ Indrek Rohtmets (revision 4218674)
64
+ Itaalia (revision 4404119)
65
+ Jaapan (revision 4465542)
66
+ Jilin (revision 3894473)
67
+ Jood (revision 4025060)
68
+ Juurestik (revision 3341159)
69
+ Jääkaru (revision 4372399)
70
+ Jõhvikas (revision 4391549)
71
+ Kaalium (revision 4486067)
72
+ Kaheidulehelised (revision 4031352)
73
+ Kaheli õiekate (revision 3063362)
74
+ Kahesuguline õis (revision 3383221)
75
+ Kaitsestaatus (revision 3527096)
76
+ Kajakas (revision 4456839)
77
+ Kalorsus (revision 3843290)
78
+ Kaltsium (revision 4339861)
79
+ Kanada (revision 4434682)
80
+ Kanalised (revision 3616579)
81
+ Kanarbikulaadsed (revision 4318215)
82
+ Kanarbikulised (revision 3534760)
83
+ Karboksüülhapped (revision 3659011)
84
+ Karoteen (revision 4347634)
85
+ Kasvuperiood (revision 4231717)
86
+ Katteseemnetaimed (revision 4176294)
87
+ Kaukasus (revision 4476003)
88
+ Kesk-Euroopa (revision 3580746)
89
+ Kimalane (revision 4261145)
90
+ Kiudained (toit) (revision 3538655)
91
+ Klass (bioloogia) (revision 3489567)
92
+ Kliima (revision 4160781)
93
+ Korea (revision 4329396)
94
+ Kroom (revision 4030460)
95
+ Kroonlehed (revision 3543291)
96
+ Kuusepüü (revision 4028988)
97
+ Kvertsetiin (revision 4448461)
98
+ Laanemets (revision 4001157)
99
+ Laanepüü (revision 4475093)
100
+ Laiuskraad (revision 3990366)
101
+ Leesikas (revision 4420533)
102
+ Lehed (revision 4471821)
103
+ Leheroots (revision 3595351)
104
+ Liik (bioloogia) (revision 4320981)
105
+ Liiv (revision 4399494)
106
+ Liivakivi (revision 4330598)
107
+ Linnaeus (revision 4276836)
108
+ Linnud (revision 4479668)
109
+
110
+ == End of Parsed pages ==
111
+
112
+ - Wikipedia parsing ended at: 2016-09-26 23:47:54.476445
113
+
114
+ 55 characters appeared 433559 times.
115
+
116
+ First 33 characters:
117
+ [ 0] Char a: 12.486881831538499 %
118
+ [ 1] Char i: 10.26503889897338 %
119
+ [ 2] Char e: 10.177622884082673 %
120
+ [ 3] Char s: 8.710233209320991 %
121
+ [ 4] Char t: 6.56634967789851 %
122
+ [ 5] Char l: 6.051540851418146 %
123
+ [ 6] Char u: 5.423944607308348 %
124
+ [ 7] Char n: 5.131020230233947 %
125
+ [ 8] Char k: 4.663033174262327 %
126
+ [ 9] Char o: 4.526950195936424 %
127
+ [10] Char d: 4.167368224393911 %
128
+ [11] Char r: 3.6740097656835635 %
129
+ [12] Char m: 3.552688330769284 %
130
+ [13] Char v: 2.4700213811730354 %
131
+ [14] Char p: 1.9229216784797456 %
132
+ [15] Char g: 1.865259399528092 %
133
+ [16] Char h: 1.8043680329551455 %
134
+ [17] Char j: 1.6860450365463524 %
135
+ [18] Char ä: 1.0247740215287884 %
136
+ [19] Char b: 0.9255949017319443 %
137
+ [20] Char õ: 0.9246723052687178 %
138
+ [21] Char ü: 0.6536595941959457 %
139
+ [22] Char f: 0.37342091849090897 %
140
+ [23] Char c: 0.34851081398379463 %
141
+ [24] Char ö: 0.24333481717597835 %
142
+ [25] Char y: 0.1287022066200909 %
143
+ [26] Char x: 0.06781084004714467 %
144
+ [27] Char w: 0.04082489349777078 %
145
+ [28] Char q: 0.020989069538401926 %
146
+ [29] Char š: 0.018913227496142396 %
147
+ [30] Char z: 0.017529332801302706 %
148
+ [31] Char ō: 0.010379210211297655 %
149
+ [32] Char ž: 0.009687262863877812 %
150
+
151
+ The first 33 characters have an accumulated ratio of 0.9995410082595447.
152
+
153
+ 853 sequences found.
154
+
155
+ First 512 (typical positive ratio): 0.9972721312183132
156
+ Next 512 (512-1024): 9.687262863877811e-05
157
+ Rest: -5.204170427930421e-18
158
+
159
+ - Processing end: 2016-09-26 23:47:54.561846
@@ -0,0 +1,156 @@
1
+ = Logs of language model for Finnish (fi) =
2
+
3
+ - Generated by BuildLangModel.py
4
+ - Started: 2016-09-21 18:12:24.181917
5
+ - Maximum depth: 5
6
+ - Max number of pages: 100
7
+
8
+ == Parsed pages ==
9
+
10
+ Yhdistynyt kuningaskunta (revision 15843357)
11
+ 1. toukokuuta (revision 15910178)
12
+ 1700-luku (revision 15493702)
13
+ 1707 (revision 15106709)
14
+ 1800-luku (revision 15708929)
15
+ 2014 (revision 15891601)
16
+ 409 (revision 12809782)
17
+ 5. marraskuuta (revision 15421719)
18
+ 927 (revision 12785964)
19
+ Aasia (revision 15948161)
20
+ Abhasia (revision 15730328)
21
+ Adolf Hitler (revision 15951829)
22
+ Afrikka (revision 15934209)
23
+ Agatha Christie (revision 15760740)
24
+ Aikavyöhyke (revision 15800313)
25
+ Ajoneuvon kansallisuustunnus (revision 15897445)
26
+ Akrotiri ja Dhekelia (revision 14625383)
27
+ Alamaat (revision 15913741)
28
+ Alan Turing (revision 15904871)
29
+ Alankomaat (revision 15936643)
30
+ Albania (revision 15767604)
31
+ Alec Guinness (revision 15363805)
32
+ Alexander Fleming (revision 15023225)
33
+ Alfred Hitchcock (revision 15892843)
34
+ Alfred Tennyson (revision 15856114)
35
+ Allen Jones (revision 12871703)
36
+ Andorra (revision 15913862)
37
+ Andrew Lloyd Webber (revision 14978349)
38
+ Anglit (revision 15902350)
39
+ Anguilla (revision 15854041)
40
+ Anne Brontë (revision 14287992)
41
+ Anthony Eden (revision 14391831)
42
+ Antigua ja Barbuda (revision 15196967)
43
+ Arabian Lawrence (revision 15736417)
44
+ Argentiina (revision 15676474)
45
+ Armenia (revision 15634470)
46
+ Arthur Conan Doyle (revision 15402837)
47
+ Arts and Crafts (revision 15806930)
48
+ Aurinko (revision 15934252)
49
+ Australia (revision 15934255)
50
+ Avara luonto (revision 15815943)
51
+ Azerbaidžan (revision 15946891)
52
+ BBC (revision 15866026)
53
+ BKT (revision 15656549)
54
+ Bahama (revision 15516869)
55
+ Bangladesh (revision 15883994)
56
+ Bank of England (revision 14481173)
57
+ Barbados (revision 15839821)
58
+ Barbara Hepworth (revision 15106880)
59
+ Bath (revision 15869900)
60
+ Beatrix Potter (revision 15057380)
61
+ Belfast (revision 15715934)
62
+ Belgia (revision 15932391)
63
+ Belize (revision 15665086)
64
+ Ben Nevis (revision 15610196)
65
+ Bengalin kieli (revision 15551820)
66
+ Benjamin Britten (revision 15081615)
67
+ Bermuda (revision 15632621)
68
+ Bertrand Russell (revision 14631969)
69
+ Bhutan (revision 15377394)
70
+ Big Ben (revision 14897401)
71
+ Big Brother (revision 14641391)
72
+ Birmingham (revision 15855259)
73
+ Black Sabbath (revision 15839917)
74
+ Bosnia ja Hertsegovina (revision 15934266)
75
+ Botswana (revision 15524955)
76
+ Bristol (revision 15891889)
77
+ Bristolin kanaali (revision 15849713)
78
+ Bristolin kansainvälinen lentoasema (revision 14452870)
79
+ Britannia (provinssi) (revision 14557442)
80
+ Britannian avoin golfturnaus (revision 14293265)
81
+ Britannian kuninkaallinen perhe (revision 15522149)
82
+ Britannian talous (revision 15470242)
83
+ Britannian väestö (revision 15661241)
84
+ Brittein saaret (revision 15805422)
85
+ Brittiläinen Antarktiksen alue (revision 15836227)
86
+ Brittiläinen Intia (revision 15593126)
87
+ Brittiläinen Intian valtameren alue (revision 14272903)
88
+ Brittiläinen imperiumi (revision 15906600)
89
+ Brittiläinen kansainyhteisö (revision 15894379)
90
+ Brittiläinen keittiö (revision 13393533)
91
+ Brittiläinen kulttuuri (revision 15951407)
92
+ Brittiläiset Neitsytsaaret (revision 15910520)
93
+ Brittiläiset merentakaiset alueet (revision 15836213)
94
+ Brunei (revision 15580824)
95
+ Bruttokansantuote (revision 15656549)
96
+ Bulgaria (revision 15944101)
97
+ Burma (revision 15627218)
98
+ Cambridge (revision 14641664)
99
+ Cambridgen yliopisto (revision 15493340)
100
+ Canterburyn tarinoita (revision 15232140)
101
+ Cardiff (revision 15840398)
102
+ Caymansaaret (revision 15914575)
103
+ Channel 4 (revision 15882475)
104
+ Charles Babbage (revision 15203616)
105
+ Charles Chaplin (revision 15674652)
106
+ Charles Darwin (revision 15894085)
107
+ Charles Dickens (revision 15699592)
108
+ Charles Dickensin joulutarina (revision 15116247)
109
+
110
+ == End of Parsed pages ==
111
+
112
+ - Wikipedia parsing ended at: 2016-09-21 18:15:05.189221
113
+
114
+ 61 characters appeared 940364 times.
115
+
116
+ First 30 characters:
117
+ [ 0] Char a: 12.508773198463574 %
118
+ [ 1] Char i: 10.969475649854738 %
119
+ [ 2] Char n: 8.815841525196626 %
120
+ [ 3] Char t: 8.80169806585535 %
121
+ [ 4] Char e: 7.8206949649284745 %
122
+ [ 5] Char s: 7.595782058862313 %
123
+ [ 6] Char l: 5.963541777439374 %
124
+ [ 7] Char o: 5.439808414613916 %
125
+ [ 8] Char u: 5.0102938861972595 %
126
+ [ 9] Char k: 4.589712068943515 %
127
+ [10] Char r: 3.1231523112326713 %
128
+ [11] Char ä: 3.041800834570443 %
129
+ [12] Char m: 3.0392486313810396 %
130
+ [13] Char v: 2.156292669647073 %
131
+ [14] Char h: 1.996141919512019 %
132
+ [15] Char j: 1.9248929138078446 %
133
+ [16] Char p: 1.6324529650220552 %
134
+ [17] Char y: 1.6323466232224966 %
135
+ [18] Char d: 1.1981530556252684 %
136
+ [19] Char b: 0.6835650875618378 %
137
+ [20] Char g: 0.5793501239945382 %
138
+ [21] Char c: 0.5056552569005194 %
139
+ [22] Char ö: 0.38931732818355447 %
140
+ [23] Char f: 0.215023118707224 %
141
+ [24] Char w: 0.2106631049253268 %
142
+ [25] Char z: 0.06593191572625068 %
143
+ [26] Char x: 0.024458613898447838 %
144
+ [27] Char š: 0.010421496356729947 %
145
+ [28] Char ž: 0.007869293167326695 %
146
+ [29] Char q: 0.007762951367768225 %
147
+
148
+ The first 30 characters have an accumulated ratio of 0.9996012182516557.
149
+
150
+ 919 sequences found.
151
+
152
+ First 512 (typical positive ratio): 0.9985378147555799
153
+ Next 512 (512-1024): 1.0634179955846884e-06
154
+ Rest: 3.881443777498106e-17
155
+
156
+ - Processing end: 2016-09-21 18:15:05.307164