cchardet 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (317) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.gitmodules +3 -0
  4. data/.rubocop.yml +11 -0
  5. data/CHANGELOG.md +5 -0
  6. data/Gemfile +10 -0
  7. data/README.md +35 -0
  8. data/Rakefile +15 -0
  9. data/cchardet.gemspec +30 -0
  10. data/ext/cchardet/extconf.rb +26 -0
  11. data/ext/uchardet/.gitignore +1 -0
  12. data/ext/uchardet/.gitlab-ci.yml +106 -0
  13. data/ext/uchardet/AUTHORS +16 -0
  14. data/ext/uchardet/CMakeLists.txt +74 -0
  15. data/ext/uchardet/COPYING +1316 -0
  16. data/ext/uchardet/INSTALL +26 -0
  17. data/ext/uchardet/README.md +295 -0
  18. data/ext/uchardet/build-mac/uchardet.cpp +7 -0
  19. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
  20. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  21. data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
  22. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
  23. data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
  24. data/ext/uchardet/doc/CMakeLists.txt +6 -0
  25. data/ext/uchardet/doc/README.maintainer +59 -0
  26. data/ext/uchardet/doc/uchardet.1 +18 -0
  27. data/ext/uchardet/script/BuildLangModel.py +533 -0
  28. data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
  29. data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
  30. data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
  31. data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
  32. data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
  33. data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
  34. data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
  35. data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
  36. data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
  37. data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
  38. data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
  39. data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
  40. data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
  41. data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
  42. data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
  43. data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
  44. data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
  45. data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
  46. data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
  47. data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
  48. data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
  49. data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
  50. data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
  51. data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
  52. data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
  53. data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
  54. data/ext/uchardet/script/README +63 -0
  55. data/ext/uchardet/script/charsets/codepoints.py +53 -0
  56. data/ext/uchardet/script/charsets/db.py +73 -0
  57. data/ext/uchardet/script/charsets/ibm852.py +72 -0
  58. data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
  59. data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
  60. data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
  61. data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
  62. data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
  63. data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
  64. data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
  65. data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
  66. data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
  67. data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
  68. data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
  69. data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
  70. data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
  71. data/ext/uchardet/script/charsets/tis-620.py +77 -0
  72. data/ext/uchardet/script/charsets/viscii.py +72 -0
  73. data/ext/uchardet/script/charsets/windows-1250.py +75 -0
  74. data/ext/uchardet/script/charsets/windows-1252.py +76 -0
  75. data/ext/uchardet/script/charsets/windows-1253.py +72 -0
  76. data/ext/uchardet/script/charsets/windows-1256.py +75 -0
  77. data/ext/uchardet/script/charsets/windows-1257.py +72 -0
  78. data/ext/uchardet/script/charsets/windows-1258.py +72 -0
  79. data/ext/uchardet/script/debug.sh +9 -0
  80. data/ext/uchardet/script/header-template.cpp +38 -0
  81. data/ext/uchardet/script/langs/ar.py +59 -0
  82. data/ext/uchardet/script/langs/cs.py +80 -0
  83. data/ext/uchardet/script/langs/da.py +69 -0
  84. data/ext/uchardet/script/langs/de.py +69 -0
  85. data/ext/uchardet/script/langs/el.py +55 -0
  86. data/ext/uchardet/script/langs/eo.py +67 -0
  87. data/ext/uchardet/script/langs/es.py +69 -0
  88. data/ext/uchardet/script/langs/et.py +57 -0
  89. data/ext/uchardet/script/langs/fi.py +60 -0
  90. data/ext/uchardet/script/langs/fr.py +79 -0
  91. data/ext/uchardet/script/langs/ga.py +60 -0
  92. data/ext/uchardet/script/langs/hr.py +59 -0
  93. data/ext/uchardet/script/langs/hu.py +66 -0
  94. data/ext/uchardet/script/langs/it.py +56 -0
  95. data/ext/uchardet/script/langs/lt.py +70 -0
  96. data/ext/uchardet/script/langs/lv.py +69 -0
  97. data/ext/uchardet/script/langs/mt.py +80 -0
  98. data/ext/uchardet/script/langs/pl.py +81 -0
  99. data/ext/uchardet/script/langs/pt.py +80 -0
  100. data/ext/uchardet/script/langs/ro.py +65 -0
  101. data/ext/uchardet/script/langs/sk.py +80 -0
  102. data/ext/uchardet/script/langs/sl.py +59 -0
  103. data/ext/uchardet/script/langs/sv.py +56 -0
  104. data/ext/uchardet/script/langs/th.py +55 -0
  105. data/ext/uchardet/script/langs/tr.py +67 -0
  106. data/ext/uchardet/script/langs/vi.py +64 -0
  107. data/ext/uchardet/script/release.sh +8 -0
  108. data/ext/uchardet/script/win32.sh +7 -0
  109. data/ext/uchardet/src/Big5Freq.tab +943 -0
  110. data/ext/uchardet/src/CMakeLists.txt +160 -0
  111. data/ext/uchardet/src/CharDistribution.cpp +109 -0
  112. data/ext/uchardet/src/CharDistribution.h +242 -0
  113. data/ext/uchardet/src/EUCKRFreq.tab +614 -0
  114. data/ext/uchardet/src/EUCTWFreq.tab +447 -0
  115. data/ext/uchardet/src/GB2312Freq.tab +491 -0
  116. data/ext/uchardet/src/JISFreq.tab +589 -0
  117. data/ext/uchardet/src/JpCntx.cpp +230 -0
  118. data/ext/uchardet/src/JpCntx.h +140 -0
  119. data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
  120. data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
  121. data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
  122. data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
  123. data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
  124. data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
  125. data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
  126. data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
  127. data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
  128. data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
  129. data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
  130. data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
  131. data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
  132. data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
  133. data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
  134. data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
  135. data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
  136. data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
  137. data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
  138. data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
  139. data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
  140. data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
  141. data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
  142. data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
  143. data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
  144. data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
  145. data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
  146. data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
  147. data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
  148. data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
  149. data/ext/uchardet/src/nsBig5Prober.h +75 -0
  150. data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
  151. data/ext/uchardet/src/nsCharSetProber.h +77 -0
  152. data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
  153. data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
  154. data/ext/uchardet/src/nsEUCJPProber.h +79 -0
  155. data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
  156. data/ext/uchardet/src/nsEUCKRProber.h +81 -0
  157. data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
  158. data/ext/uchardet/src/nsEUCTWProber.h +75 -0
  159. data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
  160. data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
  161. data/ext/uchardet/src/nsEscSM.cpp +267 -0
  162. data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
  163. data/ext/uchardet/src/nsGB2312Prober.h +77 -0
  164. data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
  165. data/ext/uchardet/src/nsHebrewProber.h +177 -0
  166. data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
  167. data/ext/uchardet/src/nsLatin1Prober.h +73 -0
  168. data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
  169. data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
  170. data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
  171. data/ext/uchardet/src/nsPkgInt.h +89 -0
  172. data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
  173. data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
  174. data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
  175. data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
  176. data/ext/uchardet/src/nsSJISProber.cpp +98 -0
  177. data/ext/uchardet/src/nsSJISProber.h +81 -0
  178. data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
  179. data/ext/uchardet/src/nsUTF8Prober.h +66 -0
  180. data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
  181. data/ext/uchardet/src/nsUniversalDetector.h +91 -0
  182. data/ext/uchardet/src/nscore.h +59 -0
  183. data/ext/uchardet/src/prmem.h +49 -0
  184. data/ext/uchardet/src/symbols.cmake +41 -0
  185. data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
  186. data/ext/uchardet/src/tools/uchardet.cpp +254 -0
  187. data/ext/uchardet/src/uchardet.cpp +274 -0
  188. data/ext/uchardet/src/uchardet.h +136 -0
  189. data/ext/uchardet/test/CMakeLists.txt +47 -0
  190. data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
  191. data/ext/uchardet/test/ar/utf-8.txt +3 -0
  192. data/ext/uchardet/test/ar/windows-1256.txt +3 -0
  193. data/ext/uchardet/test/bg/windows-1251.txt +3 -0
  194. data/ext/uchardet/test/cs/ibm852.txt +4 -0
  195. data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
  196. data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
  197. data/ext/uchardet/test/cs/utf-8.txt +4 -0
  198. data/ext/uchardet/test/cs/windows-1250.txt +4 -0
  199. data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
  200. data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
  201. data/ext/uchardet/test/da/utf-8.txt +10 -0
  202. data/ext/uchardet/test/da/windows-1252.txt +10 -0
  203. data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
  204. data/ext/uchardet/test/de/windows-1252.txt +11 -0
  205. data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
  206. data/ext/uchardet/test/el/utf-8.txt +3 -0
  207. data/ext/uchardet/test/el/windows-1253.txt +5 -0
  208. data/ext/uchardet/test/en/ascii.txt +4 -0
  209. data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
  210. data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
  211. data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
  212. data/ext/uchardet/test/es/utf-8.txt +5 -0
  213. data/ext/uchardet/test/es/windows-1252.txt +5 -0
  214. data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
  215. data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
  216. data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
  217. data/ext/uchardet/test/et/utf-8.txt +6 -0
  218. data/ext/uchardet/test/et/windows-1252.txt +6 -0
  219. data/ext/uchardet/test/et/windows-1257.txt +6 -0
  220. data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
  221. data/ext/uchardet/test/fi/utf-8.txt +8 -0
  222. data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
  223. data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
  224. data/ext/uchardet/test/fr/utf-16.be +0 -0
  225. data/ext/uchardet/test/fr/utf-32.le +0 -0
  226. data/ext/uchardet/test/fr/utf-8.txt +14 -0
  227. data/ext/uchardet/test/fr/windows-1252.txt +3 -0
  228. data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
  229. data/ext/uchardet/test/ga/utf-8.txt +6 -0
  230. data/ext/uchardet/test/ga/windows-1252.txt +6 -0
  231. data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
  232. data/ext/uchardet/test/he/utf-8.txt +3 -0
  233. data/ext/uchardet/test/he/windows-1255.txt +1 -0
  234. data/ext/uchardet/test/hr/ibm852.txt +4 -0
  235. data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
  236. data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
  237. data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
  238. data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
  239. data/ext/uchardet/test/hr/utf-8.txt +4 -0
  240. data/ext/uchardet/test/hr/windows-1250.txt +4 -0
  241. data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
  242. data/ext/uchardet/test/hu/windows-1250.txt +1 -0
  243. data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
  244. data/ext/uchardet/test/it/utf-8.txt +18 -0
  245. data/ext/uchardet/test/ja/euc-jp.txt +10 -0
  246. data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
  247. data/ext/uchardet/test/ja/shift_jis.txt +1 -0
  248. data/ext/uchardet/test/ja/utf-16be.txt +0 -0
  249. data/ext/uchardet/test/ja/utf-16le.txt +0 -0
  250. data/ext/uchardet/test/ja/utf-8.txt +9 -0
  251. data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
  252. data/ext/uchardet/test/ko/uhc.smi +16 -0
  253. data/ext/uchardet/test/ko/utf-16.le +0 -0
  254. data/ext/uchardet/test/ko/utf-32.be +0 -0
  255. data/ext/uchardet/test/ko/utf-8.txt +3 -0
  256. data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
  257. data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
  258. data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
  259. data/ext/uchardet/test/lt/utf-8.txt +3 -0
  260. data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
  261. data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
  262. data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
  263. data/ext/uchardet/test/lv/utf-8.txt +6 -0
  264. data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
  265. data/ext/uchardet/test/mt/utf-8.txt +4 -0
  266. data/ext/uchardet/test/pl/ibm852.txt +3 -0
  267. data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
  268. data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
  269. data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
  270. data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
  271. data/ext/uchardet/test/pl/utf-8.txt +3 -0
  272. data/ext/uchardet/test/pl/windows-1250.txt +3 -0
  273. data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
  274. data/ext/uchardet/test/pt/utf-8.txt +6 -0
  275. data/ext/uchardet/test/ro/ibm852.txt +9 -0
  276. data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
  277. data/ext/uchardet/test/ro/utf-8.txt +9 -0
  278. data/ext/uchardet/test/ro/windows-1250.txt +9 -0
  279. data/ext/uchardet/test/ru/ibm855.txt +5 -0
  280. data/ext/uchardet/test/ru/ibm866.txt +11 -0
  281. data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
  282. data/ext/uchardet/test/ru/koi8-r.txt +1 -0
  283. data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
  284. data/ext/uchardet/test/ru/windows-1251.txt +4 -0
  285. data/ext/uchardet/test/sk/ibm852.txt +3 -0
  286. data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
  287. data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
  288. data/ext/uchardet/test/sk/utf-8.txt +3 -0
  289. data/ext/uchardet/test/sk/windows-1250.txt +3 -0
  290. data/ext/uchardet/test/sl/ibm852.txt +9 -0
  291. data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
  292. data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
  293. data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
  294. data/ext/uchardet/test/sl/utf-8.txt +9 -0
  295. data/ext/uchardet/test/sl/windows-1250.txt +9 -0
  296. data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
  297. data/ext/uchardet/test/sv/utf-8.txt +10 -0
  298. data/ext/uchardet/test/sv/windows-1252.txt +10 -0
  299. data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
  300. data/ext/uchardet/test/th/tis-620.txt +5 -0
  301. data/ext/uchardet/test/th/utf-8.txt +1 -0
  302. data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
  303. data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
  304. data/ext/uchardet/test/uchardet-tests.c +130 -0
  305. data/ext/uchardet/test/vi/utf-8.txt +4 -0
  306. data/ext/uchardet/test/vi/viscii.txt +4 -0
  307. data/ext/uchardet/test/vi/windows-1258.txt +4 -0
  308. data/ext/uchardet/test/zh/big5.txt +1 -0
  309. data/ext/uchardet/test/zh/euc-tw.txt +1 -0
  310. data/ext/uchardet/test/zh/gb18030.txt +1 -0
  311. data/ext/uchardet/test/zh/utf-8.txt +1 -0
  312. data/ext/uchardet/uchardet.doap +51 -0
  313. data/ext/uchardet/uchardet.pc.in +10 -0
  314. data/lib/cchardet.rb +56 -0
  315. data/lib/cchardet/lib_finder.rb +32 -0
  316. data/lib/cchardet/version.rb +5 -0
  317. metadata +362 -0
@@ -0,0 +1,274 @@
1
+ /* ***** BEGIN LICENSE BLOCK *****
2
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3
+ *
4
+ * The contents of this file are subject to the Mozilla Public License Version
5
+ * 1.1 (the "License"); you may not use this file except in compliance with
6
+ * the License. You may obtain a copy of the License at
7
+ * http://www.mozilla.org/MPL/
8
+ *
9
+ * Software distributed under the License is distributed on an "AS IS" basis,
10
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11
+ * for the specific language governing rights and limitations under the
12
+ * License.
13
+ *
14
+ * The Original Code is Mozilla Universal charset detector code.
15
+ *
16
+ * The Initial Developer of the Original Code is
17
+ * Netscape Communications Corporation.
18
+ * Portions created by the Initial Developer are Copyright (C) 2001
19
+ * the Initial Developer. All Rights Reserved.
20
+ *
21
+ * Contributor(s):
22
+ * BYVoid <byvoid.kcp@gmail.com>
23
+ *
24
+ * Alternatively, the contents of this file may be used under the terms of
25
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
26
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
+ * in which case the provisions of the GPL or the LGPL are applicable instead
28
+ * of those above. If you wish to allow use of your version of this file only
29
+ * under the terms of either the GPL or the LGPL, and not to allow others to
30
+ * use your version of this file under the terms of the MPL, indicate your
31
+ * decision by deleting the provisions above and replace them with the notice
32
+ * and other provisions required by the GPL or the LGPL. If you do not delete
33
+ * the provisions above, a recipient may use your version of this file under
34
+ * the terms of any one of the MPL, the GPL or the LGPL.
35
+ *
36
+ * ***** END LICENSE BLOCK ***** */
37
+ #include "uchardet.h"
38
+ #include <string.h>
39
+ #include <stdlib.h>
40
+ #include <map>
41
+ #include <string>
42
+ #include <vector>
43
+ #include "nscore.h"
44
+ #include "nsUniversalDetector.h"
45
+
46
/* One detection result kept by HandleUniversalDetector. */
typedef struct _UChardetCandidate
{
    /* Name of the detected encoding. */
    char  *encoding;
    /* Detected language; NULL when no language was reported. */
    char  *language;
    /* Confidence score attached to this candidate. */
    float  confidence;
} UChardetCandidate;
52
+
53
+ class HandleUniversalDetector : public nsUniversalDetector
54
+ {
55
+ protected:
56
+ std::vector<UChardetCandidate> candidates;
57
+ std::vector<UChardetCandidate> weighed_candidates;
58
+ std::map<std::string, float> weights;
59
+ float default_weight;
60
+
61
+ public:
62
+ HandleUniversalDetector()
63
+ : nsUniversalDetector(NS_FILTER_ALL), default_weight(1.0)
64
+ {
65
+ }
66
+
67
+ virtual ~HandleUniversalDetector()
68
+ {
69
+ Reset();
70
+ }
71
+
72
+ virtual void Report(const char *encoding,
73
+ const char *language,
74
+ float confidence)
75
+ {
76
+ std::vector<UChardetCandidate>::iterator it;
77
+ UChardetCandidate candidate;
78
+
79
+ for (it = candidates.begin(); it != candidates.end(); it++)
80
+ {
81
+ if (strcmp(it->encoding, encoding) == 0 &&
82
+ it->language && language && strcmp(it->language, language) == 0)
83
+ {
84
+ /* Already reported. Bail out or update the confidence
85
+ * when needed.
86
+ */
87
+ if (confidence > it->confidence)
88
+ {
89
+ candidates.erase(it);
90
+ break;
91
+ }
92
+ else
93
+ {
94
+ return;
95
+ }
96
+ }
97
+ }
98
+
99
+ candidate = UChardetCandidate();
100
+ candidate.encoding = strdup(encoding);
101
+ candidate.language = language ? strdup(language) : NULL;
102
+ candidate.confidence = confidence;
103
+
104
+ for (it = candidates.begin(); it != candidates.end(); it++)
105
+ {
106
+ if (it->confidence < confidence)
107
+ break;
108
+ }
109
+ candidates.insert(it, candidate);
110
+
111
+ if (weights.size() > 0)
112
+ WeighCandidates();
113
+ }
114
+
115
+ virtual void Reset()
116
+ {
117
+ std::vector<UChardetCandidate>::iterator it;
118
+
119
+ nsUniversalDetector::Reset();
120
+ for (it = candidates.begin(); it != candidates.end(); it++)
121
+ {
122
+ free(it->encoding);
123
+ if (it->language)
124
+ free(it->language);
125
+ }
126
+ candidates.clear();
127
+ }
128
+
129
+ size_t GetCandidates() const
130
+ {
131
+ return candidates.size();
132
+ }
133
+
134
+ const char* GetCharset(size_t i)
135
+ {
136
+ if (weights.size() > 0)
137
+ return (weighed_candidates.size() > i) ? weighed_candidates[i].encoding : "";
138
+ return (candidates.size() > i) ? candidates[i].encoding : "";
139
+ }
140
+
141
+ float GetConfidence(size_t i)
142
+ {
143
+ if (weights.size() > 0)
144
+ return (weighed_candidates.size() > i) ? weighed_candidates[i].confidence : 0.0;
145
+ return (candidates.size() > i) ? candidates[i].confidence : 0.0;
146
+ }
147
+
148
+ const char* GetLanguage(size_t i)
149
+ {
150
+ if (weights.size() > 0)
151
+ return (weighed_candidates.size() > i) ? weighed_candidates[i].language : NULL;
152
+ return (candidates.size() > i) ? candidates[i].language : NULL;
153
+ }
154
+
155
+ void WeighLanguage(const char *language,
156
+ float weight)
157
+ {
158
+ weights[language] = weight;
159
+ WeighCandidates();
160
+ }
161
+
162
+ void WeighDefault(float weight)
163
+ {
164
+ default_weight = weight;
165
+ WeighCandidates();
166
+ }
167
+
168
+ private:
169
+
170
+ void WeighCandidates()
171
+ {
172
+ std::vector<UChardetCandidate>::iterator it;
173
+ std::vector<UChardetCandidate>::iterator it2;
174
+ UChardetCandidate candidate;
175
+
176
+ weighed_candidates.clear();
177
+ for (it = candidates.begin(); it != candidates.end(); it++)
178
+ {
179
+ std::map<std::string, float>::iterator weight_it;
180
+ float confidence;
181
+
182
+ confidence = it->confidence * default_weight;
183
+ if (it->language)
184
+ {
185
+ weight_it = weights.find(it->language);
186
+ if (weight_it != weights.end())
187
+ confidence = weight_it->second * it->confidence;
188
+ }
189
+
190
+ candidate = UChardetCandidate();
191
+ candidate.encoding = it->encoding;
192
+ candidate.language = it->language;
193
+ candidate.confidence = confidence;
194
+
195
+ for (it2 = weighed_candidates.begin(); it2 != weighed_candidates.end(); it2++)
196
+ {
197
+ if (it2->confidence < confidence)
198
+ break;
199
+ }
200
+ weighed_candidates.insert(it2, candidate);
201
+ }
202
+ }
203
+ };
204
+
205
+ uchardet_t uchardet_new(void)
206
+ {
207
+ return reinterpret_cast<uchardet_t> (new HandleUniversalDetector());
208
+ }
209
+
210
+ void uchardet_delete(uchardet_t ud)
211
+ {
212
+ delete reinterpret_cast<HandleUniversalDetector*>(ud);
213
+ }
214
+
215
+ int uchardet_handle_data(uchardet_t ud, const char * data, size_t len)
216
+ {
217
+ nsresult ret = NS_OK;
218
+
219
+ if (len > 0)
220
+ ret = reinterpret_cast<HandleUniversalDetector*>(ud)->HandleData(data, (PRUint32)len);
221
+
222
+ return (ret != NS_OK);
223
+ }
224
+
225
+ void uchardet_data_end(uchardet_t ud)
226
+ {
227
+ reinterpret_cast<HandleUniversalDetector*>(ud)->DataEnd();
228
+ }
229
+
230
+ void uchardet_reset(uchardet_t ud)
231
+ {
232
+ reinterpret_cast<HandleUniversalDetector*>(ud)->Reset();
233
+ }
234
+
235
+ const char* uchardet_get_charset(uchardet_t ud)
236
+ {
237
+ return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset(0);
238
+ }
239
+
240
+ size_t uchardet_get_candidates (uchardet_t ud)
241
+ {
242
+ return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCandidates();
243
+ }
244
+
245
+ float uchardet_get_confidence (uchardet_t ud,
246
+ size_t candidate)
247
+ {
248
+ return reinterpret_cast<HandleUniversalDetector*>(ud)->GetConfidence(candidate);
249
+ }
250
+
251
+ const char * uchardet_get_encoding (uchardet_t ud,
252
+ size_t candidate)
253
+ {
254
+ return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset(candidate);
255
+ }
256
+
257
+ const char * uchardet_get_language (uchardet_t ud,
258
+ size_t candidate)
259
+ {
260
+ return reinterpret_cast<HandleUniversalDetector*>(ud)->GetLanguage(candidate);
261
+ }
262
+
263
+ void uchardet_weigh_language (uchardet_t ud,
264
+ const char *language,
265
+ float weight)
266
+ {
267
+ reinterpret_cast<HandleUniversalDetector*>(ud)->WeighLanguage(language, weight);
268
+ }
269
+
270
+ void uchardet_set_default_weight (uchardet_t ud,
271
+ float weight)
272
+ {
273
+ reinterpret_cast<HandleUniversalDetector*>(ud)->WeighDefault(weight);
274
+ }
@@ -0,0 +1,136 @@
1
+ /* ***** BEGIN LICENSE BLOCK *****
2
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3
+ *
4
+ * The contents of this file are subject to the Mozilla Public License Version
5
+ * 1.1 (the "License"); you may not use this file except in compliance with
6
+ * the License. You may obtain a copy of the License at
7
+ * http://www.mozilla.org/MPL/
8
+ *
9
+ * Software distributed under the License is distributed on an "AS IS" basis,
10
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11
+ * for the specific language governing rights and limitations under the
12
+ * License.
13
+ *
14
+ * The Original Code is Mozilla Universal charset detector code.
15
+ *
16
+ * The Initial Developer of the Original Code is
17
+ * Netscape Communications Corporation.
18
+ * Portions created by the Initial Developer are Copyright (C) 2001
19
+ * the Initial Developer. All Rights Reserved.
20
+ *
21
+ * Contributor(s):
22
+ * BYVoid <byvoid.kcp@gmail.com>
23
+ * Jehan <jehan at girinstud.io>
24
+ *
25
+ * Alternatively, the contents of this file may be used under the terms of
26
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
27
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28
+ * in which case the provisions of the GPL or the LGPL are applicable instead
29
+ * of those above. If you wish to allow use of your version of this file only
30
+ * under the terms of either the GPL or the LGPL, and not to allow others to
31
+ * use your version of this file under the terms of the MPL, indicate your
32
+ * decision by deleting the provisions above and replace them with the notice
33
+ * and other provisions required by the GPL or the LGPL. If you do not delete
34
+ * the provisions above, a recipient may use your version of this file under
35
+ * the terms of any one of the MPL, the GPL or the LGPL.
36
+ *
37
+ * ***** END LICENSE BLOCK ***** */
38
+ #ifndef UCHARDET_H___
39
+ #define UCHARDET_H___
40
+
41
+ #ifdef __cplusplus
42
+ extern "C" {
43
+ #endif
44
+
45
+ #include <stddef.h>
46
+
47
/*
 * Symbol import/export decoration. It is only non-empty for a shared
 * build (UCHARDET_SHARED) on Windows or Cygwin: dllexport while building
 * the library itself (BUILDING_UCHARDET), dllimport for its users.
 */
#if !defined(UCHARDET_SHARED) || !(defined(_WIN32) || defined(__CYGWIN__))
#define UCHARDET_INTERFACE
#elif defined(BUILDING_UCHARDET)
#define UCHARDET_INTERFACE __declspec(dllexport)
#else
#define UCHARDET_INTERFACE __declspec(dllimport)
#endif
56
+
57
/*
 * DEPRECATED(message): marks the declaration that follows as deprecated.
 * The message is only forwarded where the chosen syntax accepts one.
 */
#if defined(__cplusplus) && (__cplusplus >= 201402L)
#define DEPRECATED(message) [[deprecated(message)]]
#elif defined(__GNUC__) || defined(__clang__)
#define DEPRECATED(message) __attribute__ ((deprecated))
#elif defined(_MSC_VER)
/* The attribute must not be followed by any extra token: the macro is
 * placed directly in front of a complete declaration. */
#define DEPRECATED(message) __declspec(deprecated(message))
#else
/* Unknown compiler: expand to nothing. No #warning here, since it is
 * not a standard directive before C23/C++23 and this branch is only
 * reached by compilers we know nothing about. */
#define DEPRECATED(message)
#endif
67
+
68
+ /**
69
+ * A handle for a uchardet encoding detector.
70
+ */
71
+ typedef struct uchardet * uchardet_t;
72
+
73
+ /**
74
+ * Create an encoding detector.
75
+ * @return an instance of uchardet_t.
76
+ */
77
+ UCHARDET_INTERFACE uchardet_t uchardet_new(void);
78
+
79
+ /**
80
+ * Delete an encoding detector.
81
+ * @param ud [in] the uchardet_t handle to delete.
82
+ */
83
+ UCHARDET_INTERFACE void uchardet_delete(uchardet_t ud);
84
+
85
+ /**
86
+ * Feed data to an encoding detector.
87
+ * The detector is able to shortcut processing when it reaches certainty
88
+ * for an encoding, so you should not worry about limiting input data.
89
+ * As far as you should be concerned: the more the better.
90
+ *
91
+ * @param ud [in] handle of an instance of uchardet
92
+ * @param data [in] data
93
+ * @param len [in] number of bytes of data
94
+ * @return non-zero number on failure.
95
+ */
96
+ UCHARDET_INTERFACE int uchardet_handle_data(uchardet_t ud, const char * data, size_t len);
97
+
98
+ /**
99
+ * Notify an end of data to an encoding detector.
100
+ * @param ud [in] handle of an instance of uchardet
101
+ */
102
+ UCHARDET_INTERFACE void uchardet_data_end(uchardet_t ud);
103
+
104
+ /**
105
+ * Reset an encoding detector.
106
+ * @param ud [in] handle of an instance of uchardet
107
+ */
108
+ UCHARDET_INTERFACE void uchardet_reset(uchardet_t ud);
109
+
110
+ /**
111
+ * Get an iconv-compatible name of the encoding that was detected.
112
+ * @param ud [in] handle of an instance of uchardet
113
+ * @return name of charset on success and "" on failure.
114
+ */
115
+ DEPRECATED("use uchardet_get_candidates() and uchardet_get_encoding() instead (since 0.1.0)")
116
+ UCHARDET_INTERFACE const char * uchardet_get_charset(uchardet_t ud);
117
+
118
+ UCHARDET_INTERFACE size_t uchardet_get_candidates (uchardet_t ud);
119
+ UCHARDET_INTERFACE float uchardet_get_confidence (uchardet_t ud,
120
+ size_t candidate);
121
+ UCHARDET_INTERFACE const char * uchardet_get_encoding (uchardet_t ud,
122
+ size_t candidate);
123
+ UCHARDET_INTERFACE const char * uchardet_get_language (uchardet_t ud,
124
+ size_t candidate);
125
+
126
+ UCHARDET_INTERFACE void uchardet_weigh_language (uchardet_t ud,
127
+ const char *language,
128
+ float weight);
129
+ UCHARDET_INTERFACE void uchardet_set_default_weight (uchardet_t ud,
130
+ float weight);
131
+
132
+ #ifdef __cplusplus
133
+ }
134
+ #endif
135
+
136
+ #endif
@@ -0,0 +1,47 @@
1
# Sources of the test driver.
set(UCHARDET_TEST_SOURCES uchardet-tests.c)

# Test driver executable, linked against the library named by
# UCHARDET_LIBRARY (defined outside this file).
add_executable(uchardet-tests ${UCHARDET_TEST_SOURCES})
target_link_libraries(uchardet-tests ${UCHARDET_LIBRARY})

# Link the driver as a C program and pin the name of the binary.
set_target_properties(uchardet-tests PROPERTIES
    LINKER_LANGUAGE C
    OUTPUT_NAME     uchardet-tests
)
24
+
25
# These are tests known to fail (not supported or not efficient
# enough). We will have to take a closer look and fix these, but
# there is no need to break the whole `make test` right now,
# which may make actual regressions harder to notice.
set(known_broken
    "ja:utf-16le"
    "ja:utf-16be"
    "es:iso-8859-15"
    "da:iso-8859-1"
    "he:iso-8859-8"
)

# Every two-letter entry next to this file is a language directory and
# every file inside it is one test case, named "<lang>:<charset>".
file(GLOB dirs "${CMAKE_CURRENT_SOURCE_DIR}/[a-z][a-z]")
foreach(lang_dir ${dirs})
    get_filename_component(lang "${lang_dir}" NAME)
    file(GLOB files "${lang_dir}/*")
    foreach(test_file ${files})
        # The charset is the file name up to its first dot.
        get_filename_component(charset "${test_file}" NAME_WE)
        list(FIND known_broken "${lang}:${charset}" broken_index)
        if(broken_index EQUAL -1)
            add_test(NAME "${lang}:${charset}" COMMAND uchardet-tests "${test_file}")
        else()
            message(STATUS "Skipping test ${lang}:${charset} (known broken)")
        endif()
    endforeach()
endforeach()