cchardet 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.gitmodules +3 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +35 -0
- data/Rakefile +15 -0
- data/cchardet.gemspec +30 -0
- data/ext/cchardet/extconf.rb +26 -0
- data/ext/uchardet/.gitignore +1 -0
- data/ext/uchardet/.gitlab-ci.yml +106 -0
- data/ext/uchardet/AUTHORS +16 -0
- data/ext/uchardet/CMakeLists.txt +74 -0
- data/ext/uchardet/COPYING +1316 -0
- data/ext/uchardet/INSTALL +26 -0
- data/ext/uchardet/README.md +295 -0
- data/ext/uchardet/build-mac/uchardet.cpp +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
- data/ext/uchardet/doc/CMakeLists.txt +6 -0
- data/ext/uchardet/doc/README.maintainer +59 -0
- data/ext/uchardet/doc/uchardet.1 +18 -0
- data/ext/uchardet/script/BuildLangModel.py +533 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
- data/ext/uchardet/script/README +63 -0
- data/ext/uchardet/script/charsets/codepoints.py +53 -0
- data/ext/uchardet/script/charsets/db.py +73 -0
- data/ext/uchardet/script/charsets/ibm852.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
- data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
- data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
- data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
- data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
- data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
- data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
- data/ext/uchardet/script/charsets/tis-620.py +77 -0
- data/ext/uchardet/script/charsets/viscii.py +72 -0
- data/ext/uchardet/script/charsets/windows-1250.py +75 -0
- data/ext/uchardet/script/charsets/windows-1252.py +76 -0
- data/ext/uchardet/script/charsets/windows-1253.py +72 -0
- data/ext/uchardet/script/charsets/windows-1256.py +75 -0
- data/ext/uchardet/script/charsets/windows-1257.py +72 -0
- data/ext/uchardet/script/charsets/windows-1258.py +72 -0
- data/ext/uchardet/script/debug.sh +9 -0
- data/ext/uchardet/script/header-template.cpp +38 -0
- data/ext/uchardet/script/langs/ar.py +59 -0
- data/ext/uchardet/script/langs/cs.py +80 -0
- data/ext/uchardet/script/langs/da.py +69 -0
- data/ext/uchardet/script/langs/de.py +69 -0
- data/ext/uchardet/script/langs/el.py +55 -0
- data/ext/uchardet/script/langs/eo.py +67 -0
- data/ext/uchardet/script/langs/es.py +69 -0
- data/ext/uchardet/script/langs/et.py +57 -0
- data/ext/uchardet/script/langs/fi.py +60 -0
- data/ext/uchardet/script/langs/fr.py +79 -0
- data/ext/uchardet/script/langs/ga.py +60 -0
- data/ext/uchardet/script/langs/hr.py +59 -0
- data/ext/uchardet/script/langs/hu.py +66 -0
- data/ext/uchardet/script/langs/it.py +56 -0
- data/ext/uchardet/script/langs/lt.py +70 -0
- data/ext/uchardet/script/langs/lv.py +69 -0
- data/ext/uchardet/script/langs/mt.py +80 -0
- data/ext/uchardet/script/langs/pl.py +81 -0
- data/ext/uchardet/script/langs/pt.py +80 -0
- data/ext/uchardet/script/langs/ro.py +65 -0
- data/ext/uchardet/script/langs/sk.py +80 -0
- data/ext/uchardet/script/langs/sl.py +59 -0
- data/ext/uchardet/script/langs/sv.py +56 -0
- data/ext/uchardet/script/langs/th.py +55 -0
- data/ext/uchardet/script/langs/tr.py +67 -0
- data/ext/uchardet/script/langs/vi.py +64 -0
- data/ext/uchardet/script/release.sh +8 -0
- data/ext/uchardet/script/win32.sh +7 -0
- data/ext/uchardet/src/Big5Freq.tab +943 -0
- data/ext/uchardet/src/CMakeLists.txt +160 -0
- data/ext/uchardet/src/CharDistribution.cpp +109 -0
- data/ext/uchardet/src/CharDistribution.h +242 -0
- data/ext/uchardet/src/EUCKRFreq.tab +614 -0
- data/ext/uchardet/src/EUCTWFreq.tab +447 -0
- data/ext/uchardet/src/GB2312Freq.tab +491 -0
- data/ext/uchardet/src/JISFreq.tab +589 -0
- data/ext/uchardet/src/JpCntx.cpp +230 -0
- data/ext/uchardet/src/JpCntx.h +140 -0
- data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
- data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
- data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
- data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
- data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
- data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
- data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
- data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
- data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
- data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
- data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
- data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
- data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
- data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
- data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
- data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
- data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
- data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
- data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
- data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
- data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
- data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
- data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
- data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
- data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
- data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
- data/ext/uchardet/src/nsBig5Prober.h +75 -0
- data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
- data/ext/uchardet/src/nsCharSetProber.h +77 -0
- data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
- data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
- data/ext/uchardet/src/nsEUCJPProber.h +79 -0
- data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCKRProber.h +81 -0
- data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCTWProber.h +75 -0
- data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
- data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
- data/ext/uchardet/src/nsEscSM.cpp +267 -0
- data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
- data/ext/uchardet/src/nsGB2312Prober.h +77 -0
- data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
- data/ext/uchardet/src/nsHebrewProber.h +177 -0
- data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
- data/ext/uchardet/src/nsLatin1Prober.h +73 -0
- data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
- data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
- data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
- data/ext/uchardet/src/nsPkgInt.h +89 -0
- data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
- data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
- data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
- data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
- data/ext/uchardet/src/nsSJISProber.cpp +98 -0
- data/ext/uchardet/src/nsSJISProber.h +81 -0
- data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
- data/ext/uchardet/src/nsUTF8Prober.h +66 -0
- data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
- data/ext/uchardet/src/nsUniversalDetector.h +91 -0
- data/ext/uchardet/src/nscore.h +59 -0
- data/ext/uchardet/src/prmem.h +49 -0
- data/ext/uchardet/src/symbols.cmake +41 -0
- data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
- data/ext/uchardet/src/tools/uchardet.cpp +254 -0
- data/ext/uchardet/src/uchardet.cpp +274 -0
- data/ext/uchardet/src/uchardet.h +136 -0
- data/ext/uchardet/test/CMakeLists.txt +47 -0
- data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
- data/ext/uchardet/test/ar/utf-8.txt +3 -0
- data/ext/uchardet/test/ar/windows-1256.txt +3 -0
- data/ext/uchardet/test/bg/windows-1251.txt +3 -0
- data/ext/uchardet/test/cs/ibm852.txt +4 -0
- data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/cs/utf-8.txt +4 -0
- data/ext/uchardet/test/cs/windows-1250.txt +4 -0
- data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
- data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
- data/ext/uchardet/test/da/utf-8.txt +10 -0
- data/ext/uchardet/test/da/windows-1252.txt +10 -0
- data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
- data/ext/uchardet/test/de/windows-1252.txt +11 -0
- data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
- data/ext/uchardet/test/el/utf-8.txt +3 -0
- data/ext/uchardet/test/el/windows-1253.txt +5 -0
- data/ext/uchardet/test/en/ascii.txt +4 -0
- data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
- data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
- data/ext/uchardet/test/es/utf-8.txt +5 -0
- data/ext/uchardet/test/es/windows-1252.txt +5 -0
- data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/et/utf-8.txt +6 -0
- data/ext/uchardet/test/et/windows-1252.txt +6 -0
- data/ext/uchardet/test/et/windows-1257.txt +6 -0
- data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
- data/ext/uchardet/test/fi/utf-8.txt +8 -0
- data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
- data/ext/uchardet/test/fr/utf-16.be +0 -0
- data/ext/uchardet/test/fr/utf-32.le +0 -0
- data/ext/uchardet/test/fr/utf-8.txt +14 -0
- data/ext/uchardet/test/fr/windows-1252.txt +3 -0
- data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/ga/utf-8.txt +6 -0
- data/ext/uchardet/test/ga/windows-1252.txt +6 -0
- data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
- data/ext/uchardet/test/he/utf-8.txt +3 -0
- data/ext/uchardet/test/he/windows-1255.txt +1 -0
- data/ext/uchardet/test/hr/ibm852.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/hr/utf-8.txt +4 -0
- data/ext/uchardet/test/hr/windows-1250.txt +4 -0
- data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/hu/windows-1250.txt +1 -0
- data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
- data/ext/uchardet/test/it/utf-8.txt +18 -0
- data/ext/uchardet/test/ja/euc-jp.txt +10 -0
- data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
- data/ext/uchardet/test/ja/shift_jis.txt +1 -0
- data/ext/uchardet/test/ja/utf-16be.txt +0 -0
- data/ext/uchardet/test/ja/utf-16le.txt +0 -0
- data/ext/uchardet/test/ja/utf-8.txt +9 -0
- data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
- data/ext/uchardet/test/ko/uhc.smi +16 -0
- data/ext/uchardet/test/ko/utf-16.le +0 -0
- data/ext/uchardet/test/ko/utf-32.be +0 -0
- data/ext/uchardet/test/ko/utf-8.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
- data/ext/uchardet/test/lt/utf-8.txt +3 -0
- data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/lv/utf-8.txt +6 -0
- data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
- data/ext/uchardet/test/mt/utf-8.txt +4 -0
- data/ext/uchardet/test/pl/ibm852.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/pl/utf-8.txt +3 -0
- data/ext/uchardet/test/pl/windows-1250.txt +3 -0
- data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/pt/utf-8.txt +6 -0
- data/ext/uchardet/test/ro/ibm852.txt +9 -0
- data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/ro/utf-8.txt +9 -0
- data/ext/uchardet/test/ro/windows-1250.txt +9 -0
- data/ext/uchardet/test/ru/ibm855.txt +5 -0
- data/ext/uchardet/test/ru/ibm866.txt +11 -0
- data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
- data/ext/uchardet/test/ru/koi8-r.txt +1 -0
- data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
- data/ext/uchardet/test/ru/windows-1251.txt +4 -0
- data/ext/uchardet/test/sk/ibm852.txt +3 -0
- data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/sk/utf-8.txt +3 -0
- data/ext/uchardet/test/sk/windows-1250.txt +3 -0
- data/ext/uchardet/test/sl/ibm852.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
- data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
- data/ext/uchardet/test/sl/utf-8.txt +9 -0
- data/ext/uchardet/test/sl/windows-1250.txt +9 -0
- data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
- data/ext/uchardet/test/sv/utf-8.txt +10 -0
- data/ext/uchardet/test/sv/windows-1252.txt +10 -0
- data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
- data/ext/uchardet/test/th/tis-620.txt +5 -0
- data/ext/uchardet/test/th/utf-8.txt +1 -0
- data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
- data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
- data/ext/uchardet/test/uchardet-tests.c +130 -0
- data/ext/uchardet/test/vi/utf-8.txt +4 -0
- data/ext/uchardet/test/vi/viscii.txt +4 -0
- data/ext/uchardet/test/vi/windows-1258.txt +4 -0
- data/ext/uchardet/test/zh/big5.txt +1 -0
- data/ext/uchardet/test/zh/euc-tw.txt +1 -0
- data/ext/uchardet/test/zh/gb18030.txt +1 -0
- data/ext/uchardet/test/zh/utf-8.txt +1 -0
- data/ext/uchardet/uchardet.doap +51 -0
- data/ext/uchardet/uchardet.pc.in +10 -0
- data/lib/cchardet.rb +56 -0
- data/lib/cchardet/lib_finder.rb +32 -0
- data/lib/cchardet/version.rb +5 -0
- metadata +362 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
2
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
3
|
+
*
|
|
4
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
5
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
6
|
+
* the License. You may obtain a copy of the License at
|
|
7
|
+
* http://www.mozilla.org/MPL/
|
|
8
|
+
*
|
|
9
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
10
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
11
|
+
* for the specific language governing rights and limitations under the
|
|
12
|
+
* License.
|
|
13
|
+
*
|
|
14
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
15
|
+
*
|
|
16
|
+
* The Initial Developer of the Original Code is
|
|
17
|
+
* Netscape Communications Corporation.
|
|
18
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
19
|
+
* the Initial Developer. All Rights Reserved.
|
|
20
|
+
*
|
|
21
|
+
* Contributor(s):
|
|
22
|
+
* BYVoid <byvoid.kcp@gmail.com>
|
|
23
|
+
*
|
|
24
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
25
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
26
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
27
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
28
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
29
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
30
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
31
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
32
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
33
|
+
* the provisions above, a recipient may use your version of this file under
|
|
34
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
35
|
+
*
|
|
36
|
+
* ***** END LICENSE BLOCK ***** */
|
|
37
|
+
#include "uchardet.h"
|
|
38
|
+
#include <string.h>
|
|
39
|
+
#include <stdlib.h>
|
|
40
|
+
#include <map>
|
|
41
|
+
#include <string>
|
|
42
|
+
#include <vector>
|
|
43
|
+
#include "nscore.h"
|
|
44
|
+
#include "nsUniversalDetector.h"
|
|
45
|
+
|
|
46
|
+
/* One detection result: an encoding, an optional language and the
 * confidence attached to that pair.
 *
 * Entries stored in HandleUniversalDetector::candidates hold strings
 * allocated with strdup() and released in Reset(); entries stored in
 * weighed_candidates only copy those pointers and do not own them.
 */
typedef struct _UChardetCandidate
{
    /* Encoding name as passed to Report(). */
    char *encoding;
    /* Language as passed to Report(); NULL when none was reported. */
    char *language;
    /* Confidence for this candidate; lists are kept sorted on it,
     * highest first. */
    float confidence;
} UChardetCandidate;
|
|
52
|
+
|
|
53
|
+
class HandleUniversalDetector : public nsUniversalDetector
|
|
54
|
+
{
|
|
55
|
+
protected:
|
|
56
|
+
std::vector<UChardetCandidate> candidates;
|
|
57
|
+
std::vector<UChardetCandidate> weighed_candidates;
|
|
58
|
+
std::map<std::string, float> weights;
|
|
59
|
+
float default_weight;
|
|
60
|
+
|
|
61
|
+
public:
|
|
62
|
+
HandleUniversalDetector()
|
|
63
|
+
: nsUniversalDetector(NS_FILTER_ALL), default_weight(1.0)
|
|
64
|
+
{
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
virtual ~HandleUniversalDetector()
|
|
68
|
+
{
|
|
69
|
+
Reset();
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
virtual void Report(const char *encoding,
|
|
73
|
+
const char *language,
|
|
74
|
+
float confidence)
|
|
75
|
+
{
|
|
76
|
+
std::vector<UChardetCandidate>::iterator it;
|
|
77
|
+
UChardetCandidate candidate;
|
|
78
|
+
|
|
79
|
+
for (it = candidates.begin(); it != candidates.end(); it++)
|
|
80
|
+
{
|
|
81
|
+
if (strcmp(it->encoding, encoding) == 0 &&
|
|
82
|
+
it->language && language && strcmp(it->language, language) == 0)
|
|
83
|
+
{
|
|
84
|
+
/* Already reported. Bail out or update the confidence
|
|
85
|
+
* when needed.
|
|
86
|
+
*/
|
|
87
|
+
if (confidence > it->confidence)
|
|
88
|
+
{
|
|
89
|
+
candidates.erase(it);
|
|
90
|
+
break;
|
|
91
|
+
}
|
|
92
|
+
else
|
|
93
|
+
{
|
|
94
|
+
return;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
candidate = UChardetCandidate();
|
|
100
|
+
candidate.encoding = strdup(encoding);
|
|
101
|
+
candidate.language = language ? strdup(language) : NULL;
|
|
102
|
+
candidate.confidence = confidence;
|
|
103
|
+
|
|
104
|
+
for (it = candidates.begin(); it != candidates.end(); it++)
|
|
105
|
+
{
|
|
106
|
+
if (it->confidence < confidence)
|
|
107
|
+
break;
|
|
108
|
+
}
|
|
109
|
+
candidates.insert(it, candidate);
|
|
110
|
+
|
|
111
|
+
if (weights.size() > 0)
|
|
112
|
+
WeighCandidates();
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
virtual void Reset()
|
|
116
|
+
{
|
|
117
|
+
std::vector<UChardetCandidate>::iterator it;
|
|
118
|
+
|
|
119
|
+
nsUniversalDetector::Reset();
|
|
120
|
+
for (it = candidates.begin(); it != candidates.end(); it++)
|
|
121
|
+
{
|
|
122
|
+
free(it->encoding);
|
|
123
|
+
if (it->language)
|
|
124
|
+
free(it->language);
|
|
125
|
+
}
|
|
126
|
+
candidates.clear();
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
size_t GetCandidates() const
|
|
130
|
+
{
|
|
131
|
+
return candidates.size();
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
const char* GetCharset(size_t i)
|
|
135
|
+
{
|
|
136
|
+
if (weights.size() > 0)
|
|
137
|
+
return (weighed_candidates.size() > i) ? weighed_candidates[i].encoding : "";
|
|
138
|
+
return (candidates.size() > i) ? candidates[i].encoding : "";
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
float GetConfidence(size_t i)
|
|
142
|
+
{
|
|
143
|
+
if (weights.size() > 0)
|
|
144
|
+
return (weighed_candidates.size() > i) ? weighed_candidates[i].confidence : 0.0;
|
|
145
|
+
return (candidates.size() > i) ? candidates[i].confidence : 0.0;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
const char* GetLanguage(size_t i)
|
|
149
|
+
{
|
|
150
|
+
if (weights.size() > 0)
|
|
151
|
+
return (weighed_candidates.size() > i) ? weighed_candidates[i].language : NULL;
|
|
152
|
+
return (candidates.size() > i) ? candidates[i].language : NULL;
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
void WeighLanguage(const char *language,
|
|
156
|
+
float weight)
|
|
157
|
+
{
|
|
158
|
+
weights[language] = weight;
|
|
159
|
+
WeighCandidates();
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
void WeighDefault(float weight)
|
|
163
|
+
{
|
|
164
|
+
default_weight = weight;
|
|
165
|
+
WeighCandidates();
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
private:
|
|
169
|
+
|
|
170
|
+
void WeighCandidates()
|
|
171
|
+
{
|
|
172
|
+
std::vector<UChardetCandidate>::iterator it;
|
|
173
|
+
std::vector<UChardetCandidate>::iterator it2;
|
|
174
|
+
UChardetCandidate candidate;
|
|
175
|
+
|
|
176
|
+
weighed_candidates.clear();
|
|
177
|
+
for (it = candidates.begin(); it != candidates.end(); it++)
|
|
178
|
+
{
|
|
179
|
+
std::map<std::string, float>::iterator weight_it;
|
|
180
|
+
float confidence;
|
|
181
|
+
|
|
182
|
+
confidence = it->confidence * default_weight;
|
|
183
|
+
if (it->language)
|
|
184
|
+
{
|
|
185
|
+
weight_it = weights.find(it->language);
|
|
186
|
+
if (weight_it != weights.end())
|
|
187
|
+
confidence = weight_it->second * it->confidence;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
candidate = UChardetCandidate();
|
|
191
|
+
candidate.encoding = it->encoding;
|
|
192
|
+
candidate.language = it->language;
|
|
193
|
+
candidate.confidence = confidence;
|
|
194
|
+
|
|
195
|
+
for (it2 = weighed_candidates.begin(); it2 != weighed_candidates.end(); it2++)
|
|
196
|
+
{
|
|
197
|
+
if (it2->confidence < confidence)
|
|
198
|
+
break;
|
|
199
|
+
}
|
|
200
|
+
weighed_candidates.insert(it2, candidate);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
};
|
|
204
|
+
|
|
205
|
+
/* Create a detector; the returned handle is the C++ object seen through
 * the opaque uchardet_t pointer type. */
uchardet_t uchardet_new(void)
{
    HandleUniversalDetector *detector = new HandleUniversalDetector();

    return reinterpret_cast<uchardet_t>(detector);
}
|
|
209
|
+
|
|
210
|
+
/* Destroy a detector created by uchardet_new(). */
void uchardet_delete(uchardet_t ud)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    delete detector;
}
|
|
214
|
+
|
|
215
|
+
/* Feed `len` bytes of `data` to the detector.
 *
 * HandleData() receives its length as a PRUint32, so a plain cast would
 * silently drop the high bits of a large size_t. The buffer is therefore
 * fed in chunks small enough to fit any 32-bit length type, stopping at
 * the first failure.
 *
 * Returns 0 on success (including when len is 0), non-zero on failure.
 */
int uchardet_handle_data(uchardet_t ud, const char * data, size_t len)
{
    const size_t             max_chunk = 0x7FFFFFFF;
    HandleUniversalDetector *detector  =
        reinterpret_cast<HandleUniversalDetector*>(ud);
    nsresult                 ret       = NS_OK;

    while (len > 0 && ret == NS_OK)
    {
        const size_t chunk = (len < max_chunk) ? len : max_chunk;

        ret = detector->HandleData(data, (PRUint32)chunk);
        data += chunk;
        len  -= chunk;
    }

    return (ret != NS_OK);
}
|
|
224
|
+
|
|
225
|
+
/* Tell the detector that no more data will be fed. */
void uchardet_data_end(uchardet_t ud)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    detector->DataEnd();
}
|
|
229
|
+
|
|
230
|
+
/* Reset the detector so it can be reused on new input. */
void uchardet_reset(uchardet_t ud)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    detector->Reset();
}
|
|
234
|
+
|
|
235
|
+
/* Encoding of the best candidate (index 0), or "" when there is none. */
const char* uchardet_get_charset(uchardet_t ud)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    return detector->GetCharset(0);
}
|
|
239
|
+
|
|
240
|
+
/* Number of candidates recorded by the detector. */
size_t uchardet_get_candidates (uchardet_t ud)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    return detector->GetCandidates();
}
|
|
244
|
+
|
|
245
|
+
/* Confidence of the candidate at index `candidate`. */
float uchardet_get_confidence (uchardet_t ud,
                               size_t     candidate)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    return detector->GetConfidence(candidate);
}
|
|
250
|
+
|
|
251
|
+
/* Encoding name of the candidate at index `candidate`. */
const char * uchardet_get_encoding (uchardet_t ud,
                                    size_t     candidate)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    return detector->GetCharset(candidate);
}
|
|
256
|
+
|
|
257
|
+
/* Language of the candidate at index `candidate`. */
const char * uchardet_get_language (uchardet_t ud,
                                    size_t     candidate)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    return detector->GetLanguage(candidate);
}
|
|
262
|
+
|
|
263
|
+
/* Register the weight applied to candidates reported for `language`. */
void uchardet_weigh_language (uchardet_t  ud,
                              const char *language,
                              float       weight)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    detector->WeighLanguage(language, weight);
}
|
|
269
|
+
|
|
270
|
+
/* Set the weight applied to candidates without a per-language weight. */
void uchardet_set_default_weight (uchardet_t ud,
                                  float      weight)
{
    HandleUniversalDetector *detector =
        reinterpret_cast<HandleUniversalDetector*>(ud);

    detector->WeighDefault(weight);
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
2
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
3
|
+
*
|
|
4
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
5
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
6
|
+
* the License. You may obtain a copy of the License at
|
|
7
|
+
* http://www.mozilla.org/MPL/
|
|
8
|
+
*
|
|
9
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
10
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
11
|
+
* for the specific language governing rights and limitations under the
|
|
12
|
+
* License.
|
|
13
|
+
*
|
|
14
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
15
|
+
*
|
|
16
|
+
* The Initial Developer of the Original Code is
|
|
17
|
+
* Netscape Communications Corporation.
|
|
18
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
19
|
+
* the Initial Developer. All Rights Reserved.
|
|
20
|
+
*
|
|
21
|
+
* Contributor(s):
|
|
22
|
+
* BYVoid <byvoid.kcp@gmail.com>
|
|
23
|
+
* Jehan <jehan at girinstud.io>
|
|
24
|
+
*
|
|
25
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
26
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
27
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
28
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
29
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
30
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
31
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
32
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
33
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
34
|
+
* the provisions above, a recipient may use your version of this file under
|
|
35
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
36
|
+
*
|
|
37
|
+
* ***** END LICENSE BLOCK ***** */
|
|
38
|
+
#ifndef UCHARDET_H___
|
|
39
|
+
#define UCHARDET_H___
|
|
40
|
+
|
|
41
|
+
#ifdef __cplusplus
|
|
42
|
+
extern "C" {
|
|
43
|
+
#endif
|
|
44
|
+
|
|
45
|
+
#include <stddef.h>
|
|
46
|
+
|
|
47
|
+
/* Decoration placed on every public declaration. It is empty unless the
 * library is used as a shared library (UCHARDET_SHARED) on Windows or
 * Cygwin, where it exports the symbols while the library itself is being
 * built (BUILDING_UCHARDET) and imports them otherwise. */
#if !defined(UCHARDET_SHARED) || !(defined(_WIN32) || defined(__CYGWIN__))
#define UCHARDET_INTERFACE
#elif defined(BUILDING_UCHARDET)
#define UCHARDET_INTERFACE __declspec(dllexport)
#else
#define UCHARDET_INTERFACE __declspec(dllimport)
#endif
|
|
56
|
+
|
|
57
|
+
/* DEPRECATED(message) marks the declaration that follows it as
 * deprecated, using whichever spelling the compiler supports:
 *  - C++14 and later: the standard [[deprecated]] attribute;
 *  - GCC and Clang: __attribute__ ((deprecated)), message not forwarded;
 *  - MSVC: __declspec(deprecated(message));
 *  - anything else: expands to nothing, after a preprocessor warning.
 */
#if defined(__cplusplus) && (__cplusplus >= 201402L)
#define DEPRECATED(message) [[deprecated(message)]]
#elif defined(__GNUC__) || defined(__clang__)
#define DEPRECATED(message) __attribute__ ((deprecated))
#elif defined(_MSC_VER)
/* The expansion must be a bare declaration specifier: any extra token
 * would be pasted in front of the return type of the declaration. */
#define DEPRECATED(message) __declspec(deprecated(message))
#else
#warning("DEPRECATED macro not available")
#define DEPRECATED(message)
#endif
|
|
67
|
+
|
|
68
|
+
/**
 * An opaque handle for a uchardet encoding detector.
 * Create it with uchardet_new() and release it with uchardet_delete().
 */
typedef struct uchardet * uchardet_t;

/**
 * Create an encoding detector.
 * @return an instance of uchardet_t.
 */
UCHARDET_INTERFACE uchardet_t uchardet_new(void);

/**
 * Delete an encoding detector.
 * @param ud [in] the uchardet_t handle to delete.
 */
UCHARDET_INTERFACE void uchardet_delete(uchardet_t ud);

/**
 * Feed data to an encoding detector.
 * The detector is able to shortcut processing when it reaches certainty
 * for an encoding, so you should not worry about limiting input data.
 * As far as you should be concerned: the more the better.
 *
 * @param ud [in] handle of an instance of uchardet
 * @param data [in] data
 * @param len [in] number of bytes in data
 * @return 0 on success, a non-zero number on failure.
 */
UCHARDET_INTERFACE int uchardet_handle_data(uchardet_t ud, const char * data, size_t len);

/**
 * Notify an encoding detector that the end of the data was reached.
 * @param ud [in] handle of an instance of uchardet
 */
UCHARDET_INTERFACE void uchardet_data_end(uchardet_t ud);

/**
 * Reset an encoding detector.
 * @param ud [in] handle of an instance of uchardet
 */
UCHARDET_INTERFACE void uchardet_reset(uchardet_t ud);

/**
 * Get an iconv-compatible name of the encoding that was detected.
 * This is the encoding of the best candidate (candidate 0).
 * @param ud [in] handle of an instance of uchardet
 * @return name of charset on success and "" on failure.
 */
DEPRECATED("use uchardet_get_candidates() and uchardet_get_encoding() instead (since 0.1.0)")
UCHARDET_INTERFACE const char * uchardet_get_charset(uchardet_t ud);
|
|
117
|
+
|
|
118
|
+
/**
 * Get the number of candidates recorded by an encoding detector.
 * Candidates are indexed from 0, in decreasing order of confidence.
 * @param ud [in] handle of an instance of uchardet
 * @return the number of candidates.
 */
UCHARDET_INTERFACE size_t uchardet_get_candidates (uchardet_t ud);
/**
 * Get the confidence of a candidate.
 * When language weights were set with uchardet_weigh_language(), the
 * value returned is the weighted confidence.
 * @param ud [in] handle of an instance of uchardet
 * @param candidate [in] index of the candidate
 * @return the confidence, or 0.0 when the index is out of range.
 */
UCHARDET_INTERFACE float uchardet_get_confidence (uchardet_t ud,
size_t candidate);
/**
 * Get the encoding name of a candidate.
 * The string belongs to the detector: do not free it, and do not use it
 * after the detector was reset or deleted.
 * @param ud [in] handle of an instance of uchardet
 * @param candidate [in] index of the candidate
 * @return the encoding name, or "" when the index is out of range.
 */
UCHARDET_INTERFACE const char * uchardet_get_encoding (uchardet_t ud,
size_t candidate);
/**
 * Get the language of a candidate.
 * The string belongs to the detector: do not free it, and do not use it
 * after the detector was reset or deleted.
 * @param ud [in] handle of an instance of uchardet
 * @param candidate [in] index of the candidate
 * @return the language, or NULL when the index is out of range or when
 * the candidate was reported without a language.
 */
UCHARDET_INTERFACE const char * uchardet_get_language (uchardet_t ud,
size_t candidate);

/**
 * Set the weight by which the confidence of candidates reported for a
 * given language is multiplied, and re-rank the candidates.
 * @param ud [in] handle of an instance of uchardet
 * @param language [in] the language to weigh
 * @param weight [in] multiplier applied to the confidence
 */
UCHARDET_INTERFACE void uchardet_weigh_language (uchardet_t ud,
const char *language,
float weight);
/**
 * Set the weight by which the confidence of candidates without a
 * language-specific weight is multiplied (1.0 initially).
 * NOTE(review): the implementation only consults the weighted ranking
 * once at least one language weight was set, so this call alone does not
 * appear to change the reported results -- confirm this is intended.
 * @param ud [in] handle of an instance of uchardet
 * @param weight [in] multiplier applied to the confidence
 */
UCHARDET_INTERFACE void uchardet_set_default_weight (uchardet_t ud,
float weight);
|
|
131
|
+
|
|
132
|
+
#ifdef __cplusplus
|
|
133
|
+
}
|
|
134
|
+
#endif
|
|
135
|
+
|
|
136
|
+
#endif
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Sources of the test driver executable.
set(
    UCHARDET_TEST_SOURCES
    uchardet-tests.c
)

add_executable(
    uchardet-tests
    ${UCHARDET_TEST_SOURCES}
)

# Use the keyword signature: the library is only needed to build the test
# driver itself, and the keyword-less form has legacy semantics.
target_link_libraries(
    uchardet-tests
    PRIVATE
    ${UCHARDET_LIBRARY}
)

# Link the driver with the C linker language.
set_target_properties(
    uchardet-tests
    PROPERTIES
        LINKER_LANGUAGE
            C
        OUTPUT_NAME
            uchardet-tests
)
|
|
24
|
+
|
|
25
|
+
# Register one test per sample file. Samples live in two-letter language
# directories next to this file; the test is named "<lang>:<charset>",
# where <charset> is the file name cut at its first dot.
#
# The glob is anchored on the current source directory, entries that are
# not directories (or not files, one level down) are ignored, and paths
# are quoted so that unusual characters cannot split them into several
# arguments.
file(GLOB dirs "${CMAKE_CURRENT_SOURCE_DIR}/[a-z][a-z]")
foreach(dir ${dirs})
    if(IS_DIRECTORY "${dir}")
        get_filename_component(lang "${dir}" NAME)
        file(GLOB files "${dir}/*")
        # Iterate through all files.
        foreach(file ${files})
            if(NOT IS_DIRECTORY "${file}")
                get_filename_component(charset "${file}" NAME_WE)
                # These are tests known to fail (not supported or not efficient
                # enough). We will have to take a closer look and fix these, but
                # there is no need to break the whole `make test` right now,
                # which may make actual regressions harder to notice.
                if ("${lang}:${charset}" STREQUAL "ja:utf-16le" OR
                    "${lang}:${charset}" STREQUAL "ja:utf-16be" OR
                    "${lang}:${charset}" STREQUAL "es:iso-8859-15" OR
                    "${lang}:${charset}" STREQUAL "da:iso-8859-1" OR
                    "${lang}:${charset}" STREQUAL "he:iso-8859-8")
                    message(STATUS "Skipping test ${lang}:${charset} (known broken)")
                else()
                    add_test(NAME "${lang}:${charset}" COMMAND uchardet-tests "${file}")
                endif()
            endif()
        endforeach()
    endif()
endforeach()
|