cchardet 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.gitmodules +3 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +35 -0
- data/Rakefile +15 -0
- data/cchardet.gemspec +30 -0
- data/ext/cchardet/extconf.rb +26 -0
- data/ext/uchardet/.gitignore +1 -0
- data/ext/uchardet/.gitlab-ci.yml +106 -0
- data/ext/uchardet/AUTHORS +16 -0
- data/ext/uchardet/CMakeLists.txt +74 -0
- data/ext/uchardet/COPYING +1316 -0
- data/ext/uchardet/INSTALL +26 -0
- data/ext/uchardet/README.md +295 -0
- data/ext/uchardet/build-mac/uchardet.cpp +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
- data/ext/uchardet/doc/CMakeLists.txt +6 -0
- data/ext/uchardet/doc/README.maintainer +59 -0
- data/ext/uchardet/doc/uchardet.1 +18 -0
- data/ext/uchardet/script/BuildLangModel.py +533 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
- data/ext/uchardet/script/README +63 -0
- data/ext/uchardet/script/charsets/codepoints.py +53 -0
- data/ext/uchardet/script/charsets/db.py +73 -0
- data/ext/uchardet/script/charsets/ibm852.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
- data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
- data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
- data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
- data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
- data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
- data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
- data/ext/uchardet/script/charsets/tis-620.py +77 -0
- data/ext/uchardet/script/charsets/viscii.py +72 -0
- data/ext/uchardet/script/charsets/windows-1250.py +75 -0
- data/ext/uchardet/script/charsets/windows-1252.py +76 -0
- data/ext/uchardet/script/charsets/windows-1253.py +72 -0
- data/ext/uchardet/script/charsets/windows-1256.py +75 -0
- data/ext/uchardet/script/charsets/windows-1257.py +72 -0
- data/ext/uchardet/script/charsets/windows-1258.py +72 -0
- data/ext/uchardet/script/debug.sh +9 -0
- data/ext/uchardet/script/header-template.cpp +38 -0
- data/ext/uchardet/script/langs/ar.py +59 -0
- data/ext/uchardet/script/langs/cs.py +80 -0
- data/ext/uchardet/script/langs/da.py +69 -0
- data/ext/uchardet/script/langs/de.py +69 -0
- data/ext/uchardet/script/langs/el.py +55 -0
- data/ext/uchardet/script/langs/eo.py +67 -0
- data/ext/uchardet/script/langs/es.py +69 -0
- data/ext/uchardet/script/langs/et.py +57 -0
- data/ext/uchardet/script/langs/fi.py +60 -0
- data/ext/uchardet/script/langs/fr.py +79 -0
- data/ext/uchardet/script/langs/ga.py +60 -0
- data/ext/uchardet/script/langs/hr.py +59 -0
- data/ext/uchardet/script/langs/hu.py +66 -0
- data/ext/uchardet/script/langs/it.py +56 -0
- data/ext/uchardet/script/langs/lt.py +70 -0
- data/ext/uchardet/script/langs/lv.py +69 -0
- data/ext/uchardet/script/langs/mt.py +80 -0
- data/ext/uchardet/script/langs/pl.py +81 -0
- data/ext/uchardet/script/langs/pt.py +80 -0
- data/ext/uchardet/script/langs/ro.py +65 -0
- data/ext/uchardet/script/langs/sk.py +80 -0
- data/ext/uchardet/script/langs/sl.py +59 -0
- data/ext/uchardet/script/langs/sv.py +56 -0
- data/ext/uchardet/script/langs/th.py +55 -0
- data/ext/uchardet/script/langs/tr.py +67 -0
- data/ext/uchardet/script/langs/vi.py +64 -0
- data/ext/uchardet/script/release.sh +8 -0
- data/ext/uchardet/script/win32.sh +7 -0
- data/ext/uchardet/src/Big5Freq.tab +943 -0
- data/ext/uchardet/src/CMakeLists.txt +160 -0
- data/ext/uchardet/src/CharDistribution.cpp +109 -0
- data/ext/uchardet/src/CharDistribution.h +242 -0
- data/ext/uchardet/src/EUCKRFreq.tab +614 -0
- data/ext/uchardet/src/EUCTWFreq.tab +447 -0
- data/ext/uchardet/src/GB2312Freq.tab +491 -0
- data/ext/uchardet/src/JISFreq.tab +589 -0
- data/ext/uchardet/src/JpCntx.cpp +230 -0
- data/ext/uchardet/src/JpCntx.h +140 -0
- data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
- data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
- data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
- data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
- data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
- data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
- data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
- data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
- data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
- data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
- data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
- data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
- data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
- data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
- data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
- data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
- data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
- data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
- data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
- data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
- data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
- data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
- data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
- data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
- data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
- data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
- data/ext/uchardet/src/nsBig5Prober.h +75 -0
- data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
- data/ext/uchardet/src/nsCharSetProber.h +77 -0
- data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
- data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
- data/ext/uchardet/src/nsEUCJPProber.h +79 -0
- data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCKRProber.h +81 -0
- data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCTWProber.h +75 -0
- data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
- data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
- data/ext/uchardet/src/nsEscSM.cpp +267 -0
- data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
- data/ext/uchardet/src/nsGB2312Prober.h +77 -0
- data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
- data/ext/uchardet/src/nsHebrewProber.h +177 -0
- data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
- data/ext/uchardet/src/nsLatin1Prober.h +73 -0
- data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
- data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
- data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
- data/ext/uchardet/src/nsPkgInt.h +89 -0
- data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
- data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
- data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
- data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
- data/ext/uchardet/src/nsSJISProber.cpp +98 -0
- data/ext/uchardet/src/nsSJISProber.h +81 -0
- data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
- data/ext/uchardet/src/nsUTF8Prober.h +66 -0
- data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
- data/ext/uchardet/src/nsUniversalDetector.h +91 -0
- data/ext/uchardet/src/nscore.h +59 -0
- data/ext/uchardet/src/prmem.h +49 -0
- data/ext/uchardet/src/symbols.cmake +41 -0
- data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
- data/ext/uchardet/src/tools/uchardet.cpp +254 -0
- data/ext/uchardet/src/uchardet.cpp +274 -0
- data/ext/uchardet/src/uchardet.h +136 -0
- data/ext/uchardet/test/CMakeLists.txt +47 -0
- data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
- data/ext/uchardet/test/ar/utf-8.txt +3 -0
- data/ext/uchardet/test/ar/windows-1256.txt +3 -0
- data/ext/uchardet/test/bg/windows-1251.txt +3 -0
- data/ext/uchardet/test/cs/ibm852.txt +4 -0
- data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/cs/utf-8.txt +4 -0
- data/ext/uchardet/test/cs/windows-1250.txt +4 -0
- data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
- data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
- data/ext/uchardet/test/da/utf-8.txt +10 -0
- data/ext/uchardet/test/da/windows-1252.txt +10 -0
- data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
- data/ext/uchardet/test/de/windows-1252.txt +11 -0
- data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
- data/ext/uchardet/test/el/utf-8.txt +3 -0
- data/ext/uchardet/test/el/windows-1253.txt +5 -0
- data/ext/uchardet/test/en/ascii.txt +4 -0
- data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
- data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
- data/ext/uchardet/test/es/utf-8.txt +5 -0
- data/ext/uchardet/test/es/windows-1252.txt +5 -0
- data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/et/utf-8.txt +6 -0
- data/ext/uchardet/test/et/windows-1252.txt +6 -0
- data/ext/uchardet/test/et/windows-1257.txt +6 -0
- data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
- data/ext/uchardet/test/fi/utf-8.txt +8 -0
- data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
- data/ext/uchardet/test/fr/utf-16.be +0 -0
- data/ext/uchardet/test/fr/utf-32.le +0 -0
- data/ext/uchardet/test/fr/utf-8.txt +14 -0
- data/ext/uchardet/test/fr/windows-1252.txt +3 -0
- data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/ga/utf-8.txt +6 -0
- data/ext/uchardet/test/ga/windows-1252.txt +6 -0
- data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
- data/ext/uchardet/test/he/utf-8.txt +3 -0
- data/ext/uchardet/test/he/windows-1255.txt +1 -0
- data/ext/uchardet/test/hr/ibm852.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/hr/utf-8.txt +4 -0
- data/ext/uchardet/test/hr/windows-1250.txt +4 -0
- data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/hu/windows-1250.txt +1 -0
- data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
- data/ext/uchardet/test/it/utf-8.txt +18 -0
- data/ext/uchardet/test/ja/euc-jp.txt +10 -0
- data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
- data/ext/uchardet/test/ja/shift_jis.txt +1 -0
- data/ext/uchardet/test/ja/utf-16be.txt +0 -0
- data/ext/uchardet/test/ja/utf-16le.txt +0 -0
- data/ext/uchardet/test/ja/utf-8.txt +9 -0
- data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
- data/ext/uchardet/test/ko/uhc.smi +16 -0
- data/ext/uchardet/test/ko/utf-16.le +0 -0
- data/ext/uchardet/test/ko/utf-32.be +0 -0
- data/ext/uchardet/test/ko/utf-8.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
- data/ext/uchardet/test/lt/utf-8.txt +3 -0
- data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/lv/utf-8.txt +6 -0
- data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
- data/ext/uchardet/test/mt/utf-8.txt +4 -0
- data/ext/uchardet/test/pl/ibm852.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/pl/utf-8.txt +3 -0
- data/ext/uchardet/test/pl/windows-1250.txt +3 -0
- data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/pt/utf-8.txt +6 -0
- data/ext/uchardet/test/ro/ibm852.txt +9 -0
- data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/ro/utf-8.txt +9 -0
- data/ext/uchardet/test/ro/windows-1250.txt +9 -0
- data/ext/uchardet/test/ru/ibm855.txt +5 -0
- data/ext/uchardet/test/ru/ibm866.txt +11 -0
- data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
- data/ext/uchardet/test/ru/koi8-r.txt +1 -0
- data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
- data/ext/uchardet/test/ru/windows-1251.txt +4 -0
- data/ext/uchardet/test/sk/ibm852.txt +3 -0
- data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/sk/utf-8.txt +3 -0
- data/ext/uchardet/test/sk/windows-1250.txt +3 -0
- data/ext/uchardet/test/sl/ibm852.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
- data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
- data/ext/uchardet/test/sl/utf-8.txt +9 -0
- data/ext/uchardet/test/sl/windows-1250.txt +9 -0
- data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
- data/ext/uchardet/test/sv/utf-8.txt +10 -0
- data/ext/uchardet/test/sv/windows-1252.txt +10 -0
- data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
- data/ext/uchardet/test/th/tis-620.txt +5 -0
- data/ext/uchardet/test/th/utf-8.txt +1 -0
- data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
- data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
- data/ext/uchardet/test/uchardet-tests.c +130 -0
- data/ext/uchardet/test/vi/utf-8.txt +4 -0
- data/ext/uchardet/test/vi/viscii.txt +4 -0
- data/ext/uchardet/test/vi/windows-1258.txt +4 -0
- data/ext/uchardet/test/zh/big5.txt +1 -0
- data/ext/uchardet/test/zh/euc-tw.txt +1 -0
- data/ext/uchardet/test/zh/gb18030.txt +1 -0
- data/ext/uchardet/test/zh/utf-8.txt +1 -0
- data/ext/uchardet/uchardet.doap +51 -0
- data/ext/uchardet/uchardet.pc.in +10 -0
- data/lib/cchardet.rb +56 -0
- data/lib/cchardet/lib_finder.rb +32 -0
- data/lib/cchardet/version.rb +5 -0
- metadata +362 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is mozilla.org code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 1998
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
*
|
|
24
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
25
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
26
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
27
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
28
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
29
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
30
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
31
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
32
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
33
|
+
* the provisions above, a recipient may use your version of this file under
|
|
34
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
35
|
+
*
|
|
36
|
+
* ***** END LICENSE BLOCK ***** */
|
|
37
|
+
|
|
38
|
+
#ifndef nsBig5Prober_h__
|
|
39
|
+
#define nsBig5Prober_h__
|
|
40
|
+
|
|
41
|
+
#include "nsCharSetProber.h"
|
|
42
|
+
#include "nsCodingStateMachine.h"
|
|
43
|
+
#include "CharDistribution.h"
|
|
44
|
+
|
|
45
|
+
class nsBig5Prober: public nsCharSetProber {
|
|
46
|
+
public:
|
|
47
|
+
nsBig5Prober(PRBool aIsPreferredLanguage)
|
|
48
|
+
:mIsPreferredLanguage(aIsPreferredLanguage)
|
|
49
|
+
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
|
50
|
+
Reset();}
|
|
51
|
+
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
|
52
|
+
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
53
|
+
const char* GetCharSetName() {return "BIG5";}
|
|
54
|
+
const char* GetLanguage() {return "zh";}
|
|
55
|
+
nsProbingState GetState(void) {return mState;}
|
|
56
|
+
void Reset(void);
|
|
57
|
+
float GetConfidence(void);
|
|
58
|
+
void SetOpion() {}
|
|
59
|
+
|
|
60
|
+
protected:
|
|
61
|
+
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
|
62
|
+
|
|
63
|
+
nsCodingStateMachine* mCodingSM;
|
|
64
|
+
nsProbingState mState;
|
|
65
|
+
|
|
66
|
+
//Big5ContextAnalysis mContextAnalyser;
|
|
67
|
+
Big5DistributionAnalysis mDistributionAnalyser;
|
|
68
|
+
char mLastChar[2];
|
|
69
|
+
PRBool mIsPreferredLanguage;
|
|
70
|
+
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
#endif /* nsBig5Prober_h__ */
|
|
75
|
+
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
* Shy Shalom <shooshX@gmail.com>
|
|
24
|
+
*
|
|
25
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
26
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
27
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
28
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
29
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
30
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
31
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
32
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
33
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
34
|
+
* the provisions above, a recipient may use your version of this file under
|
|
35
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
36
|
+
*
|
|
37
|
+
* ***** END LICENSE BLOCK ***** */
|
|
38
|
+
|
|
39
|
+
#include "nsCharSetProber.h"
|
|
40
|
+
#include "prmem.h"
|
|
41
|
+
|
|
42
|
+
//This filter applies to all scripts which do not use English characters
|
|
43
|
+
// Copies into a freshly allocated buffer only those segments of aBuf that
// contain at least one byte with the high bit set. A segment is a run of
// bytes bounded by 7-bit bytes that are not ASCII letters; each kept segment
// that is closed by such a delimiter is followed by a single space.
//
//   aBuf, aLen  input bytes.
//   newBuf      receives the PR_Malloc'ed output buffer (aLen bytes); the
//               stored pointer is null when allocation fails.
//   newLen      receives the number of bytes written; left untouched on
//               allocation failure.
//
// Returns PR_FALSE if the allocation fails, PR_TRUE otherwise.
PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  char* out = (char*)PR_Malloc(aLen);
  *newBuf = out;
  if (!out)
    return PR_FALSE;

  const char* const end = aBuf + aLen;
  const char* segStart = aBuf;   // first byte of the segment being scanned
  const char* cur = aBuf;
  PRBool sawHighByte = PR_FALSE; // does the current segment hold a byte >= 0x80?

  for (; cur < end; cur++)
  {
    const char ch = *cur;

    if (ch & 0x80)
    {
      sawHighByte = PR_TRUE;
      continue;
    }

    // ASCII letters belong to the segment; they never end it.
    if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
      continue;

    // Any other 7-bit byte (most likely punctuation) closes the segment.
    if (sawHighByte && cur > segStart)
    {
      // The segment is non-empty and has upper-ASCII content: keep it and
      // replace the delimiter with one space.
      while (segStart < cur)
        *out++ = *segStart++;
      segStart++;
      *out++ = ' ';
      sawHighByte = PR_FALSE;
    }
    else
    {
      // Lone symbol or a purely English word: drop it.
      segStart = cur + 1;
    }
  }

  // Flush a trailing segment that was not closed by a delimiter.
  if (sawHighByte && cur > segStart)
  {
    while (segStart < cur)
      *out++ = *segStart++;
  }

  newLen = (PRUint32) (out - *newBuf);

  return PR_TRUE;
}
|
|
81
|
+
|
|
82
|
+
//This filter applies to all scripts which contain both English characters and upper ASCII characters.
|
|
83
|
+
// Reduces the load on the probers by copying aBuf into a freshly allocated
// buffer while dropping stand-alone symbols and text seen while the
// "inside a tag" flag is set. Runs of ASCII letters and high-bit bytes are
// kept; the 7-bit non-letter byte that closes a kept run becomes one space.
//
//   aBuf, aLen  input bytes.
//   newBuf      receives the PR_Malloc'ed output buffer (aLen bytes); the
//               stored pointer is null when allocation fails.
//   newLen      receives the number of bytes written; left untouched on
//               allocation failure.
//
// Returns PR_FALSE if the allocation fails, PR_TRUE otherwise.
PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen)
{
  char* out = (char*)PR_Malloc(aLen);
  *newBuf = out;
  if (!out)
    return PR_FALSE;

  const char* const end = aBuf + aLen;
  const char* runStart = aBuf;   // first byte of the run being scanned
  const char* cur = aBuf;
  PRBool insideTag = PR_FALSE;

  for (; cur < end; cur++)
  {
    const char ch = *cur;

    // Track '<' ... '>' first. The flag is updated before the delimiter
    // check below, so a run that ends exactly at '>' is evaluated with the
    // flag already cleared.
    switch (ch)
    {
      case '>':
        insideTag = PR_FALSE;
        break;
      case '<':
        insideTag = PR_TRUE;
        break;
      default:
        break;
    }

    // High-bit bytes and ASCII letters extend the current run.
    if (ch & 0x80)
      continue;
    if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
      continue;

    // A 7-bit non-letter closes the run.
    if (cur > runStart && !insideTag)
    {
      // Non-empty run outside of a tag: keep it, delimiter becomes a space.
      while (runStart < cur)
        *out++ = *runStart++;
      runStart++;
      *out++ = ' ';
    }
    else
    {
      runStart = cur + 1;
    }
  }

  // Keep whatever is left over, unless the input ended inside a tag.
  if (!insideTag)
  {
    while (runStart < cur)
      *out++ = *runStart++;
  }

  newLen = (PRUint32) (out - *newBuf);

  return PR_TRUE;
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
* Shy Shalom <shooshX@gmail.com>
|
|
24
|
+
*
|
|
25
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
26
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
27
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
28
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
29
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
30
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
31
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
32
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
33
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
34
|
+
* the provisions above, a recipient may use your version of this file under
|
|
35
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
36
|
+
*
|
|
37
|
+
* ***** END LICENSE BLOCK ***** */
|
|
38
|
+
#ifndef nsCharSetProber_h__
|
|
39
|
+
#define nsCharSetProber_h__
|
|
40
|
+
|
|
41
|
+
#include "nscore.h"
|
|
42
|
+
|
|
43
|
+
//#define DEBUG_chardet // Uncomment this for debug dump.
|
|
44
|
+
|
|
45
|
+
// Result reported by a prober after it has been fed data.
typedef enum {
  // Still undecided; the caller may nevertheless ask for a confidence value.
  eDetecting = 0,
  // Positive answer: the data is in this prober's charset.
  eFoundIt = 1,
  // Negative answer: the data is not in this prober's charset.
  eNotMe = 2
} nsProbingState;
|
|
50
|
+
|
|
51
|
+
#define SHORTCUT_THRESHOLD (float)0.95
|
|
52
|
+
|
|
53
|
+
class nsCharSetProber {
|
|
54
|
+
public:
|
|
55
|
+
virtual ~nsCharSetProber() {}
|
|
56
|
+
virtual const char* GetCharSetName() = 0;
|
|
57
|
+
virtual const char* GetLanguage() = 0;
|
|
58
|
+
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
|
59
|
+
virtual nsProbingState GetState(void) = 0;
|
|
60
|
+
virtual void Reset(void) = 0;
|
|
61
|
+
virtual float GetConfidence(void) = 0;
|
|
62
|
+
virtual void SetOpion() = 0;
|
|
63
|
+
|
|
64
|
+
#ifdef DEBUG_chardet
|
|
65
|
+
virtual void DumpStatus() {};
|
|
66
|
+
#endif
|
|
67
|
+
|
|
68
|
+
// Helper functions used in the Latin1 and Group probers.
|
|
69
|
+
// both functions Allocate a new buffer for newBuf. This buffer should be
|
|
70
|
+
// freed by the caller using PR_FREEIF.
|
|
71
|
+
// Both functions return PR_FALSE in case of memory allocation failure.
|
|
72
|
+
static PRBool FilterWithoutEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
|
|
73
|
+
static PRBool FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen, char** newBuf, PRUint32& newLen);
|
|
74
|
+
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
#endif /* nsCharSetProber_h__ */
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is mozilla.org code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 1998
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
*
|
|
24
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
25
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
26
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
27
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
28
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
29
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
30
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
31
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
32
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
33
|
+
* the provisions above, a recipient may use your version of this file under
|
|
34
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
35
|
+
*
|
|
36
|
+
* ***** END LICENSE BLOCK ***** */
|
|
37
|
+
#ifndef nsCodingStateMachine_h__
|
|
38
|
+
#define nsCodingStateMachine_h__
|
|
39
|
+
|
|
40
|
+
#include "nsPkgInt.h"
|
|
41
|
+
|
|
42
|
+
/* Apart from these 3 generic states, machine states are specific to
|
|
43
|
+
* each charset prober.
|
|
44
|
+
*/
|
|
45
|
+
// Generic state: the machine is positioned at the start of a character.
// nsCodingStateMachine starts in, and is Reset() to, this state.
#define eStart 0
|
|
46
|
+
// Generic state: NOTE(review): presumably signals a byte sequence that is
// illegal for the model's encoding -- confirm against the state tables.
#define eError 1
|
|
47
|
+
// Generic state: the input matched this model; nsEUCJPProber::HandleData
// switches to eFoundIt as soon as NextState() returns it.
#define eItsMe 2
|
|
48
|
+
|
|
49
|
+
// Looks up the class of byte `c` in the model's classTable via GETFROMPCK.
// `c` is converted to unsigned char first so that bytes >= 0x80 do not turn
// into negative indices. The expansion refers to `mModel`, so the macro can
// only be used where a `mModel` pointer to an SMModel is in scope.
#define GETCLASS(c) GETFROMPCK(((unsigned char)(c)), mModel->classTable)
|
|
50
|
+
|
|
51
|
+
//state machine model
|
|
52
|
+
typedef struct
|
|
53
|
+
{
|
|
54
|
+
nsPkgInt classTable;
|
|
55
|
+
PRUint32 classFactor;
|
|
56
|
+
nsPkgInt stateTable;
|
|
57
|
+
const PRUint32* charLenTable;
|
|
58
|
+
const char* name;
|
|
59
|
+
} SMModel;
|
|
60
|
+
|
|
61
|
+
class nsCodingStateMachine {
|
|
62
|
+
public:
|
|
63
|
+
nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
|
|
64
|
+
PRUint32 NextState(char c){
|
|
65
|
+
//for each byte we get its class , if it is first byte, we also get byte length
|
|
66
|
+
PRUint32 byteCls = GETCLASS(c);
|
|
67
|
+
if (mCurrentState == eStart)
|
|
68
|
+
{
|
|
69
|
+
mCurrentBytePos = 0;
|
|
70
|
+
mCurrentCharLen = mModel->charLenTable[byteCls];
|
|
71
|
+
}
|
|
72
|
+
//from byte's class and stateTable, we get its next state
|
|
73
|
+
mCurrentState = GETFROMPCK(mCurrentState * mModel->classFactor + byteCls,
|
|
74
|
+
mModel->stateTable);
|
|
75
|
+
mCurrentBytePos++;
|
|
76
|
+
return mCurrentState;
|
|
77
|
+
}
|
|
78
|
+
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
|
|
79
|
+
void Reset(void) {mCurrentState = eStart;}
|
|
80
|
+
const char * GetCodingStateMachine() {return mModel->name;}
|
|
81
|
+
|
|
82
|
+
protected:
|
|
83
|
+
PRUint32 mCurrentState;
|
|
84
|
+
PRUint32 mCurrentCharLen;
|
|
85
|
+
PRUint32 mCurrentBytePos;
|
|
86
|
+
|
|
87
|
+
const SMModel *mModel;
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
// State machine models defined elsewhere in the library.
// First group: UTF-8 and the East Asian multi-byte encodings.
extern const SMModel UTF8SMModel;
extern const SMModel Big5SMModel;
extern const SMModel EUCJPSMModel;
extern const SMModel EUCKRSMModel;
extern const SMModel EUCTWSMModel;
extern const SMModel GB18030SMModel;
extern const SMModel SJISSMModel;


// Second group. NOTE(review): presumably the escape-sequence based
// encodings (HZ, ISO-2022-*) -- confirm against their definitions.
extern const SMModel HZSMModel;
extern const SMModel ISO2022CNSMModel;
extern const SMModel ISO2022JPSMModel;
extern const SMModel ISO2022KRSMModel;
|
|
103
|
+
|
|
104
|
+
#endif /* nsCodingStateMachine_h__ */
|
|
105
|
+
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is mozilla.org code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 1998
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
*
|
|
24
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
25
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
26
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
27
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
28
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
29
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
30
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
31
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
32
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
33
|
+
* the provisions above, a recipient may use your version of this file under
|
|
34
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
35
|
+
*
|
|
36
|
+
* ***** END LICENSE BLOCK ***** */
|
|
37
|
+
|
|
38
|
+
// For Japanese encodings, observe these characteristics:
|
|
39
|
+
// 1, kana characters (or hankaku?) often have a high frequency of appearance
|
|
40
|
+
// 2, kana characters often appear in groups
|
|
41
|
+
// 3, certain combinations of kana are never used in the Japanese language
|
|
42
|
+
|
|
43
|
+
#include "nsEUCJPProber.h"
|
|
44
|
+
|
|
45
|
+
void nsEUCJPProber::Reset(void)
|
|
46
|
+
{
|
|
47
|
+
mCodingSM->Reset();
|
|
48
|
+
mState = eDetecting;
|
|
49
|
+
mContextAnalyser.Reset(mIsPreferredLanguage);
|
|
50
|
+
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// Feeds `aLen` bytes from `aBuf` through the coding state machine and, for
// every completed character, through the context and distribution analysers.
//
// Returns the prober state after the buffer: eFoundIt when the state machine
// reports eItsMe or when enough data has been seen and the confidence
// exceeds SHORTCUT_THRESHOLD; otherwise the state is left unchanged.
//
// An empty buffer (aLen == 0) is accepted and leaves mLastChar untouched;
// previously it read aBuf[aLen-1] with a wrapped-around unsigned index.
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  for (PRUint32 i = 0; i < aLen; i++)
  {
    const PRUint32 codingState = mCodingSM->NextState(aBuf[i]);
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
      break;
    }
    if (codingState == eStart)
    {
      // Back at a character boundary: one full character just ended at i.
      const PRUint32 charLen = mCodingSM->GetCurrentCharLen();

      if (i == 0)
      {
        // The character ends on the first byte of this buffer, so its
        // preceding byte is the one saved in mLastChar[0] by the last call.
        mLastChar[1] = aBuf[0];
        mContextAnalyser.HandleOneChar(mLastChar, charLen);
        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
      }
      else
      {
        // Hand over the two most recent bytes directly from the buffer.
        mContextAnalyser.HandleOneChar(aBuf+i-1, charLen);
        mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
      }
    }
  }

  // Remember the final byte for a character that straddles two buffers.
  // Guarded: with aLen == 0 the index aLen-1 would wrap around.
  if (aLen > 0)
    mLastChar[0] = aBuf[aLen-1];

  // Shortcut: stop probing early once the evidence is strong enough.
  if (mState == eDetecting)
    if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
      mState = eFoundIt;

  return mState;
}
|
|
91
|
+
|
|
92
|
+
float nsEUCJPProber::GetConfidence(void)
|
|
93
|
+
{
|
|
94
|
+
float contxtCf = mContextAnalyser.GetConfidence();
|
|
95
|
+
float distribCf = mDistributionAnalyser.GetConfidence();
|
|
96
|
+
|
|
97
|
+
return (contxtCf > distribCf ? contxtCf : distribCf);
|
|
98
|
+
}
|
|
99
|
+
|