cchardet 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.gitmodules +3 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +35 -0
- data/Rakefile +15 -0
- data/cchardet.gemspec +30 -0
- data/ext/cchardet/extconf.rb +26 -0
- data/ext/uchardet/.gitignore +1 -0
- data/ext/uchardet/.gitlab-ci.yml +106 -0
- data/ext/uchardet/AUTHORS +16 -0
- data/ext/uchardet/CMakeLists.txt +74 -0
- data/ext/uchardet/COPYING +1316 -0
- data/ext/uchardet/INSTALL +26 -0
- data/ext/uchardet/README.md +295 -0
- data/ext/uchardet/build-mac/uchardet.cpp +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
- data/ext/uchardet/doc/CMakeLists.txt +6 -0
- data/ext/uchardet/doc/README.maintainer +59 -0
- data/ext/uchardet/doc/uchardet.1 +18 -0
- data/ext/uchardet/script/BuildLangModel.py +533 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
- data/ext/uchardet/script/README +63 -0
- data/ext/uchardet/script/charsets/codepoints.py +53 -0
- data/ext/uchardet/script/charsets/db.py +73 -0
- data/ext/uchardet/script/charsets/ibm852.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
- data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
- data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
- data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
- data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
- data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
- data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
- data/ext/uchardet/script/charsets/tis-620.py +77 -0
- data/ext/uchardet/script/charsets/viscii.py +72 -0
- data/ext/uchardet/script/charsets/windows-1250.py +75 -0
- data/ext/uchardet/script/charsets/windows-1252.py +76 -0
- data/ext/uchardet/script/charsets/windows-1253.py +72 -0
- data/ext/uchardet/script/charsets/windows-1256.py +75 -0
- data/ext/uchardet/script/charsets/windows-1257.py +72 -0
- data/ext/uchardet/script/charsets/windows-1258.py +72 -0
- data/ext/uchardet/script/debug.sh +9 -0
- data/ext/uchardet/script/header-template.cpp +38 -0
- data/ext/uchardet/script/langs/ar.py +59 -0
- data/ext/uchardet/script/langs/cs.py +80 -0
- data/ext/uchardet/script/langs/da.py +69 -0
- data/ext/uchardet/script/langs/de.py +69 -0
- data/ext/uchardet/script/langs/el.py +55 -0
- data/ext/uchardet/script/langs/eo.py +67 -0
- data/ext/uchardet/script/langs/es.py +69 -0
- data/ext/uchardet/script/langs/et.py +57 -0
- data/ext/uchardet/script/langs/fi.py +60 -0
- data/ext/uchardet/script/langs/fr.py +79 -0
- data/ext/uchardet/script/langs/ga.py +60 -0
- data/ext/uchardet/script/langs/hr.py +59 -0
- data/ext/uchardet/script/langs/hu.py +66 -0
- data/ext/uchardet/script/langs/it.py +56 -0
- data/ext/uchardet/script/langs/lt.py +70 -0
- data/ext/uchardet/script/langs/lv.py +69 -0
- data/ext/uchardet/script/langs/mt.py +80 -0
- data/ext/uchardet/script/langs/pl.py +81 -0
- data/ext/uchardet/script/langs/pt.py +80 -0
- data/ext/uchardet/script/langs/ro.py +65 -0
- data/ext/uchardet/script/langs/sk.py +80 -0
- data/ext/uchardet/script/langs/sl.py +59 -0
- data/ext/uchardet/script/langs/sv.py +56 -0
- data/ext/uchardet/script/langs/th.py +55 -0
- data/ext/uchardet/script/langs/tr.py +67 -0
- data/ext/uchardet/script/langs/vi.py +64 -0
- data/ext/uchardet/script/release.sh +8 -0
- data/ext/uchardet/script/win32.sh +7 -0
- data/ext/uchardet/src/Big5Freq.tab +943 -0
- data/ext/uchardet/src/CMakeLists.txt +160 -0
- data/ext/uchardet/src/CharDistribution.cpp +109 -0
- data/ext/uchardet/src/CharDistribution.h +242 -0
- data/ext/uchardet/src/EUCKRFreq.tab +614 -0
- data/ext/uchardet/src/EUCTWFreq.tab +447 -0
- data/ext/uchardet/src/GB2312Freq.tab +491 -0
- data/ext/uchardet/src/JISFreq.tab +589 -0
- data/ext/uchardet/src/JpCntx.cpp +230 -0
- data/ext/uchardet/src/JpCntx.h +140 -0
- data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
- data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
- data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
- data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
- data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
- data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
- data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
- data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
- data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
- data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
- data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
- data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
- data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
- data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
- data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
- data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
- data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
- data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
- data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
- data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
- data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
- data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
- data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
- data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
- data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
- data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
- data/ext/uchardet/src/nsBig5Prober.h +75 -0
- data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
- data/ext/uchardet/src/nsCharSetProber.h +77 -0
- data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
- data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
- data/ext/uchardet/src/nsEUCJPProber.h +79 -0
- data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCKRProber.h +81 -0
- data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCTWProber.h +75 -0
- data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
- data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
- data/ext/uchardet/src/nsEscSM.cpp +267 -0
- data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
- data/ext/uchardet/src/nsGB2312Prober.h +77 -0
- data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
- data/ext/uchardet/src/nsHebrewProber.h +177 -0
- data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
- data/ext/uchardet/src/nsLatin1Prober.h +73 -0
- data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
- data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
- data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
- data/ext/uchardet/src/nsPkgInt.h +89 -0
- data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
- data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
- data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
- data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
- data/ext/uchardet/src/nsSJISProber.cpp +98 -0
- data/ext/uchardet/src/nsSJISProber.h +81 -0
- data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
- data/ext/uchardet/src/nsUTF8Prober.h +66 -0
- data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
- data/ext/uchardet/src/nsUniversalDetector.h +91 -0
- data/ext/uchardet/src/nscore.h +59 -0
- data/ext/uchardet/src/prmem.h +49 -0
- data/ext/uchardet/src/symbols.cmake +41 -0
- data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
- data/ext/uchardet/src/tools/uchardet.cpp +254 -0
- data/ext/uchardet/src/uchardet.cpp +274 -0
- data/ext/uchardet/src/uchardet.h +136 -0
- data/ext/uchardet/test/CMakeLists.txt +47 -0
- data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
- data/ext/uchardet/test/ar/utf-8.txt +3 -0
- data/ext/uchardet/test/ar/windows-1256.txt +3 -0
- data/ext/uchardet/test/bg/windows-1251.txt +3 -0
- data/ext/uchardet/test/cs/ibm852.txt +4 -0
- data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/cs/utf-8.txt +4 -0
- data/ext/uchardet/test/cs/windows-1250.txt +4 -0
- data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
- data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
- data/ext/uchardet/test/da/utf-8.txt +10 -0
- data/ext/uchardet/test/da/windows-1252.txt +10 -0
- data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
- data/ext/uchardet/test/de/windows-1252.txt +11 -0
- data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
- data/ext/uchardet/test/el/utf-8.txt +3 -0
- data/ext/uchardet/test/el/windows-1253.txt +5 -0
- data/ext/uchardet/test/en/ascii.txt +4 -0
- data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
- data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
- data/ext/uchardet/test/es/utf-8.txt +5 -0
- data/ext/uchardet/test/es/windows-1252.txt +5 -0
- data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/et/utf-8.txt +6 -0
- data/ext/uchardet/test/et/windows-1252.txt +6 -0
- data/ext/uchardet/test/et/windows-1257.txt +6 -0
- data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
- data/ext/uchardet/test/fi/utf-8.txt +8 -0
- data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
- data/ext/uchardet/test/fr/utf-16.be +0 -0
- data/ext/uchardet/test/fr/utf-32.le +0 -0
- data/ext/uchardet/test/fr/utf-8.txt +14 -0
- data/ext/uchardet/test/fr/windows-1252.txt +3 -0
- data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/ga/utf-8.txt +6 -0
- data/ext/uchardet/test/ga/windows-1252.txt +6 -0
- data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
- data/ext/uchardet/test/he/utf-8.txt +3 -0
- data/ext/uchardet/test/he/windows-1255.txt +1 -0
- data/ext/uchardet/test/hr/ibm852.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/hr/utf-8.txt +4 -0
- data/ext/uchardet/test/hr/windows-1250.txt +4 -0
- data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/hu/windows-1250.txt +1 -0
- data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
- data/ext/uchardet/test/it/utf-8.txt +18 -0
- data/ext/uchardet/test/ja/euc-jp.txt +10 -0
- data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
- data/ext/uchardet/test/ja/shift_jis.txt +1 -0
- data/ext/uchardet/test/ja/utf-16be.txt +0 -0
- data/ext/uchardet/test/ja/utf-16le.txt +0 -0
- data/ext/uchardet/test/ja/utf-8.txt +9 -0
- data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
- data/ext/uchardet/test/ko/uhc.smi +16 -0
- data/ext/uchardet/test/ko/utf-16.le +0 -0
- data/ext/uchardet/test/ko/utf-32.be +0 -0
- data/ext/uchardet/test/ko/utf-8.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
- data/ext/uchardet/test/lt/utf-8.txt +3 -0
- data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/lv/utf-8.txt +6 -0
- data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
- data/ext/uchardet/test/mt/utf-8.txt +4 -0
- data/ext/uchardet/test/pl/ibm852.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/pl/utf-8.txt +3 -0
- data/ext/uchardet/test/pl/windows-1250.txt +3 -0
- data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/pt/utf-8.txt +6 -0
- data/ext/uchardet/test/ro/ibm852.txt +9 -0
- data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/ro/utf-8.txt +9 -0
- data/ext/uchardet/test/ro/windows-1250.txt +9 -0
- data/ext/uchardet/test/ru/ibm855.txt +5 -0
- data/ext/uchardet/test/ru/ibm866.txt +11 -0
- data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
- data/ext/uchardet/test/ru/koi8-r.txt +1 -0
- data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
- data/ext/uchardet/test/ru/windows-1251.txt +4 -0
- data/ext/uchardet/test/sk/ibm852.txt +3 -0
- data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/sk/utf-8.txt +3 -0
- data/ext/uchardet/test/sk/windows-1250.txt +3 -0
- data/ext/uchardet/test/sl/ibm852.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
- data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
- data/ext/uchardet/test/sl/utf-8.txt +9 -0
- data/ext/uchardet/test/sl/windows-1250.txt +9 -0
- data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
- data/ext/uchardet/test/sv/utf-8.txt +10 -0
- data/ext/uchardet/test/sv/windows-1252.txt +10 -0
- data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
- data/ext/uchardet/test/th/tis-620.txt +5 -0
- data/ext/uchardet/test/th/utf-8.txt +1 -0
- data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
- data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
- data/ext/uchardet/test/uchardet-tests.c +130 -0
- data/ext/uchardet/test/vi/utf-8.txt +4 -0
- data/ext/uchardet/test/vi/viscii.txt +4 -0
- data/ext/uchardet/test/vi/windows-1258.txt +4 -0
- data/ext/uchardet/test/zh/big5.txt +1 -0
- data/ext/uchardet/test/zh/euc-tw.txt +1 -0
- data/ext/uchardet/test/zh/gb18030.txt +1 -0
- data/ext/uchardet/test/zh/utf-8.txt +1 -0
- data/ext/uchardet/uchardet.doap +51 -0
- data/ext/uchardet/uchardet.pc.in +10 -0
- data/lib/cchardet.rb +56 -0
- data/lib/cchardet/lib_finder.rb +32 -0
- data/lib/cchardet/version.rb +5 -0
- metadata +362 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
* Shy Shalom <shooshX@gmail.com>
|
|
24
|
+
*
|
|
25
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
26
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
27
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
28
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
29
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
30
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
31
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
32
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
33
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
34
|
+
* the provisions above, a recipient may use your version of this file under
|
|
35
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
36
|
+
*
|
|
37
|
+
* ***** END LICENSE BLOCK ***** */
|
|
38
|
+
#include <stdio.h>
|
|
39
|
+
#include "nsSBCharSetProber.h"
|
|
40
|
+
|
|
41
|
+
/* Runs aLen bytes of aBuf through the sequence model and updates the
 * character and two-character-sequence statistics of this prober.
 *
 * Each byte is mapped to its "order" through mModel->charToOrderMap:
 *   - ILL rules the charset out: the state becomes eNotMe and the scan stops;
 *   - orders below SYMBOL_CAT_ORDER are counted in mTotalChar;
 *   - CTR is counted in mCtrlChar;
 *   - orders below mModel->freqCharCount are counted in mFreqChar and, when
 *     the previous order was also below freqCharCount, the pair is classified
 *     through mModel->precedenceMatrix.
 *
 * When the prober is still detecting and more than SB_ENOUGH_REL_THRESHOLD
 * sequences have been seen, the confidence is used to shortcut the state to
 * eFoundIt or eNotMe.
 *
 * Returns the prober state after the buffer has been processed. */
nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  const int freqCount = mModel->freqCharCount;

  for (PRUint32 pos = 0; pos != aLen; ++pos)
  {
    const unsigned char order = mModel->charToOrderMap[(unsigned char)aBuf[pos]];

    if (order == ILL)
    {
      /* An illegal codepoint settles the question: stop analyzing. */
      mState = eNotMe;
      break;
    }

    if (order < SYMBOL_CAT_ORDER)
      ++mTotalChar;
    else if (order == CTR)
      ++mCtrlChar;

    if (order < freqCount)
    {
      ++mFreqChar;

      if (mLastOrder < freqCount)
      {
        /* Previous and current characters are both in the sampling range:
         * classify the pair. A reversed prober swaps the two letters in
         * the lookup. */
        const int lead  = mReversed ? order : mLastOrder;
        const int trail = mReversed ? mLastOrder : order;

        ++mTotalSeqs;
        ++mSeqCounters[mModel->precedenceMatrix[lead * freqCount + trail]];
      }
    }

    mLastOrder = order;
  }

  if (mState == eDetecting && mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
  {
    const float confidence = GetConfidence();

    if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
      mState = eFoundIt;
    else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
      mState = eNotMe;
  }

  return mState;
}
|
|
92
|
+
|
|
93
|
+
void nsSingleByteCharSetProber::Reset(void)
|
|
94
|
+
{
|
|
95
|
+
mState = eDetecting;
|
|
96
|
+
mLastOrder = 255;
|
|
97
|
+
for (PRUint32 i = 0; i < NUMBER_OF_SEQ_CAT; i++)
|
|
98
|
+
mSeqCounters[i] = 0;
|
|
99
|
+
mTotalSeqs = 0;
|
|
100
|
+
mTotalChar = 0;
|
|
101
|
+
mCtrlChar = 0;
|
|
102
|
+
mFreqChar = 0;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
//#define NEGATIVE_APPROACH 1
|
|
106
|
+
|
|
107
|
+
float nsSingleByteCharSetProber::GetConfidence(void)
|
|
108
|
+
{
|
|
109
|
+
#ifdef NEGATIVE_APPROACH
|
|
110
|
+
if (mTotalSeqs > 0)
|
|
111
|
+
if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
|
|
112
|
+
return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
|
|
113
|
+
return (float)0.01;
|
|
114
|
+
#else //POSITIVE_APPROACH
|
|
115
|
+
float r;
|
|
116
|
+
|
|
117
|
+
if (mTotalSeqs > 0) {
|
|
118
|
+
r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
|
|
119
|
+
/* Multiply by a ratio of positive sequences per characters.
|
|
120
|
+
* This would help in particular to distinguish close winners.
|
|
121
|
+
* Indeed if you add a letter, you'd expect the positive sequence count
|
|
122
|
+
* to increase as well. If it doesn't, it may mean that this new codepoint
|
|
123
|
+
* may not have been a letter, but instead a symbol (or some other
|
|
124
|
+
* character). This could make the difference between very closely related
|
|
125
|
+
* charsets used for the same language.
|
|
126
|
+
*/
|
|
127
|
+
r = r * (mSeqCounters[POSITIVE_CAT] + (float) mSeqCounters[PROBABLE_CAT] / 4) / mTotalChar;
|
|
128
|
+
/* The more control characters (proportionnaly to the size of the text), the
|
|
129
|
+
* less confident we become in the current charset.
|
|
130
|
+
*/
|
|
131
|
+
r = r * (mTotalChar - mCtrlChar) / mTotalChar;
|
|
132
|
+
r = r*mFreqChar/mTotalChar;
|
|
133
|
+
if (r >= (float)1.00)
|
|
134
|
+
r = (float)0.99;
|
|
135
|
+
return r;
|
|
136
|
+
}
|
|
137
|
+
return (float)0.01;
|
|
138
|
+
#endif
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
const char* nsSingleByteCharSetProber::GetCharSetName()
|
|
142
|
+
{
|
|
143
|
+
if (!mNameProber)
|
|
144
|
+
return mModel->charsetName;
|
|
145
|
+
return mNameProber->GetCharSetName();
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
const char* nsSingleByteCharSetProber::GetLanguage()
|
|
149
|
+
{
|
|
150
|
+
if (!mNameProber)
|
|
151
|
+
return mModel->langName;
|
|
152
|
+
return mNameProber->GetLanguage();
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
#ifdef DEBUG_chardet
/* Debug builds only: prints the current confidence and charset name of this
 * prober on standard output. */
void nsSingleByteCharSetProber::DumpStatus()
{
  const float confidence = GetConfidence();
  const char* const charsetName = GetCharSetName();

  printf(" SBCS: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
* Shy Shalom <shooshX@gmail.com>
|
|
24
|
+
*
|
|
25
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
26
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
27
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
28
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
29
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
30
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
31
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
32
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
33
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
34
|
+
* the provisions above, a recipient may use your version of this file under
|
|
35
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
36
|
+
*
|
|
37
|
+
* ***** END LICENSE BLOCK ***** */
|
|
38
|
+
#ifndef nsSingleByteCharSetProber_h__
|
|
39
|
+
#define nsSingleByteCharSetProber_h__
|
|
40
|
+
|
|
41
|
+
#include "nsCharSetProber.h"
|
|
42
|
+
|
|
43
|
+
/** Codepoint categories stored in a model's charToOrderMap. **/

/* Illegal codepoint: HandleData gives up on the charset when it meets one. */
#define ILL 255
/* Control character: counted separately by HandleData. */
#define CTR 254
/* Symbols and punctuation that do not belong to words. */
#define SYM 253
/* Carriage returns / line feeds. */
#define RET 252
/* Digits 0-9. */
#define NUM 251

/* Number of sequences HandleData must have seen before it tries to shortcut
 * the detection with the thresholds below. */
#define SB_ENOUGH_REL_THRESHOLD  1024
/* Confidence above which HandleData switches to eFoundIt. */
#define POSITIVE_SHORTCUT_THRESHOLD  (float)0.95
/* Confidence below which HandleData switches to eNotMe. */
#define NEGATIVE_SHORTCUT_THRESHOLD  (float)0.05
/* Orders below this value are counted as characters by HandleData. */
#define SYMBOL_CAT_ORDER  250

/* Sequence categories: the values found in a model's precedenceMatrix,
 * used as indexes into the prober's per-category sequence counters. */
#define NUMBER_OF_SEQ_CAT 4
#define POSITIVE_CAT   (NUMBER_OF_SEQ_CAT-1)
#define PROBABLE_CAT   (NUMBER_OF_SEQ_CAT-2)
#define NEUTRAL_CAT    (NUMBER_OF_SEQ_CAT-3)
#define NEGATIVE_CAT   0
|
|
66
|
+
|
|
67
|
+
typedef struct
|
|
68
|
+
{
|
|
69
|
+
/* [256] table mapping codepoints to chararacter orders. */
|
|
70
|
+
const unsigned char* const charToOrderMap;
|
|
71
|
+
/* freqCharCount x freqCharCount table of 2-char sequence's frequencies. */
|
|
72
|
+
const PRUint8* const precedenceMatrix;
|
|
73
|
+
/* The count of frequent characters. */
|
|
74
|
+
int freqCharCount;
|
|
75
|
+
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
|
76
|
+
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
|
77
|
+
const char* const charsetName;
|
|
78
|
+
const char* const langName;
|
|
79
|
+
} SequenceModel;
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class nsSingleByteCharSetProber : public nsCharSetProber{
|
|
83
|
+
public:
|
|
84
|
+
nsSingleByteCharSetProber(const SequenceModel *model)
|
|
85
|
+
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
|
|
86
|
+
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
|
87
|
+
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
|
88
|
+
|
|
89
|
+
virtual const char* GetCharSetName();
|
|
90
|
+
virtual const char* GetLanguage();
|
|
91
|
+
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
|
92
|
+
virtual nsProbingState GetState(void) {return mState;}
|
|
93
|
+
virtual void Reset(void);
|
|
94
|
+
virtual float GetConfidence(void);
|
|
95
|
+
virtual void SetOpion() {}
|
|
96
|
+
|
|
97
|
+
// This feature is not implemented yet. any current language model
|
|
98
|
+
// contain this parameter as PR_FALSE. No one is looking at this
|
|
99
|
+
// parameter or calling this method.
|
|
100
|
+
// Moreover, the nsSBCSGroupProber which calls the HandleData of this
|
|
101
|
+
// prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
|
|
102
|
+
// of the English letters.
|
|
103
|
+
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
|
|
104
|
+
|
|
105
|
+
#ifdef DEBUG_chardet
|
|
106
|
+
virtual void DumpStatus();
|
|
107
|
+
#endif
|
|
108
|
+
|
|
109
|
+
protected:
|
|
110
|
+
nsProbingState mState;
|
|
111
|
+
const SequenceModel* const mModel;
|
|
112
|
+
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
|
|
113
|
+
|
|
114
|
+
//char order of last character
|
|
115
|
+
unsigned char mLastOrder;
|
|
116
|
+
|
|
117
|
+
PRUint32 mTotalSeqs;
|
|
118
|
+
PRUint32 mSeqCounters[NUMBER_OF_SEQ_CAT];
|
|
119
|
+
|
|
120
|
+
PRUint32 mTotalChar;
|
|
121
|
+
PRUint32 mCtrlChar;
|
|
122
|
+
//characters that fall in our sampling range
|
|
123
|
+
PRUint32 mFreqChar;
|
|
124
|
+
|
|
125
|
+
// Optional auxiliary prober for name decision. created and destroyed by the GroupProber
|
|
126
|
+
nsCharSetProber* mNameProber;
|
|
127
|
+
|
|
128
|
+
};
|
|
129
|
+
|
|
130
|
+
// Sequence models defined in other translation units. The groups below follow
// the language suffix of each identifier.

// Arabic
extern const SequenceModel Windows_1256ArabicModel;
extern const SequenceModel Iso_8859_6ArabicModel;

// Russian
extern const SequenceModel Koi8rRussianModel;
extern const SequenceModel Win1251RussianModel;
extern const SequenceModel Latin5RussianModel;
extern const SequenceModel MacCyrillicRussianModel;
extern const SequenceModel Ibm866RussianModel;
extern const SequenceModel Ibm855RussianModel;

// Greek
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;

// Bulgarian
extern const SequenceModel Latin5BulgarianModel;
extern const SequenceModel Win1251BulgarianModel;

// Hungarian
extern const SequenceModel Iso_8859_2HungarianModel;
extern const SequenceModel Windows_1250HungarianModel;

// Win1255 (the identifier names no language)
extern const SequenceModel Win1255Model;

// Thai
extern const SequenceModel Tis_620ThaiModel;
extern const SequenceModel Iso_8859_11ThaiModel;

// French
extern const SequenceModel Iso_8859_15FrenchModel;
extern const SequenceModel Iso_8859_1FrenchModel;
extern const SequenceModel Windows_1252FrenchModel;

// Spanish
extern const SequenceModel Iso_8859_15SpanishModel;
extern const SequenceModel Iso_8859_1SpanishModel;
extern const SequenceModel Windows_1252SpanishModel;

// German
extern const SequenceModel Iso_8859_1GermanModel;
extern const SequenceModel Windows_1252GermanModel;

// Esperanto
extern const SequenceModel Iso_8859_3EsperantoModel;

// Turkish
extern const SequenceModel Iso_8859_3TurkishModel;
extern const SequenceModel Iso_8859_9TurkishModel;

// Vietnamese
extern const SequenceModel VisciiVietnameseModel;
extern const SequenceModel Windows_1258VietnameseModel;

// Danish
extern const SequenceModel Iso_8859_15DanishModel;
extern const SequenceModel Iso_8859_1DanishModel;
extern const SequenceModel Windows_1252DanishModel;

// Lithuanian
extern const SequenceModel Iso_8859_13LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
extern const SequenceModel Iso_8859_4LithuanianModel;

// Latvian
extern const SequenceModel Iso_8859_13LatvianModel;
extern const SequenceModel Iso_8859_10LatvianModel;
extern const SequenceModel Iso_8859_4LatvianModel;

// Portuguese
extern const SequenceModel Iso_8859_1PortugueseModel;
extern const SequenceModel Iso_8859_9PortugueseModel;
extern const SequenceModel Iso_8859_15PortugueseModel;
extern const SequenceModel Windows_1252PortugueseModel;

// Maltese
extern const SequenceModel Iso_8859_3MalteseModel;

// Czech
extern const SequenceModel Windows_1250CzechModel;
extern const SequenceModel Iso_8859_2CzechModel;
extern const SequenceModel Ibm852CzechModel;
extern const SequenceModel Mac_CentraleuropeCzechModel;

// Slovak
extern const SequenceModel Windows_1250SlovakModel;
extern const SequenceModel Iso_8859_2SlovakModel;
extern const SequenceModel Ibm852SlovakModel;
extern const SequenceModel Mac_CentraleuropeSlovakModel;

// Polish
extern const SequenceModel Windows_1250PolishModel;
extern const SequenceModel Iso_8859_2PolishModel;
extern const SequenceModel Iso_8859_13PolishModel;
extern const SequenceModel Iso_8859_16PolishModel;
extern const SequenceModel Ibm852PolishModel;
extern const SequenceModel Mac_CentraleuropePolishModel;

// Finnish
extern const SequenceModel Iso_8859_1FinnishModel;
extern const SequenceModel Iso_8859_4FinnishModel;
extern const SequenceModel Iso_8859_9FinnishModel;
extern const SequenceModel Iso_8859_13FinnishModel;
extern const SequenceModel Iso_8859_15FinnishModel;
extern const SequenceModel Windows_1252FinnishModel;

// Italian
extern const SequenceModel Iso_8859_1ItalianModel;
extern const SequenceModel Iso_8859_3ItalianModel;
extern const SequenceModel Iso_8859_9ItalianModel;
extern const SequenceModel Iso_8859_15ItalianModel;
extern const SequenceModel Windows_1252ItalianModel;

// Croatian
extern const SequenceModel Windows_1250CroatianModel;
extern const SequenceModel Iso_8859_2CroatianModel;
extern const SequenceModel Iso_8859_13CroatianModel;
extern const SequenceModel Iso_8859_16CroatianModel;
extern const SequenceModel Ibm852CroatianModel;
extern const SequenceModel Mac_CentraleuropeCroatianModel;

// Estonian
extern const SequenceModel Windows_1252EstonianModel;
extern const SequenceModel Windows_1257EstonianModel;
extern const SequenceModel Iso_8859_4EstonianModel;
extern const SequenceModel Iso_8859_13EstonianModel;
extern const SequenceModel Iso_8859_15EstonianModel;

// Irish
extern const SequenceModel Iso_8859_15IrishModel;
extern const SequenceModel Iso_8859_9IrishModel;
extern const SequenceModel Iso_8859_1IrishModel;
extern const SequenceModel Windows_1252IrishModel;

// Romanian
extern const SequenceModel Windows_1250RomanianModel;
extern const SequenceModel Iso_8859_2RomanianModel;
extern const SequenceModel Iso_8859_16RomanianModel;
extern const SequenceModel Ibm852RomanianModel;

// Slovene
extern const SequenceModel Windows_1250SloveneModel;
extern const SequenceModel Iso_8859_2SloveneModel;
extern const SequenceModel Iso_8859_16SloveneModel;
extern const SequenceModel Ibm852SloveneModel;
extern const SequenceModel Mac_CentraleuropeSloveneModel;

// Swedish
extern const SequenceModel Iso_8859_1SwedishModel;
extern const SequenceModel Iso_8859_4SwedishModel;
extern const SequenceModel Iso_8859_9SwedishModel;
extern const SequenceModel Iso_8859_15SwedishModel;
extern const SequenceModel Windows_1252SwedishModel;
|
|
256
|
+
|
|
257
|
+
#endif /* nsSingleByteCharSetProber_h__ */
|
|
258
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
2
|
+
/* ***** BEGIN LICENSE BLOCK *****
|
|
3
|
+
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
4
|
+
*
|
|
5
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
|
6
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
7
|
+
* the License. You may obtain a copy of the License at
|
|
8
|
+
* http://www.mozilla.org/MPL/
|
|
9
|
+
*
|
|
10
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
11
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
12
|
+
* for the specific language governing rights and limitations under the
|
|
13
|
+
* License.
|
|
14
|
+
*
|
|
15
|
+
* The Original Code is mozilla.org code.
|
|
16
|
+
*
|
|
17
|
+
* The Initial Developer of the Original Code is
|
|
18
|
+
* Netscape Communications Corporation.
|
|
19
|
+
* Portions created by the Initial Developer are Copyright (C) 1998
|
|
20
|
+
* the Initial Developer. All Rights Reserved.
|
|
21
|
+
*
|
|
22
|
+
* Contributor(s):
|
|
23
|
+
*
|
|
24
|
+
* Alternatively, the contents of this file may be used under the terms of
|
|
25
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
26
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
27
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
28
|
+
* of those above. If you wish to allow use of your version of this file only
|
|
29
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
30
|
+
* use your version of this file under the terms of the MPL, indicate your
|
|
31
|
+
* decision by deleting the provisions above and replace them with the notice
|
|
32
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
33
|
+
* the provisions above, a recipient may use your version of this file under
|
|
34
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
35
|
+
*
|
|
36
|
+
* ***** END LICENSE BLOCK ***** */
|
|
37
|
+
|
|
38
|
+
// for S-JIS encoding, observed characteristics:
|
|
39
|
+
// 1, kana characters (or hankaku?) often have a high frequency of appearance
|
|
40
|
+
// 2, kana characters often exist in groups
|
|
41
|
+
// 3, certain combinations of kana are never used in the Japanese language
|
|
42
|
+
|
|
43
|
+
#include "nsSJISProber.h"
|
|
44
|
+
|
|
45
|
+
void nsSJISProber::Reset(void)
|
|
46
|
+
{
|
|
47
|
+
mCodingSM->Reset();
|
|
48
|
+
mState = eDetecting;
|
|
49
|
+
mContextAnalyser.Reset(mIsPreferredLanguage);
|
|
50
|
+
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// Runs aLen bytes of aBuf through the coding state machine. Each time the
// machine reports eStart, the character that just ended is handed to the
// context and distribution analysers.
//
// aBuf: bytes to examine; it is not dereferenced when aLen is 0.
// aLen: number of bytes in aBuf; may be 0.
//
// Returns mState, which becomes eFoundIt when the state machine reports
// eItsMe, or when the context analyser has enough data and the confidence
// exceeds SHORTCUT_THRESHOLD.
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  PRUint32 codingState;

  for (PRUint32 i = 0; i < aLen; i++)
  {
    codingState = mCodingSM->NextState(aBuf[i]);
    if (codingState == eItsMe)
    {
      mState = eFoundIt;
      break;
    }
    if (codingState == eStart)
    {
      PRUint32 charLen = mCodingSM->GetCurrentCharLen();
      if (i == 0)
      {
        // The character ending on the first byte may have started in the
        // previous buffer, so pair this byte with the one saved in
        // mLastChar[0] by the previous call.
        mLastChar[1] = aBuf[0];
        mContextAnalyser.HandleOneChar(mLastChar+2-charLen, charLen);
        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
      }
      else
      {
        mContextAnalyser.HandleOneChar(aBuf+i+1-charLen, charLen);
        mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
      }
    }
  }

  // Remember the final byte for the next call. With an empty buffer the
  // unsigned index aLen-1 would wrap around and read far outside aBuf, so
  // the previously saved byte is kept instead.
  if (aLen > 0)
    mLastChar[0] = aBuf[aLen-1];

  if (mState == eDetecting)
    if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
      mState = eFoundIt;

  return mState;
}
|
|
90
|
+
|
|
91
|
+
float nsSJISProber::GetConfidence(void)
|
|
92
|
+
{
|
|
93
|
+
float contxtCf = mContextAnalyser.GetConfidence();
|
|
94
|
+
float distribCf = mDistributionAnalyser.GetConfidence();
|
|
95
|
+
|
|
96
|
+
return (contxtCf > distribCf ? contxtCf : distribCf);
|
|
97
|
+
}
|
|
98
|
+
|