cchardet 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.gitmodules +3 -0
- data/.rubocop.yml +11 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +10 -0
- data/README.md +35 -0
- data/Rakefile +15 -0
- data/cchardet.gemspec +30 -0
- data/ext/cchardet/extconf.rb +26 -0
- data/ext/uchardet/.gitignore +1 -0
- data/ext/uchardet/.gitlab-ci.yml +106 -0
- data/ext/uchardet/AUTHORS +16 -0
- data/ext/uchardet/CMakeLists.txt +74 -0
- data/ext/uchardet/COPYING +1316 -0
- data/ext/uchardet/INSTALL +26 -0
- data/ext/uchardet/README.md +295 -0
- data/ext/uchardet/build-mac/uchardet.cpp +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.pbxproj +543 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/project.xcworkspace/xcshareddata/uchardet.xccheckout +41 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet-ios.xcscheme +77 -0
- data/ext/uchardet/build-mac/uchardet.xcodeproj/xcshareddata/xcschemes/uchardet.xcscheme +77 -0
- data/ext/uchardet/doc/CMakeLists.txt +6 -0
- data/ext/uchardet/doc/README.maintainer +59 -0
- data/ext/uchardet/doc/uchardet.1 +18 -0
- data/ext/uchardet/script/BuildLangModel.py +533 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangArabicModel.log +142 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCroatianModel.log +157 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangCzechModel.log +161 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangDanishModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEsperantoModel.log +110 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangEstonianModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFinnishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangFrenchModel.log +116 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGermanModel.log +159 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangGreekModel.log +272 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangHungarianModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangIrishModel.log +156 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangItalianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLatvianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangLithuanianModel.log +162 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangMalteseModel.log +147 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPolishModel.log +154 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangPortugueseModel.log +166 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangRomanianModel.log +153 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSlovakModel.log +158 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSloveneModel.log +148 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSpanishModel.log +109 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangSwedishModel.log +151 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangThaiModel.log +141 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangTurkishModel.log +113 -0
- data/ext/uchardet/script/BuildLangModelLogs/LangVietnameseModel.log +121 -0
- data/ext/uchardet/script/README +63 -0
- data/ext/uchardet/script/charsets/codepoints.py +53 -0
- data/ext/uchardet/script/charsets/db.py +73 -0
- data/ext/uchardet/script/charsets/ibm852.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-1.py +85 -0
- data/ext/uchardet/script/charsets/iso-8859-10.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-11.py +77 -0
- data/ext/uchardet/script/charsets/iso-8859-13.py +72 -0
- data/ext/uchardet/script/charsets/iso-8859-15.py +80 -0
- data/ext/uchardet/script/charsets/iso-8859-16.py +83 -0
- data/ext/uchardet/script/charsets/iso-8859-2.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-3.py +75 -0
- data/ext/uchardet/script/charsets/iso-8859-4.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-6.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-7.py +73 -0
- data/ext/uchardet/script/charsets/iso-8859-9.py +76 -0
- data/ext/uchardet/script/charsets/mac-centraleurope.py +72 -0
- data/ext/uchardet/script/charsets/tis-620.py +77 -0
- data/ext/uchardet/script/charsets/viscii.py +72 -0
- data/ext/uchardet/script/charsets/windows-1250.py +75 -0
- data/ext/uchardet/script/charsets/windows-1252.py +76 -0
- data/ext/uchardet/script/charsets/windows-1253.py +72 -0
- data/ext/uchardet/script/charsets/windows-1256.py +75 -0
- data/ext/uchardet/script/charsets/windows-1257.py +72 -0
- data/ext/uchardet/script/charsets/windows-1258.py +72 -0
- data/ext/uchardet/script/debug.sh +9 -0
- data/ext/uchardet/script/header-template.cpp +38 -0
- data/ext/uchardet/script/langs/ar.py +59 -0
- data/ext/uchardet/script/langs/cs.py +80 -0
- data/ext/uchardet/script/langs/da.py +69 -0
- data/ext/uchardet/script/langs/de.py +69 -0
- data/ext/uchardet/script/langs/el.py +55 -0
- data/ext/uchardet/script/langs/eo.py +67 -0
- data/ext/uchardet/script/langs/es.py +69 -0
- data/ext/uchardet/script/langs/et.py +57 -0
- data/ext/uchardet/script/langs/fi.py +60 -0
- data/ext/uchardet/script/langs/fr.py +79 -0
- data/ext/uchardet/script/langs/ga.py +60 -0
- data/ext/uchardet/script/langs/hr.py +59 -0
- data/ext/uchardet/script/langs/hu.py +66 -0
- data/ext/uchardet/script/langs/it.py +56 -0
- data/ext/uchardet/script/langs/lt.py +70 -0
- data/ext/uchardet/script/langs/lv.py +69 -0
- data/ext/uchardet/script/langs/mt.py +80 -0
- data/ext/uchardet/script/langs/pl.py +81 -0
- data/ext/uchardet/script/langs/pt.py +80 -0
- data/ext/uchardet/script/langs/ro.py +65 -0
- data/ext/uchardet/script/langs/sk.py +80 -0
- data/ext/uchardet/script/langs/sl.py +59 -0
- data/ext/uchardet/script/langs/sv.py +56 -0
- data/ext/uchardet/script/langs/th.py +55 -0
- data/ext/uchardet/script/langs/tr.py +67 -0
- data/ext/uchardet/script/langs/vi.py +64 -0
- data/ext/uchardet/script/release.sh +8 -0
- data/ext/uchardet/script/win32.sh +7 -0
- data/ext/uchardet/src/Big5Freq.tab +943 -0
- data/ext/uchardet/src/CMakeLists.txt +160 -0
- data/ext/uchardet/src/CharDistribution.cpp +109 -0
- data/ext/uchardet/src/CharDistribution.h +242 -0
- data/ext/uchardet/src/EUCKRFreq.tab +614 -0
- data/ext/uchardet/src/EUCTWFreq.tab +447 -0
- data/ext/uchardet/src/GB2312Freq.tab +491 -0
- data/ext/uchardet/src/JISFreq.tab +589 -0
- data/ext/uchardet/src/JpCntx.cpp +230 -0
- data/ext/uchardet/src/JpCntx.h +140 -0
- data/ext/uchardet/src/LangModels/LangArabicModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangBulgarianModel.cpp +249 -0
- data/ext/uchardet/src/LangModels/LangCroatianModel.cpp +298 -0
- data/ext/uchardet/src/LangModels/LangCzechModel.cpp +285 -0
- data/ext/uchardet/src/LangModels/LangDanishModel.cpp +201 -0
- data/ext/uchardet/src/LangModels/LangEsperantoModel.cpp +142 -0
- data/ext/uchardet/src/LangModels/LangEstonianModel.cpp +268 -0
- data/ext/uchardet/src/LangModels/LangFinnishModel.cpp +297 -0
- data/ext/uchardet/src/LangModels/LangFrenchModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangGermanModel.cpp +170 -0
- data/ext/uchardet/src/LangModels/LangGreekModel.cpp +231 -0
- data/ext/uchardet/src/LangModels/LangHebrewModel.cpp +220 -0
- data/ext/uchardet/src/LangModels/LangHungarianModel.cpp +171 -0
- data/ext/uchardet/src/LangModels/LangIrishModel.cpp +234 -0
- data/ext/uchardet/src/LangModels/LangItalianModel.cpp +269 -0
- data/ext/uchardet/src/LangModels/LangLatvianModel.cpp +210 -0
- data/ext/uchardet/src/LangModels/LangLithuanianModel.cpp +209 -0
- data/ext/uchardet/src/LangModels/LangMalteseModel.cpp +138 -0
- data/ext/uchardet/src/LangModels/LangPolishModel.cpp +304 -0
- data/ext/uchardet/src/LangModels/LangPortugueseModel.cpp +241 -0
- data/ext/uchardet/src/LangModels/LangRomanianModel.cpp +236 -0
- data/ext/uchardet/src/LangModels/LangRussianModel.cpp +367 -0
- data/ext/uchardet/src/LangModels/LangSlovakModel.cpp +293 -0
- data/ext/uchardet/src/LangModels/LangSloveneModel.cpp +264 -0
- data/ext/uchardet/src/LangModels/LangSpanishModel.cpp +204 -0
- data/ext/uchardet/src/LangModels/LangSwedishModel.cpp +266 -0
- data/ext/uchardet/src/LangModels/LangThaiModel.cpp +267 -0
- data/ext/uchardet/src/LangModels/LangTurkishModel.cpp +175 -0
- data/ext/uchardet/src/LangModels/LangVietnameseModel.cpp +249 -0
- data/ext/uchardet/src/nsBig5Prober.cpp +88 -0
- data/ext/uchardet/src/nsBig5Prober.h +75 -0
- data/ext/uchardet/src/nsCharSetProber.cpp +125 -0
- data/ext/uchardet/src/nsCharSetProber.h +77 -0
- data/ext/uchardet/src/nsCodingStateMachine.h +105 -0
- data/ext/uchardet/src/nsEUCJPProber.cpp +99 -0
- data/ext/uchardet/src/nsEUCJPProber.h +79 -0
- data/ext/uchardet/src/nsEUCKRProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCKRProber.h +81 -0
- data/ext/uchardet/src/nsEUCTWProber.cpp +91 -0
- data/ext/uchardet/src/nsEUCTWProber.h +75 -0
- data/ext/uchardet/src/nsEscCharsetProber.cpp +101 -0
- data/ext/uchardet/src/nsEscCharsetProber.h +70 -0
- data/ext/uchardet/src/nsEscSM.cpp +267 -0
- data/ext/uchardet/src/nsGB2312Prober.cpp +96 -0
- data/ext/uchardet/src/nsGB2312Prober.h +77 -0
- data/ext/uchardet/src/nsHebrewProber.cpp +194 -0
- data/ext/uchardet/src/nsHebrewProber.h +177 -0
- data/ext/uchardet/src/nsLatin1Prober.cpp +182 -0
- data/ext/uchardet/src/nsLatin1Prober.h +73 -0
- data/ext/uchardet/src/nsMBCSGroupProber.cpp +242 -0
- data/ext/uchardet/src/nsMBCSGroupProber.h +81 -0
- data/ext/uchardet/src/nsMBCSSM.cpp +513 -0
- data/ext/uchardet/src/nsPkgInt.h +89 -0
- data/ext/uchardet/src/nsSBCSGroupProber.cpp +343 -0
- data/ext/uchardet/src/nsSBCSGroupProber.h +71 -0
- data/ext/uchardet/src/nsSBCharSetProber.cpp +160 -0
- data/ext/uchardet/src/nsSBCharSetProber.h +258 -0
- data/ext/uchardet/src/nsSJISProber.cpp +98 -0
- data/ext/uchardet/src/nsSJISProber.h +81 -0
- data/ext/uchardet/src/nsUTF8Prober.cpp +87 -0
- data/ext/uchardet/src/nsUTF8Prober.h +66 -0
- data/ext/uchardet/src/nsUniversalDetector.cpp +339 -0
- data/ext/uchardet/src/nsUniversalDetector.h +91 -0
- data/ext/uchardet/src/nscore.h +59 -0
- data/ext/uchardet/src/prmem.h +49 -0
- data/ext/uchardet/src/symbols.cmake +41 -0
- data/ext/uchardet/src/tools/CMakeLists.txt +23 -0
- data/ext/uchardet/src/tools/uchardet.cpp +254 -0
- data/ext/uchardet/src/uchardet.cpp +274 -0
- data/ext/uchardet/src/uchardet.h +136 -0
- data/ext/uchardet/test/CMakeLists.txt +47 -0
- data/ext/uchardet/test/ar/iso-8859-6.txt +3 -0
- data/ext/uchardet/test/ar/utf-8.txt +3 -0
- data/ext/uchardet/test/ar/windows-1256.txt +3 -0
- data/ext/uchardet/test/bg/windows-1251.txt +3 -0
- data/ext/uchardet/test/cs/ibm852.txt +4 -0
- data/ext/uchardet/test/cs/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/cs/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/cs/utf-8.txt +4 -0
- data/ext/uchardet/test/cs/windows-1250.txt +4 -0
- data/ext/uchardet/test/da/iso-8859-1.txt +7 -0
- data/ext/uchardet/test/da/iso-8859-15.txt +10 -0
- data/ext/uchardet/test/da/utf-8.txt +10 -0
- data/ext/uchardet/test/da/windows-1252.txt +10 -0
- data/ext/uchardet/test/de/iso-8859-1.txt +11 -0
- data/ext/uchardet/test/de/windows-1252.txt +11 -0
- data/ext/uchardet/test/el/iso-8859-7.txt +3 -0
- data/ext/uchardet/test/el/utf-8.txt +3 -0
- data/ext/uchardet/test/el/windows-1253.txt +5 -0
- data/ext/uchardet/test/en/ascii.txt +4 -0
- data/ext/uchardet/test/eo/iso-8859-3.txt +7 -0
- data/ext/uchardet/test/es/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/es/iso-8859-15.txt +5 -0
- data/ext/uchardet/test/es/utf-8.txt +5 -0
- data/ext/uchardet/test/es/windows-1252.txt +5 -0
- data/ext/uchardet/test/et/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-15.txt +6 -0
- data/ext/uchardet/test/et/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/et/utf-8.txt +6 -0
- data/ext/uchardet/test/et/windows-1252.txt +6 -0
- data/ext/uchardet/test/et/windows-1257.txt +6 -0
- data/ext/uchardet/test/fi/iso-8859-1.txt +8 -0
- data/ext/uchardet/test/fi/utf-8.txt +8 -0
- data/ext/uchardet/test/fr/iso-8859-1.txt +5 -0
- data/ext/uchardet/test/fr/iso-8859-15.txt +16 -0
- data/ext/uchardet/test/fr/utf-16.be +0 -0
- data/ext/uchardet/test/fr/utf-32.le +0 -0
- data/ext/uchardet/test/fr/utf-8.txt +14 -0
- data/ext/uchardet/test/fr/windows-1252.txt +3 -0
- data/ext/uchardet/test/ga/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/ga/utf-8.txt +6 -0
- data/ext/uchardet/test/ga/windows-1252.txt +6 -0
- data/ext/uchardet/test/he/iso-8859-8.txt +2 -0
- data/ext/uchardet/test/he/utf-8.txt +3 -0
- data/ext/uchardet/test/he/windows-1255.txt +1 -0
- data/ext/uchardet/test/hr/ibm852.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-13.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-16.txt +4 -0
- data/ext/uchardet/test/hr/iso-8859-2.txt +4 -0
- data/ext/uchardet/test/hr/mac-centraleurope.txt +4 -0
- data/ext/uchardet/test/hr/utf-8.txt +4 -0
- data/ext/uchardet/test/hr/windows-1250.txt +4 -0
- data/ext/uchardet/test/hu/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/hu/windows-1250.txt +1 -0
- data/ext/uchardet/test/it/iso-8859-1.txt +18 -0
- data/ext/uchardet/test/it/utf-8.txt +18 -0
- data/ext/uchardet/test/ja/euc-jp.txt +10 -0
- data/ext/uchardet/test/ja/iso-2022-jp.txt +8 -0
- data/ext/uchardet/test/ja/shift_jis.txt +1 -0
- data/ext/uchardet/test/ja/utf-16be.txt +0 -0
- data/ext/uchardet/test/ja/utf-16le.txt +0 -0
- data/ext/uchardet/test/ja/utf-8.txt +9 -0
- data/ext/uchardet/test/ko/iso-2022-kr.txt +8 -0
- data/ext/uchardet/test/ko/uhc.smi +16 -0
- data/ext/uchardet/test/ko/utf-16.le +0 -0
- data/ext/uchardet/test/ko/utf-32.be +0 -0
- data/ext/uchardet/test/ko/utf-8.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-10.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/lt/iso-8859-4.txt +3 -0
- data/ext/uchardet/test/lt/utf-8.txt +3 -0
- data/ext/uchardet/test/lv/iso-8859-10.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-13.txt +6 -0
- data/ext/uchardet/test/lv/iso-8859-4.txt +6 -0
- data/ext/uchardet/test/lv/utf-8.txt +6 -0
- data/ext/uchardet/test/mt/iso-8859-3.txt +4 -0
- data/ext/uchardet/test/mt/utf-8.txt +4 -0
- data/ext/uchardet/test/pl/ibm852.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-13.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-16.txt +3 -0
- data/ext/uchardet/test/pl/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/pl/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/pl/utf-8.txt +3 -0
- data/ext/uchardet/test/pl/windows-1250.txt +3 -0
- data/ext/uchardet/test/pt/iso-8859-1.txt +6 -0
- data/ext/uchardet/test/pt/utf-8.txt +6 -0
- data/ext/uchardet/test/ro/ibm852.txt +9 -0
- data/ext/uchardet/test/ro/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/ro/utf-8.txt +9 -0
- data/ext/uchardet/test/ro/windows-1250.txt +9 -0
- data/ext/uchardet/test/ru/ibm855.txt +5 -0
- data/ext/uchardet/test/ru/ibm866.txt +11 -0
- data/ext/uchardet/test/ru/iso-8859-5.txt +3 -0
- data/ext/uchardet/test/ru/koi8-r.txt +1 -0
- data/ext/uchardet/test/ru/mac-cyrillic.txt +9 -0
- data/ext/uchardet/test/ru/windows-1251.txt +4 -0
- data/ext/uchardet/test/sk/ibm852.txt +3 -0
- data/ext/uchardet/test/sk/iso-8859-2.txt +3 -0
- data/ext/uchardet/test/sk/mac-centraleurope.txt +3 -0
- data/ext/uchardet/test/sk/utf-8.txt +3 -0
- data/ext/uchardet/test/sk/windows-1250.txt +3 -0
- data/ext/uchardet/test/sl/ibm852.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-16.txt +9 -0
- data/ext/uchardet/test/sl/iso-8859-2.txt +9 -0
- data/ext/uchardet/test/sl/mac-centraleurope.txt +9 -0
- data/ext/uchardet/test/sl/utf-8.txt +9 -0
- data/ext/uchardet/test/sl/windows-1250.txt +9 -0
- data/ext/uchardet/test/sv/iso-8859-1.txt +10 -0
- data/ext/uchardet/test/sv/utf-8.txt +10 -0
- data/ext/uchardet/test/sv/windows-1252.txt +10 -0
- data/ext/uchardet/test/th/iso-8859-11.txt +5 -0
- data/ext/uchardet/test/th/tis-620.txt +5 -0
- data/ext/uchardet/test/th/utf-8.txt +1 -0
- data/ext/uchardet/test/tr/iso-8859-3.txt +13 -0
- data/ext/uchardet/test/tr/iso-8859-9.txt +13 -0
- data/ext/uchardet/test/uchardet-tests.c +130 -0
- data/ext/uchardet/test/vi/utf-8.txt +4 -0
- data/ext/uchardet/test/vi/viscii.txt +4 -0
- data/ext/uchardet/test/vi/windows-1258.txt +4 -0
- data/ext/uchardet/test/zh/big5.txt +1 -0
- data/ext/uchardet/test/zh/euc-tw.txt +1 -0
- data/ext/uchardet/test/zh/gb18030.txt +1 -0
- data/ext/uchardet/test/zh/utf-8.txt +1 -0
- data/ext/uchardet/uchardet.doap +51 -0
- data/ext/uchardet/uchardet.pc.in +10 -0
- data/lib/cchardet.rb +56 -0
- data/lib/cchardet/lib_finder.rb +32 -0
- data/lib/cchardet/version.rb +5 -0
- metadata +362 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
= Logs of language model for Turkish (tr) =
|
|
2
|
+
|
|
3
|
+
- Generated by BuildLangModel.py
|
|
4
|
+
- Started: 2015-12-04 02:22:03.929245
|
|
5
|
+
- Maximum depth: 3
|
|
6
|
+
- Max number of pages: 50
|
|
7
|
+
|
|
8
|
+
== Parsed pages ==
|
|
9
|
+
|
|
10
|
+
Ana_Sayfa (revision 16293313)
|
|
11
|
+
1048 (revision 12894005)
|
|
12
|
+
1131 (revision 14840814)
|
|
13
|
+
16. yüzyıl (revision 15185081)
|
|
14
|
+
1859 (revision 16014427)
|
|
15
|
+
1866 (revision 16120346)
|
|
16
|
+
1869 (revision 12888270)
|
|
17
|
+
1892 (revision 13955858)
|
|
18
|
+
1895 (revision 15334635)
|
|
19
|
+
1902 (revision 16283638)
|
|
20
|
+
1906 (revision 15874323)
|
|
21
|
+
1918 (revision 16099474)
|
|
22
|
+
1926 (revision 16180584)
|
|
23
|
+
1927 (revision 15370980)
|
|
24
|
+
1940 (revision 15370990)
|
|
25
|
+
1943 (revision 16091797)
|
|
26
|
+
1944 (revision 16247827)
|
|
27
|
+
1945 (revision 16281147)
|
|
28
|
+
1948 (revision 15443886)
|
|
29
|
+
1961 (revision 15799529)
|
|
30
|
+
1964 (revision 16085332)
|
|
31
|
+
1975 (revision 15006928)
|
|
32
|
+
1980 (revision 16213240)
|
|
33
|
+
1981 (revision 16295456)
|
|
34
|
+
1983 (revision 16327128)
|
|
35
|
+
1993 (revision 16300456)
|
|
36
|
+
2002 (revision 16297206)
|
|
37
|
+
2015 (revision 16328338)
|
|
38
|
+
24 Ekim (revision 16213661)
|
|
39
|
+
4 Aralık (revision 16341162)
|
|
40
|
+
ABD (revision 16325951)
|
|
41
|
+
ABD Senatosu (revision 15970439)
|
|
42
|
+
Adam Horowitz (revision 14362106)
|
|
43
|
+
Akçe (revision 16261547)
|
|
44
|
+
Altın Takım (revision 13503001)
|
|
45
|
+
American Broadcasting Company (revision 16055235)
|
|
46
|
+
Amerika Birleşik Devletleri (revision 16325951)
|
|
47
|
+
Ana Sayfa/Kardeş projeler (revision 16293313)
|
|
48
|
+
Ana Sayfa/Kategoriler (revision 16293313)
|
|
49
|
+
Aptullah Kuran (revision 15744893)
|
|
50
|
+
Avrupa (revision 16299756)
|
|
51
|
+
Ayasofya (revision 16305207)
|
|
52
|
+
BM Güvenlik Konseyi (revision 16085518)
|
|
53
|
+
Birleşmiş Milletler (revision 16258474)
|
|
54
|
+
Budapeşte (revision 16219173)
|
|
55
|
+
CIA (revision 16054325)
|
|
56
|
+
Charlie Pace (revision 16129416)
|
|
57
|
+
Cuma (revision 14197127)
|
|
58
|
+
Desmond Hume (revision 16035300)
|
|
59
|
+
Diğerleri (Lost) (revision 16329444)
|
|
60
|
+
|
|
61
|
+
== End of Parsed pages ==
|
|
62
|
+
|
|
63
|
+
- Wikipedia parsing ended at: 2015-12-04 02:24:44.728803
|
|
64
|
+
|
|
65
|
+
48 characters appeared 267623 times.
|
|
66
|
+
|
|
67
|
+
First 36 characters:
|
|
68
|
+
[ 0] Char a: 12.311722086666691 %
|
|
69
|
+
[ 1] Char e: 8.716365932673948 %
|
|
70
|
+
[ 2] Char i: 8.507863673899479 %
|
|
71
|
+
[ 3] Char n: 7.322987934519828 %
|
|
72
|
+
[ 4] Char r: 6.979220769515326 %
|
|
73
|
+
[ 5] Char l: 6.609297407173524 %
|
|
74
|
+
[ 6] Char ı: 4.514933320379788 %
|
|
75
|
+
[ 7] Char d: 4.3475336574210734 %
|
|
76
|
+
[ 8] Char t: 4.2634601659797555 %
|
|
77
|
+
[ 9] Char k: 4.240293248338147 %
|
|
78
|
+
[10] Char s: 3.929781819948211 %
|
|
79
|
+
[11] Char m: 3.429451130881875 %
|
|
80
|
+
[12] Char u: 3.0998830444319063 %
|
|
81
|
+
[13] Char y: 2.9212735826143494 %
|
|
82
|
+
[14] Char o: 2.7135186437638024 %
|
|
83
|
+
[15] Char b: 2.3129551645411643 %
|
|
84
|
+
[16] Char ü: 1.8305601536489764 %
|
|
85
|
+
[17] Char ş: 1.5988909772328985 %
|
|
86
|
+
[18] Char z: 1.2267256551193282 %
|
|
87
|
+
[19] Char h: 1.1983274980102607 %
|
|
88
|
+
[20] Char v: 1.194964558352608 %
|
|
89
|
+
[21] Char c: 1.143773143563894 %
|
|
90
|
+
[22] Char g: 1.1004285879763698 %
|
|
91
|
+
[23] Char p: 1.0178497363828969 %
|
|
92
|
+
[24] Char ç: 0.8295251155543433 %
|
|
93
|
+
[25] Char ğ: 0.8205572764672693 %
|
|
94
|
+
[26] Char f: 0.7047226882592303 %
|
|
95
|
+
[27] Char ö: 0.6710932916827029 %
|
|
96
|
+
[28] Char j: 0.1296600068006113 %
|
|
97
|
+
[29] Char w: 0.11359262843627041 %
|
|
98
|
+
[30] Char â: 0.07846859201189733 %
|
|
99
|
+
[31] Char î: 0.04147625577771716 %
|
|
100
|
+
[32] Char x: 0.024287897527492032 %
|
|
101
|
+
[33] Char é: 0.014946398478456635 %
|
|
102
|
+
[34] Char q: 0.01083613889688106 %
|
|
103
|
+
[35] Char û: 0.009341499049035397 %
|
|
104
|
+
|
|
105
|
+
The first 36 characters have an accumulated ratio of 0.99980569681978.
|
|
106
|
+
|
|
107
|
+
935 sequences found.
|
|
108
|
+
|
|
109
|
+
First 512 (typical positive ratio): 0.991865243864388
|
|
110
|
+
Next 512 (512-1024): 3.7365996196141585e-06
|
|
111
|
+
Rest: 2.949029909160572e-17
|
|
112
|
+
|
|
113
|
+
- Processing end: 2015-12-04 02:24:44.883537
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
= Logs of language model for Vietnamese (vi) =
|
|
2
|
+
|
|
3
|
+
- Generated by BuildLangModel.py
|
|
4
|
+
- Started: 2016-02-13 03:37:17.480303
|
|
5
|
+
- Maximum depth: 3
|
|
6
|
+
- Max number of pages: 40
|
|
7
|
+
|
|
8
|
+
== Parsed pages ==
|
|
9
|
+
|
|
10
|
+
Chữ_Quốc_ngữ (revision 22887853)
|
|
11
|
+
1651 (revision 21455247)
|
|
12
|
+
1773 (revision 21354755)
|
|
13
|
+
1815 (revision 21361292)
|
|
14
|
+
1838 (revision 21361314)
|
|
15
|
+
1865 (revision 21361338)
|
|
16
|
+
1869 (revision 21361342)
|
|
17
|
+
1888 (revision 21389506)
|
|
18
|
+
1902 (revision 21354811)
|
|
19
|
+
1918 (revision 21354828)
|
|
20
|
+
1919 (revision 21354829)
|
|
21
|
+
1938 (revision 21354849)
|
|
22
|
+
1945 (revision 21354857)
|
|
23
|
+
22 tháng 2 (revision 21376086)
|
|
24
|
+
26 tháng 11 (revision 22579845)
|
|
25
|
+
28 tháng 12 (revision 22475308)
|
|
26
|
+
A (revision 22549334)
|
|
27
|
+
ASCII (revision 22528409)
|
|
28
|
+
Alexandre de Rhodes (revision 22859954)
|
|
29
|
+
Antonio Barbosa (revision 22145269)
|
|
30
|
+
B (revision 22836557)
|
|
31
|
+
BBC (revision 22863903)
|
|
32
|
+
Biên khảo (revision 22531516)
|
|
33
|
+
Bán nguyên âm (revision 22655600)
|
|
34
|
+
Bình luận (revision 22117664)
|
|
35
|
+
Bảng chữ cái Bồ Đào Nha (revision 22887853)
|
|
36
|
+
Bảng chữ cái Hy Lạp (revision 21362081)
|
|
37
|
+
Bảng chữ cái Latinh (revision 22442448)
|
|
38
|
+
Bắc Kỳ (revision 22393289)
|
|
39
|
+
Bồ Đào Nha (revision 22620858)
|
|
40
|
+
C (revision 21341881)
|
|
41
|
+
Cao Xuân Dục (revision 22620201)
|
|
42
|
+
Chính tả (revision 22187359)
|
|
43
|
+
Chính tả tiếng Việt (revision 20897580)
|
|
44
|
+
Chữ Hán (revision 22889609)
|
|
45
|
+
Chữ Nôm (revision 22781506)
|
|
46
|
+
Chữ cái (revision 22169220)
|
|
47
|
+
Công giáo (revision 22173119)
|
|
48
|
+
D (revision 21447691)
|
|
49
|
+
|
|
50
|
+
== End of Parsed pages ==
|
|
51
|
+
|
|
52
|
+
- Wikipedia parsing ended at: 2016-02-13 03:42:06.560479
|
|
53
|
+
|
|
54
|
+
101 characters appeared 222814 times.
|
|
55
|
+
|
|
56
|
+
First 55 characters:
|
|
57
|
+
[ 0] Char n: 11.262308472537633 %
|
|
58
|
+
[ 1] Char h: 8.881398834902654 %
|
|
59
|
+
[ 2] Char t: 7.022898022565907 %
|
|
60
|
+
[ 3] Char c: 6.365398942615815 %
|
|
61
|
+
[ 4] Char i: 6.198443544840091 %
|
|
62
|
+
[ 5] Char g: 5.591210606155808 %
|
|
63
|
+
[ 6] Char a: 3.5998635633308496 %
|
|
64
|
+
[ 7] Char u: 2.8499106878382867 %
|
|
65
|
+
[ 8] Char m: 2.615185760320267 %
|
|
66
|
+
[ 9] Char o: 2.6012728105056238 %
|
|
67
|
+
[10] Char đ: 2.222032726848403 %
|
|
68
|
+
[11] Char r: 2.1102803234985235 %
|
|
69
|
+
[12] Char à: 2.0447548179198796 %
|
|
70
|
+
[13] Char v: 1.9437737305555307 %
|
|
71
|
+
[14] Char l: 1.9119085874316697 %
|
|
72
|
+
[15] Char á: 1.7539292863105551 %
|
|
73
|
+
[16] Char p: 1.6453185167897888 %
|
|
74
|
+
[17] Char b: 1.541195795596327 %
|
|
75
|
+
[18] Char ư: 1.4397659033992478 %
|
|
76
|
+
[19] Char s: 1.3760356171515256 %
|
|
77
|
+
[20] Char y: 1.280440187779942 %
|
|
78
|
+
[21] Char e: 1.2454334108269678 %
|
|
79
|
+
[22] Char d: 1.1251537156552103 %
|
|
80
|
+
[23] Char ế: 1.071745940560288 %
|
|
81
|
+
[24] Char k: 1.0695019163966357 %
|
|
82
|
+
[25] Char â: 0.9658280000359044 %
|
|
83
|
+
[26] Char ữ: 0.9604423420431392 %
|
|
84
|
+
[27] Char ê: 0.8374698178749989 %
|
|
85
|
+
[28] Char ệ: 0.7459136319979893 %
|
|
86
|
+
[29] Char ô: 0.7073164163831717 %
|
|
87
|
+
[30] Char ạ: 0.6727584442629277 %
|
|
88
|
+
[31] Char ộ: 0.6705144200992756 %
|
|
89
|
+
[32] Char ố: 0.6476253736300233 %
|
|
90
|
+
[33] Char ó: 0.6072329386842837 %
|
|
91
|
+
[34] Char ả: 0.5484395055965963 %
|
|
92
|
+
[35] Char ủ: 0.5475418959311353 %
|
|
93
|
+
[36] Char q: 0.5138815334763525 %
|
|
94
|
+
[37] Char ợ: 0.48560682901433483 %
|
|
95
|
+
[38] Char ờ: 0.4851580241816044 %
|
|
96
|
+
[39] Char ể: 0.4748355130288043 %
|
|
97
|
+
[40] Char ớ: 0.4676546357051173 %
|
|
98
|
+
[41] Char ấ: 0.418286104104769 %
|
|
99
|
+
[42] Char ị: 0.40212913012647317 %
|
|
100
|
+
[43] Char ầ: 0.3904602044754818 %
|
|
101
|
+
[44] Char ọ: 0.3801376933226817 %
|
|
102
|
+
[45] Char ề: 0.3787912788244904 %
|
|
103
|
+
[46] Char ơ: 0.3590438661843511 %
|
|
104
|
+
[47] Char í: 0.35679984202069887 %
|
|
105
|
+
[48] Char ụ: 0.35276059852612496 %
|
|
106
|
+
[49] Char ậ: 0.3469261357006292 %
|
|
107
|
+
[50] Char ì: 0.32762752789322036 %
|
|
108
|
+
[51] Char ă: 0.3253835037295682 %
|
|
109
|
+
[52] Char ứ: 0.29665999443482005 %
|
|
110
|
+
[53] Char ồ: 0.29665999443482005 %
|
|
111
|
+
[54] Char x: 0.2939671654384374 %
|
|
112
|
+
|
|
113
|
+
The first 55 characters have an accumulated ratio of 0.9603301408349568.
|
|
114
|
+
|
|
115
|
+
1494 sequences found.
|
|
116
|
+
|
|
117
|
+
First 512 (typical positive ratio): 0.9321889118082535
|
|
118
|
+
Next 512 (512-1024): 0.009604423420431392
|
|
119
|
+
Rest: 0.0068905733918831966
|
|
120
|
+
|
|
121
|
+
- Processing end: 2016-02-13 03:42:07.174723
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Supporting new languages or updating existing ones #
|
|
2
|
+
|
|
3
|
+
We generate statistical language data using Wikipedia as natural
|
|
4
|
+
language text resource.
|
|
5
|
+
|
|
6
|
+
Right now, we have automated scripts only to generate statistical data
|
|
7
|
+
for single-byte encodings. Multi-byte encodings usually require more
|
|
8
|
+
in-depth knowledge of their specifications.
|
|
9
|
+
|
|
10
|
+
## New single-byte encoding ##
|
|
11
|
+
|
|
12
|
+
Uchardet uses language data, and therefore rather than supporting a
|
|
13
|
+
charset, we in fact support a (language, charset) pair. So for
|
|
14
|
+
instance if uchardet supports (French, ISO-8859-15), it should be able
|
|
15
|
+
to recognize French text encoded in ISO-8859-15, but may fail at
|
|
16
|
+
detecting ISO-8859-15 for non-supported languages.
|
|
17
|
+
|
|
18
|
+
This is why, though less flexible, it also makes uchardet much more
|
|
19
|
+
accurate than other detection systems, as well as making it an efficient
|
|
20
|
+
language recognition system.
|
|
21
|
+
Since many single-byte charsets actually share the same layout (or very
|
|
22
|
+
similar ones), it is actually impossible to have an accurate single-byte
|
|
23
|
+
encoding detector for random text.
|
|
24
|
+
|
|
25
|
+
Therefore you need to describe the language and the codepoint layouts of
|
|
26
|
+
every charset you want to add support for.
|
|
27
|
+
|
|
28
|
+
I recommend having a look at langs/fr.py which is heavily commented as
|
|
29
|
+
a base of a new language description, and charsets/windows-1252.py as a
|
|
30
|
+
base for a new charset layout (note that charset layouts can be shared
|
|
31
|
+
between languages. If yours is already there, you have nothing to do).
|
|
32
|
+
The important names in the charset file are:
|
|
33
|
+
|
|
34
|
+
- `name`: an iconv-compatible name.
|
|
35
|
+
- `charmap`: fill it with CTR (control character), SYM (symbol), NUM
|
|
36
|
+
(number), LET (letter), ILL (illegal codepoint).
|
|
37
|
+
|
|
38
|
+
## Tools ##
|
|
39
|
+
|
|
40
|
+
You must install Python 3 and the [`Wikipedia` Python
|
|
41
|
+
tool](https://github.com/goldsmith/Wikipedia).
|
|
42
|
+
|
|
43
|
+
## Run script ##
|
|
44
|
+
|
|
45
|
+
Say you added (or modified) support for French (`fr`); then run:
|
|
46
|
+
|
|
47
|
+
> ./BuildLangModel.py fr --max-page=100 --max-depth=4
|
|
48
|
+
|
|
49
|
+
The options can be changed to any value. Bigger values mean the script
|
|
50
|
+
will process more data, so more processing time now, but uchardet may
|
|
51
|
+
possibly be more accurate in the end.
|
|
52
|
+
|
|
53
|
+
## Updating core code ##
|
|
54
|
+
|
|
55
|
+
If you were only updating data for a language model, you have nothing
|
|
56
|
+
else to do. Just build `uchardet` again and test it.
|
|
57
|
+
|
|
58
|
+
If you were creating new models though, you will have to add these in
|
|
59
|
+
src/nsSBCSGroupProber.cpp and src/nsSBCharSetProber.h, and increase the
|
|
60
|
+
value of `NUM_OF_SBCS_PROBERS` in src/nsSBCSGroupProber.h.
|
|
61
|
+
Finally add the new file in src/CMakeLists.txt.
|
|
62
|
+
|
|
63
|
+
I will be looking to make this step more straightforward in the future.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# ##### BEGIN LICENSE BLOCK #####
|
|
5
|
+
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
6
|
+
#
|
|
7
|
+
# The contents of this file are subject to the Mozilla Public License Version
|
|
8
|
+
# 1.1 (the "License"); you may not use this file except in compliance with
|
|
9
|
+
# the License. You may obtain a copy of the License at
|
|
10
|
+
# http://www.mozilla.org/MPL/
|
|
11
|
+
#
|
|
12
|
+
# Software distributed under the License is distributed on an "AS IS" basis,
|
|
13
|
+
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
14
|
+
# for the specific language governing rights and limitations under the
|
|
15
|
+
# License.
|
|
16
|
+
#
|
|
17
|
+
# The Original Code is Mozilla Universal charset detector code.
|
|
18
|
+
#
|
|
19
|
+
# The Initial Developer of the Original Code is
|
|
20
|
+
# Netscape Communications Corporation.
|
|
21
|
+
# Portions created by the Initial Developer are Copyright (C) 2001
|
|
22
|
+
# the Initial Developer. All Rights Reserved.
|
|
23
|
+
#
|
|
24
|
+
# Contributor(s):
|
|
25
|
+
# Jehan <jehan@girinstud.io>
|
|
26
|
+
#
|
|
27
|
+
# Alternatively, the contents of this file may be used under the terms of
|
|
28
|
+
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
29
|
+
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
30
|
+
# in which case the provisions of the GPL or the LGPL are applicable instead
|
|
31
|
+
# of those above. If you wish to allow use of your version of this file only
|
|
32
|
+
# under the terms of either the GPL or the LGPL, and not to allow others to
|
|
33
|
+
# use your version of this file under the terms of the MPL, indicate your
|
|
34
|
+
# decision by deleting the provisions above and replace them with the notice
|
|
35
|
+
# and other provisions required by the GPL or the LGPL. If you do not delete
|
|
36
|
+
# the provisions above, a recipient may use your version of this file under
|
|
37
|
+
# the terms of any one of the MPL, the GPL or the LGPL.
|
|
38
|
+
#
|
|
39
|
+
# ##### END LICENSE BLOCK #####
|
|
40
|
+
|
|
41
|
+
# Illegal codepoints.
|
|
42
|
+
ILL = 255
|
|
43
|
+
# Control characters
|
|
44
|
+
CTR = 254
|
|
45
|
+
# Symbols and punctuation.
|
|
46
|
+
SYM = 253
|
|
47
|
+
# Return/Line feeds.
|
|
48
|
+
RET = 252
|
|
49
|
+
# Numbers 0-9.
|
|
50
|
+
NUM = 251
|
|
51
|
+
|
|
52
|
+
# Letters (should be all the rest).
|
|
53
|
+
LET = 0
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# ##### BEGIN LICENSE BLOCK #####
|
|
5
|
+
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
6
|
+
#
|
|
7
|
+
# The contents of this file are subject to the Mozilla Public License Version
|
|
8
|
+
# 1.1 (the "License"); you may not use this file except in compliance with
|
|
9
|
+
# the License. You may obtain a copy of the License at
|
|
10
|
+
# http://www.mozilla.org/MPL/
|
|
11
|
+
#
|
|
12
|
+
# Software distributed under the License is distributed on an "AS IS" basis,
|
|
13
|
+
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
14
|
+
# for the specific language governing rights and limitations under the
|
|
15
|
+
# License.
|
|
16
|
+
#
|
|
17
|
+
# The Original Code is Mozilla Universal charset detector code.
|
|
18
|
+
#
|
|
19
|
+
# The Initial Developer of the Original Code is
|
|
20
|
+
# Netscape Communications Corporation.
|
|
21
|
+
# Portions created by the Initial Developer are Copyright (C) 2001
|
|
22
|
+
# the Initial Developer. All Rights Reserved.
|
|
23
|
+
#
|
|
24
|
+
# Contributor(s):
|
|
25
|
+
# Jehan <jehan@girinstud.io>
|
|
26
|
+
#
|
|
27
|
+
# Alternatively, the contents of this file may be used under the terms of
|
|
28
|
+
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
29
|
+
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
30
|
+
# in which case the provisions of the GPL or the LGPL are applicable instead
|
|
31
|
+
# of those above. If you wish to allow use of your version of this file only
|
|
32
|
+
# under the terms of either the GPL or the LGPL, and not to allow others to
|
|
33
|
+
# use your version of this file under the terms of the MPL, indicate your
|
|
34
|
+
# decision by deleting the provisions above and replace them with the notice
|
|
35
|
+
# and other provisions required by the GPL or the LGPL. If you do not delete
|
|
36
|
+
# the provisions above, a recipient may use your version of this file under
|
|
37
|
+
# the terms of any one of the MPL, the GPL or the LGPL.
|
|
38
|
+
#
|
|
39
|
+
# ##### END LICENSE BLOCK #####
|
|
40
|
+
|
|
41
|
+
import importlib
|
|
42
|
+
import sys
|
|
43
|
+
import os
|
|
44
|
+
|
|
45
|
+
def load(charset_names):
    '''
    Load a list of charsets.

    Each requested name is lower-cased and imported as a module from the
    "charsets" directory that sits next to this file's directory. The
    loaded modules are returned in a dictionary keyed by the ``name``
    attribute each module defines.

    If a module cannot be imported, an error message is printed and the
    process exits with status 1 (``SystemExit`` is raised).

    :param charset_names: a list of supported charset names.
    :return: a dictionary with all the loaded charsets.
    :rtype: dict
    '''
    charsets = {}

    # Temporarily restrict the module search path to the charset
    # database only, so that a charset name can never resolve to an
    # unrelated module found elsewhere on the regular path.
    sys_path_backup = sys.path
    current_dir = os.path.dirname(os.path.realpath(__file__))
    sys.path = [os.path.join(current_dir, '..', 'charsets')]

    try:
        for name in charset_names:
            try:
                charset = importlib.import_module(name.lower())
            except ImportError:
                print('Unknown charset "{}": '
                      'file "charsets/{}.py" does not exist.'.format(name, name.lower()))
                # sys.exit() rather than the exit() builtin: the latter is
                # only injected by the `site` module and may be missing.
                sys.exit(1)
            charsets[charset.name] = charset
    finally:
        # Always set back the default module paths, including when an
        # import fails or the process is asked to exit.
        sys.path = sys_path_backup

    return charsets
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
# ##### BEGIN LICENSE BLOCK #####
|
|
5
|
+
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
6
|
+
#
|
|
7
|
+
# The contents of this file are subject to the Mozilla Public License Version
|
|
8
|
+
# 1.1 (the "License"); you may not use this file except in compliance with
|
|
9
|
+
# the License. You may obtain a copy of the License at
|
|
10
|
+
# http://www.mozilla.org/MPL/
|
|
11
|
+
#
|
|
12
|
+
# Software distributed under the License is distributed on an "AS IS" basis,
|
|
13
|
+
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
14
|
+
# for the specific language governing rights and limitations under the
|
|
15
|
+
# License.
|
|
16
|
+
#
|
|
17
|
+
# The Original Code is Mozilla Universal charset detector code.
|
|
18
|
+
#
|
|
19
|
+
# The Initial Developer of the Original Code is
|
|
20
|
+
# Netscape Communications Corporation.
|
|
21
|
+
# Portions created by the Initial Developer are Copyright (C) 2001
|
|
22
|
+
# the Initial Developer. All Rights Reserved.
|
|
23
|
+
#
|
|
24
|
+
# Contributor(s):
|
|
25
|
+
# Jehan <jehan@girinstud.io>
|
|
26
|
+
#
|
|
27
|
+
# Alternatively, the contents of this file may be used under the terms of
|
|
28
|
+
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
29
|
+
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
30
|
+
# in which case the provisions of the GPL or the LGPL are applicable instead
|
|
31
|
+
# of those above. If you wish to allow use of your version of this file only
|
|
32
|
+
# under the terms of either the GPL or the LGPL, and not to allow others to
|
|
33
|
+
# use your version of this file under the terms of the MPL, indicate your
|
|
34
|
+
# decision by deleting the provisions above and replace them with the notice
|
|
35
|
+
# and other provisions required by the GPL or the LGPL. If you do not delete
|
|
36
|
+
# the provisions above, a recipient may use your version of this file under
|
|
37
|
+
# the terms of any one of the MPL, the GPL or the LGPL.
|
|
38
|
+
#
|
|
39
|
+
# ##### END LICENSE BLOCK #####
|
|
40
|
+
|
|
41
|
+
from codepoints import *
|
|
42
|
+
|
|
43
|
+
# Canonical charset name and the alternative names it is known under.
name = 'IBM852'
aliases = ['CP852']

# Language codes associated with this charset, split into two groups.
# NOTE(review): presumably 'complete' lists languages whose alphabet is
# fully representable in this charset and 'incomplete' those only
# partially covered -- confirm against the code consuming this dict.
language = {
    'complete': [
        'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl',
        'hsb', 'dsb', 'tk',
    ],
    'incomplete': ['ro'],
}
|
|
52
|
+
|
|
53
|
+
# Character class of each of the 256 byte values, laid out as a 16x16
# grid: the row is the high nibble of the byte (see the trailing "# nX"
# comment), the column is the low nibble (see the header line below).
#   X0   X1   X2   X3   X4   X5   X6   X7   X8   X9   XA   XB   XC   XD   XE   XF
charmap = [
    CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, RET, CTR, CTR, RET, CTR, CTR,  # 0X
    CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR, CTR,  # 1X
    SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM,  # 2X
    NUM, NUM, NUM, NUM, NUM, NUM, NUM, NUM, NUM, NUM, SYM, SYM, SYM, SYM, SYM, SYM,  # 3X
    SYM, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET,  # 4X
    LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, SYM, SYM, SYM, SYM, SYM,  # 5X
    SYM, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET,  # 6X
    LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, SYM, SYM, SYM, SYM, CTR,  # 7X
    LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET,  # 8X
    LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, SYM, LET,  # 9X
    LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, SYM, LET, LET, LET, SYM, SYM,  # AX
    SYM, SYM, SYM, SYM, SYM, LET, LET, LET, LET, SYM, SYM, SYM, SYM, LET, LET, SYM,  # BX
    SYM, SYM, SYM, SYM, SYM, SYM, LET, LET, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM,  # CX
    LET, LET, LET, LET, LET, LET, LET, LET, LET, SYM, SYM, SYM, SYM, LET, LET, SYM,  # DX
    LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, LET, SYM,  # EX
    SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, LET, LET, LET, SYM, SYM,  # FX
]
|