@datawrapper/jschardet 3.0.1-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTORS +4 -0
- package/LICENSE +504 -0
- package/README.md +101 -0
- package/dist/jschardet.js +7859 -0
- package/dist/jschardet.min.js +669 -0
- package/index.d.ts +13 -0
- package/index.js +1 -0
- package/package.json +33 -0
- package/src/big5freq.js +925 -0
- package/src/big5prober.js +54 -0
- package/src/chardistribution.js +301 -0
- package/src/charsetgroupprober.js +120 -0
- package/src/charsetprober.js +104 -0
- package/src/codingstatemachine.js +71 -0
- package/src/constants.js +40 -0
- package/src/escprober.js +109 -0
- package/src/escsm.js +250 -0
- package/src/eucjpprober.js +107 -0
- package/src/euckrfreq.js +597 -0
- package/src/euckrprober.js +54 -0
- package/src/euctwfreq.js +429 -0
- package/src/euctwprober.js +54 -0
- package/src/gb2312freq.js +473 -0
- package/src/gb2312prober.js +54 -0
- package/src/hebrewprober.js +323 -0
- package/src/index.js +56 -0
- package/src/jisfreq.js +569 -0
- package/src/jpcntx.js +242 -0
- package/src/langbulgarianmodel.js +228 -0
- package/src/langcyrillicmodel.js +329 -0
- package/src/langgreekmodel.js +225 -0
- package/src/langhebrewmodel.js +199 -0
- package/src/langhungarianmodel.js +225 -0
- package/src/langthaimodel.js +200 -0
- package/src/latin1prober.js +168 -0
- package/src/logger.js +7 -0
- package/src/mbcharsetprober.js +99 -0
- package/src/mbcsgroupprober.js +64 -0
- package/src/mbcssm/big5.js +52 -0
- package/src/mbcssm/eucjp.js +54 -0
- package/src/mbcssm/euckr.js +51 -0
- package/src/mbcssm/euctw.js +55 -0
- package/src/mbcssm/gb2312.js +60 -0
- package/src/mbcssm/sjis.js +54 -0
- package/src/mbcssm/ucs2be.js +56 -0
- package/src/mbcssm/ucs2le.js +56 -0
- package/src/mbcssm/utf8.js +75 -0
- package/src/sbcharsetprober.js +137 -0
- package/src/sbcsgroupprober.js +83 -0
- package/src/sjisprober.js +105 -0
- package/src/universaldetector.js +262 -0
- package/src/utf8prober.js +108 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
3
|
+
*
|
|
4
|
+
* The Initial Developer of the Original Code is
|
|
5
|
+
* Netscape Communications Corporation.
|
|
6
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
7
|
+
* the Initial Developer. All Rights Reserved.
|
|
8
|
+
*
|
|
9
|
+
* Contributor(s):
|
|
10
|
+
* António Afonso (antonio.afonso gmail.com) - port to JavaScript
|
|
11
|
+
* Mark Pilgrim - port to Python
|
|
12
|
+
* Shy Shalom - original C code
|
|
13
|
+
*
|
|
14
|
+
* This library is free software; you can redistribute it and/or
|
|
15
|
+
* modify it under the terms of the GNU Lesser General Public
|
|
16
|
+
* License as published by the Free Software Foundation; either
|
|
17
|
+
* version 2.1 of the License, or (at your option) any later version.
|
|
18
|
+
*
|
|
19
|
+
* This library is distributed in the hope that it will be useful,
|
|
20
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
22
|
+
* Lesser General Public License for more details.
|
|
23
|
+
*
|
|
24
|
+
* You should have received a copy of the GNU Lesser General Public
|
|
25
|
+
* License along with this library; if not, write to the Free Software
|
|
26
|
+
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
|
27
|
+
* 02110-1301 USA
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
var CharSetProber = require('./charsetprober');
|
|
31
|
+
var Constants = require('./constants');
|
|
32
|
+
|
|
33
|
+
// Character classes used by the Latin-1 prober. The numeric values double as
// row/column indexes into Latin1ClassModel.
var UDF = 0; // undefined
var OTH = 1; // other
var ASC = 2; // ascii capital letter
var ASS = 3; // ascii small letter
var ACV = 4; // accent capital vowel
var ACO = 5; // accent capital other
var ASV = 6; // accent small vowel
var ASO = 7; // accent small other

// Maps each byte value (0x00 - 0xFF) to one of the classes above.
var Latin1_CharToClass = [
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO    // F8 - FF
];
|
|
76
|
+
|
|
77
|
+
// Likelihood of a character class (column) following another class (row),
// stored row-major with 8 entries per row:
// 0 : illegal
// 1 : very unlikely
// 2 : normal
// 3 : very likely
var Latin1ClassModel = [
//  UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  // UDF
    0,  3,  3,  3,  3,  3,  3,  3,  // OTH
    0,  3,  3,  3,  3,  3,  3,  3,  // ASC
    0,  3,  3,  3,  1,  1,  3,  3,  // ASS
    0,  3,  3,  3,  1,  2,  1,  2,  // ACV
    0,  3,  3,  3,  3,  3,  3,  3,  // ACO
    0,  3,  1,  3,  1,  1,  1,  3,  // ASV
    0,  3,  1,  3,  1,  1,  3,  3   // ASO
];
|
|
92
|
+
|
|
93
|
+
/**
 * Prober for windows-1252 (Latin-1) text.
 *
 * Each character is mapped to a class through Latin1_CharToClass, and every
 * (previous class, current class) pair is scored through Latin1ClassModel.
 * A score of 0 marks the pair as illegal and puts the prober in the notMe
 * state; all other scores are tallied and later turned into a confidence.
 */
function Latin1Prober() {
    CharSetProber.apply(this);

    var FREQ_CAT_NUM = 4; // number of score buckets (scores 0..3)
    var CLASS_NUM = 8; // total classes
    var self = this;

    function init() {
        self.reset();
    }

    /** Clears the tallies and the remembered class, then runs the inherited reset. */
    this.reset = function() {
        this._mLastCharClass = OTH;
        this._mFreqCounter = [];
        for( var i = 0; i < FREQ_CAT_NUM; i++ ) {
            this._mFreqCounter[i] = 0;
        }
        Latin1Prober.prototype.reset.apply(this);
    };

    /** @returns {string} the charset name reported by this prober. */
    this.getCharsetName = function() {
        return "windows-1252";
    };

    /** @returns {string[]} single-element list holding the charset name. */
    this.getSupportedCharsetNames = function() {
        return [this.getCharsetName()];
    };

    /**
     * Scores the characters of aBuf and updates the tallies.
     *
     * @param {string} aBuf - text to analyse; it is first passed through
     *     filterWithEnglishLetters.
     * @returns the prober state after consuming the buffer.
     */
    this.feed = function(aBuf) {
        aBuf = this.filterWithEnglishLetters(aBuf);
        for( var i = 0; i < aBuf.length; i++ ) {
            var c = aBuf.charCodeAt(i);
            var charClass = Latin1_CharToClass[c];
            var freq = Latin1ClassModel[(this._mLastCharClass * CLASS_NUM) + charClass];
            if( freq === 0 ) {
                // Illegal class sequence: this cannot be windows-1252.
                this._mState = Constants.notMe;
                break;
            }
            this._mFreqCounter[freq]++;
            this._mLastCharClass = charClass;
        }

        return this.getState();
    };

    /**
     * @returns {number} 0.01 once the prober is in the notMe state; otherwise
     *     the share of "very likely" pairs minus 20 times the share of
     *     "very unlikely" pairs, clamped at 0 and scaled by 0.95. When nothing
     *     has been counted yet the result is 0.
     */
    this.getConfidence = function() {
        if( this.getState() == Constants.notMe ) {
            return 0.01;
        }

        var total = 0;
        for( var i = 0; i < this._mFreqCounter.length; i++ ) {
            total += this._mFreqCounter[i];
        }

        var confidence;
        if( total < 0.01 ) {
            // Nothing counted yet. The original assigned to a stray
            // `constants` variable here, leaving confidence undefined and
            // making the function return NaN.
            confidence = 0.0;
        } else {
            confidence = (this._mFreqCounter[3] / total) - (this._mFreqCounter[1] * 20 / total);
        }
        if( confidence < 0 ) {
            confidence = 0.0;
        }
        // lower the confidence of latin1 so that other more accurate detector
        // can take priority.
        //
        // antonio.afonso: need to change this otherwise languages like pt, es, fr using latin1 will never be detected.
        confidence = confidence * 0.95;
        return confidence;
    };

    init();
}
|
|
166
|
+
// Latin1Prober inherits its shared behaviour from a CharSetProber instance.
Latin1Prober.prototype = new CharSetProber();

module.exports = Latin1Prober;
|
package/src/logger.js
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
3
|
+
*
|
|
4
|
+
* The Initial Developer of the Original Code is
|
|
5
|
+
* Netscape Communications Corporation.
|
|
6
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
7
|
+
* the Initial Developer. All Rights Reserved.
|
|
8
|
+
*
|
|
9
|
+
* Contributor(s):
|
|
10
|
+
* António Afonso (antonio.afonso gmail.com) - port to JavaScript
|
|
11
|
+
* Mark Pilgrim - port to Python
|
|
12
|
+
* Shy Shalom - original C code
|
|
13
|
+
*
|
|
14
|
+
* This library is free software; you can redistribute it and/or
|
|
15
|
+
* modify it under the terms of the GNU Lesser General Public
|
|
16
|
+
* License as published by the Free Software Foundation; either
|
|
17
|
+
* version 2.1 of the License, or (at your option) any later version.
|
|
18
|
+
*
|
|
19
|
+
* This library is distributed in the hope that it will be useful,
|
|
20
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
22
|
+
* Lesser General Public License for more details.
|
|
23
|
+
*
|
|
24
|
+
* You should have received a copy of the GNU Lesser General Public
|
|
25
|
+
* License along with this library; if not, write to the Free Software
|
|
26
|
+
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
|
27
|
+
* 02110-1301 USA
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
var CharSetProber = require('./charsetprober');
|
|
31
|
+
var constants = require('./constants');
|
|
32
|
+
var logger = require('./logger');
|
|
33
|
+
|
|
34
|
+
/**
 * Shared prober logic for multi-byte charsets.
 *
 * Bytes are run through a coding state machine (this._mCodingSM); every time
 * the machine reports a completed character, the last two bytes are handed
 * to a distribution analyzer (this._mDistributionAnalyzer). Both
 * collaborators start out as null here and must be assigned before feed()
 * is used (presumably by the concrete probers - confirm against them).
 */
function MultiByteCharSetProber() {
    CharSetProber.apply(this);

    var self = this;

    function init() {
        self._mDistributionAnalyzer = null;
        self._mCodingSM = null;
        self._mLastChar = ["\x00", "\x00"];
    }

    /** Runs the inherited reset, resets both collaborators if set, and clears the carried-over bytes. */
    this.reset = function() {
        MultiByteCharSetProber.prototype.reset.apply(this);
        if( this._mCodingSM ) {
            this._mCodingSM.reset();
        }
        if( this._mDistributionAnalyzer ) {
            this._mDistributionAnalyzer.reset();
        }
        this._mLastChar = ["\x00", "\x00"];
    };

    /** Placeholder with no return value in this base implementation. */
    this.getCharsetName = function() {
    };

    /**
     * Consumes a buffer of bytes and updates the prober state.
     *
     * @param {string} aBuf - buffer to analyse, indexed element by element.
     * @returns the prober state after consuming the buffer.
     */
    this.feed = function(aBuf) {
        var aLen = aBuf.length;
        for( var i = 0; i < aLen; i++ ) {
            var codingState = this._mCodingSM.nextState(aBuf[i]);
            if( codingState == constants.error ) {
                logger.log(this.getCharsetName() + " prober hit error at byte " + i + "\n");
                this._mState = constants.notMe;
                break;
            } else if( codingState == constants.itsMe ) {
                this._mState = constants.foundIt;
                break;
            } else if( codingState == constants.start ) {
                // A full character was just completed: analyse its last two bytes.
                var charLen = this._mCodingSM.getCurrentCharLen();
                if( i == 0 ) {
                    // The character may have begun in the previous buffer, so
                    // pair the carried-over byte with the first byte of this one.
                    this._mLastChar[1] = aBuf[0];
                    this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen);
                } else {
                    this._mDistributionAnalyzer.feed(aBuf.slice(i-1,i+1), charLen);
                }
            }
        }

        // Remember the final byte for the next call. An empty buffer has no
        // final byte, so the previously carried byte is kept instead of being
        // overwritten with undefined.
        if( aLen > 0 ) {
            this._mLastChar[0] = aBuf[aLen - 1];
        }

        if( this.getState() == constants.detecting ) {
            if( this._mDistributionAnalyzer.gotEnoughData() &&
                this.getConfidence() > constants.SHORTCUT_THRESHOLD ) {
                this._mState = constants.foundIt;
            }
        }

        return this.getState();
    };

    /** @returns the confidence reported by the distribution analyzer. */
    this.getConfidence = function() {
        return this._mDistributionAnalyzer.getConfidence();
    };

    // The original defined init() but never invoked it, leaving the fields
    // above undefined until reset() happened to run.
    init();
}
|
|
97
|
+
// MultiByteCharSetProber inherits its shared behaviour from a CharSetProber instance.
MultiByteCharSetProber.prototype = new CharSetProber();

module.exports = MultiByteCharSetProber;
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* The Original Code is Mozilla Universal charset detector code.
|
|
3
|
+
*
|
|
4
|
+
* The Initial Developer of the Original Code is
|
|
5
|
+
* Netscape Communications Corporation.
|
|
6
|
+
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
7
|
+
* the Initial Developer. All Rights Reserved.
|
|
8
|
+
*
|
|
9
|
+
* Contributor(s):
|
|
10
|
+
* António Afonso (antonio.afonso gmail.com) - port to JavaScript
|
|
11
|
+
* Mark Pilgrim - port to Python
|
|
12
|
+
* Shy Shalom - original C code
|
|
13
|
+
*
|
|
14
|
+
* This library is free software; you can redistribute it and/or
|
|
15
|
+
* modify it under the terms of the GNU Lesser General Public
|
|
16
|
+
* License as published by the Free Software Foundation; either
|
|
17
|
+
* version 2.1 of the License, or (at your option) any later version.
|
|
18
|
+
*
|
|
19
|
+
* This library is distributed in the hope that it will be useful,
|
|
20
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
22
|
+
* Lesser General Public License for more details.
|
|
23
|
+
*
|
|
24
|
+
* You should have received a copy of the GNU Lesser General Public
|
|
25
|
+
* License along with this library; if not, write to the Free Software
|
|
26
|
+
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
|
27
|
+
* 02110-1301 USA
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
var CharSetGroupProber = require('./charsetgroupprober');
|
|
31
|
+
var Big5Prober = require('./big5prober');
|
|
32
|
+
var UTF8Prober = require('./utf8prober');
|
|
33
|
+
var SJISProber = require('./sjisprober');
|
|
34
|
+
var EUCJPProber = require('./eucjpprober');
|
|
35
|
+
var GB2312Prober = require('./gb2312prober');
|
|
36
|
+
var EUCKRProber = require('./euckrprober');
|
|
37
|
+
var EUCTWProber = require('./euctwprober');
|
|
38
|
+
|
|
39
|
+
/**
 * Group prober that bundles one instance of every multi-byte charset prober.
 */
function MBCSGroupProber() {
    CharSetGroupProber.apply(this);
    this._mProbers = [
        new UTF8Prober(),
        new SJISProber(),
        new EUCJPProber(),
        new GB2312Prober(),
        new EUCKRProber(),
        new Big5Prober(),
        new EUCTWProber()
    ];

    // Collected once, in prober order. The original wrapped this in a function
    // expression that was never invoked (and that used the wrong `this`), so
    // getSupportedCharsetNames() returned a function instead of the names.
    const supportedCharsetNames = this._mProbers.map(function(prober) {
        return prober.getCharsetName();
    });

    /** @returns {string[]} charset name of every bundled prober, in prober order. */
    this.getSupportedCharsetNames = function() {
        return supportedCharsetNames;
    };

    this.reset();
}
|
|
62
|
+
// MBCSGroupProber inherits its shared behaviour from a CharSetGroupProber instance.
MBCSGroupProber.prototype = new CharSetGroupProber();

module.exports = MBCSGroupProber;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
var consts = require('../constants');
|
|
2
|
+
|
|
3
|
+
// Byte value (0x00 - 0xff) -> character class used by the Big5 state machine.
var BIG5_cls = [
    1,1,1,1,1,1,1,1,  // 00 - 07    //allow 0x00 as legal value
    1,1,1,1,1,1,0,0,  // 08 - 0f
    1,1,1,1,1,1,1,1,  // 10 - 17
    1,1,1,0,1,1,1,1,  // 18 - 1f
    1,1,1,1,1,1,1,1,  // 20 - 27
    1,1,1,1,1,1,1,1,  // 28 - 2f
    1,1,1,1,1,1,1,1,  // 30 - 37
    1,1,1,1,1,1,1,1,  // 38 - 3f
    2,2,2,2,2,2,2,2,  // 40 - 47
    2,2,2,2,2,2,2,2,  // 48 - 4f
    2,2,2,2,2,2,2,2,  // 50 - 57
    2,2,2,2,2,2,2,2,  // 58 - 5f
    2,2,2,2,2,2,2,2,  // 60 - 67
    2,2,2,2,2,2,2,2,  // 68 - 6f
    2,2,2,2,2,2,2,2,  // 70 - 77
    2,2,2,2,2,2,2,1,  // 78 - 7f
    4,4,4,4,4,4,4,4,  // 80 - 87
    4,4,4,4,4,4,4,4,  // 88 - 8f
    4,4,4,4,4,4,4,4,  // 90 - 97
    4,4,4,4,4,4,4,4,  // 98 - 9f
    4,3,3,3,3,3,3,3,  // a0 - a7
    3,3,3,3,3,3,3,3,  // a8 - af
    3,3,3,3,3,3,3,3,  // b0 - b7
    3,3,3,3,3,3,3,3,  // b8 - bf
    3,3,3,3,3,3,3,3,  // c0 - c7
    3,3,3,3,3,3,3,3,  // c8 - cf
    3,3,3,3,3,3,3,3,  // d0 - d7
    3,3,3,3,3,3,3,3,  // d8 - df
    3,3,3,3,3,3,3,3,  // e0 - e7
    3,3,3,3,3,3,3,3,  // e8 - ef
    3,3,3,3,3,3,3,3,  // f0 - f7
    3,3,3,3,3,3,3,0   // f8 - ff
];
|
|
37
|
+
|
|
38
|
+
var BIG5_st = [
|
|
39
|
+
consts.error,consts.start,consts.start, 3,consts.error,consts.error,consts.error,consts.error, //00-07
|
|
40
|
+
consts.error,consts.error,consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.error, //08-0f
|
|
41
|
+
consts.error,consts.start,consts.start,consts.start,consts.start,consts.start,consts.start,consts.start //10-17
|
|
42
|
+
];
|
|
43
|
+
|
|
44
|
+
var Big5CharLenTable = [0, 1, 1, 2, 0];
|
|
45
|
+
|
|
46
|
+
module.exports = {
|
|
47
|
+
"classTable" : BIG5_cls,
|
|
48
|
+
"classFactor" : 5,
|
|
49
|
+
"stateTable" : BIG5_st,
|
|
50
|
+
"charLenTable" : Big5CharLenTable,
|
|
51
|
+
"name" : "Big5"
|
|
52
|
+
};
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
var consts = require('../constants');
|
|
2
|
+
|
|
3
|
+
// Byte value (0x00 - 0xff) -> character class used by the EUC-JP state machine.
var EUCJP_cls = [
    4,4,4,4,4,4,4,4,  // 00 - 07
    4,4,4,4,4,4,5,5,  // 08 - 0f
    4,4,4,4,4,4,4,4,  // 10 - 17
    4,4,4,5,4,4,4,4,  // 18 - 1f
    4,4,4,4,4,4,4,4,  // 20 - 27
    4,4,4,4,4,4,4,4,  // 28 - 2f
    4,4,4,4,4,4,4,4,  // 30 - 37
    4,4,4,4,4,4,4,4,  // 38 - 3f
    4,4,4,4,4,4,4,4,  // 40 - 47
    4,4,4,4,4,4,4,4,  // 48 - 4f
    4,4,4,4,4,4,4,4,  // 50 - 57
    4,4,4,4,4,4,4,4,  // 58 - 5f
    4,4,4,4,4,4,4,4,  // 60 - 67
    4,4,4,4,4,4,4,4,  // 68 - 6f
    4,4,4,4,4,4,4,4,  // 70 - 77
    4,4,4,4,4,4,4,4,  // 78 - 7f
    5,5,5,5,5,5,5,5,  // 80 - 87
    5,5,5,5,5,5,1,3,  // 88 - 8f
    5,5,5,5,5,5,5,5,  // 90 - 97
    5,5,5,5,5,5,5,5,  // 98 - 9f
    5,2,2,2,2,2,2,2,  // a0 - a7
    2,2,2,2,2,2,2,2,  // a8 - af
    2,2,2,2,2,2,2,2,  // b0 - b7
    2,2,2,2,2,2,2,2,  // b8 - bf
    2,2,2,2,2,2,2,2,  // c0 - c7
    2,2,2,2,2,2,2,2,  // c8 - cf
    2,2,2,2,2,2,2,2,  // d0 - d7
    2,2,2,2,2,2,2,2,  // d8 - df
    0,0,0,0,0,0,0,0,  // e0 - e7
    0,0,0,0,0,0,0,0,  // e8 - ef
    0,0,0,0,0,0,0,0,  // f0 - f7
    0,0,0,0,0,0,0,5   // f8 - ff
];
|
|
37
|
+
|
|
38
|
+
var EUCJP_st = [
|
|
39
|
+
3, 4, 3, 5,consts.start,consts.error,consts.error,consts.error, //00-07
|
|
40
|
+
consts.error,consts.error,consts.error,consts.error,consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe, //08-0f
|
|
41
|
+
consts.itsMe,consts.itsMe,consts.start,consts.error,consts.start,consts.error,consts.error,consts.error, //10-17
|
|
42
|
+
consts.error,consts.error,consts.start,consts.error,consts.error,consts.error, 3,consts.error, //18-1f
|
|
43
|
+
3,consts.error,consts.error,consts.error,consts.start,consts.start,consts.start,consts.start //20-27
|
|
44
|
+
];
|
|
45
|
+
|
|
46
|
+
var EUCJPCharLenTable = [2, 2, 2, 3, 1, 0];
|
|
47
|
+
|
|
48
|
+
module.exports = {
|
|
49
|
+
"classTable" : EUCJP_cls,
|
|
50
|
+
"classFactor" : 6,
|
|
51
|
+
"stateTable" : EUCJP_st,
|
|
52
|
+
"charLenTable" : EUCJPCharLenTable,
|
|
53
|
+
"name" : "EUC-JP"
|
|
54
|
+
};
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
var consts = require('../constants');
|
|
2
|
+
|
|
3
|
+
// Byte value (0x00 - 0xff) -> character class used by the EUC-KR state machine.
var EUCKR_cls = [
    1,1,1,1,1,1,1,1,  // 00 - 07
    1,1,1,1,1,1,0,0,  // 08 - 0f
    1,1,1,1,1,1,1,1,  // 10 - 17
    1,1,1,0,1,1,1,1,  // 18 - 1f
    1,1,1,1,1,1,1,1,  // 20 - 27
    1,1,1,1,1,1,1,1,  // 28 - 2f
    1,1,1,1,1,1,1,1,  // 30 - 37
    1,1,1,1,1,1,1,1,  // 38 - 3f
    1,1,1,1,1,1,1,1,  // 40 - 47
    1,1,1,1,1,1,1,1,  // 48 - 4f
    1,1,1,1,1,1,1,1,  // 50 - 57
    1,1,1,1,1,1,1,1,  // 58 - 5f
    1,1,1,1,1,1,1,1,  // 60 - 67
    1,1,1,1,1,1,1,1,  // 68 - 6f
    1,1,1,1,1,1,1,1,  // 70 - 77
    1,1,1,1,1,1,1,1,  // 78 - 7f
    0,0,0,0,0,0,0,0,  // 80 - 87
    0,0,0,0,0,0,0,0,  // 88 - 8f
    0,0,0,0,0,0,0,0,  // 90 - 97
    0,0,0,0,0,0,0,0,  // 98 - 9f
    0,2,2,2,2,2,2,2,  // a0 - a7
    2,2,2,2,2,3,3,3,  // a8 - af
    2,2,2,2,2,2,2,2,  // b0 - b7
    2,2,2,2,2,2,2,2,  // b8 - bf
    2,2,2,2,2,2,2,2,  // c0 - c7
    2,3,2,2,2,2,2,2,  // c8 - cf
    2,2,2,2,2,2,2,2,  // d0 - d7
    2,2,2,2,2,2,2,2,  // d8 - df
    2,2,2,2,2,2,2,2,  // e0 - e7
    2,2,2,2,2,2,2,2,  // e8 - ef
    2,2,2,2,2,2,2,2,  // f0 - f7
    2,2,2,2,2,2,2,0   // f8 - ff
];
|
|
37
|
+
|
|
38
|
+
var EUCKR_st = [
|
|
39
|
+
consts.error,consts.start, 3,consts.error,consts.error,consts.error,consts.error,consts.error, //00-07
|
|
40
|
+
consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.error,consts.error,consts.start,consts.start //08-0f
|
|
41
|
+
];
|
|
42
|
+
|
|
43
|
+
var EUCKRCharLenTable = [0, 1, 2, 0];
|
|
44
|
+
|
|
45
|
+
module.exports = {
|
|
46
|
+
"classTable" : EUCKR_cls,
|
|
47
|
+
"classFactor" : 4,
|
|
48
|
+
"stateTable" : EUCKR_st,
|
|
49
|
+
"charLenTable" : EUCKRCharLenTable,
|
|
50
|
+
"name" : "EUC-KR"
|
|
51
|
+
};
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
var consts = require('../constants');
|
|
2
|
+
|
|
3
|
+
// Byte value (0x00 - 0xff) -> character class used by the EUC-TW state machine.
var EUCTW_cls = [
    2,2,2,2,2,2,2,2,  // 00 - 07
    2,2,2,2,2,2,0,0,  // 08 - 0f
    2,2,2,2,2,2,2,2,  // 10 - 17
    2,2,2,0,2,2,2,2,  // 18 - 1f
    2,2,2,2,2,2,2,2,  // 20 - 27
    2,2,2,2,2,2,2,2,  // 28 - 2f
    2,2,2,2,2,2,2,2,  // 30 - 37
    2,2,2,2,2,2,2,2,  // 38 - 3f
    2,2,2,2,2,2,2,2,  // 40 - 47
    2,2,2,2,2,2,2,2,  // 48 - 4f
    2,2,2,2,2,2,2,2,  // 50 - 57
    2,2,2,2,2,2,2,2,  // 58 - 5f
    2,2,2,2,2,2,2,2,  // 60 - 67
    2,2,2,2,2,2,2,2,  // 68 - 6f
    2,2,2,2,2,2,2,2,  // 70 - 77
    2,2,2,2,2,2,2,2,  // 78 - 7f
    0,0,0,0,0,0,0,0,  // 80 - 87
    0,0,0,0,0,0,6,0,  // 88 - 8f
    0,0,0,0,0,0,0,0,  // 90 - 97
    0,0,0,0,0,0,0,0,  // 98 - 9f
    0,3,4,4,4,4,4,4,  // a0 - a7
    5,5,1,1,1,1,1,1,  // a8 - af
    1,1,1,1,1,1,1,1,  // b0 - b7
    1,1,1,1,1,1,1,1,  // b8 - bf
    1,1,3,1,3,3,3,3,  // c0 - c7
    3,3,3,3,3,3,3,3,  // c8 - cf
    3,3,3,3,3,3,3,3,  // d0 - d7
    3,3,3,3,3,3,3,3,  // d8 - df
    3,3,3,3,3,3,3,3,  // e0 - e7
    3,3,3,3,3,3,3,3,  // e8 - ef
    3,3,3,3,3,3,3,3,  // f0 - f7
    3,3,3,3,3,3,3,0   // f8 - ff
];
|
|
37
|
+
|
|
38
|
+
var EUCTW_st = [
|
|
39
|
+
consts.error,consts.error,consts.start, 3, 3, 3, 4,consts.error, //00-07
|
|
40
|
+
consts.error,consts.error,consts.error,consts.error,consts.error,consts.error,consts.itsMe,consts.itsMe, //08-0f
|
|
41
|
+
consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.error,consts.start,consts.error, //10-17
|
|
42
|
+
consts.start,consts.start,consts.start,consts.error,consts.error,consts.error,consts.error,consts.error, //18-1f
|
|
43
|
+
5,consts.error,consts.error,consts.error,consts.start,consts.error,consts.start,consts.start, //20-27
|
|
44
|
+
consts.start,consts.error,consts.start,consts.start,consts.start,consts.start,consts.start,consts.start //28-2f
|
|
45
|
+
];
|
|
46
|
+
|
|
47
|
+
var EUCTWCharLenTable = [0, 0, 1, 2, 2, 2, 3];
|
|
48
|
+
|
|
49
|
+
module.exports = {
|
|
50
|
+
"classTable" : EUCTW_cls,
|
|
51
|
+
"classFactor" : 7,
|
|
52
|
+
"stateTable" : EUCTW_st,
|
|
53
|
+
"charLenTable" : EUCTWCharLenTable,
|
|
54
|
+
"name" : "x-euc-tw"
|
|
55
|
+
};
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
var consts = require('../constants');
|
|
2
|
+
|
|
3
|
+
// Byte value (0x00 - 0xff) -> character class used by the GB2312 state machine.
var GB2312_cls = [
    1,1,1,1,1,1,1,1,  // 00 - 07
    1,1,1,1,1,1,0,0,  // 08 - 0f
    1,1,1,1,1,1,1,1,  // 10 - 17
    1,1,1,0,1,1,1,1,  // 18 - 1f
    1,1,1,1,1,1,1,1,  // 20 - 27
    1,1,1,1,1,1,1,1,  // 28 - 2f
    3,3,3,3,3,3,3,3,  // 30 - 37
    3,3,1,1,1,1,1,1,  // 38 - 3f
    2,2,2,2,2,2,2,2,  // 40 - 47
    2,2,2,2,2,2,2,2,  // 48 - 4f
    2,2,2,2,2,2,2,2,  // 50 - 57
    2,2,2,2,2,2,2,2,  // 58 - 5f
    2,2,2,2,2,2,2,2,  // 60 - 67
    2,2,2,2,2,2,2,2,  // 68 - 6f
    2,2,2,2,2,2,2,2,  // 70 - 77
    2,2,2,2,2,2,2,4,  // 78 - 7f
    5,6,6,6,6,6,6,6,  // 80 - 87
    6,6,6,6,6,6,6,6,  // 88 - 8f
    6,6,6,6,6,6,6,6,  // 90 - 97
    6,6,6,6,6,6,6,6,  // 98 - 9f
    6,6,6,6,6,6,6,6,  // a0 - a7
    6,6,6,6,6,6,6,6,  // a8 - af
    6,6,6,6,6,6,6,6,  // b0 - b7
    6,6,6,6,6,6,6,6,  // b8 - bf
    6,6,6,6,6,6,6,6,  // c0 - c7
    6,6,6,6,6,6,6,6,  // c8 - cf
    6,6,6,6,6,6,6,6,  // d0 - d7
    6,6,6,6,6,6,6,6,  // d8 - df
    6,6,6,6,6,6,6,6,  // e0 - e7
    6,6,6,6,6,6,6,6,  // e8 - ef
    6,6,6,6,6,6,6,6,  // f0 - f7
    6,6,6,6,6,6,6,0   // f8 - ff
];
|
|
37
|
+
|
|
38
|
+
var GB2312_st = [
|
|
39
|
+
consts.error,consts.start,consts.start,consts.start,consts.start,consts.start, 3,consts.error, //00-07
|
|
40
|
+
consts.error,consts.error,consts.error,consts.error,consts.error,consts.error,consts.itsMe,consts.itsMe, //08-0f
|
|
41
|
+
consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe,consts.error,consts.error,consts.start, //10-17
|
|
42
|
+
4,consts.error,consts.start,consts.start,consts.error,consts.error,consts.error,consts.error, //18-1f
|
|
43
|
+
consts.error,consts.error, 5,consts.error,consts.error,consts.error,consts.itsMe,consts.error, //20-27
|
|
44
|
+
consts.error,consts.error,consts.start,consts.start,consts.start,consts.start,consts.start,consts.start //28-2f
|
|
45
|
+
];
|
|
46
|
+
|
|
47
|
+
// To be accurate, the length of class 6 can be either 2 or 4.
|
|
48
|
+
// But it is not necessary to discriminate between the two since
|
|
49
|
+
// it is used for frequency analysis only, and we are validing
|
|
50
|
+
// each code range there as well. So it is safe to set it to be
|
|
51
|
+
// 2 here.
|
|
52
|
+
var GB2312CharLenTable = [0, 1, 1, 1, 1, 1, 2];
|
|
53
|
+
|
|
54
|
+
module.exports = {
|
|
55
|
+
"classTable" : GB2312_cls,
|
|
56
|
+
"classFactor" : 7,
|
|
57
|
+
"stateTable" : GB2312_st,
|
|
58
|
+
"charLenTable" : GB2312CharLenTable,
|
|
59
|
+
"name" : "GB2312"
|
|
60
|
+
};
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
var consts = require('../constants');
|
|
2
|
+
|
|
3
|
+
// Byte value (0x00 - 0xff) -> character class used by the Shift_JIS state machine.
var SJIS_cls = [
    1,1,1,1,1,1,1,1,  // 00 - 07
    1,1,1,1,1,1,0,0,  // 08 - 0f
    1,1,1,1,1,1,1,1,  // 10 - 17
    1,1,1,0,1,1,1,1,  // 18 - 1f
    1,1,1,1,1,1,1,1,  // 20 - 27
    1,1,1,1,1,1,1,1,  // 28 - 2f
    1,1,1,1,1,1,1,1,  // 30 - 37
    1,1,1,1,1,1,1,1,  // 38 - 3f
    2,2,2,2,2,2,2,2,  // 40 - 47
    2,2,2,2,2,2,2,2,  // 48 - 4f
    2,2,2,2,2,2,2,2,  // 50 - 57
    2,2,2,2,2,2,2,2,  // 58 - 5f
    2,2,2,2,2,2,2,2,  // 60 - 67
    2,2,2,2,2,2,2,2,  // 68 - 6f
    2,2,2,2,2,2,2,2,  // 70 - 77
    2,2,2,2,2,2,2,1,  // 78 - 7f
    3,3,3,3,3,3,3,3,  // 80 - 87
    3,3,3,3,3,3,3,3,  // 88 - 8f
    3,3,3,3,3,3,3,3,  // 90 - 97
    3,3,3,3,3,3,3,3,  // 98 - 9f
    // 0xa0 is illegal in sjis encoding, but some pages do
    // contain such a byte. We need to be more error-forgiving.
    2,2,2,2,2,2,2,2,  // a0 - a7
    2,2,2,2,2,2,2,2,  // a8 - af
    2,2,2,2,2,2,2,2,  // b0 - b7
    2,2,2,2,2,2,2,2,  // b8 - bf
    2,2,2,2,2,2,2,2,  // c0 - c7
    2,2,2,2,2,2,2,2,  // c8 - cf
    2,2,2,2,2,2,2,2,  // d0 - d7
    2,2,2,2,2,2,2,2,  // d8 - df
    3,3,3,3,3,3,3,3,  // e0 - e7
    3,3,3,3,3,4,4,4,  // e8 - ef
    3,3,3,3,3,3,3,3,  // f0 - f7
    3,3,3,3,3,0,0,0   // f8 - ff
];
|
|
39
|
+
|
|
40
|
+
var SJIS_st = [
|
|
41
|
+
consts.error,consts.start,consts.start, 3,consts.error,consts.error,consts.error,consts.error, //00-07
|
|
42
|
+
consts.error,consts.error,consts.error,consts.error,consts.itsMe,consts.itsMe,consts.itsMe,consts.itsMe, //08-0f
|
|
43
|
+
consts.itsMe,consts.itsMe,consts.error,consts.error,consts.start,consts.start,consts.start,consts.start //10-17
|
|
44
|
+
];
|
|
45
|
+
|
|
46
|
+
var SJISCharLenTable = [0, 1, 1, 2, 0, 0];
|
|
47
|
+
|
|
48
|
+
module.exports = {
|
|
49
|
+
"classTable" : SJIS_cls,
|
|
50
|
+
"classFactor" : 6,
|
|
51
|
+
"stateTable" : SJIS_st,
|
|
52
|
+
"charLenTable" : SJISCharLenTable,
|
|
53
|
+
"name" : "Shift_JIS"
|
|
54
|
+
};
|