@datawrapper/jschardet 3.0.1-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/CONTRIBUTORS +4 -0
  2. package/LICENSE +504 -0
  3. package/README.md +101 -0
  4. package/dist/jschardet.js +7859 -0
  5. package/dist/jschardet.min.js +669 -0
  6. package/index.d.ts +13 -0
  7. package/index.js +1 -0
  8. package/package.json +33 -0
  9. package/src/big5freq.js +925 -0
  10. package/src/big5prober.js +54 -0
  11. package/src/chardistribution.js +301 -0
  12. package/src/charsetgroupprober.js +120 -0
  13. package/src/charsetprober.js +104 -0
  14. package/src/codingstatemachine.js +71 -0
  15. package/src/constants.js +40 -0
  16. package/src/escprober.js +109 -0
  17. package/src/escsm.js +250 -0
  18. package/src/eucjpprober.js +107 -0
  19. package/src/euckrfreq.js +597 -0
  20. package/src/euckrprober.js +54 -0
  21. package/src/euctwfreq.js +429 -0
  22. package/src/euctwprober.js +54 -0
  23. package/src/gb2312freq.js +473 -0
  24. package/src/gb2312prober.js +54 -0
  25. package/src/hebrewprober.js +323 -0
  26. package/src/index.js +56 -0
  27. package/src/jisfreq.js +569 -0
  28. package/src/jpcntx.js +242 -0
  29. package/src/langbulgarianmodel.js +228 -0
  30. package/src/langcyrillicmodel.js +329 -0
  31. package/src/langgreekmodel.js +225 -0
  32. package/src/langhebrewmodel.js +199 -0
  33. package/src/langhungarianmodel.js +225 -0
  34. package/src/langthaimodel.js +200 -0
  35. package/src/latin1prober.js +168 -0
  36. package/src/logger.js +7 -0
  37. package/src/mbcharsetprober.js +99 -0
  38. package/src/mbcsgroupprober.js +64 -0
  39. package/src/mbcssm/big5.js +52 -0
  40. package/src/mbcssm/eucjp.js +54 -0
  41. package/src/mbcssm/euckr.js +51 -0
  42. package/src/mbcssm/euctw.js +55 -0
  43. package/src/mbcssm/gb2312.js +60 -0
  44. package/src/mbcssm/sjis.js +54 -0
  45. package/src/mbcssm/ucs2be.js +56 -0
  46. package/src/mbcssm/ucs2le.js +56 -0
  47. package/src/mbcssm/utf8.js +75 -0
  48. package/src/sbcharsetprober.js +137 -0
  49. package/src/sbcsgroupprober.js +83 -0
  50. package/src/sjisprober.js +105 -0
  51. package/src/universaldetector.js +262 -0
  52. package/src/utf8prober.js +108 -0
@@ -0,0 +1,54 @@
1
+ /*
2
+ * The Original Code is Mozilla Universal charset detector code.
3
+ *
4
+ * The Initial Developer of the Original Code is
5
+ * Netscape Communications Corporation.
6
+ * Portions created by the Initial Developer are Copyright (C) 2001
7
+ * the Initial Developer. All Rights Reserved.
8
+ *
9
+ * Contributor(s):
10
+ * António Afonso (antonio.afonso gmail.com) - port to JavaScript
11
+ * Mark Pilgrim - port to Python
12
+ * Shy Shalom - original C code
13
+ *
14
+ * This library is free software; you can redistribute it and/or
15
+ * modify it under the terms of the GNU Lesser General Public
16
+ * License as published by the Free Software Foundation; either
17
+ * version 2.1 of the License, or (at your option) any later version.
18
+ *
19
+ * This library is distributed in the hope that it will be useful,
20
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ * Lesser General Public License for more details.
23
+ *
24
+ * You should have received a copy of the GNU Lesser General Public
25
+ * License along with this library; if not, write to the Free Software
26
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ * 02110-1301 USA
28
+ */
29
+
30
+ var CodingStateMachine = require('./codingstatemachine')
31
+ var MultiByteCharSetProber = require('./mbcharsetprober')
32
+ var Big5SMModel = require('./mbcssm/big5')
33
+ var Big5DistributionAnalysis = require('./chardistribution').Big5DistributionAnalysis
34
+
35
/**
 * Charset prober for Big5.
 *
 * Builds on MultiByteCharSetProber and plugs in the Big5 coding state
 * machine model together with the Big5 character distribution analyser.
 */
function Big5Prober() {
    MultiByteCharSetProber.apply(this);

    /**
     * @returns {string} the name of the charset this prober detects
     */
    this.getCharsetName = function() {
        return "Big5";
    };

    // Wire up the Big5-specific helpers, then start from a clean state.
    this._mCodingSM = new CodingStateMachine(Big5SMModel);
    this._mDistributionAnalyzer = new Big5DistributionAnalysis();
    this.reset();
}
52
+ Big5Prober.prototype = new MultiByteCharSetProber();
53
+
54
+ module.exports = Big5Prober
@@ -0,0 +1,301 @@
1
+ /*
2
+ * The Original Code is Mozilla Universal charset detector code.
3
+ *
4
+ * The Initial Developer of the Original Code is
5
+ * Netscape Communications Corporation.
6
+ * Portions created by the Initial Developer are Copyright (C) 2001
7
+ * the Initial Developer. All Rights Reserved.
8
+ *
9
+ * Contributor(s):
10
+ * António Afonso (antonio.afonso gmail.com) - port to JavaScript
11
+ * Mark Pilgrim - port to Python
12
+ * Shy Shalom - original C code
13
+ *
14
+ * This library is free software; you can redistribute it and/or
15
+ * modify it under the terms of the GNU Lesser General Public
16
+ * License as published by the Free Software Foundation; either
17
+ * version 2.1 of the License, or (at your option) any later version.
18
+ *
19
+ * This library is distributed in the hope that it will be useful,
20
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ * Lesser General Public License for more details.
23
+ *
24
+ * You should have received a copy of the GNU Lesser General Public
25
+ * License along with this library; if not, write to the Free Software
26
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ * 02110-1301 USA
28
+ */
29
+
30
+ var jisfreq = require('./jisfreq');
31
+ var euctwfreq = require('./euctwfreq');
32
+ var euckrfreq = require('./euckrfreq');
33
+ var gb2312freq = require('./gb2312freq');
34
+ var big5freq = require('./big5freq');
35
+
36
/**
 * Base analyser for the frequency distribution of two-byte characters.
 *
 * Every fed two-byte character is converted to an "order" by getOrder();
 * the analyser counts how many of those orders map to a frequency rank
 * below 512 and turns that ratio into a confidence value. The base
 * getOrder() always answers -1, so the charset-specific analysers replace
 * it and fill in the frequency table fields.
 */
function CharDistributionAnalysis() {
    var ENOUGH_DATA_THRESHOLD = 1024;
    var SURE_YES = 0.99;
    var SURE_NO = 0.01;
    var MINIMUM_DATA_THRESHOLD = 3;
    // Characters whose frequency order is below this value count as "frequent".
    var FREQUENT_ORDER_LIMIT = 512;

    /**
     * Reset the analyser and clear any accumulated state.
     */
    this.reset = function() {
        // Set once detection is done and a conclusion has been made.
        this._mDone = false;
        // Total characters encountered.
        this._mTotalChars = 0;
        // Number of characters whose frequency order is less than 512.
        this._mFreqChars = 0;
    };

    /**
     * Feed a character with known length.
     *
     * @param {string} aStr the bytes of one character
     * @param {number} aCharLen length of that character in bytes
     */
    this.feed = function(aStr, aCharLen) {
        // Only 2-byte characters take part in the distribution analysis.
        var order = (aCharLen == 2) ? this.getOrder(aStr) : -1;
        if( !(order >= 0) ) {
            return;
        }

        // The order is valid, so the character counts towards the total.
        this._mTotalChars++;
        if( order < this._mTableSize && FREQUENT_ORDER_LIMIT > this._mCharToFreqOrder[order] ) {
            this._mFreqChars++;
        }
    };

    /**
     * Return a confidence value based on the data seen so far.
     *
     * @returns {number} a value no greater than 0.99; 0.01 when too little
     *     relevant data has been seen
     */
    this.getConfidence = function() {
        var total = this._mTotalChars;
        var frequent = this._mFreqChars;

        // Nothing (or too little) in our consideration range: negative answer.
        if( total <= 0 || frequent <= MINIMUM_DATA_THRESHOLD ) {
            return SURE_NO;
        }

        if( total != frequent ) {
            var ratio = frequent / ((total - frequent) * this._mTypicalDistributionRatio);
            if( ratio < SURE_YES ) {
                return ratio;
            }
        }

        // Normalize confidence (we don't want to be 100% sure).
        return SURE_YES;
    };

    /**
     * It is not necessary to receive all data to draw a conclusion; for
     * charset detection a certain amount of data is enough.
     *
     * @returns {boolean} true once more than 1024 characters were counted
     */
    this.gotEnoughData = function() {
        return this._mTotalChars > ENOUGH_DATA_THRESHOLD;
    };

    /**
     * Convert an encoded character to a number, here called order.
     *
     * Characters are not handled based on the original encoding string;
     * using an order allows multiple encodings of a language to share one
     * frequency table. The base implementation knows no encoding.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} always -1 in the base analyser
     */
    this.getOrder = function(aStr) {
        return -1;
    };

    // Mapping table to get frequency order from char order (see getOrder()).
    this._mCharToFreqOrder = null;
    // Size of the above table.
    this._mTableSize = null;
    // A constant which varies from language to language, used in calculating
    // confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
    // for further detail.
    this._mTypicalDistributionRatio = null;
    this.reset();
}
115
+
116
+ exports.CharDistributionAnalysis = CharDistributionAnalysis
117
+
118
/**
 * Character distribution analyser for EUC-TW, backed by the EUC-TW
 * frequency table.
 */
function EUCTWDistributionAnalysis() {
    CharDistributionAnalysis.apply(this);

    /**
     * Map a two-byte EUC-TW character to its order.
     *
     * For EUC-TW we are interested in
     *   first byte range:  0xc4 -- 0xfe
     *   second byte range: 0xa1 -- 0xfe
     * No validation is needed here; the state machine has done that.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} the order, or -1 when the first byte is below 0xC4
     */
    this.getOrder = function(aStr) {
        var lead = aStr.charCodeAt(0);
        return (lead >= 0xC4)
            ? 94 * (lead - 0xC4) + aStr.charCodeAt(1) - 0xA1
            : -1;
    };

    this._mCharToFreqOrder = euctwfreq.EUCTWCharToFreqOrder;
    this._mTableSize = euctwfreq.EUCTW_TABLE_SIZE;
    this._mTypicalDistributionRatio = euctwfreq.EUCTW_TYPICAL_DISTRIBUTION_RATIO;
}
143
+ EUCTWDistributionAnalysis.prototype = new CharDistributionAnalysis();
144
+
145
+ exports.EUCTWDistributionAnalysis = EUCTWDistributionAnalysis
146
+
147
/**
 * Character distribution analyser for EUC-KR, backed by the EUC-KR
 * frequency table.
 */
function EUCKRDistributionAnalysis() {
    CharDistributionAnalysis.apply(this);

    /**
     * Map a two-byte EUC-KR character to its order.
     *
     * For EUC-KR we are interested in
     *   first byte range:  0xb0 -- 0xfe
     *   second byte range: 0xa1 -- 0xfe
     * No validation is needed here; the state machine has done that.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} the order, or -1 when the first byte is below 0xB0
     */
    this.getOrder = function(aStr) {
        var lead = aStr.charCodeAt(0);
        return (lead >= 0xB0)
            ? 94 * (lead - 0xB0) + aStr.charCodeAt(1) - 0xA1
            : -1;
    };

    this._mCharToFreqOrder = euckrfreq.EUCKRCharToFreqOrder;
    this._mTableSize = euckrfreq.EUCKR_TABLE_SIZE;
    this._mTypicalDistributionRatio = euckrfreq.EUCKR_TYPICAL_DISTRIBUTION_RATIO;
}
172
+ EUCKRDistributionAnalysis.prototype = new CharDistributionAnalysis();
173
+
174
+ exports.EUCKRDistributionAnalysis = EUCKRDistributionAnalysis
175
+
176
/**
 * Character distribution analyser for GB2312, backed by the GB2312
 * frequency table.
 */
function GB2312DistributionAnalysis() {
    CharDistributionAnalysis.apply(this);

    /**
     * Map a two-byte GB2312 character to its order.
     *
     * For GB2312 we are interested in
     *   first byte range:  0xb0 -- 0xfe
     *   second byte range: 0xa1 -- 0xfe
     * No validation is needed here; the state machine has done that.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} the order, or -1 when either byte is below its
     *     range of interest
     */
    this.getOrder = function(aStr) {
        var lead = aStr.charCodeAt(0);
        var trail = aStr.charCodeAt(1);
        if( lead >= 0xB0 && trail >= 0xA1 ) {
            return 94 * (lead - 0xB0) + trail - 0xA1;
        }
        return -1;
    };

    this._mCharToFreqOrder = gb2312freq.GB2312CharToFreqOrder;
    this._mTableSize = gb2312freq.GB2312_TABLE_SIZE;
    this._mTypicalDistributionRatio = gb2312freq.GB2312_TYPICAL_DISTRIBUTION_RATIO;
}
201
+ GB2312DistributionAnalysis.prototype = new CharDistributionAnalysis();
202
+
203
+ exports.GB2312DistributionAnalysis = GB2312DistributionAnalysis
204
+
205
/**
 * Character distribution analyser for Big5, backed by the Big5
 * frequency table.
 */
function Big5DistributionAnalysis() {
    CharDistributionAnalysis.apply(this);

    /**
     * Map a two-byte Big5 character to its order.
     *
     * For Big5 we are interested in
     *   first byte range:  0xa4 -- 0xfe
     *   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
     * No validation is needed here; the state machine has done that.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} the order, or -1 when the first byte is below 0xA4
     */
    this.getOrder = function(aStr) {
        var lead = aStr.charCodeAt(0);
        if( !(lead >= 0xA4) ) {
            return -1;
        }

        // Each lead byte owns 157 slots: 63 for the low trail range,
        // followed by the slots for the high trail range.
        var rowStart = 157 * (lead - 0xA4);
        var trail = aStr.charCodeAt(1);
        if( trail >= 0xA1 ) {
            return rowStart + trail - 0xA1 + 63;
        }
        return rowStart + trail - 0x40;
    };

    this._mCharToFreqOrder = big5freq.Big5CharToFreqOrder;
    this._mTableSize = big5freq.BIG5_TABLE_SIZE;
    this._mTypicalDistributionRatio = big5freq.BIG5_TYPICAL_DISTRIBUTION_RATIO;
}
234
+ Big5DistributionAnalysis.prototype = new CharDistributionAnalysis();
235
+
236
+ exports.Big5DistributionAnalysis = Big5DistributionAnalysis
237
+
238
/**
 * Character distribution analyser for Shift_JIS, backed by the shared JIS
 * frequency table.
 */
function SJISDistributionAnalysis() {
    CharDistributionAnalysis.apply(this);

    /**
     * Map a two-byte Shift_JIS character to its order.
     *
     * For sjis we are interested in
     *   first byte range:  0x81 -- 0x9f , 0xe0 -- 0xef
     *   second byte range: 0x40 -- 0x7e , 0x80 -- 0xfc
     * No validation is needed here; the state machine has done that.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} the order, or -1 when either byte falls outside
     *     the ranges above
     */
    this.getOrder = function(aStr) {
        var lead = aStr.charCodeAt(0);
        var row;
        if( lead >= 0x81 && lead <= 0x9F ) {
            row = lead - 0x81;
        } else if( lead >= 0xE0 && lead <= 0xEF ) {
            // The second lead range continues after the 31 rows of the first.
            row = lead - 0xE0 + 31;
        } else {
            return -1;
        }

        var trail = aStr.charCodeAt(1);
        if( trail < 0x40 || trail === 0x7F || trail > 0xFC ) {
            return -1;
        }
        return 188 * row + (trail - 0x40);
    };

    this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
    this._mTableSize = jisfreq.JIS_TABLE_SIZE;
    this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;
}
270
+ SJISDistributionAnalysis.prototype = new CharDistributionAnalysis();
271
+
272
+ exports.SJISDistributionAnalysis = SJISDistributionAnalysis
273
+
274
/**
 * Character distribution analyser for EUC-JP, backed by the shared JIS
 * frequency table.
 */
function EUCJPDistributionAnalysis() {
    CharDistributionAnalysis.apply(this);

    /**
     * Map a two-byte EUC-JP character to its order.
     *
     * For EUC-JP we are interested in
     *   first byte range:  0xa0 -- 0xfe
     *   second byte range: 0xa1 -- 0xfe
     * No validation is needed here; the state machine has done that.
     *
     * @param {string} aStr the bytes of one character
     * @returns {number} the computed order, or -1 when the first byte is
     *     below 0xA0
     */
    this.getOrder = function(aStr) {
        if( !(aStr[0] >= "\xA0") ) {
            return -1;
        }
        // Rows are counted from lead byte 0xA1, so a 0xA0 lead byte gives a
        // negative order for any trail byte up to 0xFE.
        var row = aStr.charCodeAt(0) - 0xA1;
        var column = aStr.charCodeAt(1) - 0xA1;
        return 94 * row + column;
    };

    this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
    this._mTableSize = jisfreq.JIS_TABLE_SIZE;
    this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;
}
299
+ EUCJPDistributionAnalysis.prototype = new CharDistributionAnalysis();
300
+
301
+ exports.EUCJPDistributionAnalysis = EUCJPDistributionAnalysis
@@ -0,0 +1,120 @@
1
+ /*
2
+ * The Original Code is Mozilla Universal charset detector code.
3
+ *
4
+ * The Initial Developer of the Original Code is
5
+ * Netscape Communications Corporation.
6
+ * Portions created by the Initial Developer are Copyright (C) 2001
7
+ * the Initial Developer. All Rights Reserved.
8
+ *
9
+ * Contributor(s):
10
+ * António Afonso (antonio.afonso gmail.com) - port to JavaScript
11
+ * Mark Pilgrim - port to Python
12
+ * Shy Shalom - original C code
13
+ *
14
+ * This library is free software; you can redistribute it and/or
15
+ * modify it under the terms of the GNU Lesser General Public
16
+ * License as published by the Free Software Foundation; either
17
+ * version 2.1 of the License, or (at your option) any later version.
18
+ *
19
+ * This library is distributed in the hope that it will be useful,
20
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ * Lesser General Public License for more details.
23
+ *
24
+ * You should have received a copy of the GNU Lesser General Public
25
+ * License along with this library; if not, write to the Free Software
26
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ * 02110-1301 USA
28
+ */
29
+
30
+ var constants = require('./constants');
31
+ var CharSetProber = require('./charsetprober');
32
+ var logger = require('./logger');
33
+
34
/**
 * Prober that delegates to a list of child probers (this._mProbers) and
 * reports the best guess among them.
 *
 * The constructor starts with an empty prober list; code that fills
 * this._mProbers is expected to call reset() afterwards so that the
 * children are activated and counted.
 */
function CharSetGroupProber() {
    CharSetProber.apply(this);

    /**
     * Reset this prober and every child prober, re-activating all children.
     */
    this.reset = function() {
        CharSetGroupProber.prototype.reset.apply(this);
        this._mActiveNum = 0;
        // Iterate by index so that a missing (null) entry is skipped rather
        // than ending the loop and leaving later probers untouched.
        for( var i = 0; i < this._mProbers.length; i++ ) {
            var prober = this._mProbers[i];
            if( !prober ) continue;
            prober.reset();
            prober.active = true;
            this._mActiveNum++;
        }
        this._mBestGuessProber = null;
    };

    /**
     * @returns {?string} charset name of the best guess prober, or null
     *     when no child prober qualifies as a best guess
     */
    this.getCharsetName = function() {
        if( !this._mBestGuessProber ) {
            // getConfidence() selects the best guess prober as a side effect.
            this.getConfidence();
            if( !this._mBestGuessProber ) return null;
        }
        return this._mBestGuessProber.getCharsetName();
    };

    /**
     * @throws {Error} always; not implemented for group probers
     */
    this.getSupportedCharsetNames = function() {
        throw new Error("Unimplemented method getSupportedCharsetNames()");
    };

    /**
     * Feed a chunk of data to every active child prober.
     *
     * @param {string} aBuf data to analyse
     * @returns {number} the state of this group prober after feeding
     */
    this.feed = function(aBuf) {
        for( var i = 0; i < this._mProbers.length; i++ ) {
            var prober = this._mProbers[i];
            if( !prober || !prober.active ) continue;
            var st = prober.feed(aBuf);
            if( !st ) continue;
            if( st == constants.foundIt ) {
                // A child is certain: remember it and make the group report
                // the same state, so that the returned state and
                // getConfidence() agree with the child's verdict.
                this._mBestGuessProber = prober;
                this._mState = constants.foundIt;
                return this.getState();
            } else if( st == constants.notMe ) {
                prober.active = false;
                this._mActiveNum--;
                if( this._mActiveNum <= 0 ) {
                    // Every child has ruled itself out.
                    this._mState = constants.notMe;
                    return this.getState();
                }
            }
        }
        return this.getState();
    };

    /**
     * Compute the confidence of the group and select the best guess prober.
     *
     * @returns {number} 0.99 when a child found its charset, 0.01 when all
     *     children ruled themselves out, otherwise the highest confidence
     *     among the active children (0.0 when there is none)
     */
    this.getConfidence = function() {
        var st = this.getState();
        if( st == constants.foundIt ) {
            return 0.99;
        } else if( st == constants.notMe ) {
            return 0.01;
        }
        var bestConf = 0.0;
        this._mBestGuessProber = null;
        for( var i = 0; i < this._mProbers.length; i++ ) {
            var prober = this._mProbers[i];
            if( !prober ) continue;
            if( !prober.active ) {
                logger.log(prober.getCharsetName() + " not active\n");
                continue;
            }
            var cf = prober.getConfidence();
            logger.log(prober.getCharsetName() + " confidence = " + cf + "\n");
            if( bestConf < cf ) {
                bestConf = cf;
                this._mBestGuessProber = prober;
            }
        }
        if( !this._mBestGuessProber ) return 0.0;
        return bestConf;
    };

    this._mActiveNum = 0;
    this._mProbers = [];
    this._mBestGuessProber = null;
}
118
+ CharSetGroupProber.prototype = new CharSetProber();
119
+
120
+ module.exports = CharSetGroupProber
@@ -0,0 +1,104 @@
1
+ /*
2
+ * The Original Code is Mozilla Universal charset detector code.
3
+ *
4
+ * The Initial Developer of the Original Code is
5
+ * Netscape Communications Corporation.
6
+ * Portions created by the Initial Developer are Copyright (C) 2001
7
+ * the Initial Developer. All Rights Reserved.
8
+ *
9
+ * Contributor(s):
10
+ * António Afonso (antonio.afonso gmail.com) - port to JavaScript
11
+ * Mark Pilgrim - port to Python
12
+ * Shy Shalom - original C code
13
+ *
14
+ * This library is free software; you can redistribute it and/or
15
+ * modify it under the terms of the GNU Lesser General Public
16
+ * License as published by the Free Software Foundation; either
17
+ * version 2.1 of the License, or (at your option) any later version.
18
+ *
19
+ * This library is distributed in the hope that it will be useful,
20
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ * Lesser General Public License for more details.
23
+ *
24
+ * You should have received a copy of the GNU Lesser General Public
25
+ * License along with this library; if not, write to the Free Software
26
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ * 02110-1301 USA
28
+ */
29
+
30
+ var constants = require('./constants')
31
+
32
/**
 * Base class of all charset probers.
 *
 * Provides the default prober interface (reset / feed / getState /
 * getConfidence / getCharsetName) plus a few buffer filtering helpers.
 */
function CharSetProber() {
    // Character classes used by filterWithEnglishLetters(); created once per
    // prober instead of once per inspected character. Neither pattern is
    // global or sticky, so test() keeps no state between calls.
    var ALPHA_CHAR = /[a-zA-Z]/;
    var ASCII_CHAR = /^[\x00-\x7F]*$/;

    /**
     * Put the prober back into the "detecting" state.
     */
    this.reset = function() {
        this._mState = constants.detecting;
    };

    /**
     * @returns {?string} null in the base class
     */
    this.getCharsetName = function() {
        return null;
    };

    /**
     * @throws {Error} always; not implemented in the base class
     */
    this.getSupportedCharsetNames = function() {
        throw new Error("Unimplemented method getSupportedCharsetNames()");
    };

    /**
     * Consume a chunk of data. The base class ignores it.
     *
     * @param {string} aBuf data to analyse
     */
    this.feed = function(aBuf) {
    };

    /**
     * @returns {number} the current state as last set by reset()
     */
    this.getState = function() {
        return this._mState;
    };

    /**
     * @returns {number} 0.0 in the base class
     */
    this.getConfidence = function() {
        return 0.0;
    };

    /**
     * Replace every run of characters in the range 0x00-0x7F by one space.
     *
     * @param {string} aBuf input string
     * @returns {string} the filtered string
     */
    this.filterHighBitOnly = function(aBuf) {
        return aBuf.replace(/[\x00-\x7F]+/g, " ");
    };

    /**
     * Replace every run of English letters (A-Z, a-z) by one space.
     *
     * @param {string} aBuf input string
     * @returns {string} the filtered string
     */
    this.filterWithoutEnglishLetters = function(aBuf) {
        return aBuf.replace(/[A-Za-z]+/g, " ");
    };

    /**
     * Input: aBuf is a string containing all different types of characters.
     * Output: a string that contains all alphabetic letters, high-byte
     * characters, and the word immediately preceding `>`, but nothing else
     * within `<>`.
     * Ex: input  - '¡£º <div blah blah> abcdef</div> apples! * and oranges 9jd93jd>'
     *     output - '¡£º blah div apples and oranges jd jd '
     *
     * @param {string} aBuf input string
     * @returns {string} the filtered string
     */
    this.filterWithEnglishLetters = function(aBuf) {
        var result = '';
        var inTag = false;
        // Start of the word currently being collected.
        var prev = 0;

        for (var curr = 0; curr < aBuf.length; curr++) {
            var c = aBuf[curr];

            if (c == '>') {
                inTag = false;
            } else if (c == '<') {
                inTag = true;
            }

            // Letters and high-byte characters extend the current word;
            // any other ASCII character terminates it.
            if (!ASCII_CHAR.test(c) || ALPHA_CHAR.test(c)) {
                continue;
            }

            if (curr > prev && !inTag) {
                result = result + aBuf.substring(prev, curr) + ' ';
            }
            prev = curr + 1;
        }

        // Keep the trailing word unless the buffer ends inside a tag.
        if (!inTag) {
            result = result + aBuf.substring(prev);
        }

        return result;
    };
}
103
+
104
+ module.exports = CharSetProber
@@ -0,0 +1,71 @@
1
+ /*
2
+ * The Original Code is Mozilla Universal charset detector code.
3
+ *
4
+ * The Initial Developer of the Original Code is
5
+ * Netscape Communications Corporation.
6
+ * Portions created by the Initial Developer are Copyright (C) 2001
7
+ * the Initial Developer. All Rights Reserved.
8
+ *
9
+ * Contributor(s):
10
+ * António Afonso (antonio.afonso gmail.com) - port to JavaScript
11
+ * Mark Pilgrim - port to Python
12
+ * Shy Shalom - original C code
13
+ *
14
+ * This library is free software; you can redistribute it and/or
15
+ * modify it under the terms of the GNU Lesser General Public
16
+ * License as published by the Free Software Foundation; either
17
+ * version 2.1 of the License, or (at your option) any later version.
18
+ *
19
+ * This library is distributed in the hope that it will be useful,
20
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ * Lesser General Public License for more details.
23
+ *
24
+ * You should have received a copy of the GNU Lesser General Public
25
+ * License along with this library; if not, write to the Free Software
26
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ * 02110-1301 USA
28
+ */
29
+
30
+ var constants = require('./constants')
31
+
32
/**
 * Table-driven state machine that walks over the bytes of an encoded text.
 *
 * @param {Object} sm state machine model; the fields read here are
 *     classTable, charLenTable, stateTable, classFactor and name
 */
function CodingStateMachine(sm) {
    /**
     * Return to the start state.
     */
    this.reset = function() {
        this._mCurrentState = constants.start;
    };

    /**
     * Advance the machine by one byte.
     *
     * @param {string} c the byte, as a one-character string
     * @returns {number} the state reached after consuming the byte
     */
    this.nextState = function(c) {
        var model = this._mModel;

        // For each byte we get its class.
        var byteCls = model.classTable[c.charCodeAt(0)];

        // If it is the first byte of a character, we also get the
        // character's byte length.
        if( this._mCurrentState == constants.start ) {
            this._mCurrentBytePos = 0;
            this._mCurrentCharLen = model.charLenTable[byteCls];
        }

        // From the byte's class and the state table we get the next state.
        var tableIndex = this._mCurrentState * model.classFactor + byteCls;
        this._mCurrentState = model.stateTable[tableIndex];
        this._mCurrentBytePos++;
        return this._mCurrentState;
    };

    /**
     * @returns {number} byte length recorded for the current character
     */
    this.getCurrentCharLen = function() {
        return this._mCurrentCharLen;
    };

    /**
     * @returns {*} the `name` field of the underlying model
     */
    this.getCodingStateMachine = function() {
        return this._mModel.name;
    };

    this._mModel = sm;
    this._mCurrentBytePos = 0;
    this._mCurrentCharLen = 0;
    this.reset();
}
70
+
71
+ module.exports = CodingStateMachine
@@ -0,0 +1,40 @@
1
+ /*
2
+ * The Original Code is Mozilla Universal charset detector code.
3
+ *
4
+ * The Initial Developer of the Original Code is
5
+ * Netscape Communications Corporation.
6
+ * Portions created by the Initial Developer are Copyright (C) 2001
7
+ * the Initial Developer. All Rights Reserved.
8
+ *
9
+ * Contributor(s):
10
+ * António Afonso (antonio.afonso gmail.com) - port to JavaScript
11
+ * Mark Pilgrim - port to Python
12
+ * Shy Shalom - original C code
13
+ *
14
+ * This library is free software; you can redistribute it and/or
15
+ * modify it under the terms of the GNU Lesser General Public
16
+ * License as published by the Free Software Foundation; either
17
+ * version 2.1 of the License, or (at your option) any later version.
18
+ *
19
+ * This library is distributed in the hope that it will be useful,
20
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
+ * Lesser General Public License for more details.
23
+ *
24
+ * You should have received a copy of the GNU Lesser General Public
25
+ * License along with this library; if not, write to the Free Software
26
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
+ * 02110-1301 USA
28
+ */
29
+
30
+ module.exports = { // shared numeric constants for the probers and the coding state machine
31
+ detecting : 0, // prober state: CharSetProber.reset() puts a prober into this state
32
+ foundIt : 1, // prober state: CharSetGroupProber.feed() compares child results with it; its getConfidence() returns 0.99 in this state
33
+ notMe : 2, // prober state: CharSetGroupProber.feed() deactivates a child that reports it
34
+
35
+ start : 0, // state machine state: CodingStateMachine.reset() returns to it
36
+ error : 1, // NOTE(review): presumably the state machine's "illegal byte sequence" state - confirm against the mbcssm models
37
+ itsMe : 2, // NOTE(review): presumably the state machine's "positively identified" state - confirm against the mbcssm models
38
+
39
+ SHORTCUT_THRESHOLD : 0.95 // NOTE(review): not referenced in the files shown here; presumably a confidence cut-off - confirm against its users
40
+ };