mol_vary 0.0.96 → 0.0.97

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/web.mjs CHANGED
@@ -77,57 +77,140 @@ var $;
77
77
  "use strict";
78
78
  var $;
79
79
  (function ($) {
80
+ const ascii_set = [...`0123456789.,:;()'"- \n`].map(c => c.charCodeAt(0));
81
+ const ascii_map = new Array(0x80).fill(0);
82
+ for (let i = 0; i < ascii_set.length; ++i)
83
+ ascii_map[ascii_set[i]] = i | 0x80;
84
+ const diacr_set = [
85
+ 0x00, 0x01, 0x0F, 0x0B, 0x07, 0x08, 0x12, 0x13,
86
+ 0x02, 0x0C, 0x06, 0x11, 0x03, 0x09, 0x0A, 0x04,
87
+ 0x28, 0x31, 0x27, 0x26, 0x23,
88
+ ];
89
+ const diacr_map = new Array(0x80).fill(0);
90
+ for (let i = 0; i < diacr_set.length; ++i)
91
+ diacr_map[diacr_set[i]] = i | 0x80;
92
+ const wide_offset = 0x0E_00;
93
+ const wide_limit = 128 * 128 * 8 + wide_offset;
94
+ const tine_limit = 128 * 98;
95
+ const full_mode = 0x95;
96
+ const wide_mode = 0x96;
97
+ const tiny_mode = 0x9E;
80
98
  function $mol_charset_ucf_encode(str) {
81
99
  const buf = $mol_charset_buffer(str.length * 3);
82
100
  return buf.slice(0, $mol_charset_ucf_encode_to(str, buf));
83
101
  }
84
102
  $.$mol_charset_ucf_encode = $mol_charset_ucf_encode;
85
- const fast_char = `0123456789.,:;()?!-'" \n`;
86
- const fast_map = new Array(0x80).fill(0);
87
- for (let i = 0; i < fast_char.length; ++i)
88
- fast_map[fast_char[i].charCodeAt(0)] = i | 0x80;
89
103
  function $mol_charset_ucf_encode_to(str, buf, from = 0) {
90
104
  let pos = from;
91
- let mode = 0x9C;
105
+ let mode = tiny_mode;
106
+ const write_high = (code) => {
107
+ buf[pos++] = ((code + 128 - mode) & 0x7F) | 0x80;
108
+ };
109
+ const write_remap = (code) => {
110
+ const fast = ascii_map[code];
111
+ if (fast)
112
+ write_high(fast);
113
+ else
114
+ buf[pos++] = code;
115
+ };
116
+ const write_mode = (m) => {
117
+ write_high(m);
118
+ mode = m;
119
+ };
92
120
  for (let i = 0; i < str.length; i++) {
93
121
  let code = str.charCodeAt(i);
94
122
  if (code >= 0xd800 && code < 0xe000)
95
123
  code = ((code - 0xd800) << 10) + str.charCodeAt(++i) + 0x2400;
96
124
  if (code < 0x80) {
97
- if (mode !== 0x9C) {
98
- const fast = fast_map[code];
99
- if (fast)
100
- code = fast;
101
- else
102
- buf[pos++] = mode = 0x9C;
125
+ if (mode !== tiny_mode) {
126
+ const fast = ascii_map[code];
127
+ if (!fast)
128
+ write_mode(tiny_mode);
103
129
  }
104
130
  buf[pos++] = code;
105
131
  }
106
- else if (code < 0x32_00) {
107
- const page = (code >> 7) + 0x9C;
132
+ else if (code < tine_limit) {
133
+ const page = (code >> 7) + tiny_mode;
134
+ code &= 0x7F;
135
+ if (page === 164) {
136
+ const fast = diacr_map[code];
137
+ if (fast) {
138
+ if (mode !== tiny_mode)
139
+ write_mode(tiny_mode);
140
+ write_high(fast);
141
+ continue;
142
+ }
143
+ }
108
144
  if (mode !== page)
109
- buf[pos++] = mode = page;
110
- buf[pos++] = code & 0x7F;
145
+ write_mode(page);
146
+ write_remap(code);
111
147
  }
112
- else if (code < 0x04_20_00) {
113
- code -= 0x2000;
114
- const page = (code >> 15) + 0x98;
148
+ else if (code < wide_limit) {
149
+ code -= wide_offset;
150
+ const page = (code >> 14) + wide_mode;
115
151
  if (mode !== page)
116
- buf[pos++] = mode = page;
117
- buf[pos++] = code & 0x7F;
118
- buf[pos++] = code >> 7;
152
+ write_mode(page);
153
+ write_remap(code & 0x7F);
154
+ write_remap((code >> 7) & 0x7F);
119
155
  }
120
156
  else {
121
- if (mode !== 0x97)
122
- buf[pos++] = mode = 0x97;
123
- buf[pos++] = code & 0x7F;
124
- buf[pos++] = code >> 7;
125
- buf[pos++] = code >> 15;
157
+ if (mode !== full_mode)
158
+ write_mode(full_mode);
159
+ write_remap(code & 0x7F);
160
+ write_remap((code >> 7) & 0x7F);
161
+ write_remap(code >> 14);
126
162
  }
127
163
  }
164
+ if (mode !== tiny_mode)
165
+ write_mode(tiny_mode);
128
166
  return pos - from;
129
167
  }
130
168
  $.$mol_charset_ucf_encode_to = $mol_charset_ucf_encode_to;
169
+ function $mol_charset_ucf_decode(buffer, mode = tiny_mode) {
170
+ let text = '';
171
+ let pos = 0;
172
+ let page_offset = 0;
173
+ const read_remap = () => {
174
+ let code = buffer[pos++];
175
+ if (code > 0x80)
176
+ code = ((mode + code) & 0x7F) | 0x80;
177
+ return code;
178
+ };
179
+ while (pos < buffer.length) {
180
+ let code = read_remap();
181
+ if (code < full_mode) {
182
+ if (mode === tiny_mode) {
183
+ if (code > 0x80) {
184
+ code = diacr_set[code - 0x080] | (6 << 7);
185
+ }
186
+ }
187
+ else if (!ascii_map[code]) {
188
+ if (code >= 0x80)
189
+ code = ascii_set[code - 0x80];
190
+ if (mode < tiny_mode)
191
+ code |= read_remap() << 7;
192
+ if (mode === full_mode)
193
+ code |= read_remap() << 14;
194
+ code += page_offset;
195
+ }
196
+ text += String.fromCodePoint(code);
197
+ }
198
+ else if (code >= tiny_mode) {
199
+ mode = code;
200
+ page_offset = (mode - tiny_mode) << 7;
201
+ }
202
+ else if (code === full_mode) {
203
+ mode = code;
204
+ page_offset = 0;
205
+ }
206
+ else {
207
+ mode = code;
208
+ page_offset = ((mode - wide_mode) << 14) + wide_offset;
209
+ }
210
+ }
211
+ return text;
212
+ }
213
+ $.$mol_charset_ucf_decode = $mol_charset_ucf_decode;
131
214
  })($ || ($ = {}));
132
215
 
133
216
  ;
@@ -156,45 +239,6 @@ var $;
156
239
  $.$mol_bigint_decode = $mol_bigint_decode;
157
240
  })($ || ($ = {}));
158
241
 
159
- ;
160
- "use strict";
161
- var $;
162
- (function ($) {
163
- const fast_char = `0123456789.,:;()?!-'" \n`;
164
- function $mol_charset_ucf_decode(buffer, mode = 0x9C) {
165
- let text = '';
166
- let pos = 0;
167
- let page_offset = 0;
168
- while (pos < buffer.length) {
169
- let code = buffer[pos++];
170
- if (code < 0x80) {
171
- if (mode < 0x9C)
172
- code |= buffer[pos++] << 7;
173
- if (mode === 0x97)
174
- code |= buffer[pos++] << 15;
175
- text += String.fromCodePoint(page_offset + code);
176
- }
177
- else if (code < 0x97) {
178
- text += fast_char[code - 0x80];
179
- }
180
- else if (code >= 0x9C) {
181
- mode = code;
182
- page_offset = (mode - 0x9C) << 7;
183
- }
184
- else if (code === 0x97) {
185
- mode = code;
186
- page_offset = 0;
187
- }
188
- else {
189
- mode = code;
190
- page_offset = ((mode - 0x98) << 15) + 0x20_00;
191
- }
192
- }
193
- return text;
194
- }
195
- $.$mol_charset_ucf_decode = $mol_charset_ucf_decode;
196
- })($ || ($ = {}));
197
-
198
242
  ;
199
243
  "use strict";
200
244
  var $;
package/web.test.js CHANGED
@@ -1332,15 +1332,46 @@ var $;
1332
1332
  (function ($_1) {
1333
1333
  var $$;
1334
1334
  (function ($$) {
1335
+ function check(text, bytes) {
1336
+ const ideal = new Uint8Array(bytes);
1337
+ const actual = $mol_charset_ucf_encode(text);
1338
+ $mol_assert_equal(actual, ideal);
1339
+ $mol_assert_equal($mol_charset_ucf_decode(actual), text);
1340
+ }
1335
1341
  $mol_test({
1336
- "Complex UCF encoding"($) {
1337
- $mol_assert_equal($mol_charset_ucf_encode('hi мир, 美しい 世界 🏴‍☠\t\n'), new Uint8Array([
1338
- 0x68, 0x69, 0x20,
1339
- 0xA4, 0x3C, 0x38, 0x40, 0x8B, 0x95,
1340
- 0x98, 0x0E, 0xBF, 0xFC, 0x57, 0x44, 0x95,
1341
- 0x98, 0x16, 0x5C, 0x4C, 0xAA, 0x95,
1342
- 0x9B, 0x74, 0xA7, 0xDC, 0x0D, 0xE8, 0x20, 0x9C, 0x09, 0x0A,
1343
- ]));
1342
+ "Full ASCII compatible"($) {
1343
+ check('hi', [0x68, 0x69]);
1344
+ },
1345
+ "1B ASCII with diacritic"($) {
1346
+ check('allo\u0302', [0x61, 0x6C, 0x6C, 0x6F, 0xEA]);
1347
+ },
1348
+ "1B Cyrillic"($) {
1349
+ check('мир', [0x88, 0x3C, 0xE2, 0x40, 0xF8]);
1350
+ },
1351
+ "1B Cyrillic with nummbers and punctuation"($) {
1352
+ check('м.1', [0x88, 0x3C, 0x2E, 0x31, 0xF8]);
1353
+ },
1354
+ "2B Kanji"($) {
1355
+ check('美', [0xF9, 0x0E, 0x63, 0x87]);
1356
+ },
1357
+ "3B rare Kanji"($) {
1358
+ check('𲎯', [0xF7, 0x2F, 0x47, 0x0C, 0x89]);
1359
+ },
1360
+ "1B Kana"($) {
1361
+ check('しい', [0xE0, 0x57, 0x44, 0xA0]);
1362
+ },
1363
+ "2B Emoji with 1B modifiers"($) {
1364
+ check('🏴', [0xFF, 0x74, 0x4B, 0x81]);
1365
+ check('🏴‍☠', [0xFF, 0x74, 0x4B, 0xC1, 0x0D, 0x8C, 0xA9, 0xB4]);
1366
+ },
1367
+ "Mixed scripts"($) {
1368
+ check('allô 美しい мир, 🏴‍☠\n', [
1369
+ 0x61, 0x6C, 0x6C, 0x6F, 0xEA, 0x20,
1370
+ 0xF9, 0x0E, 0x63, 0xE7, 0x57, 0x44, 0x20,
1371
+ 0xA8, 0x3C, 0xE2, 0x40, 0x2C, 0x20,
1372
+ 0xF7, 0x74, 0x4B, 0xC1, 0x0D, 0x8C, 0xA9, 0x0A,
1373
+ 0xB4,
1374
+ ]);
1344
1375
  },
1345
1376
  });
1346
1377
  })($$ = $_1.$$ || ($_1.$$ = {}));
@@ -1386,26 +1417,6 @@ var $;
1386
1417
  })($$ = $_1.$$ || ($_1.$$ = {}));
1387
1418
  })($ || ($ = {}));
1388
1419
 
1389
- ;
1390
- "use strict";
1391
- var $;
1392
- (function ($_1) {
1393
- var $$;
1394
- (function ($$) {
1395
- $mol_test({
1396
- "Complex UCF eecoding"($) {
1397
- $mol_assert_equal('hi мир, 美しい 世界 🏴‍☠\t\n', $mol_charset_ucf_decode(new Uint8Array([
1398
- 0x68, 0x69, 0x20,
1399
- 0xA4, 0x3C, 0x38, 0x40, 0x8B, 0x95,
1400
- 0x98, 0x0E, 0xBF, 0xFC, 0x57, 0x44, 0x95,
1401
- 0x98, 0x16, 0x5C, 0x4C, 0xAA, 0x95,
1402
- 0x9B, 0x74, 0xA7, 0xDC, 0x0D, 0xE8, 0x20, 0x9C, 0x09, 0x0A,
1403
- ])));
1404
- },
1405
- });
1406
- })($$ = $_1.$$ || ($_1.$$ = {}));
1407
- })($ || ($ = {}));
1408
-
1409
1420
  ;
1410
1421
  "use strict";
1411
1422
  var $;
@@ -1649,11 +1660,11 @@ var $;
1649
1660
  },
1650
1661
  "vary pack text"($) {
1651
1662
  check(['foo'], [text | 3, ...str('foo')]);
1652
- check(['абв'], [text | 4, ...str('абв')]);
1663
+ check(['абв'], [text | 5, ...str('абв')]);
1653
1664
  const long_lat = 'abcdefghijklmnopqrst';
1654
1665
  check([long_lat], [text | L1, 20, ...str(long_lat)]);
1655
1666
  const long_cyr = 'абвгдеёжзийклмнопрст';
1656
- check([long_cyr], [text | L1, 21, ...str(long_cyr)]);
1667
+ check([long_cyr], [text | L1, 22, ...str(long_cyr)]);
1657
1668
  },
1658
1669
  "vary pack dedup text"($) {
1659
1670
  check([["f", "f"]], [list | 2, text | 1, ...str('f'), link | 0]);