nokolexbor 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/ext/nokolexbor/nl_attribute.c +46 -0
  3. data/ext/nokolexbor/nl_cdata.c +8 -0
  4. data/ext/nokolexbor/nl_comment.c +6 -0
  5. data/ext/nokolexbor/nl_document.c +53 -7
  6. data/ext/nokolexbor/nl_document_fragment.c +9 -0
  7. data/ext/nokolexbor/nl_error.c +21 -19
  8. data/ext/nokolexbor/nl_node.c +255 -49
  9. data/ext/nokolexbor/nl_node_set.c +56 -1
  10. data/ext/nokolexbor/nl_processing_instruction.c +6 -0
  11. data/ext/nokolexbor/nl_text.c +6 -0
  12. data/ext/nokolexbor/nokolexbor.h +1 -0
  13. data/lib/nokolexbor/document.rb +52 -5
  14. data/lib/nokolexbor/document_fragment.rb +11 -0
  15. data/lib/nokolexbor/node.rb +367 -18
  16. data/lib/nokolexbor/node_set.rb +56 -0
  17. data/lib/nokolexbor/version.rb +1 -1
  18. metadata +2 -24
  19. data/vendor/lexbor/source/lexbor/encoding/base.h +0 -218
  20. data/vendor/lexbor/source/lexbor/encoding/big5.c +0 -42839
  21. data/vendor/lexbor/source/lexbor/encoding/config.cmake +0 -12
  22. data/vendor/lexbor/source/lexbor/encoding/const.h +0 -65
  23. data/vendor/lexbor/source/lexbor/encoding/decode.c +0 -3193
  24. data/vendor/lexbor/source/lexbor/encoding/decode.h +0 -370
  25. data/vendor/lexbor/source/lexbor/encoding/encode.c +0 -1931
  26. data/vendor/lexbor/source/lexbor/encoding/encode.h +0 -377
  27. data/vendor/lexbor/source/lexbor/encoding/encoding.c +0 -252
  28. data/vendor/lexbor/source/lexbor/encoding/encoding.h +0 -475
  29. data/vendor/lexbor/source/lexbor/encoding/euc_kr.c +0 -53883
  30. data/vendor/lexbor/source/lexbor/encoding/gb18030.c +0 -47905
  31. data/vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c +0 -159
  32. data/vendor/lexbor/source/lexbor/encoding/jis0208.c +0 -22477
  33. data/vendor/lexbor/source/lexbor/encoding/jis0212.c +0 -15787
  34. data/vendor/lexbor/source/lexbor/encoding/multi.h +0 -53
  35. data/vendor/lexbor/source/lexbor/encoding/range.c +0 -71
  36. data/vendor/lexbor/source/lexbor/encoding/range.h +0 -34
  37. data/vendor/lexbor/source/lexbor/encoding/res.c +0 -222
  38. data/vendor/lexbor/source/lexbor/encoding/res.h +0 -34
  39. data/vendor/lexbor/source/lexbor/encoding/single.c +0 -13748
  40. data/vendor/lexbor/source/lexbor/encoding/single.h +0 -116
@@ -1,3193 +0,0 @@
1
- /*
2
- * Copyright (C) 2019 Alexander Borisov
3
- *
4
- * Author: Alexander Borisov <borisov@lexbor.com>
5
- */
6
-
7
- #include "lexbor/encoding/decode.h"
8
- #include "lexbor/encoding/single.h"
9
- #include "lexbor/encoding/multi.h"
10
- #include "lexbor/encoding/range.h"
11
-
12
-
13
/*
 * Helper macros shared by the decoders in this file. They are not hygienic:
 * most of them read caller locals by name (ctx, data, p, ch, need, needed,
 * byte) and several contain `return` statements that leave the calling
 * function.
 */

/*
 * Validate one UTF-8 continuation byte at *p against [_lower, _upper].
 * Out of range: resets ctx->u.utf_8.lower and ctx->u.utf_8.need, emits the
 * replacement sequence (if it does not fit, stores p in *data and sets
 * ctx->have_error before returning), then executes `_cont`.
 * In range: advances p, decrements the caller's `need` and shifts the low
 * six bits of the byte into ctx->codepoint.
 */
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY(_lower, _upper, _cont) \
14
- { \
15
- ch = *p; \
16
- \
17
- if (ch < _lower || ch > _upper) { \
18
- ctx->u.utf_8.lower = 0x00; \
19
- ctx->u.utf_8.need = 0; \
20
- \
21
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
22
- *data = p; \
23
- ctx->have_error = true; \
24
- } \
25
- LXB_ENCODING_DECODE_ERROR_END(); \
26
- \
27
- _cont; \
28
- } \
29
- else { \
30
- p++; \
31
- need--; \
32
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F); \
33
- } \
34
- }
35
-
36
/*
 * Set the allowed range for the next continuation byte when the caller's
 * `ch` equals one of two special lead bytes: `first` gives
 * [f_lower, 0xBF], `two` gives [0x80, s_upper]. Any other `ch` leaves
 * ctx->u.utf_8.lower/upper untouched.
 */
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(first, two, f_lower, s_upper) \
37
- do { \
38
- if (ch == first) { \
39
- ctx->u.utf_8.lower = f_lower; \
40
- ctx->u.utf_8.upper = 0xBF; \
41
- } \
42
- else if (ch == two) { \
43
- ctx->u.utf_8.lower = 0x80; \
44
- ctx->u.utf_8.upper = s_upper; \
45
- } \
46
- } \
47
- while (0)
48
-
49
/* Append `cp` to the output buffer with no capacity check. */
- #define LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, cp) \
50
- do { \
51
- (ctx)->buffer_out[(ctx)->buffer_used++] = (cp); \
52
- } \
53
- while (0)
54
-
55
/*
 * Append `cp` to the output buffer; returns LXB_STATUS_SMALL_BUFFER from
 * the calling function when the buffer is already full.
 */
- #define LXB_ENCODING_DECODE_APPEND(ctx, cp) \
56
- do { \
57
- if ((ctx)->buffer_used >= (ctx)->buffer_length) { \
58
- return LXB_STATUS_SMALL_BUFFER; \
59
- } \
60
- \
61
- (ctx)->buffer_out[(ctx)->buffer_used++] = (cp); \
62
- } \
63
- while (0)
64
-
65
/*
 * Same as LXB_ENCODING_DECODE_APPEND, but stores the caller's cursor `p`
 * into *data before returning LXB_STATUS_SMALL_BUFFER.
 */
- #define LXB_ENCODING_DECODE_APPEND_P(ctx, cp) \
66
- do { \
67
- if ((ctx)->buffer_used >= (ctx)->buffer_length) { \
68
- *data = p; \
69
- return LXB_STATUS_SMALL_BUFFER; \
70
- } \
71
- \
72
- (ctx)->buffer_out[(ctx)->buffer_used++] = (cp); \
73
- } \
74
- while (0)
75
-
76
/* Return LXB_STATUS_SMALL_BUFFER from the caller if the output is full. */
- #define LXB_ENCODING_DECODE_CHECK_OUT(ctx) \
77
- do { \
78
- if ((ctx)->buffer_used >= (ctx)->buffer_length) { \
79
- return LXB_STATUS_SMALL_BUFFER; \
80
- } \
81
- } \
82
- while (0)
83
-
84
/*
 * First half of an unbalanced BEGIN/END pair; it must be followed by a
 * `{ ... }` block and LXB_ENCODING_DECODE_ERROR_END().
 * Returns LXB_STATUS_ERROR when ctx->replace_to is NULL. The block placed
 * between BEGIN and END runs only when the replacement sequence does not
 * fit into the output buffer, immediately before LXB_STATUS_SMALL_BUFFER is
 * returned (callers use it to save resume state).
 */
- #define LXB_ENCODING_DECODE_ERROR_BEGIN \
85
- do { \
86
- if (ctx->replace_to == NULL) { \
87
- return LXB_STATUS_ERROR; \
88
- } \
89
- \
90
- if ((ctx->buffer_used + ctx->replace_len) > ctx->buffer_length) { \
91
- do
92
-
93
/*
 * Second half of the pair: closes the caller's block, returns
 * LXB_STATUS_SMALL_BUFFER on the no-space path, otherwise copies
 * ctx->replace_len code points from ctx->replace_to into the output.
 */
- #define LXB_ENCODING_DECODE_ERROR_END() \
94
- while (0); \
95
- \
96
- return LXB_STATUS_SMALL_BUFFER; \
97
- } \
98
- \
99
- memcpy(&ctx->buffer_out[ctx->buffer_used], ctx->replace_to, \
100
- sizeof(lxb_codepoint_t) * ctx->replace_len); \
101
- \
102
- ctx->buffer_used += ctx->replace_len; \
103
- } \
104
- while (0)
105
-
106
/* Emit the replacement sequence with no extra state saved on overflow. */
- #define LXB_ENCODING_DECODE_ERROR(ctx) \
107
- do { \
108
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
109
- } LXB_ENCODING_DECODE_ERROR_END(); \
110
- } \
111
- while (0)
112
-
113
/*
 * Report an invalid multi-byte sequence. If the caller's `byte` is below
 * 0x80 the input cursor is stepped back one so that byte is read again.
 * When the replacement does not fit, sets ctx->have_error and stores 0x01
 * in `ident` before LXB_STATUS_SMALL_BUFFER is returned.
 */
- #define LXB_ENCODING_DECODE_FAILED(ident) \
114
- do { \
115
- if ((byte) < (0x80)) { \
116
- (*data)--; \
117
- } \
118
- \
119
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
120
- ctx->have_error = true; \
121
- (ident) = 0x01; \
122
- } \
123
- LXB_ENCODING_DECODE_ERROR_END(); \
124
- } \
125
- while (0)
126
-
127
/*
 * Body of every single-byte decoder: bytes below 0x80 are appended as-is,
 * other bytes are looked up in decode_map[byte - 0x80]. An entry equal to
 * LXB_ENCODING_ERROR_CODEPOINT emits the replacement sequence (rewinding
 * *data to the offending byte if it does not fit). *data is updated after
 * every successfully appended code point.
 */
- #define LXB_ENCODING_DECODE_SINGLE(decode_map) \
128
- do { \
129
- const lxb_char_t *p = *data; \
130
- \
131
- while (p < end) { \
132
- if (*p < 0x80) { \
133
- LXB_ENCODING_DECODE_APPEND_P(ctx, *p++); \
134
- } \
135
- else { \
136
- ctx->codepoint = decode_map[(*p++) - 0x80].codepoint; \
137
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) { \
138
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
139
- *data = p - 1; \
140
- } \
141
- LXB_ENCODING_DECODE_ERROR_END(); \
142
- continue; \
143
- } \
144
- \
145
- LXB_ENCODING_DECODE_APPEND_P(ctx, ctx->codepoint); \
146
- } \
147
- \
148
- *data = p; \
149
- } \
150
- } \
151
- while (0)
152
-
153
/*
 * Variant of the continuation-byte check that works directly on *data:
 * jumps to the caller's `failed` label when the byte is outside
 * [lower, upper]; otherwise advances *data, decrements the caller's
 * `needed` and accumulates six bits into ctx->codepoint.
 */
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(lower, upper) \
154
- do { \
155
- ch = **data; \
156
- \
157
- if (ch < lower || ch > upper) { \
158
- goto failed; \
159
- } \
160
- \
161
- (*data)++; \
162
- needed--; \
163
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F); \
164
- } \
165
- while (0)
166
-
167
/*
 * Expands to the same statements as LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET.
 */
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(first, two, f_lower, \
168
- s_upper) \
169
- do { \
170
- if (ch == first) { \
171
- ctx->u.utf_8.lower = f_lower; \
172
- ctx->u.utf_8.upper = 0xBF; \
173
- } \
174
- else if (ch == two) { \
175
- ctx->u.utf_8.lower = 0x80; \
176
- ctx->u.utf_8.upper = s_upper; \
177
- } \
178
- } \
179
- while (0)
180
-
181
-
182
/* Default decoder: forwards unchanged to lxb_encoding_decode_utf_8(). */
- lxb_status_t
183
- lxb_encoding_decode_default(lxb_encoding_decode_t *ctx,
184
- const lxb_char_t **data, const lxb_char_t *end)
185
- {
186
- return lxb_encoding_decode_utf_8(ctx, data, end);
187
- }
188
-
189
/*
 * Placeholder decoder for the "auto" encoding: decodes nothing, moves *data
 * to `end` and always returns LXB_STATUS_ERROR. `ctx` is not used.
 */
- lxb_status_t
190
- lxb_encoding_decode_auto(lxb_encoding_decode_t *ctx,
191
- const lxb_char_t **data, const lxb_char_t *end)
192
- {
193
- *data = end;
194
- return LXB_STATUS_ERROR;
195
- }
196
-
197
/*
 * Placeholder decoder for the "undefined" encoding: decodes nothing, moves
 * *data to `end` and always returns LXB_STATUS_ERROR. `ctx` is not used.
 */
- lxb_status_t
198
- lxb_encoding_decode_undefined(lxb_encoding_decode_t *ctx,
199
- const lxb_char_t **data, const lxb_char_t *end)
200
- {
201
- *data = end;
202
- return LXB_STATUS_ERROR;
203
- }
204
-
205
- lxb_status_t
206
- lxb_encoding_decode_big5(lxb_encoding_decode_t *ctx,
207
- const lxb_char_t **data, const lxb_char_t *end)
208
- {
209
- uint32_t index;
210
- lxb_char_t lead, byte;
211
-
212
- ctx->status = LXB_STATUS_OK;
213
-
214
- if (ctx->u.lead != 0x00) {
215
- if (ctx->have_error) {
216
- ctx->u.lead = 0x00;
217
- ctx->have_error = false;
218
-
219
- LXB_ENCODING_DECODE_ERROR_BEGIN {
220
- ctx->u.lead = 0x01;
221
- ctx->have_error = true;
222
- } LXB_ENCODING_DECODE_ERROR_END();
223
- }
224
- else if (ctx->second_codepoint != 0x0000) {
225
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
226
- return LXB_STATUS_SMALL_BUFFER;
227
- }
228
-
229
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->u.lead);
230
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->second_codepoint);
231
-
232
- ctx->u.lead = 0x00;
233
- ctx->second_codepoint = 0x0000;
234
- }
235
- else {
236
- if (*data >= end) {
237
- ctx->status = LXB_STATUS_CONTINUE;
238
-
239
- return LXB_STATUS_CONTINUE;
240
- }
241
-
242
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
243
-
244
- lead = (lxb_char_t) ctx->u.lead;
245
- ctx->u.lead = 0x00;
246
-
247
- goto lead_state;
248
- }
249
- }
250
-
251
- while (*data < end) {
252
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
253
-
254
- lead = *(*data)++;
255
-
256
- if (lead < 0x80) {
257
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
258
- continue;
259
- }
260
-
261
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
262
- LXB_ENCODING_DECODE_ERROR_BEGIN {
263
- (*data)--;
264
- }
265
- LXB_ENCODING_DECODE_ERROR_END();
266
-
267
- continue;
268
- }
269
-
270
- if (*data >= end) {
271
- ctx->u.lead = lead;
272
- ctx->status = LXB_STATUS_CONTINUE;
273
-
274
- return LXB_STATUS_CONTINUE;
275
- }
276
-
277
- lead_state:
278
-
279
- index = 0;
280
- byte = *(*data)++;
281
-
282
- if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
283
- || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
284
- {
285
- if (byte < 0x7F) {
286
- /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
287
- index = (lead - 0x81) * 157 + (byte - 0x40);
288
- }
289
- else {
290
- /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
291
- index = (lead - 0x81) * 157 + (byte - 0x62);
292
- }
293
- }
294
-
295
- /*
296
- * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
297
- * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
298
- * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
299
- * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
300
- */
301
- switch (index) {
302
- case 1133:
303
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
304
- ctx->u.lead = 0x00CA;
305
- ctx->second_codepoint = 0x0304;
306
-
307
- return LXB_STATUS_SMALL_BUFFER;
308
- }
309
-
310
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
311
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);
312
-
313
- continue;
314
-
315
- case 1135:
316
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
317
- ctx->u.lead = 0x00CA;
318
- ctx->second_codepoint = 0x030C;
319
-
320
- return LXB_STATUS_SMALL_BUFFER;
321
- }
322
-
323
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
324
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);
325
-
326
- continue;
327
-
328
- case 1164:
329
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
330
- ctx->u.lead = 0x00EA;
331
- ctx->second_codepoint = 0x0304;
332
-
333
- return LXB_STATUS_SMALL_BUFFER;
334
- }
335
-
336
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
337
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);
338
-
339
- continue;
340
-
341
- case 1166:
342
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
343
- ctx->u.lead = 0x00EA;
344
- ctx->second_codepoint = 0x030C;
345
-
346
- return LXB_STATUS_SMALL_BUFFER;
347
- }
348
-
349
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
350
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);
351
-
352
- continue;
353
-
354
- case 0:
355
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
356
- continue;
357
- }
358
-
359
- ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
360
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
361
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
362
- continue;
363
- }
364
-
365
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
366
- }
367
-
368
- return LXB_STATUS_OK;
369
- }
370
-
371
- lxb_status_t
372
- lxb_encoding_decode_euc_jp(lxb_encoding_decode_t *ctx,
373
- const lxb_char_t **data, const lxb_char_t *end)
374
- {
375
- bool is_jis0212;
376
- lxb_char_t byte, lead;
377
-
378
- ctx->status = LXB_STATUS_OK;
379
-
380
- if (ctx->u.euc_jp.lead != 0x00) {
381
- if (ctx->have_error) {
382
- ctx->have_error = false;
383
- ctx->u.euc_jp.lead = 0x00;
384
-
385
- LXB_ENCODING_DECODE_ERROR_BEGIN {
386
- ctx->have_error = true;
387
- ctx->u.euc_jp.lead = 0x01;
388
- } LXB_ENCODING_DECODE_ERROR_END();
389
- }
390
- else {
391
- if (*data >= end) {
392
- ctx->status = LXB_STATUS_CONTINUE;
393
-
394
- return LXB_STATUS_CONTINUE;
395
- }
396
-
397
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
398
-
399
- lead = ctx->u.euc_jp.lead;
400
- byte = *(*data)++;
401
-
402
- ctx->u.euc_jp.lead = 0x00;
403
-
404
- if (ctx->u.euc_jp.is_jis0212) {
405
- is_jis0212 = true;
406
- ctx->u.euc_jp.is_jis0212 = false;
407
-
408
- goto lead_jis_state;
409
- }
410
-
411
- goto lead_state;
412
- }
413
- }
414
-
415
- while (*data < end) {
416
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
417
-
418
- lead = *(*data)++;
419
-
420
- if (lead < 0x80) {
421
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
422
- continue;
423
- }
424
-
425
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
426
- && (lead != 0x8E && lead != 0x8F))
427
- {
428
- LXB_ENCODING_DECODE_ERROR_BEGIN {
429
- (*data)--;
430
- }
431
- LXB_ENCODING_DECODE_ERROR_END();
432
-
433
- continue;
434
- }
435
-
436
- if (*data >= end) {
437
- ctx->u.euc_jp.lead = lead;
438
- ctx->status = LXB_STATUS_CONTINUE;
439
-
440
- return LXB_STATUS_CONTINUE;
441
- }
442
-
443
- byte = *(*data)++;
444
-
445
- lead_state:
446
-
447
- if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
448
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + byte);
449
- continue;
450
- }
451
-
452
- is_jis0212 = false;
453
-
454
- if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
455
- if (*data >= end) {
456
- ctx->u.euc_jp.lead = byte;
457
- ctx->u.euc_jp.is_jis0212 = true;
458
-
459
- ctx->status = LXB_STATUS_CONTINUE;
460
-
461
- return LXB_STATUS_CONTINUE;
462
- }
463
-
464
- lead = byte;
465
- byte = *(*data)++;
466
- is_jis0212 = true;
467
- }
468
-
469
- lead_jis_state:
470
-
471
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
472
- || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
473
- {
474
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
475
- continue;
476
- }
477
-
478
- /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
479
- ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
480
-
481
- if (is_jis0212) {
482
- if ((sizeof(lxb_encoding_multi_index_jis0212)
483
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
484
- {
485
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
486
- continue;
487
- }
488
-
489
- ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
490
- }
491
- else {
492
- if ((sizeof(lxb_encoding_multi_index_jis0208)
493
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
494
- {
495
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
496
- continue;
497
- }
498
-
499
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
500
- }
501
-
502
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
503
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
504
- continue;
505
- }
506
-
507
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
508
- }
509
-
510
- return LXB_STATUS_OK;
511
- }
512
-
513
- lxb_status_t
514
- lxb_encoding_decode_euc_kr(lxb_encoding_decode_t *ctx,
515
- const lxb_char_t **data, const lxb_char_t *end)
516
- {
517
- lxb_char_t lead, byte;
518
-
519
- ctx->status = LXB_STATUS_OK;
520
-
521
- if (ctx->u.lead != 0x00) {
522
- if (ctx->have_error) {
523
- ctx->have_error = false;
524
- ctx->u.lead = 0x00;
525
-
526
- LXB_ENCODING_DECODE_ERROR_BEGIN {
527
- ctx->have_error = true;
528
- ctx->u.lead = 0x01;
529
- } LXB_ENCODING_DECODE_ERROR_END();
530
- }
531
- else {
532
- if (*data >= end) {
533
- ctx->status = LXB_STATUS_CONTINUE;
534
-
535
- return LXB_STATUS_CONTINUE;
536
- }
537
-
538
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
539
-
540
- lead = (lxb_char_t) ctx->u.lead;
541
- ctx->u.lead = 0x00;
542
-
543
- goto lead_state;
544
- }
545
- }
546
-
547
- while (*data < end) {
548
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
549
-
550
- lead = *(*data)++;
551
-
552
- if (lead < 0x80) {
553
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
554
- continue;
555
- }
556
-
557
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
558
- LXB_ENCODING_DECODE_ERROR_BEGIN {
559
- (*data)--;
560
- }
561
- LXB_ENCODING_DECODE_ERROR_END();
562
-
563
- continue;
564
- }
565
-
566
- if (*data == end) {
567
- ctx->u.lead = lead;
568
- ctx->status = LXB_STATUS_CONTINUE;
569
-
570
- return LXB_STATUS_CONTINUE;
571
- }
572
-
573
- lead_state:
574
-
575
- byte = *(*data)++;
576
-
577
- if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
578
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
579
- continue;
580
- }
581
-
582
- /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
583
- ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
584
-
585
- if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
586
- / sizeof(lxb_encoding_multi_index_t))
587
- {
588
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
589
- continue;
590
- }
591
-
592
- ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
593
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
594
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
595
- continue;
596
- }
597
-
598
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
599
- }
600
-
601
- return LXB_STATUS_OK;
602
- }
603
-
604
/* GBK decoder: forwards unchanged to lxb_encoding_decode_gb18030(). */
- lxb_status_t
605
- lxb_encoding_decode_gbk(lxb_encoding_decode_t *ctx,
606
- const lxb_char_t **data, const lxb_char_t *end)
607
- {
608
- return lxb_encoding_decode_gb18030(ctx, data, end);
609
- }
610
-
611
/*
 * IBM866 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_ibm866 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
612
- lxb_encoding_decode_ibm866(lxb_encoding_decode_t *ctx,
613
- const lxb_char_t **data, const lxb_char_t *end)
614
- {
615
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_ibm866);
616
-
617
- return LXB_STATUS_OK;
618
- }
619
-
620
/*
 * Streaming ISO-2022-JP decoder, implemented as a state machine over
 * ctx->u.iso_2022_jp (state, out_state, out_flag, lead, prepand).
 *
 * Each input byte is dispatched on iso->state. Escape sequences
 * (0x1B followed by 0x24 or 0x28 and one more byte) switch the state;
 * an unrecognised sequence restores iso->out_state and re-feeds the bytes.
 * iso->prepand carries a byte that must be processed before new input on
 * the next call; ctx->have_error marks a replacement sequence that could
 * not be written yet.
 *
 * Returns LXB_STATUS_OK when the input is exhausted at a sequence boundary,
 * LXB_STATUS_CONTINUE when it ends inside an escape or two-byte sequence,
 * LXB_STATUS_SMALL_BUFFER when the output is full, or LXB_STATUS_ERROR
 * (from the error macros) when no replacement sequence is configured.
 */
- lxb_status_t
621
- lxb_encoding_decode_iso_2022_jp(lxb_encoding_decode_t *ctx,
622
- const lxb_char_t **data, const lxb_char_t *end)
623
- {
/* Local helper: leave with LXB_STATUS_OK once all input is consumed. */
624
- #define LXB_ENCODING_DECODE_ISO_2022_JP_OK() \
625
- do { \
626
- if (*data >= end) { \
627
- return LXB_STATUS_OK; \
628
- } \
629
- } \
630
- while (0)
631
-
/* Local helper: leave with LXB_STATUS_CONTINUE (more bytes are required). */
632
- #define LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE() \
633
- do { \
634
- if (*data >= end) { \
635
- ctx->status = LXB_STATUS_CONTINUE; \
636
- return LXB_STATUS_CONTINUE; \
637
- } \
638
- } \
639
- while (0)
640
-
641
-
642
- lxb_char_t byte;
643
- lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
644
-
645
- ctx->status = LXB_STATUS_OK;
646
-
/* Emit the replacement sequence that did not fit during the last call. */
647
- if (ctx->have_error) {
648
- ctx->have_error = false;
649
-
650
- LXB_ENCODING_DECODE_ERROR_BEGIN {
651
- ctx->have_error = true;
652
- }
653
- LXB_ENCODING_DECODE_ERROR_END();
654
- }
655
-
/* A byte saved by an earlier call is processed before any new input. */
656
- if (iso->prepand != 0x00) {
657
- if (*data >= end) {
658
- ctx->status = LXB_STATUS_CONTINUE;
659
-
660
- return LXB_STATUS_CONTINUE;
661
- }
662
-
663
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
664
-
665
- byte = iso->prepand;
666
- iso->prepand = 0x00;
667
-
668
- goto prepand;
669
- }
670
-
671
- if (*data >= end) {
672
- return LXB_STATUS_OK;
673
- }
674
-
/*
 * The loop has no exit condition of its own; every path leaves through the
 * OK()/CONTINUE() helpers or through a return inside the error macros.
 */
675
- do {
676
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
677
-
678
- byte = *(*data)++;
679
-
680
- prepand:
681
-
682
- switch (iso->state) {
683
- case LXB_ENCODING_DECODE_2022_JP_ASCII:
684
- if (byte == 0x1B) {
685
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
686
-
687
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
688
- break;
689
- }
690
-
691
- /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
692
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
693
- && byte != 0x0E && byte != 0x0F)
694
- {
695
- iso->out_flag = false;
696
-
697
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
698
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
699
- break;
700
- }
701
-
702
- iso->out_flag = false;
703
-
704
- LXB_ENCODING_DECODE_ERROR_BEGIN {
705
- ctx->have_error = true;
706
- }
707
- LXB_ENCODING_DECODE_ERROR_END();
708
-
709
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
710
- break;
711
-
/* Roman state: like ASCII, except 0x5C -> U+00A5 and 0x7E -> U+203E. */
712
- case LXB_ENCODING_DECODE_2022_JP_ROMAN:
713
- switch (byte) {
714
- case 0x1B:
715
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
716
-
717
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
718
- continue;
719
-
720
- case 0x5C:
721
- iso->out_flag = false;
722
-
723
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00A5);
724
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
725
-
726
- continue;
727
-
728
- case 0x7E:
729
- iso->out_flag = false;
730
-
731
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x203E);
732
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
733
-
734
- continue;
735
-
/* 0x0E and 0x0F fall out of the inner switch into the error path below. */
736
- case 0x0E:
737
- case 0x0F:
738
- break;
739
-
740
- default:
741
- /* 0x00 to 0x7F */
742
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
743
- iso->out_flag = false;
744
-
745
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
746
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
747
-
748
- continue;
749
- }
750
-
751
- break;
752
- }
753
-
754
- iso->out_flag = false;
755
-
756
- LXB_ENCODING_DECODE_ERROR_BEGIN {
757
- ctx->have_error = true;
758
- }
759
- LXB_ENCODING_DECODE_ERROR_END();
760
-
761
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
762
- break;
763
-
764
- case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
765
- if (byte == 0x1B) {
766
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
767
-
768
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
769
- break;
770
- }
771
-
772
- /* 0x21 to 0x5F */
773
- if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
774
- iso->out_flag = false;
775
-
776
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx,
777
- 0xFF61 - 0x21 + byte);
778
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
779
- break;
780
- }
781
-
782
- iso->out_flag = false;
783
-
784
- LXB_ENCODING_DECODE_ERROR_BEGIN {
785
- ctx->have_error = true;
786
- }
787
- LXB_ENCODING_DECODE_ERROR_END();
788
-
789
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
790
- break;
791
-
/* First byte of a two-byte sequence: stored in iso->lead. */
792
- case LXB_ENCODING_DECODE_2022_JP_LEAD:
793
- if (byte == 0x1B) {
794
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
795
-
796
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
797
- break;
798
- }
799
-
800
- /* 0x21 to 0x7E */
801
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
802
- iso->out_flag = false;
803
- iso->lead = byte;
804
- iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
805
-
806
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
807
- break;
808
- }
809
-
810
- iso->out_flag = false;
811
-
812
- LXB_ENCODING_DECODE_ERROR_BEGIN {
813
- ctx->have_error = true;
814
- }
815
- LXB_ENCODING_DECODE_ERROR_END();
816
-
817
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
818
- break;
819
-
/* Second byte of a two-byte sequence: looked up in the jis0208 table. */
820
- case LXB_ENCODING_DECODE_2022_JP_TRAIL:
821
- if (byte == 0x1B) {
822
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
823
-
824
- LXB_ENCODING_DECODE_ERROR_BEGIN {
825
- ctx->have_error = true;
826
- }
827
- LXB_ENCODING_DECODE_ERROR_END();
828
-
829
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
830
- break;
831
- }
832
-
833
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
834
-
835
- /* 0x21 to 0x7E */
836
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
837
- /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
838
- ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
839
-
/*
 * NOTE(review): unlike the EUC-JP and Shift_JIS decoders, the index is not
 * compared against the table size before the lookup; this relies on the
 * jis0208 table having more than 8835 entries -- confirm.
 */
840
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
841
-
842
- if (ctx->codepoint != LXB_ENCODING_ERROR_CODEPOINT) {
843
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
844
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
845
-
846
- break;
847
- }
848
- }
849
-
850
- LXB_ENCODING_DECODE_ERROR_BEGIN {
851
- iso->prepand = 0x01;
852
- ctx->have_error = true;
853
- }
854
- LXB_ENCODING_DECODE_ERROR_END();
855
-
856
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
857
- break;
858
-
/* Byte after 0x1B: only 0x24 and 0x28 continue an escape sequence. */
859
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
860
- if (byte == 0x24 || byte == 0x28) {
861
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
862
- iso->lead = byte;
863
-
864
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
865
- break;
866
- }
867
-
/* Not an escape sequence: un-read the byte and fall back to out_state. */
868
- (*data)--;
869
-
870
- iso->out_flag = false;
871
- iso->state = ctx->u.iso_2022_jp.out_state;
872
-
873
- LXB_ENCODING_DECODE_ERROR_BEGIN {
874
- iso->prepand = 0x01;
875
- ctx->have_error = true;
876
- }
877
- LXB_ENCODING_DECODE_ERROR_END();
878
-
879
- break;
880
-
/* Final escape byte: selects the new state from (iso->lead, byte). */
881
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
882
- iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
883
-
884
- if (iso->lead == 0x28) {
885
- if (byte == 0x42) {
886
- iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
887
- }
888
- else if (byte == 0x4A) {
889
- iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
890
- }
891
- else if (byte == 0x49) {
892
- iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
893
- }
894
- }
895
- else if (iso->lead == 0x24) {
896
- if (byte == 0x40 || byte == 0x42) {
897
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
898
- }
899
- }
900
-
/*
 * Unknown sequence: un-read the final byte, restore out_state, report an
 * error and re-process the saved lead byte through the `prepand` label.
 */
901
- if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
902
- (*data)--;
903
-
904
- iso->out_flag = false;
905
- iso->state = iso->out_state;
906
-
907
- LXB_ENCODING_DECODE_ERROR_BEGIN {
908
- iso->prepand = iso->lead;
909
- iso->lead = 0x00;
910
-
911
- ctx->have_error = true;
912
- }
913
- LXB_ENCODING_DECODE_ERROR_END();
914
-
915
- byte = iso->lead;
916
- iso->lead = 0x00;
917
-
918
- goto prepand;
919
- }
920
-
921
- iso->lead = 0x00;
922
- iso->out_state = iso->state;
923
-
/* out_flag still set here means no byte was output since the last escape. */
924
- if (iso->out_flag) {
925
- LXB_ENCODING_DECODE_ERROR_BEGIN {
926
- ctx->have_error = true;
927
- }
928
- LXB_ENCODING_DECODE_ERROR_END();
929
-
930
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
931
- break;
932
- }
933
-
934
- iso->out_flag = true;
935
-
936
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
937
- break;
938
- }
939
- }
940
- while (true);
941
-
942
- return LXB_STATUS_OK;
943
-
944
- #undef LXB_ENCODING_DECODE_ISO_2022_JP_OK
945
- #undef LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE
946
- }
947
-
948
/*
 * ISO-8859-10 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_10 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
949
- lxb_encoding_decode_iso_8859_10(lxb_encoding_decode_t *ctx,
950
- const lxb_char_t **data, const lxb_char_t *end)
951
- {
952
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_10);
953
-
954
- return LXB_STATUS_OK;
955
- }
956
-
957
/*
 * ISO-8859-13 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_13 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
958
- lxb_encoding_decode_iso_8859_13(lxb_encoding_decode_t *ctx,
959
- const lxb_char_t **data, const lxb_char_t *end)
960
- {
961
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_13);
962
-
963
- return LXB_STATUS_OK;
964
- }
965
-
966
/*
 * ISO-8859-14 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_14 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
967
- lxb_encoding_decode_iso_8859_14(lxb_encoding_decode_t *ctx,
968
- const lxb_char_t **data, const lxb_char_t *end)
969
- {
970
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_14);
971
-
972
- return LXB_STATUS_OK;
973
- }
974
-
975
/*
 * ISO-8859-15 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_15 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
976
- lxb_encoding_decode_iso_8859_15(lxb_encoding_decode_t *ctx,
977
- const lxb_char_t **data, const lxb_char_t *end)
978
- {
979
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_15);
980
-
981
- return LXB_STATUS_OK;
982
- }
983
-
984
/*
 * ISO-8859-16 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_16 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
985
- lxb_encoding_decode_iso_8859_16(lxb_encoding_decode_t *ctx,
986
- const lxb_char_t **data, const lxb_char_t *end)
987
- {
988
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_16);
989
-
990
- return LXB_STATUS_OK;
991
- }
992
-
993
/*
 * ISO-8859-2 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_2 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
994
- lxb_encoding_decode_iso_8859_2(lxb_encoding_decode_t *ctx,
995
- const lxb_char_t **data, const lxb_char_t *end)
996
- {
997
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_2);
998
-
999
- return LXB_STATUS_OK;
1000
- }
1001
-
1002
/*
 * ISO-8859-3 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_3 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1003
- lxb_encoding_decode_iso_8859_3(lxb_encoding_decode_t *ctx,
1004
- const lxb_char_t **data, const lxb_char_t *end)
1005
- {
1006
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_3);
1007
-
1008
- return LXB_STATUS_OK;
1009
- }
1010
-
1011
/*
 * ISO-8859-4 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_4 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1012
- lxb_encoding_decode_iso_8859_4(lxb_encoding_decode_t *ctx,
1013
- const lxb_char_t **data, const lxb_char_t *end)
1014
- {
1015
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_4);
1016
-
1017
- return LXB_STATUS_OK;
1018
- }
1019
-
1020
/*
 * ISO-8859-5 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_5 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1021
- lxb_encoding_decode_iso_8859_5(lxb_encoding_decode_t *ctx,
1022
- const lxb_char_t **data, const lxb_char_t *end)
1023
- {
1024
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_5);
1025
-
1026
- return LXB_STATUS_OK;
1027
- }
1028
-
1029
/*
 * ISO-8859-6 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_6 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1030
- lxb_encoding_decode_iso_8859_6(lxb_encoding_decode_t *ctx,
1031
- const lxb_char_t **data, const lxb_char_t *end)
1032
- {
1033
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_6);
1034
-
1035
- return LXB_STATUS_OK;
1036
- }
1037
-
1038
/*
 * ISO-8859-7 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_7 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1039
- lxb_encoding_decode_iso_8859_7(lxb_encoding_decode_t *ctx,
1040
- const lxb_char_t **data, const lxb_char_t *end)
1041
- {
1042
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_7);
1043
-
1044
- return LXB_STATUS_OK;
1045
- }
1046
-
1047
/*
 * ISO-8859-8 decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_iso_8859_8 table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1048
- lxb_encoding_decode_iso_8859_8(lxb_encoding_decode_t *ctx,
1049
- const lxb_char_t **data, const lxb_char_t *end)
1050
- {
1051
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1052
-
1053
- return LXB_STATUS_OK;
1054
- }
1055
-
1056
/*
 * ISO-8859-8-I decoder: runs LXB_ENCODING_DECODE_SINGLE with the same
 * lxb_encoding_single_index_iso_8859_8 table that the ISO-8859-8 decoder
 * uses. The macro may return early (small buffer / error); LXB_STATUS_OK is
 * returned once all input is used.
 */
- lxb_status_t
1057
- lxb_encoding_decode_iso_8859_8_i(lxb_encoding_decode_t *ctx,
1058
- const lxb_char_t **data, const lxb_char_t *end)
1059
- {
1060
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1061
-
1062
- return LXB_STATUS_OK;
1063
- }
1064
-
1065
/*
 * KOI8-R decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_koi8_r table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1066
- lxb_encoding_decode_koi8_r(lxb_encoding_decode_t *ctx,
1067
- const lxb_char_t **data, const lxb_char_t *end)
1068
- {
1069
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_r);
1070
-
1071
- return LXB_STATUS_OK;
1072
- }
1073
-
1074
/*
 * KOI8-U decoder: runs LXB_ENCODING_DECODE_SINGLE with the
 * lxb_encoding_single_index_koi8_u table. The macro may return early
 * (small buffer / error); LXB_STATUS_OK is returned once all input is used.
 */
- lxb_status_t
1075
- lxb_encoding_decode_koi8_u(lxb_encoding_decode_t *ctx,
1076
- const lxb_char_t **data, const lxb_char_t *end)
1077
- {
1078
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_u);
1079
-
1080
- return LXB_STATUS_OK;
1081
- }
1082
-
1083
- lxb_status_t
1084
- lxb_encoding_decode_shift_jis(lxb_encoding_decode_t *ctx,
1085
- const lxb_char_t **data, const lxb_char_t *end)
1086
- {
1087
- lxb_char_t byte, lead;
1088
-
1089
- ctx->status = LXB_STATUS_OK;
1090
-
1091
- if (ctx->u.lead != 0x00) {
1092
- if (ctx->have_error) {
1093
- ctx->have_error = false;
1094
- ctx->u.lead = 0x00;
1095
-
1096
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1097
- ctx->have_error = true;
1098
- ctx->u.lead = 0x01;
1099
- } LXB_ENCODING_DECODE_ERROR_END();
1100
- }
1101
- else {
1102
- if (*data >= end) {
1103
- ctx->status = LXB_STATUS_CONTINUE;
1104
-
1105
- return LXB_STATUS_CONTINUE;
1106
- }
1107
-
1108
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1109
-
1110
- lead = (lxb_char_t) ctx->u.lead;
1111
- ctx->u.lead = 0x00;
1112
-
1113
- goto lead_state;
1114
- }
1115
- }
1116
-
1117
- while (*data < end) {
1118
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1119
-
1120
- lead = *(*data)++;
1121
-
1122
- if (lead <= 0x80) {
1123
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
1124
- continue;
1125
- }
1126
-
1127
- if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
1128
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + lead);
1129
- continue;
1130
- }
1131
-
1132
- if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
1133
- && lead != 0xE0 && lead != 0xFC)
1134
- {
1135
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1136
- (*data)--;
1137
- }
1138
- LXB_ENCODING_DECODE_ERROR_END();
1139
-
1140
- continue;
1141
- }
1142
-
1143
- if (*data >= end) {
1144
- ctx->u.lead = lead;
1145
- ctx->status = LXB_STATUS_CONTINUE;
1146
-
1147
- return LXB_STATUS_CONTINUE;
1148
- }
1149
-
1150
- lead_state:
1151
-
1152
- byte = *(*data)++;
1153
-
1154
- if (byte < 0x7F) {
1155
- ctx->codepoint = 0x40;
1156
- }
1157
- else {
1158
- ctx->codepoint = 0x41;
1159
- }
1160
-
1161
- if (lead < 0xA0) {
1162
- ctx->second_codepoint = 0x81;
1163
- }
1164
- else {
1165
- ctx->second_codepoint = 0xC1;
1166
- }
1167
-
1168
- if ((unsigned) (byte - 0x40) > (0x7E - 0x40)
1169
- && (unsigned) (byte - 0x80) > (0xFC - 0x80))
1170
- {
1171
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1172
- continue;
1173
- }
1174
-
1175
- /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
1176
- ctx->codepoint = (lead - ctx->second_codepoint) * 188
1177
- + byte - ctx->codepoint;
1178
-
1179
- if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
1180
- / sizeof(lxb_encoding_multi_index_t)))
1181
- {
1182
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1183
- continue;
1184
- }
1185
-
1186
- if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
1187
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xE000 - 8836 + ctx->codepoint);
1188
- continue;
1189
- }
1190
-
1191
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
1192
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1193
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1194
- continue;
1195
- }
1196
-
1197
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1198
- }
1199
-
1200
- return LXB_STATUS_OK;
1201
- }
1202
-
1203
- lxb_inline lxb_status_t
1204
- lxb_encoding_decode_utf_16(lxb_encoding_decode_t *ctx, bool is_be,
1205
- const lxb_char_t **data, const lxb_char_t *end)
1206
- {
1207
- unsigned lead;
1208
- lxb_codepoint_t unit;
1209
-
1210
- ctx->status = LXB_STATUS_OK;
1211
-
1212
- if (ctx->have_error) {
1213
- ctx->have_error = false;
1214
-
1215
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1216
- ctx->have_error = true;
1217
- }
1218
- LXB_ENCODING_DECODE_ERROR_END();
1219
- }
1220
-
1221
- if (ctx->u.lead != 0x00) {
1222
- if (*data >= end) {
1223
- ctx->status = LXB_STATUS_CONTINUE;
1224
-
1225
- return LXB_STATUS_CONTINUE;
1226
- }
1227
-
1228
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1229
-
1230
- lead = ctx->u.lead - 0x01;
1231
- ctx->u.lead = 0x00;
1232
-
1233
- goto lead_state;
1234
- }
1235
-
1236
- while (*data < end) {
1237
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1238
-
1239
- pair_state:
1240
-
1241
- lead = *(*data)++;
1242
-
1243
- if (*data >= end) {
1244
- ctx->u.lead = lead + 0x01;
1245
- ctx->status = LXB_STATUS_CONTINUE;
1246
-
1247
- return LXB_STATUS_CONTINUE;
1248
- }
1249
-
1250
- lead_state:
1251
-
1252
- /* For UTF-16BE or UTF-16LE */
1253
- if (is_be) {
1254
- unit = (lead << 8) + *(*data)++;
1255
- }
1256
- else {
1257
- unit = (*(*data)++ << 8) + lead;
1258
- }
1259
-
1260
- if (ctx->second_codepoint != 0x00) {
1261
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1262
- ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
1263
- + (unit - 0xDC00);
1264
-
1265
- ctx->second_codepoint = 0x00;
1266
-
1267
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1268
- continue;
1269
- }
1270
-
1271
- (*data)--;
1272
-
1273
- ctx->second_codepoint = 0x00;
1274
-
1275
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1276
- ctx->have_error = true;
1277
-
1278
- ctx->u.lead = lead + 0x01;
1279
- }
1280
- LXB_ENCODING_DECODE_ERROR_END();
1281
-
1282
- goto lead_state;
1283
- }
1284
-
1285
- /* Surrogate pair */
1286
- if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
1287
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1288
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1289
- ctx->have_error = true;
1290
- }
1291
- LXB_ENCODING_DECODE_ERROR_END();
1292
-
1293
- continue;
1294
- }
1295
-
1296
- ctx->second_codepoint = unit;
1297
-
1298
- if (*data >= end) {
1299
- ctx->status = LXB_STATUS_CONTINUE;
1300
-
1301
- return LXB_STATUS_CONTINUE;
1302
- }
1303
-
1304
- goto pair_state;
1305
- }
1306
-
1307
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, unit);
1308
- }
1309
-
1310
- return LXB_STATUS_OK;
1311
- }
1312
-
1313
- lxb_status_t
1314
- lxb_encoding_decode_utf_16be(lxb_encoding_decode_t *ctx,
1315
- const lxb_char_t **data, const lxb_char_t *end)
1316
- {
1317
- return lxb_encoding_decode_utf_16(ctx, true, data, end);
1318
- }
1319
-
1320
- lxb_status_t
1321
- lxb_encoding_decode_utf_16le(lxb_encoding_decode_t *ctx,
1322
- const lxb_char_t **data, const lxb_char_t *end)
1323
- {
1324
- return lxb_encoding_decode_utf_16(ctx, false, data, end);
1325
- }
1326
-
1327
- lxb_status_t
1328
- lxb_encoding_decode_utf_8(lxb_encoding_decode_t *ctx,
1329
- const lxb_char_t **data, const lxb_char_t *end)
1330
- {
1331
- unsigned need;
1332
- lxb_char_t ch;
1333
- const lxb_char_t *p = *data;
1334
-
1335
- ctx->status = LXB_STATUS_OK;
1336
-
1337
- if (ctx->have_error) {
1338
- ctx->have_error = false;
1339
-
1340
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1341
- ctx->have_error = true;
1342
- }
1343
- LXB_ENCODING_DECODE_ERROR_END();
1344
- }
1345
-
1346
- if (ctx->u.utf_8.need != 0) {
1347
- if (p >= end) {
1348
- ctx->status = LXB_STATUS_CONTINUE;
1349
-
1350
- return LXB_STATUS_CONTINUE;
1351
- }
1352
-
1353
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1354
-
1355
- need = ctx->u.utf_8.need;
1356
- ctx->u.utf_8.need = 0;
1357
-
1358
- if (ctx->u.utf_8.lower != 0x00) {
1359
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(ctx->u.utf_8.lower,
1360
- ctx->u.utf_8.upper, goto begin);
1361
- ctx->u.utf_8.lower = 0x00;
1362
- }
1363
-
1364
- goto decode;
1365
- }
1366
-
1367
- begin:
1368
-
1369
- while (p < end) {
1370
- if (ctx->buffer_used >= ctx->buffer_length) {
1371
- *data = p;
1372
-
1373
- return LXB_STATUS_SMALL_BUFFER;
1374
- }
1375
-
1376
- ch = *p++;
1377
-
1378
- if (ch < 0x80) {
1379
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ch);
1380
- continue;
1381
- }
1382
- else if (ch <= 0xDF) {
1383
- if (ch < 0xC2) {
1384
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1385
- *data = p - 1;
1386
- }
1387
- LXB_ENCODING_DECODE_ERROR_END();
1388
-
1389
- continue;
1390
- }
1391
-
1392
- need = 1;
1393
- ctx->codepoint = ch & 0x1F;
1394
- }
1395
- else if (ch < 0xF0) {
1396
- need = 2;
1397
- ctx->codepoint = ch & 0x0F;
1398
-
1399
- if (p == end) {
1400
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xE0, 0xED, 0xA0, 0x9F);
1401
-
1402
- *data = p;
1403
-
1404
- ctx->u.utf_8.need = need;
1405
- ctx->status = LXB_STATUS_CONTINUE;
1406
-
1407
- return LXB_STATUS_CONTINUE;
1408
- }
1409
-
1410
- if (ch == 0xE0) {
1411
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0xA0, 0xBF, continue);
1412
- }
1413
- else if (ch == 0xED) {
1414
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x9F, continue);
1415
- }
1416
- }
1417
- else if (ch < 0xF5) {
1418
- need = 3;
1419
- ctx->codepoint = ch & 0x07;
1420
-
1421
- if (p == end) {
1422
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xF0, 0xF4, 0x90, 0x8F);
1423
-
1424
- *data = p;
1425
-
1426
- ctx->u.utf_8.need = need;
1427
- ctx->status = LXB_STATUS_CONTINUE;
1428
-
1429
- return LXB_STATUS_CONTINUE;
1430
- }
1431
-
1432
- if (ch == 0xF0) {
1433
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x90, 0xBF, continue);
1434
- }
1435
- else if (ch == 0xF4) {
1436
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x8F, continue);
1437
- }
1438
- }
1439
- else {
1440
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1441
- *data = p - 1;
1442
- }
1443
- LXB_ENCODING_DECODE_ERROR_END();
1444
-
1445
- continue;
1446
- }
1447
-
1448
- decode:
1449
-
1450
- do {
1451
- if (p >= end) {
1452
- *data = p;
1453
-
1454
- ctx->u.utf_8.need = need;
1455
- ctx->status = LXB_STATUS_CONTINUE;
1456
-
1457
- return LXB_STATUS_CONTINUE;
1458
- }
1459
-
1460
- ch = *p++;
1461
-
1462
- if (ch < 0x80 || ch > 0xBF) {
1463
- p--;
1464
-
1465
- ctx->u.utf_8.need = 0;
1466
-
1467
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1468
- *data = p;
1469
- ctx->have_error = true;
1470
- }
1471
- LXB_ENCODING_DECODE_ERROR_END();
1472
-
1473
- break;
1474
- }
1475
-
1476
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
1477
-
1478
- if (--need == 0) {
1479
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1480
-
1481
- break;
1482
- }
1483
- }
1484
- while (true);
1485
- }
1486
-
1487
- *data = p;
1488
-
1489
- return LXB_STATUS_OK;
1490
- }
1491
-
1492
- lxb_inline lxb_codepoint_t
1493
- lxb_encoding_decode_gb18030_range(uint32_t index)
1494
- {
1495
- size_t mid, left, right;
1496
- const lxb_encoding_range_index_t *range;
1497
-
1498
- /*
1499
- * Pointer greater than 39419 and less than 189000,
1500
- * or pointer is greater than 1237575
1501
- */
1502
- if ((unsigned) (index - 39419) < (189000 - 39419)
1503
- || index > 1237575)
1504
- {
1505
- return LXB_ENCODING_ERROR_CODEPOINT;
1506
- }
1507
-
1508
- if (index == 7457) {
1509
- return 0xE7C7;
1510
- }
1511
-
1512
- left = 0;
1513
- right = LXB_ENCODING_RANGE_INDEX_GB18030_SIZE;
1514
- range = lxb_encoding_range_index_gb18030;
1515
-
1516
- /* Some compilers say about uninitialized mid */
1517
- mid = 0;
1518
-
1519
- while (left < right) {
1520
- mid = left + (right - left) / 2;
1521
-
1522
- if (range[mid].index < index) {
1523
- left = mid + 1;
1524
-
1525
- if (left < right && range[ left ].index > index) {
1526
- break;
1527
- }
1528
- }
1529
- else if (range[mid].index > index) {
1530
- right = mid - 1;
1531
-
1532
- if (right > 0 && range[right].index <= index) {
1533
- mid = right;
1534
- break;
1535
- }
1536
- }
1537
- else {
1538
- break;
1539
- }
1540
- }
1541
-
1542
- return range[mid].codepoint + index - range[mid].index;
1543
- }
1544
-
1545
- lxb_status_t
1546
- lxb_encoding_decode_gb18030(lxb_encoding_decode_t *ctx,
1547
- const lxb_char_t **data, const lxb_char_t *end)
1548
- {
1549
- uint32_t pointer;
1550
- lxb_char_t first, second, third, offset;
1551
-
1552
- /* Make compiler happy */
1553
- second = 0x00;
1554
-
1555
- ctx->status = LXB_STATUS_OK;
1556
-
1557
- if (ctx->have_error) {
1558
- ctx->have_error = false;
1559
-
1560
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1561
- ctx->have_error = true;
1562
- }
1563
- LXB_ENCODING_DECODE_ERROR_END();
1564
- }
1565
-
1566
- if (ctx->u.gb18030.first != 0) {
1567
- if (*data >= end) {
1568
- ctx->status = LXB_STATUS_CONTINUE;
1569
-
1570
- return LXB_STATUS_CONTINUE;
1571
- }
1572
-
1573
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1574
-
1575
- if (ctx->u.gb18030.third != 0x00) {
1576
- first = ctx->u.gb18030.first;
1577
- second = ctx->u.gb18030.second;
1578
- third = ctx->u.gb18030.third;
1579
-
1580
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1581
-
1582
- if (ctx->prepend) {
1583
- /* The first is always < 0x80 */
1584
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1585
-
1586
- if (ctx->buffer_used == ctx->buffer_length) {
1587
- ctx->u.gb18030.first = third;
1588
-
1589
- return LXB_STATUS_SMALL_BUFFER;
1590
- }
1591
-
1592
- first = third;
1593
- ctx->prepend = false;
1594
-
1595
- goto prepend_first;
1596
- }
1597
-
1598
- goto third_state;
1599
- }
1600
- else if (ctx->u.gb18030.second != 0x00) {
1601
- first = ctx->u.gb18030.first;
1602
- second = ctx->u.gb18030.second;
1603
-
1604
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1605
-
1606
- goto second_state;
1607
- }
1608
-
1609
- first = ctx->u.gb18030.first;
1610
- ctx->u.gb18030.first = 0x00;
1611
-
1612
- if (ctx->prepend) {
1613
- ctx->prepend = false;
1614
- goto prepend_first;
1615
- }
1616
-
1617
- goto first_state;
1618
- }
1619
-
1620
- while (*data < end) {
1621
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1622
-
1623
- first = *(*data)++;
1624
-
1625
- prepend_first:
1626
-
1627
- if (first < 0x80) {
1628
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, first);
1629
- continue;
1630
- }
1631
-
1632
- if (first == 0x80) {
1633
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x20AC);
1634
- continue;
1635
- }
1636
-
1637
- /* Range 0x81 to 0xFE, inclusive */
1638
- if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
1639
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1640
- (*data)--;
1641
- }
1642
- LXB_ENCODING_DECODE_ERROR_END();
1643
-
1644
- continue;
1645
- }
1646
-
1647
- if (*data == end) {
1648
- ctx->u.gb18030.first = first;
1649
- ctx->status = LXB_STATUS_CONTINUE;
1650
-
1651
- return LXB_STATUS_CONTINUE;
1652
- }
1653
-
1654
- /* First */
1655
- first_state:
1656
-
1657
- second = *(*data)++;
1658
-
1659
- /* Range 0x30 to 0x39, inclusive */
1660
- if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
1661
- offset = (second < 0x7F) ? 0x40 : 0x41;
1662
-
1663
- /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
1664
- if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
1665
- || (unsigned) (second - 0x80) <= (0xFE - 0x80))
1666
- {
1667
- pointer = (first - 0x81) * 190 + (second - offset);
1668
- }
1669
- else {
1670
- if (second < 0x80) {
1671
- (*data)--;
1672
- }
1673
-
1674
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1675
- ctx->have_error = true;
1676
- }
1677
- LXB_ENCODING_DECODE_ERROR_END();
1678
-
1679
- continue;
1680
- }
1681
-
1682
- /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
1683
- ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
1684
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1685
- if (second < 0x80) {
1686
- (*data)--;
1687
- }
1688
-
1689
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1690
- ctx->have_error = true;
1691
- }
1692
- LXB_ENCODING_DECODE_ERROR_END();
1693
-
1694
- continue;
1695
- }
1696
-
1697
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1698
- continue;
1699
- }
1700
-
1701
- if (*data == end) {
1702
- ctx->u.gb18030.first = first;
1703
- ctx->u.gb18030.second = second;
1704
-
1705
- ctx->status = LXB_STATUS_CONTINUE;
1706
-
1707
- return LXB_STATUS_CONTINUE;
1708
- }
1709
-
1710
- /* Second */
1711
- second_state:
1712
-
1713
- third = *(*data)++;
1714
-
1715
- /* Range 0x81 to 0xFE, inclusive */
1716
- if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
1717
- (*data)--;
1718
-
1719
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1720
- ctx->prepend = true;
1721
- ctx->have_error = true;
1722
- ctx->u.gb18030.first = second;
1723
- }
1724
- LXB_ENCODING_DECODE_ERROR_END();
1725
-
1726
- first = second;
1727
-
1728
- goto prepend_first;
1729
- }
1730
-
1731
- if (*data == end) {
1732
- ctx->u.gb18030.first = first;
1733
- ctx->u.gb18030.second = second;
1734
- ctx->u.gb18030.third = third;
1735
-
1736
- ctx->status = LXB_STATUS_CONTINUE;
1737
-
1738
- return LXB_STATUS_CONTINUE;
1739
- }
1740
-
1741
- /* Third */
1742
- third_state:
1743
-
1744
- /* Range 0x30 to 0x39, inclusive */
1745
- if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
1746
- ctx->prepend = true;
1747
-
1748
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1749
- ctx->prepend = true;
1750
- ctx->have_error = true;
1751
-
1752
- /* First is a fake for trigger */
1753
- ctx->u.gb18030.first = 0x01;
1754
- ctx->u.gb18030.second = second;
1755
- ctx->u.gb18030.third = third;
1756
- }
1757
- LXB_ENCODING_DECODE_ERROR_END();
1758
-
1759
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1760
-
1761
- if (ctx->buffer_used == ctx->buffer_length) {
1762
- ctx->prepend = true;
1763
- ctx->have_error = true;
1764
-
1765
- /* First is a fake for trigger */
1766
- ctx->u.gb18030.first = 0x01;
1767
- ctx->u.gb18030.second = second;
1768
- ctx->u.gb18030.third = third;
1769
-
1770
- return LXB_STATUS_SMALL_BUFFER;
1771
- }
1772
-
1773
- first = third;
1774
-
1775
- goto prepend_first;
1776
- }
1777
-
1778
- pointer = ((first - 0x81) * (10 * 126 * 10))
1779
- + ((second - 0x30) * (10 * 126))
1780
- + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
1781
-
1782
- ctx->codepoint = lxb_encoding_decode_gb18030_range(pointer);
1783
-
1784
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1785
- LXB_ENCODING_DECODE_ERROR_BEGIN {}
1786
- LXB_ENCODING_DECODE_ERROR_END();
1787
-
1788
- continue;
1789
- }
1790
-
1791
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1792
- }
1793
-
1794
- return LXB_STATUS_OK;
1795
- }
1796
-
1797
- lxb_status_t
1798
- lxb_encoding_decode_macintosh(lxb_encoding_decode_t *ctx,
1799
- const lxb_char_t **data, const lxb_char_t *end)
1800
- {
1801
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_macintosh);
1802
-
1803
- return LXB_STATUS_OK;
1804
- }
1805
-
1806
- lxb_status_t
1807
- lxb_encoding_decode_replacement(lxb_encoding_decode_t *ctx,
1808
- const lxb_char_t **data, const lxb_char_t *end)
1809
- {
1810
- *data = end;
1811
- return LXB_STATUS_ERROR;
1812
- }
1813
-
1814
- lxb_status_t
1815
- lxb_encoding_decode_windows_1250(lxb_encoding_decode_t *ctx,
1816
- const lxb_char_t **data, const lxb_char_t *end)
1817
- {
1818
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1250);
1819
-
1820
- return LXB_STATUS_OK;
1821
- }
1822
-
1823
- lxb_status_t
1824
- lxb_encoding_decode_windows_1251(lxb_encoding_decode_t *ctx,
1825
- const lxb_char_t **data, const lxb_char_t *end)
1826
- {
1827
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1251);
1828
-
1829
- return LXB_STATUS_OK;
1830
- }
1831
-
1832
- lxb_status_t
1833
- lxb_encoding_decode_windows_1252(lxb_encoding_decode_t *ctx,
1834
- const lxb_char_t **data, const lxb_char_t *end)
1835
- {
1836
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1252);
1837
-
1838
- return LXB_STATUS_OK;
1839
- }
1840
-
1841
- lxb_status_t
1842
- lxb_encoding_decode_windows_1253(lxb_encoding_decode_t *ctx,
1843
- const lxb_char_t **data, const lxb_char_t *end)
1844
- {
1845
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1253);
1846
-
1847
- return LXB_STATUS_OK;
1848
- }
1849
-
1850
- lxb_status_t
1851
- lxb_encoding_decode_windows_1254(lxb_encoding_decode_t *ctx,
1852
- const lxb_char_t **data, const lxb_char_t *end)
1853
- {
1854
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1254);
1855
-
1856
- return LXB_STATUS_OK;
1857
- }
1858
-
1859
- lxb_status_t
1860
- lxb_encoding_decode_windows_1255(lxb_encoding_decode_t *ctx,
1861
- const lxb_char_t **data, const lxb_char_t *end)
1862
- {
1863
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1255);
1864
-
1865
- return LXB_STATUS_OK;
1866
- }
1867
-
1868
- lxb_status_t
1869
- lxb_encoding_decode_windows_1256(lxb_encoding_decode_t *ctx,
1870
- const lxb_char_t **data, const lxb_char_t *end)
1871
- {
1872
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1256);
1873
-
1874
- return LXB_STATUS_OK;
1875
- }
1876
-
1877
- lxb_status_t
1878
- lxb_encoding_decode_windows_1257(lxb_encoding_decode_t *ctx,
1879
- const lxb_char_t **data, const lxb_char_t *end)
1880
- {
1881
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1257);
1882
-
1883
- return LXB_STATUS_OK;
1884
- }
1885
-
1886
- lxb_status_t
1887
- lxb_encoding_decode_windows_1258(lxb_encoding_decode_t *ctx,
1888
- const lxb_char_t **data, const lxb_char_t *end)
1889
- {
1890
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1258);
1891
-
1892
- return LXB_STATUS_OK;
1893
- }
1894
-
1895
- lxb_status_t
1896
- lxb_encoding_decode_windows_874(lxb_encoding_decode_t *ctx,
1897
- const lxb_char_t **data, const lxb_char_t *end)
1898
- {
1899
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_874);
1900
-
1901
- return LXB_STATUS_OK;
1902
- }
1903
-
1904
- lxb_status_t
1905
- lxb_encoding_decode_x_mac_cyrillic(lxb_encoding_decode_t *ctx,
1906
- const lxb_char_t **data, const lxb_char_t *end)
1907
- {
1908
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_x_mac_cyrillic);
1909
-
1910
- return LXB_STATUS_OK;
1911
- }
1912
-
1913
- lxb_status_t
1914
- lxb_encoding_decode_x_user_defined(lxb_encoding_decode_t *ctx,
1915
- const lxb_char_t **data, const lxb_char_t *end)
1916
- {
1917
- while (*data < end) {
1918
- if (**data < 0x80) {
1919
- LXB_ENCODING_DECODE_APPEND(ctx, *(*data)++);
1920
- }
1921
- else {
1922
- LXB_ENCODING_DECODE_APPEND(ctx, 0xF780 + (*(*data)++) - 0x80);
1923
- }
1924
- }
1925
-
1926
- return LXB_STATUS_OK;
1927
- }
1928
-
1929
- /*
1930
- * Single
1931
- */
1932
- lxb_codepoint_t
1933
- lxb_encoding_decode_default_single(lxb_encoding_decode_t *ctx,
1934
- const lxb_char_t **data, const lxb_char_t *end)
1935
- {
1936
- return lxb_encoding_decode_utf_8_single(ctx, data, end);
1937
- }
1938
-
1939
- lxb_codepoint_t
1940
- lxb_encoding_decode_auto_single(lxb_encoding_decode_t *ctx,
1941
- const lxb_char_t **data, const lxb_char_t *end)
1942
- {
1943
- return LXB_ENCODING_DECODE_ERROR;
1944
- }
1945
-
1946
- lxb_codepoint_t
1947
- lxb_encoding_decode_undefined_single(lxb_encoding_decode_t *ctx,
1948
- const lxb_char_t **data, const lxb_char_t *end)
1949
- {
1950
- return LXB_ENCODING_DECODE_ERROR;
1951
- }
1952
-
1953
- lxb_codepoint_t
1954
- lxb_encoding_decode_big5_single(lxb_encoding_decode_t *ctx,
1955
- const lxb_char_t **data, const lxb_char_t *end)
1956
- {
1957
- uint32_t index;
1958
- lxb_char_t lead, byte;
1959
-
1960
- if (ctx->u.lead != 0x00) {
1961
- if (ctx->second_codepoint != 0x00) {
1962
- (*data)++;
1963
-
1964
- ctx->u.lead = 0x00;
1965
-
1966
- ctx->codepoint = ctx->second_codepoint;
1967
- ctx->second_codepoint = 0x00;
1968
-
1969
- return ctx->codepoint;
1970
- }
1971
-
1972
- lead = (lxb_char_t) ctx->u.lead;
1973
- ctx->u.lead = 0x00;
1974
-
1975
- goto lead_state;
1976
- }
1977
-
1978
- lead = *(*data)++;
1979
-
1980
- if (lead < 0x80) {
1981
- return lead;
1982
- }
1983
-
1984
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
1985
- return LXB_ENCODING_DECODE_ERROR;
1986
- }
1987
-
1988
- if (*data >= end) {
1989
- ctx->u.lead = lead;
1990
-
1991
- return LXB_ENCODING_DECODE_CONTINUE;
1992
- }
1993
-
1994
- lead_state:
1995
-
1996
- index = 0;
1997
- byte = **data;
1998
-
1999
- if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2000
- || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
2001
- {
2002
- if (byte < 0x7F) {
2003
- /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
2004
- index = (lead - 0x81) * 157 + (byte - 0x40);
2005
- }
2006
- else {
2007
- /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
2008
- index = (lead - 0x81) * 157 + (byte - 0x62);
2009
- }
2010
- }
2011
-
2012
- /*
2013
- * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
2014
- * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
2015
- * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
2016
- * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
2017
- */
2018
- switch (index) {
2019
- case 1133:
2020
- ctx->u.lead = lead;
2021
- ctx->second_codepoint = 0x0304;
2022
- return 0x00CA;
2023
-
2024
- case 1135:
2025
- ctx->u.lead = lead;
2026
- ctx->second_codepoint = 0x030C;
2027
- return 0x00CA;
2028
-
2029
- case 1164:
2030
- ctx->u.lead = lead;
2031
- ctx->second_codepoint = 0x0304;
2032
- return 0x00EA;
2033
-
2034
- case 1166:
2035
- ctx->u.lead = lead;
2036
- ctx->second_codepoint = 0x030C;
2037
- return 0x00EA;
2038
-
2039
- case 0:
2040
- goto failed;
2041
- }
2042
-
2043
- ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
2044
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2045
- goto failed;
2046
- }
2047
-
2048
- (*data)++;
2049
-
2050
- return ctx->codepoint;
2051
-
2052
- failed:
2053
-
2054
- if (byte >= 0x80) {
2055
- (*data)++;
2056
- }
2057
-
2058
- return LXB_ENCODING_DECODE_ERROR;
2059
- }
2060
-
2061
- lxb_codepoint_t
2062
- lxb_encoding_decode_euc_jp_single(lxb_encoding_decode_t *ctx,
2063
- const lxb_char_t **data, const lxb_char_t *end)
2064
- {
2065
- bool is_jis0212;
2066
- lxb_char_t byte, lead;
2067
-
2068
- if (ctx->u.euc_jp.lead != 0x00) {
2069
- lead = ctx->u.euc_jp.lead;
2070
- byte = *(*data)++;
2071
-
2072
- ctx->u.euc_jp.lead = 0x00;
2073
-
2074
- if (ctx->u.euc_jp.is_jis0212) {
2075
- is_jis0212 = true;
2076
- ctx->u.euc_jp.is_jis0212 = false;
2077
-
2078
- goto lead_jis_state;
2079
- }
2080
-
2081
- goto lead_state;
2082
- }
2083
-
2084
- lead = *(*data)++;
2085
-
2086
- if (lead < 0x80) {
2087
- return lead;
2088
- }
2089
-
2090
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2091
- && (lead != 0x8E && lead != 0x8F))
2092
- {
2093
- return LXB_ENCODING_DECODE_ERROR;
2094
- }
2095
-
2096
- if (*data >= end) {
2097
- ctx->u.euc_jp.lead = lead;
2098
- return LXB_ENCODING_DECODE_CONTINUE;
2099
- }
2100
-
2101
- byte = *(*data)++;
2102
-
2103
- lead_state:
2104
-
2105
- if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
2106
- return 0xFF61 - 0xA1 + byte;
2107
- }
2108
-
2109
- is_jis0212 = false;
2110
-
2111
- if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
2112
- if (*data >= end) {
2113
- ctx->u.euc_jp.lead = byte;
2114
- ctx->u.euc_jp.is_jis0212 = true;
2115
-
2116
- return LXB_ENCODING_DECODE_CONTINUE;
2117
- }
2118
-
2119
- lead = byte;
2120
- byte = *(*data)++;
2121
- is_jis0212 = true;
2122
- }
2123
-
2124
- lead_jis_state:
2125
-
2126
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2127
- || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
2128
- {
2129
- goto failed;
2130
- }
2131
-
2132
- /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
2133
- ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
2134
-
2135
- if (is_jis0212) {
2136
- if ((sizeof(lxb_encoding_multi_index_jis0212)
2137
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2138
- {
2139
- goto failed;
2140
- }
2141
-
2142
- ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
2143
- }
2144
- else {
2145
- if ((sizeof(lxb_encoding_multi_index_jis0208)
2146
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2147
- {
2148
- goto failed;
2149
- }
2150
-
2151
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2152
- }
2153
-
2154
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2155
- goto failed;
2156
- }
2157
-
2158
- return ctx->codepoint;
2159
-
2160
- failed:
2161
-
2162
- if (byte < 0x80) {
2163
- (*data)--;
2164
- }
2165
-
2166
- return LXB_ENCODING_DECODE_ERROR;
2167
- }
2168
-
2169
- lxb_codepoint_t
2170
- lxb_encoding_decode_euc_kr_single(lxb_encoding_decode_t *ctx,
2171
- const lxb_char_t **data, const lxb_char_t *end)
2172
- {
2173
- lxb_char_t lead, byte;
2174
-
2175
- if (ctx->u.lead != 0x00) {
2176
- lead = (lxb_char_t) ctx->u.lead;
2177
- ctx->u.lead = 0x00;
2178
-
2179
- goto lead_state;
2180
- }
2181
-
2182
- lead = *(*data)++;
2183
-
2184
- if (lead < 0x80) {
2185
- return lead;
2186
- }
2187
-
2188
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
2189
- return LXB_ENCODING_DECODE_ERROR;
2190
- }
2191
-
2192
- if (*data == end) {
2193
- ctx->u.lead = lead;
2194
- return LXB_ENCODING_DECODE_CONTINUE;
2195
- }
2196
-
2197
- lead_state:
2198
-
2199
- byte = *(*data)++;
2200
-
2201
- if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
2202
- goto failed;
2203
- }
2204
-
2205
- /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2206
- ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
2207
-
2208
- if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
2209
- / sizeof(lxb_encoding_multi_index_t))
2210
- {
2211
- goto failed;
2212
- }
2213
-
2214
- ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
2215
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2216
- goto failed;
2217
- }
2218
-
2219
- return ctx->codepoint;
2220
-
2221
- failed:
2222
-
2223
- if (byte < 0x80) {
2224
- (*data)--;
2225
- }
2226
-
2227
- return LXB_ENCODING_DECODE_ERROR;
2228
- }
2229
-
2230
- lxb_codepoint_t
2231
- lxb_encoding_decode_gbk_single(lxb_encoding_decode_t *ctx,
2232
- const lxb_char_t **data, const lxb_char_t *end)
2233
- {
2234
- return lxb_encoding_decode_gb18030_single(ctx, data, end);
2235
- }
2236
-
2237
- lxb_codepoint_t
2238
- lxb_encoding_decode_ibm866_single(lxb_encoding_decode_t *ctx,
2239
- const lxb_char_t **data, const lxb_char_t *end)
2240
- {
2241
- if (**data < 0x80) {
2242
- return *(*data)++;
2243
- }
2244
-
2245
- return lxb_encoding_single_index_ibm866[*(*data)++ - 0x80].codepoint;
2246
- }
2247
-
2248
- lxb_codepoint_t
2249
- lxb_encoding_decode_iso_2022_jp_single(lxb_encoding_decode_t *ctx,
2250
- const lxb_char_t **data, const lxb_char_t *end)
2251
- {
2252
- lxb_char_t byte;
2253
- lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
2254
-
2255
- if (iso->prepand != 0x00) {
2256
- byte = iso->prepand;
2257
- iso->prepand = 0x00;
2258
-
2259
- goto prepand;
2260
- }
2261
-
2262
- do {
2263
- byte = *(*data)++;
2264
-
2265
- prepand:
2266
-
2267
- switch (iso->state) {
2268
- case LXB_ENCODING_DECODE_2022_JP_ASCII:
2269
- if (byte == 0x1B) {
2270
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2271
-
2272
- break;
2273
- }
2274
-
2275
- /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
2276
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
2277
- && byte != 0x0E && byte != 0x0F)
2278
- {
2279
- iso->out_flag = false;
2280
-
2281
- return byte;
2282
- }
2283
-
2284
- iso->out_flag = false;
2285
-
2286
- return LXB_ENCODING_DECODE_ERROR;
2287
-
2288
- case LXB_ENCODING_DECODE_2022_JP_ROMAN:
2289
- switch (byte) {
2290
- case 0x1B:
2291
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2292
-
2293
- continue;
2294
-
2295
- case 0x5C:
2296
- iso->out_flag = false;
2297
-
2298
- return 0x00A5;
2299
-
2300
- case 0x7E:
2301
- iso->out_flag = false;
2302
-
2303
- return 0x203E;
2304
-
2305
- case 0x0E:
2306
- case 0x0F:
2307
- break;
2308
-
2309
- default:
2310
- /* 0x00 to 0x7F */
2311
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
2312
- iso->out_flag = false;
2313
-
2314
- return byte;
2315
- }
2316
-
2317
- break;
2318
- }
2319
-
2320
- iso->out_flag = false;
2321
-
2322
- return LXB_ENCODING_DECODE_ERROR;
2323
-
2324
- case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
2325
- if (byte == 0x1B) {
2326
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2327
-
2328
- break;
2329
- }
2330
-
2331
- /* 0x21 to 0x5F */
2332
- if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
2333
- iso->out_flag = false;
2334
-
2335
- return 0xFF61 - 0x21 + byte;
2336
- }
2337
-
2338
- iso->out_flag = false;
2339
-
2340
- return LXB_ENCODING_DECODE_ERROR;
2341
-
2342
- case LXB_ENCODING_DECODE_2022_JP_LEAD:
2343
- if (byte == 0x1B) {
2344
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2345
-
2346
- break;
2347
- }
2348
-
2349
- /* 0x21 to 0x7E */
2350
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2351
- iso->out_flag = false;
2352
- iso->lead = byte;
2353
- iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
2354
-
2355
- break;
2356
- }
2357
-
2358
- iso->out_flag = false;
2359
-
2360
- return LXB_ENCODING_DECODE_ERROR;
2361
-
2362
- case LXB_ENCODING_DECODE_2022_JP_TRAIL:
2363
- if (byte == 0x1B) {
2364
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2365
-
2366
- return LXB_ENCODING_DECODE_ERROR;
2367
- }
2368
-
2369
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2370
-
2371
- /* 0x21 to 0x7E */
2372
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2373
- /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
2374
- ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
2375
-
2376
- return lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2377
- }
2378
-
2379
- return LXB_ENCODING_DECODE_ERROR;
2380
-
2381
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
2382
- if (byte == 0x24 || byte == 0x28) {
2383
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
2384
- iso->lead = byte;
2385
-
2386
- break;
2387
- }
2388
-
2389
- (*data)--;
2390
-
2391
- iso->out_flag = false;
2392
- iso->state = ctx->u.iso_2022_jp.out_state;
2393
-
2394
- return LXB_ENCODING_DECODE_ERROR;
2395
-
2396
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
2397
- iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
2398
-
2399
- if (iso->lead == 0x28) {
2400
- if (byte == 0x42) {
2401
- iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
2402
- }
2403
- else if (byte == 0x4A) {
2404
- iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
2405
- }
2406
- else if (byte == 0x49) {
2407
- iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
2408
- }
2409
- }
2410
- else if (iso->lead == 0x24) {
2411
- if (byte == 0x40 || byte == 0x42) {
2412
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2413
- }
2414
- }
2415
-
2416
- if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
2417
- iso->prepand = iso->lead;
2418
- iso->lead = 0x00;
2419
-
2420
- (*data)--;
2421
-
2422
- iso->out_flag = false;
2423
- iso->state = iso->out_state;
2424
-
2425
- return LXB_ENCODING_DECODE_ERROR;
2426
- }
2427
-
2428
- iso->lead = 0x00;
2429
- iso->out_state = iso->state;
2430
-
2431
- if (iso->out_flag) {
2432
- return LXB_ENCODING_DECODE_ERROR;
2433
- }
2434
-
2435
- iso->out_flag = true;
2436
-
2437
- break;
2438
- }
2439
- }
2440
- while (*data < end);
2441
-
2442
- return LXB_ENCODING_DECODE_CONTINUE;
2443
- }
2444
-
2445
- lxb_codepoint_t
2446
- lxb_encoding_decode_iso_8859_10_single(lxb_encoding_decode_t *ctx,
2447
- const lxb_char_t **data, const lxb_char_t *end)
2448
- {
2449
- if (**data < 0x80) {
2450
- return *(*data)++;
2451
- }
2452
-
2453
- return lxb_encoding_single_index_iso_8859_10[*(*data)++ - 0x80].codepoint;
2454
- }
2455
-
2456
- lxb_codepoint_t
2457
- lxb_encoding_decode_iso_8859_13_single(lxb_encoding_decode_t *ctx,
2458
- const lxb_char_t **data, const lxb_char_t *end)
2459
- {
2460
- if (**data < 0x80) {
2461
- return *(*data)++;
2462
- }
2463
-
2464
- return lxb_encoding_single_index_iso_8859_13[*(*data)++ - 0x80].codepoint;
2465
- }
2466
-
2467
- lxb_codepoint_t
2468
- lxb_encoding_decode_iso_8859_14_single(lxb_encoding_decode_t *ctx,
2469
- const lxb_char_t **data, const lxb_char_t *end)
2470
- {
2471
- if (**data < 0x80) {
2472
- return *(*data)++;
2473
- }
2474
-
2475
- return lxb_encoding_single_index_iso_8859_14[*(*data)++ - 0x80].codepoint;
2476
- }
2477
-
2478
- lxb_codepoint_t
2479
- lxb_encoding_decode_iso_8859_15_single(lxb_encoding_decode_t *ctx,
2480
- const lxb_char_t **data, const lxb_char_t *end)
2481
- {
2482
- if (**data < 0x80) {
2483
- return *(*data)++;
2484
- }
2485
-
2486
- return lxb_encoding_single_index_iso_8859_15[*(*data)++ - 0x80].codepoint;
2487
- }
2488
-
2489
- lxb_codepoint_t
2490
- lxb_encoding_decode_iso_8859_16_single(lxb_encoding_decode_t *ctx,
2491
- const lxb_char_t **data, const lxb_char_t *end)
2492
- {
2493
- if (**data < 0x80) {
2494
- return *(*data)++;
2495
- }
2496
-
2497
- return lxb_encoding_single_index_iso_8859_16[*(*data)++ - 0x80].codepoint;
2498
- }
2499
-
2500
- lxb_codepoint_t
2501
- lxb_encoding_decode_iso_8859_2_single(lxb_encoding_decode_t *ctx,
2502
- const lxb_char_t **data, const lxb_char_t *end)
2503
- {
2504
- if (**data < 0x80) {
2505
- return *(*data)++;
2506
- }
2507
-
2508
- return lxb_encoding_single_index_iso_8859_2[*(*data)++ - 0x80].codepoint;
2509
- }
2510
-
2511
- lxb_codepoint_t
2512
- lxb_encoding_decode_iso_8859_3_single(lxb_encoding_decode_t *ctx,
2513
- const lxb_char_t **data, const lxb_char_t *end)
2514
- {
2515
- if (**data < 0x80) {
2516
- return *(*data)++;
2517
- }
2518
-
2519
- return lxb_encoding_single_index_iso_8859_3[*(*data)++ - 0x80].codepoint;
2520
- }
2521
-
2522
- lxb_codepoint_t
2523
- lxb_encoding_decode_iso_8859_4_single(lxb_encoding_decode_t *ctx,
2524
- const lxb_char_t **data, const lxb_char_t *end)
2525
- {
2526
- if (**data < 0x80) {
2527
- return *(*data)++;
2528
- }
2529
-
2530
- return lxb_encoding_single_index_iso_8859_4[*(*data)++ - 0x80].codepoint;
2531
- }
2532
-
2533
- lxb_codepoint_t
2534
- lxb_encoding_decode_iso_8859_5_single(lxb_encoding_decode_t *ctx,
2535
- const lxb_char_t **data, const lxb_char_t *end)
2536
- {
2537
- if (**data < 0x80) {
2538
- return *(*data)++;
2539
- }
2540
-
2541
- return lxb_encoding_single_index_iso_8859_5[*(*data)++ - 0x80].codepoint;
2542
- }
2543
-
2544
- lxb_codepoint_t
2545
- lxb_encoding_decode_iso_8859_6_single(lxb_encoding_decode_t *ctx,
2546
- const lxb_char_t **data, const lxb_char_t *end)
2547
- {
2548
- if (**data < 0x80) {
2549
- return *(*data)++;
2550
- }
2551
-
2552
- return lxb_encoding_single_index_iso_8859_6[*(*data)++ - 0x80].codepoint;
2553
- }
2554
-
2555
- lxb_codepoint_t
2556
- lxb_encoding_decode_iso_8859_7_single(lxb_encoding_decode_t *ctx,
2557
- const lxb_char_t **data, const lxb_char_t *end)
2558
- {
2559
- if (**data < 0x80) {
2560
- return *(*data)++;
2561
- }
2562
-
2563
- return lxb_encoding_single_index_iso_8859_7[*(*data)++ - 0x80].codepoint;
2564
- }
2565
-
2566
- lxb_codepoint_t
2567
- lxb_encoding_decode_iso_8859_8_single(lxb_encoding_decode_t *ctx,
2568
- const lxb_char_t **data, const lxb_char_t *end)
2569
- {
2570
- if (**data < 0x80) {
2571
- return *(*data)++;
2572
- }
2573
-
2574
- return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2575
- }
2576
-
2577
- lxb_codepoint_t
2578
- lxb_encoding_decode_iso_8859_8_i_single(lxb_encoding_decode_t *ctx,
2579
- const lxb_char_t **data, const lxb_char_t *end)
2580
- {
2581
- if (**data < 0x80) {
2582
- return *(*data)++;
2583
- }
2584
-
2585
- return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2586
- }
2587
-
2588
- lxb_codepoint_t
2589
- lxb_encoding_decode_koi8_r_single(lxb_encoding_decode_t *ctx,
2590
- const lxb_char_t **data, const lxb_char_t *end)
2591
- {
2592
- if (**data < 0x80) {
2593
- return *(*data)++;
2594
- }
2595
-
2596
- return lxb_encoding_single_index_koi8_r[*(*data)++ - 0x80].codepoint;
2597
- }
2598
-
2599
- lxb_codepoint_t
2600
- lxb_encoding_decode_koi8_u_single(lxb_encoding_decode_t *ctx,
2601
- const lxb_char_t **data, const lxb_char_t *end)
2602
- {
2603
- if (**data < 0x80) {
2604
- return *(*data)++;
2605
- }
2606
-
2607
- return lxb_encoding_single_index_koi8_u[*(*data)++ - 0x80].codepoint;
2608
- }
2609
-
2610
- lxb_codepoint_t
2611
- lxb_encoding_decode_shift_jis_single(lxb_encoding_decode_t *ctx,
2612
- const lxb_char_t **data, const lxb_char_t *end)
2613
- {
2614
- lxb_char_t byte, lead;
2615
-
2616
- if (ctx->u.lead != 0x00) {
2617
- lead = (lxb_char_t) ctx->u.lead;
2618
- ctx->u.lead = 0x00;
2619
-
2620
- goto lead_state;
2621
- }
2622
-
2623
- lead = *(*data)++;
2624
-
2625
- if (lead <= 0x80) {
2626
- return lead;
2627
- }
2628
-
2629
- if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
2630
- return 0xFF61 - 0xA1 + lead;
2631
- }
2632
-
2633
- if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
2634
- && lead != 0xE0 && lead != 0xFC)
2635
- {
2636
- return LXB_ENCODING_DECODE_ERROR;
2637
- }
2638
-
2639
- if (*data >= end) {
2640
- ctx->u.lead = lead;
2641
-
2642
- return LXB_ENCODING_DECODE_CONTINUE;
2643
- }
2644
-
2645
- lead_state:
2646
-
2647
- byte = *(*data)++;
2648
-
2649
- if (byte < 0x7F) {
2650
- ctx->codepoint = 0x40;
2651
- }
2652
- else {
2653
- ctx->codepoint = 0x41;
2654
- }
2655
-
2656
- if (lead < 0xA0) {
2657
- ctx->second_codepoint = 0x81;
2658
- }
2659
- else {
2660
- ctx->second_codepoint = 0xC1;
2661
- }
2662
-
2663
- if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2664
- || (unsigned) (byte - 0x80) <= (0xFC - 0x80))
2665
- {
2666
- /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
2667
- ctx->codepoint = (lead - ctx->second_codepoint) * 188
2668
- + byte - ctx->codepoint;
2669
-
2670
- if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
2671
- / sizeof(lxb_encoding_multi_index_t)))
2672
- {
2673
- goto failed;
2674
- }
2675
-
2676
- if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
2677
- return 0xE000 - 8836 + ctx->codepoint;
2678
- }
2679
-
2680
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2681
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2682
- goto failed;
2683
- }
2684
-
2685
- return ctx->codepoint;
2686
- }
2687
-
2688
- failed:
2689
-
2690
- if (byte < 0x80) {
2691
- (*data)--;
2692
- }
2693
-
2694
- return LXB_ENCODING_DECODE_ERROR;
2695
- }
2696
-
2697
- lxb_inline lxb_codepoint_t
2698
- lxb_encoding_decode_utf_16_single(lxb_encoding_decode_t *ctx, bool is_be,
2699
- const lxb_char_t **data, const lxb_char_t *end)
2700
- {
2701
- unsigned lead;
2702
- lxb_codepoint_t unit;
2703
-
2704
- if (ctx->u.lead != 0x00) {
2705
- lead = ctx->u.lead - 0x01;
2706
- ctx->u.lead = 0x00;
2707
-
2708
- goto lead_state;
2709
- }
2710
-
2711
- pair_state:
2712
-
2713
- lead = *(*data)++;
2714
-
2715
- if (*data >= end) {
2716
- ctx->u.lead = lead + 0x01;
2717
- return LXB_ENCODING_DECODE_CONTINUE;
2718
- }
2719
-
2720
- lead_state:
2721
-
2722
- /* For UTF-16BE or UTF-16LE */
2723
- if (is_be) {
2724
- unit = (lead << 8) + *(*data)++;
2725
- }
2726
- else {
2727
- unit = (*(*data)++ << 8) + lead;
2728
- }
2729
-
2730
- if (ctx->second_codepoint != 0x00) {
2731
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
2732
- ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
2733
- + (unit - 0xDC00);
2734
-
2735
- ctx->second_codepoint = 0x00;
2736
- return ctx->codepoint;
2737
- }
2738
-
2739
- (*data)--;
2740
-
2741
- ctx->u.lead = lead + 0x01;
2742
- ctx->second_codepoint = 0x00;
2743
-
2744
- return LXB_ENCODING_DECODE_ERROR;
2745
- }
2746
-
2747
- /* Surrogate pair */
2748
- if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
2749
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
2750
- return LXB_ENCODING_DECODE_ERROR;
2751
- }
2752
-
2753
- ctx->second_codepoint = unit;
2754
-
2755
- if (*data >= end) {
2756
- return LXB_ENCODING_DECODE_CONTINUE;
2757
- }
2758
-
2759
- goto pair_state;
2760
- }
2761
-
2762
- return unit;
2763
- }
2764
-
2765
- lxb_codepoint_t
2766
- lxb_encoding_decode_utf_16be_single(lxb_encoding_decode_t *ctx,
2767
- const lxb_char_t **data, const lxb_char_t *end)
2768
- {
2769
- return lxb_encoding_decode_utf_16_single(ctx, true, data, end);
2770
- }
2771
-
2772
- lxb_codepoint_t
2773
- lxb_encoding_decode_utf_16le_single(lxb_encoding_decode_t *ctx,
2774
- const lxb_char_t **data, const lxb_char_t *end)
2775
- {
2776
- return lxb_encoding_decode_utf_16_single(ctx, false, data, end);
2777
- }
2778
-
2779
- lxb_codepoint_t
2780
- lxb_encoding_decode_utf_8_single(lxb_encoding_decode_t *ctx,
2781
- const lxb_char_t **data, const lxb_char_t *end)
2782
- {
2783
- unsigned needed;
2784
- lxb_char_t ch;
2785
- const lxb_char_t *p;
2786
-
2787
- if (ctx->u.utf_8.need != 0) {
2788
- needed = ctx->u.utf_8.need;
2789
- ctx->u.utf_8.need = 0;
2790
-
2791
- if (ctx->u.utf_8.lower != 0x00) {
2792
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(ctx->u.utf_8.lower,
2793
- ctx->u.utf_8.upper);
2794
- ctx->u.utf_8.lower = 0x00;
2795
- }
2796
-
2797
- goto decode;
2798
- }
2799
-
2800
- ch = *(*data)++;
2801
-
2802
- if (ch < 0x80) {
2803
- return ch;
2804
- }
2805
- else if (ch <= 0xDF) {
2806
- if (ch < 0xC2) {
2807
- return LXB_ENCODING_DECODE_ERROR;
2808
- }
2809
-
2810
- needed = 1;
2811
- ctx->codepoint = ch & 0x1F;
2812
- }
2813
- else if (ch < 0xF0) {
2814
- needed = 2;
2815
- ctx->codepoint = ch & 0x0F;
2816
-
2817
- if (*data == end) {
2818
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xE0, 0xED,
2819
- 0xA0, 0x9F);
2820
- goto next;
2821
- }
2822
-
2823
- if (ch == 0xE0) {
2824
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0xA0, 0xBF);
2825
- }
2826
- else if (ch == 0xED) {
2827
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x9F);
2828
- }
2829
- }
2830
- else if (ch < 0xF5) {
2831
- needed = 3;
2832
- ctx->codepoint = ch & 0x07;
2833
-
2834
- if (*data == end) {
2835
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xF0, 0xF4,
2836
- 0x90, 0x8F);
2837
-
2838
- goto next;
2839
- }
2840
-
2841
- if (ch == 0xF0) {
2842
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x90, 0xBF);
2843
- }
2844
- else if (ch == 0xF4) {
2845
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x8F);
2846
- }
2847
- }
2848
- else {
2849
- return LXB_ENCODING_DECODE_ERROR;
2850
- }
2851
-
2852
- decode:
2853
-
2854
- for (p = *data; p < end; p++) {
2855
- ch = *p;
2856
-
2857
- if (ch < 0x80 || ch > 0xBF) {
2858
- *data = p;
2859
-
2860
- goto failed;
2861
- }
2862
-
2863
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
2864
-
2865
- if (--needed == 0) {
2866
- *data = p + 1;
2867
-
2868
- return ctx->codepoint;
2869
- }
2870
- }
2871
-
2872
- *data = p;
2873
-
2874
- next:
2875
-
2876
- ctx->u.utf_8.need = needed;
2877
-
2878
- return LXB_ENCODING_DECODE_CONTINUE;
2879
-
2880
- failed:
2881
-
2882
- ctx->u.utf_8.lower = 0x00;
2883
- ctx->u.utf_8.need = 0;
2884
-
2885
- return LXB_ENCODING_DECODE_ERROR;
2886
- }
2887
-
2888
- lxb_codepoint_t
2889
- lxb_encoding_decode_gb18030_single(lxb_encoding_decode_t *ctx,
2890
- const lxb_char_t **data, const lxb_char_t *end)
2891
- {
2892
- uint32_t pointer;
2893
- lxb_char_t first, second, third, offset;
2894
-
2895
- /* Make compiler happy */
2896
- second = 0x00;
2897
-
2898
- if (ctx->u.gb18030.first != 0) {
2899
- if (ctx->u.gb18030.third != 0x00) {
2900
- first = ctx->u.gb18030.first;
2901
- second = ctx->u.gb18030.second;
2902
- third = ctx->u.gb18030.third;
2903
-
2904
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
2905
-
2906
- if (ctx->prepend) {
2907
- /* The first is always < 0x80 */
2908
- ctx->u.gb18030.first = third;
2909
-
2910
- return second;
2911
- }
2912
-
2913
- goto third_state;
2914
- }
2915
- else if (ctx->u.gb18030.second != 0x00) {
2916
- first = ctx->u.gb18030.first;
2917
- second = ctx->u.gb18030.second;
2918
-
2919
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
2920
-
2921
- goto second_state;
2922
- }
2923
-
2924
- first = ctx->u.gb18030.first;
2925
- ctx->u.gb18030.first = 0x00;
2926
-
2927
- if (ctx->prepend) {
2928
- ctx->prepend = false;
2929
- goto prepend_first;
2930
- }
2931
-
2932
- goto first_state;
2933
- }
2934
-
2935
- first = *(*data)++;
2936
-
2937
- prepend_first:
2938
-
2939
- if (first < 0x80) {
2940
- return first;
2941
- }
2942
-
2943
- if (first == 0x80) {
2944
- return 0x20AC;
2945
- }
2946
-
2947
- /* Range 0x81 to 0xFE, inclusive */
2948
- if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
2949
- return LXB_ENCODING_DECODE_ERROR;
2950
- }
2951
-
2952
- if (*data == end) {
2953
- ctx->u.gb18030.first = first;
2954
- return LXB_ENCODING_DECODE_CONTINUE;
2955
- }
2956
-
2957
- /* First */
2958
- first_state:
2959
-
2960
- second = *(*data)++;
2961
-
2962
- /* Range 0x30 to 0x39, inclusive */
2963
- if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
2964
- offset = (second < 0x7F) ? 0x40 : 0x41;
2965
-
2966
- /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
2967
- if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
2968
- || (unsigned) (second - 0x80) <= (0xFE - 0x80))
2969
- {
2970
- pointer = (first - 0x81) * 190 + (second - offset);
2971
- }
2972
- else {
2973
- goto failed;
2974
- }
2975
-
2976
- /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2977
- ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
2978
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2979
- goto failed;
2980
- }
2981
-
2982
- return ctx->codepoint;
2983
- }
2984
-
2985
- if (*data == end) {
2986
- ctx->u.gb18030.first = first;
2987
- ctx->u.gb18030.second = second;
2988
-
2989
- return LXB_ENCODING_DECODE_CONTINUE;
2990
- }
2991
-
2992
- /* Second */
2993
- second_state:
2994
-
2995
- third = *(*data)++;
2996
-
2997
- /* Range 0x81 to 0xFE, inclusive */
2998
- if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
2999
- (*data)--;
3000
-
3001
- ctx->prepend = true;
3002
- ctx->u.gb18030.first = second;
3003
-
3004
- return LXB_ENCODING_DECODE_ERROR;
3005
- }
3006
-
3007
- if (*data == end) {
3008
- ctx->u.gb18030.first = first;
3009
- ctx->u.gb18030.second = second;
3010
- ctx->u.gb18030.third = third;
3011
-
3012
- return LXB_ENCODING_DECODE_CONTINUE;
3013
- }
3014
-
3015
- /* Third */
3016
- third_state:
3017
-
3018
- /* Range 0x30 to 0x39, inclusive */
3019
- if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
3020
- ctx->prepend = true;
3021
-
3022
- /* First is a fake for trigger */
3023
- ctx->u.gb18030.first = 0x01;
3024
- ctx->u.gb18030.second = second;
3025
- ctx->u.gb18030.third = third;
3026
-
3027
- return LXB_ENCODING_DECODE_ERROR;
3028
- }
3029
-
3030
- pointer = ((first - 0x81) * (10 * 126 * 10))
3031
- + ((second - 0x30) * (10 * 126))
3032
- + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
3033
-
3034
- return lxb_encoding_decode_gb18030_range(pointer);
3035
-
3036
- failed:
3037
-
3038
- if (second < 0x80) {
3039
- (*data)--;
3040
- }
3041
-
3042
- return LXB_ENCODING_DECODE_ERROR;
3043
- }
3044
-
3045
- lxb_codepoint_t
3046
- lxb_encoding_decode_macintosh_single(lxb_encoding_decode_t *ctx,
3047
- const lxb_char_t **data, const lxb_char_t *end)
3048
- {
3049
- if (**data < 0x80) {
3050
- return *(*data)++;
3051
- }
3052
-
3053
- return lxb_encoding_single_index_macintosh[*(*data)++ - 0x80].codepoint;
3054
- }
3055
-
3056
- lxb_codepoint_t
3057
- lxb_encoding_decode_replacement_single(lxb_encoding_decode_t *ctx,
3058
- const lxb_char_t **data, const lxb_char_t *end)
3059
- {
3060
- return LXB_ENCODING_DECODE_ERROR;
3061
- }
3062
-
3063
- lxb_codepoint_t
3064
- lxb_encoding_decode_windows_1250_single(lxb_encoding_decode_t *ctx,
3065
- const lxb_char_t **data, const lxb_char_t *end)
3066
- {
3067
- if (**data < 0x80) {
3068
- return *(*data)++;
3069
- }
3070
-
3071
- return lxb_encoding_single_index_windows_1250[*(*data)++ - 0x80].codepoint;
3072
- }
3073
-
3074
- lxb_codepoint_t
3075
- lxb_encoding_decode_windows_1251_single(lxb_encoding_decode_t *ctx,
3076
- const lxb_char_t **data, const lxb_char_t *end)
3077
- {
3078
- if (**data < 0x80) {
3079
- return *(*data)++;
3080
- }
3081
-
3082
- return lxb_encoding_single_index_windows_1251[*(*data)++ - 0x80].codepoint;
3083
- }
3084
-
3085
- lxb_codepoint_t
3086
- lxb_encoding_decode_windows_1252_single(lxb_encoding_decode_t *ctx,
3087
- const lxb_char_t **data, const lxb_char_t *end)
3088
- {
3089
- if (**data < 0x80) {
3090
- return *(*data)++;
3091
- }
3092
-
3093
- return lxb_encoding_single_index_windows_1252[*(*data)++ - 0x80].codepoint;
3094
- }
3095
-
3096
- lxb_codepoint_t
3097
- lxb_encoding_decode_windows_1253_single(lxb_encoding_decode_t *ctx,
3098
- const lxb_char_t **data, const lxb_char_t *end)
3099
- {
3100
- if (**data < 0x80) {
3101
- return *(*data)++;
3102
- }
3103
-
3104
- return lxb_encoding_single_index_windows_1253[*(*data)++ - 0x80].codepoint;
3105
- }
3106
-
3107
- lxb_codepoint_t
3108
- lxb_encoding_decode_windows_1254_single(lxb_encoding_decode_t *ctx,
3109
- const lxb_char_t **data, const lxb_char_t *end)
3110
- {
3111
- if (**data < 0x80) {
3112
- return *(*data)++;
3113
- }
3114
-
3115
- return lxb_encoding_single_index_windows_1254[*(*data)++ - 0x80].codepoint;
3116
- }
3117
-
3118
- lxb_codepoint_t
3119
- lxb_encoding_decode_windows_1255_single(lxb_encoding_decode_t *ctx,
3120
- const lxb_char_t **data, const lxb_char_t *end)
3121
- {
3122
- if (**data < 0x80) {
3123
- return *(*data)++;
3124
- }
3125
-
3126
- return lxb_encoding_single_index_windows_1255[*(*data)++ - 0x80].codepoint;
3127
- }
3128
-
3129
- lxb_codepoint_t
3130
- lxb_encoding_decode_windows_1256_single(lxb_encoding_decode_t *ctx,
3131
- const lxb_char_t **data, const lxb_char_t *end)
3132
- {
3133
- if (**data < 0x80) {
3134
- return *(*data)++;
3135
- }
3136
-
3137
- return lxb_encoding_single_index_windows_1256[*(*data)++ - 0x80].codepoint;
3138
- }
3139
-
3140
- lxb_codepoint_t
3141
- lxb_encoding_decode_windows_1257_single(lxb_encoding_decode_t *ctx,
3142
- const lxb_char_t **data, const lxb_char_t *end)
3143
- {
3144
- if (**data < 0x80) {
3145
- return *(*data)++;
3146
- }
3147
-
3148
- return lxb_encoding_single_index_windows_1257[*(*data)++ - 0x80].codepoint;
3149
- }
3150
-
3151
- lxb_codepoint_t
3152
- lxb_encoding_decode_windows_1258_single(lxb_encoding_decode_t *ctx,
3153
- const lxb_char_t **data, const lxb_char_t *end)
3154
- {
3155
- if (**data < 0x80) {
3156
- return *(*data)++;
3157
- }
3158
-
3159
- return lxb_encoding_single_index_windows_1258[*(*data)++ - 0x80].codepoint;
3160
- }
3161
-
3162
- lxb_codepoint_t
3163
- lxb_encoding_decode_windows_874_single(lxb_encoding_decode_t *ctx,
3164
- const lxb_char_t **data, const lxb_char_t *end)
3165
- {
3166
- if (**data < 0x80) {
3167
- return *(*data)++;
3168
- }
3169
-
3170
- return lxb_encoding_single_index_windows_874[*(*data)++ - 0x80].codepoint;
3171
- }
3172
-
3173
- lxb_codepoint_t
3174
- lxb_encoding_decode_x_mac_cyrillic_single(lxb_encoding_decode_t *ctx,
3175
- const lxb_char_t **data, const lxb_char_t *end)
3176
- {
3177
- if (**data < 0x80) {
3178
- return *(*data)++;
3179
- }
3180
-
3181
- return lxb_encoding_single_index_x_mac_cyrillic[*(*data)++ - 0x80].codepoint;
3182
- }
3183
-
3184
- lxb_codepoint_t
3185
- lxb_encoding_decode_x_user_defined_single(lxb_encoding_decode_t *ctx,
3186
- const lxb_char_t **data, const lxb_char_t *end)
3187
- {
3188
- if (**data < 0x80) {
3189
- return *(*data)++;
3190
- }
3191
-
3192
- return 0xF780 + (*(*data)++) - 0x80;
3193
- }