nokolexbor 0.3.3 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/ext/nokolexbor/nl_attribute.c +201 -0
  3. data/ext/nokolexbor/nl_cdata.c +8 -0
  4. data/ext/nokolexbor/nl_comment.c +6 -0
  5. data/ext/nokolexbor/nl_document.c +53 -7
  6. data/ext/nokolexbor/nl_document_fragment.c +9 -0
  7. data/ext/nokolexbor/nl_error.c +21 -19
  8. data/ext/nokolexbor/nl_node.c +317 -48
  9. data/ext/nokolexbor/nl_node_set.c +56 -1
  10. data/ext/nokolexbor/nl_processing_instruction.c +6 -0
  11. data/ext/nokolexbor/nl_text.c +6 -0
  12. data/ext/nokolexbor/nokolexbor.c +1 -0
  13. data/ext/nokolexbor/nokolexbor.h +2 -0
  14. data/lib/nokolexbor/document.rb +52 -5
  15. data/lib/nokolexbor/document_fragment.rb +11 -0
  16. data/lib/nokolexbor/node.rb +370 -24
  17. data/lib/nokolexbor/node_set.rb +56 -0
  18. data/lib/nokolexbor/version.rb +1 -1
  19. data/lib/nokolexbor.rb +0 -1
  20. metadata +3 -25
  21. data/lib/nokolexbor/attribute.rb +0 -18
  22. data/vendor/lexbor/source/lexbor/encoding/base.h +0 -218
  23. data/vendor/lexbor/source/lexbor/encoding/big5.c +0 -42839
  24. data/vendor/lexbor/source/lexbor/encoding/config.cmake +0 -12
  25. data/vendor/lexbor/source/lexbor/encoding/const.h +0 -65
  26. data/vendor/lexbor/source/lexbor/encoding/decode.c +0 -3193
  27. data/vendor/lexbor/source/lexbor/encoding/decode.h +0 -370
  28. data/vendor/lexbor/source/lexbor/encoding/encode.c +0 -1931
  29. data/vendor/lexbor/source/lexbor/encoding/encode.h +0 -377
  30. data/vendor/lexbor/source/lexbor/encoding/encoding.c +0 -252
  31. data/vendor/lexbor/source/lexbor/encoding/encoding.h +0 -475
  32. data/vendor/lexbor/source/lexbor/encoding/euc_kr.c +0 -53883
  33. data/vendor/lexbor/source/lexbor/encoding/gb18030.c +0 -47905
  34. data/vendor/lexbor/source/lexbor/encoding/iso_2022_jp_katakana.c +0 -159
  35. data/vendor/lexbor/source/lexbor/encoding/jis0208.c +0 -22477
  36. data/vendor/lexbor/source/lexbor/encoding/jis0212.c +0 -15787
  37. data/vendor/lexbor/source/lexbor/encoding/multi.h +0 -53
  38. data/vendor/lexbor/source/lexbor/encoding/range.c +0 -71
  39. data/vendor/lexbor/source/lexbor/encoding/range.h +0 -34
  40. data/vendor/lexbor/source/lexbor/encoding/res.c +0 -222
  41. data/vendor/lexbor/source/lexbor/encoding/res.h +0 -34
  42. data/vendor/lexbor/source/lexbor/encoding/single.c +0 -13748
  43. data/vendor/lexbor/source/lexbor/encoding/single.h +0 -116
@@ -1,3193 +0,0 @@
1
- /*
2
- * Copyright (C) 2019 Alexander Borisov
3
- *
4
- * Author: Alexander Borisov <borisov@lexbor.com>
5
- */
6
-
7
- #include "lexbor/encoding/decode.h"
8
- #include "lexbor/encoding/single.h"
9
- #include "lexbor/encoding/multi.h"
10
- #include "lexbor/encoding/range.h"
11
-
12
-
13
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY(_lower, _upper, _cont) \
14
- { \
15
- ch = *p; \
16
- \
17
- if (ch < _lower || ch > _upper) { \
18
- ctx->u.utf_8.lower = 0x00; \
19
- ctx->u.utf_8.need = 0; \
20
- \
21
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
22
- *data = p; \
23
- ctx->have_error = true; \
24
- } \
25
- LXB_ENCODING_DECODE_ERROR_END(); \
26
- \
27
- _cont; \
28
- } \
29
- else { \
30
- p++; \
31
- need--; \
32
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F); \
33
- } \
34
- }
35
-
36
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(first, two, f_lower, s_upper) \
37
- do { \
38
- if (ch == first) { \
39
- ctx->u.utf_8.lower = f_lower; \
40
- ctx->u.utf_8.upper = 0xBF; \
41
- } \
42
- else if (ch == two) { \
43
- ctx->u.utf_8.lower = 0x80; \
44
- ctx->u.utf_8.upper = s_upper; \
45
- } \
46
- } \
47
- while (0)
48
-
49
- #define LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, cp) \
50
- do { \
51
- (ctx)->buffer_out[(ctx)->buffer_used++] = (cp); \
52
- } \
53
- while (0)
54
-
55
- #define LXB_ENCODING_DECODE_APPEND(ctx, cp) \
56
- do { \
57
- if ((ctx)->buffer_used >= (ctx)->buffer_length) { \
58
- return LXB_STATUS_SMALL_BUFFER; \
59
- } \
60
- \
61
- (ctx)->buffer_out[(ctx)->buffer_used++] = (cp); \
62
- } \
63
- while (0)
64
-
65
- #define LXB_ENCODING_DECODE_APPEND_P(ctx, cp) \
66
- do { \
67
- if ((ctx)->buffer_used >= (ctx)->buffer_length) { \
68
- *data = p; \
69
- return LXB_STATUS_SMALL_BUFFER; \
70
- } \
71
- \
72
- (ctx)->buffer_out[(ctx)->buffer_used++] = (cp); \
73
- } \
74
- while (0)
75
-
76
- #define LXB_ENCODING_DECODE_CHECK_OUT(ctx) \
77
- do { \
78
- if ((ctx)->buffer_used >= (ctx)->buffer_length) { \
79
- return LXB_STATUS_SMALL_BUFFER; \
80
- } \
81
- } \
82
- while (0)
83
-
84
- #define LXB_ENCODING_DECODE_ERROR_BEGIN \
85
- do { \
86
- if (ctx->replace_to == NULL) { \
87
- return LXB_STATUS_ERROR; \
88
- } \
89
- \
90
- if ((ctx->buffer_used + ctx->replace_len) > ctx->buffer_length) { \
91
- do
92
-
93
- #define LXB_ENCODING_DECODE_ERROR_END() \
94
- while (0); \
95
- \
96
- return LXB_STATUS_SMALL_BUFFER; \
97
- } \
98
- \
99
- memcpy(&ctx->buffer_out[ctx->buffer_used], ctx->replace_to, \
100
- sizeof(lxb_codepoint_t) * ctx->replace_len); \
101
- \
102
- ctx->buffer_used += ctx->replace_len; \
103
- } \
104
- while (0)
105
-
106
- #define LXB_ENCODING_DECODE_ERROR(ctx) \
107
- do { \
108
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
109
- } LXB_ENCODING_DECODE_ERROR_END(); \
110
- } \
111
- while (0)
112
-
113
- #define LXB_ENCODING_DECODE_FAILED(ident) \
114
- do { \
115
- if ((byte) < (0x80)) { \
116
- (*data)--; \
117
- } \
118
- \
119
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
120
- ctx->have_error = true; \
121
- (ident) = 0x01; \
122
- } \
123
- LXB_ENCODING_DECODE_ERROR_END(); \
124
- } \
125
- while (0)
126
-
127
- #define LXB_ENCODING_DECODE_SINGLE(decode_map) \
128
- do { \
129
- const lxb_char_t *p = *data; \
130
- \
131
- while (p < end) { \
132
- if (*p < 0x80) { \
133
- LXB_ENCODING_DECODE_APPEND_P(ctx, *p++); \
134
- } \
135
- else { \
136
- ctx->codepoint = decode_map[(*p++) - 0x80].codepoint; \
137
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) { \
138
- LXB_ENCODING_DECODE_ERROR_BEGIN { \
139
- *data = p - 1; \
140
- } \
141
- LXB_ENCODING_DECODE_ERROR_END(); \
142
- continue; \
143
- } \
144
- \
145
- LXB_ENCODING_DECODE_APPEND_P(ctx, ctx->codepoint); \
146
- } \
147
- \
148
- *data = p; \
149
- } \
150
- } \
151
- while (0)
152
-
153
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(lower, upper) \
154
- do { \
155
- ch = **data; \
156
- \
157
- if (ch < lower || ch > upper) { \
158
- goto failed; \
159
- } \
160
- \
161
- (*data)++; \
162
- needed--; \
163
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F); \
164
- } \
165
- while (0)
166
-
167
- #define LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(first, two, f_lower, \
168
- s_upper) \
169
- do { \
170
- if (ch == first) { \
171
- ctx->u.utf_8.lower = f_lower; \
172
- ctx->u.utf_8.upper = 0xBF; \
173
- } \
174
- else if (ch == two) { \
175
- ctx->u.utf_8.lower = 0x80; \
176
- ctx->u.utf_8.upper = s_upper; \
177
- } \
178
- } \
179
- while (0)
180
-
181
-
182
- lxb_status_t
183
- lxb_encoding_decode_default(lxb_encoding_decode_t *ctx,
184
- const lxb_char_t **data, const lxb_char_t *end)
185
- {
186
- return lxb_encoding_decode_utf_8(ctx, data, end);
187
- }
188
-
189
- lxb_status_t
190
- lxb_encoding_decode_auto(lxb_encoding_decode_t *ctx,
191
- const lxb_char_t **data, const lxb_char_t *end)
192
- {
193
- *data = end;
194
- return LXB_STATUS_ERROR;
195
- }
196
-
197
- lxb_status_t
198
- lxb_encoding_decode_undefined(lxb_encoding_decode_t *ctx,
199
- const lxb_char_t **data, const lxb_char_t *end)
200
- {
201
- *data = end;
202
- return LXB_STATUS_ERROR;
203
- }
204
-
205
- lxb_status_t
206
- lxb_encoding_decode_big5(lxb_encoding_decode_t *ctx,
207
- const lxb_char_t **data, const lxb_char_t *end)
208
- {
209
- uint32_t index;
210
- lxb_char_t lead, byte;
211
-
212
- ctx->status = LXB_STATUS_OK;
213
-
214
- if (ctx->u.lead != 0x00) {
215
- if (ctx->have_error) {
216
- ctx->u.lead = 0x00;
217
- ctx->have_error = false;
218
-
219
- LXB_ENCODING_DECODE_ERROR_BEGIN {
220
- ctx->u.lead = 0x01;
221
- ctx->have_error = true;
222
- } LXB_ENCODING_DECODE_ERROR_END();
223
- }
224
- else if (ctx->second_codepoint != 0x0000) {
225
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
226
- return LXB_STATUS_SMALL_BUFFER;
227
- }
228
-
229
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->u.lead);
230
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->second_codepoint);
231
-
232
- ctx->u.lead = 0x00;
233
- ctx->second_codepoint = 0x0000;
234
- }
235
- else {
236
- if (*data >= end) {
237
- ctx->status = LXB_STATUS_CONTINUE;
238
-
239
- return LXB_STATUS_CONTINUE;
240
- }
241
-
242
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
243
-
244
- lead = (lxb_char_t) ctx->u.lead;
245
- ctx->u.lead = 0x00;
246
-
247
- goto lead_state;
248
- }
249
- }
250
-
251
- while (*data < end) {
252
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
253
-
254
- lead = *(*data)++;
255
-
256
- if (lead < 0x80) {
257
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
258
- continue;
259
- }
260
-
261
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
262
- LXB_ENCODING_DECODE_ERROR_BEGIN {
263
- (*data)--;
264
- }
265
- LXB_ENCODING_DECODE_ERROR_END();
266
-
267
- continue;
268
- }
269
-
270
- if (*data >= end) {
271
- ctx->u.lead = lead;
272
- ctx->status = LXB_STATUS_CONTINUE;
273
-
274
- return LXB_STATUS_CONTINUE;
275
- }
276
-
277
- lead_state:
278
-
279
- index = 0;
280
- byte = *(*data)++;
281
-
282
- if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
283
- || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
284
- {
285
- if (byte < 0x7F) {
286
- /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
287
- index = (lead - 0x81) * 157 + (byte - 0x40);
288
- }
289
- else {
290
- /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
291
- index = (lead - 0x81) * 157 + (byte - 0x62);
292
- }
293
- }
294
-
295
- /*
296
- * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
297
- * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
298
- * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
299
- * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
300
- */
301
- switch (index) {
302
- case 1133:
303
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
304
- ctx->u.lead = 0x00CA;
305
- ctx->second_codepoint = 0x0304;
306
-
307
- return LXB_STATUS_SMALL_BUFFER;
308
- }
309
-
310
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
311
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);
312
-
313
- continue;
314
-
315
- case 1135:
316
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
317
- ctx->u.lead = 0x00CA;
318
- ctx->second_codepoint = 0x030C;
319
-
320
- return LXB_STATUS_SMALL_BUFFER;
321
- }
322
-
323
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00CA);
324
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);
325
-
326
- continue;
327
-
328
- case 1164:
329
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
330
- ctx->u.lead = 0x00EA;
331
- ctx->second_codepoint = 0x0304;
332
-
333
- return LXB_STATUS_SMALL_BUFFER;
334
- }
335
-
336
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
337
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x0304);
338
-
339
- continue;
340
-
341
- case 1166:
342
- if ((ctx->buffer_used + 2) > ctx->buffer_length) {
343
- ctx->u.lead = 0x00EA;
344
- ctx->second_codepoint = 0x030C;
345
-
346
- return LXB_STATUS_SMALL_BUFFER;
347
- }
348
-
349
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00EA);
350
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x030C);
351
-
352
- continue;
353
-
354
- case 0:
355
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
356
- continue;
357
- }
358
-
359
- ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
360
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
361
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
362
- continue;
363
- }
364
-
365
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
366
- }
367
-
368
- return LXB_STATUS_OK;
369
- }
370
-
371
- lxb_status_t
372
- lxb_encoding_decode_euc_jp(lxb_encoding_decode_t *ctx,
373
- const lxb_char_t **data, const lxb_char_t *end)
374
- {
375
- bool is_jis0212;
376
- lxb_char_t byte, lead;
377
-
378
- ctx->status = LXB_STATUS_OK;
379
-
380
- if (ctx->u.euc_jp.lead != 0x00) {
381
- if (ctx->have_error) {
382
- ctx->have_error = false;
383
- ctx->u.euc_jp.lead = 0x00;
384
-
385
- LXB_ENCODING_DECODE_ERROR_BEGIN {
386
- ctx->have_error = true;
387
- ctx->u.euc_jp.lead = 0x01;
388
- } LXB_ENCODING_DECODE_ERROR_END();
389
- }
390
- else {
391
- if (*data >= end) {
392
- ctx->status = LXB_STATUS_CONTINUE;
393
-
394
- return LXB_STATUS_CONTINUE;
395
- }
396
-
397
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
398
-
399
- lead = ctx->u.euc_jp.lead;
400
- byte = *(*data)++;
401
-
402
- ctx->u.euc_jp.lead = 0x00;
403
-
404
- if (ctx->u.euc_jp.is_jis0212) {
405
- is_jis0212 = true;
406
- ctx->u.euc_jp.is_jis0212 = false;
407
-
408
- goto lead_jis_state;
409
- }
410
-
411
- goto lead_state;
412
- }
413
- }
414
-
415
- while (*data < end) {
416
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
417
-
418
- lead = *(*data)++;
419
-
420
- if (lead < 0x80) {
421
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
422
- continue;
423
- }
424
-
425
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
426
- && (lead != 0x8E && lead != 0x8F))
427
- {
428
- LXB_ENCODING_DECODE_ERROR_BEGIN {
429
- (*data)--;
430
- }
431
- LXB_ENCODING_DECODE_ERROR_END();
432
-
433
- continue;
434
- }
435
-
436
- if (*data >= end) {
437
- ctx->u.euc_jp.lead = lead;
438
- ctx->status = LXB_STATUS_CONTINUE;
439
-
440
- return LXB_STATUS_CONTINUE;
441
- }
442
-
443
- byte = *(*data)++;
444
-
445
- lead_state:
446
-
447
- if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
448
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + byte);
449
- continue;
450
- }
451
-
452
- is_jis0212 = false;
453
-
454
- if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
455
- if (*data >= end) {
456
- ctx->u.euc_jp.lead = byte;
457
- ctx->u.euc_jp.is_jis0212 = true;
458
-
459
- ctx->status = LXB_STATUS_CONTINUE;
460
-
461
- return LXB_STATUS_CONTINUE;
462
- }
463
-
464
- lead = byte;
465
- byte = *(*data)++;
466
- is_jis0212 = true;
467
- }
468
-
469
- lead_jis_state:
470
-
471
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
472
- || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
473
- {
474
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
475
- continue;
476
- }
477
-
478
- /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
479
- ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
480
-
481
- if (is_jis0212) {
482
- if ((sizeof(lxb_encoding_multi_index_jis0212)
483
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
484
- {
485
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
486
- continue;
487
- }
488
-
489
- ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
490
- }
491
- else {
492
- if ((sizeof(lxb_encoding_multi_index_jis0208)
493
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
494
- {
495
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
496
- continue;
497
- }
498
-
499
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
500
- }
501
-
502
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
503
- LXB_ENCODING_DECODE_FAILED(ctx->u.euc_jp.lead);
504
- continue;
505
- }
506
-
507
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
508
- }
509
-
510
- return LXB_STATUS_OK;
511
- }
512
-
513
- lxb_status_t
514
- lxb_encoding_decode_euc_kr(lxb_encoding_decode_t *ctx,
515
- const lxb_char_t **data, const lxb_char_t *end)
516
- {
517
- lxb_char_t lead, byte;
518
-
519
- ctx->status = LXB_STATUS_OK;
520
-
521
- if (ctx->u.lead != 0x00) {
522
- if (ctx->have_error) {
523
- ctx->have_error = false;
524
- ctx->u.lead = 0x00;
525
-
526
- LXB_ENCODING_DECODE_ERROR_BEGIN {
527
- ctx->have_error = true;
528
- ctx->u.lead = 0x01;
529
- } LXB_ENCODING_DECODE_ERROR_END();
530
- }
531
- else {
532
- if (*data >= end) {
533
- ctx->status = LXB_STATUS_CONTINUE;
534
-
535
- return LXB_STATUS_CONTINUE;
536
- }
537
-
538
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
539
-
540
- lead = (lxb_char_t) ctx->u.lead;
541
- ctx->u.lead = 0x00;
542
-
543
- goto lead_state;
544
- }
545
- }
546
-
547
- while (*data < end) {
548
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
549
-
550
- lead = *(*data)++;
551
-
552
- if (lead < 0x80) {
553
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
554
- continue;
555
- }
556
-
557
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
558
- LXB_ENCODING_DECODE_ERROR_BEGIN {
559
- (*data)--;
560
- }
561
- LXB_ENCODING_DECODE_ERROR_END();
562
-
563
- continue;
564
- }
565
-
566
- if (*data == end) {
567
- ctx->u.lead = lead;
568
- ctx->status = LXB_STATUS_CONTINUE;
569
-
570
- return LXB_STATUS_CONTINUE;
571
- }
572
-
573
- lead_state:
574
-
575
- byte = *(*data)++;
576
-
577
- if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
578
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
579
- continue;
580
- }
581
-
582
- /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
583
- ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
584
-
585
- if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
586
- / sizeof(lxb_encoding_multi_index_t))
587
- {
588
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
589
- continue;
590
- }
591
-
592
- ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
593
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
594
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
595
- continue;
596
- }
597
-
598
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
599
- }
600
-
601
- return LXB_STATUS_OK;
602
- }
603
-
604
- lxb_status_t
605
- lxb_encoding_decode_gbk(lxb_encoding_decode_t *ctx,
606
- const lxb_char_t **data, const lxb_char_t *end)
607
- {
608
- return lxb_encoding_decode_gb18030(ctx, data, end);
609
- }
610
-
611
- lxb_status_t
612
- lxb_encoding_decode_ibm866(lxb_encoding_decode_t *ctx,
613
- const lxb_char_t **data, const lxb_char_t *end)
614
- {
615
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_ibm866);
616
-
617
- return LXB_STATUS_OK;
618
- }
619
-
620
- lxb_status_t
621
- lxb_encoding_decode_iso_2022_jp(lxb_encoding_decode_t *ctx,
622
- const lxb_char_t **data, const lxb_char_t *end)
623
- {
624
- #define LXB_ENCODING_DECODE_ISO_2022_JP_OK() \
625
- do { \
626
- if (*data >= end) { \
627
- return LXB_STATUS_OK; \
628
- } \
629
- } \
630
- while (0)
631
-
632
- #define LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE() \
633
- do { \
634
- if (*data >= end) { \
635
- ctx->status = LXB_STATUS_CONTINUE; \
636
- return LXB_STATUS_CONTINUE; \
637
- } \
638
- } \
639
- while (0)
640
-
641
-
642
- lxb_char_t byte;
643
- lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
644
-
645
- ctx->status = LXB_STATUS_OK;
646
-
647
- if (ctx->have_error) {
648
- ctx->have_error = false;
649
-
650
- LXB_ENCODING_DECODE_ERROR_BEGIN {
651
- ctx->have_error = true;
652
- }
653
- LXB_ENCODING_DECODE_ERROR_END();
654
- }
655
-
656
- if (iso->prepand != 0x00) {
657
- if (*data >= end) {
658
- ctx->status = LXB_STATUS_CONTINUE;
659
-
660
- return LXB_STATUS_CONTINUE;
661
- }
662
-
663
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
664
-
665
- byte = iso->prepand;
666
- iso->prepand = 0x00;
667
-
668
- goto prepand;
669
- }
670
-
671
- if (*data >= end) {
672
- return LXB_STATUS_OK;
673
- }
674
-
675
- do {
676
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
677
-
678
- byte = *(*data)++;
679
-
680
- prepand:
681
-
682
- switch (iso->state) {
683
- case LXB_ENCODING_DECODE_2022_JP_ASCII:
684
- if (byte == 0x1B) {
685
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
686
-
687
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
688
- break;
689
- }
690
-
691
- /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
692
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
693
- && byte != 0x0E && byte != 0x0F)
694
- {
695
- iso->out_flag = false;
696
-
697
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
698
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
699
- break;
700
- }
701
-
702
- iso->out_flag = false;
703
-
704
- LXB_ENCODING_DECODE_ERROR_BEGIN {
705
- ctx->have_error = true;
706
- }
707
- LXB_ENCODING_DECODE_ERROR_END();
708
-
709
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
710
- break;
711
-
712
- case LXB_ENCODING_DECODE_2022_JP_ROMAN:
713
- switch (byte) {
714
- case 0x1B:
715
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
716
-
717
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
718
- continue;
719
-
720
- case 0x5C:
721
- iso->out_flag = false;
722
-
723
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x00A5);
724
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
725
-
726
- continue;
727
-
728
- case 0x7E:
729
- iso->out_flag = false;
730
-
731
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x203E);
732
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
733
-
734
- continue;
735
-
736
- case 0x0E:
737
- case 0x0F:
738
- break;
739
-
740
- default:
741
- /* 0x00 to 0x7F */
742
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
743
- iso->out_flag = false;
744
-
745
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, byte);
746
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
747
-
748
- continue;
749
- }
750
-
751
- break;
752
- }
753
-
754
- iso->out_flag = false;
755
-
756
- LXB_ENCODING_DECODE_ERROR_BEGIN {
757
- ctx->have_error = true;
758
- }
759
- LXB_ENCODING_DECODE_ERROR_END();
760
-
761
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
762
- break;
763
-
764
- case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
765
- if (byte == 0x1B) {
766
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
767
-
768
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
769
- break;
770
- }
771
-
772
- /* 0x21 to 0x5F */
773
- if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
774
- iso->out_flag = false;
775
-
776
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx,
777
- 0xFF61 - 0x21 + byte);
778
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
779
- break;
780
- }
781
-
782
- iso->out_flag = false;
783
-
784
- LXB_ENCODING_DECODE_ERROR_BEGIN {
785
- ctx->have_error = true;
786
- }
787
- LXB_ENCODING_DECODE_ERROR_END();
788
-
789
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
790
- break;
791
-
792
- case LXB_ENCODING_DECODE_2022_JP_LEAD:
793
- if (byte == 0x1B) {
794
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
795
-
796
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
797
- break;
798
- }
799
-
800
- /* 0x21 to 0x7E */
801
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
802
- iso->out_flag = false;
803
- iso->lead = byte;
804
- iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
805
-
806
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
807
- break;
808
- }
809
-
810
- iso->out_flag = false;
811
-
812
- LXB_ENCODING_DECODE_ERROR_BEGIN {
813
- ctx->have_error = true;
814
- }
815
- LXB_ENCODING_DECODE_ERROR_END();
816
-
817
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
818
- break;
819
-
820
- case LXB_ENCODING_DECODE_2022_JP_TRAIL:
821
- if (byte == 0x1B) {
822
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
823
-
824
- LXB_ENCODING_DECODE_ERROR_BEGIN {
825
- ctx->have_error = true;
826
- }
827
- LXB_ENCODING_DECODE_ERROR_END();
828
-
829
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
830
- break;
831
- }
832
-
833
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
834
-
835
- /* 0x21 to 0x7E */
836
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
837
- /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
838
- ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
839
-
840
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
841
-
842
- if (ctx->codepoint != LXB_ENCODING_ERROR_CODEPOINT) {
843
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
844
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
845
-
846
- break;
847
- }
848
- }
849
-
850
- LXB_ENCODING_DECODE_ERROR_BEGIN {
851
- iso->prepand = 0x01;
852
- ctx->have_error = true;
853
- }
854
- LXB_ENCODING_DECODE_ERROR_END();
855
-
856
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
857
- break;
858
-
859
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
860
- if (byte == 0x24 || byte == 0x28) {
861
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
862
- iso->lead = byte;
863
-
864
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
865
- break;
866
- }
867
-
868
- (*data)--;
869
-
870
- iso->out_flag = false;
871
- iso->state = ctx->u.iso_2022_jp.out_state;
872
-
873
- LXB_ENCODING_DECODE_ERROR_BEGIN {
874
- iso->prepand = 0x01;
875
- ctx->have_error = true;
876
- }
877
- LXB_ENCODING_DECODE_ERROR_END();
878
-
879
- break;
880
-
881
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
882
- iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
883
-
884
- if (iso->lead == 0x28) {
885
- if (byte == 0x42) {
886
- iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
887
- }
888
- else if (byte == 0x4A) {
889
- iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
890
- }
891
- else if (byte == 0x49) {
892
- iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
893
- }
894
- }
895
- else if (iso->lead == 0x24) {
896
- if (byte == 0x40 || byte == 0x42) {
897
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
898
- }
899
- }
900
-
901
- if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
902
- (*data)--;
903
-
904
- iso->out_flag = false;
905
- iso->state = iso->out_state;
906
-
907
- LXB_ENCODING_DECODE_ERROR_BEGIN {
908
- iso->prepand = iso->lead;
909
- iso->lead = 0x00;
910
-
911
- ctx->have_error = true;
912
- }
913
- LXB_ENCODING_DECODE_ERROR_END();
914
-
915
- byte = iso->lead;
916
- iso->lead = 0x00;
917
-
918
- goto prepand;
919
- }
920
-
921
- iso->lead = 0x00;
922
- iso->out_state = iso->state;
923
-
924
- if (iso->out_flag) {
925
- LXB_ENCODING_DECODE_ERROR_BEGIN {
926
- ctx->have_error = true;
927
- }
928
- LXB_ENCODING_DECODE_ERROR_END();
929
-
930
- LXB_ENCODING_DECODE_ISO_2022_JP_OK();
931
- break;
932
- }
933
-
934
- iso->out_flag = true;
935
-
936
- LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE();
937
- break;
938
- }
939
- }
940
- while (true);
941
-
942
- return LXB_STATUS_OK;
943
-
944
- #undef LXB_ENCODING_DECODE_ISO_2022_JP_OK
945
- #undef LXB_ENCODING_DECODE_ISO_2022_JP_CONTINUE
946
- }
947
-
948
- lxb_status_t
949
- lxb_encoding_decode_iso_8859_10(lxb_encoding_decode_t *ctx,
950
- const lxb_char_t **data, const lxb_char_t *end)
951
- {
952
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_10);
953
-
954
- return LXB_STATUS_OK;
955
- }
956
-
957
- lxb_status_t
958
- lxb_encoding_decode_iso_8859_13(lxb_encoding_decode_t *ctx,
959
- const lxb_char_t **data, const lxb_char_t *end)
960
- {
961
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_13);
962
-
963
- return LXB_STATUS_OK;
964
- }
965
-
966
- lxb_status_t
967
- lxb_encoding_decode_iso_8859_14(lxb_encoding_decode_t *ctx,
968
- const lxb_char_t **data, const lxb_char_t *end)
969
- {
970
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_14);
971
-
972
- return LXB_STATUS_OK;
973
- }
974
-
975
- lxb_status_t
976
- lxb_encoding_decode_iso_8859_15(lxb_encoding_decode_t *ctx,
977
- const lxb_char_t **data, const lxb_char_t *end)
978
- {
979
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_15);
980
-
981
- return LXB_STATUS_OK;
982
- }
983
-
984
- lxb_status_t
985
- lxb_encoding_decode_iso_8859_16(lxb_encoding_decode_t *ctx,
986
- const lxb_char_t **data, const lxb_char_t *end)
987
- {
988
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_16);
989
-
990
- return LXB_STATUS_OK;
991
- }
992
-
993
- lxb_status_t
994
- lxb_encoding_decode_iso_8859_2(lxb_encoding_decode_t *ctx,
995
- const lxb_char_t **data, const lxb_char_t *end)
996
- {
997
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_2);
998
-
999
- return LXB_STATUS_OK;
1000
- }
1001
-
1002
- lxb_status_t
1003
- lxb_encoding_decode_iso_8859_3(lxb_encoding_decode_t *ctx,
1004
- const lxb_char_t **data, const lxb_char_t *end)
1005
- {
1006
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_3);
1007
-
1008
- return LXB_STATUS_OK;
1009
- }
1010
-
1011
- lxb_status_t
1012
- lxb_encoding_decode_iso_8859_4(lxb_encoding_decode_t *ctx,
1013
- const lxb_char_t **data, const lxb_char_t *end)
1014
- {
1015
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_4);
1016
-
1017
- return LXB_STATUS_OK;
1018
- }
1019
-
1020
- lxb_status_t
1021
- lxb_encoding_decode_iso_8859_5(lxb_encoding_decode_t *ctx,
1022
- const lxb_char_t **data, const lxb_char_t *end)
1023
- {
1024
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_5);
1025
-
1026
- return LXB_STATUS_OK;
1027
- }
1028
-
1029
- lxb_status_t
1030
- lxb_encoding_decode_iso_8859_6(lxb_encoding_decode_t *ctx,
1031
- const lxb_char_t **data, const lxb_char_t *end)
1032
- {
1033
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_6);
1034
-
1035
- return LXB_STATUS_OK;
1036
- }
1037
-
1038
- lxb_status_t
1039
- lxb_encoding_decode_iso_8859_7(lxb_encoding_decode_t *ctx,
1040
- const lxb_char_t **data, const lxb_char_t *end)
1041
- {
1042
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_7);
1043
-
1044
- return LXB_STATUS_OK;
1045
- }
1046
-
1047
- lxb_status_t
1048
- lxb_encoding_decode_iso_8859_8(lxb_encoding_decode_t *ctx,
1049
- const lxb_char_t **data, const lxb_char_t *end)
1050
- {
1051
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1052
-
1053
- return LXB_STATUS_OK;
1054
- }
1055
-
1056
- lxb_status_t
1057
- lxb_encoding_decode_iso_8859_8_i(lxb_encoding_decode_t *ctx,
1058
- const lxb_char_t **data, const lxb_char_t *end)
1059
- {
1060
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_iso_8859_8);
1061
-
1062
- return LXB_STATUS_OK;
1063
- }
1064
-
1065
- lxb_status_t
1066
- lxb_encoding_decode_koi8_r(lxb_encoding_decode_t *ctx,
1067
- const lxb_char_t **data, const lxb_char_t *end)
1068
- {
1069
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_r);
1070
-
1071
- return LXB_STATUS_OK;
1072
- }
1073
-
1074
- lxb_status_t
1075
- lxb_encoding_decode_koi8_u(lxb_encoding_decode_t *ctx,
1076
- const lxb_char_t **data, const lxb_char_t *end)
1077
- {
1078
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_koi8_u);
1079
-
1080
- return LXB_STATUS_OK;
1081
- }
1082
-
1083
- lxb_status_t
1084
- lxb_encoding_decode_shift_jis(lxb_encoding_decode_t *ctx,
1085
- const lxb_char_t **data, const lxb_char_t *end)
1086
- {
1087
- lxb_char_t byte, lead;
1088
-
1089
- ctx->status = LXB_STATUS_OK;
1090
-
1091
- if (ctx->u.lead != 0x00) {
1092
- if (ctx->have_error) {
1093
- ctx->have_error = false;
1094
- ctx->u.lead = 0x00;
1095
-
1096
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1097
- ctx->have_error = true;
1098
- ctx->u.lead = 0x01;
1099
- } LXB_ENCODING_DECODE_ERROR_END();
1100
- }
1101
- else {
1102
- if (*data >= end) {
1103
- ctx->status = LXB_STATUS_CONTINUE;
1104
-
1105
- return LXB_STATUS_CONTINUE;
1106
- }
1107
-
1108
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1109
-
1110
- lead = (lxb_char_t) ctx->u.lead;
1111
- ctx->u.lead = 0x00;
1112
-
1113
- goto lead_state;
1114
- }
1115
- }
1116
-
1117
- while (*data < end) {
1118
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1119
-
1120
- lead = *(*data)++;
1121
-
1122
- if (lead <= 0x80) {
1123
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, lead);
1124
- continue;
1125
- }
1126
-
1127
- if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
1128
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xFF61 - 0xA1 + lead);
1129
- continue;
1130
- }
1131
-
1132
- if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
1133
- && lead != 0xE0 && lead != 0xFC)
1134
- {
1135
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1136
- (*data)--;
1137
- }
1138
- LXB_ENCODING_DECODE_ERROR_END();
1139
-
1140
- continue;
1141
- }
1142
-
1143
- if (*data >= end) {
1144
- ctx->u.lead = lead;
1145
- ctx->status = LXB_STATUS_CONTINUE;
1146
-
1147
- return LXB_STATUS_CONTINUE;
1148
- }
1149
-
1150
- lead_state:
1151
-
1152
- byte = *(*data)++;
1153
-
1154
- if (byte < 0x7F) {
1155
- ctx->codepoint = 0x40;
1156
- }
1157
- else {
1158
- ctx->codepoint = 0x41;
1159
- }
1160
-
1161
- if (lead < 0xA0) {
1162
- ctx->second_codepoint = 0x81;
1163
- }
1164
- else {
1165
- ctx->second_codepoint = 0xC1;
1166
- }
1167
-
1168
- if ((unsigned) (byte - 0x40) > (0x7E - 0x40)
1169
- && (unsigned) (byte - 0x80) > (0xFC - 0x80))
1170
- {
1171
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1172
- continue;
1173
- }
1174
-
1175
- /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
1176
- ctx->codepoint = (lead - ctx->second_codepoint) * 188
1177
- + byte - ctx->codepoint;
1178
-
1179
- if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
1180
- / sizeof(lxb_encoding_multi_index_t)))
1181
- {
1182
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1183
- continue;
1184
- }
1185
-
1186
- if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
1187
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0xE000 - 8836 + ctx->codepoint);
1188
- continue;
1189
- }
1190
-
1191
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
1192
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1193
- LXB_ENCODING_DECODE_FAILED(ctx->u.lead);
1194
- continue;
1195
- }
1196
-
1197
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1198
- }
1199
-
1200
- return LXB_STATUS_OK;
1201
- }
1202
-
1203
- lxb_inline lxb_status_t
1204
- lxb_encoding_decode_utf_16(lxb_encoding_decode_t *ctx, bool is_be,
1205
- const lxb_char_t **data, const lxb_char_t *end)
1206
- {
1207
- unsigned lead;
1208
- lxb_codepoint_t unit;
1209
-
1210
- ctx->status = LXB_STATUS_OK;
1211
-
1212
- if (ctx->have_error) {
1213
- ctx->have_error = false;
1214
-
1215
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1216
- ctx->have_error = true;
1217
- }
1218
- LXB_ENCODING_DECODE_ERROR_END();
1219
- }
1220
-
1221
- if (ctx->u.lead != 0x00) {
1222
- if (*data >= end) {
1223
- ctx->status = LXB_STATUS_CONTINUE;
1224
-
1225
- return LXB_STATUS_CONTINUE;
1226
- }
1227
-
1228
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1229
-
1230
- lead = ctx->u.lead - 0x01;
1231
- ctx->u.lead = 0x00;
1232
-
1233
- goto lead_state;
1234
- }
1235
-
1236
- while (*data < end) {
1237
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1238
-
1239
- pair_state:
1240
-
1241
- lead = *(*data)++;
1242
-
1243
- if (*data >= end) {
1244
- ctx->u.lead = lead + 0x01;
1245
- ctx->status = LXB_STATUS_CONTINUE;
1246
-
1247
- return LXB_STATUS_CONTINUE;
1248
- }
1249
-
1250
- lead_state:
1251
-
1252
- /* For UTF-16BE or UTF-16LE */
1253
- if (is_be) {
1254
- unit = (lead << 8) + *(*data)++;
1255
- }
1256
- else {
1257
- unit = (*(*data)++ << 8) + lead;
1258
- }
1259
-
1260
- if (ctx->second_codepoint != 0x00) {
1261
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1262
- ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
1263
- + (unit - 0xDC00);
1264
-
1265
- ctx->second_codepoint = 0x00;
1266
-
1267
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1268
- continue;
1269
- }
1270
-
1271
- (*data)--;
1272
-
1273
- ctx->second_codepoint = 0x00;
1274
-
1275
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1276
- ctx->have_error = true;
1277
-
1278
- ctx->u.lead = lead + 0x01;
1279
- }
1280
- LXB_ENCODING_DECODE_ERROR_END();
1281
-
1282
- goto lead_state;
1283
- }
1284
-
1285
- /* Surrogate pair */
1286
- if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
1287
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
1288
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1289
- ctx->have_error = true;
1290
- }
1291
- LXB_ENCODING_DECODE_ERROR_END();
1292
-
1293
- continue;
1294
- }
1295
-
1296
- ctx->second_codepoint = unit;
1297
-
1298
- if (*data >= end) {
1299
- ctx->status = LXB_STATUS_CONTINUE;
1300
-
1301
- return LXB_STATUS_CONTINUE;
1302
- }
1303
-
1304
- goto pair_state;
1305
- }
1306
-
1307
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, unit);
1308
- }
1309
-
1310
- return LXB_STATUS_OK;
1311
- }
1312
-
1313
- lxb_status_t
1314
- lxb_encoding_decode_utf_16be(lxb_encoding_decode_t *ctx,
1315
- const lxb_char_t **data, const lxb_char_t *end)
1316
- {
1317
- return lxb_encoding_decode_utf_16(ctx, true, data, end);
1318
- }
1319
-
1320
- lxb_status_t
1321
- lxb_encoding_decode_utf_16le(lxb_encoding_decode_t *ctx,
1322
- const lxb_char_t **data, const lxb_char_t *end)
1323
- {
1324
- return lxb_encoding_decode_utf_16(ctx, false, data, end);
1325
- }
1326
-
1327
- lxb_status_t
1328
- lxb_encoding_decode_utf_8(lxb_encoding_decode_t *ctx,
1329
- const lxb_char_t **data, const lxb_char_t *end)
1330
- {
1331
- unsigned need;
1332
- lxb_char_t ch;
1333
- const lxb_char_t *p = *data;
1334
-
1335
- ctx->status = LXB_STATUS_OK;
1336
-
1337
- if (ctx->have_error) {
1338
- ctx->have_error = false;
1339
-
1340
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1341
- ctx->have_error = true;
1342
- }
1343
- LXB_ENCODING_DECODE_ERROR_END();
1344
- }
1345
-
1346
- if (ctx->u.utf_8.need != 0) {
1347
- if (p >= end) {
1348
- ctx->status = LXB_STATUS_CONTINUE;
1349
-
1350
- return LXB_STATUS_CONTINUE;
1351
- }
1352
-
1353
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1354
-
1355
- need = ctx->u.utf_8.need;
1356
- ctx->u.utf_8.need = 0;
1357
-
1358
- if (ctx->u.utf_8.lower != 0x00) {
1359
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(ctx->u.utf_8.lower,
1360
- ctx->u.utf_8.upper, goto begin);
1361
- ctx->u.utf_8.lower = 0x00;
1362
- }
1363
-
1364
- goto decode;
1365
- }
1366
-
1367
- begin:
1368
-
1369
- while (p < end) {
1370
- if (ctx->buffer_used >= ctx->buffer_length) {
1371
- *data = p;
1372
-
1373
- return LXB_STATUS_SMALL_BUFFER;
1374
- }
1375
-
1376
- ch = *p++;
1377
-
1378
- if (ch < 0x80) {
1379
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ch);
1380
- continue;
1381
- }
1382
- else if (ch <= 0xDF) {
1383
- if (ch < 0xC2) {
1384
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1385
- *data = p - 1;
1386
- }
1387
- LXB_ENCODING_DECODE_ERROR_END();
1388
-
1389
- continue;
1390
- }
1391
-
1392
- need = 1;
1393
- ctx->codepoint = ch & 0x1F;
1394
- }
1395
- else if (ch < 0xF0) {
1396
- need = 2;
1397
- ctx->codepoint = ch & 0x0F;
1398
-
1399
- if (p == end) {
1400
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xE0, 0xED, 0xA0, 0x9F);
1401
-
1402
- *data = p;
1403
-
1404
- ctx->u.utf_8.need = need;
1405
- ctx->status = LXB_STATUS_CONTINUE;
1406
-
1407
- return LXB_STATUS_CONTINUE;
1408
- }
1409
-
1410
- if (ch == 0xE0) {
1411
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0xA0, 0xBF, continue);
1412
- }
1413
- else if (ch == 0xED) {
1414
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x9F, continue);
1415
- }
1416
- }
1417
- else if (ch < 0xF5) {
1418
- need = 3;
1419
- ctx->codepoint = ch & 0x07;
1420
-
1421
- if (p == end) {
1422
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET(0xF0, 0xF4, 0x90, 0x8F);
1423
-
1424
- *data = p;
1425
-
1426
- ctx->u.utf_8.need = need;
1427
- ctx->status = LXB_STATUS_CONTINUE;
1428
-
1429
- return LXB_STATUS_CONTINUE;
1430
- }
1431
-
1432
- if (ch == 0xF0) {
1433
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x90, 0xBF, continue);
1434
- }
1435
- else if (ch == 0xF4) {
1436
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY(0x80, 0x8F, continue);
1437
- }
1438
- }
1439
- else {
1440
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1441
- *data = p - 1;
1442
- }
1443
- LXB_ENCODING_DECODE_ERROR_END();
1444
-
1445
- continue;
1446
- }
1447
-
1448
- decode:
1449
-
1450
- do {
1451
- if (p >= end) {
1452
- *data = p;
1453
-
1454
- ctx->u.utf_8.need = need;
1455
- ctx->status = LXB_STATUS_CONTINUE;
1456
-
1457
- return LXB_STATUS_CONTINUE;
1458
- }
1459
-
1460
- ch = *p++;
1461
-
1462
- if (ch < 0x80 || ch > 0xBF) {
1463
- p--;
1464
-
1465
- ctx->u.utf_8.need = 0;
1466
-
1467
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1468
- *data = p;
1469
- ctx->have_error = true;
1470
- }
1471
- LXB_ENCODING_DECODE_ERROR_END();
1472
-
1473
- break;
1474
- }
1475
-
1476
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
1477
-
1478
- if (--need == 0) {
1479
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1480
-
1481
- break;
1482
- }
1483
- }
1484
- while (true);
1485
- }
1486
-
1487
- *data = p;
1488
-
1489
- return LXB_STATUS_OK;
1490
- }
1491
-
1492
- lxb_inline lxb_codepoint_t
1493
- lxb_encoding_decode_gb18030_range(uint32_t index)
1494
- {
1495
- size_t mid, left, right;
1496
- const lxb_encoding_range_index_t *range;
1497
-
1498
- /*
1499
- * Pointer greater than 39419 and less than 189000,
1500
- * or pointer is greater than 1237575
1501
- */
1502
- if ((unsigned) (index - 39419) < (189000 - 39419)
1503
- || index > 1237575)
1504
- {
1505
- return LXB_ENCODING_ERROR_CODEPOINT;
1506
- }
1507
-
1508
- if (index == 7457) {
1509
- return 0xE7C7;
1510
- }
1511
-
1512
- left = 0;
1513
- right = LXB_ENCODING_RANGE_INDEX_GB18030_SIZE;
1514
- range = lxb_encoding_range_index_gb18030;
1515
-
1516
- /* Some compilers say about uninitialized mid */
1517
- mid = 0;
1518
-
1519
- while (left < right) {
1520
- mid = left + (right - left) / 2;
1521
-
1522
- if (range[mid].index < index) {
1523
- left = mid + 1;
1524
-
1525
- if (left < right && range[ left ].index > index) {
1526
- break;
1527
- }
1528
- }
1529
- else if (range[mid].index > index) {
1530
- right = mid - 1;
1531
-
1532
- if (right > 0 && range[right].index <= index) {
1533
- mid = right;
1534
- break;
1535
- }
1536
- }
1537
- else {
1538
- break;
1539
- }
1540
- }
1541
-
1542
- return range[mid].codepoint + index - range[mid].index;
1543
- }
1544
-
1545
- lxb_status_t
1546
- lxb_encoding_decode_gb18030(lxb_encoding_decode_t *ctx,
1547
- const lxb_char_t **data, const lxb_char_t *end)
1548
- {
1549
- uint32_t pointer;
1550
- lxb_char_t first, second, third, offset;
1551
-
1552
- /* Make compiler happy */
1553
- second = 0x00;
1554
-
1555
- ctx->status = LXB_STATUS_OK;
1556
-
1557
- if (ctx->have_error) {
1558
- ctx->have_error = false;
1559
-
1560
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1561
- ctx->have_error = true;
1562
- }
1563
- LXB_ENCODING_DECODE_ERROR_END();
1564
- }
1565
-
1566
- if (ctx->u.gb18030.first != 0) {
1567
- if (*data >= end) {
1568
- ctx->status = LXB_STATUS_CONTINUE;
1569
-
1570
- return LXB_STATUS_CONTINUE;
1571
- }
1572
-
1573
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1574
-
1575
- if (ctx->u.gb18030.third != 0x00) {
1576
- first = ctx->u.gb18030.first;
1577
- second = ctx->u.gb18030.second;
1578
- third = ctx->u.gb18030.third;
1579
-
1580
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1581
-
1582
- if (ctx->prepend) {
1583
- /* The first is always < 0x80 */
1584
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1585
-
1586
- if (ctx->buffer_used == ctx->buffer_length) {
1587
- ctx->u.gb18030.first = third;
1588
-
1589
- return LXB_STATUS_SMALL_BUFFER;
1590
- }
1591
-
1592
- first = third;
1593
- ctx->prepend = false;
1594
-
1595
- goto prepend_first;
1596
- }
1597
-
1598
- goto third_state;
1599
- }
1600
- else if (ctx->u.gb18030.second != 0x00) {
1601
- first = ctx->u.gb18030.first;
1602
- second = ctx->u.gb18030.second;
1603
-
1604
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
1605
-
1606
- goto second_state;
1607
- }
1608
-
1609
- first = ctx->u.gb18030.first;
1610
- ctx->u.gb18030.first = 0x00;
1611
-
1612
- if (ctx->prepend) {
1613
- ctx->prepend = false;
1614
- goto prepend_first;
1615
- }
1616
-
1617
- goto first_state;
1618
- }
1619
-
1620
- while (*data < end) {
1621
- LXB_ENCODING_DECODE_CHECK_OUT(ctx);
1622
-
1623
- first = *(*data)++;
1624
-
1625
- prepend_first:
1626
-
1627
- if (first < 0x80) {
1628
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, first);
1629
- continue;
1630
- }
1631
-
1632
- if (first == 0x80) {
1633
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, 0x20AC);
1634
- continue;
1635
- }
1636
-
1637
- /* Range 0x81 to 0xFE, inclusive */
1638
- if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
1639
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1640
- (*data)--;
1641
- }
1642
- LXB_ENCODING_DECODE_ERROR_END();
1643
-
1644
- continue;
1645
- }
1646
-
1647
- if (*data == end) {
1648
- ctx->u.gb18030.first = first;
1649
- ctx->status = LXB_STATUS_CONTINUE;
1650
-
1651
- return LXB_STATUS_CONTINUE;
1652
- }
1653
-
1654
- /* First */
1655
- first_state:
1656
-
1657
- second = *(*data)++;
1658
-
1659
- /* Range 0x30 to 0x39, inclusive */
1660
- if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
1661
- offset = (second < 0x7F) ? 0x40 : 0x41;
1662
-
1663
- /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
1664
- if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
1665
- || (unsigned) (second - 0x80) <= (0xFE - 0x80))
1666
- {
1667
- pointer = (first - 0x81) * 190 + (second - offset);
1668
- }
1669
- else {
1670
- if (second < 0x80) {
1671
- (*data)--;
1672
- }
1673
-
1674
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1675
- ctx->have_error = true;
1676
- }
1677
- LXB_ENCODING_DECODE_ERROR_END();
1678
-
1679
- continue;
1680
- }
1681
-
1682
- /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
1683
- ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
1684
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1685
- if (second < 0x80) {
1686
- (*data)--;
1687
- }
1688
-
1689
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1690
- ctx->have_error = true;
1691
- }
1692
- LXB_ENCODING_DECODE_ERROR_END();
1693
-
1694
- continue;
1695
- }
1696
-
1697
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1698
- continue;
1699
- }
1700
-
1701
- if (*data == end) {
1702
- ctx->u.gb18030.first = first;
1703
- ctx->u.gb18030.second = second;
1704
-
1705
- ctx->status = LXB_STATUS_CONTINUE;
1706
-
1707
- return LXB_STATUS_CONTINUE;
1708
- }
1709
-
1710
- /* Second */
1711
- second_state:
1712
-
1713
- third = *(*data)++;
1714
-
1715
- /* Range 0x81 to 0xFE, inclusive */
1716
- if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
1717
- (*data)--;
1718
-
1719
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1720
- ctx->prepend = true;
1721
- ctx->have_error = true;
1722
- ctx->u.gb18030.first = second;
1723
- }
1724
- LXB_ENCODING_DECODE_ERROR_END();
1725
-
1726
- first = second;
1727
-
1728
- goto prepend_first;
1729
- }
1730
-
1731
- if (*data == end) {
1732
- ctx->u.gb18030.first = first;
1733
- ctx->u.gb18030.second = second;
1734
- ctx->u.gb18030.third = third;
1735
-
1736
- ctx->status = LXB_STATUS_CONTINUE;
1737
-
1738
- return LXB_STATUS_CONTINUE;
1739
- }
1740
-
1741
- /* Third */
1742
- third_state:
1743
-
1744
- /* Range 0x30 to 0x39, inclusive */
1745
- if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
1746
- ctx->prepend = true;
1747
-
1748
- LXB_ENCODING_DECODE_ERROR_BEGIN {
1749
- ctx->prepend = true;
1750
- ctx->have_error = true;
1751
-
1752
- /* First is a fake for trigger */
1753
- ctx->u.gb18030.first = 0x01;
1754
- ctx->u.gb18030.second = second;
1755
- ctx->u.gb18030.third = third;
1756
- }
1757
- LXB_ENCODING_DECODE_ERROR_END();
1758
-
1759
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, second);
1760
-
1761
- if (ctx->buffer_used == ctx->buffer_length) {
1762
- ctx->prepend = true;
1763
- ctx->have_error = true;
1764
-
1765
- /* First is a fake for trigger */
1766
- ctx->u.gb18030.first = 0x01;
1767
- ctx->u.gb18030.second = second;
1768
- ctx->u.gb18030.third = third;
1769
-
1770
- return LXB_STATUS_SMALL_BUFFER;
1771
- }
1772
-
1773
- first = third;
1774
-
1775
- goto prepend_first;
1776
- }
1777
-
1778
- pointer = ((first - 0x81) * (10 * 126 * 10))
1779
- + ((second - 0x30) * (10 * 126))
1780
- + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
1781
-
1782
- ctx->codepoint = lxb_encoding_decode_gb18030_range(pointer);
1783
-
1784
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
1785
- LXB_ENCODING_DECODE_ERROR_BEGIN {}
1786
- LXB_ENCODING_DECODE_ERROR_END();
1787
-
1788
- continue;
1789
- }
1790
-
1791
- LXB_ENCODING_DECODE_APPEND_WO_CHECK(ctx, ctx->codepoint);
1792
- }
1793
-
1794
- return LXB_STATUS_OK;
1795
- }
1796
-
1797
- lxb_status_t
1798
- lxb_encoding_decode_macintosh(lxb_encoding_decode_t *ctx,
1799
- const lxb_char_t **data, const lxb_char_t *end)
1800
- {
1801
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_macintosh);
1802
-
1803
- return LXB_STATUS_OK;
1804
- }
1805
-
1806
- lxb_status_t
1807
- lxb_encoding_decode_replacement(lxb_encoding_decode_t *ctx,
1808
- const lxb_char_t **data, const lxb_char_t *end)
1809
- {
1810
- *data = end;
1811
- return LXB_STATUS_ERROR;
1812
- }
1813
-
1814
- lxb_status_t
1815
- lxb_encoding_decode_windows_1250(lxb_encoding_decode_t *ctx,
1816
- const lxb_char_t **data, const lxb_char_t *end)
1817
- {
1818
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1250);
1819
-
1820
- return LXB_STATUS_OK;
1821
- }
1822
-
1823
- lxb_status_t
1824
- lxb_encoding_decode_windows_1251(lxb_encoding_decode_t *ctx,
1825
- const lxb_char_t **data, const lxb_char_t *end)
1826
- {
1827
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1251);
1828
-
1829
- return LXB_STATUS_OK;
1830
- }
1831
-
1832
- lxb_status_t
1833
- lxb_encoding_decode_windows_1252(lxb_encoding_decode_t *ctx,
1834
- const lxb_char_t **data, const lxb_char_t *end)
1835
- {
1836
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1252);
1837
-
1838
- return LXB_STATUS_OK;
1839
- }
1840
-
1841
- lxb_status_t
1842
- lxb_encoding_decode_windows_1253(lxb_encoding_decode_t *ctx,
1843
- const lxb_char_t **data, const lxb_char_t *end)
1844
- {
1845
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1253);
1846
-
1847
- return LXB_STATUS_OK;
1848
- }
1849
-
1850
- lxb_status_t
1851
- lxb_encoding_decode_windows_1254(lxb_encoding_decode_t *ctx,
1852
- const lxb_char_t **data, const lxb_char_t *end)
1853
- {
1854
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1254);
1855
-
1856
- return LXB_STATUS_OK;
1857
- }
1858
-
1859
- lxb_status_t
1860
- lxb_encoding_decode_windows_1255(lxb_encoding_decode_t *ctx,
1861
- const lxb_char_t **data, const lxb_char_t *end)
1862
- {
1863
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1255);
1864
-
1865
- return LXB_STATUS_OK;
1866
- }
1867
-
1868
- lxb_status_t
1869
- lxb_encoding_decode_windows_1256(lxb_encoding_decode_t *ctx,
1870
- const lxb_char_t **data, const lxb_char_t *end)
1871
- {
1872
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1256);
1873
-
1874
- return LXB_STATUS_OK;
1875
- }
1876
-
1877
- lxb_status_t
1878
- lxb_encoding_decode_windows_1257(lxb_encoding_decode_t *ctx,
1879
- const lxb_char_t **data, const lxb_char_t *end)
1880
- {
1881
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1257);
1882
-
1883
- return LXB_STATUS_OK;
1884
- }
1885
-
1886
- lxb_status_t
1887
- lxb_encoding_decode_windows_1258(lxb_encoding_decode_t *ctx,
1888
- const lxb_char_t **data, const lxb_char_t *end)
1889
- {
1890
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_1258);
1891
-
1892
- return LXB_STATUS_OK;
1893
- }
1894
-
1895
- lxb_status_t
1896
- lxb_encoding_decode_windows_874(lxb_encoding_decode_t *ctx,
1897
- const lxb_char_t **data, const lxb_char_t *end)
1898
- {
1899
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_windows_874);
1900
-
1901
- return LXB_STATUS_OK;
1902
- }
1903
-
1904
- lxb_status_t
1905
- lxb_encoding_decode_x_mac_cyrillic(lxb_encoding_decode_t *ctx,
1906
- const lxb_char_t **data, const lxb_char_t *end)
1907
- {
1908
- LXB_ENCODING_DECODE_SINGLE(lxb_encoding_single_index_x_mac_cyrillic);
1909
-
1910
- return LXB_STATUS_OK;
1911
- }
1912
-
1913
- lxb_status_t
1914
- lxb_encoding_decode_x_user_defined(lxb_encoding_decode_t *ctx,
1915
- const lxb_char_t **data, const lxb_char_t *end)
1916
- {
1917
- while (*data < end) {
1918
- if (**data < 0x80) {
1919
- LXB_ENCODING_DECODE_APPEND(ctx, *(*data)++);
1920
- }
1921
- else {
1922
- LXB_ENCODING_DECODE_APPEND(ctx, 0xF780 + (*(*data)++) - 0x80);
1923
- }
1924
- }
1925
-
1926
- return LXB_STATUS_OK;
1927
- }
1928
-
1929
- /*
1930
- * Single
1931
- */
1932
- lxb_codepoint_t
1933
- lxb_encoding_decode_default_single(lxb_encoding_decode_t *ctx,
1934
- const lxb_char_t **data, const lxb_char_t *end)
1935
- {
1936
- return lxb_encoding_decode_utf_8_single(ctx, data, end);
1937
- }
1938
-
1939
- lxb_codepoint_t
1940
- lxb_encoding_decode_auto_single(lxb_encoding_decode_t *ctx,
1941
- const lxb_char_t **data, const lxb_char_t *end)
1942
- {
1943
- return LXB_ENCODING_DECODE_ERROR;
1944
- }
1945
-
1946
- lxb_codepoint_t
1947
- lxb_encoding_decode_undefined_single(lxb_encoding_decode_t *ctx,
1948
- const lxb_char_t **data, const lxb_char_t *end)
1949
- {
1950
- return LXB_ENCODING_DECODE_ERROR;
1951
- }
1952
-
1953
- lxb_codepoint_t
1954
- lxb_encoding_decode_big5_single(lxb_encoding_decode_t *ctx,
1955
- const lxb_char_t **data, const lxb_char_t *end)
1956
- {
1957
- uint32_t index;
1958
- lxb_char_t lead, byte;
1959
-
1960
- if (ctx->u.lead != 0x00) {
1961
- if (ctx->second_codepoint != 0x00) {
1962
- (*data)++;
1963
-
1964
- ctx->u.lead = 0x00;
1965
-
1966
- ctx->codepoint = ctx->second_codepoint;
1967
- ctx->second_codepoint = 0x00;
1968
-
1969
- return ctx->codepoint;
1970
- }
1971
-
1972
- lead = (lxb_char_t) ctx->u.lead;
1973
- ctx->u.lead = 0x00;
1974
-
1975
- goto lead_state;
1976
- }
1977
-
1978
- lead = *(*data)++;
1979
-
1980
- if (lead < 0x80) {
1981
- return lead;
1982
- }
1983
-
1984
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
1985
- return LXB_ENCODING_DECODE_ERROR;
1986
- }
1987
-
1988
- if (*data >= end) {
1989
- ctx->u.lead = lead;
1990
-
1991
- return LXB_ENCODING_DECODE_CONTINUE;
1992
- }
1993
-
1994
- lead_state:
1995
-
1996
- index = 0;
1997
- byte = **data;
1998
-
1999
- if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2000
- || (unsigned) (byte - 0xA1) <= (0xFE - 0xA1))
2001
- {
2002
- if (byte < 0x7F) {
2003
- /* Max index == (0xFE - 0x81) * 157 + (0x7E - 0x62) == 19653 */
2004
- index = (lead - 0x81) * 157 + (byte - 0x40);
2005
- }
2006
- else {
2007
- /* Max index == (0xFE - 0x81) * 157 + (0xFE - 0x62) == 19781 */
2008
- index = (lead - 0x81) * 157 + (byte - 0x62);
2009
- }
2010
- }
2011
-
2012
- /*
2013
- * 1133 U+00CA U+0304 Ê̄ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON)
2014
- * 1135 U+00CA U+030C Ê̌ (LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON)
2015
- * 1164 U+00EA U+0304 ê̄ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON)
2016
- * 1166 U+00EA U+030C ê̌ (LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON)
2017
- */
2018
- switch (index) {
2019
- case 1133:
2020
- ctx->u.lead = lead;
2021
- ctx->second_codepoint = 0x0304;
2022
- return 0x00CA;
2023
-
2024
- case 1135:
2025
- ctx->u.lead = lead;
2026
- ctx->second_codepoint = 0x030C;
2027
- return 0x00CA;
2028
-
2029
- case 1164:
2030
- ctx->u.lead = lead;
2031
- ctx->second_codepoint = 0x0304;
2032
- return 0x00EA;
2033
-
2034
- case 1166:
2035
- ctx->u.lead = lead;
2036
- ctx->second_codepoint = 0x030C;
2037
- return 0x00EA;
2038
-
2039
- case 0:
2040
- goto failed;
2041
- }
2042
-
2043
- ctx->codepoint = lxb_encoding_multi_index_big5[index].codepoint;
2044
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2045
- goto failed;
2046
- }
2047
-
2048
- (*data)++;
2049
-
2050
- return ctx->codepoint;
2051
-
2052
- failed:
2053
-
2054
- if (byte >= 0x80) {
2055
- (*data)++;
2056
- }
2057
-
2058
- return LXB_ENCODING_DECODE_ERROR;
2059
- }
2060
-
2061
- lxb_codepoint_t
2062
- lxb_encoding_decode_euc_jp_single(lxb_encoding_decode_t *ctx,
2063
- const lxb_char_t **data, const lxb_char_t *end)
2064
- {
2065
- bool is_jis0212;
2066
- lxb_char_t byte, lead;
2067
-
2068
- if (ctx->u.euc_jp.lead != 0x00) {
2069
- lead = ctx->u.euc_jp.lead;
2070
- byte = *(*data)++;
2071
-
2072
- ctx->u.euc_jp.lead = 0x00;
2073
-
2074
- if (ctx->u.euc_jp.is_jis0212) {
2075
- is_jis0212 = true;
2076
- ctx->u.euc_jp.is_jis0212 = false;
2077
-
2078
- goto lead_jis_state;
2079
- }
2080
-
2081
- goto lead_state;
2082
- }
2083
-
2084
- lead = *(*data)++;
2085
-
2086
- if (lead < 0x80) {
2087
- return lead;
2088
- }
2089
-
2090
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2091
- && (lead != 0x8E && lead != 0x8F))
2092
- {
2093
- return LXB_ENCODING_DECODE_ERROR;
2094
- }
2095
-
2096
- if (*data >= end) {
2097
- ctx->u.euc_jp.lead = lead;
2098
- return LXB_ENCODING_DECODE_CONTINUE;
2099
- }
2100
-
2101
- byte = *(*data)++;
2102
-
2103
- lead_state:
2104
-
2105
- if (lead == 0x8E && (unsigned) (byte - 0xA1) <= (0xDF - 0xA1)) {
2106
- return 0xFF61 - 0xA1 + byte;
2107
- }
2108
-
2109
- is_jis0212 = false;
2110
-
2111
- if (lead == 0x8F && (unsigned) (byte - 0xA1) <= (0xFE - 0xA1)) {
2112
- if (*data >= end) {
2113
- ctx->u.euc_jp.lead = byte;
2114
- ctx->u.euc_jp.is_jis0212 = true;
2115
-
2116
- return LXB_ENCODING_DECODE_CONTINUE;
2117
- }
2118
-
2119
- lead = byte;
2120
- byte = *(*data)++;
2121
- is_jis0212 = true;
2122
- }
2123
-
2124
- lead_jis_state:
2125
-
2126
- if ((unsigned) (lead - 0xA1) > (0xFE - 0xA1)
2127
- || (unsigned) (byte - 0xA1) > (0xFE - 0xA1))
2128
- {
2129
- goto failed;
2130
- }
2131
-
2132
- /* Max index == (0xFE - 0xA1) * 94 + 0xFE - 0xA1 == 8835 */
2133
- ctx->codepoint = (lead - 0xA1) * 94 + byte - 0xA1;
2134
-
2135
- if (is_jis0212) {
2136
- if ((sizeof(lxb_encoding_multi_index_jis0212)
2137
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2138
- {
2139
- goto failed;
2140
- }
2141
-
2142
- ctx->codepoint = lxb_encoding_multi_index_jis0212[ctx->codepoint].codepoint;
2143
- }
2144
- else {
2145
- if ((sizeof(lxb_encoding_multi_index_jis0208)
2146
- / sizeof(lxb_encoding_multi_index_t)) <= ctx->codepoint)
2147
- {
2148
- goto failed;
2149
- }
2150
-
2151
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2152
- }
2153
-
2154
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2155
- goto failed;
2156
- }
2157
-
2158
- return ctx->codepoint;
2159
-
2160
- failed:
2161
-
2162
- if (byte < 0x80) {
2163
- (*data)--;
2164
- }
2165
-
2166
- return LXB_ENCODING_DECODE_ERROR;
2167
- }
2168
-
2169
- lxb_codepoint_t
2170
- lxb_encoding_decode_euc_kr_single(lxb_encoding_decode_t *ctx,
2171
- const lxb_char_t **data, const lxb_char_t *end)
2172
- {
2173
- lxb_char_t lead, byte;
2174
-
2175
- if (ctx->u.lead != 0x00) {
2176
- lead = (lxb_char_t) ctx->u.lead;
2177
- ctx->u.lead = 0x00;
2178
-
2179
- goto lead_state;
2180
- }
2181
-
2182
- lead = *(*data)++;
2183
-
2184
- if (lead < 0x80) {
2185
- return lead;
2186
- }
2187
-
2188
- if ((unsigned) (lead - 0x81) > (0xFE - 0x81)) {
2189
- return LXB_ENCODING_DECODE_ERROR;
2190
- }
2191
-
2192
- if (*data == end) {
2193
- ctx->u.lead = lead;
2194
- return LXB_ENCODING_DECODE_CONTINUE;
2195
- }
2196
-
2197
- lead_state:
2198
-
2199
- byte = *(*data)++;
2200
-
2201
- if ((unsigned) (byte - 0x41) > (0xFE - 0x41)) {
2202
- goto failed;
2203
- }
2204
-
2205
- /* Max index == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2206
- ctx->codepoint = (lead - 0x81) * 190 + (byte - 0x41);
2207
-
2208
- if (ctx->codepoint >= sizeof(lxb_encoding_multi_index_euc_kr)
2209
- / sizeof(lxb_encoding_multi_index_t))
2210
- {
2211
- goto failed;
2212
- }
2213
-
2214
- ctx->codepoint = lxb_encoding_multi_index_euc_kr[ctx->codepoint].codepoint;
2215
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2216
- goto failed;
2217
- }
2218
-
2219
- return ctx->codepoint;
2220
-
2221
- failed:
2222
-
2223
- if (byte < 0x80) {
2224
- (*data)--;
2225
- }
2226
-
2227
- return LXB_ENCODING_DECODE_ERROR;
2228
- }
2229
-
2230
- lxb_codepoint_t
2231
- lxb_encoding_decode_gbk_single(lxb_encoding_decode_t *ctx,
2232
- const lxb_char_t **data, const lxb_char_t *end)
2233
- {
2234
- return lxb_encoding_decode_gb18030_single(ctx, data, end);
2235
- }
2236
-
2237
- lxb_codepoint_t
2238
- lxb_encoding_decode_ibm866_single(lxb_encoding_decode_t *ctx,
2239
- const lxb_char_t **data, const lxb_char_t *end)
2240
- {
2241
- if (**data < 0x80) {
2242
- return *(*data)++;
2243
- }
2244
-
2245
- return lxb_encoding_single_index_ibm866[*(*data)++ - 0x80].codepoint;
2246
- }
2247
-
2248
- lxb_codepoint_t
2249
- lxb_encoding_decode_iso_2022_jp_single(lxb_encoding_decode_t *ctx,
2250
- const lxb_char_t **data, const lxb_char_t *end)
2251
- {
2252
- lxb_char_t byte;
2253
- lxb_encoding_ctx_2022_jp_t *iso = &ctx->u.iso_2022_jp;
2254
-
2255
- if (iso->prepand != 0x00) {
2256
- byte = iso->prepand;
2257
- iso->prepand = 0x00;
2258
-
2259
- goto prepand;
2260
- }
2261
-
2262
- do {
2263
- byte = *(*data)++;
2264
-
2265
- prepand:
2266
-
2267
- switch (iso->state) {
2268
- case LXB_ENCODING_DECODE_2022_JP_ASCII:
2269
- if (byte == 0x1B) {
2270
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2271
-
2272
- break;
2273
- }
2274
-
2275
- /* 0x00 to 0x7F, excluding 0x0E, 0x0F, and 0x1B */
2276
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)
2277
- && byte != 0x0E && byte != 0x0F)
2278
- {
2279
- iso->out_flag = false;
2280
-
2281
- return byte;
2282
- }
2283
-
2284
- iso->out_flag = false;
2285
-
2286
- return LXB_ENCODING_DECODE_ERROR;
2287
-
2288
- case LXB_ENCODING_DECODE_2022_JP_ROMAN:
2289
- switch (byte) {
2290
- case 0x1B:
2291
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2292
-
2293
- continue;
2294
-
2295
- case 0x5C:
2296
- iso->out_flag = false;
2297
-
2298
- return 0x00A5;
2299
-
2300
- case 0x7E:
2301
- iso->out_flag = false;
2302
-
2303
- return 0x203E;
2304
-
2305
- case 0x0E:
2306
- case 0x0F:
2307
- break;
2308
-
2309
- default:
2310
- /* 0x00 to 0x7F */
2311
- if ((unsigned) (byte - 0x00) <= (0x7F - 0x00)) {
2312
- iso->out_flag = false;
2313
-
2314
- return byte;
2315
- }
2316
-
2317
- break;
2318
- }
2319
-
2320
- iso->out_flag = false;
2321
-
2322
- return LXB_ENCODING_DECODE_ERROR;
2323
-
2324
- case LXB_ENCODING_DECODE_2022_JP_KATAKANA:
2325
- if (byte == 0x1B) {
2326
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2327
-
2328
- break;
2329
- }
2330
-
2331
- /* 0x21 to 0x5F */
2332
- if ((unsigned) (byte - 0x21) <= (0x5F - 0x21)) {
2333
- iso->out_flag = false;
2334
-
2335
- return 0xFF61 - 0x21 + byte;
2336
- }
2337
-
2338
- iso->out_flag = false;
2339
-
2340
- return LXB_ENCODING_DECODE_ERROR;
2341
-
2342
- case LXB_ENCODING_DECODE_2022_JP_LEAD:
2343
- if (byte == 0x1B) {
2344
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2345
-
2346
- break;
2347
- }
2348
-
2349
- /* 0x21 to 0x7E */
2350
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2351
- iso->out_flag = false;
2352
- iso->lead = byte;
2353
- iso->state = LXB_ENCODING_DECODE_2022_JP_TRAIL;
2354
-
2355
- break;
2356
- }
2357
-
2358
- iso->out_flag = false;
2359
-
2360
- return LXB_ENCODING_DECODE_ERROR;
2361
-
2362
- case LXB_ENCODING_DECODE_2022_JP_TRAIL:
2363
- if (byte == 0x1B) {
2364
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE_START;
2365
-
2366
- return LXB_ENCODING_DECODE_ERROR;
2367
- }
2368
-
2369
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2370
-
2371
- /* 0x21 to 0x7E */
2372
- if ((unsigned) (byte - 0x21) <= (0x7E - 0x21)) {
2373
- /* Max index == (0x7E - 0x21) * 94 + 0x7E - 0x21 == 8835 */
2374
- ctx->codepoint = (iso->lead - 0x21) * 94 + byte - 0x21;
2375
-
2376
- return lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2377
- }
2378
-
2379
- return LXB_ENCODING_DECODE_ERROR;
2380
-
2381
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE_START:
2382
- if (byte == 0x24 || byte == 0x28) {
2383
- iso->state = LXB_ENCODING_DECODE_2022_JP_ESCAPE;
2384
- iso->lead = byte;
2385
-
2386
- break;
2387
- }
2388
-
2389
- (*data)--;
2390
-
2391
- iso->out_flag = false;
2392
- iso->state = ctx->u.iso_2022_jp.out_state;
2393
-
2394
- return LXB_ENCODING_DECODE_ERROR;
2395
-
2396
- case LXB_ENCODING_DECODE_2022_JP_ESCAPE:
2397
- iso->state = LXB_ENCODING_DECODE_2022_JP_UNSET;
2398
-
2399
- if (iso->lead == 0x28) {
2400
- if (byte == 0x42) {
2401
- iso->state = LXB_ENCODING_DECODE_2022_JP_ASCII;
2402
- }
2403
- else if (byte == 0x4A) {
2404
- iso->state = LXB_ENCODING_DECODE_2022_JP_ROMAN;
2405
- }
2406
- else if (byte == 0x49) {
2407
- iso->state = LXB_ENCODING_DECODE_2022_JP_KATAKANA;
2408
- }
2409
- }
2410
- else if (iso->lead == 0x24) {
2411
- if (byte == 0x40 || byte == 0x42) {
2412
- iso->state = LXB_ENCODING_DECODE_2022_JP_LEAD;
2413
- }
2414
- }
2415
-
2416
- if (iso->state == LXB_ENCODING_DECODE_2022_JP_UNSET) {
2417
- iso->prepand = iso->lead;
2418
- iso->lead = 0x00;
2419
-
2420
- (*data)--;
2421
-
2422
- iso->out_flag = false;
2423
- iso->state = iso->out_state;
2424
-
2425
- return LXB_ENCODING_DECODE_ERROR;
2426
- }
2427
-
2428
- iso->lead = 0x00;
2429
- iso->out_state = iso->state;
2430
-
2431
- if (iso->out_flag) {
2432
- return LXB_ENCODING_DECODE_ERROR;
2433
- }
2434
-
2435
- iso->out_flag = true;
2436
-
2437
- break;
2438
- }
2439
- }
2440
- while (*data < end);
2441
-
2442
- return LXB_ENCODING_DECODE_CONTINUE;
2443
- }
2444
-
2445
- lxb_codepoint_t
2446
- lxb_encoding_decode_iso_8859_10_single(lxb_encoding_decode_t *ctx,
2447
- const lxb_char_t **data, const lxb_char_t *end)
2448
- {
2449
- if (**data < 0x80) {
2450
- return *(*data)++;
2451
- }
2452
-
2453
- return lxb_encoding_single_index_iso_8859_10[*(*data)++ - 0x80].codepoint;
2454
- }
2455
-
2456
- lxb_codepoint_t
2457
- lxb_encoding_decode_iso_8859_13_single(lxb_encoding_decode_t *ctx,
2458
- const lxb_char_t **data, const lxb_char_t *end)
2459
- {
2460
- if (**data < 0x80) {
2461
- return *(*data)++;
2462
- }
2463
-
2464
- return lxb_encoding_single_index_iso_8859_13[*(*data)++ - 0x80].codepoint;
2465
- }
2466
-
2467
- lxb_codepoint_t
2468
- lxb_encoding_decode_iso_8859_14_single(lxb_encoding_decode_t *ctx,
2469
- const lxb_char_t **data, const lxb_char_t *end)
2470
- {
2471
- if (**data < 0x80) {
2472
- return *(*data)++;
2473
- }
2474
-
2475
- return lxb_encoding_single_index_iso_8859_14[*(*data)++ - 0x80].codepoint;
2476
- }
2477
-
2478
- lxb_codepoint_t
2479
- lxb_encoding_decode_iso_8859_15_single(lxb_encoding_decode_t *ctx,
2480
- const lxb_char_t **data, const lxb_char_t *end)
2481
- {
2482
- if (**data < 0x80) {
2483
- return *(*data)++;
2484
- }
2485
-
2486
- return lxb_encoding_single_index_iso_8859_15[*(*data)++ - 0x80].codepoint;
2487
- }
2488
-
2489
- lxb_codepoint_t
2490
- lxb_encoding_decode_iso_8859_16_single(lxb_encoding_decode_t *ctx,
2491
- const lxb_char_t **data, const lxb_char_t *end)
2492
- {
2493
- if (**data < 0x80) {
2494
- return *(*data)++;
2495
- }
2496
-
2497
- return lxb_encoding_single_index_iso_8859_16[*(*data)++ - 0x80].codepoint;
2498
- }
2499
-
2500
- lxb_codepoint_t
2501
- lxb_encoding_decode_iso_8859_2_single(lxb_encoding_decode_t *ctx,
2502
- const lxb_char_t **data, const lxb_char_t *end)
2503
- {
2504
- if (**data < 0x80) {
2505
- return *(*data)++;
2506
- }
2507
-
2508
- return lxb_encoding_single_index_iso_8859_2[*(*data)++ - 0x80].codepoint;
2509
- }
2510
-
2511
- lxb_codepoint_t
2512
- lxb_encoding_decode_iso_8859_3_single(lxb_encoding_decode_t *ctx,
2513
- const lxb_char_t **data, const lxb_char_t *end)
2514
- {
2515
- if (**data < 0x80) {
2516
- return *(*data)++;
2517
- }
2518
-
2519
- return lxb_encoding_single_index_iso_8859_3[*(*data)++ - 0x80].codepoint;
2520
- }
2521
-
2522
- lxb_codepoint_t
2523
- lxb_encoding_decode_iso_8859_4_single(lxb_encoding_decode_t *ctx,
2524
- const lxb_char_t **data, const lxb_char_t *end)
2525
- {
2526
- if (**data < 0x80) {
2527
- return *(*data)++;
2528
- }
2529
-
2530
- return lxb_encoding_single_index_iso_8859_4[*(*data)++ - 0x80].codepoint;
2531
- }
2532
-
2533
- lxb_codepoint_t
2534
- lxb_encoding_decode_iso_8859_5_single(lxb_encoding_decode_t *ctx,
2535
- const lxb_char_t **data, const lxb_char_t *end)
2536
- {
2537
- if (**data < 0x80) {
2538
- return *(*data)++;
2539
- }
2540
-
2541
- return lxb_encoding_single_index_iso_8859_5[*(*data)++ - 0x80].codepoint;
2542
- }
2543
-
2544
- lxb_codepoint_t
2545
- lxb_encoding_decode_iso_8859_6_single(lxb_encoding_decode_t *ctx,
2546
- const lxb_char_t **data, const lxb_char_t *end)
2547
- {
2548
- if (**data < 0x80) {
2549
- return *(*data)++;
2550
- }
2551
-
2552
- return lxb_encoding_single_index_iso_8859_6[*(*data)++ - 0x80].codepoint;
2553
- }
2554
-
2555
- lxb_codepoint_t
2556
- lxb_encoding_decode_iso_8859_7_single(lxb_encoding_decode_t *ctx,
2557
- const lxb_char_t **data, const lxb_char_t *end)
2558
- {
2559
- if (**data < 0x80) {
2560
- return *(*data)++;
2561
- }
2562
-
2563
- return lxb_encoding_single_index_iso_8859_7[*(*data)++ - 0x80].codepoint;
2564
- }
2565
-
2566
- lxb_codepoint_t
2567
- lxb_encoding_decode_iso_8859_8_single(lxb_encoding_decode_t *ctx,
2568
- const lxb_char_t **data, const lxb_char_t *end)
2569
- {
2570
- if (**data < 0x80) {
2571
- return *(*data)++;
2572
- }
2573
-
2574
- return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2575
- }
2576
-
2577
- lxb_codepoint_t
2578
- lxb_encoding_decode_iso_8859_8_i_single(lxb_encoding_decode_t *ctx,
2579
- const lxb_char_t **data, const lxb_char_t *end)
2580
- {
2581
- if (**data < 0x80) {
2582
- return *(*data)++;
2583
- }
2584
-
2585
- return lxb_encoding_single_index_iso_8859_8[*(*data)++ - 0x80].codepoint;
2586
- }
2587
-
2588
- lxb_codepoint_t
2589
- lxb_encoding_decode_koi8_r_single(lxb_encoding_decode_t *ctx,
2590
- const lxb_char_t **data, const lxb_char_t *end)
2591
- {
2592
- if (**data < 0x80) {
2593
- return *(*data)++;
2594
- }
2595
-
2596
- return lxb_encoding_single_index_koi8_r[*(*data)++ - 0x80].codepoint;
2597
- }
2598
-
2599
- lxb_codepoint_t
2600
- lxb_encoding_decode_koi8_u_single(lxb_encoding_decode_t *ctx,
2601
- const lxb_char_t **data, const lxb_char_t *end)
2602
- {
2603
- if (**data < 0x80) {
2604
- return *(*data)++;
2605
- }
2606
-
2607
- return lxb_encoding_single_index_koi8_u[*(*data)++ - 0x80].codepoint;
2608
- }
2609
-
2610
- lxb_codepoint_t
2611
- lxb_encoding_decode_shift_jis_single(lxb_encoding_decode_t *ctx,
2612
- const lxb_char_t **data, const lxb_char_t *end)
2613
- {
2614
- lxb_char_t byte, lead;
2615
-
2616
- if (ctx->u.lead != 0x00) {
2617
- lead = (lxb_char_t) ctx->u.lead;
2618
- ctx->u.lead = 0x00;
2619
-
2620
- goto lead_state;
2621
- }
2622
-
2623
- lead = *(*data)++;
2624
-
2625
- if (lead <= 0x80) {
2626
- return lead;
2627
- }
2628
-
2629
- if ((unsigned) (lead - 0xA1) <= (0xDF - 0xA1)) {
2630
- return 0xFF61 - 0xA1 + lead;
2631
- }
2632
-
2633
- if ((unsigned) (lead - 0x81) > (0x9F - 0x81)
2634
- && lead != 0xE0 && lead != 0xFC)
2635
- {
2636
- return LXB_ENCODING_DECODE_ERROR;
2637
- }
2638
-
2639
- if (*data >= end) {
2640
- ctx->u.lead = lead;
2641
-
2642
- return LXB_ENCODING_DECODE_CONTINUE;
2643
- }
2644
-
2645
- lead_state:
2646
-
2647
- byte = *(*data)++;
2648
-
2649
- if (byte < 0x7F) {
2650
- ctx->codepoint = 0x40;
2651
- }
2652
- else {
2653
- ctx->codepoint = 0x41;
2654
- }
2655
-
2656
- if (lead < 0xA0) {
2657
- ctx->second_codepoint = 0x81;
2658
- }
2659
- else {
2660
- ctx->second_codepoint = 0xC1;
2661
- }
2662
-
2663
- if ((unsigned) (byte - 0x40) <= (0x7E - 0x40)
2664
- || (unsigned) (byte - 0x80) <= (0xFC - 0x80))
2665
- {
2666
- /* Max index == (0xFC - 0xC1) * 188 + 0xFC - 0x41 = 11279 */
2667
- ctx->codepoint = (lead - ctx->second_codepoint) * 188
2668
- + byte - ctx->codepoint;
2669
-
2670
- if (ctx->codepoint >= (sizeof(lxb_encoding_multi_index_jis0208)
2671
- / sizeof(lxb_encoding_multi_index_t)))
2672
- {
2673
- goto failed;
2674
- }
2675
-
2676
- if ((unsigned) (ctx->codepoint - 8836) <= (10715 - 8836)) {
2677
- return 0xE000 - 8836 + ctx->codepoint;
2678
- }
2679
-
2680
- ctx->codepoint = lxb_encoding_multi_index_jis0208[ctx->codepoint].codepoint;
2681
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2682
- goto failed;
2683
- }
2684
-
2685
- return ctx->codepoint;
2686
- }
2687
-
2688
- failed:
2689
-
2690
- if (byte < 0x80) {
2691
- (*data)--;
2692
- }
2693
-
2694
- return LXB_ENCODING_DECODE_ERROR;
2695
- }
2696
-
2697
- lxb_inline lxb_codepoint_t
2698
- lxb_encoding_decode_utf_16_single(lxb_encoding_decode_t *ctx, bool is_be,
2699
- const lxb_char_t **data, const lxb_char_t *end)
2700
- {
2701
- unsigned lead;
2702
- lxb_codepoint_t unit;
2703
-
2704
- if (ctx->u.lead != 0x00) {
2705
- lead = ctx->u.lead - 0x01;
2706
- ctx->u.lead = 0x00;
2707
-
2708
- goto lead_state;
2709
- }
2710
-
2711
- pair_state:
2712
-
2713
- lead = *(*data)++;
2714
-
2715
- if (*data >= end) {
2716
- ctx->u.lead = lead + 0x01;
2717
- return LXB_ENCODING_DECODE_CONTINUE;
2718
- }
2719
-
2720
- lead_state:
2721
-
2722
- /* For UTF-16BE or UTF-16LE */
2723
- if (is_be) {
2724
- unit = (lead << 8) + *(*data)++;
2725
- }
2726
- else {
2727
- unit = (*(*data)++ << 8) + lead;
2728
- }
2729
-
2730
- if (ctx->second_codepoint != 0x00) {
2731
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
2732
- ctx->codepoint = 0x10000 + ((ctx->second_codepoint - 0xD800) << 10)
2733
- + (unit - 0xDC00);
2734
-
2735
- ctx->second_codepoint = 0x00;
2736
- return ctx->codepoint;
2737
- }
2738
-
2739
- (*data)--;
2740
-
2741
- ctx->u.lead = lead + 0x01;
2742
- ctx->second_codepoint = 0x00;
2743
-
2744
- return LXB_ENCODING_DECODE_ERROR;
2745
- }
2746
-
2747
- /* Surrogate pair */
2748
- if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
2749
- if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
2750
- return LXB_ENCODING_DECODE_ERROR;
2751
- }
2752
-
2753
- ctx->second_codepoint = unit;
2754
-
2755
- if (*data >= end) {
2756
- return LXB_ENCODING_DECODE_CONTINUE;
2757
- }
2758
-
2759
- goto pair_state;
2760
- }
2761
-
2762
- return unit;
2763
- }
2764
-
2765
- lxb_codepoint_t
2766
- lxb_encoding_decode_utf_16be_single(lxb_encoding_decode_t *ctx,
2767
- const lxb_char_t **data, const lxb_char_t *end)
2768
- {
2769
- return lxb_encoding_decode_utf_16_single(ctx, true, data, end);
2770
- }
2771
-
2772
- lxb_codepoint_t
2773
- lxb_encoding_decode_utf_16le_single(lxb_encoding_decode_t *ctx,
2774
- const lxb_char_t **data, const lxb_char_t *end)
2775
- {
2776
- return lxb_encoding_decode_utf_16_single(ctx, false, data, end);
2777
- }
2778
-
2779
- lxb_codepoint_t
2780
- lxb_encoding_decode_utf_8_single(lxb_encoding_decode_t *ctx,
2781
- const lxb_char_t **data, const lxb_char_t *end)
2782
- {
2783
- unsigned needed;
2784
- lxb_char_t ch;
2785
- const lxb_char_t *p;
2786
-
2787
- if (ctx->u.utf_8.need != 0) {
2788
- needed = ctx->u.utf_8.need;
2789
- ctx->u.utf_8.need = 0;
2790
-
2791
- if (ctx->u.utf_8.lower != 0x00) {
2792
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(ctx->u.utf_8.lower,
2793
- ctx->u.utf_8.upper);
2794
- ctx->u.utf_8.lower = 0x00;
2795
- }
2796
-
2797
- goto decode;
2798
- }
2799
-
2800
- ch = *(*data)++;
2801
-
2802
- if (ch < 0x80) {
2803
- return ch;
2804
- }
2805
- else if (ch <= 0xDF) {
2806
- if (ch < 0xC2) {
2807
- return LXB_ENCODING_DECODE_ERROR;
2808
- }
2809
-
2810
- needed = 1;
2811
- ctx->codepoint = ch & 0x1F;
2812
- }
2813
- else if (ch < 0xF0) {
2814
- needed = 2;
2815
- ctx->codepoint = ch & 0x0F;
2816
-
2817
- if (*data == end) {
2818
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xE0, 0xED,
2819
- 0xA0, 0x9F);
2820
- goto next;
2821
- }
2822
-
2823
- if (ch == 0xE0) {
2824
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0xA0, 0xBF);
2825
- }
2826
- else if (ch == 0xED) {
2827
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x9F);
2828
- }
2829
- }
2830
- else if (ch < 0xF5) {
2831
- needed = 3;
2832
- ctx->codepoint = ch & 0x07;
2833
-
2834
- if (*data == end) {
2835
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SET_SINGLE(0xF0, 0xF4,
2836
- 0x90, 0x8F);
2837
-
2838
- goto next;
2839
- }
2840
-
2841
- if (ch == 0xF0) {
2842
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x90, 0xBF);
2843
- }
2844
- else if (ch == 0xF4) {
2845
- LXB_ENCODING_DECODE_UTF_8_BOUNDARY_SINGLE(0x80, 0x8F);
2846
- }
2847
- }
2848
- else {
2849
- return LXB_ENCODING_DECODE_ERROR;
2850
- }
2851
-
2852
- decode:
2853
-
2854
- for (p = *data; p < end; p++) {
2855
- ch = *p;
2856
-
2857
- if (ch < 0x80 || ch > 0xBF) {
2858
- *data = p;
2859
-
2860
- goto failed;
2861
- }
2862
-
2863
- ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
2864
-
2865
- if (--needed == 0) {
2866
- *data = p + 1;
2867
-
2868
- return ctx->codepoint;
2869
- }
2870
- }
2871
-
2872
- *data = p;
2873
-
2874
- next:
2875
-
2876
- ctx->u.utf_8.need = needed;
2877
-
2878
- return LXB_ENCODING_DECODE_CONTINUE;
2879
-
2880
- failed:
2881
-
2882
- ctx->u.utf_8.lower = 0x00;
2883
- ctx->u.utf_8.need = 0;
2884
-
2885
- return LXB_ENCODING_DECODE_ERROR;
2886
- }
2887
-
2888
- lxb_codepoint_t
2889
- lxb_encoding_decode_gb18030_single(lxb_encoding_decode_t *ctx,
2890
- const lxb_char_t **data, const lxb_char_t *end)
2891
- {
2892
- uint32_t pointer;
2893
- lxb_char_t first, second, third, offset;
2894
-
2895
- /* Make compiler happy */
2896
- second = 0x00;
2897
-
2898
- if (ctx->u.gb18030.first != 0) {
2899
- if (ctx->u.gb18030.third != 0x00) {
2900
- first = ctx->u.gb18030.first;
2901
- second = ctx->u.gb18030.second;
2902
- third = ctx->u.gb18030.third;
2903
-
2904
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
2905
-
2906
- if (ctx->prepend) {
2907
- /* The first is always < 0x80 */
2908
- ctx->u.gb18030.first = third;
2909
-
2910
- return second;
2911
- }
2912
-
2913
- goto third_state;
2914
- }
2915
- else if (ctx->u.gb18030.second != 0x00) {
2916
- first = ctx->u.gb18030.first;
2917
- second = ctx->u.gb18030.second;
2918
-
2919
- memset(&ctx->u.gb18030, 0, sizeof(lxb_encoding_ctx_gb18030_t));
2920
-
2921
- goto second_state;
2922
- }
2923
-
2924
- first = ctx->u.gb18030.first;
2925
- ctx->u.gb18030.first = 0x00;
2926
-
2927
- if (ctx->prepend) {
2928
- ctx->prepend = false;
2929
- goto prepend_first;
2930
- }
2931
-
2932
- goto first_state;
2933
- }
2934
-
2935
- first = *(*data)++;
2936
-
2937
- prepend_first:
2938
-
2939
- if (first < 0x80) {
2940
- return first;
2941
- }
2942
-
2943
- if (first == 0x80) {
2944
- return 0x20AC;
2945
- }
2946
-
2947
- /* Range 0x81 to 0xFE, inclusive */
2948
- if ((unsigned) (first - 0x81) > (0xFE - 0x81)) {
2949
- return LXB_ENCODING_DECODE_ERROR;
2950
- }
2951
-
2952
- if (*data == end) {
2953
- ctx->u.gb18030.first = first;
2954
- return LXB_ENCODING_DECODE_CONTINUE;
2955
- }
2956
-
2957
- /* First */
2958
- first_state:
2959
-
2960
- second = *(*data)++;
2961
-
2962
- /* Range 0x30 to 0x39, inclusive */
2963
- if ((unsigned) (second - 0x30) > (0x39 - 0x30)) {
2964
- offset = (second < 0x7F) ? 0x40 : 0x41;
2965
-
2966
- /* Range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive */
2967
- if ((unsigned) (second - 0x40) <= (0x7E - 0x40)
2968
- || (unsigned) (second - 0x80) <= (0xFE - 0x80))
2969
- {
2970
- pointer = (first - 0x81) * 190 + (second - offset);
2971
- }
2972
- else {
2973
- goto failed;
2974
- }
2975
-
2976
- /* Max pointer value == (0xFE - 0x81) * 190 + (0xFE - 0x41) == 23939 */
2977
- ctx->codepoint = lxb_encoding_multi_index_gb18030[pointer].codepoint;
2978
- if (ctx->codepoint == LXB_ENCODING_ERROR_CODEPOINT) {
2979
- goto failed;
2980
- }
2981
-
2982
- return ctx->codepoint;
2983
- }
2984
-
2985
- if (*data == end) {
2986
- ctx->u.gb18030.first = first;
2987
- ctx->u.gb18030.second = second;
2988
-
2989
- return LXB_ENCODING_DECODE_CONTINUE;
2990
- }
2991
-
2992
- /* Second */
2993
- second_state:
2994
-
2995
- third = *(*data)++;
2996
-
2997
- /* Range 0x81 to 0xFE, inclusive */
2998
- if ((unsigned) (third - 0x81) > (0xFE - 0x81)) {
2999
- (*data)--;
3000
-
3001
- ctx->prepend = true;
3002
- ctx->u.gb18030.first = second;
3003
-
3004
- return LXB_ENCODING_DECODE_ERROR;
3005
- }
3006
-
3007
- if (*data == end) {
3008
- ctx->u.gb18030.first = first;
3009
- ctx->u.gb18030.second = second;
3010
- ctx->u.gb18030.third = third;
3011
-
3012
- return LXB_ENCODING_DECODE_CONTINUE;
3013
- }
3014
-
3015
- /* Third */
3016
- third_state:
3017
-
3018
- /* Range 0x30 to 0x39, inclusive */
3019
- if ((unsigned) (**data - 0x30) > (0x39 - 0x30)) {
3020
- ctx->prepend = true;
3021
-
3022
- /* First is a fake for trigger */
3023
- ctx->u.gb18030.first = 0x01;
3024
- ctx->u.gb18030.second = second;
3025
- ctx->u.gb18030.third = third;
3026
-
3027
- return LXB_ENCODING_DECODE_ERROR;
3028
- }
3029
-
3030
- pointer = ((first - 0x81) * (10 * 126 * 10))
3031
- + ((second - 0x30) * (10 * 126))
3032
- + ((third - 0x81) * 10) + (*(*data)++) - 0x30;
3033
-
3034
- return lxb_encoding_decode_gb18030_range(pointer);
3035
-
3036
- failed:
3037
-
3038
- if (second < 0x80) {
3039
- (*data)--;
3040
- }
3041
-
3042
- return LXB_ENCODING_DECODE_ERROR;
3043
- }
3044
-
3045
- lxb_codepoint_t
3046
- lxb_encoding_decode_macintosh_single(lxb_encoding_decode_t *ctx,
3047
- const lxb_char_t **data, const lxb_char_t *end)
3048
- {
3049
- if (**data < 0x80) {
3050
- return *(*data)++;
3051
- }
3052
-
3053
- return lxb_encoding_single_index_macintosh[*(*data)++ - 0x80].codepoint;
3054
- }
3055
-
3056
- lxb_codepoint_t
3057
- lxb_encoding_decode_replacement_single(lxb_encoding_decode_t *ctx,
3058
- const lxb_char_t **data, const lxb_char_t *end)
3059
- {
3060
- return LXB_ENCODING_DECODE_ERROR;
3061
- }
3062
-
3063
- lxb_codepoint_t
3064
- lxb_encoding_decode_windows_1250_single(lxb_encoding_decode_t *ctx,
3065
- const lxb_char_t **data, const lxb_char_t *end)
3066
- {
3067
- if (**data < 0x80) {
3068
- return *(*data)++;
3069
- }
3070
-
3071
- return lxb_encoding_single_index_windows_1250[*(*data)++ - 0x80].codepoint;
3072
- }
3073
-
3074
- lxb_codepoint_t
3075
- lxb_encoding_decode_windows_1251_single(lxb_encoding_decode_t *ctx,
3076
- const lxb_char_t **data, const lxb_char_t *end)
3077
- {
3078
- if (**data < 0x80) {
3079
- return *(*data)++;
3080
- }
3081
-
3082
- return lxb_encoding_single_index_windows_1251[*(*data)++ - 0x80].codepoint;
3083
- }
3084
-
3085
- lxb_codepoint_t
3086
- lxb_encoding_decode_windows_1252_single(lxb_encoding_decode_t *ctx,
3087
- const lxb_char_t **data, const lxb_char_t *end)
3088
- {
3089
- if (**data < 0x80) {
3090
- return *(*data)++;
3091
- }
3092
-
3093
- return lxb_encoding_single_index_windows_1252[*(*data)++ - 0x80].codepoint;
3094
- }
3095
-
3096
- lxb_codepoint_t
3097
- lxb_encoding_decode_windows_1253_single(lxb_encoding_decode_t *ctx,
3098
- const lxb_char_t **data, const lxb_char_t *end)
3099
- {
3100
- if (**data < 0x80) {
3101
- return *(*data)++;
3102
- }
3103
-
3104
- return lxb_encoding_single_index_windows_1253[*(*data)++ - 0x80].codepoint;
3105
- }
3106
-
3107
- lxb_codepoint_t
3108
- lxb_encoding_decode_windows_1254_single(lxb_encoding_decode_t *ctx,
3109
- const lxb_char_t **data, const lxb_char_t *end)
3110
- {
3111
- if (**data < 0x80) {
3112
- return *(*data)++;
3113
- }
3114
-
3115
- return lxb_encoding_single_index_windows_1254[*(*data)++ - 0x80].codepoint;
3116
- }
3117
-
3118
- lxb_codepoint_t
3119
- lxb_encoding_decode_windows_1255_single(lxb_encoding_decode_t *ctx,
3120
- const lxb_char_t **data, const lxb_char_t *end)
3121
- {
3122
- if (**data < 0x80) {
3123
- return *(*data)++;
3124
- }
3125
-
3126
- return lxb_encoding_single_index_windows_1255[*(*data)++ - 0x80].codepoint;
3127
- }
3128
-
3129
- lxb_codepoint_t
3130
- lxb_encoding_decode_windows_1256_single(lxb_encoding_decode_t *ctx,
3131
- const lxb_char_t **data, const lxb_char_t *end)
3132
- {
3133
- if (**data < 0x80) {
3134
- return *(*data)++;
3135
- }
3136
-
3137
- return lxb_encoding_single_index_windows_1256[*(*data)++ - 0x80].codepoint;
3138
- }
3139
-
3140
- lxb_codepoint_t
3141
- lxb_encoding_decode_windows_1257_single(lxb_encoding_decode_t *ctx,
3142
- const lxb_char_t **data, const lxb_char_t *end)
3143
- {
3144
- if (**data < 0x80) {
3145
- return *(*data)++;
3146
- }
3147
-
3148
- return lxb_encoding_single_index_windows_1257[*(*data)++ - 0x80].codepoint;
3149
- }
3150
-
3151
- lxb_codepoint_t
3152
- lxb_encoding_decode_windows_1258_single(lxb_encoding_decode_t *ctx,
3153
- const lxb_char_t **data, const lxb_char_t *end)
3154
- {
3155
- if (**data < 0x80) {
3156
- return *(*data)++;
3157
- }
3158
-
3159
- return lxb_encoding_single_index_windows_1258[*(*data)++ - 0x80].codepoint;
3160
- }
3161
-
3162
- lxb_codepoint_t
3163
- lxb_encoding_decode_windows_874_single(lxb_encoding_decode_t *ctx,
3164
- const lxb_char_t **data, const lxb_char_t *end)
3165
- {
3166
- if (**data < 0x80) {
3167
- return *(*data)++;
3168
- }
3169
-
3170
- return lxb_encoding_single_index_windows_874[*(*data)++ - 0x80].codepoint;
3171
- }
3172
-
3173
- lxb_codepoint_t
3174
- lxb_encoding_decode_x_mac_cyrillic_single(lxb_encoding_decode_t *ctx,
3175
- const lxb_char_t **data, const lxb_char_t *end)
3176
- {
3177
- if (**data < 0x80) {
3178
- return *(*data)++;
3179
- }
3180
-
3181
- return lxb_encoding_single_index_x_mac_cyrillic[*(*data)++ - 0x80].codepoint;
3182
- }
3183
-
3184
- lxb_codepoint_t
3185
- lxb_encoding_decode_x_user_defined_single(lxb_encoding_decode_t *ctx,
3186
- const lxb_char_t **data, const lxb_char_t *end)
3187
- {
3188
- if (**data < 0x80) {
3189
- return *(*data)++;
3190
- }
3191
-
3192
- return 0xF780 + (*(*data)++) - 0x80;
3193
- }