fast-xml 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1002 @@
1
+ #include "xh_config.h"
2
+ #include "xh_core.h"
3
+ #include "xh_ruby_hash.h"
4
+
5
+ static const char DEF_CONTENT_KEY[] = "content"; /* fallback hash key for text content, used when ctx->opts.content is empty */
6
+
7
+ #define _NEW_STRING(s, l, utf8) /* new Ruby String from l bytes at s; built with rb_utf8_str_new when utf8 is true, rb_str_new otherwise */ \
8
+ ((utf8) ? rb_utf8_str_new((const char *) (s), l) : rb_str_new((const char *) (s), l))
9
+
10
+ #define NEW_STRING(s, l) /* _NEW_STRING driven by the utf8 option of the ctx that is in scope at the expansion site */ \
11
+ _NEW_STRING(s, l, ctx->opts.utf8)
12
+
13
+ #define CAT_STRING(v, s, l) /* append l bytes at s to Ruby String v; note the expansion already ends with ';' */ \
14
+ rb_str_cat((v), (const char *) (s), (l));
15
+
16
+ #define SAVE_VALUE(lv, v , s, l) /* store string (s, l) into slot *lv whose current value is v; needs av and ctx in scope */ \
17
+ xh_log_trace2("save value: [%.*s]", l, s); \
18
+ if ( RTEST(v) ) { \
19
+ xh_log_trace0("add to array"); \
20
+ /* the slot already holds an array: reuse it */ \
21
+ if ( RB_TYPE_P(v, RUBY_T_ARRAY) ) { \
22
+ av = v; \
23
+ } \
24
+ /* otherwise create a new array in the slot and move the old value into it */ \
25
+ else { \
26
+ av = rb_ary_new(); \
27
+ *(lv) = av; \
28
+ rb_ary_push(av, v); \
29
+ (v) = *(lv); \
30
+ } \
31
+ /* append the new string to the array */ \
32
+ rb_ary_push(av, NEW_STRING((s), (l))); \
33
+ (lv) = (VALUE *) &RARRAY_CONST_PTR(av)[RARRAY_LEN(av) - 1]; /* repoint lv at the element just pushed */ \
34
+ } \
35
+ else { \
36
+ xh_log_trace0("set string"); \
37
+ *(lv) = NEW_STRING(s, l); /* slot was nil/false: store the string directly */ \
38
+ } \
39
+
40
+ #define _OPEN_TAG(s, l) /* enter child node named (s, l) below *lval; needs val, lval, av, depth, nodes, ctx, content_key in scope */ \
41
+ val = *lval; \
42
+ /* if content already exists, move it into a new hash under the content key */ \
43
+ if ( RB_TYPE_P(val, RUBY_T_STRING) || RB_TYPE_P(val, RUBY_T_ARRAY)) {\
44
+ *lval = hash_new(); \
45
+ if (RB_TYPE_P(val, RUBY_T_HASH) || RB_TYPE_P(val, RUBY_T_ARRAY) || (RB_TYPE_P(val, RUBY_T_STRING) && RSTRING_LEN(val))) { /* NOTE(review): the RUBY_T_HASH test cannot be true here, val is a String or an Array */\
46
+ (void) hash_store(*lval, (const char *) content_key, content_key_len, val);\
47
+ } \
48
+ val = *lval; \
49
+ } \
50
+ /* fetch the existing hash entry for this name or create an empty one */ \
51
+ lval = hash_fetch(val, (const char *) s, l, Qnil); \
52
+ /* save as empty string */ \
53
+ val = *lval; \
54
+ xh_log_trace0("save as empty string"); \
55
+ SAVE_VALUE(lval, val, "", 0) \
56
+ if (++depth >= ctx->opts.max_depth) goto MAX_DEPTH_EXCEEDED; \
57
+ nodes[depth].lval = lval; \
58
+ nodes[depth].flags = XH_X2H_NODE_FLAG_NONE; \
59
+ if (depth > 1 && ctx->opts.force_array.enable && !RB_TYPE_P(val, RUBY_T_ARRAY) \
60
+ && (ctx->opts.force_array.always || xh_x2h_match_node(s, l, ctx->opts.force_array.expr))\
61
+ ) { \
62
+ nodes[depth].flags |= XH_X2H_NODE_FLAG_FORCE_ARRAY; /* acted upon later by _CLOSE_TAG */ \
63
+ } \
64
+ (s) = NULL;
65
+
66
+ #define OPEN_TAG(s, l) /* handle an opening tag named (s, l): root check, optional filter matching, then _OPEN_TAG */ \
67
+ xh_log_trace2("new tag: [%.*s]", l, s); \
68
+ if (real_depth == 0) { \
69
+ if (flags & XH_X2H_ROOT_FOUND) goto INVALID_XML; /* a second top-level element is rejected */ \
70
+ flags |= XH_X2H_ROOT_FOUND; \
71
+ } \
72
+ if (XH_X2H_FILTER_SEARCH(flags)) { \
73
+ xh_x2h_xpath_update(ctx->xpath, s, l); \
74
+ if (xh_x2h_match_node(ctx->xpath, xh_strlen(ctx->xpath), ctx->opts.filter.expr)) {\
75
+ xh_log_trace2("match node: [%.*s]", l, s); \
76
+ ctx->hash = hash_new(); /* a matched node is collected into a fresh hash rooted at nodes[0] */ \
77
+ nodes[0].lval = lval = &ctx->hash; \
78
+ depth = 0; \
79
+ flags |= XH_X2H_FILTER_MATCHED; \
80
+ } \
81
+ } \
82
+ if (!XH_X2H_FILTER_SEARCH(flags)) { \
83
+ _OPEN_TAG(s, l) \
84
+ } \
85
+ real_depth++;
86
+
87
+ #define _CLOSE_TAG /* leave the node at nodes[depth]: apply force_content / force_array, then step lval back to the parent */ \
88
+ val = *nodes[depth].lval; \
89
+ if (ctx->opts.force_content && RB_TYPE_P(val, RUBY_T_STRING)) { /* wrap a bare string into { content_key => string } */ \
90
+ lval = nodes[depth].lval; \
91
+ *lval = hash_new(); \
92
+ (void) hash_store(*lval, (const char *) content_key, content_key_len, val);\
93
+ val = *lval; \
94
+ } \
95
+ if ((nodes[depth].flags & XH_X2H_NODE_FLAG_FORCE_ARRAY) \
96
+ && (RB_TYPE_P(val, RUBY_T_STRING) || !RB_TYPE_P(val, RUBY_T_ARRAY)) /* equivalent to: val is not an Array */\
97
+ ) { \
98
+ lval = nodes[depth].lval; \
99
+ av = rb_ary_new(); \
100
+ *lval = av; \
101
+ rb_ary_push(av, val); \
102
+ } \
103
+ lval = nodes[--depth].lval;
104
+
105
+ #define CLOSE_TAG /* handle a closing (or self-closing) tag, emitting a finished filter match if there is one */ \
106
+ xh_log_trace0("close tag"); \
107
+ if (real_depth == 0) goto INVALID_XML; /* closing tag without a matching open tag */ \
108
+ if (!XH_X2H_FILTER_SEARCH(flags)) { \
109
+ _CLOSE_TAG \
110
+ } \
111
+ if ((flags & XH_X2H_FILTER_MATCHED) && depth == 0) { \
112
+ xh_log_trace0("match node finished"); \
113
+ val = *nodes[0].lval; \
114
+ if (ctx->opts.block_given) { \
115
+ rb_yield(val); /* hand the matched node to the caller's block */ \
116
+ } \
117
+ else { \
118
+ rb_ary_push(ctx->result, val); /* no block: collect matches in ctx->result */ \
119
+ } \
120
+ flags ^= XH_X2H_FILTER_MATCHED; /* the bit is known to be set here, so this clears it */ \
121
+ } \
122
+ if ((flags & (XH_X2H_FILTER_ENABLED | XH_X2H_FILTER_MATCHED)) == XH_X2H_FILTER_ENABLED) {\
123
+ xh_x2h_xpath_update(ctx->xpath, NULL, 0); /* NULL name: drop the last path component */ \
124
+ } \
125
+ real_depth--;
126
+
127
+ #define NEW_NODE_ATTRIBUTE(k, kl, v, vl) /* store an element attribute as if it were a child node k with text v */ \
128
+ if (!XH_X2H_FILTER_SEARCH(flags)) { \
129
+ _OPEN_TAG(k, kl) \
130
+ _NEW_TEXT(v, vl) \
131
+ _CLOSE_TAG \
132
+ }
133
+
134
+ #define NEW_XML_DECL_ATTRIBUTE(k, kl, v, vl) /* attribute of the XML declaration: only "encoding" is kept */ \
135
+ xh_log_trace4("new xml decl attr name: [%.*s] value: [%.*s]", kl, k, vl, v);\
136
+ /* copy the encoding parameter into ctx->encoding when the attribute name is "encoding" */ \
137
+ if ((kl) == (sizeof("encoding") - 1) && \
138
+ xh_strncmp((k), XH_CHAR_CAST "encoding", sizeof("encoding") - 1) == 0) {\
139
+ xh_str_range_copy(ctx->encoding, XH_CHAR_CAST (v), vl, XH_PARAM_LEN);\
140
+ } \
141
+ (k) = (v) = NULL;
142
+
143
+ #define NEW_ATTRIBUTE(k, kl, v, vl) NEW_NODE_ATTRIBUTE(k, kl, v, vl) /* default binding; redefined temporarily while the XML declaration is parsed */
144
+
145
+ #define _NEW_TEXT(s, l) /* add text (s, l) to the current node *lval; needs val, lval, av, depth, nodes, ctx, content_key in scope */ \
146
+ val = *lval; \
147
+ if ( !RB_TYPE_P(val, RUBY_T_STRING) ) { \
148
+ xh_log_trace0("add to array"); \
149
+ /* current value is an array: append the text as a new element */ \
150
+ if ( RB_TYPE_P(val, RUBY_T_ARRAY) ) { \
151
+ av = val; \
152
+ rb_ary_push(av, NEW_STRING(s, l)); \
153
+ } \
154
+ /* otherwise save the text in the hash under the content key */ \
155
+ else { \
156
+ xh_log_trace0("save to hash"); \
157
+ lval = hash_fetch(val, (const char *) content_key, content_key_len, Qnil);\
158
+ val = *lval; \
159
+ SAVE_VALUE(lval, val, s, l) \
160
+ lval = nodes[depth].lval; /* restore lval to the current node's slot */ \
161
+ } \
162
+ } \
163
+ else if (RSTRING_LEN(val) && !ctx->opts.merge_text) { \
164
+ xh_log_trace0("create a new array"); \
165
+ xh_log_trace1("create a new array val: %s", StringValueCStr(val));\
166
+ /* content already exists: create a new array and move */ \
167
+ /* both the old and the new content into it */ \
168
+ *lval = av = rb_ary_new(); \
169
+ rb_ary_push(av, val); \
170
+ rb_ary_push(av, NEW_STRING(s, l)); \
171
+ } \
172
+ else { \
173
+ xh_log_trace0("concat"); \
174
+ /* empty string or merge_text set: concatenate with the previous string */ \
175
+ CAT_STRING(val, s, l) \
176
+ } \
177
+
178
+ #define NEW_TEXT(s, l) /* text node: rejected outside the root element, skipped while the filter is still searching */ \
179
+ xh_log_trace2("new text: [%.*s]", l, s); \
180
+ if (real_depth == 0) goto INVALID_XML; \
181
+ if (!XH_X2H_FILTER_SEARCH(flags)) { \
182
+ _NEW_TEXT(s, l) \
183
+ }
184
+
185
+ #define NEW_COMMENT(s, l) (s) = NULL; /* comments are discarded; only the pointer is reset */
186
+
187
+ #define NEW_CDATA(s, l) NEW_TEXT(s, l) /* CDATA content is stored exactly like ordinary text */
188
+
189
+ #define CHECK_EOF_WITH_CHUNK(loop) /* end of buffer or NUL byte: finish the loop if terminate is set, otherwise suspend */ \
190
+ if (cur >= eof || *cur == '\0') { \
191
+ eof = cur; \
192
+ if (terminate) goto XH_PPCAT(loop, _FINISH); \
193
+ ctx->state = XH_PPCAT(loop, _START); /* remember the loop to resume at on the next chunk */ \
194
+ goto CHUNK_FINISH; \
195
+ } \
196
+
197
+ #define CHECK_EOF_WITHOUT_CHUNK(loop) /* non-resumable variant: end of input always jumps to <loop>_FINISH */ \
198
+ if (cur >= eof || *cur == '\0') goto XH_PPCAT(loop, _FINISH); \
199
+
200
+ #define CHECK_EOF(loop) CHECK_EOF_WITH_CHUNK(loop) /* default check used by DO() */
201
+
202
+ #define DO(loop) /* open a resumable scan loop: label <loop>_START, EOF check, read next byte into c, open switch (c) */ \
203
+ XH_PPCAT(loop, _START): \
204
+ CHECK_EOF(loop) \
205
+ c = *cur++; \
206
+ xh_log_trace3("'%c'=[0x%X] %s start", c, c, XH_STRINGIZE(loop)); \
207
+ switch (c) {
208
+
209
+ #define _DO(loop) /* same as DO() but with the non-resumable EOF check */ \
210
+ XH_PPCAT(loop, _START): \
211
+ CHECK_EOF_WITHOUT_CHUNK(loop) \
212
+ c = *cur++; \
213
+ xh_log_trace3("'%c'=[0x%X] %s start", c, c, XH_STRINGIZE(loop)); \
214
+ switch (c) {
215
+
216
+ #define END(loop) /* close the switch opened by DO/_DO, loop back to <loop>_START, and define <loop>_FINISH */ \
217
+ } \
218
+ xh_log_trace1(" %s end", XH_STRINGIZE(loop)); \
219
+ goto XH_PPCAT(loop, _START); \
220
+ XH_PPCAT(loop, _FINISH):
221
+
222
+ #define EXPECT_ANY(desc) /* default: label of the enclosing DO/_DO switch, plus a trace line */ \
223
+ default: xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
224
+
225
+ #define EXPECT_CHAR(desc, c1) /* case label for the single character c1 */ \
226
+ case c1: xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
227
+
228
+ #define EXPECT_BLANK_WO_CR(desc) /* case labels for space, tab and LF (no CR) */ \
229
+ case ' ': case '\t': case '\n': \
230
+ xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
231
+
232
+ #define EXPECT_BLANK(desc) /* case labels for space, tab, LF and CR */ \
233
+ case ' ': case '\t': case '\n': case '\r': \
234
+ xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
235
+
236
+ #define EXPECT_DIGIT(desc) /* case labels for '0'..'9' */ \
237
+ case '0': case '1': case '2': case '3': case '4': \
238
+ case '5': case '6': case '7': case '8': case '9': \
239
+ xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
240
+
241
+ #define EXPECT_HEX_CHAR_LC(desc) /* case labels for 'a'..'f' */ \
242
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': \
243
+ xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
244
+
245
+ #define EXPECT_HEX_CHAR_UC(desc) /* case labels for 'A'..'F' */ \
246
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': \
247
+ xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);
248
+
249
+ #define SKIP_BLANK /* ignore a blank character: break out of the switch so END() loops to the next byte */ \
250
+ EXPECT_BLANK("skip blank") break;
251
+
252
+ #define SCAN2(loop, c1, c2) /* open two nested loops that require the exact sequence c1 c2; closed by END2 */ \
253
+ DO(XH_PPCAT(loop, _1)) EXPECT_CHAR(XH_STRINGIZE(c1), c1) \
254
+ DO(XH_PPCAT(loop, _2)) EXPECT_CHAR(XH_STRINGIZE(c2), c2)
255
+
256
+ #define END2(loop, stop) /* close the loops opened by SCAN2; any other character or end of input jumps to stop */ \
257
+ EXPECT_ANY("wrong character") goto stop; \
258
+ END(XH_PPCAT(loop, _2)) goto stop; \
259
+ EXPECT_ANY("wrong character") goto stop; \
260
+ END(XH_PPCAT(loop, _1))
261
+
262
/*
 * SCAN3(loop, c1, c2, c3) - open three nested scan loops that require the
 * exact character sequence c1 c2 c3. The loops are named <loop>_1, <loop>_2
 * and <loop>_3 and must be closed with END3(loop, stop).
 *
 * The trace description is built with XH_STRINGIZE, the stringizing helper
 * used by SCAN2, DO, _DO and END in this file (the previous spelling,
 * STRINGIZE, is not used anywhere else here).
 */
#define SCAN3(loop, c1, c2, c3) \
    DO(XH_PPCAT(loop, _1)) EXPECT_CHAR(XH_STRINGIZE(c1), c1) \
    DO(XH_PPCAT(loop, _2)) EXPECT_CHAR(XH_STRINGIZE(c2), c2) \
    DO(XH_PPCAT(loop, _3)) EXPECT_CHAR(XH_STRINGIZE(c3), c3)
266
+
267
+ #define END3(loop, stop) /* close the three loops opened by SCAN3; any other character or end of input jumps to stop */ \
268
+ EXPECT_ANY("wrong character") goto stop; \
269
+ END(XH_PPCAT(loop, _3)) goto stop; \
270
+ EXPECT_ANY("wrong character") goto stop; \
271
+ END(XH_PPCAT(loop, _2)) goto stop; \
272
+ EXPECT_ANY("wrong character") goto stop; \
273
+ END(XH_PPCAT(loop, _1))
274
+
275
+ #define SCAN5(loop, c1, c2, c3, c4, c5) /* five-character sequence built from SCAN3 + SCAN2 */ \
276
+ SCAN3(XH_PPCAT(loop, _1), c1, c2, c3) \
277
+ SCAN2(XH_PPCAT(loop, _2), c4, c5)
278
+
279
+ #define END5(loop, stop) /* closes SCAN5: inner SCAN2 part first, then the SCAN3 part */ \
280
+ END2(XH_PPCAT(loop, _2), stop) \
281
+ END3(XH_PPCAT(loop, _1), stop)
282
+
283
+ #define SCAN6(loop, c1, c2, c3, c4, c5, c6) /* six-character sequence built from two SCAN3 */ \
284
+ SCAN3(XH_PPCAT(loop, _1), c1, c2, c3) \
285
+ SCAN3(XH_PPCAT(loop, _2), c4, c5, c6)
286
+
287
+ #define END6(loop, stop) /* closes SCAN6: inner SCAN3 first, then the outer one */ \
288
+ END3(XH_PPCAT(loop, _2), stop) \
289
+ END3(XH_PPCAT(loop, _1), stop)
290
+
291
+ #define SEARCH_END_TAG /* cases for the end of an opening tag inside the attribute loop: '>' or the self-closing form */ \
292
+ EXPECT_CHAR("end tag", '>') \
293
+ goto PARSE_CONTENT; \
294
+ EXPECT_CHAR("self closing tag", '/') \
295
+ CLOSE_TAG \
296
+ DO(SEARCH_END_TAG) \
297
+ EXPECT_CHAR("end tag", '>') \
298
+ goto PARSE_CONTENT; \
299
+ EXPECT_ANY("wrong character") /* only '>' may follow the '/' */ \
300
+ goto INVALID_XML; \
301
+ END(SEARCH_END_TAG) \
302
+ goto INVALID_XML;
303
+
304
+ #define SEARCH_NODE_ATTRIBUTE_VALUE(loop, top_loop, quot) /* case for an element attribute value delimited by quot; jumps to top_loop once stored */ \
305
+ EXPECT_CHAR("start attr value", quot) \
306
+ content = cur; \
307
+ flags &= ~XH_X2H_NEED_NORMALIZE; \
308
+ DO(XH_PPCAT(loop, _END_ATTR_VALUE)) \
309
+ EXPECT_CHAR("attr value end", quot) \
310
+ if (flags & XH_X2H_NEED_NORMALIZE) { /* value contained CR or '&': store the normalized copy */ \
311
+ NORMALIZE_TEXT(loop, content, cur - content - 1) \
312
+ NEW_ATTRIBUTE(node, end - node, enc, enc_len) \
313
+ } \
314
+ else { \
315
+ NEW_ATTRIBUTE(node, end - node, content, cur - content - 1)\
316
+ } \
317
+ goto top_loop; \
318
+ EXPECT_CHAR("CR", '\r') \
319
+ flags |= XH_X2H_NORMALIZE_LINE_FEED; \
320
+ break; \
321
+ EXPECT_CHAR("reference", '&') \
322
+ flags |= XH_X2H_NORMALIZE_REF; \
323
+ break; \
324
+ END(XH_PPCAT(loop, _END_ATTR_VALUE)) \
325
+ goto INVALID_XML;
326
+
327
+ #define SEARCH_XML_DECL_ATTRIBUTE_VALUE(loop, top_loop, quot) /* XML declaration variant: the value is passed on without normalization */ \
328
+ EXPECT_CHAR("start attr value", quot) \
329
+ content = cur; \
330
+ DO(XH_PPCAT(loop, _END_ATTR_VALUE)) \
331
+ EXPECT_CHAR("attr value end", quot) \
332
+ NEW_ATTRIBUTE(node, end - node, content, cur - content - 1)\
333
+ goto top_loop; \
334
+ END(XH_PPCAT(loop, _END_ATTR_VALUE)) \
335
+ goto INVALID_XML;
336
+
337
+ #define SEARCH_ATTRIBUTE_VALUE(loop, top_loop, quot) SEARCH_NODE_ATTRIBUTE_VALUE(loop, top_loop, quot) /* default binding; redefined temporarily while the XML declaration is parsed */
338
+
339
+ #define SEARCH_ATTRIBUTES(loop, search_end_tag) /* attribute list scanner; search_end_tag supplies the cases that end the tag */ \
340
+ XH_PPCAT(loop, _SEARCH_ATTRIBUTES_LOOP): \
341
+ DO(XH_PPCAT(loop, _SEARCH_ATTR)) \
342
+ search_end_tag \
343
+ \
344
+ SKIP_BLANK \
345
+ \
346
+ EXPECT_ANY("start attr name") \
347
+ node = cur - 1; /* first byte of the attribute name */ \
348
+ \
349
+ DO(XH_PPCAT(loop, _PARSE_ATTR_NAME)) \
350
+ EXPECT_BLANK("end attr name") \
351
+ end = cur - 1; \
352
+ xh_log_trace2("attr name: [%.*s]", end - node, node);\
353
+ \
354
+ DO(XH_PPCAT(loop, _ATTR_SKIP_BLANK)) /* blanks between the name and '=' */ \
355
+ EXPECT_CHAR("search attr value", '=') \
356
+ goto XH_PPCAT(loop, _SEARCH_ATTRIBUTE_VALUE);\
357
+ SKIP_BLANK \
358
+ EXPECT_ANY("wrong character") \
359
+ goto INVALID_XML; \
360
+ END(XH_PPCAT(loop, _ATTR_SKIP_BLANK)) \
361
+ goto INVALID_XML; \
362
+ EXPECT_CHAR("end attr name", '=') \
363
+ end = cur - 1; \
364
+ xh_log_trace2("attr name: [%.*s]", end - node, node);\
365
+ \
366
+ XH_PPCAT(loop, _SEARCH_ATTRIBUTE_VALUE): \
367
+ DO(XH_PPCAT(loop, _PARSE_ATTR_VALUE)) /* expects an opening double or single quote */ \
368
+ SEARCH_ATTRIBUTE_VALUE(XH_PPCAT(loop, _1), XH_PPCAT(loop, _SEARCH_ATTRIBUTES_LOOP), '"')\
369
+ SEARCH_ATTRIBUTE_VALUE(XH_PPCAT(loop, _2), XH_PPCAT(loop, _SEARCH_ATTRIBUTES_LOOP), '\'')\
370
+ SKIP_BLANK \
371
+ EXPECT_ANY("wrong character") \
372
+ goto INVALID_XML; \
373
+ END(XH_PPCAT(loop, _PARSE_ATTR_VALUE)) \
374
+ goto INVALID_XML; \
375
+ END(XH_PPCAT(loop, _PARSE_ATTR_NAME)) \
376
+ goto INVALID_XML; \
377
+ END(XH_PPCAT(loop, _SEARCH_ATTR)) \
378
+ goto INVALID_XML;
379
+
380
+ #define PARSE_XML_DECLARATION /* body of the XML declaration: the literal "xml", a blank, then its attributes */ \
381
+ SCAN3(XML_DECL, 'x', 'm', 'l') \
382
+ DO(XML_DECL_ATTR) \
383
+ EXPECT_BLANK("blank") \
384
+ SEARCH_ATTRIBUTES(XML_DECL_ATTR, SEARCH_END_XML_DECLARATION)\
385
+ goto INVALID_XML; \
386
+ EXPECT_ANY("wrong character") \
387
+ goto INVALID_XML; \
388
+ END(XML_DECL_ATTR) \
389
+ goto INVALID_XML; \
390
+ END3(XML_DECL, INVALID_XML) \
391
+ goto INVALID_XML;
392
+
393
+ #define SEARCH_END_XML_DECLARATION /* tag-end cases for the declaration: '?' immediately followed by '>' */ \
394
+ EXPECT_CHAR("end tag", '?') \
395
+ DO(XML_DECL_SEARCH_END_TAG2) \
396
+ EXPECT_CHAR("end tag", '>') \
397
+ goto XML_DECL_FOUND; \
398
+ EXPECT_ANY("wrong character") \
399
+ goto INVALID_XML; \
400
+ END(XML_DECL_SEARCH_END_TAG2) \
401
+ goto INVALID_XML;
402
+
403
+ #define PARSE_COMMENT /* scans a comment after its first '-': expects the second '-', then runs until "-->"; content/end track the trimmed text */ \
404
+ DO(COMMENT1) \
405
+ EXPECT_CHAR("-", '-') \
406
+ content = NULL; \
407
+ DO(END_COMMENT1) \
408
+ SKIP_BLANK \
409
+ EXPECT_CHAR("1st -", '-') \
410
+ if (content == NULL) content = end = cur - 1; \
411
+ DO(END_COMMENT2) \
412
+ EXPECT_CHAR("2nd -", '-') \
413
+ DO(END_COMMENT3) \
414
+ EXPECT_CHAR(">", '>') \
415
+ NEW_COMMENT(content, end - content) \
416
+ goto PARSE_CONTENT; \
417
+ EXPECT_CHAR("2nd -", '-') /* a further '-': keep waiting for '>' */ \
418
+ end = cur - 2; \
419
+ goto END_COMMENT3_START; \
420
+ EXPECT_ANY("any character") \
421
+ end = cur - 1; \
422
+ goto END_COMMENT1_START; \
423
+ END(END_COMMENT3) \
424
+ EXPECT_BLANK("skip blank") \
425
+ end = cur - 1; \
426
+ goto END_COMMENT1_START; \
427
+ EXPECT_ANY("any character") \
428
+ end = cur; \
429
+ goto END_COMMENT1_START; \
430
+ END(END_COMMENT2) \
431
+ EXPECT_ANY("any char") \
432
+ if (content == NULL) content = cur - 1; \
433
+ end = cur; \
434
+ END(END_COMMENT1) \
435
+ goto INVALID_XML; \
436
+ \
437
+ EXPECT_ANY("wrong character") \
438
+ goto INVALID_XML; \
439
+ \
440
+ END(COMMENT1) \
441
+ goto INVALID_XML;
442
+
443
+ #define PARSE_CDATA /* scans "CDATA[" and then the raw section body up to "]]>", stored untrimmed through NEW_CDATA */ \
444
+ SCAN6(CDATA, 'C', 'D', 'A', 'T', 'A', '[') \
445
+ content = end = cur; \
446
+ DO(END_CDATA1) \
447
+ EXPECT_CHAR("1st ]", ']') \
448
+ DO(END_CDATA2) \
449
+ EXPECT_CHAR("2nd ]", ']') \
450
+ DO(END_CDATA3) \
451
+ EXPECT_CHAR(">", '>') \
452
+ end = cur - 3; /* exclude the three terminator bytes */ \
453
+ NEW_CDATA(content, end - content) \
454
+ goto PARSE_CONTENT; \
455
+ EXPECT_CHAR("2nd ]", ']') \
456
+ goto END_CDATA3_START; \
457
+ EXPECT_ANY("any character") \
458
+ goto END_CDATA1_START; \
459
+ END(END_CDATA3) \
460
+ EXPECT_ANY("any character") \
461
+ goto END_CDATA1_START; \
462
+ END(END_CDATA2) \
463
+ ; \
464
+ END(END_CDATA1) \
465
+ goto INVALID_XML; \
466
+ END6(CDATA, INVALID_XML)
467
+
468
+ #define PARSE_CDATA_WITH_TRIM /* like PARSE_CDATA, but leading and trailing blanks are left out of [content, end) */ \
469
+ SCAN6(CDATA_WITH_TRIM, 'C', 'D', 'A', 'T', 'A', '[') \
470
+ content = NULL; \
471
+ DO(END_CDATA_WITH_TRIM1) \
472
+ SKIP_BLANK \
473
+ EXPECT_CHAR("1st ]", ']') \
474
+ if (content == NULL) content = end = cur - 1; \
475
+ DO(END_CDATA_WITH_TRIM2) \
476
+ EXPECT_CHAR("2nd ]", ']') \
477
+ DO(END_CDATA_WITH_TRIM3) \
478
+ EXPECT_CHAR(">", '>') \
479
+ NEW_CDATA(content, end - content) \
480
+ goto PARSE_CONTENT; \
481
+ EXPECT_CHAR("2nd ]", ']') \
482
+ end = cur - 2; \
483
+ goto END_CDATA_WITH_TRIM3_START; \
484
+ EXPECT_ANY("any character") \
485
+ end = cur - 1; \
486
+ goto END_CDATA_WITH_TRIM1_START; \
487
+ END(END_CDATA_WITH_TRIM3) \
488
+ EXPECT_BLANK("skip blank") \
489
+ end = cur - 1; \
490
+ goto END_CDATA_WITH_TRIM1_START; \
491
+ EXPECT_ANY("any character") \
492
+ end = cur; \
493
+ goto END_CDATA_WITH_TRIM1_START; \
494
+ END(END_CDATA_WITH_TRIM2) \
495
+ EXPECT_ANY("any char") \
496
+ if (content == NULL) content = cur - 1; /* first non-blank byte starts the content */ \
497
+ end = cur; \
498
+ END(END_CDATA_WITH_TRIM1) \
499
+ goto INVALID_XML; \
500
+ END6(CDATA_WITH_TRIM, INVALID_XML)
501
+
502
/*
 * NORMALIZE_REFERENCE(loop) - decode one XML reference whose leading '&' has
 * already been consumed and append the result to the output at enc_cur.
 *
 * Supported forms: decimal (&#NNN;) and hexadecimal (&#xHHH;) character
 * references and the predefined entities amp, apos, lt, gt and quot.
 * Character references are written as UTF-8.
 *
 * Expects cur, eof, c, code, bits and enc_cur in scope, and jumps to
 * INVALID_REF on a malformed or out-of-range reference.
 *
 * Changes compared with the previous version:
 *   - the accumulated code point is range-checked after every digit, so an
 *     over-long number can no longer wrap around the unsigned accumulator
 *     and be accepted as a small, valid code point;
 *   - a character that is neither a digit nor ';' inside a numeric reference
 *     is rejected instead of being silently skipped;
 *   - the trace format matches the unsigned int accumulator.
 */
#define NORMALIZE_REFERENCE(loop) \
    _DO(XH_PPCAT(loop, _REFERENCE)) \
        EXPECT_CHAR("char reference", '#') \
            _DO(XH_PPCAT(loop, _CHAR_REFERENCE)) \
                EXPECT_CHAR("hex", 'x') \
                    code = 0; \
                    _DO(XH_PPCAT(loop, _HEX_CHAR_REFERENCE_LOOP)) \
                        EXPECT_DIGIT("hex digit") \
                            code = code * 16 + (c - '0'); \
                            /* checked per digit: code never exceeds 0x10FFFF * 16 + 15 */ \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_HEX_CHAR_LC("hex a-f") \
                            code = code * 16 + (c - 'a') + 10; \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_HEX_CHAR_UC("hex A-F") \
                            code = code * 16 + (c - 'A') + 10; \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_CHAR("reference end", ';') \
                            goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
                        EXPECT_ANY("wrong character") \
                            goto INVALID_REF; \
                    END(XH_PPCAT(loop, _HEX_CHAR_REFERENCE_LOOP)) \
                    goto INVALID_REF; \
                EXPECT_DIGIT("digit") \
                    code = (c - '0'); \
                    _DO(XH_PPCAT(loop, _CHAR_REFERENCE_LOOP)) \
                        EXPECT_DIGIT("digit") \
                            code = code * 10 + (c - '0'); \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_CHAR("reference end", ';') \
                            goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
                        EXPECT_ANY("wrong character") \
                            goto INVALID_REF; \
                    END(XH_PPCAT(loop, _CHAR_REFERENCE_LOOP)) \
                    goto INVALID_REF; \
                EXPECT_ANY("any char") \
                    goto INVALID_REF; \
            END(XH_PPCAT(loop, _CHAR_REFERENCE)) \
            goto INVALID_REF; \
        EXPECT_CHAR("amp or apos", 'a') \
            if (xh_str_equal3(cur, 'm', 'p', ';')) { \
                code = '&'; \
                cur += 3; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            if (xh_str_equal4(cur, 'p', 'o', 's', ';')) { \
                code = '\''; \
                cur += 4; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_CHAR("lt", 'l') \
            if (xh_str_equal2(cur, 't', ';')) { \
                code = '<'; \
                cur += 2; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_CHAR("gt", 'g') \
            if (xh_str_equal2(cur, 't', ';')) { \
                code = '>'; \
                cur += 2; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_CHAR("quot", 'q') \
            if (xh_str_equal4(cur, 'u', 'o', 't', ';')) { \
                code = '"'; \
                cur += 4; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_ANY("any char") \
            goto INVALID_REF; \
    END(XH_PPCAT(loop, _REFERENCE)) \
    goto INVALID_REF; \
XH_PPCAT(loop, _REFEFENCE_VALUE): \
    xh_log_trace1("parse reference value: %u", code); \
    if (code == 0 || code > 0x10FFFF) goto INVALID_REF; \
    if (code >= 0x80) { \
        /* lead byte first, then one continuation byte per remaining 6 bits */ \
        if (code < 0x800) { \
            *enc_cur++ = (code >> 6) | 0xC0; bits = 0; \
        } \
        else if (code < 0x10000) { \
            *enc_cur++ = (code >> 12) | 0xE0; bits = 6; \
        } \
        else if (code < 0x110000) { \
            *enc_cur++ = (code >> 18) | 0xF0; bits = 12; \
        } \
        else { \
            goto INVALID_REF; \
        } \
        for (; bits >= 0; bits-= 6) { \
            *enc_cur++ = ((code >> bits) & 0x3F) | 0x80; \
        } \
    } \
    else { \
        *enc_cur++ = (xh_char_t) code; \
    }
596
+
597
+ #define NORMALIZE_LINE_FEED(loop) /* a CR was just consumed: swallow a directly following LF, then emit a single '\n' */ \
598
+ _DO(XH_PPCAT(loop, _NORMALIZE_LINE_FEED)) \
599
+ EXPECT_CHAR("LF", '\n') \
600
+ goto XH_PPCAT(loop, _NORMALIZE_LINE_FEED_END); \
601
+ EXPECT_ANY("any char") \
602
+ cur--; /* not an LF: put the byte back for the caller's loop */ \
603
+ goto XH_PPCAT(loop, _NORMALIZE_LINE_FEED_END); \
604
+ END(XH_PPCAT(loop, _NORMALIZE_LINE_FEED)) \
605
+ XH_PPCAT(loop, _NORMALIZE_LINE_FEED_END): \
606
+ *enc_cur++ = '\n';
607
+
608
+ #define NORMALIZE_TEXT(loop, s, l) /* decode references and CR / CR LF in (s, l) into ctx->tmp; result is left in enc / enc_len */ \
609
+ enc_len = l; \
610
+ if (enc_len) { \
611
+ old_cur = cur; /* cur and eof are redirected to the text and restored afterwards */ \
612
+ old_eof = eof; \
613
+ cur = s; \
614
+ eof = cur + enc_len; \
615
+ if (ctx->tmp == NULL) { \
616
+ xh_log_trace1("malloc() %lu", enc_len); \
617
+ if ((ctx->tmp = malloc(enc_len)) == NULL) goto MALLOC; \
618
+ ctx->tmp_size = enc_len; \
619
+ } \
620
+ else if (enc_len > ctx->tmp_size) { /* grow the scratch buffer; ctx->tmp is only replaced on success */ \
621
+ xh_log_trace1("realloc() %lu", enc_len); \
622
+ if ((enc = realloc(ctx->tmp, enc_len)) == NULL) goto MALLOC;\
623
+ ctx->tmp = enc; \
624
+ ctx->tmp_size = enc_len; \
625
+ } \
626
+ enc = enc_cur = ctx->tmp; \
627
+ memcpy(enc, cur, enc_len); /* NOTE(review): the loop below rewrites the buffer from its start, so this copy looks redundant */ \
628
+ _DO(XH_PPCAT(loop, _NORMALIZE_TEXT)) \
629
+ EXPECT_CHAR("reference", '&') \
630
+ NORMALIZE_REFERENCE(loop) \
631
+ break; \
632
+ EXPECT_CHAR("CR", '\r') \
633
+ NORMALIZE_LINE_FEED(loop) \
634
+ break; \
635
+ EXPECT_ANY("any char") \
636
+ *enc_cur++ = c; \
637
+ END(XH_PPCAT(loop, _NORMALIZE_TEXT)) \
638
+ enc_len = enc_cur - enc; /* length of the normalized text */ \
639
+ cur = old_cur; \
640
+ eof = old_eof; \
641
+ } \
642
+ else { \
643
+ enc = s; /* empty input: nothing to normalize */ \
644
+ }
645
+
646
+ XH_INLINE void
647
+ xh_x2h_xpath_update(xh_char_t *xpath, xh_char_t *name, size_t name_len)
648
+ {
649
+ size_t len;
650
+
651
+ len = xh_strlen(xpath);
652
+ if (name != NULL) {
653
+ if ((len + name_len + 1) > XH_X2H_XPATH_MAX_LEN)
654
+ rb_raise(xh_parse_error_class, "XPath too long");
655
+
656
+ xpath[len++] = '/';
657
+ for (;name_len--;) xpath[len++] = *name++;
658
+ }
659
+ else if (len == 0) {
660
+ rb_raise(xh_parse_error_class, "Can't update xpath, something wrong!");
661
+ }
662
+ else {
663
+ for (;--len && xpath[len] != '/';) {/* void */}
664
+ }
665
+ xpath[len] = '\0';
666
+
667
+ xh_log_trace1("xpath: [%s]", xpath);
668
+ }
669
+
670
+ XH_INLINE xh_bool_t
671
+ xh_x2h_match_node(xh_char_t *name, size_t name_len, VALUE expr)
672
+ {
673
+ size_t i, l;
674
+ VALUE ary;
675
+ VALUE str;
676
+ xh_char_t *expr_str;
677
+ size_t expr_len;
678
+
679
+ xh_log_trace2("match node: [%.*s]", name_len, name);
680
+
681
+ str = _NEW_STRING(name, name_len, TRUE);
682
+
683
+ if ( RB_TYPE_P(expr, RUBY_T_REGEXP) ) {
684
+ if (rb_reg_search(expr, str, 0, 0) >= 0) return TRUE;
685
+ }
686
+ else if ( RB_TYPE_P(expr, RUBY_T_ARRAY) ) {
687
+ l = RARRAY_LEN(expr);
688
+ ary = expr;
689
+ for (i = 0; i < l; i++) {
690
+ expr = RARRAY_AREF(ary, i);
691
+ if ( RB_TYPE_P(expr, RUBY_T_REGEXP) ) {
692
+ if (rb_reg_search(expr, str, 0, 0) >= 0) return TRUE;
693
+ }
694
+ else {
695
+ expr_str = XH_CHAR_CAST RSTRING_PTR(expr);
696
+ expr_len = RSTRING_LEN(expr);
697
+ if (name_len == expr_len && !xh_strncmp(name, expr_str, name_len)) {
698
+ return TRUE;
699
+ }
700
+ }
701
+ }
702
+ } else {
703
+ xh_log_trace0("match string");
704
+ expr_str = XH_CHAR_CAST RSTRING_PTR(expr);
705
+ expr_len = RSTRING_LEN(expr);
706
+ xh_log_trace2("expr: [%.*s]", expr_len, expr_str);
707
+ if (name_len == expr_len && !xh_strncmp(name, expr_str, name_len)) {
708
+ xh_log_trace0("match TRUE");
709
+ return TRUE;
710
+ }
711
+ }
712
+
713
+ return FALSE;
714
+ }
715
+
716
+ static void /* resumable parser step: consumes one chunk of input and updates the result reachable from ctx */
717
+ xh_x2h_parse_chunk(xh_x2h_ctx_t *ctx, xh_char_t **buf, size_t *bytesleft, xh_bool_t terminate) /* parses [*buf, *buf + *bytesleft); on return both are updated to the unconsumed rest */
718
+ {
719
+ xh_char_t c, *cur, *node, *end, *content, *eof, *enc,
720
+ *enc_cur, *old_cur, *old_eof, *content_key;
721
+ unsigned int depth, real_depth, code, flags;
722
+ int bits;
723
+ VALUE *lval, val;
724
+ xh_x2h_node_t *nodes;
725
+ VALUE av;
726
+ size_t enc_len, content_key_len;
727
+
728
+ cur = *buf;
729
+ eof = cur + *bytesleft;
730
+ nodes = ctx->nodes; /* from here: restore the parser state saved at CHUNK_FINISH by a previous call */
731
+ depth = ctx->depth;
732
+ real_depth = ctx->real_depth;
733
+ flags = ctx->flags;
734
+ node = ctx->node;
735
+ end = ctx->end;
736
+ content = ctx->content;
737
+ code = ctx->code;
738
+ lval = ctx->lval;
739
+ enc = enc_cur = old_eof = old_cur = NULL;
740
+ c = '\0';
741
+
742
+ if (ctx->opts.content[0] == '\0') { /* no content key configured: fall back to DEF_CONTENT_KEY */
743
+ content_key = (xh_char_t *) DEF_CONTENT_KEY;
744
+ content_key_len = sizeof(DEF_CONTENT_KEY) - 1;
745
+ }
746
+ else {
747
+ content_key = ctx->opts.content;
748
+ content_key_len = xh_strlen(ctx->opts.content);
749
+ }
750
+
751
+ #define XH_X2H_PROCESS_STATE(st) case st: goto st; /* presumably expanded once per entry of XH_X2H_PARSER_STATE_LIST - TODO confirm in the header */
752
+ switch (ctx->state) { /* resume at the loop where the previous chunk stopped */
753
+ case PARSER_ST_NONE: break;
754
+ XH_X2H_PARSER_STATE_LIST
755
+ case XML_DECL_FOUND: break;
756
+ case PARSER_ST_DONE: goto DONE;
757
+ }
758
+ #undef XH_X2H_PROCESS_STATE
759
+
760
+ PARSE_CONTENT: /* between tags: collect character data until the next '<' */
761
+ content = NULL;
762
+ flags &= ~(XH_X2H_NEED_NORMALIZE | XH_X2H_IS_NOT_BLANK);
763
+ DO(CONTENT)
764
+ EXPECT_CHAR("new element", '<')
765
+ if (content != NULL) { /* flush pending text before handling the tag */
766
+ if (flags & XH_X2H_IS_NOT_BLANK) {
767
+ if (flags & XH_X2H_NEED_NORMALIZE) {
768
+ NORMALIZE_TEXT(TEXT1, content, end - content)
769
+ NEW_TEXT(enc, enc_len)
770
+ }
771
+ else {
772
+ NEW_TEXT(content, end - content)
773
+ }
774
+ }
775
+ content = NULL;
776
+ }
777
+ DO(PARSE_ELEMENT)
778
+ EXPECT_CHAR("xml declaration", '?')
779
+ if (real_depth != 0) goto INVALID_XML; /* a declaration is only accepted outside any element */
780
+ #undef NEW_ATTRIBUTE
781
+ #define NEW_ATTRIBUTE(k, kl, v, vl) NEW_XML_DECL_ATTRIBUTE(k, kl, v, vl)
782
+ #undef SEARCH_ATTRIBUTE_VALUE
783
+ #define SEARCH_ATTRIBUTE_VALUE(loop, top_loop, quot) SEARCH_XML_DECL_ATTRIBUTE_VALUE(loop, top_loop, quot)
784
+ PARSE_XML_DECLARATION
785
+ #undef NEW_ATTRIBUTE
786
+ #define NEW_ATTRIBUTE(k, kl, v, vl) NEW_NODE_ATTRIBUTE(k, kl, v, vl)
787
+ #undef SEARCH_ATTRIBUTE_VALUE
788
+ #define SEARCH_ATTRIBUTE_VALUE(loop, top_loop, quot) SEARCH_NODE_ATTRIBUTE_VALUE(loop, top_loop, quot)
789
+ EXPECT_CHAR("comment", '!')
790
+ DO(XML_COMMENT_NODE_OR_CDATA)
791
+ EXPECT_CHAR("comment", '-')
792
+ PARSE_COMMENT
793
+ EXPECT_CHAR("cdata", '[')
794
+ if (ctx->opts.trim) {
795
+ PARSE_CDATA_WITH_TRIM
796
+ ;
797
+ }
798
+ else {
799
+ PARSE_CDATA
800
+ ;
801
+ }
802
+ EXPECT_ANY("wrong character")
803
+ goto INVALID_XML;
804
+ END(XML_COMMENT_NODE_OR_CDATA)
805
+ goto INVALID_XML;
806
+ EXPECT_CHAR("closing tag", '/')
807
+ //node = cur;
808
+ DO(PARSE_CLOSING_TAG) /* the closing tag's name is skipped; it is not compared with the opening tag's name */
809
+ EXPECT_CHAR("end tag name", '>')
810
+ CLOSE_TAG
811
+ goto PARSE_CONTENT;
812
+ EXPECT_BLANK("end tag name")
813
+ DO(SEARCH_CLOSING_END_TAG)
814
+ EXPECT_CHAR("end tag", '>')
815
+ CLOSE_TAG
816
+ goto PARSE_CONTENT;
817
+ SKIP_BLANK
818
+ EXPECT_ANY("wrong character")
819
+ goto INVALID_XML;
820
+ END(SEARCH_CLOSING_END_TAG)
821
+ goto INVALID_XML;
822
+ END(PARSE_CLOSING_TAG)
823
+ goto INVALID_XML;
824
+ EXPECT_ANY("opening tag")
825
+ node = cur - 1; /* first byte of the tag name */
826
+ DO(PARSE_OPENING_TAG)
827
+ EXPECT_CHAR("end tag", '>')
828
+ OPEN_TAG(node, cur - node - 1)
829
+ goto PARSE_CONTENT;
830
+ EXPECT_CHAR("self closing tag", '/')
831
+ OPEN_TAG(node, cur - node - 1)
832
+ CLOSE_TAG
833
+
834
+ DO(SEARCH_OPENING_END_TAG)
835
+ EXPECT_CHAR("end tag", '>')
836
+ goto PARSE_CONTENT;
837
+ EXPECT_ANY("wrong character")
838
+ goto INVALID_XML;
839
+ END(SEARCH_OPENING_END_TAG)
840
+ goto INVALID_XML;
841
+ EXPECT_BLANK("end tag name")
842
+ OPEN_TAG(node, cur - node - 1)
843
+
844
+ SEARCH_ATTRIBUTES(NODE, SEARCH_END_TAG)
845
+
846
+ goto PARSE_CONTENT;
847
+ END(PARSE_OPENING_TAG);
848
+ goto INVALID_XML;
849
+ END(PARSE_ELEMENT)
850
+
851
+ EXPECT_CHAR("wrong symbol", '>')
852
+ goto INVALID_XML;
853
+ EXPECT_BLANK_WO_CR("blank")
854
+ if (!ctx->opts.trim)
855
+ goto START_CONTENT;
856
+ break;
857
+ EXPECT_CHAR("CR", '\r')
858
+ if (content != NULL) {
859
+ flags |= XH_X2H_NORMALIZE_LINE_FEED;
860
+ }
861
+ if (!ctx->opts.trim)
862
+ goto START_CONTENT;
863
+ break;
864
+ EXPECT_CHAR("reference", '&')
865
+ flags |= XH_X2H_NORMALIZE_REF; /* no break: falls through into the "any char" case below */
866
+ EXPECT_ANY("any char")
867
+ flags |= XH_X2H_IS_NOT_BLANK;
868
+ START_CONTENT:
869
+ if (content == NULL) content = cur - 1;
870
+ end = cur;
871
+ END(CONTENT)
872
+
873
+ if (content != NULL) { /* end of input reached: flush any pending text */
874
+ if (flags & XH_X2H_IS_NOT_BLANK) {
875
+ if (flags & XH_X2H_NEED_NORMALIZE) {
876
+ NORMALIZE_TEXT(TEXT2, content, end - content)
877
+ NEW_TEXT(enc, enc_len)
878
+ }
879
+ else {
880
+ NEW_TEXT(content, end - content)
881
+ }
882
+ }
883
+ content = NULL;
884
+ }
885
+
886
+ if (real_depth != 0 || !(flags & XH_X2H_ROOT_FOUND)) goto INVALID_XML; /* unclosed elements or no root element at all */
887
+
888
+ ctx->state = PARSER_ST_DONE;
889
+ *bytesleft = eof - cur;
890
+ *buf = cur;
891
+ return;
892
+
893
+ XML_DECL_FOUND: /* return right after the XML declaration; xh_x2h_parse checks for this state */
894
+ ctx->state = XML_DECL_FOUND;
895
+ CHUNK_FINISH: /* save the parser state so that the next call can resume */
896
+ ctx->content = content;
897
+ ctx->node = node;
898
+ ctx->end = end;
899
+ ctx->depth = depth;
900
+ ctx->real_depth = real_depth;
901
+ ctx->flags = flags;
902
+ ctx->code = code;
903
+ ctx->lval = lval;
904
+ *bytesleft = eof - cur;
905
+ *buf = cur;
906
+ return;
907
+
908
+ MAX_DEPTH_EXCEEDED: /* error exits: each label raises a Ruby exception */
909
+ rb_raise(xh_parse_error_class, "Maximum depth exceeded");
910
+ INVALID_XML:
911
+ rb_raise(xh_parse_error_class, "Invalid XML");
912
+ INVALID_REF:
913
+ rb_raise(xh_parse_error_class, "Invalid reference");
914
+ MALLOC:
915
+ rb_raise(rb_eNoMemError, "Memory allocation error");
916
+ DONE: /* called again after the parser already reached PARSER_ST_DONE */
917
+ rb_raise(xh_parse_error_class, "Parsing is done");
918
+ }
919
+
920
+ static void
921
+ xh_x2h_parse(xh_x2h_ctx_t *ctx, xh_reader_t *reader)
922
+ {
923
+ xh_char_t *buf, *preserve;
924
+ size_t len, off;
925
+ xh_bool_t eof;
926
+
927
+ do {
928
+ preserve = ctx->node != NULL ? ctx->node : ctx->content;
929
+
930
+ len = reader->read(reader, &buf, preserve, &off);
931
+ eof = (len == 0);
932
+ if (off) {
933
+ if (ctx->node != NULL) ctx->node -= off;
934
+ if (ctx->content != NULL) ctx->content -= off;
935
+ if (ctx->end != NULL) ctx->end -= off;
936
+ }
937
+
938
+ xh_log_trace2("read buf: %.*s", len, buf);
939
+
940
+ do {
941
+ xh_log_trace2("parse buf: %.*s", len, buf);
942
+
943
+ xh_x2h_parse_chunk(ctx, &buf, &len, eof);
944
+
945
+ if (ctx->state == XML_DECL_FOUND && ctx->opts.encoding[0] == '\0' && ctx->encoding[0] != '\0') {
946
+ reader->switch_encoding(reader, ctx->encoding, &buf, &len);
947
+ }
948
+ } while (len > 0);
949
+ } while (!eof);
950
+
951
+ if (ctx->state != PARSER_ST_DONE)
952
+ rb_raise(xh_parse_error_class, "Invalid XML");
953
+ }
954
+
955
+ static VALUE
956
+ xh_x2h_exec(VALUE arg)
957
+ {
958
+ xh_x2h_ctx_t *ctx = (xh_x2h_ctx_t *) arg;
959
+
960
+ if (ctx->opts.filter.enable) {
961
+ ctx->flags |= XH_X2H_FILTER_ENABLED;
962
+ if (!ctx->opts.block_given)
963
+ ctx->result = rb_ary_new();
964
+ }
965
+ else {
966
+ ctx->result = hash_new();
967
+ ctx->nodes[0].lval = ctx->lval = &ctx->result;
968
+ }
969
+
970
+ xh_reader_init(&ctx->reader, ctx->input, ctx->opts.encoding, ctx->opts.buf_size);
971
+
972
+ xh_x2h_parse(ctx, &ctx->reader);
973
+
974
+ return Qnil;
975
+ }
976
+
977
+ VALUE
978
+ xh_x2h(xh_x2h_ctx_t *ctx)
979
+ {
980
+ VALUE result;
981
+ int state;
982
+
983
+ result = rb_protect(xh_x2h_exec, (VALUE) ctx, &state);
984
+
985
+ if (state) {
986
+ xh_reader_destroy(&ctx->reader);
987
+ rb_exc_raise(rb_errinfo());
988
+ }
989
+
990
+ xh_reader_destroy(&ctx->reader);
991
+
992
+ result = ctx->result;
993
+ if (ctx->opts.filter.enable) {
994
+ if (ctx->opts.block_given)
995
+ result = Qnil;
996
+ }
997
+ else if (!ctx->opts.keep_root) {
998
+ result = hash_first_value(result);
999
+ }
1000
+
1001
+ return result;
1002
+ }