fast-xml 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1002 @@
1
+ #include "xh_config.h"
2
+ #include "xh_core.h"
3
+ #include "xh_ruby_hash.h"
4
+
5
/* Hash key used for element text when ctx->opts.content is empty. */
static const char DEF_CONTENT_KEY[] = "content";

/*
 * Creates a Ruby String from the first `l` bytes at `s`.
 * A true `utf8` selects rb_utf8_str_new(); otherwise rb_str_new() is used.
 */
#define _NEW_STRING(s, l, utf8) \
    ((utf8) \
        ? rb_utf8_str_new((const char *) (s), (l)) \
        : rb_str_new((const char *) (s), (l)))

/* _NEW_STRING() driven by the utf8 option of the `ctx` visible at the call site. */
#define NEW_STRING(s, l) \
    _NEW_STRING(s, l, ctx->opts.utf8)

/*
 * Appends `l` bytes at `s` to the Ruby String `v`.
 * The expansion carries its own ';', so call sites must not add one.
 */
#define CAT_STRING(v, s, l) \
    rb_str_cat((v), (const char *) (s), (l));
15
+
16
/*
 * Stores the text (s, l) in the slot *lv, whose current value is v.
 *   - v is nil/false: the slot receives a new String;
 *   - v is an Array:  a new String is appended to it;
 *   - anything else:  v is moved into a fresh Array placed in the slot
 *                     and the new String is appended after it.
 * In both Array cases `lv` is re-pointed at the element just appended.
 * `lv` and `v` must be lvalues; `av` and `ctx` must be in scope.
 */
#define SAVE_VALUE(lv, v , s, l) \
    xh_log_trace2("save value: [%.*s]", l, s); \
    if ( !RTEST(v) ) { \
        xh_log_trace0("set string"); \
        *(lv) = NEW_STRING(s, l); \
    } \
    else { \
        xh_log_trace0("add to array"); \
        if ( !RB_TYPE_P(v, RUBY_T_ARRAY) ) { \
            /* wrap the existing value in a new array stored in the slot */ \
            av = rb_ary_new(); \
            *(lv) = av; \
            rb_ary_push(av, v); \
            (v) = *(lv); \
        } \
        else { \
            /* the slot already refers to an array */ \
            av = v; \
        } \
        /* append the new value and track the appended element */ \
        rb_ary_push(av, NEW_STRING((s), (l))); \
        (lv) = (VALUE *) &RARRAY_CONST_PTR(av)[RARRAY_LEN(av) - 1]; \
    }
39
+
40
/*
 * Descends into the entry named (s, l) below the value at *lval.
 * Text already collected at *lval (a String or an Array) is first moved
 * into a new hash under the content key; an empty String is dropped.
 * The entry is then fetched (or created), seeded with an empty string via
 * SAVE_VALUE, and pushed on the node stack.  Jumps to MAX_DEPTH_EXCEEDED
 * when the configured depth limit is reached.  `s` is reset to NULL.
 */
#define _OPEN_TAG(s, l) \
    val = *lval; \
    if ( RB_TYPE_P(val, RUBY_T_STRING) || RB_TYPE_P(val, RUBY_T_ARRAY) ) { \
        *lval = hash_new(); \
        /* arrays are always kept; strings only when non-empty */ \
        if ( RB_TYPE_P(val, RUBY_T_ARRAY) || RSTRING_LEN(val) ) { \
            (void) hash_store(*lval, (const char *) content_key, content_key_len, val); \
        } \
        val = *lval; \
    } \
    /* fetch the existing entry or create an empty one */ \
    lval = hash_fetch(val, (const char *) s, l, Qnil); \
    val = *lval; \
    xh_log_trace0("save as empty string"); \
    SAVE_VALUE(lval, val, "", 0) \
    if (++depth >= ctx->opts.max_depth) goto MAX_DEPTH_EXCEEDED; \
    nodes[depth].lval = lval; \
    nodes[depth].flags = XH_X2H_NODE_FLAG_NONE; \
    /* remember that this node must be wrapped in an array when it closes */ \
    if (depth > 1 \
        && ctx->opts.force_array.enable \
        && !RB_TYPE_P(val, RUBY_T_ARRAY) \
        && (ctx->opts.force_array.always \
            || xh_x2h_match_node(s, l, ctx->opts.force_array.expr)) \
    ) { \
        nodes[depth].flags |= XH_X2H_NODE_FLAG_FORCE_ARRAY; \
    } \
    (s) = NULL;

/*
 * Handles an opening tag named (s, l).
 * Only one root element is accepted: a second tag at real_depth 0 jumps to
 * INVALID_XML.  While a filter is still searching, the running xpath is
 * extended and tested against the filter expression; on a match a fresh
 * result hash becomes the root of the node stack.  The tag is stored only
 * when XH_X2H_FILTER_SEARCH(flags) is false, which is re-evaluated after a
 * possible match.
 */
#define OPEN_TAG(s, l) \
    xh_log_trace2("new tag: [%.*s]", l, s); \
    if (real_depth == 0) { \
        if (flags & XH_X2H_ROOT_FOUND) goto INVALID_XML; \
        flags |= XH_X2H_ROOT_FOUND; \
    } \
    if (XH_X2H_FILTER_SEARCH(flags)) { \
        xh_x2h_xpath_update(ctx->xpath, s, l); \
        if (xh_x2h_match_node(ctx->xpath, xh_strlen(ctx->xpath), ctx->opts.filter.expr)) { \
            xh_log_trace2("match node: [%.*s]", l, s); \
            ctx->hash = hash_new(); \
            lval = &ctx->hash; \
            nodes[0].lval = lval; \
            depth = 0; \
            flags |= XH_X2H_FILTER_MATCHED; \
        } \
    } \
    if (!XH_X2H_FILTER_SEARCH(flags)) { \
        _OPEN_TAG(s, l) \
    } \
    real_depth++;
86
+
87
/*
 * Finalizes the node on top of the stack and pops it.
 * With the force_content option a plain String value is wrapped in a hash
 * under the content key.  A node flagged FORCE_ARRAY whose value is not yet
 * an Array is wrapped in a new one-element Array.  Afterwards `lval` points
 * at the parent's slot.
 */
#define _CLOSE_TAG \
    val = *nodes[depth].lval; \
    if (ctx->opts.force_content && RB_TYPE_P(val, RUBY_T_STRING)) { \
        lval = nodes[depth].lval; \
        *lval = hash_new(); \
        (void) hash_store(*lval, (const char *) content_key, content_key_len, val); \
        val = *lval; \
    } \
    if ((nodes[depth].flags & XH_X2H_NODE_FLAG_FORCE_ARRAY) \
        && !RB_TYPE_P(val, RUBY_T_ARRAY) \
    ) { \
        lval = nodes[depth].lval; \
        av = rb_ary_new(); \
        *lval = av; \
        rb_ary_push(av, val); \
    } \
    lval = nodes[--depth].lval;

/*
 * Handles a closing (or self-closing) tag.
 * A close at real_depth 0 jumps to INVALID_XML.  When a filtered subtree is
 * complete (filter matched and depth back at 0) its value is yielded to the
 * block, or appended to ctx->result when no block was given, and the
 * matched flag is toggled off.  While the filter is enabled and not inside
 * a match, the last xpath component is removed.
 */
#define CLOSE_TAG \
    xh_log_trace0("close tag"); \
    if (real_depth == 0) goto INVALID_XML; \
    if (!XH_X2H_FILTER_SEARCH(flags)) { \
        _CLOSE_TAG \
    } \
    if ((flags & XH_X2H_FILTER_MATCHED) && depth == 0) { \
        xh_log_trace0("match node finished"); \
        val = *nodes[0].lval; \
        if (!ctx->opts.block_given) { \
            rb_ary_push(ctx->result, val); \
        } \
        else { \
            rb_yield(val); \
        } \
        flags ^= XH_X2H_FILTER_MATCHED; \
    } \
    if ((flags & (XH_X2H_FILTER_ENABLED | XH_X2H_FILTER_MATCHED)) == XH_X2H_FILTER_ENABLED) { \
        xh_x2h_xpath_update(ctx->xpath, NULL, 0); \
    } \
    real_depth--;
126
+
127
/*
 * Stores an element attribute as if it were a child node: open a node named
 * (k, kl), add (v, vl) as its text, close it.  Skipped while a filter is
 * still searching for a matching node.
 */
#define NEW_NODE_ATTRIBUTE(k, kl, v, vl) \
    if (!XH_X2H_FILTER_SEARCH(flags)) { \
        _OPEN_TAG(k, kl) \
        _NEW_TEXT(v, vl) \
        _CLOSE_TAG \
    }

/*
 * Handles one attribute of the <?xml ... ?> declaration.  Only "encoding"
 * is used: its value is copied into ctx->encoding (bounded by XH_PARAM_LEN).
 * Both `k` and `v` are reset to NULL afterwards.
 */
#define NEW_XML_DECL_ATTRIBUTE(k, kl, v, vl) \
    xh_log_trace4("new xml decl attr name: [%.*s] value: [%.*s]", kl, k, vl, v); \
    if ((kl) == (sizeof("encoding") - 1) \
        && xh_strncmp((k), XH_CHAR_CAST "encoding", sizeof("encoding") - 1) == 0 \
    ) { \
        xh_str_range_copy(ctx->encoding, XH_CHAR_CAST (v), vl, XH_PARAM_LEN); \
    } \
    (v) = NULL; \
    (k) = NULL;

/* Attribute handler currently in effect; element attributes by default. */
#define NEW_ATTRIBUTE(k, kl, v, vl) NEW_NODE_ATTRIBUTE(k, kl, v, vl)
144
+
145
/*
 * Adds the text (s, l) to the value at *lval.
 *   - String, non-empty, merge_text off: old and new text go into a new Array;
 *   - String otherwise:                  the text is concatenated onto it;
 *   - Array:                             a new String is appended;
 *   - anything else:                     the text is saved under the content
 *                                        key of that value via SAVE_VALUE,
 *                                        then `lval` is restored to the
 *                                        current node's slot.
 */
#define _NEW_TEXT(s, l) \
    val = *lval; \
    if ( RB_TYPE_P(val, RUBY_T_STRING) ) { \
        if ( RSTRING_LEN(val) && !ctx->opts.merge_text ) { \
            xh_log_trace0("create a new array"); \
            xh_log_trace1("create a new array val: %s", StringValueCStr(val)); \
            /* content already exists: keep both pieces in an array */ \
            av = rb_ary_new(); \
            *lval = av; \
            rb_ary_push(av, val); \
            rb_ary_push(av, NEW_STRING(s, l)); \
        } \
        else { \
            xh_log_trace0("concat"); \
            /* concatenate with the previous string */ \
            CAT_STRING(val, s, l) \
        } \
    } \
    else { \
        xh_log_trace0("add to array"); \
        if ( RB_TYPE_P(val, RUBY_T_ARRAY) ) { \
            av = val; \
            rb_ary_push(av, NEW_STRING(s, l)); \
        } \
        else { \
            xh_log_trace0("save to hash"); \
            lval = hash_fetch(val, (const char *) content_key, content_key_len, Qnil); \
            val = *lval; \
            SAVE_VALUE(lval, val, s, l) \
            lval = nodes[depth].lval; \
        } \
    }

/*
 * Handles character data.  Text outside the root element jumps to
 * INVALID_XML; text is ignored while a filter is still searching.
 */
#define NEW_TEXT(s, l) \
    xh_log_trace2("new text: [%.*s]", l, s); \
    if (real_depth == 0) goto INVALID_XML; \
    if (!XH_X2H_FILTER_SEARCH(flags)) { \
        _NEW_TEXT(s, l) \
    }

/* Comments are discarded; only the start pointer is reset. */
#define NEW_COMMENT(s, l) (s) = NULL;

/* CDATA sections are stored exactly like ordinary text. */
#define NEW_CDATA(s, l) NEW_TEXT(s, l)
188
+
189
/*
 * End-of-input test for loops that can be suspended between chunks.
 * Input ends at `eof` or at a NUL byte.  When `terminate` is set the loop's
 * _FINISH label is taken; otherwise the loop's _START state is recorded in
 * ctx->state and control leaves through CHUNK_FINISH so that parsing can
 * resume there with the next chunk.
 */
#define CHECK_EOF_WITH_CHUNK(loop) \
    if (cur >= eof || *cur == '\0') { \
        eof = cur; \
        if (terminate) goto XH_PPCAT(loop, _FINISH); \
        ctx->state = XH_PPCAT(loop, _START); \
        goto CHUNK_FINISH; \
    }

/* End-of-input test for loops that never suspend: always go to _FINISH. */
#define CHECK_EOF_WITHOUT_CHUNK(loop) \
    if (cur >= eof || *cur == '\0') goto XH_PPCAT(loop, _FINISH);

/* Default end-of-input policy used by DO(). */
#define CHECK_EOF(loop) CHECK_EOF_WITH_CHUNK(loop)

/*
 * Opens a resumable scanning loop: defines the <loop>_START label, checks
 * for end of input, reads the next byte into `c` and opens a switch on it.
 * Must be paired with END(loop).
 */
#define DO(loop) \
XH_PPCAT(loop, _START): \
    CHECK_EOF(loop) \
    c = *cur++; \
    xh_log_trace3("'%c'=[0x%X] %s start", c, c, XH_STRINGIZE(loop)); \
    switch (c) {

/* Same as DO(), but uses the non-suspending end-of-input test. */
#define _DO(loop) \
XH_PPCAT(loop, _START): \
    CHECK_EOF_WITHOUT_CHUNK(loop) \
    c = *cur++; \
    xh_log_trace3("'%c'=[0x%X] %s start", c, c, XH_STRINGIZE(loop)); \
    switch (c) {

/*
 * Closes the switch opened by DO()/_DO(), loops back to <loop>_START and
 * defines the <loop>_FINISH label reached on end of input.
 */
#define END(loop) \
    } \
    xh_log_trace1("           %s end", XH_STRINGIZE(loop)); \
    goto XH_PPCAT(loop, _START); \
XH_PPCAT(loop, _FINISH):
221
+
222
/*
 * Case-label helpers for the switch opened by DO()/_DO().  Each one emits
 * the label(s) plus a trace line; the statements that follow the macro at
 * the call site form the body of the case.
 */

/* Any character not matched by another label. */
#define EXPECT_ANY(desc) \
    default: \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Exactly the character c1. */
#define EXPECT_CHAR(desc, c1) \
    case c1: \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Space, tab or line feed (carriage return excluded). */
#define EXPECT_BLANK_WO_CR(desc) \
    case ' ': case '\t': case '\n': \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Space, tab, line feed or carriage return. */
#define EXPECT_BLANK(desc) \
    case ' ': case '\t': case '\n': case '\r': \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Decimal digit. */
#define EXPECT_DIGIT(desc) \
    case '0': case '1': case '2': case '3': case '4': \
    case '5': case '6': case '7': case '8': case '9': \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Lower-case hexadecimal letter a-f. */
#define EXPECT_HEX_CHAR_LC(desc) \
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Upper-case hexadecimal letter A-F. */
#define EXPECT_HEX_CHAR_UC(desc) \
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': \
        xh_log_trace3("'%c'=[0x%X] - %s expected", c, c, desc);

/* Consumes a blank character and continues with the enclosing loop. */
#define SKIP_BLANK \
    EXPECT_BLANK("skip blank") break;
251
+
252
/*
 * SCANn(loop, c1..cn) opens n nested DO() loops that match the literal
 * character sequence c1..cn; the code following the macro runs once the
 * whole sequence has been read.  ENDn(loop, stop) closes those loops in
 * reverse order and sends every mismatch or premature end of input to the
 * label `stop`.  Sub-loops are named <loop>_1, <loop>_2, ...
 */

#define SCAN2(loop, c1, c2) \
    DO(XH_PPCAT(loop, _1)) EXPECT_CHAR(XH_STRINGIZE(c1), c1) \
    DO(XH_PPCAT(loop, _2)) EXPECT_CHAR(XH_STRINGIZE(c2), c2)

#define END2(loop, stop) \
        EXPECT_ANY("wrong character") goto stop; \
    END(XH_PPCAT(loop, _2)) goto stop; \
        EXPECT_ANY("wrong character") goto stop; \
    END(XH_PPCAT(loop, _1))

/*
 * Uses XH_STRINGIZE like SCAN2 and DO; the previous bare STRINGIZE is not
 * defined anywhere in this file and only went unnoticed while the trace
 * macros discarded their arguments.
 */
#define SCAN3(loop, c1, c2, c3) \
    DO(XH_PPCAT(loop, _1)) EXPECT_CHAR(XH_STRINGIZE(c1), c1) \
    DO(XH_PPCAT(loop, _2)) EXPECT_CHAR(XH_STRINGIZE(c2), c2) \
    DO(XH_PPCAT(loop, _3)) EXPECT_CHAR(XH_STRINGIZE(c3), c3)

#define END3(loop, stop) \
        EXPECT_ANY("wrong character") goto stop; \
    END(XH_PPCAT(loop, _3)) goto stop; \
        EXPECT_ANY("wrong character") goto stop; \
    END(XH_PPCAT(loop, _2)) goto stop; \
        EXPECT_ANY("wrong character") goto stop; \
    END(XH_PPCAT(loop, _1))

/* Five characters: a 3-scan followed by a 2-scan. */
#define SCAN5(loop, c1, c2, c3, c4, c5) \
    SCAN3(XH_PPCAT(loop, _1), c1, c2, c3) \
    SCAN2(XH_PPCAT(loop, _2), c4, c5)

#define END5(loop, stop) \
    END2(XH_PPCAT(loop, _2), stop) \
    END3(XH_PPCAT(loop, _1), stop)

/* Six characters: two consecutive 3-scans. */
#define SCAN6(loop, c1, c2, c3, c4, c5, c6) \
    SCAN3(XH_PPCAT(loop, _1), c1, c2, c3) \
    SCAN3(XH_PPCAT(loop, _2), c4, c5, c6)

#define END6(loop, stop) \
    END3(XH_PPCAT(loop, _2), stop) \
    END3(XH_PPCAT(loop, _1), stop)
290
+
291
/*
 * Case labels that end an opening tag inside the attribute loop:
 * '>' continues with content; '/' closes the node at once and then requires
 * the '>' that completes the self-closing tag.
 */
#define SEARCH_END_TAG \
    EXPECT_CHAR("end tag", '>') \
        goto PARSE_CONTENT; \
    EXPECT_CHAR("self closing tag", '/') \
        CLOSE_TAG \
        DO(SEARCH_END_TAG) \
            EXPECT_CHAR("end tag", '>') \
                goto PARSE_CONTENT; \
            EXPECT_ANY("wrong character") \
                goto INVALID_XML; \
        END(SEARCH_END_TAG) \
        goto INVALID_XML;

/*
 * Case label for an element attribute value delimited by `quot`.
 * Scans to the closing quote, noting CR and '&' so the value is normalized
 * only when needed, hands name (node..end) and value to NEW_ATTRIBUTE and
 * jumps back to `top_loop`.
 */
#define SEARCH_NODE_ATTRIBUTE_VALUE(loop, top_loop, quot) \
    EXPECT_CHAR("start attr value", quot) \
        content = cur; \
        flags &= ~XH_X2H_NEED_NORMALIZE; \
        DO(XH_PPCAT(loop, _END_ATTR_VALUE)) \
            EXPECT_CHAR("attr value end", quot) \
                if (flags & XH_X2H_NEED_NORMALIZE) { \
                    NORMALIZE_TEXT(loop, content, cur - content - 1) \
                    NEW_ATTRIBUTE(node, end - node, enc, enc_len) \
                } \
                else { \
                    NEW_ATTRIBUTE(node, end - node, content, cur - content - 1) \
                } \
                goto top_loop; \
            EXPECT_CHAR("CR", '\r') \
                flags |= XH_X2H_NORMALIZE_LINE_FEED; \
                break; \
            EXPECT_CHAR("reference", '&') \
                flags |= XH_X2H_NORMALIZE_REF; \
                break; \
        END(XH_PPCAT(loop, _END_ATTR_VALUE)) \
        goto INVALID_XML;

/*
 * Case label for an XML-declaration attribute value delimited by `quot`.
 * The value is passed to NEW_ATTRIBUTE verbatim, without normalization.
 */
#define SEARCH_XML_DECL_ATTRIBUTE_VALUE(loop, top_loop, quot) \
    EXPECT_CHAR("start attr value", quot) \
        content = cur; \
        DO(XH_PPCAT(loop, _END_ATTR_VALUE)) \
            EXPECT_CHAR("attr value end", quot) \
                NEW_ATTRIBUTE(node, end - node, content, cur - content - 1) \
                goto top_loop; \
        END(XH_PPCAT(loop, _END_ATTR_VALUE)) \
        goto INVALID_XML;

/* Attribute-value scanner currently in effect; element attributes by default. */
#define SEARCH_ATTRIBUTE_VALUE(loop, top_loop, quot) SEARCH_NODE_ATTRIBUTE_VALUE(loop, top_loop, quot)

/*
 * Attribute list scanner.  Repeats: skip blanks, read a name up to a blank
 * or '=', require '=', then read a value quoted with '"' or '\''.
 * `search_end_tag` supplies the case labels that terminate the list.  Any
 * unexpected character or premature end of input jumps to INVALID_XML.
 */
#define SEARCH_ATTRIBUTES(loop, search_end_tag) \
XH_PPCAT(loop, _SEARCH_ATTRIBUTES_LOOP): \
    DO(XH_PPCAT(loop, _SEARCH_ATTR)) \
        search_end_tag \
        \
        SKIP_BLANK \
        \
        EXPECT_ANY("start attr name") \
            node = cur - 1; \
            \
            DO(XH_PPCAT(loop, _PARSE_ATTR_NAME)) \
                EXPECT_BLANK("end attr name") \
                    end = cur - 1; \
                    xh_log_trace2("attr name: [%.*s]", end - node, node); \
                    \
                    DO(XH_PPCAT(loop, _ATTR_SKIP_BLANK)) \
                        EXPECT_CHAR("search attr value", '=') \
                            goto XH_PPCAT(loop, _SEARCH_ATTRIBUTE_VALUE); \
                        SKIP_BLANK \
                        EXPECT_ANY("wrong character") \
                            goto INVALID_XML; \
                    END(XH_PPCAT(loop, _ATTR_SKIP_BLANK)) \
                    goto INVALID_XML; \
                EXPECT_CHAR("end attr name", '=') \
                    end = cur - 1; \
                    xh_log_trace2("attr name: [%.*s]", end - node, node); \
                    \
XH_PPCAT(loop, _SEARCH_ATTRIBUTE_VALUE): \
                    DO(XH_PPCAT(loop, _PARSE_ATTR_VALUE)) \
                        SEARCH_ATTRIBUTE_VALUE(XH_PPCAT(loop, _1), XH_PPCAT(loop, _SEARCH_ATTRIBUTES_LOOP), '"') \
                        SEARCH_ATTRIBUTE_VALUE(XH_PPCAT(loop, _2), XH_PPCAT(loop, _SEARCH_ATTRIBUTES_LOOP), '\'') \
                        SKIP_BLANK \
                        EXPECT_ANY("wrong character") \
                            goto INVALID_XML; \
                    END(XH_PPCAT(loop, _PARSE_ATTR_VALUE)) \
                    goto INVALID_XML; \
            END(XH_PPCAT(loop, _PARSE_ATTR_NAME)) \
            goto INVALID_XML; \
    END(XH_PPCAT(loop, _SEARCH_ATTR)) \
    goto INVALID_XML;
379
+
380
/*
 * Parses the remainder of "<?xml ... ?>" after the '?': the literal "xml",
 * one blank, then the attribute list terminated by
 * SEARCH_END_XML_DECLARATION.  Anything else jumps to INVALID_XML.
 */
#define PARSE_XML_DECLARATION \
    SCAN3(XML_DECL, 'x', 'm', 'l') \
        DO(XML_DECL_ATTR) \
            EXPECT_BLANK("blank") \
                SEARCH_ATTRIBUTES(XML_DECL_ATTR, SEARCH_END_XML_DECLARATION) \
                goto INVALID_XML; \
            EXPECT_ANY("wrong character") \
                goto INVALID_XML; \
        END(XML_DECL_ATTR) \
        goto INVALID_XML; \
    END3(XML_DECL, INVALID_XML) \
    goto INVALID_XML;

/*
 * Case label ending the XML declaration: '?' must be followed directly by
 * '>', after which control goes to XML_DECL_FOUND.
 */
#define SEARCH_END_XML_DECLARATION \
    EXPECT_CHAR("end tag", '?') \
        DO(XML_DECL_SEARCH_END_TAG2) \
            EXPECT_CHAR("end tag", '>') \
                goto XML_DECL_FOUND; \
            EXPECT_ANY("wrong character") \
                goto INVALID_XML; \
        END(XML_DECL_SEARCH_END_TAG2) \
        goto INVALID_XML;
402
+
403
/*
 * Parses a comment after "<!-": requires the second '-', then scans for the
 * terminating "-->".  `content`/`end` track the comment text with leading
 * and trailing blanks left out; the text is handed to NEW_COMMENT before
 * returning to PARSE_CONTENT.  End of input inside the comment jumps to
 * INVALID_XML.
 */
#define PARSE_COMMENT \
    DO(COMMENT1) \
        EXPECT_CHAR("-", '-') \
            content = NULL; \
            DO(END_COMMENT1) \
                SKIP_BLANK \
                EXPECT_CHAR("1st -", '-') \
                    if (content == NULL) content = end = cur - 1; \
                    DO(END_COMMENT2) \
                        EXPECT_CHAR("2nd -", '-') \
                            DO(END_COMMENT3) \
                                EXPECT_CHAR(">", '>') \
                                    NEW_COMMENT(content, end - content) \
                                    goto PARSE_CONTENT; \
                                EXPECT_CHAR("2nd -", '-') \
                                    end = cur - 2; \
                                    goto END_COMMENT3_START; \
                                EXPECT_ANY("any character") \
                                    end = cur - 1; \
                                    goto END_COMMENT1_START; \
                            END(END_COMMENT3) \
                        EXPECT_BLANK("skip blank") \
                            end = cur - 1; \
                            goto END_COMMENT1_START; \
                        EXPECT_ANY("any character") \
                            end = cur; \
                            goto END_COMMENT1_START; \
                    END(END_COMMENT2) \
                EXPECT_ANY("any char") \
                    if (content == NULL) content = cur - 1; \
                    end = cur; \
            END(END_COMMENT1) \
            goto INVALID_XML; \
        \
        EXPECT_ANY("wrong character") \
            goto INVALID_XML; \
        \
    END(COMMENT1) \
    goto INVALID_XML;
442
+
443
/*
 * Parses a CDATA section after "<![": requires the literal "CDATA[", then
 * scans for the terminating "]]>".  Everything in between is passed to
 * NEW_CDATA unchanged.  A malformed prefix or end of input inside the
 * section jumps to INVALID_XML.
 */
#define PARSE_CDATA \
    SCAN6(CDATA, 'C', 'D', 'A', 'T', 'A', '[') \
        content = end = cur; \
        DO(END_CDATA1) \
            EXPECT_CHAR("1st ]", ']') \
                DO(END_CDATA2) \
                    EXPECT_CHAR("2nd ]", ']') \
                        DO(END_CDATA3) \
                            EXPECT_CHAR(">", '>') \
                                end = cur - 3; \
                                NEW_CDATA(content, end - content) \
                                goto PARSE_CONTENT; \
                            EXPECT_CHAR("2nd ]", ']') \
                                goto END_CDATA3_START; \
                            EXPECT_ANY("any character") \
                                goto END_CDATA1_START; \
                        END(END_CDATA3) \
                    EXPECT_ANY("any character") \
                        goto END_CDATA1_START; \
                END(END_CDATA2) \
                ; \
        END(END_CDATA1) \
        goto INVALID_XML; \
    END6(CDATA, INVALID_XML)

/*
 * Variant of PARSE_CDATA used with the trim option: `content`/`end` are
 * maintained so that leading and trailing blanks of the section are left
 * out of the text passed to NEW_CDATA.
 */
#define PARSE_CDATA_WITH_TRIM \
    SCAN6(CDATA_WITH_TRIM, 'C', 'D', 'A', 'T', 'A', '[') \
        content = NULL; \
        DO(END_CDATA_WITH_TRIM1) \
            SKIP_BLANK \
            EXPECT_CHAR("1st ]", ']') \
                if (content == NULL) content = end = cur - 1; \
                DO(END_CDATA_WITH_TRIM2) \
                    EXPECT_CHAR("2nd ]", ']') \
                        DO(END_CDATA_WITH_TRIM3) \
                            EXPECT_CHAR(">", '>') \
                                NEW_CDATA(content, end - content) \
                                goto PARSE_CONTENT; \
                            EXPECT_CHAR("2nd ]", ']') \
                                end = cur - 2; \
                                goto END_CDATA_WITH_TRIM3_START; \
                            EXPECT_ANY("any character") \
                                end = cur - 1; \
                                goto END_CDATA_WITH_TRIM1_START; \
                        END(END_CDATA_WITH_TRIM3) \
                    EXPECT_BLANK("skip blank") \
                        end = cur - 1; \
                        goto END_CDATA_WITH_TRIM1_START; \
                    EXPECT_ANY("any character") \
                        end = cur; \
                        goto END_CDATA_WITH_TRIM1_START; \
                END(END_CDATA_WITH_TRIM2) \
            EXPECT_ANY("any char") \
                if (content == NULL) content = cur - 1; \
                end = cur; \
        END(END_CDATA_WITH_TRIM1) \
        goto INVALID_XML; \
    END6(CDATA_WITH_TRIM, INVALID_XML)
501
+
502
/*
 * Decodes one reference; `cur` points just past the '&'.
 * Supported forms: &#NNN; &#xHHH; &amp; &apos; &lt; &gt; &quot;
 * The resulting code point is written at `enc_cur` as UTF-8 (1-4 bytes).
 * Anything else jumps to INVALID_REF.
 *
 * Numeric references are validated while they are read:
 *   - the accumulator is checked against 0x10FFFF after every digit, so an
 *     over-long digit run can no longer wrap the unsigned `code` around to
 *     an apparently valid value;
 *   - a character that is neither a digit nor ';' is rejected instead of
 *     being silently skipped.
 */
#define NORMALIZE_REFERENCE(loop) \
    _DO(XH_PPCAT(loop, _REFERENCE)) \
        EXPECT_CHAR("char reference", '#') \
            _DO(XH_PPCAT(loop, _CHAR_REFERENCE)) \
                EXPECT_CHAR("hex", 'x') \
                    code = 0; \
                    _DO(XH_PPCAT(loop, _HEX_CHAR_REFERENCE_LOOP)) \
                        EXPECT_DIGIT("hex digit") \
                            code = code * 16 + (c - '0'); \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_HEX_CHAR_LC("hex a-f") \
                            code = code * 16 + (c - 'a') + 10; \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_HEX_CHAR_UC("hex A-F") \
                            code = code * 16 + (c - 'A') + 10; \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_CHAR("reference end", ';') \
                            goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
                        EXPECT_ANY("wrong character") \
                            goto INVALID_REF; \
                    END(XH_PPCAT(loop, _HEX_CHAR_REFERENCE_LOOP)) \
                    goto INVALID_REF; \
                EXPECT_DIGIT("digit") \
                    code = (c - '0'); \
                    _DO(XH_PPCAT(loop, _CHAR_REFERENCE_LOOP)) \
                        EXPECT_DIGIT("digit") \
                            code = code * 10 + (c - '0'); \
                            if (code > 0x10FFFF) goto INVALID_REF; \
                            break; \
                        EXPECT_CHAR("reference end", ';') \
                            goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
                        EXPECT_ANY("wrong character") \
                            goto INVALID_REF; \
                    END(XH_PPCAT(loop, _CHAR_REFERENCE_LOOP)) \
                    goto INVALID_REF; \
                EXPECT_ANY("any char") \
                    goto INVALID_REF; \
            END(XH_PPCAT(loop, _CHAR_REFERENCE)) \
            goto INVALID_REF; \
        EXPECT_CHAR("amp or apos", 'a') \
            if (xh_str_equal3(cur, 'm', 'p', ';')) { \
                code = '&'; \
                cur += 3; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            if (xh_str_equal4(cur, 'p', 'o', 's', ';')) { \
                code = '\''; \
                cur += 4; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_CHAR("lt", 'l') \
            if (xh_str_equal2(cur, 't', ';')) { \
                code = '<'; \
                cur += 2; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_CHAR("gt", 'g') \
            if (xh_str_equal2(cur, 't', ';')) { \
                code = '>'; \
                cur += 2; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_CHAR("quot", 'q') \
            if (xh_str_equal4(cur, 'u', 'o', 't', ';')) { \
                code = '"'; \
                cur += 4; \
                goto XH_PPCAT(loop, _REFEFENCE_VALUE); \
            } \
            goto INVALID_REF; \
        EXPECT_ANY("any char") \
            goto INVALID_REF; \
    END(XH_PPCAT(loop, _REFERENCE)) \
    goto INVALID_REF; \
XH_PPCAT(loop, _REFEFENCE_VALUE): \
    xh_log_trace1("parse reference value: %u", code); \
    if (code == 0 || code > 0x10FFFF) goto INVALID_REF; \
    if (code >= 0x80) { \
        /* lead byte first; `bits` is the shift of the first trailing byte */ \
        if (code < 0x800) { \
            *enc_cur++ = (code >> 6) | 0xC0;  bits = 0; \
        } \
        else if (code < 0x10000) { \
            *enc_cur++ = (code >> 12) | 0xE0; bits = 6; \
        } \
        else { \
            /* code <= 0x10FFFF is guaranteed by the check above */ \
            *enc_cur++ = (code >> 18) | 0xF0; bits = 12; \
        } \
        for (; bits >= 0; bits -= 6) { \
            *enc_cur++ = ((code >> bits) & 0x3F) | 0x80; \
        } \
    } \
    else { \
        *enc_cur++ = (xh_char_t) code; \
    }
596
+
597
/*
 * Line-end normalization; `cur` points just past a CR.
 * A directly following LF is consumed; any other character is pushed back.
 * In every case, including end of input, a single '\n' is emitted.
 */
#define NORMALIZE_LINE_FEED(loop) \
    _DO(XH_PPCAT(loop, _NORMALIZE_LINE_FEED)) \
        EXPECT_CHAR("LF", '\n') \
            goto XH_PPCAT(loop, _NORMALIZE_LINE_FEED_END); \
        EXPECT_ANY("any char") \
            cur--; \
            goto XH_PPCAT(loop, _NORMALIZE_LINE_FEED_END); \
    END(XH_PPCAT(loop, _NORMALIZE_LINE_FEED)) \
XH_PPCAT(loop, _NORMALIZE_LINE_FEED_END): \
    *enc_cur++ = '\n';
607
+
608
/*
 * Produces a normalized copy of the text (s, l): references are decoded and
 * CR / CRLF become LF.  On exit `enc` points at the result and `enc_len`
 * holds its length.
 *
 * The copy lives in ctx->tmp, which is allocated on first use and grown on
 * demand; an allocation failure jumps to MALLOC.  `cur`/`eof` are borrowed
 * for the scan and restored afterwards.  Empty input is returned as is
 * (enc = s, enc_len = 0).  `l` is evaluated before `cur` is redirected.
 */
#define NORMALIZE_TEXT(loop, s, l) \
    enc_len = l; \
    if (!enc_len) { \
        enc = s; \
    } \
    else { \
        /* redirect the scanner to the text being normalized */ \
        old_cur = cur; \
        old_eof = eof; \
        cur = s; \
        eof = cur + enc_len; \
        if (ctx->tmp == NULL) { \
            xh_log_trace1("malloc() %lu", enc_len); \
            if ((ctx->tmp = malloc(enc_len)) == NULL) goto MALLOC; \
            ctx->tmp_size = enc_len; \
        } \
        else if (enc_len > ctx->tmp_size) { \
            xh_log_trace1("realloc() %lu", enc_len); \
            if ((enc = realloc(ctx->tmp, enc_len)) == NULL) goto MALLOC; \
            ctx->tmp = enc; \
            ctx->tmp_size = enc_len; \
        } \
        enc = enc_cur = ctx->tmp; \
        memcpy(enc, cur, enc_len); \
        _DO(XH_PPCAT(loop, _NORMALIZE_TEXT)) \
            EXPECT_CHAR("reference", '&') \
                NORMALIZE_REFERENCE(loop) \
                break; \
            EXPECT_CHAR("CR", '\r') \
                NORMALIZE_LINE_FEED(loop) \
                break; \
            EXPECT_ANY("any char") \
                *enc_cur++ = c; \
        END(XH_PPCAT(loop, _NORMALIZE_TEXT)) \
        enc_len = enc_cur - enc; \
        /* hand the scanner back to the main input */ \
        cur = old_cur; \
        eof = old_eof; \
    }
645
+
646
+ XH_INLINE void
647
+ xh_x2h_xpath_update(xh_char_t *xpath, xh_char_t *name, size_t name_len)
648
+ {
649
+ size_t len;
650
+
651
+ len = xh_strlen(xpath);
652
+ if (name != NULL) {
653
+ if ((len + name_len + 1) > XH_X2H_XPATH_MAX_LEN)
654
+ rb_raise(xh_parse_error_class, "XPath too long");
655
+
656
+ xpath[len++] = '/';
657
+ for (;name_len--;) xpath[len++] = *name++;
658
+ }
659
+ else if (len == 0) {
660
+ rb_raise(xh_parse_error_class, "Can't update xpath, something wrong!");
661
+ }
662
+ else {
663
+ for (;--len && xpath[len] != '/';) {/* void */}
664
+ }
665
+ xpath[len] = '\0';
666
+
667
+ xh_log_trace1("xpath: [%s]", xpath);
668
+ }
669
+
670
+ XH_INLINE xh_bool_t
671
+ xh_x2h_match_node(xh_char_t *name, size_t name_len, VALUE expr)
672
+ {
673
+ size_t i, l;
674
+ VALUE ary;
675
+ VALUE str;
676
+ xh_char_t *expr_str;
677
+ size_t expr_len;
678
+
679
+ xh_log_trace2("match node: [%.*s]", name_len, name);
680
+
681
+ str = _NEW_STRING(name, name_len, TRUE);
682
+
683
+ if ( RB_TYPE_P(expr, RUBY_T_REGEXP) ) {
684
+ if (rb_reg_search(expr, str, 0, 0) >= 0) return TRUE;
685
+ }
686
+ else if ( RB_TYPE_P(expr, RUBY_T_ARRAY) ) {
687
+ l = RARRAY_LEN(expr);
688
+ ary = expr;
689
+ for (i = 0; i < l; i++) {
690
+ expr = RARRAY_AREF(ary, i);
691
+ if ( RB_TYPE_P(expr, RUBY_T_REGEXP) ) {
692
+ if (rb_reg_search(expr, str, 0, 0) >= 0) return TRUE;
693
+ }
694
+ else {
695
+ expr_str = XH_CHAR_CAST RSTRING_PTR(expr);
696
+ expr_len = RSTRING_LEN(expr);
697
+ if (name_len == expr_len && !xh_strncmp(name, expr_str, name_len)) {
698
+ return TRUE;
699
+ }
700
+ }
701
+ }
702
+ } else {
703
+ xh_log_trace0("match string");
704
+ expr_str = XH_CHAR_CAST RSTRING_PTR(expr);
705
+ expr_len = RSTRING_LEN(expr);
706
+ xh_log_trace2("expr: [%.*s]", expr_len, expr_str);
707
+ if (name_len == expr_len && !xh_strncmp(name, expr_str, name_len)) {
708
+ xh_log_trace0("match TRUE");
709
+ return TRUE;
710
+ }
711
+ }
712
+
713
+ return FALSE;
714
+ }
715
+
716
+ static void
717
+ xh_x2h_parse_chunk(xh_x2h_ctx_t *ctx, xh_char_t **buf, size_t *bytesleft, xh_bool_t terminate)
718
+ {
719
+ xh_char_t c, *cur, *node, *end, *content, *eof, *enc,
720
+ *enc_cur, *old_cur, *old_eof, *content_key;
721
+ unsigned int depth, real_depth, code, flags;
722
+ int bits;
723
+ VALUE *lval, val;
724
+ xh_x2h_node_t *nodes;
725
+ VALUE av;
726
+ size_t enc_len, content_key_len;
727
+
728
+ cur = *buf;
729
+ eof = cur + *bytesleft;
730
+ nodes = ctx->nodes;
731
+ depth = ctx->depth;
732
+ real_depth = ctx->real_depth;
733
+ flags = ctx->flags;
734
+ node = ctx->node;
735
+ end = ctx->end;
736
+ content = ctx->content;
737
+ code = ctx->code;
738
+ lval = ctx->lval;
739
+ enc = enc_cur = old_eof = old_cur = NULL;
740
+ c = '\0';
741
+
742
+ if (ctx->opts.content[0] == '\0') {
743
+ content_key = (xh_char_t *) DEF_CONTENT_KEY;
744
+ content_key_len = sizeof(DEF_CONTENT_KEY) - 1;
745
+ }
746
+ else {
747
+ content_key = ctx->opts.content;
748
+ content_key_len = xh_strlen(ctx->opts.content);
749
+ }
750
+
751
+ #define XH_X2H_PROCESS_STATE(st) case st: goto st;
752
+ switch (ctx->state) {
753
+ case PARSER_ST_NONE: break;
754
+ XH_X2H_PARSER_STATE_LIST
755
+ case XML_DECL_FOUND: break;
756
+ case PARSER_ST_DONE: goto DONE;
757
+ }
758
+ #undef XH_X2H_PROCESS_STATE
759
+
760
+ PARSE_CONTENT:
761
+ content = NULL;
762
+ flags &= ~(XH_X2H_NEED_NORMALIZE | XH_X2H_IS_NOT_BLANK);
763
+ DO(CONTENT)
764
+ EXPECT_CHAR("new element", '<')
765
+ if (content != NULL) {
766
+ if (flags & XH_X2H_IS_NOT_BLANK) {
767
+ if (flags & XH_X2H_NEED_NORMALIZE) {
768
+ NORMALIZE_TEXT(TEXT1, content, end - content)
769
+ NEW_TEXT(enc, enc_len)
770
+ }
771
+ else {
772
+ NEW_TEXT(content, end - content)
773
+ }
774
+ }
775
+ content = NULL;
776
+ }
777
+ DO(PARSE_ELEMENT)
778
+ EXPECT_CHAR("xml declaration", '?')
779
+ if (real_depth != 0) goto INVALID_XML;
780
+ #undef NEW_ATTRIBUTE
781
+ #define NEW_ATTRIBUTE(k, kl, v, vl) NEW_XML_DECL_ATTRIBUTE(k, kl, v, vl)
782
+ #undef SEARCH_ATTRIBUTE_VALUE
783
+ #define SEARCH_ATTRIBUTE_VALUE(loop, top_loop, quot) SEARCH_XML_DECL_ATTRIBUTE_VALUE(loop, top_loop, quot)
784
+ PARSE_XML_DECLARATION
785
+ #undef NEW_ATTRIBUTE
786
+ #define NEW_ATTRIBUTE(k, kl, v, vl) NEW_NODE_ATTRIBUTE(k, kl, v, vl)
787
+ #undef SEARCH_ATTRIBUTE_VALUE
788
+ #define SEARCH_ATTRIBUTE_VALUE(loop, top_loop, quot) SEARCH_NODE_ATTRIBUTE_VALUE(loop, top_loop, quot)
789
+ EXPECT_CHAR("comment", '!')
790
+ DO(XML_COMMENT_NODE_OR_CDATA)
791
+ EXPECT_CHAR("comment", '-')
792
+ PARSE_COMMENT
793
+ EXPECT_CHAR("cdata", '[')
794
+ if (ctx->opts.trim) {
795
+ PARSE_CDATA_WITH_TRIM
796
+ ;
797
+ }
798
+ else {
799
+ PARSE_CDATA
800
+ ;
801
+ }
802
+ EXPECT_ANY("wrong character")
803
+ goto INVALID_XML;
804
+ END(XML_COMMENT_NODE_OR_CDATA)
805
+ goto INVALID_XML;
806
+ EXPECT_CHAR("closing tag", '/')
807
+ //node = cur;
808
+ DO(PARSE_CLOSING_TAG)
809
+ EXPECT_CHAR("end tag name", '>')
810
+ CLOSE_TAG
811
+ goto PARSE_CONTENT;
812
+ EXPECT_BLANK("end tag name")
813
+ DO(SEARCH_CLOSING_END_TAG)
814
+ EXPECT_CHAR("end tag", '>')
815
+ CLOSE_TAG
816
+ goto PARSE_CONTENT;
817
+ SKIP_BLANK
818
+ EXPECT_ANY("wrong character")
819
+ goto INVALID_XML;
820
+ END(SEARCH_CLOSING_END_TAG)
821
+ goto INVALID_XML;
822
+ END(PARSE_CLOSING_TAG)
823
+ goto INVALID_XML;
824
+ EXPECT_ANY("opening tag")
825
+ node = cur - 1;
826
+ DO(PARSE_OPENING_TAG)
827
+ EXPECT_CHAR("end tag", '>')
828
+ OPEN_TAG(node, cur - node - 1)
829
+ goto PARSE_CONTENT;
830
+ EXPECT_CHAR("self closing tag", '/')
831
+ OPEN_TAG(node, cur - node - 1)
832
+ CLOSE_TAG
833
+
834
+ DO(SEARCH_OPENING_END_TAG)
835
+ EXPECT_CHAR("end tag", '>')
836
+ goto PARSE_CONTENT;
837
+ EXPECT_ANY("wrong character")
838
+ goto INVALID_XML;
839
+ END(SEARCH_OPENING_END_TAG)
840
+ goto INVALID_XML;
841
+ EXPECT_BLANK("end tag name")
842
+ OPEN_TAG(node, cur - node - 1)
843
+
844
+ SEARCH_ATTRIBUTES(NODE, SEARCH_END_TAG)
845
+
846
+ goto PARSE_CONTENT;
847
+ END(PARSE_OPENING_TAG);
848
+ goto INVALID_XML;
849
+ END(PARSE_ELEMENT)
850
+
851
+ EXPECT_CHAR("wrong symbol", '>')
852
+ goto INVALID_XML;
853
+ EXPECT_BLANK_WO_CR("blank")
854
+ if (!ctx->opts.trim)
855
+ goto START_CONTENT;
856
+ break;
857
+ EXPECT_CHAR("CR", '\r')
858
+ if (content != NULL) {
859
+ flags |= XH_X2H_NORMALIZE_LINE_FEED;
860
+ }
861
+ if (!ctx->opts.trim)
862
+ goto START_CONTENT;
863
+ break;
864
+ EXPECT_CHAR("reference", '&')
865
+ flags |= XH_X2H_NORMALIZE_REF;
866
+ EXPECT_ANY("any char")
867
+ flags |= XH_X2H_IS_NOT_BLANK;
868
+ START_CONTENT:
869
+ if (content == NULL) content = cur - 1;
870
+ end = cur;
871
+ END(CONTENT)
872
+
873
+ if (content != NULL) {
874
+ if (flags & XH_X2H_IS_NOT_BLANK) {
875
+ if (flags & XH_X2H_NEED_NORMALIZE) {
876
+ NORMALIZE_TEXT(TEXT2, content, end - content)
877
+ NEW_TEXT(enc, enc_len)
878
+ }
879
+ else {
880
+ NEW_TEXT(content, end - content)
881
+ }
882
+ }
883
+ content = NULL;
884
+ }
885
+
886
+ if (real_depth != 0 || !(flags & XH_X2H_ROOT_FOUND)) goto INVALID_XML;
887
+
888
+ ctx->state = PARSER_ST_DONE;
889
+ *bytesleft = eof - cur;
890
+ *buf = cur;
891
+ return;
892
+
893
+ XML_DECL_FOUND:
894
+ ctx->state = XML_DECL_FOUND;
895
+ CHUNK_FINISH:
896
+ ctx->content = content;
897
+ ctx->node = node;
898
+ ctx->end = end;
899
+ ctx->depth = depth;
900
+ ctx->real_depth = real_depth;
901
+ ctx->flags = flags;
902
+ ctx->code = code;
903
+ ctx->lval = lval;
904
+ *bytesleft = eof - cur;
905
+ *buf = cur;
906
+ return;
907
+
908
+ MAX_DEPTH_EXCEEDED:
909
+ rb_raise(xh_parse_error_class, "Maximum depth exceeded");
910
+ INVALID_XML:
911
+ rb_raise(xh_parse_error_class, "Invalid XML");
912
+ INVALID_REF:
913
+ rb_raise(xh_parse_error_class, "Invalid reference");
914
+ MALLOC:
915
+ rb_raise(rb_eNoMemError, "Memory allocation error");
916
+ DONE:
917
+ rb_raise(xh_parse_error_class, "Parsing is done");
918
+ }
919
+
920
+ static void
921
+ xh_x2h_parse(xh_x2h_ctx_t *ctx, xh_reader_t *reader)
922
+ {
923
+ xh_char_t *buf, *preserve;
924
+ size_t len, off;
925
+ xh_bool_t eof;
926
+
927
+ do {
928
+ preserve = ctx->node != NULL ? ctx->node : ctx->content;
929
+
930
+ len = reader->read(reader, &buf, preserve, &off);
931
+ eof = (len == 0);
932
+ if (off) {
933
+ if (ctx->node != NULL) ctx->node -= off;
934
+ if (ctx->content != NULL) ctx->content -= off;
935
+ if (ctx->end != NULL) ctx->end -= off;
936
+ }
937
+
938
+ xh_log_trace2("read buf: %.*s", len, buf);
939
+
940
+ do {
941
+ xh_log_trace2("parse buf: %.*s", len, buf);
942
+
943
+ xh_x2h_parse_chunk(ctx, &buf, &len, eof);
944
+
945
+ if (ctx->state == XML_DECL_FOUND && ctx->opts.encoding[0] == '\0' && ctx->encoding[0] != '\0') {
946
+ reader->switch_encoding(reader, ctx->encoding, &buf, &len);
947
+ }
948
+ } while (len > 0);
949
+ } while (!eof);
950
+
951
+ if (ctx->state != PARSER_ST_DONE)
952
+ rb_raise(xh_parse_error_class, "Invalid XML");
953
+ }
954
+
955
+ static VALUE
956
+ xh_x2h_exec(VALUE arg)
957
+ {
958
+ xh_x2h_ctx_t *ctx = (xh_x2h_ctx_t *) arg;
959
+
960
+ if (ctx->opts.filter.enable) {
961
+ ctx->flags |= XH_X2H_FILTER_ENABLED;
962
+ if (!ctx->opts.block_given)
963
+ ctx->result = rb_ary_new();
964
+ }
965
+ else {
966
+ ctx->result = hash_new();
967
+ ctx->nodes[0].lval = ctx->lval = &ctx->result;
968
+ }
969
+
970
+ xh_reader_init(&ctx->reader, ctx->input, ctx->opts.encoding, ctx->opts.buf_size);
971
+
972
+ xh_x2h_parse(ctx, &ctx->reader);
973
+
974
+ return Qnil;
975
+ }
976
+
977
+ VALUE
978
+ xh_x2h(xh_x2h_ctx_t *ctx)
979
+ {
980
+ VALUE result;
981
+ int state;
982
+
983
+ result = rb_protect(xh_x2h_exec, (VALUE) ctx, &state);
984
+
985
+ if (state) {
986
+ xh_reader_destroy(&ctx->reader);
987
+ rb_exc_raise(rb_errinfo());
988
+ }
989
+
990
+ xh_reader_destroy(&ctx->reader);
991
+
992
+ result = ctx->result;
993
+ if (ctx->opts.filter.enable) {
994
+ if (ctx->opts.block_given)
995
+ result = Qnil;
996
+ }
997
+ else if (!ctx->opts.keep_root) {
998
+ result = hash_first_value(result);
999
+ }
1000
+
1001
+ return result;
1002
+ }