onigmo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,985 @@
1
+ #include <ruby.h>
2
+ #include <ruby/onigmo.h>
3
+ #include <ruby/encoding.h>
4
+
5
+ #include "regint.h"
6
+ #include "regparse.h"
7
+
8
+ VALUE rb_cOnigmoNode;
9
+ VALUE rb_cOnigmoAlternationNode;
10
+ VALUE rb_cOnigmoAnchorBufferBeginNode;
11
+ VALUE rb_cOnigmoAnchorBufferEndNode;
12
+ VALUE rb_cOnigmoAnchorKeepNode;
13
+ VALUE rb_cOnigmoAnchorLineBeginNode;
14
+ VALUE rb_cOnigmoAnchorLineEndNode;
15
+ VALUE rb_cOnigmoAnchorPositionBeginNode;
16
+ VALUE rb_cOnigmoAnchorSemiEndNode;
17
+ VALUE rb_cOnigmoAnchorWordBoundaryNode;
18
+ VALUE rb_cOnigmoAnchorWordBoundaryInvertNode;
19
+ VALUE rb_cOnigmoAnyNode;
20
+ VALUE rb_cOnigmoBackrefNode;
21
+ VALUE rb_cOnigmoCallNode;
22
+ VALUE rb_cOnigmoCClassNode;
23
+ VALUE rb_cOnigmoCClassInvertNode;
24
+ VALUE rb_cOnigmoEncloseAbsentNode;
25
+ VALUE rb_cOnigmoEncloseConditionNode;
26
+ VALUE rb_cOnigmoEncloseMemoryNode;
27
+ VALUE rb_cOnigmoEncloseOptionsNode;
28
+ VALUE rb_cOnigmoEncloseStopBacktrackNode;
29
+ VALUE rb_cOnigmoListNode;
30
+ VALUE rb_cOnigmoLookAheadNode;
31
+ VALUE rb_cOnigmoLookAheadInvertNode;
32
+ VALUE rb_cOnigmoLookBehindNode;
33
+ VALUE rb_cOnigmoLookBehindInvertNode;
34
+ VALUE rb_cOnigmoQuantifierNode;
35
+ VALUE rb_cOnigmoStringNode;
36
+ VALUE rb_cOnigmoWordNode;
37
+ VALUE rb_cOnigmoWordInvertNode;
38
+
39
+ static VALUE
40
+ build_options(OnigOptionType option) {
41
+ VALUE options = rb_ary_new();
42
+
43
+ if (option & ONIG_OPTION_NONE) rb_ary_push(options, ID2SYM(rb_intern("none")));
44
+ if (option & ONIG_OPTION_IGNORECASE) rb_ary_push(options, ID2SYM(rb_intern("ignorecase")));
45
+ if (option & ONIG_OPTION_EXTEND) rb_ary_push(options, ID2SYM(rb_intern("extend")));
46
+ if (option & ONIG_OPTION_MULTILINE) rb_ary_push(options, ID2SYM(rb_intern("multiline")));
47
+ if (option & ONIG_OPTION_DOTALL) rb_ary_push(options, ID2SYM(rb_intern("dotall")));
48
+ if (option & ONIG_OPTION_SINGLELINE) rb_ary_push(options, ID2SYM(rb_intern("singleline")));
49
+ if (option & ONIG_OPTION_FIND_LONGEST) rb_ary_push(options, ID2SYM(rb_intern("find_longest")));
50
+ if (option & ONIG_OPTION_FIND_NOT_EMPTY) rb_ary_push(options, ID2SYM(rb_intern("find_not_empty")));
51
+ if (option & ONIG_OPTION_NEGATE_SINGLELINE) rb_ary_push(options, ID2SYM(rb_intern("negate_singleline")));
52
+ if (option & ONIG_OPTION_DONT_CAPTURE_GROUP) rb_ary_push(options, ID2SYM(rb_intern("dont_capture_group")));
53
+ if (option & ONIG_OPTION_CAPTURE_GROUP) rb_ary_push(options, ID2SYM(rb_intern("capture_group")));
54
+ if (option & ONIG_OPTION_NOTBOL) rb_ary_push(options, ID2SYM(rb_intern("not_bol")));
55
+ if (option & ONIG_OPTION_NOTEOL) rb_ary_push(options, ID2SYM(rb_intern("not_eol")));
56
+ if (option & ONIG_OPTION_NOTBOS) rb_ary_push(options, ID2SYM(rb_intern("not_bos")));
57
+ if (option & ONIG_OPTION_NOTEOS) rb_ary_push(options, ID2SYM(rb_intern("not_eos")));
58
+ if (option & ONIG_OPTION_ASCII_RANGE) rb_ary_push(options, ID2SYM(rb_intern("ascii_range")));
59
+ if (option & ONIG_OPTION_POSIX_BRACKET_ALL_RANGE) rb_ary_push(options, ID2SYM(rb_intern("posix_bracket_all_range")));
60
+ if (option & ONIG_OPTION_WORD_BOUND_ALL_RANGE) rb_ary_push(options, ID2SYM(rb_intern("word_bound_all_range")));
61
+ if (option & ONIG_OPTION_NEWLINE_CRLF) rb_ary_push(options, ID2SYM(rb_intern("newline_crlf")));
62
+
63
+ return options;
64
+ }
65
+
66
+ static VALUE
67
+ build_bitset(BitSetRef ref, OnigEncoding encoding) {
68
+ VALUE values = rb_ary_new();
69
+
70
+ for (int index = 0; index < SINGLE_BYTE_SIZE; index++) {
71
+ if (BITSET_AT(ref, index) != 0) {
72
+ const char character = (const char) index;
73
+ rb_ary_push(values, rb_enc_str_new(&character, 1, encoding));
74
+ }
75
+ }
76
+
77
+ return values;
78
+ }
79
+
80
+ static VALUE
81
+ build_node(Node *node, OnigEncoding encoding) {
82
+ int type = NTYPE(node);
83
+
84
+ switch (type) {
85
+ case NT_STR: {
86
+ VALUE value = rb_enc_str_new((const char *) NSTR(node)->s, NSTR(node)->end - NSTR(node)->s, encoding);
87
+ VALUE argv[] = { value };
88
+ return rb_class_new_instance(1, argv, rb_cOnigmoStringNode);
89
+ }
90
+ case NT_CCLASS: {
91
+ CClassNode* cclass_node = NCCLASS(node);
92
+ VALUE values = build_bitset(cclass_node->bs, encoding);
93
+
94
+ if (cclass_node->mbuf != NULL) {
95
+ BBuf *bbuf = cclass_node->mbuf;
96
+ OnigCodePoint *data = (OnigCodePoint *) bbuf->p;
97
+ OnigCodePoint *end = (OnigCodePoint *) (bbuf->p + bbuf->used);
98
+
99
+ for (++data; data < end; data += 2) {
100
+ for (OnigCodePoint code = data[0]; code < data[1]; code++) {
101
+ rb_ary_push(values, INT2NUM(code));
102
+ }
103
+ }
104
+ }
105
+
106
+ VALUE argv[] = { values };
107
+ if (IS_NCCLASS_NOT(cclass_node)) {
108
+ return rb_class_new_instance(1, argv, rb_cOnigmoCClassInvertNode);
109
+ } else {
110
+ return rb_class_new_instance(1, argv, rb_cOnigmoCClassNode);
111
+ }
112
+ }
113
+ case NT_CTYPE: {
114
+ if (NCTYPE(node)->ctype == ONIGENC_CTYPE_WORD) {
115
+ if (NCTYPE(node)->not == 0) {
116
+ return rb_class_new_instance(0, NULL, rb_cOnigmoWordNode);
117
+ } else {
118
+ return rb_class_new_instance(0, NULL, rb_cOnigmoWordInvertNode);
119
+ }
120
+ } else {
121
+ RUBY_ASSERT("unknown ctype");
122
+ return Qnil;
123
+ }
124
+ }
125
+ case NT_CANY: {
126
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnyNode);
127
+ }
128
+ case NT_BREF: {
129
+ BRefNode *backref_node = NBREF(node);
130
+ int *backrefs = BACKREFS_P(backref_node);
131
+
132
+ VALUE values = rb_ary_new();
133
+ for (int index = 0; index < backref_node->back_num; index++) {
134
+ rb_ary_push(values, INT2NUM(backrefs[index]));
135
+ }
136
+
137
+ VALUE argv[] = { values };
138
+ return rb_class_new_instance(1, argv, rb_cOnigmoBackrefNode);
139
+ }
140
+ case NT_QTFR: {
141
+ int lower = NQTFR(node)->lower;
142
+ int upper = NQTFR(node)->upper;
143
+
144
+ VALUE argv[] = {
145
+ lower == -1 ? Qnil : INT2NUM(lower),
146
+ upper = -1 ? Qnil : INT2NUM(upper),
147
+ (NQTFR(node)->greedy ? Qtrue : Qfalse),
148
+ build_node(NQTFR(node)->target, encoding)
149
+ };
150
+
151
+ return rb_class_new_instance(4, argv, rb_cOnigmoQuantifierNode);
152
+ }
153
+ case NT_ENCLOSE: {
154
+ VALUE target = build_node(NENCLOSE(node)->target, encoding);
155
+
156
+ switch (NENCLOSE(node)->type) {
157
+ case ENCLOSE_OPTION: {
158
+ VALUE argv[] = { build_options(NENCLOSE(node)->option), target };
159
+ return rb_class_new_instance(2, argv, rb_cOnigmoEncloseOptionsNode);
160
+ }
161
+ case ENCLOSE_MEMORY: {
162
+ VALUE argv[] = { INT2NUM(NENCLOSE(node)->regnum), target };
163
+ return rb_class_new_instance(2, argv, rb_cOnigmoEncloseMemoryNode);
164
+ }
165
+ case ENCLOSE_STOP_BACKTRACK: {
166
+ VALUE argv[] = { target };
167
+ return rb_class_new_instance(1, argv, rb_cOnigmoEncloseStopBacktrackNode);
168
+ }
169
+ case ENCLOSE_CONDITION: {
170
+ VALUE argv[] = { INT2NUM(NENCLOSE(node)->regnum), target };
171
+ return rb_class_new_instance(2, argv, rb_cOnigmoEncloseConditionNode);
172
+ }
173
+ case ENCLOSE_ABSENT: {
174
+ VALUE argv[] = { target };
175
+ return rb_class_new_instance(1, argv, rb_cOnigmoEncloseAbsentNode);
176
+ }
177
+ default:
178
+ RUBY_ASSERT("unknown enclose type");
179
+ return Qnil;
180
+ }
181
+ }
182
+ case NT_ANCHOR: {
183
+ switch (NANCHOR(node)->type) {
184
+ case ANCHOR_BEGIN_BUF:
185
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorBufferBeginNode);
186
+ case ANCHOR_END_BUF:
187
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorBufferEndNode);
188
+ case ANCHOR_BEGIN_LINE:
189
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorLineBeginNode);
190
+ case ANCHOR_END_LINE:
191
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorLineEndNode);
192
+ case ANCHOR_SEMI_END_BUF:
193
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorSemiEndNode);
194
+ case ANCHOR_BEGIN_POSITION:
195
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorPositionBeginNode);
196
+ case ANCHOR_WORD_BOUND:
197
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorWordBoundaryNode);
198
+ case ANCHOR_NOT_WORD_BOUND:
199
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorWordBoundaryInvertNode);
200
+ case ANCHOR_PREC_READ: {
201
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
202
+ VALUE argv[] = { target };
203
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookAheadNode);
204
+ }
205
+ case ANCHOR_PREC_READ_NOT: {
206
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
207
+ VALUE argv[] = { target };
208
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookAheadInvertNode);
209
+ }
210
+ case ANCHOR_LOOK_BEHIND: {
211
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
212
+ VALUE argv[] = { target };
213
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookBehindNode);
214
+ }
215
+ case ANCHOR_LOOK_BEHIND_NOT: {
216
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
217
+ VALUE argv[] = { target };
218
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookBehindInvertNode);
219
+ }
220
+ case ANCHOR_KEEP:
221
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorKeepNode);
222
+ default:
223
+ RUBY_ASSERT("unknown anchor type");
224
+ return Qnil;
225
+ }
226
+ }
227
+ case NT_LIST: {
228
+ VALUE nodes = rb_ary_new();
229
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
230
+
231
+ while (IS_NOT_NULL(node = NCDR(node))) {
232
+ RUBY_ASSERT(NTYPE(node) == type);
233
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
234
+ }
235
+
236
+ VALUE argv[] = { nodes };
237
+ return rb_class_new_instance(1, argv, rb_cOnigmoListNode);
238
+ }
239
+ case NT_ALT: {
240
+ VALUE nodes = rb_ary_new();
241
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
242
+
243
+ while (IS_NOT_NULL(node = NCDR(node))) {
244
+ RUBY_ASSERT(NTYPE(node) == type);
245
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
246
+ }
247
+
248
+ VALUE argv[] = { nodes };
249
+ return rb_class_new_instance(1, argv, rb_cOnigmoAlternationNode);
250
+ }
251
+ case NT_CALL: {
252
+ CallNode *call_node = NCALL(node);
253
+
254
+ VALUE name;
255
+ ptrdiff_t length = call_node->name_end - call_node->name;
256
+
257
+ if (length > 0) {
258
+ name = rb_enc_str_new((const char *) call_node->name, length, encoding);
259
+ } else {
260
+ name = Qnil;
261
+ }
262
+
263
+ VALUE argv[] = { INT2NUM(call_node->group_num), name };
264
+ return rb_class_new_instance(2, argv, rb_cOnigmoCallNode);
265
+ }
266
+ default: {
267
+ RUBY_ASSERT("unknown node type");
268
+ return Qnil;
269
+ }
270
+ }
271
+ }
272
+
273
+ static void
274
+ fail(int result, regex_t *regex, OnigErrorInfo *einfo) {
275
+ OnigUChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
276
+ onig_error_code_to_str(message, result, einfo);
277
+
278
+ onig_free(regex);
279
+ onig_end();
280
+
281
+ rb_raise(rb_eArgError, "%s", message);
282
+ }
283
+
284
+ static VALUE
285
+ parse(VALUE self, VALUE string) {
286
+ const OnigUChar *pattern = (const OnigUChar *) StringValueCStr(string);
287
+ const OnigUChar *pattern_end = pattern + strlen((const char *) pattern);
288
+
289
+ regex_t *regex = calloc(1, sizeof(regex_t));
290
+ if (regex == NULL) {
291
+ rb_raise(rb_eNoMemError, "failed to allocate memory");
292
+ return Qnil;
293
+ }
294
+
295
+ int result;
296
+ OnigEncoding encoding = rb_enc_get(string);
297
+
298
+ if ((result = onig_reg_init(regex, ONIG_OPTION_DEFAULT, ONIGENC_CASE_FOLD_DEFAULT, encoding, ONIG_SYNTAX_DEFAULT)) != ONIG_NORMAL) {
299
+ fail(result, regex, NULL);
300
+ return Qnil;
301
+ }
302
+
303
+ OnigDistance init_size = (pattern_end - pattern) * 2;
304
+ result = BBUF_INIT(regex, init_size);
305
+
306
+ if (result != ONIG_NORMAL) {
307
+ fail(result, regex, NULL);
308
+ return Qnil;
309
+ }
310
+
311
+ Node *root;
312
+ ScanEnv scan_env = { 0 };
313
+
314
+ result = onig_parse_make_tree(&root, pattern, pattern_end, regex, &scan_env);
315
+ if (result != ONIG_NORMAL) {
316
+ fail(result, regex, NULL);
317
+ return Qnil;
318
+ }
319
+
320
+ VALUE node = build_node(root, encoding);
321
+
322
+ onig_node_free(root);
323
+ onig_free(regex);
324
+ onig_end();
325
+
326
+ return node;
327
+ }
328
+
329
+ static VALUE
330
+ read_memnum(const unsigned char **cursor) {
331
+ MemNumType memnum = *((MemNumType *) *cursor);
332
+ *cursor += SIZE_MEMNUM;
333
+ return INT2NUM(memnum);
334
+ }
335
+
336
+ static VALUE
337
+ read_reladdr(const unsigned char **cursor) {
338
+ RelAddrType address;
339
+ GET_RELADDR_INC(address, *cursor);
340
+ return INT2NUM(address);
341
+ }
342
+
343
+ static VALUE
344
+ read_absaddr(const unsigned char **cursor) {
345
+ AbsAddrType address;
346
+ GET_ABSADDR_INC(address, *cursor);
347
+ return INT2NUM(address);
348
+ }
349
+
350
+ static VALUE
351
+ read_exact(const unsigned char **cursor, int length, OnigEncoding encoding) {
352
+ VALUE exact = rb_enc_str_new((const char *) *cursor, length, encoding);
353
+ *cursor += length;
354
+ return exact;
355
+ }
356
+
357
+ static VALUE
358
+ read_length(const unsigned char **cursor) {
359
+ LengthType length;
360
+ GET_LENGTH_INC(length, *cursor);
361
+ return INT2NUM(length);
362
+ }
363
+
364
+ static VALUE
365
+ read_bitset(const unsigned char **cursor, OnigEncoding encoding) {
366
+ VALUE bitset = build_bitset((BitSetRef) (*cursor), encoding);
367
+ *cursor += SIZE_BITSET;
368
+ return bitset;
369
+ }
370
+
371
+ static VALUE
372
+ read_option(const unsigned char **cursor) {
373
+ OnigOptionType option = *((OnigOptionType *) cursor);
374
+ *cursor += SIZE_OPTION;
375
+ return build_options(option);
376
+ }
377
+
378
+ static VALUE
379
+ read_state_check(const unsigned char **cursor) {
380
+ StateCheckNumType state_check = *((StateCheckNumType *) cursor);
381
+ *cursor += SIZE_STATE_CHECK_NUM;
382
+ return INT2NUM(state_check);
383
+ }
384
+
385
+ static VALUE
386
+ read_codepoint(const unsigned char **cursor, LengthType length) {
387
+ const unsigned char *buffer = *cursor;
388
+
389
+ #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
390
+ ALIGNMENT_RIGHT(buffer);
391
+ #endif
392
+
393
+ OnigCodePoint code = *((OnigCodePoint *) buffer);
394
+ *cursor += length;
395
+
396
+ return INT2NUM(code);
397
+ }
398
+
399
+ static VALUE
400
+ compile(VALUE self, VALUE string) {
401
+ const OnigUChar *pattern = (const OnigUChar *) StringValueCStr(string);
402
+
403
+ regex_t *regex;
404
+ OnigErrorInfo einfo;
405
+
406
+ OnigEncoding encoding = rb_enc_get(string);
407
+ int result = onig_new(&regex, pattern, pattern + strlen((const char *) pattern), ONIG_OPTION_DEFAULT, encoding, ONIG_SYNTAX_DEFAULT, &einfo);
408
+
409
+ if (result != ONIG_NORMAL) {
410
+ fail(result, regex, &einfo);
411
+ return Qnil;
412
+ }
413
+
414
+ VALUE insns = rb_ary_new();
415
+ const unsigned char *cursor = regex->p;
416
+ const unsigned char *end = cursor + regex->used;
417
+ LengthType length;
418
+
419
+ while (cursor < end) {
420
+ VALUE insn = rb_ary_new();
421
+
422
+ switch (*cursor++) {
423
+ case OP_FINISH: {
424
+ rb_ary_push(insn, ID2SYM(rb_intern("finish")));
425
+ break;
426
+ }
427
+ case OP_END: {
428
+ rb_ary_push(insn, ID2SYM(rb_intern("end")));
429
+ break;
430
+ }
431
+ case OP_EXACT1: {
432
+ rb_ary_push(insn, ID2SYM(rb_intern("exact1")));
433
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
434
+ break;
435
+ }
436
+ case OP_EXACT2: {
437
+ rb_ary_push(insn, ID2SYM(rb_intern("exact2")));
438
+ rb_ary_push(insn, read_exact(&cursor, 2, encoding));
439
+ break;
440
+ }
441
+ case OP_EXACT3: {
442
+ rb_ary_push(insn, ID2SYM(rb_intern("exact3")));
443
+ rb_ary_push(insn, read_exact(&cursor, 3, encoding));
444
+ break;
445
+ }
446
+ case OP_EXACT4: {
447
+ rb_ary_push(insn, ID2SYM(rb_intern("exact4")));
448
+ rb_ary_push(insn, read_exact(&cursor, 4, encoding));
449
+ break;
450
+ }
451
+ case OP_EXACT5: {
452
+ rb_ary_push(insn, ID2SYM(rb_intern("exact5")));
453
+ rb_ary_push(insn, read_exact(&cursor, 5, encoding));
454
+ break;
455
+ }
456
+ case OP_EXACTN: {
457
+ rb_ary_push(insn, ID2SYM(rb_intern("exactn")));
458
+ rb_ary_push(insn, read_length(&cursor));
459
+ break;
460
+ }
461
+ case OP_EXACTMB2N1: {
462
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n1")));
463
+ rb_ary_push(insn, read_exact(&cursor, 2, encoding));
464
+ break;
465
+ }
466
+ case OP_EXACTMB2N2: {
467
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n2")));
468
+ rb_ary_push(insn, read_exact(&cursor, 4, encoding));
469
+ break;
470
+ }
471
+ case OP_EXACTMB2N3: {
472
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n3")));
473
+ rb_ary_push(insn, read_exact(&cursor, 6, encoding));
474
+ break;
475
+ }
476
+ case OP_EXACTMB2N: {
477
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n")));
478
+
479
+ VALUE length = read_length(&cursor);
480
+ rb_ary_push(insn, length);
481
+
482
+ rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 2, encoding));
483
+ break;
484
+ }
485
+ case OP_EXACTMB3N: {
486
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb3n")));
487
+
488
+ VALUE length = read_length(&cursor);
489
+ rb_ary_push(insn, length);
490
+
491
+ rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 3, encoding));
492
+ break;
493
+ }
494
+ case OP_EXACTMBN: {
495
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmbn")));
496
+
497
+ VALUE length = read_length(&cursor);
498
+ rb_ary_push(insn, length);
499
+
500
+ rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 2, encoding));
501
+ break;
502
+ }
503
+ case OP_EXACT1_IC: {
504
+ rb_ary_push(insn, ID2SYM(rb_intern("exact1_ic")));
505
+ length = enclen(encoding, cursor, end);
506
+ rb_ary_push(insn, read_exact(&cursor, length, encoding));
507
+ break;
508
+ }
509
+ case OP_EXACTN_IC: {
510
+ rb_ary_push(insn, ID2SYM(rb_intern("exactn_ic")));
511
+ length = enclen(encoding, cursor, end);
512
+ rb_ary_push(insn, read_exact(&cursor, length, encoding));
513
+ break;
514
+ }
515
+ case OP_CCLASS: {
516
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass")));
517
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
518
+ break;
519
+ }
520
+ case OP_CCLASS_MB: {
521
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mb")));
522
+
523
+ VALUE length = read_length(&cursor);
524
+ rb_ary_push(insn, length);
525
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
526
+
527
+ break;
528
+ }
529
+ case OP_CCLASS_MB_NOT: {
530
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mb_not")));
531
+
532
+ VALUE length = read_length(&cursor);
533
+ rb_ary_push(insn, length);
534
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
535
+
536
+ break;
537
+ }
538
+ case OP_CCLASS_NOT: {
539
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_not")));
540
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
541
+ break;
542
+ }
543
+ case OP_CCLASS_MIX: {
544
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mix")));
545
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
546
+
547
+ VALUE length = read_length(&cursor);
548
+ rb_ary_push(insn, length);
549
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
550
+
551
+ break;
552
+ }
553
+ case OP_CCLASS_MIX_NOT: {
554
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mix_not")));
555
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
556
+
557
+ VALUE length = read_length(&cursor);
558
+ rb_ary_push(insn, length);
559
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
560
+
561
+ break;
562
+ }
563
+ case OP_ANYCHAR: {
564
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar")));
565
+ break;
566
+ }
567
+ case OP_ANYCHAR_ML: {
568
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml")));
569
+ break;
570
+ }
571
+ case OP_ANYCHAR_STAR: {
572
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_star")));
573
+ break;
574
+ }
575
+ case OP_ANYCHAR_ML_STAR: {
576
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml_star")));
577
+ break;
578
+ }
579
+ case OP_ANYCHAR_STAR_PEEK_NEXT: {
580
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_star_peek_next")));
581
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
582
+ break;
583
+ }
584
+ case OP_ANYCHAR_ML_STAR_PEEK_NEXT: {
585
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml_star_peek_next")));
586
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
587
+ break;
588
+ }
589
+ case OP_WORD: {
590
+ rb_ary_push(insn, ID2SYM(rb_intern("word")));
591
+ break;
592
+ }
593
+ case OP_NOT_WORD: {
594
+ rb_ary_push(insn, ID2SYM(rb_intern("not_word")));
595
+ break;
596
+ }
597
+ case OP_WORD_BOUND: {
598
+ rb_ary_push(insn, ID2SYM(rb_intern("word_bound")));
599
+ break;
600
+ }
601
+ case OP_NOT_WORD_BOUND: {
602
+ rb_ary_push(insn, ID2SYM(rb_intern("not_word_bound")));
603
+ break;
604
+ }
605
+ case OP_WORD_BEGIN: {
606
+ rb_ary_push(insn, ID2SYM(rb_intern("word_begin")));
607
+ break;
608
+ }
609
+ case OP_WORD_END: {
610
+ rb_ary_push(insn, ID2SYM(rb_intern("word_end")));
611
+ break;
612
+ }
613
+ case OP_ASCII_WORD: {
614
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word")));
615
+ break;
616
+ }
617
+ case OP_NOT_ASCII_WORD: {
618
+ rb_ary_push(insn, ID2SYM(rb_intern("not_ascii_word")));
619
+ break;
620
+ }
621
+ case OP_ASCII_WORD_BOUND: {
622
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_bound")));
623
+ break;
624
+ }
625
+ case OP_NOT_ASCII_WORD_BOUND: {
626
+ rb_ary_push(insn, ID2SYM(rb_intern("not_ascii_word_bound")));
627
+ break;
628
+ }
629
+ case OP_ASCII_WORD_BEGIN: {
630
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_begin")));
631
+ break;
632
+ }
633
+ case OP_ASCII_WORD_END: {
634
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_end")));
635
+ break;
636
+ }
637
+ case OP_BEGIN_BUF: {
638
+ rb_ary_push(insn, ID2SYM(rb_intern("begin_buf")));
639
+ break;
640
+ }
641
+ case OP_END_BUF: {
642
+ rb_ary_push(insn, ID2SYM(rb_intern("end_buf")));
643
+ break;
644
+ }
645
+ case OP_BEGIN_LINE: {
646
+ rb_ary_push(insn, ID2SYM(rb_intern("begin_line")));
647
+ break;
648
+ }
649
+ case OP_END_LINE: {
650
+ rb_ary_push(insn, ID2SYM(rb_intern("end_line")));
651
+ break;
652
+ }
653
+ case OP_SEMI_END_BUF: {
654
+ rb_ary_push(insn, ID2SYM(rb_intern("semi_end_buf")));
655
+ break;
656
+ }
657
+ case OP_BEGIN_POSITION: {
658
+ rb_ary_push(insn, ID2SYM(rb_intern("begin_position")));
659
+ break;
660
+ }
661
+ case OP_BACKREF1: {
662
+ rb_ary_push(insn, ID2SYM(rb_intern("backref1")));
663
+ break;
664
+ }
665
+ case OP_BACKREF2: {
666
+ rb_ary_push(insn, ID2SYM(rb_intern("backref2")));
667
+ break;
668
+ }
669
+ case OP_BACKREFN: {
670
+ rb_ary_push(insn, ID2SYM(rb_intern("backrefn")));
671
+ rb_ary_push(insn, read_memnum(&cursor));
672
+ break;
673
+ }
674
+ case OP_BACKREFN_IC: {
675
+ rb_ary_push(insn, ID2SYM(rb_intern("backrefn_ic")));
676
+ rb_ary_push(insn, read_memnum(&cursor));
677
+ break;
678
+ }
679
+ case OP_BACKREF_MULTI: {
680
+ rb_ary_push(insn, ID2SYM(rb_intern("backref_multi")));
681
+
682
+ VALUE length = read_length(&cursor);
683
+ rb_ary_push(insn, length);
684
+
685
+ for (int i = 0; i < NUM2INT(length); i++) {
686
+ rb_ary_push(insn, read_memnum(&cursor));
687
+ }
688
+
689
+ break;
690
+ }
691
+ case OP_BACKREF_MULTI_IC: {
692
+ rb_ary_push(insn, ID2SYM(rb_intern("backref_multi_ic")));
693
+
694
+ VALUE length = read_length(&cursor);
695
+ rb_ary_push(insn, length);
696
+
697
+ for (int i = 0; i < NUM2INT(length); i++) {
698
+ rb_ary_push(insn, read_memnum(&cursor));
699
+ }
700
+
701
+ break;
702
+ }
703
+ case OP_BACKREF_WITH_LEVEL: {
704
+ rb_ary_push(insn, ID2SYM(rb_intern("backref_with_level")));
705
+
706
+ rb_ary_push(insn, read_option(&cursor));
707
+ rb_ary_push(insn, read_length(&cursor));
708
+
709
+ VALUE length = read_length(&cursor);
710
+ rb_ary_push(insn, length);
711
+
712
+ for (int i = 0; i < NUM2INT(length); i++) {
713
+ rb_ary_push(insn, read_memnum(&cursor));
714
+ }
715
+
716
+ break;
717
+ }
718
+ case OP_MEMORY_START:
719
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_start")));
720
+ rb_ary_push(insn, read_memnum(&cursor));
721
+ break;
722
+
723
+ case OP_MEMORY_START_PUSH:
724
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_start_push")));
725
+ rb_ary_push(insn, read_memnum(&cursor));
726
+ break;
727
+
728
+ case OP_MEMORY_END_PUSH:
729
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end_push")));
730
+ rb_ary_push(insn, read_memnum(&cursor));
731
+ break;
732
+
733
+ case OP_MEMORY_END_PUSH_REC:
734
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end_push_rec")));
735
+ rb_ary_push(insn, read_memnum(&cursor));
736
+ break;
737
+
738
+ case OP_MEMORY_END:
739
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end")));
740
+ rb_ary_push(insn, read_memnum(&cursor));
741
+ break;
742
+
743
+ case OP_MEMORY_END_REC:
744
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end_rec")));
745
+ rb_ary_push(insn, read_memnum(&cursor));
746
+ break;
747
+
748
+ case OP_KEEP: {
749
+ rb_ary_push(insn, ID2SYM(rb_intern("keep")));
750
+ break;
751
+ }
752
+ case OP_FAIL: {
753
+ rb_ary_push(insn, ID2SYM(rb_intern("fail")));
754
+ break;
755
+ }
756
+ case OP_JUMP: {
757
+ rb_ary_push(insn, ID2SYM(rb_intern("jump")));
758
+ rb_ary_push(insn, read_reladdr(&cursor));
759
+ break;
760
+ }
761
+ case OP_PUSH: {
762
+ rb_ary_push(insn, ID2SYM(rb_intern("push")));
763
+ rb_ary_push(insn, read_reladdr(&cursor));
764
+ break;
765
+ }
766
+ case OP_POP: {
767
+ rb_ary_push(insn, ID2SYM(rb_intern("pop")));
768
+ break;
769
+ }
770
+ case OP_PUSH_OR_JUMP_EXACT1: {
771
+ rb_ary_push(insn, ID2SYM(rb_intern("push_or_jump_exact1")));
772
+ rb_ary_push(insn, read_reladdr(&cursor));
773
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
774
+ break;
775
+ }
776
+ case OP_PUSH_IF_PEEK_NEXT: {
777
+ rb_ary_push(insn, ID2SYM(rb_intern("push_if_peek_next")));
778
+ rb_ary_push(insn, read_reladdr(&cursor));
779
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
780
+ break;
781
+ }
782
+ case OP_REPEAT: {
783
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat")));
784
+ rb_ary_push(insn, read_memnum(&cursor));
785
+ rb_ary_push(insn, read_reladdr(&cursor));
786
+ break;
787
+ }
788
+ case OP_REPEAT_NG: {
789
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_ng")));
790
+ rb_ary_push(insn, read_memnum(&cursor));
791
+ rb_ary_push(insn, read_reladdr(&cursor));
792
+ break;
793
+ }
794
+ case OP_REPEAT_INC: {
795
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc")));
796
+ rb_ary_push(insn, read_memnum(&cursor));
797
+ break;
798
+ }
799
+ case OP_REPEAT_INC_NG: {
800
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_ng")));
801
+ rb_ary_push(insn, read_memnum(&cursor));
802
+ break;
803
+ }
804
+ case OP_REPEAT_INC_SG: {
805
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_sg")));
806
+ rb_ary_push(insn, read_memnum(&cursor));
807
+ break;
808
+ }
809
+ case OP_REPEAT_INC_NG_SG: {
810
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_ng_sg")));
811
+ rb_ary_push(insn, read_memnum(&cursor));
812
+ break;
813
+ }
814
+ case OP_NULL_CHECK_START: {
815
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_start")));
816
+ rb_ary_push(insn, read_memnum(&cursor));
817
+ break;
818
+ }
819
+ case OP_NULL_CHECK_END: {
820
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_end")));
821
+ rb_ary_push(insn, read_memnum(&cursor));
822
+ break;
823
+ }
824
+ case OP_NULL_CHECK_END_MEMST: {
825
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_end_memst")));
826
+ rb_ary_push(insn, read_memnum(&cursor));
827
+ break;
828
+ }
829
+ case OP_NULL_CHECK_END_MEMST_PUSH: {
830
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_end_memst_push")));
831
+ rb_ary_push(insn, read_memnum(&cursor));
832
+ break;
833
+ }
834
+ case OP_PUSH_POS: {
835
+ rb_ary_push(insn, ID2SYM(rb_intern("push_pos")));
836
+ break;
837
+ }
838
+ case OP_POP_POS: {
839
+ rb_ary_push(insn, ID2SYM(rb_intern("pop_pos")));
840
+ break;
841
+ }
842
+ case OP_PUSH_POS_NOT: {
843
+ rb_ary_push(insn, ID2SYM(rb_intern("push_pos_not")));
844
+ rb_ary_push(insn, read_reladdr(&cursor));
845
+ break;
846
+ }
847
+ case OP_FAIL_POS: {
848
+ rb_ary_push(insn, ID2SYM(rb_intern("fail_pos")));
849
+ break;
850
+ }
851
+ case OP_PUSH_STOP_BT: {
852
+ rb_ary_push(insn, ID2SYM(rb_intern("push_stop_bt")));
853
+ break;
854
+ }
855
+ case OP_POP_STOP_BT: {
856
+ rb_ary_push(insn, ID2SYM(rb_intern("pop_stop_bt")));
857
+ break;
858
+ }
859
+ case OP_LOOK_BEHIND: {
860
+ rb_ary_push(insn, ID2SYM(rb_intern("look_behind")));
861
+ rb_ary_push(insn, read_length(&cursor));
862
+ break;
863
+ }
864
+ case OP_PUSH_LOOK_BEHIND_NOT: {
865
+ rb_ary_push(insn, ID2SYM(rb_intern("push_look_behind_not")));
866
+ rb_ary_push(insn, read_reladdr(&cursor));
867
+ rb_ary_push(insn, read_length(&cursor));
868
+ break;
869
+ }
870
+ case OP_FAIL_LOOK_BEHIND_NOT: {
871
+ rb_ary_push(insn, ID2SYM(rb_intern("fail_look_behind_not")));
872
+ break;
873
+ }
874
+ case OP_PUSH_ABSENT_POS: {
875
+ rb_ary_push(insn, ID2SYM(rb_intern("push_absent_pos")));
876
+ break;
877
+ }
878
+ case OP_ABSENT: {
879
+ rb_ary_push(insn, ID2SYM(rb_intern("absent")));
880
+ rb_ary_push(insn, read_reladdr(&cursor));
881
+ break;
882
+ }
883
+ case OP_ABSENT_END: {
884
+ rb_ary_push(insn, ID2SYM(rb_intern("absent_end")));
885
+ break;
886
+ }
887
+ case OP_CALL: {
888
+ rb_ary_push(insn, ID2SYM(rb_intern("call")));
889
+ rb_ary_push(insn, read_absaddr(&cursor));
890
+ break;
891
+ }
892
+ case OP_RETURN: {
893
+ rb_ary_push(insn, ID2SYM(rb_intern("return")));
894
+ break;
895
+ }
896
+ case OP_CONDITION: {
897
+ rb_ary_push(insn, ID2SYM(rb_intern("condition")));
898
+ rb_ary_push(insn, read_memnum(&cursor));
899
+ rb_ary_push(insn, read_reladdr(&cursor));
900
+ break;
901
+ }
902
+ case OP_STATE_CHECK_PUSH: {
903
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_push")));
904
+ rb_ary_push(insn, read_state_check(&cursor));
905
+ rb_ary_push(insn, read_reladdr(&cursor));
906
+ break;
907
+ }
908
+ case OP_STATE_CHECK_PUSH_OR_JUMP: {
909
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_push_or_jump")));
910
+ rb_ary_push(insn, read_state_check(&cursor));
911
+ rb_ary_push(insn, read_reladdr(&cursor));
912
+ break;
913
+ }
914
+ case OP_STATE_CHECK: {
915
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check")));
916
+ rb_ary_push(insn, read_state_check(&cursor));
917
+ break;
918
+ }
919
+ case OP_STATE_CHECK_ANYCHAR_STAR: {
920
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_anychar_star")));
921
+ rb_ary_push(insn, read_state_check(&cursor));
922
+ break;
923
+ }
924
+ case OP_STATE_CHECK_ANYCHAR_ML_STAR: {
925
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_anychar_ml_star")));
926
+ rb_ary_push(insn, read_state_check(&cursor));
927
+ break;
928
+ }
929
+ case OP_SET_OPTION_PUSH: {
930
+ rb_ary_push(insn, ID2SYM(rb_intern("set_option_push")));
931
+ rb_ary_push(insn, read_option(&cursor));
932
+ break;
933
+ }
934
+ case OP_SET_OPTION: {
935
+ rb_ary_push(insn, ID2SYM(rb_intern("set_option")));
936
+ rb_ary_push(insn, read_option(&cursor));
937
+ break;
938
+ }
939
+ }
940
+
941
+ rb_ary_push(insns, insn);
942
+ }
943
+
944
+ onig_free(regex);
945
+ onig_end();
946
+ return insns;
947
+ }
948
+
949
+ void
950
+ Init_onigmo(void) {
951
+ VALUE rb_cOnigmo = rb_define_module("Onigmo");
952
+ rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
953
+ rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
954
+
955
+ rb_cOnigmoNode = rb_define_class_under(rb_cOnigmo, "Node", rb_cObject);
956
+ rb_cOnigmoAlternationNode = rb_define_class_under(rb_cOnigmo, "AlternationNode", rb_cOnigmoNode);
957
+ rb_cOnigmoAnchorBufferBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferBeginNode", rb_cOnigmoNode);
958
+ rb_cOnigmoAnchorBufferEndNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferEndNode", rb_cOnigmoNode);
959
+ rb_cOnigmoAnchorKeepNode = rb_define_class_under(rb_cOnigmo, "AnchorKeepNode", rb_cOnigmoNode);
960
+ rb_cOnigmoAnchorLineBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorLineBeginNode", rb_cOnigmoNode);
961
+ rb_cOnigmoAnchorLineEndNode = rb_define_class_under(rb_cOnigmo, "AnchorLineEndNode", rb_cOnigmoNode);
962
+ rb_cOnigmoAnchorPositionBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorPositionBeginNode", rb_cOnigmoNode);
963
+ rb_cOnigmoAnchorSemiEndNode = rb_define_class_under(rb_cOnigmo, "AnchorSemiEndNode", rb_cOnigmoNode);
964
+ rb_cOnigmoAnchorWordBoundaryNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryNode", rb_cOnigmoNode);
965
+ rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryInvertNode", rb_cOnigmoNode);
966
+ rb_cOnigmoAnyNode = rb_define_class_under(rb_cOnigmo, "AnyNode", rb_cOnigmoNode);
967
+ rb_cOnigmoBackrefNode = rb_define_class_under(rb_cOnigmo, "BackrefNode", rb_cOnigmoNode);
968
+ rb_cOnigmoCallNode = rb_define_class_under(rb_cOnigmo, "CallNode", rb_cOnigmoNode);
969
+ rb_cOnigmoCClassNode = rb_define_class_under(rb_cOnigmo, "CClassNode", rb_cOnigmoNode);
970
+ rb_cOnigmoCClassInvertNode = rb_define_class_under(rb_cOnigmo, "CClassInvertNode", rb_cOnigmoNode);
971
+ rb_cOnigmoEncloseAbsentNode = rb_define_class_under(rb_cOnigmo, "EncloseAbsentNode", rb_cOnigmoNode);
972
+ rb_cOnigmoEncloseConditionNode = rb_define_class_under(rb_cOnigmo, "EncloseConditionNode", rb_cOnigmoNode);
973
+ rb_cOnigmoEncloseMemoryNode = rb_define_class_under(rb_cOnigmo, "EncloseMemoryNode", rb_cOnigmoNode);
974
+ rb_cOnigmoEncloseOptionsNode = rb_define_class_under(rb_cOnigmo, "EncloseOptionsNode", rb_cOnigmoNode);
975
+ rb_cOnigmoEncloseStopBacktrackNode = rb_define_class_under(rb_cOnigmo, "EncloseStopBacktrackNode", rb_cOnigmoNode);
976
+ rb_cOnigmoListNode = rb_define_class_under(rb_cOnigmo, "ListNode", rb_cOnigmoNode);
977
+ rb_cOnigmoLookAheadNode = rb_define_class_under(rb_cOnigmo, "LookAheadNode", rb_cOnigmoNode);
978
+ rb_cOnigmoLookAheadInvertNode = rb_define_class_under(rb_cOnigmo, "LookAheadInvertNode", rb_cOnigmoNode);
979
+ rb_cOnigmoLookBehindNode = rb_define_class_under(rb_cOnigmo, "LookBehindNode", rb_cOnigmoNode);
980
+ rb_cOnigmoLookBehindInvertNode = rb_define_class_under(rb_cOnigmo, "LookBehindInvertNode", rb_cOnigmoNode);
981
+ rb_cOnigmoQuantifierNode = rb_define_class_under(rb_cOnigmo, "QuantifierNode", rb_cOnigmoNode);
982
+ rb_cOnigmoStringNode = rb_define_class_under(rb_cOnigmo, "StringNode", rb_cOnigmoNode);
983
+ rb_cOnigmoWordNode = rb_define_class_under(rb_cOnigmo, "WordNode", rb_cOnigmoNode);
984
+ rb_cOnigmoWordInvertNode = rb_define_class_under(rb_cOnigmo, "WordInvertNode", rb_cOnigmoNode);
985
+ }