onigmo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,985 @@
1
+ #include <ruby.h>
2
+ #include <ruby/onigmo.h>
3
+ #include <ruby/encoding.h>
4
+
5
+ #include "regint.h"
6
+ #include "regparse.h"
7
+
8
+ VALUE rb_cOnigmoNode;
9
+ VALUE rb_cOnigmoAlternationNode;
10
+ VALUE rb_cOnigmoAnchorBufferBeginNode;
11
+ VALUE rb_cOnigmoAnchorBufferEndNode;
12
+ VALUE rb_cOnigmoAnchorKeepNode;
13
+ VALUE rb_cOnigmoAnchorLineBeginNode;
14
+ VALUE rb_cOnigmoAnchorLineEndNode;
15
+ VALUE rb_cOnigmoAnchorPositionBeginNode;
16
+ VALUE rb_cOnigmoAnchorSemiEndNode;
17
+ VALUE rb_cOnigmoAnchorWordBoundaryNode;
18
+ VALUE rb_cOnigmoAnchorWordBoundaryInvertNode;
19
+ VALUE rb_cOnigmoAnyNode;
20
+ VALUE rb_cOnigmoBackrefNode;
21
+ VALUE rb_cOnigmoCallNode;
22
+ VALUE rb_cOnigmoCClassNode;
23
+ VALUE rb_cOnigmoCClassInvertNode;
24
+ VALUE rb_cOnigmoEncloseAbsentNode;
25
+ VALUE rb_cOnigmoEncloseConditionNode;
26
+ VALUE rb_cOnigmoEncloseMemoryNode;
27
+ VALUE rb_cOnigmoEncloseOptionsNode;
28
+ VALUE rb_cOnigmoEncloseStopBacktrackNode;
29
+ VALUE rb_cOnigmoListNode;
30
+ VALUE rb_cOnigmoLookAheadNode;
31
+ VALUE rb_cOnigmoLookAheadInvertNode;
32
+ VALUE rb_cOnigmoLookBehindNode;
33
+ VALUE rb_cOnigmoLookBehindInvertNode;
34
+ VALUE rb_cOnigmoQuantifierNode;
35
+ VALUE rb_cOnigmoStringNode;
36
+ VALUE rb_cOnigmoWordNode;
37
+ VALUE rb_cOnigmoWordInvertNode;
38
+
39
+ static VALUE
40
+ build_options(OnigOptionType option) {
41
+ VALUE options = rb_ary_new();
42
+
43
+ if (option & ONIG_OPTION_NONE) rb_ary_push(options, ID2SYM(rb_intern("none")));
44
+ if (option & ONIG_OPTION_IGNORECASE) rb_ary_push(options, ID2SYM(rb_intern("ignorecase")));
45
+ if (option & ONIG_OPTION_EXTEND) rb_ary_push(options, ID2SYM(rb_intern("extend")));
46
+ if (option & ONIG_OPTION_MULTILINE) rb_ary_push(options, ID2SYM(rb_intern("multiline")));
47
+ if (option & ONIG_OPTION_DOTALL) rb_ary_push(options, ID2SYM(rb_intern("dotall")));
48
+ if (option & ONIG_OPTION_SINGLELINE) rb_ary_push(options, ID2SYM(rb_intern("singleline")));
49
+ if (option & ONIG_OPTION_FIND_LONGEST) rb_ary_push(options, ID2SYM(rb_intern("find_longest")));
50
+ if (option & ONIG_OPTION_FIND_NOT_EMPTY) rb_ary_push(options, ID2SYM(rb_intern("find_not_empty")));
51
+ if (option & ONIG_OPTION_NEGATE_SINGLELINE) rb_ary_push(options, ID2SYM(rb_intern("negate_singleline")));
52
+ if (option & ONIG_OPTION_DONT_CAPTURE_GROUP) rb_ary_push(options, ID2SYM(rb_intern("dont_capture_group")));
53
+ if (option & ONIG_OPTION_CAPTURE_GROUP) rb_ary_push(options, ID2SYM(rb_intern("capture_group")));
54
+ if (option & ONIG_OPTION_NOTBOL) rb_ary_push(options, ID2SYM(rb_intern("not_bol")));
55
+ if (option & ONIG_OPTION_NOTEOL) rb_ary_push(options, ID2SYM(rb_intern("not_eol")));
56
+ if (option & ONIG_OPTION_NOTBOS) rb_ary_push(options, ID2SYM(rb_intern("not_bos")));
57
+ if (option & ONIG_OPTION_NOTEOS) rb_ary_push(options, ID2SYM(rb_intern("not_eos")));
58
+ if (option & ONIG_OPTION_ASCII_RANGE) rb_ary_push(options, ID2SYM(rb_intern("ascii_range")));
59
+ if (option & ONIG_OPTION_POSIX_BRACKET_ALL_RANGE) rb_ary_push(options, ID2SYM(rb_intern("posix_bracket_all_range")));
60
+ if (option & ONIG_OPTION_WORD_BOUND_ALL_RANGE) rb_ary_push(options, ID2SYM(rb_intern("word_bound_all_range")));
61
+ if (option & ONIG_OPTION_NEWLINE_CRLF) rb_ary_push(options, ID2SYM(rb_intern("newline_crlf")));
62
+
63
+ return options;
64
+ }
65
+
66
+ static VALUE
67
+ build_bitset(BitSetRef ref, OnigEncoding encoding) {
68
+ VALUE values = rb_ary_new();
69
+
70
+ for (int index = 0; index < SINGLE_BYTE_SIZE; index++) {
71
+ if (BITSET_AT(ref, index) != 0) {
72
+ const char character = (const char) index;
73
+ rb_ary_push(values, rb_enc_str_new(&character, 1, encoding));
74
+ }
75
+ }
76
+
77
+ return values;
78
+ }
79
+
80
+ static VALUE
81
+ build_node(Node *node, OnigEncoding encoding) {
82
+ int type = NTYPE(node);
83
+
84
+ switch (type) {
85
+ case NT_STR: {
86
+ VALUE value = rb_enc_str_new((const char *) NSTR(node)->s, NSTR(node)->end - NSTR(node)->s, encoding);
87
+ VALUE argv[] = { value };
88
+ return rb_class_new_instance(1, argv, rb_cOnigmoStringNode);
89
+ }
90
+ case NT_CCLASS: {
91
+ CClassNode* cclass_node = NCCLASS(node);
92
+ VALUE values = build_bitset(cclass_node->bs, encoding);
93
+
94
+ if (cclass_node->mbuf != NULL) {
95
+ BBuf *bbuf = cclass_node->mbuf;
96
+ OnigCodePoint *data = (OnigCodePoint *) bbuf->p;
97
+ OnigCodePoint *end = (OnigCodePoint *) (bbuf->p + bbuf->used);
98
+
99
+ for (++data; data < end; data += 2) {
100
+ for (OnigCodePoint code = data[0]; code < data[1]; code++) {
101
+ rb_ary_push(values, INT2NUM(code));
102
+ }
103
+ }
104
+ }
105
+
106
+ VALUE argv[] = { values };
107
+ if (IS_NCCLASS_NOT(cclass_node)) {
108
+ return rb_class_new_instance(1, argv, rb_cOnigmoCClassInvertNode);
109
+ } else {
110
+ return rb_class_new_instance(1, argv, rb_cOnigmoCClassNode);
111
+ }
112
+ }
113
+ case NT_CTYPE: {
114
+ if (NCTYPE(node)->ctype == ONIGENC_CTYPE_WORD) {
115
+ if (NCTYPE(node)->not == 0) {
116
+ return rb_class_new_instance(0, NULL, rb_cOnigmoWordNode);
117
+ } else {
118
+ return rb_class_new_instance(0, NULL, rb_cOnigmoWordInvertNode);
119
+ }
120
+ } else {
121
+ RUBY_ASSERT("unknown ctype");
122
+ return Qnil;
123
+ }
124
+ }
125
+ case NT_CANY: {
126
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnyNode);
127
+ }
128
+ case NT_BREF: {
129
+ BRefNode *backref_node = NBREF(node);
130
+ int *backrefs = BACKREFS_P(backref_node);
131
+
132
+ VALUE values = rb_ary_new();
133
+ for (int index = 0; index < backref_node->back_num; index++) {
134
+ rb_ary_push(values, INT2NUM(backrefs[index]));
135
+ }
136
+
137
+ VALUE argv[] = { values };
138
+ return rb_class_new_instance(1, argv, rb_cOnigmoBackrefNode);
139
+ }
140
+ case NT_QTFR: {
141
+ int lower = NQTFR(node)->lower;
142
+ int upper = NQTFR(node)->upper;
143
+
144
+ VALUE argv[] = {
145
+ lower == -1 ? Qnil : INT2NUM(lower),
146
+ upper = -1 ? Qnil : INT2NUM(upper),
147
+ (NQTFR(node)->greedy ? Qtrue : Qfalse),
148
+ build_node(NQTFR(node)->target, encoding)
149
+ };
150
+
151
+ return rb_class_new_instance(4, argv, rb_cOnigmoQuantifierNode);
152
+ }
153
+ case NT_ENCLOSE: {
154
+ VALUE target = build_node(NENCLOSE(node)->target, encoding);
155
+
156
+ switch (NENCLOSE(node)->type) {
157
+ case ENCLOSE_OPTION: {
158
+ VALUE argv[] = { build_options(NENCLOSE(node)->option), target };
159
+ return rb_class_new_instance(2, argv, rb_cOnigmoEncloseOptionsNode);
160
+ }
161
+ case ENCLOSE_MEMORY: {
162
+ VALUE argv[] = { INT2NUM(NENCLOSE(node)->regnum), target };
163
+ return rb_class_new_instance(2, argv, rb_cOnigmoEncloseMemoryNode);
164
+ }
165
+ case ENCLOSE_STOP_BACKTRACK: {
166
+ VALUE argv[] = { target };
167
+ return rb_class_new_instance(1, argv, rb_cOnigmoEncloseStopBacktrackNode);
168
+ }
169
+ case ENCLOSE_CONDITION: {
170
+ VALUE argv[] = { INT2NUM(NENCLOSE(node)->regnum), target };
171
+ return rb_class_new_instance(2, argv, rb_cOnigmoEncloseConditionNode);
172
+ }
173
+ case ENCLOSE_ABSENT: {
174
+ VALUE argv[] = { target };
175
+ return rb_class_new_instance(1, argv, rb_cOnigmoEncloseAbsentNode);
176
+ }
177
+ default:
178
+ RUBY_ASSERT("unknown enclose type");
179
+ return Qnil;
180
+ }
181
+ }
182
+ case NT_ANCHOR: {
183
+ switch (NANCHOR(node)->type) {
184
+ case ANCHOR_BEGIN_BUF:
185
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorBufferBeginNode);
186
+ case ANCHOR_END_BUF:
187
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorBufferEndNode);
188
+ case ANCHOR_BEGIN_LINE:
189
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorLineBeginNode);
190
+ case ANCHOR_END_LINE:
191
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorLineEndNode);
192
+ case ANCHOR_SEMI_END_BUF:
193
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorSemiEndNode);
194
+ case ANCHOR_BEGIN_POSITION:
195
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorPositionBeginNode);
196
+ case ANCHOR_WORD_BOUND:
197
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorWordBoundaryNode);
198
+ case ANCHOR_NOT_WORD_BOUND:
199
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorWordBoundaryInvertNode);
200
+ case ANCHOR_PREC_READ: {
201
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
202
+ VALUE argv[] = { target };
203
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookAheadNode);
204
+ }
205
+ case ANCHOR_PREC_READ_NOT: {
206
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
207
+ VALUE argv[] = { target };
208
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookAheadInvertNode);
209
+ }
210
+ case ANCHOR_LOOK_BEHIND: {
211
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
212
+ VALUE argv[] = { target };
213
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookBehindNode);
214
+ }
215
+ case ANCHOR_LOOK_BEHIND_NOT: {
216
+ VALUE target = build_node(NANCHOR(node)->target, encoding);
217
+ VALUE argv[] = { target };
218
+ return rb_class_new_instance(1, argv, rb_cOnigmoLookBehindInvertNode);
219
+ }
220
+ case ANCHOR_KEEP:
221
+ return rb_class_new_instance(0, NULL, rb_cOnigmoAnchorKeepNode);
222
+ default:
223
+ RUBY_ASSERT("unknown anchor type");
224
+ return Qnil;
225
+ }
226
+ }
227
+ case NT_LIST: {
228
+ VALUE nodes = rb_ary_new();
229
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
230
+
231
+ while (IS_NOT_NULL(node = NCDR(node))) {
232
+ RUBY_ASSERT(NTYPE(node) == type);
233
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
234
+ }
235
+
236
+ VALUE argv[] = { nodes };
237
+ return rb_class_new_instance(1, argv, rb_cOnigmoListNode);
238
+ }
239
+ case NT_ALT: {
240
+ VALUE nodes = rb_ary_new();
241
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
242
+
243
+ while (IS_NOT_NULL(node = NCDR(node))) {
244
+ RUBY_ASSERT(NTYPE(node) == type);
245
+ rb_ary_push(nodes, build_node(NCAR(node), encoding));
246
+ }
247
+
248
+ VALUE argv[] = { nodes };
249
+ return rb_class_new_instance(1, argv, rb_cOnigmoAlternationNode);
250
+ }
251
+ case NT_CALL: {
252
+ CallNode *call_node = NCALL(node);
253
+
254
+ VALUE name;
255
+ ptrdiff_t length = call_node->name_end - call_node->name;
256
+
257
+ if (length > 0) {
258
+ name = rb_enc_str_new((const char *) call_node->name, length, encoding);
259
+ } else {
260
+ name = Qnil;
261
+ }
262
+
263
+ VALUE argv[] = { INT2NUM(call_node->group_num), name };
264
+ return rb_class_new_instance(2, argv, rb_cOnigmoCallNode);
265
+ }
266
+ default: {
267
+ RUBY_ASSERT("unknown node type");
268
+ return Qnil;
269
+ }
270
+ }
271
+ }
272
+
273
+ static void
274
+ fail(int result, regex_t *regex, OnigErrorInfo *einfo) {
275
+ OnigUChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
276
+ onig_error_code_to_str(message, result, einfo);
277
+
278
+ onig_free(regex);
279
+ onig_end();
280
+
281
+ rb_raise(rb_eArgError, "%s", message);
282
+ }
283
+
284
+ static VALUE
285
+ parse(VALUE self, VALUE string) {
286
+ const OnigUChar *pattern = (const OnigUChar *) StringValueCStr(string);
287
+ const OnigUChar *pattern_end = pattern + strlen((const char *) pattern);
288
+
289
+ regex_t *regex = calloc(1, sizeof(regex_t));
290
+ if (regex == NULL) {
291
+ rb_raise(rb_eNoMemError, "failed to allocate memory");
292
+ return Qnil;
293
+ }
294
+
295
+ int result;
296
+ OnigEncoding encoding = rb_enc_get(string);
297
+
298
+ if ((result = onig_reg_init(regex, ONIG_OPTION_DEFAULT, ONIGENC_CASE_FOLD_DEFAULT, encoding, ONIG_SYNTAX_DEFAULT)) != ONIG_NORMAL) {
299
+ fail(result, regex, NULL);
300
+ return Qnil;
301
+ }
302
+
303
+ OnigDistance init_size = (pattern_end - pattern) * 2;
304
+ result = BBUF_INIT(regex, init_size);
305
+
306
+ if (result != ONIG_NORMAL) {
307
+ fail(result, regex, NULL);
308
+ return Qnil;
309
+ }
310
+
311
+ Node *root;
312
+ ScanEnv scan_env = { 0 };
313
+
314
+ result = onig_parse_make_tree(&root, pattern, pattern_end, regex, &scan_env);
315
+ if (result != ONIG_NORMAL) {
316
+ fail(result, regex, NULL);
317
+ return Qnil;
318
+ }
319
+
320
+ VALUE node = build_node(root, encoding);
321
+
322
+ onig_node_free(root);
323
+ onig_free(regex);
324
+ onig_end();
325
+
326
+ return node;
327
+ }
328
+
329
+ static VALUE
330
+ read_memnum(const unsigned char **cursor) {
331
+ MemNumType memnum = *((MemNumType *) *cursor);
332
+ *cursor += SIZE_MEMNUM;
333
+ return INT2NUM(memnum);
334
+ }
335
+
336
+ static VALUE
337
+ read_reladdr(const unsigned char **cursor) {
338
+ RelAddrType address;
339
+ GET_RELADDR_INC(address, *cursor);
340
+ return INT2NUM(address);
341
+ }
342
+
343
+ static VALUE
344
+ read_absaddr(const unsigned char **cursor) {
345
+ AbsAddrType address;
346
+ GET_ABSADDR_INC(address, *cursor);
347
+ return INT2NUM(address);
348
+ }
349
+
350
+ static VALUE
351
+ read_exact(const unsigned char **cursor, int length, OnigEncoding encoding) {
352
+ VALUE exact = rb_enc_str_new((const char *) *cursor, length, encoding);
353
+ *cursor += length;
354
+ return exact;
355
+ }
356
+
357
+ static VALUE
358
+ read_length(const unsigned char **cursor) {
359
+ LengthType length;
360
+ GET_LENGTH_INC(length, *cursor);
361
+ return INT2NUM(length);
362
+ }
363
+
364
+ static VALUE
365
+ read_bitset(const unsigned char **cursor, OnigEncoding encoding) {
366
+ VALUE bitset = build_bitset((BitSetRef) (*cursor), encoding);
367
+ *cursor += SIZE_BITSET;
368
+ return bitset;
369
+ }
370
+
371
+ static VALUE
372
+ read_option(const unsigned char **cursor) {
373
+ OnigOptionType option = *((OnigOptionType *) cursor);
374
+ *cursor += SIZE_OPTION;
375
+ return build_options(option);
376
+ }
377
+
378
+ static VALUE
379
+ read_state_check(const unsigned char **cursor) {
380
+ StateCheckNumType state_check = *((StateCheckNumType *) cursor);
381
+ *cursor += SIZE_STATE_CHECK_NUM;
382
+ return INT2NUM(state_check);
383
+ }
384
+
385
+ static VALUE
386
+ read_codepoint(const unsigned char **cursor, LengthType length) {
387
+ const unsigned char *buffer = *cursor;
388
+
389
+ #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
390
+ ALIGNMENT_RIGHT(buffer);
391
+ #endif
392
+
393
+ OnigCodePoint code = *((OnigCodePoint *) buffer);
394
+ *cursor += length;
395
+
396
+ return INT2NUM(code);
397
+ }
398
+
399
+ static VALUE
400
+ compile(VALUE self, VALUE string) {
401
+ const OnigUChar *pattern = (const OnigUChar *) StringValueCStr(string);
402
+
403
+ regex_t *regex;
404
+ OnigErrorInfo einfo;
405
+
406
+ OnigEncoding encoding = rb_enc_get(string);
407
+ int result = onig_new(&regex, pattern, pattern + strlen((const char *) pattern), ONIG_OPTION_DEFAULT, encoding, ONIG_SYNTAX_DEFAULT, &einfo);
408
+
409
+ if (result != ONIG_NORMAL) {
410
+ fail(result, regex, &einfo);
411
+ return Qnil;
412
+ }
413
+
414
+ VALUE insns = rb_ary_new();
415
+ const unsigned char *cursor = regex->p;
416
+ const unsigned char *end = cursor + regex->used;
417
+ LengthType length;
418
+
419
+ while (cursor < end) {
420
+ VALUE insn = rb_ary_new();
421
+
422
+ switch (*cursor++) {
423
+ case OP_FINISH: {
424
+ rb_ary_push(insn, ID2SYM(rb_intern("finish")));
425
+ break;
426
+ }
427
+ case OP_END: {
428
+ rb_ary_push(insn, ID2SYM(rb_intern("end")));
429
+ break;
430
+ }
431
+ case OP_EXACT1: {
432
+ rb_ary_push(insn, ID2SYM(rb_intern("exact1")));
433
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
434
+ break;
435
+ }
436
+ case OP_EXACT2: {
437
+ rb_ary_push(insn, ID2SYM(rb_intern("exact2")));
438
+ rb_ary_push(insn, read_exact(&cursor, 2, encoding));
439
+ break;
440
+ }
441
+ case OP_EXACT3: {
442
+ rb_ary_push(insn, ID2SYM(rb_intern("exact3")));
443
+ rb_ary_push(insn, read_exact(&cursor, 3, encoding));
444
+ break;
445
+ }
446
+ case OP_EXACT4: {
447
+ rb_ary_push(insn, ID2SYM(rb_intern("exact4")));
448
+ rb_ary_push(insn, read_exact(&cursor, 4, encoding));
449
+ break;
450
+ }
451
+ case OP_EXACT5: {
452
+ rb_ary_push(insn, ID2SYM(rb_intern("exact5")));
453
+ rb_ary_push(insn, read_exact(&cursor, 5, encoding));
454
+ break;
455
+ }
456
+ case OP_EXACTN: {
457
+ rb_ary_push(insn, ID2SYM(rb_intern("exactn")));
458
+ rb_ary_push(insn, read_length(&cursor));
459
+ break;
460
+ }
461
+ case OP_EXACTMB2N1: {
462
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n1")));
463
+ rb_ary_push(insn, read_exact(&cursor, 2, encoding));
464
+ break;
465
+ }
466
+ case OP_EXACTMB2N2: {
467
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n2")));
468
+ rb_ary_push(insn, read_exact(&cursor, 4, encoding));
469
+ break;
470
+ }
471
+ case OP_EXACTMB2N3: {
472
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n3")));
473
+ rb_ary_push(insn, read_exact(&cursor, 6, encoding));
474
+ break;
475
+ }
476
+ case OP_EXACTMB2N: {
477
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb2n")));
478
+
479
+ VALUE length = read_length(&cursor);
480
+ rb_ary_push(insn, length);
481
+
482
+ rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 2, encoding));
483
+ break;
484
+ }
485
+ case OP_EXACTMB3N: {
486
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmb3n")));
487
+
488
+ VALUE length = read_length(&cursor);
489
+ rb_ary_push(insn, length);
490
+
491
+ rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 3, encoding));
492
+ break;
493
+ }
494
+ case OP_EXACTMBN: {
495
+ rb_ary_push(insn, ID2SYM(rb_intern("exactmbn")));
496
+
497
+ VALUE length = read_length(&cursor);
498
+ rb_ary_push(insn, length);
499
+
500
+ rb_ary_push(insn, read_exact(&cursor, NUM2INT(length) * 2, encoding));
501
+ break;
502
+ }
503
+ case OP_EXACT1_IC: {
504
+ rb_ary_push(insn, ID2SYM(rb_intern("exact1_ic")));
505
+ length = enclen(encoding, cursor, end);
506
+ rb_ary_push(insn, read_exact(&cursor, length, encoding));
507
+ break;
508
+ }
509
+ case OP_EXACTN_IC: {
510
+ rb_ary_push(insn, ID2SYM(rb_intern("exactn_ic")));
511
+ length = enclen(encoding, cursor, end);
512
+ rb_ary_push(insn, read_exact(&cursor, length, encoding));
513
+ break;
514
+ }
515
+ case OP_CCLASS: {
516
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass")));
517
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
518
+ break;
519
+ }
520
+ case OP_CCLASS_MB: {
521
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mb")));
522
+
523
+ VALUE length = read_length(&cursor);
524
+ rb_ary_push(insn, length);
525
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
526
+
527
+ break;
528
+ }
529
+ case OP_CCLASS_MB_NOT: {
530
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mb_not")));
531
+
532
+ VALUE length = read_length(&cursor);
533
+ rb_ary_push(insn, length);
534
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
535
+
536
+ break;
537
+ }
538
+ case OP_CCLASS_NOT: {
539
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_not")));
540
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
541
+ break;
542
+ }
543
+ case OP_CCLASS_MIX: {
544
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mix")));
545
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
546
+
547
+ VALUE length = read_length(&cursor);
548
+ rb_ary_push(insn, length);
549
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
550
+
551
+ break;
552
+ }
553
+ case OP_CCLASS_MIX_NOT: {
554
+ rb_ary_push(insn, ID2SYM(rb_intern("cclass_mix_not")));
555
+ rb_ary_push(insn, read_bitset(&cursor, encoding));
556
+
557
+ VALUE length = read_length(&cursor);
558
+ rb_ary_push(insn, length);
559
+ rb_ary_push(insn, read_codepoint(&cursor, NUM2INT(length)));
560
+
561
+ break;
562
+ }
563
+ case OP_ANYCHAR: {
564
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar")));
565
+ break;
566
+ }
567
+ case OP_ANYCHAR_ML: {
568
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml")));
569
+ break;
570
+ }
571
+ case OP_ANYCHAR_STAR: {
572
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_star")));
573
+ break;
574
+ }
575
+ case OP_ANYCHAR_ML_STAR: {
576
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml_star")));
577
+ break;
578
+ }
579
+ case OP_ANYCHAR_STAR_PEEK_NEXT: {
580
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_star_peek_next")));
581
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
582
+ break;
583
+ }
584
+ case OP_ANYCHAR_ML_STAR_PEEK_NEXT: {
585
+ rb_ary_push(insn, ID2SYM(rb_intern("anychar_ml_star_peek_next")));
586
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
587
+ break;
588
+ }
589
+ case OP_WORD: {
590
+ rb_ary_push(insn, ID2SYM(rb_intern("word")));
591
+ break;
592
+ }
593
+ case OP_NOT_WORD: {
594
+ rb_ary_push(insn, ID2SYM(rb_intern("not_word")));
595
+ break;
596
+ }
597
+ case OP_WORD_BOUND: {
598
+ rb_ary_push(insn, ID2SYM(rb_intern("word_bound")));
599
+ break;
600
+ }
601
+ case OP_NOT_WORD_BOUND: {
602
+ rb_ary_push(insn, ID2SYM(rb_intern("not_word_bound")));
603
+ break;
604
+ }
605
+ case OP_WORD_BEGIN: {
606
+ rb_ary_push(insn, ID2SYM(rb_intern("word_begin")));
607
+ break;
608
+ }
609
+ case OP_WORD_END: {
610
+ rb_ary_push(insn, ID2SYM(rb_intern("word_end")));
611
+ break;
612
+ }
613
+ case OP_ASCII_WORD: {
614
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word")));
615
+ break;
616
+ }
617
+ case OP_NOT_ASCII_WORD: {
618
+ rb_ary_push(insn, ID2SYM(rb_intern("not_ascii_word")));
619
+ break;
620
+ }
621
+ case OP_ASCII_WORD_BOUND: {
622
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_bound")));
623
+ break;
624
+ }
625
+ case OP_NOT_ASCII_WORD_BOUND: {
626
+ rb_ary_push(insn, ID2SYM(rb_intern("not_ascii_word_bound")));
627
+ break;
628
+ }
629
+ case OP_ASCII_WORD_BEGIN: {
630
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_begin")));
631
+ break;
632
+ }
633
+ case OP_ASCII_WORD_END: {
634
+ rb_ary_push(insn, ID2SYM(rb_intern("ascii_word_end")));
635
+ break;
636
+ }
637
+ case OP_BEGIN_BUF: {
638
+ rb_ary_push(insn, ID2SYM(rb_intern("begin_buf")));
639
+ break;
640
+ }
641
+ case OP_END_BUF: {
642
+ rb_ary_push(insn, ID2SYM(rb_intern("end_buf")));
643
+ break;
644
+ }
645
+ case OP_BEGIN_LINE: {
646
+ rb_ary_push(insn, ID2SYM(rb_intern("begin_line")));
647
+ break;
648
+ }
649
+ case OP_END_LINE: {
650
+ rb_ary_push(insn, ID2SYM(rb_intern("end_line")));
651
+ break;
652
+ }
653
+ case OP_SEMI_END_BUF: {
654
+ rb_ary_push(insn, ID2SYM(rb_intern("semi_end_buf")));
655
+ break;
656
+ }
657
+ case OP_BEGIN_POSITION: {
658
+ rb_ary_push(insn, ID2SYM(rb_intern("begin_position")));
659
+ break;
660
+ }
661
+ case OP_BACKREF1: {
662
+ rb_ary_push(insn, ID2SYM(rb_intern("backref1")));
663
+ break;
664
+ }
665
+ case OP_BACKREF2: {
666
+ rb_ary_push(insn, ID2SYM(rb_intern("backref2")));
667
+ break;
668
+ }
669
+ case OP_BACKREFN: {
670
+ rb_ary_push(insn, ID2SYM(rb_intern("backrefn")));
671
+ rb_ary_push(insn, read_memnum(&cursor));
672
+ break;
673
+ }
674
+ case OP_BACKREFN_IC: {
675
+ rb_ary_push(insn, ID2SYM(rb_intern("backrefn_ic")));
676
+ rb_ary_push(insn, read_memnum(&cursor));
677
+ break;
678
+ }
679
+ case OP_BACKREF_MULTI: {
680
+ rb_ary_push(insn, ID2SYM(rb_intern("backref_multi")));
681
+
682
+ VALUE length = read_length(&cursor);
683
+ rb_ary_push(insn, length);
684
+
685
+ for (int i = 0; i < NUM2INT(length); i++) {
686
+ rb_ary_push(insn, read_memnum(&cursor));
687
+ }
688
+
689
+ break;
690
+ }
691
+ case OP_BACKREF_MULTI_IC: {
692
+ rb_ary_push(insn, ID2SYM(rb_intern("backref_multi_ic")));
693
+
694
+ VALUE length = read_length(&cursor);
695
+ rb_ary_push(insn, length);
696
+
697
+ for (int i = 0; i < NUM2INT(length); i++) {
698
+ rb_ary_push(insn, read_memnum(&cursor));
699
+ }
700
+
701
+ break;
702
+ }
703
+ case OP_BACKREF_WITH_LEVEL: {
704
+ rb_ary_push(insn, ID2SYM(rb_intern("backref_with_level")));
705
+
706
+ rb_ary_push(insn, read_option(&cursor));
707
+ rb_ary_push(insn, read_length(&cursor));
708
+
709
+ VALUE length = read_length(&cursor);
710
+ rb_ary_push(insn, length);
711
+
712
+ for (int i = 0; i < NUM2INT(length); i++) {
713
+ rb_ary_push(insn, read_memnum(&cursor));
714
+ }
715
+
716
+ break;
717
+ }
718
+ case OP_MEMORY_START:
719
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_start")));
720
+ rb_ary_push(insn, read_memnum(&cursor));
721
+ break;
722
+
723
+ case OP_MEMORY_START_PUSH:
724
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_start_push")));
725
+ rb_ary_push(insn, read_memnum(&cursor));
726
+ break;
727
+
728
+ case OP_MEMORY_END_PUSH:
729
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end_push")));
730
+ rb_ary_push(insn, read_memnum(&cursor));
731
+ break;
732
+
733
+ case OP_MEMORY_END_PUSH_REC:
734
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end_push_rec")));
735
+ rb_ary_push(insn, read_memnum(&cursor));
736
+ break;
737
+
738
+ case OP_MEMORY_END:
739
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end")));
740
+ rb_ary_push(insn, read_memnum(&cursor));
741
+ break;
742
+
743
+ case OP_MEMORY_END_REC:
744
+ rb_ary_push(insn, ID2SYM(rb_intern("memory_end_rec")));
745
+ rb_ary_push(insn, read_memnum(&cursor));
746
+ break;
747
+
748
+ case OP_KEEP: {
749
+ rb_ary_push(insn, ID2SYM(rb_intern("keep")));
750
+ break;
751
+ }
752
+ case OP_FAIL: {
753
+ rb_ary_push(insn, ID2SYM(rb_intern("fail")));
754
+ break;
755
+ }
756
+ case OP_JUMP: {
757
+ rb_ary_push(insn, ID2SYM(rb_intern("jump")));
758
+ rb_ary_push(insn, read_reladdr(&cursor));
759
+ break;
760
+ }
761
+ case OP_PUSH: {
762
+ rb_ary_push(insn, ID2SYM(rb_intern("push")));
763
+ rb_ary_push(insn, read_reladdr(&cursor));
764
+ break;
765
+ }
766
+ case OP_POP: {
767
+ rb_ary_push(insn, ID2SYM(rb_intern("pop")));
768
+ break;
769
+ }
770
+ case OP_PUSH_OR_JUMP_EXACT1: {
771
+ rb_ary_push(insn, ID2SYM(rb_intern("push_or_jump_exact1")));
772
+ rb_ary_push(insn, read_reladdr(&cursor));
773
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
774
+ break;
775
+ }
776
+ case OP_PUSH_IF_PEEK_NEXT: {
777
+ rb_ary_push(insn, ID2SYM(rb_intern("push_if_peek_next")));
778
+ rb_ary_push(insn, read_reladdr(&cursor));
779
+ rb_ary_push(insn, read_exact(&cursor, 1, encoding));
780
+ break;
781
+ }
782
+ case OP_REPEAT: {
783
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat")));
784
+ rb_ary_push(insn, read_memnum(&cursor));
785
+ rb_ary_push(insn, read_reladdr(&cursor));
786
+ break;
787
+ }
788
+ case OP_REPEAT_NG: {
789
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_ng")));
790
+ rb_ary_push(insn, read_memnum(&cursor));
791
+ rb_ary_push(insn, read_reladdr(&cursor));
792
+ break;
793
+ }
794
+ case OP_REPEAT_INC: {
795
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc")));
796
+ rb_ary_push(insn, read_memnum(&cursor));
797
+ break;
798
+ }
799
+ case OP_REPEAT_INC_NG: {
800
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_ng")));
801
+ rb_ary_push(insn, read_memnum(&cursor));
802
+ break;
803
+ }
804
+ case OP_REPEAT_INC_SG: {
805
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_sg")));
806
+ rb_ary_push(insn, read_memnum(&cursor));
807
+ break;
808
+ }
809
+ case OP_REPEAT_INC_NG_SG: {
810
+ rb_ary_push(insn, ID2SYM(rb_intern("repeat_inc_ng_sg")));
811
+ rb_ary_push(insn, read_memnum(&cursor));
812
+ break;
813
+ }
814
+ case OP_NULL_CHECK_START: {
815
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_start")));
816
+ rb_ary_push(insn, read_memnum(&cursor));
817
+ break;
818
+ }
819
+ case OP_NULL_CHECK_END: {
820
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_end")));
821
+ rb_ary_push(insn, read_memnum(&cursor));
822
+ break;
823
+ }
824
+ case OP_NULL_CHECK_END_MEMST: {
825
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_end_memst")));
826
+ rb_ary_push(insn, read_memnum(&cursor));
827
+ break;
828
+ }
829
+ case OP_NULL_CHECK_END_MEMST_PUSH: {
830
+ rb_ary_push(insn, ID2SYM(rb_intern("null_check_end_memst_push")));
831
+ rb_ary_push(insn, read_memnum(&cursor));
832
+ break;
833
+ }
834
+ case OP_PUSH_POS: {
835
+ rb_ary_push(insn, ID2SYM(rb_intern("push_pos")));
836
+ break;
837
+ }
838
+ case OP_POP_POS: {
839
+ rb_ary_push(insn, ID2SYM(rb_intern("pop_pos")));
840
+ break;
841
+ }
842
+ case OP_PUSH_POS_NOT: {
843
+ rb_ary_push(insn, ID2SYM(rb_intern("push_pos_not")));
844
+ rb_ary_push(insn, read_reladdr(&cursor));
845
+ break;
846
+ }
847
+ case OP_FAIL_POS: {
848
+ rb_ary_push(insn, ID2SYM(rb_intern("fail_pos")));
849
+ break;
850
+ }
851
+ case OP_PUSH_STOP_BT: {
852
+ rb_ary_push(insn, ID2SYM(rb_intern("push_stop_bt")));
853
+ break;
854
+ }
855
+ case OP_POP_STOP_BT: {
856
+ rb_ary_push(insn, ID2SYM(rb_intern("pop_stop_bt")));
857
+ break;
858
+ }
859
+ case OP_LOOK_BEHIND: {
860
+ rb_ary_push(insn, ID2SYM(rb_intern("look_behind")));
861
+ rb_ary_push(insn, read_length(&cursor));
862
+ break;
863
+ }
864
+ case OP_PUSH_LOOK_BEHIND_NOT: {
865
+ rb_ary_push(insn, ID2SYM(rb_intern("push_look_behind_not")));
866
+ rb_ary_push(insn, read_reladdr(&cursor));
867
+ rb_ary_push(insn, read_length(&cursor));
868
+ break;
869
+ }
870
+ case OP_FAIL_LOOK_BEHIND_NOT: {
871
+ rb_ary_push(insn, ID2SYM(rb_intern("fail_look_behind_not")));
872
+ break;
873
+ }
874
+ case OP_PUSH_ABSENT_POS: {
875
+ rb_ary_push(insn, ID2SYM(rb_intern("push_absent_pos")));
876
+ break;
877
+ }
878
+ case OP_ABSENT: {
879
+ rb_ary_push(insn, ID2SYM(rb_intern("absent")));
880
+ rb_ary_push(insn, read_reladdr(&cursor));
881
+ break;
882
+ }
883
+ case OP_ABSENT_END: {
884
+ rb_ary_push(insn, ID2SYM(rb_intern("absent_end")));
885
+ break;
886
+ }
887
+ case OP_CALL: {
888
+ rb_ary_push(insn, ID2SYM(rb_intern("call")));
889
+ rb_ary_push(insn, read_absaddr(&cursor));
890
+ break;
891
+ }
892
+ case OP_RETURN: {
893
+ rb_ary_push(insn, ID2SYM(rb_intern("return")));
894
+ break;
895
+ }
896
+ case OP_CONDITION: {
897
+ rb_ary_push(insn, ID2SYM(rb_intern("condition")));
898
+ rb_ary_push(insn, read_memnum(&cursor));
899
+ rb_ary_push(insn, read_reladdr(&cursor));
900
+ break;
901
+ }
902
+ case OP_STATE_CHECK_PUSH: {
903
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_push")));
904
+ rb_ary_push(insn, read_state_check(&cursor));
905
+ rb_ary_push(insn, read_reladdr(&cursor));
906
+ break;
907
+ }
908
+ case OP_STATE_CHECK_PUSH_OR_JUMP: {
909
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_push_or_jump")));
910
+ rb_ary_push(insn, read_state_check(&cursor));
911
+ rb_ary_push(insn, read_reladdr(&cursor));
912
+ break;
913
+ }
914
+ case OP_STATE_CHECK: {
915
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check")));
916
+ rb_ary_push(insn, read_state_check(&cursor));
917
+ break;
918
+ }
919
+ case OP_STATE_CHECK_ANYCHAR_STAR: {
920
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_anychar_star")));
921
+ rb_ary_push(insn, read_state_check(&cursor));
922
+ break;
923
+ }
924
+ case OP_STATE_CHECK_ANYCHAR_ML_STAR: {
925
+ rb_ary_push(insn, ID2SYM(rb_intern("state_check_anychar_ml_star")));
926
+ rb_ary_push(insn, read_state_check(&cursor));
927
+ break;
928
+ }
929
+ case OP_SET_OPTION_PUSH: {
930
+ rb_ary_push(insn, ID2SYM(rb_intern("set_option_push")));
931
+ rb_ary_push(insn, read_option(&cursor));
932
+ break;
933
+ }
934
+ case OP_SET_OPTION: {
935
+ rb_ary_push(insn, ID2SYM(rb_intern("set_option")));
936
+ rb_ary_push(insn, read_option(&cursor));
937
+ break;
938
+ }
939
+ }
940
+
941
+ rb_ary_push(insns, insn);
942
+ }
943
+
944
+ onig_free(regex);
945
+ onig_end();
946
+ return insns;
947
+ }
948
+
949
+ void
950
+ Init_onigmo(void) {
951
+ VALUE rb_cOnigmo = rb_define_module("Onigmo");
952
+ rb_define_singleton_method(rb_cOnigmo, "parse", parse, 1);
953
+ rb_define_singleton_method(rb_cOnigmo, "compile", compile, 1);
954
+
955
+ rb_cOnigmoNode = rb_define_class_under(rb_cOnigmo, "Node", rb_cObject);
956
+ rb_cOnigmoAlternationNode = rb_define_class_under(rb_cOnigmo, "AlternationNode", rb_cOnigmoNode);
957
+ rb_cOnigmoAnchorBufferBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferBeginNode", rb_cOnigmoNode);
958
+ rb_cOnigmoAnchorBufferEndNode = rb_define_class_under(rb_cOnigmo, "AnchorBufferEndNode", rb_cOnigmoNode);
959
+ rb_cOnigmoAnchorKeepNode = rb_define_class_under(rb_cOnigmo, "AnchorKeepNode", rb_cOnigmoNode);
960
+ rb_cOnigmoAnchorLineBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorLineBeginNode", rb_cOnigmoNode);
961
+ rb_cOnigmoAnchorLineEndNode = rb_define_class_under(rb_cOnigmo, "AnchorLineEndNode", rb_cOnigmoNode);
962
+ rb_cOnigmoAnchorPositionBeginNode = rb_define_class_under(rb_cOnigmo, "AnchorPositionBeginNode", rb_cOnigmoNode);
963
+ rb_cOnigmoAnchorSemiEndNode = rb_define_class_under(rb_cOnigmo, "AnchorSemiEndNode", rb_cOnigmoNode);
964
+ rb_cOnigmoAnchorWordBoundaryNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryNode", rb_cOnigmoNode);
965
+ rb_cOnigmoAnchorWordBoundaryInvertNode = rb_define_class_under(rb_cOnigmo, "AnchorWordBoundaryInvertNode", rb_cOnigmoNode);
966
+ rb_cOnigmoAnyNode = rb_define_class_under(rb_cOnigmo, "AnyNode", rb_cOnigmoNode);
967
+ rb_cOnigmoBackrefNode = rb_define_class_under(rb_cOnigmo, "BackrefNode", rb_cOnigmoNode);
968
+ rb_cOnigmoCallNode = rb_define_class_under(rb_cOnigmo, "CallNode", rb_cOnigmoNode);
969
+ rb_cOnigmoCClassNode = rb_define_class_under(rb_cOnigmo, "CClassNode", rb_cOnigmoNode);
970
+ rb_cOnigmoCClassInvertNode = rb_define_class_under(rb_cOnigmo, "CClassInvertNode", rb_cOnigmoNode);
971
+ rb_cOnigmoEncloseAbsentNode = rb_define_class_under(rb_cOnigmo, "EncloseAbsentNode", rb_cOnigmoNode);
972
+ rb_cOnigmoEncloseConditionNode = rb_define_class_under(rb_cOnigmo, "EncloseConditionNode", rb_cOnigmoNode);
973
+ rb_cOnigmoEncloseMemoryNode = rb_define_class_under(rb_cOnigmo, "EncloseMemoryNode", rb_cOnigmoNode);
974
+ rb_cOnigmoEncloseOptionsNode = rb_define_class_under(rb_cOnigmo, "EncloseOptionsNode", rb_cOnigmoNode);
975
+ rb_cOnigmoEncloseStopBacktrackNode = rb_define_class_under(rb_cOnigmo, "EncloseStopBacktrackNode", rb_cOnigmoNode);
976
+ rb_cOnigmoListNode = rb_define_class_under(rb_cOnigmo, "ListNode", rb_cOnigmoNode);
977
+ rb_cOnigmoLookAheadNode = rb_define_class_under(rb_cOnigmo, "LookAheadNode", rb_cOnigmoNode);
978
+ rb_cOnigmoLookAheadInvertNode = rb_define_class_under(rb_cOnigmo, "LookAheadInvertNode", rb_cOnigmoNode);
979
+ rb_cOnigmoLookBehindNode = rb_define_class_under(rb_cOnigmo, "LookBehindNode", rb_cOnigmoNode);
980
+ rb_cOnigmoLookBehindInvertNode = rb_define_class_under(rb_cOnigmo, "LookBehindInvertNode", rb_cOnigmoNode);
981
+ rb_cOnigmoQuantifierNode = rb_define_class_under(rb_cOnigmo, "QuantifierNode", rb_cOnigmoNode);
982
+ rb_cOnigmoStringNode = rb_define_class_under(rb_cOnigmo, "StringNode", rb_cOnigmoNode);
983
+ rb_cOnigmoWordNode = rb_define_class_under(rb_cOnigmo, "WordNode", rb_cOnigmoNode);
984
+ rb_cOnigmoWordInvertNode = rb_define_class_under(rb_cOnigmo, "WordInvertNode", rb_cOnigmoNode);
985
+ }