wikitext 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/ext/ary.h ADDED
@@ -0,0 +1,99 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include <ruby/ruby.h>
16
+
17
// A minimal growable stack of ints.
typedef struct
{
    int count;      // number of entries currently in use
    int max;        // number of entries that fit in the current allocation
    int *entries;   // backing storage for the entries
} ary_t;
23
+
24
+ // in the test suite array count goes no higher than 25 or 26
25
+ #define DEFAULT_ENTRY_COUNT 64
26
+
27
// true when item is INT_MAX, the sentinel that ary_entry() returns for an out-of-range index
// (the argument is parenthesized so that a compound expression is compared as a whole)
#define NO_ITEM(item) ((item) == INT_MAX)
28
+
29
+ inline ary_t *ary_new(void)
30
+ {
31
+ ary_t *ary = ALLOC_N(ary_t, 1);
32
+ ary->count = 0;
33
+ ary->max = DEFAULT_ENTRY_COUNT;
34
+ ary->entries = ALLOC_N(int, DEFAULT_ENTRY_COUNT);
35
+ return ary;
36
+ }
37
+
38
+ inline void ary_free(ary_t *ary)
39
+ {
40
+ free(ary->entries);
41
+ free(ary);
42
+ }
43
+
44
+ inline int ary_entry(ary_t *ary, int idx)
45
+ {
46
+ if (idx < 0)
47
+ idx = ary->count + idx;
48
+ return (idx >= 0 && ary->count > idx) ? ary->entries[idx] : INT_MAX;
49
+ }
50
+
51
+ inline void ary_clear(ary_t *ary)
52
+ {
53
+ ary->count = 0;
54
+ }
55
+
56
+ inline int ary_pop(ary_t *ary)
57
+ {
58
+ if (ary->count > 0)
59
+ {
60
+ ary->count--;
61
+ return 1;
62
+ }
63
+ return 0;
64
+ }
65
+
66
+ inline void ary_push(ary_t *ary, int val)
67
+ {
68
+ if (ary->count == ary->max)
69
+ {
70
+ ary->max += DEFAULT_ENTRY_COUNT;
71
+ REALLOC_N(ary->entries, int, ary->max);
72
+ }
73
+ ary->entries[ary->count] = val;
74
+ ary->count++;
75
+ }
76
+
77
+ inline int ary_includes(ary_t *ary, int val)
78
+ {
79
+ for (int i = 0, max = ary->count; i < max; i++)
80
+ {
81
+ if (ary->entries[i] == val)
82
+ return 1;
83
+ }
84
+ return 0;
85
+ }
86
+
87
+ // returns a count indicating the number of times the value appears in the collection
88
+ // refactored from _Wikitext_count()
89
+ inline int ary_count(ary_t *ary, int item)
90
+ {
91
+ int count = 0;
92
+ for (int i = 0, max = ary->count; i < max; i++)
93
+ {
94
+ if (ary->entries[i] == item)
95
+ count++;
96
+ }
97
+ return count;
98
+ }
99
+
data/ext/depend ADDED
@@ -0,0 +1,22 @@
1
+ # depend
2
+ # Additional material for Makefile
3
+ # Copyright 2008 Wincent Colaiuta
4
+ # This program is free software: you can redistribute it and/or modify
5
+ # it under the terms of the GNU General Public License as published by
6
+ # the Free Software Foundation, either version 3 of the License, or
7
+ # (at your option) any later version.
8
+ #
9
+ # This program is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ # GNU General Public License for more details.
13
+ #
14
+ # You should have received a copy of the GNU General Public License
15
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
+
17
+ CFLAGS += -std=gnu99
18
+
19
+ parser.o : ary.h parser.c parser.h token.h str.h wikitext.h wikitext_ragel.h
20
+ token.o : token.c token.h wikitext.h
21
+ wikitext.o : parser.h token.h wikitext.c wikitext.h wikitext_ragel.h
22
+ wikitext_ragel.o : token.h wikitext.h wikitext_ragel.h wikitext_ragel.c
data/ext/extconf.rb ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright 2008 Wincent Colaiuta
2
+ # This program is free software: you can redistribute it and/or modify
3
+ # it under the terms of the GNU General Public License as published by
4
+ # the Free Software Foundation, either version 3 of the License, or
5
+ # (at your option) any later version.
6
+ #
7
+ # This program is distributed in the hope that it will be useful,
8
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ # GNU General Public License for more details.
11
+ #
12
+ # You should have received a copy of the GNU General Public License
13
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ require 'mkmf'
16
+
17
# Reports a required build prerequisite that could not be found and
# stops configuration with exit status 1.
def missing(item)
  puts "couldn't find #{item} (required)"
  exit 1
end

# ruby.h is mandatory; only generate the Makefile once it has been found.
missing('ruby.h') unless have_header('ruby.h')
create_makefile('wikitext')
data/ext/parser.c ADDED
@@ -0,0 +1,2174 @@
1
+ // Copyright 2007-2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include "parser.h"
16
+ #include "ary.h"
17
+ #include "str.h"
18
+ #include "wikitext.h"
19
+ #include "wikitext_ragel.h"
20
+
21
+ #define IN(type) ary_includes(parser->scope, type)
22
+
23
+ // poor man's object orientation in C:
24
+ // instead of parsing around multiple parameters between functions in the parser
25
+ // we pack everything into a struct and pass around only a pointer to that
26
+ typedef struct
27
+ {
28
+ VALUE output; // for accumulating output to be returned
29
+ VALUE capture; // for capturing substrings
30
+ VALUE link_target; // short term "memory" for parsing links
31
+ VALUE link_text; // short term "memory" for parsing links
32
+ VALUE external_link_class; // CSS class applied to external links
33
+ ary_t *scope; // stack for tracking scope
34
+ ary_t *line; // stack for tracking scope as implied by current line
35
+ ary_t *line_buffer; // stack for tracking raw tokens (not scope) on current line
36
+ VALUE pending_crlf; // boolean (Qtrue or Qfalse)
37
+ VALUE autolink; // boolean (Qtrue or Qfalse)
38
+ VALUE treat_slash_as_special; // boolean (Qtrue or Qfalse)
39
+ VALUE special_link; // boolean (Qtrue or Qfalse): is the current link_target a "special" link?
40
+ str_t *line_ending;
41
+ int base_indent; // controlled by the :indent option to Wikitext::Parser#parse
42
+ int current_indent; // fluctuates according to currently nested structures
43
+ str_t *tabulation; // caching buffer for emitting indentation
44
+ } parser_t;
45
+
46
// String constants used by the parser. Each is an array (not a pointer) so that "sizeof(name) - 1" yields the length
// of the text without its trailing NUL.

// <nowiki> tags, in HTML-escaped form
const char escaped_no_wiki_start[]      = "&lt;nowiki&gt;";
const char escaped_no_wiki_end[]        = "&lt;/nowiki&gt;";

// wikitext emphasis markers, as they appear literally in the input
const char literal_strong_em[]          = "'''''";
const char literal_strong[]             = "'''";
const char literal_em[]                 = "''";

// inline HTML tags, in HTML-escaped form
const char escaped_em_start[]           = "&lt;em&gt;";
const char escaped_em_end[]             = "&lt;/em&gt;";
const char escaped_strong_start[]       = "&lt;strong&gt;";
const char escaped_strong_end[]         = "&lt;/strong&gt;";
const char escaped_tt_start[]           = "&lt;tt&gt;";
const char escaped_tt_end[]             = "&lt;/tt&gt;";

// wikitext heading markers, as they appear literally in the input
const char literal_h6[]                 = "======";
const char literal_h5[]                 = "=====";
const char literal_h4[]                 = "====";
const char literal_h3[]                 = "===";
const char literal_h2[]                 = "==";
const char literal_h1[]                 = "=";

// <pre> and <blockquote>, as real tags and in HTML-escaped form
const char pre_start[]                  = "<pre>";
const char pre_end[]                    = "</pre>";
const char escaped_pre_start[]          = "&lt;pre&gt;";
const char escaped_pre_end[]            = "&lt;/pre&gt;";
const char blockquote_start[]           = "<blockquote>";
const char blockquote_end[]             = "</blockquote>";
const char escaped_blockquote_start[]   = "&lt;blockquote&gt;";
const char escaped_blockquote_end[]     = "&lt;/blockquote&gt;";

// inline HTML tags
const char strong_em_start[]            = "<strong><em>";
const char strong_start[]               = "<strong>";
const char strong_end[]                 = "</strong>";
const char em_start[]                   = "<em>";
const char em_end[]                     = "</em>";
const char tt_start[]                   = "<tt>";
const char tt_end[]                     = "</tt>";

// list tags
const char ol_start[]                   = "<ol>";
const char ol_end[]                     = "</ol>";
const char ul_start[]                   = "<ul>";
const char ul_end[]                     = "</ul>";
const char li_start[]                   = "<li>";
const char li_end[]                     = "</li>";

// heading tags
const char h6_start[]                   = "<h6>";
const char h6_end[]                     = "</h6>";
const char h5_start[]                   = "<h5>";
const char h5_end[]                     = "</h5>";
const char h4_start[]                   = "<h4>";
const char h4_end[]                     = "</h4>";
const char h3_start[]                   = "<h3>";
const char h3_end[]                     = "</h3>";
const char h2_start[]                   = "<h2>";
const char h2_end[]                     = "</h2>";
const char h1_start[]                   = "<h1>";
const char h1_end[]                     = "</h1>";

// paragraphs and whitespace
const char p_start[]                    = "<p>";
const char p_end[]                      = "</p>";
const char space[]                      = " ";

// pieces from which an <a> element is assembled
const char a_start[]                    = "<a href=\"";
const char a_class[]                    = "\" class=\"";
const char a_start_close[]              = "\">";
const char a_end[]                      = "</a>";

// link syntax and assorted punctuation, as they appear literally in the input
const char link_start[]                 = "[[";
const char link_end[]                   = "]]";
const char separator[]                  = "|";
const char ext_link_start[]             = "[";
const char backtick[]                   = "`";
const char quote[]                      = "\"";
const char ampersand[]                  = "&";

// HTML entities
const char quot_entity[]                = "&quot;";
const char amp_entity[]                 = "&amp;";
const char lt_entity[]                  = "&lt;";
const char gt_entity[]                  = "&gt;";
const char escaped_blockquote[]         = "&gt; ";

const char ext_link_end[]               = "]";
116
+
117
+ // for testing and debugging only
118
+ VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
119
+ {
120
+ if (NIL_P(string))
121
+ return Qnil;
122
+ string = StringValue(string);
123
+ VALUE tokens = rb_ary_new();
124
+ char *p = RSTRING_PTR(string);
125
+ long len = RSTRING_LEN(string);
126
+ char *pe = p + len;
127
+ token_t token;
128
+ next_token(&token, NULL, p, pe);
129
+ rb_ary_push(tokens, _Wikitext_token(&token));
130
+ while (token.type != END_OF_FILE)
131
+ {
132
+ next_token(&token, &token, NULL, pe);
133
+ rb_ary_push(tokens, _Wikitext_token(&token));
134
+ }
135
+ return tokens;
136
+ }
137
+
138
+ // for benchmarking raw tokenization speed only
139
+ VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
140
+ {
141
+ if (NIL_P(string))
142
+ return Qnil;
143
+ string = StringValue(string);
144
+ char *p = RSTRING_PTR(string);
145
+ long len = RSTRING_LEN(string);
146
+ char *pe = p + len;
147
+ token_t token;
148
+ next_token(&token, NULL, p, pe);
149
+ while (token.type != END_OF_FILE)
150
+ next_token(&token, &token, NULL, pe);
151
+ return Qnil;
152
+ }
153
+
154
+ // we downcase "in place", overwriting the original contents of the buffer and returning the same string
155
+ inline VALUE _Wikitext_downcase(VALUE string)
156
+ {
157
+ char *ptr = RSTRING_PTR(string);
158
+ long len = RSTRING_LEN(string);
159
+ for (long i = 0; i < len; i++)
160
+ {
161
+ if (ptr[i] >= 'A' && ptr[i] <= 'Z')
162
+ ptr[i] += 32;
163
+ }
164
+ return string;
165
+ }
166
+
167
+ inline VALUE _Wikitext_hyperlink(VALUE link_prefix, VALUE link_target, VALUE link_text, VALUE link_class)
168
+ {
169
+ VALUE string = rb_str_new(a_start, sizeof(a_start) - 1); // <a href="
170
+ if (!NIL_P(link_prefix))
171
+ rb_str_append(string, link_prefix);
172
+ rb_str_append(string, link_target);
173
+ if (link_class != Qnil)
174
+ {
175
+ rb_str_cat(string, a_class, sizeof(a_class) - 1); // " class="
176
+ rb_str_append(string, link_class);
177
+ }
178
+ rb_str_cat(string, a_start_close, sizeof(a_start_close) - 1); // ">
179
+ rb_str_append(string, link_text);
180
+ rb_str_cat(string, a_end, sizeof(a_end) - 1);
181
+ return string;
182
+ }
183
+
184
+ // will emit indentation only if we are about to emit any of:
185
+ // <blockquote>, <p>, <ul>, <ol>, <li>, <h1> etc, <pre>
186
+ // each time we enter one of those spans must ++ the indentation level
187
+ inline void _Wikitext_indent(parser_t *parser)
188
+ {
189
+ int space_count = parser->current_indent + parser->base_indent;
190
+ if (space_count > 0)
191
+ {
192
+ char *old_end, *new_end;
193
+ if (!parser->tabulation)
194
+ {
195
+ parser->tabulation = str_new_size(space_count);
196
+ old_end = parser->tabulation->ptr;
197
+ }
198
+ else if (parser->tabulation->len < space_count)
199
+ {
200
+ old_end = parser->tabulation->ptr;
201
+ str_grow(parser->tabulation, space_count);
202
+ }
203
+ else
204
+ old_end = parser->tabulation->ptr;
205
+ new_end = parser->tabulation->ptr + space_count;
206
+ while (old_end < new_end)
207
+ *old_end++ = ' ';
208
+ rb_str_cat(parser->output, parser->tabulation->ptr, space_count);
209
+ }
210
+ parser->current_indent += 2;
211
+ }
212
+
213
+ inline void _Wikitext_dedent(parser_t *parser, VALUE emit)
214
+ {
215
+ parser->current_indent -= 2;
216
+ if (emit != Qtrue)
217
+ return;
218
+ int space_count = parser->current_indent + parser->base_indent;
219
+ if (space_count > 0)
220
+ rb_str_cat(parser->output, parser->tabulation->ptr, space_count);
221
+ }
222
+
223
+ // Pops a single item off the parser's scope stack.
224
+ // A corresponding closing tag is written to the target string.
225
+ // The target string may be the main output buffer, or a substring capturing buffer if a link is being scanned.
226
+ void _Wikitext_pop_from_stack(parser_t *parser, VALUE target)
227
+ {
228
+ int top = ary_entry(parser->scope, -1);
229
+ if (NO_ITEM(top))
230
+ return;
231
+ if (NIL_P(target))
232
+ target = parser->output;
233
+ switch (top)
234
+ {
235
+ case PRE:
236
+ case PRE_START:
237
+ rb_str_cat(target, pre_end, sizeof(pre_end) - 1);
238
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
239
+ _Wikitext_dedent(parser, Qfalse);
240
+ break;
241
+
242
+ case BLOCKQUOTE:
243
+ case BLOCKQUOTE_START:
244
+ _Wikitext_dedent(parser, Qtrue);
245
+ rb_str_cat(target, blockquote_end, sizeof(blockquote_end) - 1);
246
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
247
+ break;
248
+
249
+ case NO_WIKI_START:
250
+ // not a real HTML tag; so nothing to pop
251
+ break;
252
+
253
+ case STRONG:
254
+ case STRONG_START:
255
+ rb_str_cat(target, strong_end, sizeof(strong_end) - 1);
256
+ break;
257
+
258
+ case EM:
259
+ case EM_START:
260
+ rb_str_cat(target, em_end, sizeof(em_end) - 1);
261
+ break;
262
+
263
+ case TT:
264
+ case TT_START:
265
+ rb_str_cat(target, tt_end, sizeof(tt_end) - 1);
266
+ break;
267
+
268
+ case OL:
269
+ _Wikitext_dedent(parser, Qtrue);
270
+ rb_str_cat(target, ol_end, sizeof(ol_end) - 1);
271
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
272
+ break;
273
+
274
+ case UL:
275
+ _Wikitext_dedent(parser, Qtrue);
276
+ rb_str_cat(target, ul_end, sizeof(ul_end) - 1);
277
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
278
+ break;
279
+
280
+ case NESTED_LIST:
281
+ // next token to pop will be a LI
282
+ // LI is an interesting token because sometimes we want it to behave like P (ie. do a non-emitting indent)
283
+ // and other times we want it to behave like BLOCKQUOTE (ie. when it has a nested list inside)
284
+ // hence this hack: we do an emitting dedent on behalf of the LI that we know must be coming
285
+ // and then when we pop the actual LI itself (below) we do the standard non-emitting indent
286
+ _Wikitext_dedent(parser, Qtrue); // we really only want to emit the spaces
287
+ parser->current_indent += 2; // we don't want to decrement the actual indent level, so put it back
288
+ break;
289
+
290
+ case LI:
291
+ rb_str_cat(target, li_end, sizeof(li_end) - 1);
292
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
293
+ _Wikitext_dedent(parser, Qfalse);
294
+ break;
295
+
296
+ case H6_START:
297
+ rb_str_cat(target, h6_end, sizeof(h6_end) - 1);
298
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
299
+ _Wikitext_dedent(parser, Qfalse);
300
+ break;
301
+
302
+ case H5_START:
303
+ rb_str_cat(target, h5_end, sizeof(h5_end) - 1);
304
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
305
+ _Wikitext_dedent(parser, Qfalse);
306
+ break;
307
+
308
+ case H4_START:
309
+ rb_str_cat(target, h4_end, sizeof(h4_end) - 1);
310
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
311
+ _Wikitext_dedent(parser, Qfalse);
312
+ break;
313
+
314
+ case H3_START:
315
+ rb_str_cat(target, h3_end, sizeof(h3_end) - 1);
316
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
317
+ _Wikitext_dedent(parser, Qfalse);
318
+ break;
319
+
320
+ case H2_START:
321
+ rb_str_cat(target, h2_end, sizeof(h2_end) - 1);
322
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
323
+ _Wikitext_dedent(parser, Qfalse);
324
+ break;
325
+
326
+ case H1_START:
327
+ rb_str_cat(target, h1_end, sizeof(h1_end) - 1);
328
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
329
+ _Wikitext_dedent(parser, Qfalse);
330
+ break;
331
+
332
+ case LINK_START:
333
+ // not an HTML tag; so nothing to emit
334
+ break;
335
+
336
+ case EXT_LINK_START:
337
+ // not an HTML tag; so nothing to emit
338
+ break;
339
+
340
+ case SPACE:
341
+ // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
342
+ break;
343
+
344
+ case SEPARATOR:
345
+ // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
346
+ break;
347
+
348
+ case P:
349
+ rb_str_cat(target, p_end, sizeof(p_end) - 1);
350
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
351
+ _Wikitext_dedent(parser, Qfalse);
352
+ break;
353
+
354
+ case END_OF_FILE:
355
+ // nothing to do
356
+ break;
357
+
358
+ default:
359
+ // should probably raise an exception here
360
+ break;
361
+ }
362
+ ary_pop(parser->scope);
363
+ }
364
+
365
+ // Pops items off the top of parser's scope stack, accumulating closing tags for them into the target string, until item is reached.
366
+ // If including is Qtrue then the item itself is also popped.
367
+ // The target string may be the main output buffer, or a substring capturing buffer when scanning links.
368
+ void _Wikitext_pop_from_stack_up_to(parser_t *parser, VALUE target, int item, VALUE including)
369
+ {
370
+ int continue_looping = 1;
371
+ do
372
+ {
373
+ int top = ary_entry(parser->scope, -1);
374
+ if (NO_ITEM(top))
375
+ return;
376
+ if (top == item)
377
+ {
378
+ if (including != Qtrue)
379
+ return;
380
+ continue_looping = 0;
381
+ }
382
+ _Wikitext_pop_from_stack(parser, target);
383
+ } while (continue_looping);
384
+ }
385
+
386
+ inline void _Wikitext_start_para_if_necessary(parser_t *parser)
387
+ {
388
+ if (!NIL_P(parser->capture)) // we don't do anything if in capturing mode
389
+ return;
390
+
391
+ // if no block open yet, or top of stack is BLOCKQUOTE/BLOCKQUOTE_START (with nothing in it yet)
392
+ if (parser->scope->count == 0 ||
393
+ ary_entry(parser->scope, -1) == BLOCKQUOTE ||
394
+ ary_entry(parser->scope, -1) == BLOCKQUOTE_START)
395
+ {
396
+ _Wikitext_indent(parser);
397
+ rb_str_cat(parser->output, p_start, sizeof(p_start) - 1);
398
+ ary_push(parser->scope, P);
399
+ ary_push(parser->line, P);
400
+ }
401
+ else if (parser->pending_crlf == Qtrue)
402
+ {
403
+ if (IN(P))
404
+ // already in a paragraph block; convert pending CRLF into a space
405
+ rb_str_cat(parser->output, space, sizeof(space) - 1);
406
+ else if (IN(PRE))
407
+ // PRE blocks can have pending CRLF too (helps us avoid emitting the trailing newline)
408
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
409
+ }
410
+ parser->pending_crlf = Qfalse;
411
+ }
412
+
413
+ // Helper function that pops any excess elements off scope (pushing is already handled in the respective rules).
414
+ // For example, given input like:
415
+ //
416
+ // > > foo
417
+ // bar
418
+ //
419
+ // Upon seeing "bar", we want to pop two BLOCKQUOTE elements from the scope.
420
+ // The reverse case (shown below) is handled from inside the BLOCKQUOTE rule itself:
421
+ //
422
+ // foo
423
+ // > > bar
424
+ //
425
+ // Things are made slightly more complicated by the fact that there is one block-level tag that can be on the scope
426
+ // but not on the line scope:
427
+ //
428
+ // <blockquote>foo
429
+ // bar</blockquote>
430
+ //
431
+ // Here on seeing "bar" we have one item on the scope (BLOCKQUOTE_START) which we don't want to pop, but we have nothing
432
+ // on the line scope.
433
+ // Luckily, BLOCKQUOTE_START tokens can only appear at the start of the scope array, so we can check for them first before
434
+ // entering the for loop.
435
+ void inline _Wikitext_pop_excess_elements(parser_t *parser)
436
+ {
437
+ if (!NIL_P(parser->capture)) // we don't pop anything if in capturing mode
438
+ return;
439
+ for (int i = parser->scope->count - ary_count(parser->scope, BLOCKQUOTE_START), j = parser->line->count; i > j; i--)
440
+ {
441
+ // special case for last item on scope
442
+ if (i - j == 1)
443
+ {
444
+ // don't auto-pop P if it is only item on scope
445
+ if (ary_entry(parser->scope, -1) == P)
446
+ {
447
+ // add P to the line scope to prevent us entering the loop at all next time around
448
+ ary_push(parser->line, P);
449
+ continue;
450
+ }
451
+ }
452
+ _Wikitext_pop_from_stack(parser, parser->output);
453
+ }
454
+ }
455
+
456
+ #define INVALID_ENCODING(msg) do { if (dest_ptr) free(dest_ptr); rb_raise(eWikitextParserError, "invalid encoding: " msg); } while(0)
457
+
458
// convert a single UTF-8 codepoint to UTF-32
// expects an input buffer, src, containing a UTF-8 encoded character (which may be multi-byte)
// the end of the input buffer, end, is also passed in to allow the detection of invalidly truncated codepoints
// the number of bytes in the UTF-8 character (between 1 and 4) is returned by reference in width_out
// raises eWikitextParserError (via the INVALID_ENCODING macro) if the supplied character is invalid UTF-8
// (in which case it also frees the block of memory indicated by dest_ptr if it is non-NULL)
inline uint32_t _Wikitext_utf8_to_utf32(char *src, char *end, long *width_out, void *dest_ptr)
{
    uint32_t dest;
    if ((unsigned char)src[0] <= 0x7f)                      // ASCII
    {
        dest = src[0];
        *width_out = 1;
    }
    else if ((src[0] & 0xe0) == 0xc0)                       // byte starts with 110..... : this should be a two-byte sequence
    {
        if (src + 1 >= end)
            INVALID_ENCODING("truncated byte sequence");    // no second byte
        else if (((unsigned char)src[0] == 0xc0) || ((unsigned char)src[0] == 0xc1))
            INVALID_ENCODING("overlong encoding");          // overlong encoding: lead byte of 110..... but code point <= 127
        else if ((src[1] & 0xc0) != 0x80 )
            INVALID_ENCODING("malformed byte sequence");    // should have second byte starting with 10......
        dest = ((uint32_t)(src[0] & 0x1f)) << 6 | (src[1] & 0x3f);
        *width_out = 2;
    }
    else if ((src[0] & 0xf0) == 0xe0)                       // byte starts with 1110.... : this should be a three-byte sequence
    {
        if (src + 2 >= end)
            INVALID_ENCODING("truncated byte sequence");    // missing second or third byte
        else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ))
            INVALID_ENCODING("malformed byte sequence");    // should have second and third bytes starting with 10......
        dest = ((uint32_t)(src[0] & 0x0f)) << 12 | ((uint32_t)(src[1] & 0x3f)) << 6 | (src[2] & 0x3f);
        *width_out = 3;
    }
    else if ((src[0] & 0xf8) == 0xf0)                       // byte starts with 11110... : this should be a four-byte sequence
    {
        if (src + 3 >= end)
            INVALID_ENCODING("truncated byte sequence");    // missing second, third, or fourth byte
        else if ((unsigned char)src[0] >= 0xf5 && (unsigned char)src[0] <= 0xf7)
            INVALID_ENCODING("overlong encoding");          // disallowed by RFC 3629 (codepoints above 0x10ffff)
        else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ) || ((src[3] & 0xc0) != 0x80 ))
            INVALID_ENCODING("malformed byte sequence");    // should have second, third and fourth bytes starting with 10......

        // 3 bits from the lead byte, then 6 bits from each of the three continuation bytes, in order
        dest = ((uint32_t)(src[0] & 0x07)) << 18 |
               ((uint32_t)(src[1] & 0x3f)) << 12 |
               ((uint32_t)(src[2] & 0x3f)) << 6  |
               (src[3] & 0x3f);
        *width_out = 4;
    }
    else                                                    // invalid input
        INVALID_ENCODING("unexpected byte");
    return dest;
}
507
+
508
+ inline VALUE _Wikitext_utf32_char_to_entity(uint32_t character)
509
+ {
510
+ // TODO: consider special casing some entities (ie. quot, amp, lt, gt etc)?
511
+ char hex_string[8] = { '&', '#', 'x', 0, 0, 0, 0, ';' };
512
+ char scratch = (character & 0xf000) >> 12;
513
+ hex_string[3] = (scratch <= 9 ? scratch + 48 : scratch + 87);
514
+ scratch = (character & 0x0f00) >> 8;
515
+ hex_string[4] = (scratch <= 9 ? scratch + 48 : scratch + 87);
516
+ scratch = (character & 0x00f0) >> 4;
517
+ hex_string[5] = (scratch <= 9 ? scratch + 48 : scratch + 87);
518
+ scratch = character & 0x000f;
519
+ hex_string[6] = (scratch <= 9 ? scratch + 48 : scratch + 87);
520
+ return rb_str_new((const char *)hex_string, sizeof(hex_string));
521
+ }
522
+
523
+ inline VALUE _Wikitext_parser_trim_link_target(VALUE string)
524
+ {
525
+ string = StringValue(string);
526
+ char *src = RSTRING_PTR(string);
527
+ char *start = src; // remember this so we can check if we're at the start
528
+ char *left = src;
529
+ char *non_space = src; // remember last non-space character output
530
+ long len = RSTRING_LEN(string);
531
+ char *end = src + len;
532
+ while (src < end)
533
+ {
534
+ if (*src == ' ')
535
+ {
536
+ if (src == left)
537
+ *left++;
538
+ }
539
+ else
540
+ non_space = src;
541
+ src++;
542
+ }
543
+ if (left == start && non_space + 1 == end)
544
+ return string;
545
+ else
546
+ return rb_str_new(left, (non_space + 1) - left);
547
+ }
548
+
549
+ // - non-printable (non-ASCII) characters converted to numeric entities
550
+ // - QUOT and AMP characters converted to named entities
551
+ // - leading and trailing whitespace trimmed if trim is Qtrue
552
+ inline VALUE _Wikitext_parser_sanitize_link_target(VALUE string, VALUE trim)
553
+ {
554
+ string = StringValue(string); // raises if string is nil or doesn't quack like a string
555
+ char *src = RSTRING_PTR(string);
556
+ char *start = src; // remember this so we can check if we're at the start
557
+ long len = RSTRING_LEN(string);
558
+ char *end = src + len;
559
+
560
+ // start with a destination buffer twice the size of the source, will realloc if necessary
561
+ // slop = (len / 8) * 8 (ie. one in every 8 characters can be converted into an entity, each entity requires 8 bytes)
562
+ // this efficiently handles the most common case (where the size of the buffer doesn't change much)
563
+ char *dest = ALLOC_N(char, len * 2);
564
+ char *dest_ptr = dest; // hang on to this so we can pass it to free() later
565
+ char *non_space = dest; // remember last non-space character output
566
+ while (src < end)
567
+ {
568
+ // need at most 8 characters (8 bytes) to display each character
569
+ if (dest + 8 > dest_ptr + len) // outgrowing buffer, must reallocate
570
+ {
571
+ char *old_dest = dest;
572
+ char *old_dest_ptr = dest_ptr;
573
+ len = len + (end - src) * 8; // allocate enough for worst case
574
+ dest = realloc(dest_ptr, len); // will never have to realloc more than once
575
+ if (dest == NULL)
576
+ {
577
+ // would have used reallocf, but this has to run on Linux too, not just Darwin
578
+ free(dest_ptr);
579
+ rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
580
+ }
581
+ dest_ptr = dest;
582
+ dest = dest_ptr + (old_dest - old_dest_ptr);
583
+ non_space = dest_ptr + (non_space - old_dest_ptr);
584
+ }
585
+
586
+ if (*src == '"') // QUOT
587
+ {
588
+ char quot_entity_literal[] = { '&', 'q', 'u', 'o', 't', ';' }; // no trailing NUL
589
+ memcpy(dest, quot_entity_literal, sizeof(quot_entity_literal));
590
+ dest += sizeof(quot_entity_literal);
591
+ }
592
+ else if (*src == '&') // AMP
593
+ {
594
+ char amp_entity_literal[] = { '&', 'a', 'm', 'p', ';' }; // no trailing NUL
595
+ memcpy(dest, amp_entity_literal, sizeof(amp_entity_literal));
596
+ dest += sizeof(amp_entity_literal);
597
+ }
598
+ else if (*src == '<') // LESS_THAN
599
+ {
600
+ free(dest_ptr);
601
+ rb_raise(rb_eRangeError, "invalid link text (\"<\" may not appear in link text)");
602
+ }
603
+ else if (*src == '>') // GREATER_THAN
604
+ {
605
+ free(dest_ptr);
606
+ rb_raise(rb_eRangeError, "invalid link text (\">\" may not appear in link text)");
607
+ }
608
+ else if (*src == ' ' && src == start && trim == Qtrue)
609
+ start++; // we eat leading space
610
+ else if (*src >= 0x20 && *src <= 0x7e) // printable ASCII
611
+ {
612
+ *dest = *src;
613
+ dest++;
614
+ }
615
+ else // all others: must convert to entities
616
+ {
617
+ long width;
618
+ VALUE entity = _Wikitext_utf32_char_to_entity(_Wikitext_utf8_to_utf32(src, end, &width, dest_ptr));
619
+ char *entity_src = RSTRING_PTR(entity);
620
+ long entity_len = RSTRING_LEN(entity); // should always be 8 characters (8 bytes)
621
+ memcpy(dest, entity_src, entity_len);
622
+ dest += entity_len;
623
+ src += width;
624
+ non_space = dest;
625
+ continue;
626
+ }
627
+ if (*src != ' ')
628
+ non_space = dest;
629
+ src++;
630
+ }
631
+
632
+ // trim trailing space if necessary
633
+ if (trim == Qtrue && non_space > dest_ptr && dest != non_space)
634
+ len = non_space - dest_ptr;
635
+ else
636
+ len = dest - dest_ptr;
637
+ VALUE out = rb_str_new(dest_ptr, len);
638
+ free(dest_ptr);
639
+ return out;
640
+ }
641
+
642
+ VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string)
643
+ {
644
+ return (_Wikitext_parser_sanitize_link_target(string, Qtrue));
645
+ }
646
+
647
+ // encodes the input string according to RFCs 2396 and 2718
648
+ // leading and trailing whitespace trimmed
649
+ // note that the first character of the target link is not case-sensitive
650
+ // (this is a recommended application-level constraint; it is not imposed at this level)
651
+ // this is to allow links like:
652
+ // ...the [[foo]] is...
653
+ // to be equivalent to:
654
+ // thing. [[Foo]] was...
655
+ // this is also where we check treat_slash_as_special is true and act accordingly
656
+ // basically any link target matching /\A[a-z]+\/\d+\z/ is flagged as special
657
+ inline static void _Wikitext_parser_encode_link_target(parser_t *parser)
658
+ {
659
+ VALUE in = StringValue(parser->link_target);
660
+ char *input = RSTRING_PTR(in);
661
+ char *start = input; // remember this so we can check if we're at the start
662
+ long len = RSTRING_LEN(in);
663
+ if (!(len > 0))
664
+ return;
665
+ char *end = input + len;
666
+ static char hex[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
667
+
668
+ // this potential shortcut requires an (admittedly cheap) prescan, so only do it when treat_slash_as_special is true
669
+ parser->special_link = Qfalse;
670
+ if (parser->treat_slash_as_special == Qtrue)
671
+ {
672
+ char *c = input; // \A
673
+ while (c < end && *c >= 'a' && *c <= 'z') // [a-z]
674
+ c++; // +
675
+ if (c > start && c < end && *c++ == '/') // \/
676
+ {
677
+ while (c < end && *c >= '0' && *c <= '9') // \d
678
+ {
679
+ c++; // +
680
+ if (c == end) // \z
681
+ {
682
+ // matches /\A[a-z]+\/\d+\z/ so no transformation required
683
+ parser->special_link = Qtrue;
684
+ return;
685
+ }
686
+ }
687
+ }
688
+ }
689
+
690
+ // to avoid most reallocations start with a destination buffer twice the size of the source
691
+ // this handles the most common case (where most chars are in the ASCII range and don't require more storage, but there are
692
+ // often quite a few spaces, which are encoded as "%20" and occupy 3 bytes)
693
+ // the worst case is where _every_ byte must be written out using 3 bytes
694
+ long dest_len = len * 2;
695
+ char *dest = ALLOC_N(char, dest_len);
696
+ char *dest_ptr = dest; // hang on to this so we can pass it to free() later
697
+ char *non_space = dest; // remember last non-space character output
698
+ for (; input < end; input++)
699
+ {
700
+ if ((dest + 3) > (dest_ptr + dest_len)) // worst case: a single character may grow to 3 characters once encoded
701
+ {
702
+ // outgrowing buffer, must reallocate
703
+ char *old_dest = dest;
704
+ char *old_dest_ptr = dest_ptr;
705
+ dest_len += len;
706
+ dest = realloc(dest_ptr, dest_len);
707
+ if (dest == NULL)
708
+ {
709
+ // would have used reallocf, but this has to run on Linux too, not just Darwin
710
+ free(dest_ptr);
711
+ rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
712
+ }
713
+ dest_ptr = dest;
714
+ dest = dest_ptr + (old_dest - old_dest_ptr);
715
+ non_space = dest_ptr + (non_space - old_dest_ptr);
716
+ }
717
+
718
+ // pass through unreserved characters
719
+ if (((*input >= 'a') && (*input <= 'z')) ||
720
+ ((*input >= 'A') && (*input <= 'Z')) ||
721
+ ((*input >= '0') && (*input <= '9')) ||
722
+ (*input == '-') ||
723
+ (*input == '_') ||
724
+ (*input == '.') ||
725
+ (*input == '~'))
726
+ {
727
+ *dest++ = *input;
728
+ non_space = dest;
729
+ }
730
+ else if (*input == ' ' && input == start)
731
+ start++; // we eat leading space
732
+ else // everything else gets URL-encoded
733
+ {
734
+ *dest++ = '%';
735
+ *dest++ = hex[(unsigned char)(*input) / 16]; // left
736
+ *dest++ = hex[(unsigned char)(*input) % 16]; // right
737
+ if (*input != ' ')
738
+ non_space = dest;
739
+ }
740
+ }
741
+
742
+ // trim trailing space if necessary
743
+ if (non_space > dest_ptr && dest - 1 != non_space)
744
+ dest_len = non_space - dest_ptr;
745
+ else
746
+ dest_len = dest - dest_ptr;
747
+ parser->link_target = rb_str_new(dest_ptr, dest_len);
748
+ free(dest_ptr);
749
+ }
750
+
751
+ VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in)
752
+ {
753
+ parser_t parser;
754
+ parser.link_target = in;
755
+ parser.treat_slash_as_special = Qfalse;
756
+ _Wikitext_parser_encode_link_target(&parser);
757
+ return parser.link_target;
758
+ }
759
+
760
+ // this method exposed for testing only
761
+ VALUE Wikitext_parser_encode_special_link_target(VALUE self, VALUE in)
762
+ {
763
+ parser_t parser;
764
+ parser.link_target = in;
765
+ parser.treat_slash_as_special = Qtrue;
766
+ _Wikitext_parser_encode_link_target(&parser);
767
+ return parser.link_target;
768
+ }
769
+
770
+ // not sure whether these rollback functions should be inline: could refactor them into a single non-inlined function
771
+ inline void _Wikitext_rollback_failed_link(parser_t *parser)
772
+ {
773
+ if (!IN(LINK_START))
774
+ return; // nothing to do!
775
+ int scope_includes_separator = IN(SEPARATOR);
776
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, LINK_START, Qtrue);
777
+ rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
778
+ if (!NIL_P(parser->link_target))
779
+ {
780
+ VALUE sanitized = _Wikitext_parser_sanitize_link_target(parser->link_target, Qfalse);
781
+ rb_str_append(parser->output, sanitized);
782
+ if (scope_includes_separator)
783
+ {
784
+ rb_str_cat(parser->output, separator, sizeof(separator) - 1);
785
+ if (!NIL_P(parser->link_text))
786
+ rb_str_append(parser->output, parser->link_text);
787
+ }
788
+ }
789
+ parser->capture = Qnil;
790
+ parser->link_target = Qnil;
791
+ parser->link_text = Qnil;
792
+ }
793
+
794
+ inline void _Wikitext_rollback_failed_external_link(parser_t *parser)
795
+ {
796
+ if (!IN(EXT_LINK_START))
797
+ return; // nothing to do!
798
+ int scope_includes_space = IN(SPACE);
799
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, EXT_LINK_START, Qtrue);
800
+ rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
801
+ if (!NIL_P(parser->link_target))
802
+ {
803
+ if (parser->autolink == Qtrue)
804
+ parser->link_target = _Wikitext_hyperlink(Qnil, parser->link_target, parser->link_target, parser->external_link_class);
805
+ rb_str_append(parser->output, parser->link_target);
806
+ if (scope_includes_space)
807
+ {
808
+ rb_str_cat(parser->output, space, sizeof(space) - 1);
809
+ if (!NIL_P(parser->link_text))
810
+ rb_str_append(parser->output, parser->link_text);
811
+ }
812
+ }
813
+ parser->capture = Qnil;
814
+ parser->link_target = Qnil;
815
+ parser->link_text = Qnil;
816
+ }
817
+
818
+ VALUE Wikitext_parser_initialize(VALUE self)
819
+ {
820
+ // no need to call super here; rb_call_super()
821
+ rb_iv_set(self, "@autolink", Qtrue);
822
+ rb_iv_set(self, "@line_ending", rb_str_new2("\n"));
823
+ rb_iv_set(self, "@external_link_class", rb_str_new2("external"));
824
+ rb_iv_set(self, "@mailto_class", rb_str_new2("mailto"));
825
+ rb_iv_set(self, "@internal_link_prefix", rb_str_new2("/wiki/"));
826
+ rb_iv_set(self, "@treat_slash_as_special", Qtrue);
827
+ return self;
828
+ }
829
+
830
+ VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string)
831
+ {
832
+ for (int i = 0; i < 100000; i++)
833
+ Wikitext_parser_parse(1, &string, self);
834
+ }
835
+
836
+ VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self)
837
+ {
838
+ // process arguments
839
+ VALUE string, options;
840
+ if (rb_scan_args(argc, argv, "11", &string, &options) == 1) // 1 mandatory argument, 1 optional argument
841
+ options = Qnil;
842
+ if (NIL_P(string))
843
+ return Qnil;
844
+ string = StringValue(string);
845
+
846
+ // process options hash
847
+ int base_indent = 0;
848
+ VALUE indent = Qnil;
849
+ if (!NIL_P(options) && TYPE(options) == T_HASH)
850
+ {
851
+ indent = rb_hash_aref(options, ID2SYM(rb_intern("indent")));
852
+ base_indent = NUM2INT(indent);
853
+ if (base_indent < 0)
854
+ base_indent = 0;
855
+ }
856
+
857
+ // set up scanner
858
+ char *p = RSTRING_PTR(string);
859
+ long len = RSTRING_LEN(string);
860
+ char *pe = p + len;
861
+
862
+ // access these once per parse
863
+ VALUE line_ending = rb_iv_get(self, "@line_ending");
864
+ line_ending = StringValue(line_ending);
865
+ VALUE link_class = rb_iv_get(self, "@external_link_class");
866
+ link_class = NIL_P(link_class) ? Qnil : StringValue(link_class);
867
+ VALUE mailto_class = rb_iv_get(self, "@mailto_class");
868
+ mailto_class = NIL_P(mailto_class) ? Qnil : StringValue(mailto_class);
869
+ VALUE prefix = rb_iv_get(self, "@internal_link_prefix");
870
+
871
+ // set up parser struct to make passing parameters a little easier
872
+ // eventually this will encapsulate most or all of the variables above
873
+ parser_t _parser;
874
+ parser_t *parser = &_parser;
875
+ parser->output = rb_str_new2("");
876
+ parser->capture = Qnil;
877
+ parser->link_target = Qnil;
878
+ parser->link_text = Qnil;
879
+ parser->external_link_class = link_class;
880
+ parser->scope = ary_new();
881
+ parser->line = ary_new();
882
+ parser->line_buffer = ary_new();
883
+ parser->pending_crlf = Qfalse;
884
+ parser->autolink = rb_iv_get(self, "@autolink");
885
+ parser->treat_slash_as_special = rb_iv_get(self, "@treat_slash_as_special");
886
+ parser->special_link = Qfalse;
887
+ parser->line_ending = str_new_from_string(line_ending);
888
+ parser->base_indent = base_indent;
889
+ parser->current_indent = 0;
890
+ parser->tabulation = NULL;
891
+
892
+ token_t _token;
893
+ _token.type = NO_TOKEN;
894
+ token_t *token = NULL;
895
+ do
896
+ {
897
+ // note that whenever we grab a token we push it into the line buffer
898
+ // this provides us with context-sensitive "memory" of what's been seen so far on this line
899
+ #define NEXT_TOKEN() token = &_token, next_token(token, token, NULL, pe), ary_push(parser->line_buffer, token->type)
900
+
901
+ // check to see if we have a token hanging around from a previous iteration of this loop
902
+ if (token == NULL)
903
+ {
904
+ if (_token.type == NO_TOKEN)
905
+ {
906
+ // first time here (haven't started scanning yet)
907
+ token = &_token;
908
+ next_token(token, NULL, p, pe);
909
+ ary_push(parser->line_buffer, token->type);
910
+ }
911
+ else
912
+ // already scanning
913
+ NEXT_TOKEN();
914
+ }
915
+ int type = token->type;
916
+
917
+ // many restrictions depend on what is at the top of the stack
918
+ int top = ary_entry(parser->scope, -1);
919
+
920
+ // can't declare new variables inside a switch statement, so predeclare them here
921
+ long remove_strong = -1;
922
+ long remove_em = -1;
923
+
924
+ // general purpose counters and flags
925
+ long i = 0;
926
+ long j = 0;
927
+ long k = 0;
928
+
929
+ // The following giant switch statement contains cases for all the possible token types.
930
+ // In the most basic sense we are emitting the HTML that corresponds to each token,
931
+ // but some tokens require context information in order to decide what to output.
932
+ // For example, does the STRONG token (''') translate to <strong> or </strong>?
933
+ // So when looking at any given token we have three state-maintaining variables which gives us a notion of "where we are":
934
+ //
935
+ // - the "scope" stack (indicates what HTML DOM structures we are currently nested inside, similar to a CSS selector)
936
+ // - the line buffer (records tokens seen so far on the current line)
937
+ // - the line "scope" stack (indicates what the scope should be based only on what is visible on the line so far)
938
+ //
939
+ // Although this is fairly complicated, there is one key simplifying factor:
940
+ // The translator continuously performs auto-correction, and this means that we always have a guarantee that the
941
+ // scope stack (up to the current token) is valid; our translator can take this as a given.
942
+ // Auto-correction basically consists of inserting missing tokens (preventing subsequent HTML from being messed up),
943
+ // or converting illegal (unexpected) tokens to their plain text equivalents (providing visual feedback to Wikitext author).
944
+ switch (type)
945
+ {
946
+ case PRE:
947
+ if (IN(NO_WIKI_START) || IN(PRE_START))
948
+ {
949
+ rb_str_cat(parser->output, space, sizeof(space) - 1);
950
+ break;
951
+ }
952
+ else if (IN(BLOCKQUOTE_START))
953
+ {
954
+ // this kind of nesting not allowed (to avoid user confusion)
955
+ _Wikitext_pop_excess_elements(parser);
956
+ _Wikitext_start_para_if_necessary(parser);
957
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
958
+ rb_str_cat(i, space, sizeof(space) - 1);
959
+ break;
960
+ }
961
+
962
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
963
+ ary_push(parser->line, PRE);
964
+ i = ary_count(parser->line, BLOCKQUOTE);
965
+ j = ary_count(parser->scope, BLOCKQUOTE);
966
+ if (i < j)
967
+ {
968
+ // must pop (reduce nesting level)
969
+ for (i = j - i; i > 0; i--)
970
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
971
+ }
972
+
973
+ if (!IN(PRE))
974
+ {
975
+ parser->pending_crlf = Qfalse;
976
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
977
+ _Wikitext_indent(parser);
978
+ rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
979
+ ary_push(parser->scope, PRE);
980
+ }
981
+ break;
982
+
983
+ case PRE_START:
984
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
985
+ rb_str_cat(parser->output, escaped_pre_start, sizeof(escaped_pre_start) - 1);
986
+ else if (IN(BLOCKQUOTE_START))
987
+ {
988
+ _Wikitext_rollback_failed_link(parser); // if any
989
+ _Wikitext_rollback_failed_external_link(parser); // if any
990
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
991
+ _Wikitext_indent(parser);
992
+ rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
993
+ ary_push(parser->scope, PRE_START);
994
+ ary_push(parser->line, PRE_START);
995
+ }
996
+ else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
997
+ {
998
+ // would be nice to eliminate the repetition here but it's probably the clearest way
999
+ _Wikitext_rollback_failed_link(parser); // if any
1000
+ _Wikitext_rollback_failed_external_link(parser); // if any
1001
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
1002
+ _Wikitext_indent(parser);
1003
+ rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
1004
+ ary_push(parser->scope, PRE_START);
1005
+ ary_push(parser->line, PRE_START);
1006
+ }
1007
+ else
1008
+ {
1009
+ // everywhere else, PRE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
1010
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1011
+ _Wikitext_pop_excess_elements(parser);
1012
+ _Wikitext_start_para_if_necessary(parser);
1013
+ rb_str_cat(i, escaped_pre_start, sizeof(escaped_pre_start) - 1);
1014
+ }
1015
+ break;
1016
+
1017
+ case PRE_END:
1018
+ if (IN(NO_WIKI_START) || IN(PRE))
1019
+ rb_str_cat(parser->output, escaped_pre_end, sizeof(escaped_pre_end) - 1);
1020
+ else
1021
+ {
1022
+ if (IN(PRE_START))
1023
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, PRE_START, Qtrue);
1024
+ else
1025
+ {
1026
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1027
+ _Wikitext_pop_excess_elements(parser);
1028
+ _Wikitext_start_para_if_necessary(parser);
1029
+ rb_str_cat(i, escaped_pre_end, sizeof(escaped_pre_end) - 1);
1030
+ }
1031
+ }
1032
+ break;
1033
+
1034
+ case BLOCKQUOTE:
1035
+ if (IN(NO_WIKI_START) || IN(PRE_START))
1036
+ // no need to check for <pre>; can never appear inside it
1037
+ rb_str_cat(parser->output, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit "&gt;" or "&gt; "
1038
+ else if (IN(BLOCKQUOTE_START))
1039
+ {
1040
+ // this kind of nesting not allowed (to avoid user confusion)
1041
+ _Wikitext_pop_excess_elements(parser);
1042
+ _Wikitext_start_para_if_necessary(parser);
1043
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1044
+ rb_str_cat(i, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit "&gt;" or "&gt; "
1045
+ break;
1046
+ }
1047
+ else
1048
+ {
1049
+ ary_push(parser->line, BLOCKQUOTE);
1050
+
1051
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1052
+ i = ary_count(parser->line, BLOCKQUOTE);
1053
+ j = ary_count(parser->scope, BLOCKQUOTE);
1054
+
1055
+ // given that BLOCKQUOTE tokens can be nested, peek ahead and see if there are any more which might affect the decision to push or pop
1056
+ while (NEXT_TOKEN(), (token->type == BLOCKQUOTE))
1057
+ {
1058
+ ary_push(parser->line, BLOCKQUOTE);
1059
+ i++;
1060
+ }
1061
+
1062
+ // now decide whether to push, pop or do nothing
1063
+ if (i > j)
1064
+ {
1065
+ // must push (increase nesting level)
1066
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
1067
+ for (i = i - j; i > 0; i--)
1068
+ {
1069
+ _Wikitext_indent(parser);
1070
+ rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1071
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1072
+ ary_push(parser->scope, BLOCKQUOTE);
1073
+ }
1074
+ }
1075
+ else if (i < j)
1076
+ {
1077
+ // must pop (reduce nesting level)
1078
+ for (i = j - i; i > 0; i--)
1079
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
1080
+ }
1081
+
1082
+ // jump to top of the loop to process token we scanned during lookahead
1083
+ continue;
1084
+ }
1085
+ break;
1086
+
1087
+ case BLOCKQUOTE_START:
1088
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1089
+ rb_str_cat(parser->output, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
1090
+ else if (IN(BLOCKQUOTE_START))
1091
+ {
1092
+ // nesting is fine here
1093
+ _Wikitext_rollback_failed_link(parser); // if any
1094
+ _Wikitext_rollback_failed_external_link(parser); // if any
1095
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
1096
+ _Wikitext_indent(parser);
1097
+ rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1098
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1099
+ ary_push(parser->scope, BLOCKQUOTE_START);
1100
+ ary_push(parser->line, BLOCKQUOTE_START);
1101
+ }
1102
+ else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
1103
+ {
1104
+ // would be nice to eliminate the repetition here but it's probably the clearest way
1105
+ _Wikitext_rollback_failed_link(parser); // if any
1106
+ _Wikitext_rollback_failed_external_link(parser); // if any
1107
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
1108
+ _Wikitext_indent(parser);
1109
+ rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1110
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1111
+ ary_push(parser->scope, BLOCKQUOTE_START);
1112
+ ary_push(parser->line, BLOCKQUOTE_START);
1113
+ }
1114
+ else
1115
+ {
1116
+ // everywhere else, BLOCKQUOTE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
1117
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1118
+ _Wikitext_pop_excess_elements(parser);
1119
+ _Wikitext_start_para_if_necessary(parser);
1120
+ rb_str_cat(i, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
1121
+ }
1122
+ break;
1123
+
1124
+ case BLOCKQUOTE_END:
1125
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1126
+ rb_str_cat(parser->output, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
1127
+ else
1128
+ {
1129
+ if (IN(BLOCKQUOTE_START))
1130
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, BLOCKQUOTE_START, Qtrue);
1131
+ else
1132
+ {
1133
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1134
+ _Wikitext_pop_excess_elements(parser);
1135
+ _Wikitext_start_para_if_necessary(parser);
1136
+ rb_str_cat(i, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
1137
+ }
1138
+ }
1139
+ break;
1140
+
1141
+ case NO_WIKI_START:
1142
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1143
+ rb_str_cat(parser->output, escaped_no_wiki_start, sizeof(escaped_no_wiki_start) - 1);
1144
+ else
1145
+ {
1146
+ _Wikitext_pop_excess_elements(parser);
1147
+ _Wikitext_start_para_if_necessary(parser);
1148
+ ary_push(parser->scope, NO_WIKI_START);
1149
+ ary_push(parser->line, NO_WIKI_START);
1150
+ }
1151
+ break;
1152
+
1153
+ case NO_WIKI_END:
1154
+ if (IN(NO_WIKI_START))
1155
+ // <nowiki> should always only ever be the last item in the stack, but use the helper routine just in case
1156
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, NO_WIKI_START, Qtrue);
1157
+ else
1158
+ {
1159
+ _Wikitext_pop_excess_elements(parser);
1160
+ _Wikitext_start_para_if_necessary(parser);
1161
+ rb_str_cat(parser->output, escaped_no_wiki_end, sizeof(escaped_no_wiki_end) - 1);
1162
+ }
1163
+ break;
1164
+
1165
+ case STRONG_EM:
1166
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1167
+ {
1168
+ rb_str_cat(parser->output, literal_strong_em, sizeof(literal_strong_em) - 1);
1169
+ break;
1170
+ }
1171
+
1172
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1173
+ _Wikitext_pop_excess_elements(parser);
1174
+
1175
+ // if you've seen STRONG/STRONG_START or EM/EM_START, must close them in the reverse order that you saw them!
1176
+ // otherwise, must open them
1177
+ remove_strong = -1;
1178
+ remove_em = -1;
1179
+ j = parser->scope->count;
1180
+ for (j = j - 1; j >= 0; j--)
1181
+ {
1182
+ int val = ary_entry(parser->scope, j);
1183
+ if (val == STRONG || val == STRONG_START)
1184
+ {
1185
+ rb_str_cat(i, strong_end, sizeof(strong_end) - 1);
1186
+ remove_strong = j;
1187
+ }
1188
+ else if (val == EM || val == EM_START)
1189
+ {
1190
+ rb_str_cat(i, em_end, sizeof(em_end) - 1);
1191
+ remove_em = j;
1192
+ }
1193
+ }
1194
+
1195
+ if (remove_strong > remove_em) // must remove strong first
1196
+ {
1197
+ ary_pop(parser->scope);
1198
+ if (remove_em > -1)
1199
+ ary_pop(parser->scope);
1200
+ else // there was no em to remove!, so consider this an opening em tag
1201
+ {
1202
+ rb_str_cat(i, em_start, sizeof(em_start) - 1);
1203
+ ary_push(parser->scope, EM);
1204
+ ary_push(parser->line, EM);
1205
+ }
1206
+ }
1207
+ else if (remove_em > remove_strong) // must remove em first
1208
+ {
1209
+ ary_pop(parser->scope);
1210
+ if (remove_strong > -1)
1211
+ ary_pop(parser->scope);
1212
+ else // there was no strong to remove!, so consider this an opening strong tag
1213
+ {
1214
+ rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
1215
+ ary_push(parser->scope, STRONG);
1216
+ ary_push(parser->line, STRONG);
1217
+ }
1218
+ }
1219
+ else // no strong or em to remove, so this must be a new opening of both
1220
+ {
1221
+ _Wikitext_start_para_if_necessary(parser);
1222
+ rb_str_cat(i, strong_em_start, sizeof(strong_em_start) - 1);
1223
+ ary_push(parser->scope, STRONG);
1224
+ ary_push(parser->line, STRONG);
1225
+ ary_push(parser->scope, EM);
1226
+ ary_push(parser->line, EM);
1227
+ }
1228
+ break;
1229
+
1230
+ case STRONG:
1231
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1232
+ rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
1233
+ else
1234
+ {
1235
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1236
+ if (IN(STRONG_START))
1237
+ // already in span started with <strong>, no choice but to emit this literally
1238
+ rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
1239
+ else if (IN(STRONG))
1240
+ // STRONG already seen, this is a closing tag
1241
+ _Wikitext_pop_from_stack_up_to(parser, i, STRONG, Qtrue);
1242
+ else
1243
+ {
1244
+ // this is a new opening
1245
+ _Wikitext_pop_excess_elements(parser);
1246
+ _Wikitext_start_para_if_necessary(parser);
1247
+ rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
1248
+ ary_push(parser->scope, STRONG);
1249
+ ary_push(parser->line, STRONG);
1250
+ }
1251
+ }
1252
+ break;
1253
+
1254
+ case STRONG_START:
1255
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1256
+ rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
1257
+ else
1258
+ {
1259
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1260
+ if (IN(STRONG_START) || IN(STRONG))
1261
+ rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
1262
+ else
1263
+ {
1264
+ _Wikitext_pop_excess_elements(parser);
1265
+ _Wikitext_start_para_if_necessary(parser);
1266
+ rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
1267
+ ary_push(parser->scope, STRONG_START);
1268
+ ary_push(parser->line, STRONG_START);
1269
+ }
1270
+ }
1271
+ break;
1272
+
1273
+ case STRONG_END:
1274
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1275
+ rb_str_cat(parser->output, escaped_strong_end, sizeof(escaped_strong_end) - 1);
1276
+ else
1277
+ {
1278
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1279
+ if (IN(STRONG_START))
1280
+ _Wikitext_pop_from_stack_up_to(parser, i, STRONG_START, Qtrue);
1281
+ else
1282
+ {
1283
+ // no STRONG_START in scope, so must interpret the STRONG_END without any special meaning
1284
+ _Wikitext_pop_excess_elements(parser);
1285
+ _Wikitext_start_para_if_necessary(parser);
1286
+ rb_str_cat(i, escaped_strong_end, sizeof(escaped_strong_end) - 1);
1287
+ }
1288
+ }
1289
+ break;
1290
+
1291
+ case EM:
1292
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1293
+ rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
1294
+ else
1295
+ {
1296
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1297
+ if (IN(EM_START))
1298
+ // already in span started with <em>, no choice but to emit this literally
1299
+ rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
1300
+ else if (IN(EM))
1301
+ // EM already seen, this is a closing tag
1302
+ _Wikitext_pop_from_stack_up_to(parser, i, EM, Qtrue);
1303
+ else
1304
+ {
1305
+ // this is a new opening
1306
+ _Wikitext_pop_excess_elements(parser);
1307
+ _Wikitext_start_para_if_necessary(parser);
1308
+ rb_str_cat(i, em_start, sizeof(em_start) - 1);
1309
+ ary_push(parser->scope, EM);
1310
+ ary_push(parser->line, EM);
1311
+ }
1312
+ }
1313
+ break;
1314
+
1315
+ case EM_START:
1316
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1317
+ rb_str_cat(parser->output, escaped_em_start, sizeof(escaped_em_start) - 1);
1318
+ else
1319
+ {
1320
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1321
+ if (IN(EM_START) || IN(EM))
1322
+ rb_str_cat(i, escaped_em_start, sizeof(escaped_em_start) - 1);
1323
+ else
1324
+ {
1325
+ _Wikitext_pop_excess_elements(parser);
1326
+ _Wikitext_start_para_if_necessary(parser);
1327
+ rb_str_cat(i, em_start, sizeof(em_start) - 1);
1328
+ ary_push(parser->scope, EM_START);
1329
+ ary_push(parser->line, EM_START);
1330
+ }
1331
+ }
1332
+ break;
1333
+
1334
+ case EM_END:
1335
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1336
+ rb_str_cat(parser->output, escaped_em_end, sizeof(escaped_em_end) - 1);
1337
+ else
1338
+ {
1339
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1340
+ if (IN(EM_START))
1341
+ _Wikitext_pop_from_stack_up_to(parser, i, EM_START, Qtrue);
1342
+ else
1343
+ {
1344
+ // no EM_START in scope, so must interpret the EM_END without any special meaning
1345
+ _Wikitext_pop_excess_elements(parser);
1346
+ _Wikitext_start_para_if_necessary(parser);
1347
+ rb_str_cat(i, escaped_em_end, sizeof(escaped_em_end) - 1);
1348
+ }
1349
+ }
1350
+ break;
1351
+
1352
+ case TT:
1353
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1354
+ rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
1355
+ else
1356
+ {
1357
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1358
+ if (IN(TT_START))
1359
+ // already in span started with <tt>, no choice but to emit this literally
1360
+ rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
1361
+ else if (IN(TT))
1362
+ // TT (`) already seen, this is a closing tag
1363
+ _Wikitext_pop_from_stack_up_to(parser, i, TT, Qtrue);
1364
+ else
1365
+ {
1366
+ // this is a new opening
1367
+ _Wikitext_pop_excess_elements(parser);
1368
+ _Wikitext_start_para_if_necessary(parser);
1369
+ rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
1370
+ ary_push(parser->scope, TT);
1371
+ ary_push(parser->line, TT);
1372
+ }
1373
+ }
1374
+ break;
1375
+
1376
+ case TT_START:
1377
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1378
+ rb_str_cat(parser->output, escaped_tt_start, sizeof(escaped_tt_start) - 1);
1379
+ else
1380
+ {
1381
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1382
+ if (IN(TT_START) || IN(TT))
1383
+ rb_str_cat(i, escaped_tt_start, sizeof(escaped_tt_start) - 1);
1384
+ else
1385
+ {
1386
+ _Wikitext_pop_excess_elements(parser);
1387
+ _Wikitext_start_para_if_necessary(parser);
1388
+ rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
1389
+ ary_push(parser->scope, TT_START);
1390
+ ary_push(parser->line, TT_START);
1391
+ }
1392
+ }
1393
+ break;
1394
+
1395
+ case TT_END:
1396
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1397
+ rb_str_cat(parser->output, escaped_tt_end, sizeof(escaped_tt_end) - 1);
1398
+ else
1399
+ {
1400
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1401
+ if (IN(TT_START))
1402
+ _Wikitext_pop_from_stack_up_to(parser, i, TT_START, Qtrue);
1403
+ else
1404
+ {
1405
+ // no TT_START in scope, so must interpret the TT_END without any special meaning
1406
+ _Wikitext_pop_excess_elements(parser);
1407
+ _Wikitext_start_para_if_necessary(parser);
1408
+ rb_str_cat(i, escaped_tt_end, sizeof(escaped_tt_end) - 1);
1409
+ }
1410
+ }
1411
+ break;
1412
+
1413
+ case OL:
1414
+ case UL:
1415
+ if (IN(NO_WIKI_START) || IN(PRE_START))
1416
+ {
1417
+ // no need to check for PRE; can never appear inside it
1418
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1419
+ break;
1420
+ }
1421
+
1422
+ // count number of tokens in line and scope stacks
1423
+ int bq_count = ary_count(parser->scope, BLOCKQUOTE_START);
1424
+ i = parser->line->count - ary_count(parser->line, BLOCKQUOTE_START);
1425
+ j = parser->scope->count - bq_count;
1426
+ k = i;
1427
+
1428
+ // list tokens can be nested so look ahead for any more which might affect the decision to push or pop
1429
+ for (;;)
1430
+ {
1431
+ type = token->type;
1432
+ if (type == OL || type == UL)
1433
+ {
1434
+ token = NULL;
1435
+ if (i - k >= 2) // already seen at least one OL or UL
1436
+ {
1437
+ ary_push(parser->line, NESTED_LIST); // which means this is a nested list
1438
+ i += 3;
1439
+ }
1440
+ else
1441
+ i += 2;
1442
+ ary_push(parser->line, type);
1443
+ ary_push(parser->line, LI);
1444
+
1445
+ // want to compare line with scope but can only do so if scope has enough items on it
1446
+ if (j >= i)
1447
+ {
1448
+ if (ary_entry(parser->scope, i + bq_count - 2) == type && ary_entry(parser->scope, i + bq_count - 1) == LI)
1449
+ {
1450
+ // line and scope match at this point: do nothing yet
1451
+ }
1452
+ else
1453
+ {
1454
+ // item just pushed onto line does not match corresponding slot of scope!
1455
+ for (; j >= i - 2; j--)
1456
+ // must pop back before emitting
1457
+ _Wikitext_pop_from_stack(parser, Qnil);
1458
+
1459
+ // will emit UL or OL, then LI
1460
+ break;
1461
+ }
1462
+ }
1463
+ else // line stack size now exceeds scope stack size: must increase nesting level
1464
+ break; // will emit UL or OL, then LI
1465
+ }
1466
+ else
1467
+ {
1468
+ // not a OL or UL token!
1469
+ if (j == i)
1470
+ // must close existing LI and re-open new one
1471
+ _Wikitext_pop_from_stack(parser, Qnil);
1472
+ else if (j > i)
1473
+ {
1474
+ // item just pushed onto line does not match corresponding slot of scope!
1475
+ for (; j >= i; j--)
1476
+ // must pop back before emitting
1477
+ _Wikitext_pop_from_stack(parser, Qnil);
1478
+ }
1479
+ break;
1480
+ }
1481
+ NEXT_TOKEN();
1482
+ }
1483
+
1484
+ // will emit
1485
+ if (type == OL || type == UL)
1486
+ {
1487
+ // if LI is at the top of a stack this is the start of a nested list
1488
+ if (j > 0 && ary_entry(parser->scope, -1) == LI)
1489
+ {
1490
+ // so we should precede it with a CRLF, and indicate that it's a nested list
1491
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1492
+ ary_push(parser->scope, NESTED_LIST);
1493
+ }
1494
+ else
1495
+ {
1496
+ // this is a new list
1497
+ if (IN(BLOCKQUOTE_START))
1498
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
1499
+ else
1500
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
1501
+ }
1502
+
1503
+ // emit
1504
+ _Wikitext_indent(parser);
1505
+ if (type == OL)
1506
+ rb_str_cat(parser->output, ol_start, sizeof(ol_start) - 1);
1507
+ else if (type == UL)
1508
+ rb_str_cat(parser->output, ul_start, sizeof(ul_start) - 1);
1509
+ ary_push(parser->scope, type);
1510
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1511
+ }
1512
+ else if (type == SPACE)
1513
+ // silently throw away the optional SPACE token after final list marker
1514
+ token = NULL;
1515
+
1516
+ _Wikitext_indent(parser);
1517
+ rb_str_cat(parser->output, li_start, sizeof(li_start) - 1);
1518
+ ary_push(parser->scope, LI);
1519
+
1520
+ // any subsequent UL or OL tokens on this line are syntax errors and must be emitted literally
1521
+ if (type == OL || type == UL)
1522
+ {
1523
+ k = 0;
1524
+ while (k++, NEXT_TOKEN(), (type = token->type))
1525
+ {
1526
+ if (type == OL || type == UL)
1527
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1528
+ else if (type == SPACE && k == 1)
1529
+ {
1530
+ // silently throw away the optional SPACE token after final list marker
1531
+ token = NULL;
1532
+ break;
1533
+ }
1534
+ else
1535
+ break;
1536
+ }
1537
+ }
1538
+
1539
+ // jump to top of the loop to process token we scanned during lookahead
1540
+ continue;
1541
+
1542
+ case H6_START:
1543
+ case H5_START:
1544
+ case H4_START:
1545
+ case H3_START:
1546
+ case H2_START:
1547
+ case H1_START:
1548
+ if (IN(NO_WIKI_START) || IN(PRE_START))
1549
+ {
1550
+ // no need to check for PRE; can never appear inside it
1551
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1552
+ break;
1553
+ }
1554
+
1555
+ // pop up to but not including the last BLOCKQUOTE on the scope stack
1556
+ if (IN(BLOCKQUOTE_START))
1557
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
1558
+ else
1559
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
1560
+
1561
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1562
+ ary_push(parser->line, type);
1563
+ i = ary_count(parser->line, BLOCKQUOTE);
1564
+ j = ary_count(parser->scope, BLOCKQUOTE);
1565
+
1566
+ // decide whether we need to pop off excess BLOCKQUOTE tokens (will never need to push; that is handled above in the BLOCKQUOTE case itself)
1567
+ if (i < j)
1568
+ {
1569
+ // must pop (reduce nesting level)
1570
+ for (i = j - i; i > 0; i--)
1571
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
1572
+ }
1573
+
1574
+ // discard any whitespace here (so that "== foo ==" will be translated to "<h2>foo</h2>" rather than "<h2> foo </h2>")
1575
+ while (NEXT_TOKEN(), (token->type == SPACE))
1576
+ ; // discard
1577
+
1578
+ ary_push(parser->scope, type);
1579
+ _Wikitext_indent(parser);
1580
+
1581
+ // rather than repeat all that code for each kind of heading, share it and use a conditional here
1582
+ if (type == H6_START)
1583
+ rb_str_cat(parser->output, h6_start, sizeof(h6_start) - 1);
1584
+ else if (type == H5_START)
1585
+ rb_str_cat(parser->output, h5_start, sizeof(h5_start) - 1);
1586
+ else if (type == H4_START)
1587
+ rb_str_cat(parser->output, h4_start, sizeof(h4_start) - 1);
1588
+ else if (type == H3_START)
1589
+ rb_str_cat(parser->output, h3_start, sizeof(h3_start) - 1);
1590
+ else if (type == H2_START)
1591
+ rb_str_cat(parser->output, h2_start, sizeof(h2_start) - 1);
1592
+ else if (type == H1_START)
1593
+ rb_str_cat(parser->output, h1_start, sizeof(h1_start) - 1);
1594
+
1595
+ // jump to top of the loop to process token we scanned during lookahead
1596
+ continue;
1597
+
1598
+ case H6_END:
1599
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1600
+ rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
1601
+ else
1602
+ {
1603
+ _Wikitext_rollback_failed_external_link(parser); // if any
1604
+ if (!IN(H6_START))
1605
+ {
1606
+ // literal output only if not in h6 scope (we stay silent in that case)
1607
+ _Wikitext_start_para_if_necessary(parser);
1608
+ rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
1609
+ }
1610
+ }
1611
+ break;
1612
+
1613
+ case H5_END:
1614
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1615
+ rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
1616
+ else
1617
+ {
1618
+ _Wikitext_rollback_failed_external_link(parser); // if any
1619
+ if (!IN(H5_START))
1620
+ {
1621
+ // literal output only if not in h5 scope (we stay silent in that case)
1622
+ _Wikitext_start_para_if_necessary(parser);
1623
+ rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
1624
+ }
1625
+ }
1626
+ break;
1627
+
1628
+ case H4_END:
1629
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1630
+ rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
1631
+ else
1632
+ {
1633
+ _Wikitext_rollback_failed_external_link(parser); // if any
1634
+ if (!IN(H4_START))
1635
+ {
1636
+ // literal output only if not in h4 scope (we stay silent in that case)
1637
+ _Wikitext_start_para_if_necessary(parser);
1638
+ rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
1639
+ }
1640
+ }
1641
+ break;
1642
+
1643
+ case H3_END:
1644
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1645
+ rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
1646
+ else
1647
+ {
1648
+ _Wikitext_rollback_failed_external_link(parser); // if any
1649
+ if (!IN(H3_START))
1650
+ {
1651
+ // literal output only if not in h3 scope (we stay silent in that case)
1652
+ _Wikitext_start_para_if_necessary(parser);
1653
+ rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
1654
+ }
1655
+ }
1656
+ break;
1657
+
1658
+ case H2_END:
1659
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1660
+ rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
1661
+ else
1662
+ {
1663
+ _Wikitext_rollback_failed_external_link(parser); // if any
1664
+ if (!IN(H2_START))
1665
+ {
1666
+ // literal output only if not in h2 scope (we stay silent in that case)
1667
+ _Wikitext_start_para_if_necessary(parser);
1668
+ rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
1669
+ }
1670
+ }
1671
+ break;
1672
+
1673
+ case H1_END:
1674
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1675
+ rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
1676
+ else
1677
+ {
1678
+ _Wikitext_rollback_failed_external_link(parser); // if any
1679
+ if (!IN(H1_START))
1680
+ {
1681
+ // literal output only if not in h1 scope (we stay silent in that case)
1682
+ _Wikitext_start_para_if_necessary(parser);
1683
+ rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
1684
+ }
1685
+ }
1686
+ break;
1687
+
1688
+ case MAIL:
1689
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1690
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1691
+ else
1692
+ {
1693
+ // in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
1694
+ _Wikitext_pop_excess_elements(parser);
1695
+ _Wikitext_start_para_if_necessary(parser);
1696
+ i = TOKEN_TEXT(token);
1697
+ if (parser->autolink == Qtrue)
1698
+ i = _Wikitext_hyperlink(rb_str_new2("mailto:"), i, i, mailto_class);
1699
+ rb_str_append(parser->output, i);
1700
+ }
1701
+ break;
1702
+
1703
+ case URI:
1704
+ if (IN(NO_WIKI_START))
1705
+ // user can temporarily suppress autolinking by using <nowiki></nowiki>
1706
+ // note that unlike MediaWiki, we do allow autolinking inside PRE blocks
1707
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1708
+ else if (IN(LINK_START))
1709
+ {
1710
+ // if the URI were allowed it would have been handled already in LINK_START
1711
+ _Wikitext_rollback_failed_link(parser);
1712
+ i = TOKEN_TEXT(token);
1713
+ if (parser->autolink == Qtrue)
1714
+ i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
1715
+ rb_str_append(parser->output, i);
1716
+ }
1717
+ else if (IN(EXT_LINK_START))
1718
+ {
1719
+ if (NIL_P(parser->link_target))
1720
+ {
1721
+ // this must be our link target: look ahead to make sure we see the space we're expecting to see
1722
+ i = TOKEN_TEXT(token);
1723
+ NEXT_TOKEN();
1724
+ if (token->type == SPACE)
1725
+ {
1726
+ ary_push(parser->scope, SPACE);
1727
+ parser->link_target = i;
1728
+ parser->link_text = rb_str_new2("");
1729
+ parser->capture = parser->link_text;
1730
+ token = NULL; // silently consume space
1731
+ }
1732
+ else
1733
+ {
1734
+ // didn't see the space! this must be an error
1735
+ _Wikitext_pop_from_stack(parser, Qnil);
1736
+ _Wikitext_pop_excess_elements(parser);
1737
+ _Wikitext_start_para_if_necessary(parser);
1738
+ rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
1739
+ if (parser->autolink == Qtrue)
1740
+ i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
1741
+ rb_str_append(parser->output, i);
1742
+ }
1743
+ }
1744
+ else
1745
+ {
1746
+ if (NIL_P(parser->link_text))
1747
+ // this must be the first part of our link text
1748
+ parser->link_text = TOKEN_TEXT(token);
1749
+ else
1750
+ // add to existing link text
1751
+ rb_str_cat(parser->link_text, token->start, TOKEN_LEN(token));
1752
+ }
1753
+ }
1754
+ else
1755
+ {
1756
+ // in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
1757
+ _Wikitext_pop_excess_elements(parser);
1758
+ _Wikitext_start_para_if_necessary(parser);
1759
+ i = TOKEN_TEXT(token);
1760
+ if (parser->autolink == Qtrue)
1761
+ i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
1762
+ rb_str_append(parser->output, i);
1763
+ }
1764
+ break;
1765
+
1766
+ // internal links (links to other wiki articles) look like this:
1767
+ // [[another article]] (would point at, for example, "/wiki/another_article")
1768
+ // [[the other article|the link text we'll use for it]]
1769
+ // [[the other article | the link text we'll use for it]]
1770
+ // note that the forward slash is a reserved character which changes the meaning of an internal link;
1771
+ // this is a link that is external to the wiki but internal to the site as a whole:
1772
+ // [[bug/12]] (a relative link to "/bug/12")
1773
+ // MediaWiki has strict requirements about what it will accept as a link target:
1774
+ // all wikitext markup is disallowed:
1775
+ // example [[foo ''bar'' baz]]
1776
+ // renders [[foo <em>bar</em> baz]] (ie. not a link)
1777
+ // example [[foo <em>bar</em> baz]]
1778
+ // renders [[foo <em>bar</em> baz]] (ie. not a link)
1779
+ // example [[foo <nowiki>''</nowiki> baz]]
1780
+ // renders [[foo '' baz]] (ie. not a link)
1781
+ // example [[foo <bar> baz]]
1782
+ // renders [[foo &lt;bar&gt; baz]] (ie. not a link)
1783
+ // HTML entities and non-ASCII, however, make it through:
1784
+ // example [[foo &euro;]]
1785
+ // renders <a href="/wiki/Foo_%E2%82%AC">foo &euro;</a>
1786
+ // example [[foo €]]
1787
+ // renders <a href="/wiki/Foo_%E2%82%AC">foo €</a>
1788
+ // we'll impose similar restrictions here for the link target; allowed tokens will be:
1789
+ // SPACE, PRINTABLE, DEFAULT, QUOT and AMP
1790
+ // everything else will be rejected
1791
+ case LINK_START:
1792
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1793
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1794
+ rb_str_cat(i, link_start, sizeof(link_start) - 1);
1795
+ else if (IN(EXT_LINK_START))
1796
+ // already in external link scope! (and in fact, must be capturing link_text right now)
1797
+ rb_str_cat(i, link_start, sizeof(link_start) - 1);
1798
+ else if (IN(LINK_START))
1799
+ {
1800
+ // already in internal link scope! this is a syntax error
1801
+ _Wikitext_rollback_failed_link(parser);
1802
+ rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
1803
+ }
1804
+ else if (IN(SEPARATOR))
1805
+ {
1806
+ // scanning internal link text
1807
+ }
1808
+ else // not in internal link scope yet
1809
+ {
1810
+ // will either emit a link, or the rollback of a failed link, so start the para now
1811
+ _Wikitext_pop_excess_elements(parser);
1812
+ _Wikitext_start_para_if_necessary(parser);
1813
+ ary_push(parser->scope, LINK_START);
1814
+
1815
+ // look ahead and try to gobble up link target
1816
+ while (NEXT_TOKEN(), (type = token->type))
1817
+ {
1818
+ if (type == SPACE ||
1819
+ type == PRINTABLE ||
1820
+ type == DEFAULT ||
1821
+ type == QUOT ||
1822
+ type == QUOT_ENTITY ||
1823
+ type == AMP ||
1824
+ type == AMP_ENTITY)
1825
+ {
1826
+ // accumulate these tokens into link_target
1827
+ if (NIL_P(parser->link_target))
1828
+ {
1829
+ parser->link_target = rb_str_new2("");
1830
+ parser->capture = parser->link_target;
1831
+ }
1832
+ if (type == QUOT_ENTITY)
1833
+ // don't insert the entity, insert the literal quote
1834
+ rb_str_cat(parser->link_target, quote, sizeof(quote) - 1);
1835
+ else if (type == AMP_ENTITY)
1836
+ // don't insert the entity, insert the literal ampersand
1837
+ rb_str_cat(parser->link_target, ampersand, sizeof(ampersand) - 1);
1838
+ else
1839
+ rb_str_cat(parser->link_target, token->start, TOKEN_LEN(token));
1840
+ }
1841
+ else if (type == LINK_END)
1842
+ break; // jump back to top of loop (will handle this in LINK_END case below)
1843
+ else if (type == SEPARATOR)
1844
+ {
1845
+ ary_push(parser->scope, SEPARATOR);
1846
+ parser->link_text = rb_str_new2("");
1847
+ parser->capture = parser->link_text;
1848
+ token = NULL;
1849
+ break;
1850
+ }
1851
+ else // unexpected token (syntax error)
1852
+ {
1853
+ _Wikitext_rollback_failed_link(parser);
1854
+ break; // jump back to top of loop to handle unexpected token
1855
+ }
1856
+ }
1857
+
1858
+ // jump to top of the loop to process token we scanned during lookahead (if any)
1859
+ continue;
1860
+ }
1861
+ break;
1862
+
1863
+ case LINK_END:
1864
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1865
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1866
+ rb_str_cat(i, link_end, sizeof(link_end) - 1);
1867
+ else if (IN(EXT_LINK_START))
1868
+ // already in external link scope! (and in fact, must be capturing link_text right now)
1869
+ rb_str_cat(i, link_end, sizeof(link_end) - 1);
1870
+ else if (IN(LINK_START))
1871
+ {
1872
+ // in internal link scope!
1873
+ if (NIL_P(parser->link_text) || RSTRING_LEN(parser->link_text) == 0)
1874
+ // use link target as link text
1875
+ parser->link_text = _Wikitext_parser_sanitize_link_target(parser->link_target, Qtrue);
1876
+ else
1877
+ parser->link_text = _Wikitext_parser_trim_link_target(parser->link_text);
1878
+ _Wikitext_parser_encode_link_target(parser);
1879
+ _Wikitext_pop_from_stack_up_to(parser, i, LINK_START, Qtrue);
1880
+ parser->capture = Qnil;
1881
+ if (parser->special_link)
1882
+ i = _Wikitext_hyperlink(rb_str_new2("/"), parser->link_target, parser->link_text, Qnil);
1883
+ else
1884
+ i = _Wikitext_hyperlink(prefix, parser->link_target, parser->link_text, Qnil);
1885
+ rb_str_append(parser->output, i);
1886
+ parser->link_target = Qnil;
1887
+ parser->link_text = Qnil;
1888
+ }
1889
+ else // wasn't in internal link scope
1890
+ {
1891
+ _Wikitext_pop_excess_elements(parser);
1892
+ _Wikitext_start_para_if_necessary(parser);
1893
+ rb_str_cat(i, link_end, sizeof(link_end) - 1);
1894
+ }
1895
+ break;
1896
+
1897
+ // external links look like this:
1898
+ // [http://google.com/ the link text]
1899
+ // strings in square brackets which don't match this syntax get passed through literally; eg:
1900
+ // he was very angery [sic] about the turn of events
1901
+ case EXT_LINK_START:
1902
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1903
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1904
+ rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
1905
+ else if (IN(EXT_LINK_START))
1906
+ // already in external link scope! (and in fact, must be capturing link_text right now)
1907
+ rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
1908
+ else if (IN(LINK_START))
1909
+ {
1910
+ // already in internal link scope!
1911
+ i = rb_str_new(ext_link_start, sizeof(ext_link_start) - 1);
1912
+ if (NIL_P(parser->link_target))
1913
+ // this must be the first character of our link target
1914
+ parser->link_target = i;
1915
+ else if (IN(SPACE))
1916
+ {
1917
+ // link target has already been scanned
1918
+ if (NIL_P(parser->link_text))
1919
+ // this must be the first character of our link text
1920
+ parser->link_text = i;
1921
+ else
1922
+ // add to existing link text
1923
+ rb_str_append(parser->link_text, i);
1924
+ }
1925
+ else
1926
+ // add to existing link target
1927
+ rb_str_append(parser->link_target, i);
1928
+ }
1929
+ else // not in external link scope yet
1930
+ {
1931
+ // will either emit a link, or the rollback of a failed link, so start the para now
1932
+ _Wikitext_pop_excess_elements(parser);
1933
+ _Wikitext_start_para_if_necessary(parser);
1934
+
1935
+ // look ahead: expect a URI
1936
+ NEXT_TOKEN();
1937
+ if (token->type == URI)
1938
+ ary_push(parser->scope, EXT_LINK_START); // so far so good, jump back to the top of the loop
1939
+ else
1940
+ // only get here if there was a syntax error (missing URI)
1941
+ rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
1942
+ continue; // jump back to top of loop to handle token (either URI or whatever it is)
1943
+ }
1944
+ break;
1945
+
1946
+ case EXT_LINK_END:
1947
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1948
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1949
+ rb_str_cat(i, ext_link_end, sizeof(ext_link_end) - 1);
1950
+ else if (IN(EXT_LINK_START))
1951
+ {
1952
+ if (NIL_P(parser->link_text))
1953
+ // syntax error: external link with no link text
1954
+ _Wikitext_rollback_failed_external_link(parser);
1955
+ else
1956
+ {
1957
+ // success!
1958
+ _Wikitext_pop_from_stack_up_to(parser, i, EXT_LINK_START, Qtrue);
1959
+ parser->capture = Qnil;
1960
+ i = _Wikitext_hyperlink(Qnil, parser->link_target, parser->link_text, parser->external_link_class);
1961
+ rb_str_append(parser->output, i);
1962
+ }
1963
+ parser->link_target = Qnil;
1964
+ parser->link_text = Qnil;
1965
+ }
1966
+ else
1967
+ {
1968
+ _Wikitext_pop_excess_elements(parser);
1969
+ _Wikitext_start_para_if_necessary(parser);
1970
+ rb_str_cat(parser->output, ext_link_end, sizeof(ext_link_end) - 1);
1971
+ }
1972
+ break;
1973
+
1974
+ case SEPARATOR:
1975
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1976
+ _Wikitext_pop_excess_elements(parser);
1977
+ _Wikitext_start_para_if_necessary(parser);
1978
+ rb_str_cat(i, separator, sizeof(separator) - 1);
1979
+ break;
1980
+
1981
+ case SPACE:
1982
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1983
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1984
+ rb_str_cat(i, token->start, TOKEN_LEN(token));
1985
+ else
1986
+ {
1987
+ // peek ahead to see next token
1988
+ char *token_ptr = token->start;
1989
+ int token_len = TOKEN_LEN(token);
1990
+ NEXT_TOKEN();
1991
+ type = token->type;
1992
+ if (((type == H6_END) && IN(H6_START)) ||
1993
+ ((type == H5_END) && IN(H5_START)) ||
1994
+ ((type == H4_END) && IN(H4_START)) ||
1995
+ ((type == H3_END) && IN(H3_START)) ||
1996
+ ((type == H2_END) && IN(H2_START)) ||
1997
+ ((type == H1_END) && IN(H1_START)))
1998
+ {
1999
+ // will suppress emission of space (discard) if next token is a H6_END, H5_END etc and we are in the corresponding scope
2000
+ }
2001
+ else
2002
+ {
2003
+ // emit the space
2004
+ _Wikitext_pop_excess_elements(parser);
2005
+ _Wikitext_start_para_if_necessary(parser);
2006
+ rb_str_cat(i, token_ptr, token_len);
2007
+ }
2008
+
2009
+ // jump to top of the loop to process token we scanned during lookahead
2010
+ continue;
2011
+ }
2012
+ break;
2013
+
2014
+ case QUOT_ENTITY:
2015
+ case AMP_ENTITY:
2016
+ case NAMED_ENTITY:
2017
+ case DECIMAL_ENTITY:
2018
+ // pass these through unaltered as they are case sensitive
2019
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2020
+ _Wikitext_pop_excess_elements(parser);
2021
+ _Wikitext_start_para_if_necessary(parser);
2022
+ rb_str_cat(i, token->start, TOKEN_LEN(token));
2023
+ break;
2024
+
2025
+ case HEX_ENTITY:
2026
+ // normalize hex entities (downcase them)
2027
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2028
+ _Wikitext_pop_excess_elements(parser);
2029
+ _Wikitext_start_para_if_necessary(parser);
2030
+ rb_str_append(i, _Wikitext_downcase(TOKEN_TEXT(token)));
2031
+ break;
2032
+
2033
+ case QUOT:
2034
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2035
+ _Wikitext_pop_excess_elements(parser);
2036
+ _Wikitext_start_para_if_necessary(parser);
2037
+ rb_str_cat(i, quot_entity, sizeof(quot_entity) - 1);
2038
+ break;
2039
+
2040
+ case AMP:
2041
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2042
+ _Wikitext_pop_excess_elements(parser);
2043
+ _Wikitext_start_para_if_necessary(parser);
2044
+ rb_str_cat(i, amp_entity, sizeof(amp_entity) - 1);
2045
+ break;
2046
+
2047
+ case LESS:
2048
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2049
+ _Wikitext_pop_excess_elements(parser);
2050
+ _Wikitext_start_para_if_necessary(parser);
2051
+ rb_str_cat(i, lt_entity, sizeof(lt_entity) - 1);
2052
+ break;
2053
+
2054
+ case GREATER:
2055
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2056
+ _Wikitext_pop_excess_elements(parser);
2057
+ _Wikitext_start_para_if_necessary(parser);
2058
+ rb_str_cat(i, gt_entity, sizeof(gt_entity) - 1);
2059
+ break;
2060
+
2061
+ case CRLF:
2062
+ parser->pending_crlf = Qfalse;
2063
+ _Wikitext_rollback_failed_link(parser); // if any
2064
+ _Wikitext_rollback_failed_external_link(parser); // if any
2065
+ if (IN(NO_WIKI_START) || IN(PRE_START))
2066
+ {
2067
+ ary_clear(parser->line_buffer);
2068
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
2069
+ break;
2070
+ }
2071
+ else if (IN(PRE))
2072
+ {
2073
+ // beware when nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, that must be end of PRE block
2074
+ if (NO_ITEM(ary_entry(parser->line_buffer, -2)) || ary_entry(parser->line_buffer, -2) == BLOCKQUOTE)
2075
+ // don't emit in this case
2076
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
2077
+ else
2078
+ {
2079
+ // peek ahead to see if this is definitely the end of the PRE block
2080
+ NEXT_TOKEN();
2081
+ type = token->type;
2082
+ if (type != BLOCKQUOTE && type != PRE)
2083
+ {
2084
+ // this is definitely the end of the block, so don't emit
2085
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
2086
+ }
2087
+ else
2088
+ // potentially will emit
2089
+ parser->pending_crlf = Qtrue;
2090
+
2091
+ // delete the entire contents of the line scope stack and buffer
2092
+ ary_clear(parser->line);
2093
+ ary_clear(parser->line_buffer);
2094
+ continue; // jump back to top of loop to handle token grabbed via lookahead
2095
+ }
2096
+ }
2097
+ else
2098
+ {
2099
+ parser->pending_crlf = Qtrue;
2100
+
2101
+ // count number of BLOCKQUOTE tokens in line buffer (can be zero) and pop back to that level
2102
+ // as a side effect, this handles any open span-level elements and unclosed blocks
2103
+ // (with special handling for P blocks and LI elements)
2104
+ i = ary_count(parser->line, BLOCKQUOTE) + ary_count(parser->scope, BLOCKQUOTE_START);
2105
+ for (j = parser->scope->count; j > i; j--)
2106
+ {
2107
+ if (parser->scope->count > 0 && ary_entry(parser->scope, -1) == LI)
2108
+ {
2109
+ parser->pending_crlf = Qfalse;
2110
+ break;
2111
+ }
2112
+
2113
+ // special handling on last iteration through the loop if the top item on the scope is a P block
2114
+ if ((j - i == 1) && ary_entry(parser->scope, -1) == P)
2115
+ {
2116
+ // if nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, this must be a paragraph break
2117
+ // (note that we have to make sure we're not inside a BLOCKQUOTE_START block
2118
+ // because in those blocks BLOCKQUOTE tokens have no special meaning)
2119
+ if (NO_ITEM(ary_entry(parser->line_buffer, -2)) ||
2120
+ (ary_entry(parser->line_buffer, -2) == BLOCKQUOTE && !IN(BLOCKQUOTE_START)))
2121
+ // paragraph break
2122
+ parser->pending_crlf = Qfalse;
2123
+ else
2124
+ // not a paragraph break!
2125
+ continue;
2126
+ }
2127
+ _Wikitext_pop_from_stack(parser, Qnil);
2128
+ }
2129
+ }
2130
+
2131
+ // delete the entire contents of the line scope stack and buffer
2132
+ ary_clear(parser->line);
2133
+ ary_clear(parser->line_buffer);
2134
+ break;
2135
+
2136
+ case PRINTABLE:
2137
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2138
+ _Wikitext_pop_excess_elements(parser);
2139
+ _Wikitext_start_para_if_necessary(parser);
2140
+ rb_str_cat(i, token->start, TOKEN_LEN(token));
2141
+ break;
2142
+
2143
+ case DEFAULT:
2144
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2145
+ _Wikitext_pop_excess_elements(parser);
2146
+ _Wikitext_start_para_if_necessary(parser);
2147
+ rb_str_append(i, _Wikitext_utf32_char_to_entity(token->code_point)); // convert to entity
2148
+ break;
2149
+
2150
+ case END_OF_FILE:
2151
+ // close any open scopes on hitting EOF
2152
+ _Wikitext_rollback_failed_external_link(parser); // if any
2153
+ _Wikitext_rollback_failed_link(parser); // if any
2154
+ for (i = 0, j = parser->scope->count; i < j; i++)
2155
+ _Wikitext_pop_from_stack(parser, Qnil);
2156
+ goto return_output; // break not enough here (want to break out of outer while loop, not inner switch statement)
2157
+
2158
+ default:
2159
+ break;
2160
+ }
2161
+
2162
+ // reset current token; forcing lexer to return another token at the top of the loop
2163
+ token = NULL;
2164
+ } while (1);
2165
+ return_output:
2166
+ // BUG: these will leak if we exit this function by raising an exception; need to investigate using Data_Wrap_Struct
2167
+ ary_free(parser->scope);
2168
+ ary_free(parser->line);
2169
+ ary_free(parser->line_buffer);
2170
+ str_free(parser->line_ending);
2171
+ if (parser->tabulation)
2172
+ str_free(parser->tabulation);
2173
+ return parser->output;
2174
+ }