wikitext 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/ary.h ADDED
@@ -0,0 +1,99 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include <ruby/ruby.h>
16
+
17
+ typedef struct
18
+ {
19
+ int count;
20
+ int max;
21
+ int *entries;
22
+ } ary_t;
23
+
24
+ // in the test suite array count goes no higher than 25 or 26
25
+ #define DEFAULT_ENTRY_COUNT 64
26
+
27
+ #define NO_ITEM(item) (item == INT_MAX)
28
+
29
+ inline ary_t *ary_new(void)
30
+ {
31
+ ary_t *ary = ALLOC_N(ary_t, 1);
32
+ ary->count = 0;
33
+ ary->max = DEFAULT_ENTRY_COUNT;
34
+ ary->entries = ALLOC_N(int, DEFAULT_ENTRY_COUNT);
35
+ return ary;
36
+ }
37
+
38
+ inline void ary_free(ary_t *ary)
39
+ {
40
+ free(ary->entries);
41
+ free(ary);
42
+ }
43
+
44
+ inline int ary_entry(ary_t *ary, int idx)
45
+ {
46
+ if (idx < 0)
47
+ idx = ary->count + idx;
48
+ return (idx >= 0 && ary->count > idx) ? ary->entries[idx] : INT_MAX;
49
+ }
50
+
51
+ inline void ary_clear(ary_t *ary)
52
+ {
53
+ ary->count = 0;
54
+ }
55
+
56
+ inline int ary_pop(ary_t *ary)
57
+ {
58
+ if (ary->count > 0)
59
+ {
60
+ ary->count--;
61
+ return 1;
62
+ }
63
+ return 0;
64
+ }
65
+
66
+ inline void ary_push(ary_t *ary, int val)
67
+ {
68
+ if (ary->count == ary->max)
69
+ {
70
+ ary->max += DEFAULT_ENTRY_COUNT;
71
+ REALLOC_N(ary->entries, int, ary->max);
72
+ }
73
+ ary->entries[ary->count] = val;
74
+ ary->count++;
75
+ }
76
+
77
+ inline int ary_includes(ary_t *ary, int val)
78
+ {
79
+ for (int i = 0, max = ary->count; i < max; i++)
80
+ {
81
+ if (ary->entries[i] == val)
82
+ return 1;
83
+ }
84
+ return 0;
85
+ }
86
+
87
+ // returns a count indicating the number of times the value appears in the collection
88
+ // refactored from _Wikitext_count()
89
+ inline int ary_count(ary_t *ary, int item)
90
+ {
91
+ int count = 0;
92
+ for (int i = 0, max = ary->count; i < max; i++)
93
+ {
94
+ if (ary->entries[i] == item)
95
+ count++;
96
+ }
97
+ return count;
98
+ }
99
+
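To make the stack interface above concrete, here is a minimal standalone sketch of how such a scope stack gets used. It substitutes plain malloc/realloc for Ruby's ALLOC_N/REALLOC_N so it can run outside the Ruby VM, and the P/STRONG constants are invented stand-ins for the real token types:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    // standalone re-creation of the ary.h interface, for illustration only
    typedef struct { int count; int max; int *entries; } ary_t;

    static ary_t *ary_new(void)
    {
        ary_t *ary = malloc(sizeof(ary_t));
        ary->count = 0;
        ary->max = 64;
        ary->entries = malloc(ary->max * sizeof(int));
        return ary;
    }

    static void ary_push(ary_t *ary, int val)
    {
        if (ary->count == ary->max)
        {
            ary->max += 64;
            ary->entries = realloc(ary->entries, ary->max * sizeof(int));
        }
        ary->entries[ary->count++] = val;
    }

    static int ary_entry(ary_t *ary, int idx)
    {
        if (idx < 0)
            idx = ary->count + idx; // negative indices count back from the end
        return (idx >= 0 && ary->count > idx) ? ary->entries[idx] : INT_MAX;
    }

    int main(void)
    {
        enum { P = 1, STRONG = 2 }; // invented stand-ins for the parser's scope tokens
        ary_t *scope = ary_new();
        ary_push(scope, P);
        ary_push(scope, STRONG);
        printf("top of scope stack: %d\n", ary_entry(scope, -1)); // prints 2
        scope->count--;                                           // what ary_pop does
        printf("top of scope stack: %d\n", ary_entry(scope, -1)); // prints 1
        free(scope->entries);
        free(scope);
        return 0;
    }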
data/ext/depend ADDED
@@ -0,0 +1,22 @@
1
+ # depend
2
+ # Additional material for Makefile
3
+ # Copyright 2008 Wincent Colaiuta
4
+ # This program is free software: you can redistribute it and/or modify
5
+ # it under the terms of the GNU General Public License as published by
6
+ # the Free Software Foundation, either version 3 of the License, or
7
+ # (at your option) any later version.
8
+ #
9
+ # This program is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ # GNU General Public License for more details.
13
+ #
14
+ # You should have received a copy of the GNU General Public License
15
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
+
17
+ CFLAGS += -std=gnu99
18
+
19
+ parser.o : ary.h parser.c parser.h token.h str.h wikitext.h wikitext_ragel.h
20
+ token.o : token.c token.h wikitext.h
21
+ wikitext.o : parser.h token.h wikitext.c wikitext.h wikitext_ragel.h
22
+ wikitext_ragel.o : token.h wikitext.h wikitext_ragel.h wikitext_ragel.c
data/ext/extconf.rb ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright 2008 Wincent Colaiuta
2
+ # This program is free software: you can redistribute it and/or modify
3
+ # it under the terms of the GNU General Public License as published by
4
+ # the Free Software Foundation, either version 3 of the License, or
5
+ # (at your option) any later version.
6
+ #
7
+ # This program is distributed in the hope that it will be useful,
8
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ # GNU General Public License for more details.
11
+ #
12
+ # You should have received a copy of the GNU General Public License
13
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ require 'mkmf'
16
+
17
+ def missing item
18
+ puts "couldn't find #{item} (required)"
19
+ exit 1
20
+ end
21
+
22
+ have_header('ruby.h') or missing 'ruby.h'
23
+ create_makefile('wikitext')
data/ext/parser.c ADDED
@@ -0,0 +1,2174 @@
1
+ // Copyright 2007-2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include "parser.h"
16
+ #include "ary.h"
17
+ #include "str.h"
18
+ #include "wikitext.h"
19
+ #include "wikitext_ragel.h"
20
+
21
+ #define IN(type) ary_includes(parser->scope, type)
22
+
23
+ // poor man's object orientation in C:
24
+ // instead of passing around multiple parameters between functions in the parser
25
+ // we pack everything into a struct and pass around only a pointer to that
26
+ typedef struct
27
+ {
28
+ VALUE output; // for accumulating output to be returned
29
+ VALUE capture; // for capturing substrings
30
+ VALUE link_target; // short term "memory" for parsing links
31
+ VALUE link_text; // short term "memory" for parsing links
32
+ VALUE external_link_class; // CSS class applied to external links
33
+ ary_t *scope; // stack for tracking scope
34
+ ary_t *line; // stack for tracking scope as implied by current line
35
+ ary_t *line_buffer; // stack for tracking raw tokens (not scope) on current line
36
+ VALUE pending_crlf; // boolean (Qtrue or Qfalse)
37
+ VALUE autolink; // boolean (Qtrue or Qfalse)
38
+ VALUE treat_slash_as_special; // boolean (Qtrue or Qfalse)
39
+ VALUE special_link; // boolean (Qtrue or Qfalse): is the current link_target a "special" link?
40
+ str_t *line_ending;
41
+ int base_indent; // controlled by the :indent option to Wikitext::Parser#parse
42
+ int current_indent; // fluctuates according to currently nested structures
43
+ str_t *tabulation; // caching buffer for emitting indentation
44
+ } parser_t;
45
+
46
+ const char escaped_no_wiki_start[] = "&lt;nowiki&gt;";
47
+ const char escaped_no_wiki_end[] = "&lt;/nowiki&gt;";
48
+ const char literal_strong_em[] = "'''''";
49
+ const char literal_strong[] = "'''";
50
+ const char literal_em[] = "''";
51
+ const char escaped_em_start[] = "&lt;em&gt;";
52
+ const char escaped_em_end[] = "&lt;/em&gt;";
53
+ const char escaped_strong_start[] = "&lt;strong&gt;";
54
+ const char escaped_strong_end[] = "&lt;/strong&gt;";
55
+ const char escaped_tt_start[] = "&lt;tt&gt;";
56
+ const char escaped_tt_end[] = "&lt;/tt&gt;";
57
+ const char literal_h6[] = "======";
58
+ const char literal_h5[] = "=====";
59
+ const char literal_h4[] = "====";
60
+ const char literal_h3[] = "===";
61
+ const char literal_h2[] = "==";
62
+ const char literal_h1[] = "=";
63
+ const char pre_start[] = "<pre>";
64
+ const char pre_end[] = "</pre>";
65
+ const char escaped_pre_start[] = "&lt;pre&gt;";
66
+ const char escaped_pre_end[] = "&lt;/pre&gt;";
67
+ const char blockquote_start[] = "<blockquote>";
68
+ const char blockquote_end[] = "</blockquote>";
69
+ const char escaped_blockquote_start[] = "&lt;blockquote&gt;";
70
+ const char escaped_blockquote_end[] = "&lt;/blockquote&gt;";
71
+ const char strong_em_start[] = "<strong><em>";
72
+ const char strong_start[] = "<strong>";
73
+ const char strong_end[] = "</strong>";
74
+ const char em_start[] = "<em>";
75
+ const char em_end[] = "</em>";
76
+ const char tt_start[] = "<tt>";
77
+ const char tt_end[] = "</tt>";
78
+ const char ol_start[] = "<ol>";
79
+ const char ol_end[] = "</ol>";
80
+ const char ul_start[] = "<ul>";
81
+ const char ul_end[] = "</ul>";
82
+ const char li_start[] = "<li>";
83
+ const char li_end[] = "</li>";
84
+ const char h6_start[] = "<h6>";
85
+ const char h6_end[] = "</h6>";
86
+ const char h5_start[] = "<h5>";
87
+ const char h5_end[] = "</h5>";
88
+ const char h4_start[] = "<h4>";
89
+ const char h4_end[] = "</h4>";
90
+ const char h3_start[] = "<h3>";
91
+ const char h3_end[] = "</h3>";
92
+ const char h2_start[] = "<h2>";
93
+ const char h2_end[] = "</h2>";
94
+ const char h1_start[] = "<h1>";
95
+ const char h1_end[] = "</h1>";
96
+ const char p_start[] = "<p>";
97
+ const char p_end[] = "</p>";
98
+ const char space[] = " ";
99
+ const char a_start[] = "<a href=\"";
100
+ const char a_class[] = "\" class=\"";
101
+ const char a_start_close[] = "\">";
102
+ const char a_end[] = "</a>";
103
+ const char link_start[] = "[[";
104
+ const char link_end[] = "]]";
105
+ const char separator[] = "|";
106
+ const char ext_link_start[] = "[";
107
+ const char backtick[] = "`";
108
+ const char quote[] = "\"";
109
+ const char ampersand[] = "&";
110
+ const char quot_entity[] = "&quot;";
111
+ const char amp_entity[] = "&amp;";
112
+ const char lt_entity[] = "&lt;";
113
+ const char gt_entity[] = "&gt;";
114
+ const char escaped_blockquote[] = "&gt; ";
115
+ const char ext_link_end[] = "]";
116
+
117
+ // for testing and debugging only
118
+ VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
119
+ {
120
+ if (NIL_P(string))
121
+ return Qnil;
122
+ string = StringValue(string);
123
+ VALUE tokens = rb_ary_new();
124
+ char *p = RSTRING_PTR(string);
125
+ long len = RSTRING_LEN(string);
126
+ char *pe = p + len;
127
+ token_t token;
128
+ next_token(&token, NULL, p, pe);
129
+ rb_ary_push(tokens, _Wikitext_token(&token));
130
+ while (token.type != END_OF_FILE)
131
+ {
132
+ next_token(&token, &token, NULL, pe);
133
+ rb_ary_push(tokens, _Wikitext_token(&token));
134
+ }
135
+ return tokens;
136
+ }
137
+
138
+ // for benchmarking raw tokenization speed only
139
+ VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
140
+ {
141
+ if (NIL_P(string))
142
+ return Qnil;
143
+ string = StringValue(string);
144
+ char *p = RSTRING_PTR(string);
145
+ long len = RSTRING_LEN(string);
146
+ char *pe = p + len;
147
+ token_t token;
148
+ next_token(&token, NULL, p, pe);
149
+ while (token.type != END_OF_FILE)
150
+ next_token(&token, &token, NULL, pe);
151
+ return Qnil;
152
+ }
153
+
154
+ // we downcase "in place", overwriting the original contents of the buffer and returning the same string
155
+ inline VALUE _Wikitext_downcase(VALUE string)
156
+ {
157
+ char *ptr = RSTRING_PTR(string);
158
+ long len = RSTRING_LEN(string);
159
+ for (long i = 0; i < len; i++)
160
+ {
161
+ if (ptr[i] >= 'A' && ptr[i] <= 'Z')
162
+ ptr[i] += 32;
163
+ }
164
+ return string;
165
+ }
166
+
167
+ inline VALUE _Wikitext_hyperlink(VALUE link_prefix, VALUE link_target, VALUE link_text, VALUE link_class)
168
+ {
169
+ VALUE string = rb_str_new(a_start, sizeof(a_start) - 1); // <a href="
170
+ if (!NIL_P(link_prefix))
171
+ rb_str_append(string, link_prefix);
172
+ rb_str_append(string, link_target);
173
+ if (link_class != Qnil)
174
+ {
175
+ rb_str_cat(string, a_class, sizeof(a_class) - 1); // " class="
176
+ rb_str_append(string, link_class);
177
+ }
178
+ rb_str_cat(string, a_start_close, sizeof(a_start_close) - 1); // ">
179
+ rb_str_append(string, link_text);
180
+ rb_str_cat(string, a_end, sizeof(a_end) - 1);
181
+ return string;
182
+ }
183
+
184
+ // will emit indentation only if we are about to emit any of:
185
+ // <blockquote>, <p>, <ul>, <ol>, <li>, <h1> etc, <pre>
186
+ // each time we enter one of those spans we must increment the indentation level
187
+ inline void _Wikitext_indent(parser_t *parser)
188
+ {
189
+ int space_count = parser->current_indent + parser->base_indent;
190
+ if (space_count > 0)
191
+ {
192
+ char *old_end, *new_end;
193
+ if (!parser->tabulation)
194
+ {
195
+ parser->tabulation = str_new_size(space_count);
196
+ old_end = parser->tabulation->ptr;
197
+ }
198
+ else if (parser->tabulation->len < space_count)
199
+ {
200
+ old_end = parser->tabulation->ptr;
201
+ str_grow(parser->tabulation, space_count);
202
+ }
203
+ else
204
+ old_end = parser->tabulation->ptr;
205
+ new_end = parser->tabulation->ptr + space_count;
206
+ while (old_end < new_end)
207
+ *old_end++ = ' ';
208
+ rb_str_cat(parser->output, parser->tabulation->ptr, space_count);
209
+ }
210
+ parser->current_indent += 2;
211
+ }
212
+
213
+ inline void _Wikitext_dedent(parser_t *parser, VALUE emit)
214
+ {
215
+ parser->current_indent -= 2;
216
+ if (emit != Qtrue)
217
+ return;
218
+ int space_count = parser->current_indent + parser->base_indent;
219
+ if (space_count > 0)
220
+ rb_str_cat(parser->output, parser->tabulation->ptr, space_count);
221
+ }
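A rough standalone sketch of the indentation arithmetic above: block-level output is prefixed with base_indent spaces (the :indent option) plus two spaces per nesting level entered so far. The tag strings and the base_indent value are illustrative only:

    #include <stdio.h>

    static void emit_indented(int base_indent, int current_indent, const char *tag)
    {
        int space_count = current_indent + base_indent;
        printf("%*s%s\n", space_count, "", tag); // space_count spaces, then the tag
    }

    int main(void)
    {
        int base_indent = 4;    // as if parse() had been called with :indent => 4
        int current_indent = 0;
        emit_indented(base_indent, current_indent, "<blockquote>"); // 4 spaces
        current_indent += 2;    // what _Wikitext_indent does after emitting
        emit_indented(base_indent, current_indent, "<p>");          // 6 spaces
        current_indent -= 2;    // what _Wikitext_dedent does before (optionally) emitting
        return 0;
    }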
222
+
223
+ // Pops a single item off the parser's scope stack.
224
+ // A corresponding closing tag is written to the target string.
225
+ // The target string may be the main output buffer, or a substring capturing buffer if a link is being scanned.
226
+ void _Wikitext_pop_from_stack(parser_t *parser, VALUE target)
227
+ {
228
+ int top = ary_entry(parser->scope, -1);
229
+ if (NO_ITEM(top))
230
+ return;
231
+ if (NIL_P(target))
232
+ target = parser->output;
233
+ switch (top)
234
+ {
235
+ case PRE:
236
+ case PRE_START:
237
+ rb_str_cat(target, pre_end, sizeof(pre_end) - 1);
238
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
239
+ _Wikitext_dedent(parser, Qfalse);
240
+ break;
241
+
242
+ case BLOCKQUOTE:
243
+ case BLOCKQUOTE_START:
244
+ _Wikitext_dedent(parser, Qtrue);
245
+ rb_str_cat(target, blockquote_end, sizeof(blockquote_end) - 1);
246
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
247
+ break;
248
+
249
+ case NO_WIKI_START:
250
+ // not a real HTML tag; so nothing to pop
251
+ break;
252
+
253
+ case STRONG:
254
+ case STRONG_START:
255
+ rb_str_cat(target, strong_end, sizeof(strong_end) - 1);
256
+ break;
257
+
258
+ case EM:
259
+ case EM_START:
260
+ rb_str_cat(target, em_end, sizeof(em_end) - 1);
261
+ break;
262
+
263
+ case TT:
264
+ case TT_START:
265
+ rb_str_cat(target, tt_end, sizeof(tt_end) - 1);
266
+ break;
267
+
268
+ case OL:
269
+ _Wikitext_dedent(parser, Qtrue);
270
+ rb_str_cat(target, ol_end, sizeof(ol_end) - 1);
271
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
272
+ break;
273
+
274
+ case UL:
275
+ _Wikitext_dedent(parser, Qtrue);
276
+ rb_str_cat(target, ul_end, sizeof(ul_end) - 1);
277
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
278
+ break;
279
+
280
+ case NESTED_LIST:
281
+ // next token to pop will be a LI
282
+ // LI is an interesting token because sometimes we want it to behave like P (ie. do a non-emitting dedent)
283
+ // and other times we want it to behave like BLOCKQUOTE (ie. when it has a nested list inside)
284
+ // hence this hack: we do an emitting dedent on behalf of the LI that we know must be coming
285
+ // and then when we pop the actual LI itself (below) we do the standard non-emitting dedent
286
+ _Wikitext_dedent(parser, Qtrue); // we really only want to emit the spaces
287
+ parser->current_indent += 2; // we don't want to decrement the actual indent level, so put it back
288
+ break;
289
+
290
+ case LI:
291
+ rb_str_cat(target, li_end, sizeof(li_end) - 1);
292
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
293
+ _Wikitext_dedent(parser, Qfalse);
294
+ break;
295
+
296
+ case H6_START:
297
+ rb_str_cat(target, h6_end, sizeof(h6_end) - 1);
298
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
299
+ _Wikitext_dedent(parser, Qfalse);
300
+ break;
301
+
302
+ case H5_START:
303
+ rb_str_cat(target, h5_end, sizeof(h5_end) - 1);
304
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
305
+ _Wikitext_dedent(parser, Qfalse);
306
+ break;
307
+
308
+ case H4_START:
309
+ rb_str_cat(target, h4_end, sizeof(h4_end) - 1);
310
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
311
+ _Wikitext_dedent(parser, Qfalse);
312
+ break;
313
+
314
+ case H3_START:
315
+ rb_str_cat(target, h3_end, sizeof(h3_end) - 1);
316
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
317
+ _Wikitext_dedent(parser, Qfalse);
318
+ break;
319
+
320
+ case H2_START:
321
+ rb_str_cat(target, h2_end, sizeof(h2_end) - 1);
322
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
323
+ _Wikitext_dedent(parser, Qfalse);
324
+ break;
325
+
326
+ case H1_START:
327
+ rb_str_cat(target, h1_end, sizeof(h1_end) - 1);
328
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
329
+ _Wikitext_dedent(parser, Qfalse);
330
+ break;
331
+
332
+ case LINK_START:
333
+ // not an HTML tag; so nothing to emit
334
+ break;
335
+
336
+ case EXT_LINK_START:
337
+ // not an HTML tag; so nothing to emit
338
+ break;
339
+
340
+ case SPACE:
341
+ // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
342
+ break;
343
+
344
+ case SEPARATOR:
345
+ // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
346
+ break;
347
+
348
+ case P:
349
+ rb_str_cat(target, p_end, sizeof(p_end) - 1);
350
+ rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
351
+ _Wikitext_dedent(parser, Qfalse);
352
+ break;
353
+
354
+ case END_OF_FILE:
355
+ // nothing to do
356
+ break;
357
+
358
+ default:
359
+ // should probably raise an exception here
360
+ break;
361
+ }
362
+ ary_pop(parser->scope);
363
+ }
364
+
365
+ // Pops items off the top of parser's scope stack, accumulating closing tags for them into the target string, until item is reached.
366
+ // If including is Qtrue then the item itself is also popped.
367
+ // The target string may be the main output buffer, or a substring capturing buffer when scanning links.
368
+ void _Wikitext_pop_from_stack_up_to(parser_t *parser, VALUE target, int item, VALUE including)
369
+ {
370
+ int continue_looping = 1;
371
+ do
372
+ {
373
+ int top = ary_entry(parser->scope, -1);
374
+ if (NO_ITEM(top))
375
+ return;
376
+ if (top == item)
377
+ {
378
+ if (including != Qtrue)
379
+ return;
380
+ continue_looping = 0;
381
+ }
382
+ _Wikitext_pop_from_stack(parser, target);
383
+ } while (continue_looping);
384
+ }
385
+
386
+ inline void _Wikitext_start_para_if_necessary(parser_t *parser)
387
+ {
388
+ if (!NIL_P(parser->capture)) // we don't do anything if in capturing mode
389
+ return;
390
+
391
+ // if no block open yet, or top of stack is BLOCKQUOTE/BLOCKQUOTE_START (with nothing in it yet)
392
+ if (parser->scope->count == 0 ||
393
+ ary_entry(parser->scope, -1) == BLOCKQUOTE ||
394
+ ary_entry(parser->scope, -1) == BLOCKQUOTE_START)
395
+ {
396
+ _Wikitext_indent(parser);
397
+ rb_str_cat(parser->output, p_start, sizeof(p_start) - 1);
398
+ ary_push(parser->scope, P);
399
+ ary_push(parser->line, P);
400
+ }
401
+ else if (parser->pending_crlf == Qtrue)
402
+ {
403
+ if (IN(P))
404
+ // already in a paragraph block; convert pending CRLF into a space
405
+ rb_str_cat(parser->output, space, sizeof(space) - 1);
406
+ else if (IN(PRE))
407
+ // PRE blocks can have pending CRLF too (helps us avoid emitting the trailing newline)
408
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
409
+ }
410
+ parser->pending_crlf = Qfalse;
411
+ }
412
+
413
+ // Helper function that pops any excess elements off scope (pushing is already handled in the respective rules).
414
+ // For example, given input like:
415
+ //
416
+ // > > foo
417
+ // bar
418
+ //
419
+ // Upon seeing "bar", we want to pop two BLOCKQUOTE elements from the scope.
420
+ // The reverse case (shown below) is handled from inside the BLOCKQUOTE rule itself:
421
+ //
422
+ // foo
423
+ // > > bar
424
+ //
425
+ // Things are made slightly more complicated by the fact that there is one block-level tag that can be on the scope
426
+ // but not on the line scope:
427
+ //
428
+ // <blockquote>foo
429
+ // bar</blockquote>
430
+ //
431
+ // Here on seeing "bar" we have one item on the scope (BLOCKQUOTE_START) which we don't want to pop, but we have nothing
432
+ // on the line scope.
433
+ // Luckily, BLOCKQUOTE_START tokens can only appear at the start of the scope array, so we can check for them first before
434
+ // entering the for loop.
435
+ void inline _Wikitext_pop_excess_elements(parser_t *parser)
436
+ {
437
+ if (!NIL_P(parser->capture)) // we don't pop anything if in capturing mode
438
+ return;
439
+ for (int i = parser->scope->count - ary_count(parser->scope, BLOCKQUOTE_START), j = parser->line->count; i > j; i--)
440
+ {
441
+ // special case for last item on scope
442
+ if (i - j == 1)
443
+ {
444
+ // don't auto-pop P if it is only item on scope
445
+ if (ary_entry(parser->scope, -1) == P)
446
+ {
447
+ // add P to the line scope to prevent us entering the loop at all next time around
448
+ ary_push(parser->line, P);
449
+ continue;
450
+ }
451
+ }
452
+ _Wikitext_pop_from_stack(parser, parser->output);
453
+ }
454
+ }
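To make the counting in _Wikitext_pop_excess_elements concrete, here is a tiny standalone sketch using the examples from the comment above (the counts are worked out by hand; the helper function is illustrative, not part of the extension):

    #include <stdio.h>

    // candidates for popping = scope depth, minus any literal <blockquote> tags
    // (BLOCKQUOTE_START can sit on the scope without appearing on the line scope),
    // minus whatever the current line has already pushed
    static int excess(int scope_count, int blockquote_start_count, int line_count)
    {
        return (scope_count - blockquote_start_count) - line_count;
    }

    int main(void)
    {
        // "foo\nbar": scope holds just P when "bar" begins, line scope is empty;
        // one candidate, but the special case keeps a lone trailing P so that
        // consecutive text lines merge into a single paragraph
        printf("%d\n", excess(1, 0, 0)); // prints 1

        // "<blockquote>foo\nbar": scope holds BLOCKQUOTE_START and P, line scope
        // is empty; again the only candidate is the P, and it is kept
        printf("%d\n", excess(2, 1, 0)); // prints 1
        return 0;
    }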
455
+
456
+ #define INVALID_ENCODING(msg) do { if (dest_ptr) free(dest_ptr); rb_raise(eWikitextParserError, "invalid encoding: " msg); } while(0)
457
+
458
+ // convert a single UTF-8 codepoint to UTF-32
459
+ // expects an input buffer, src, containing a UTF-8 encoded character (which may be multi-byte)
460
+ // the end of the input buffer, end, is also passed in to allow the detection of invalidly truncated codepoints
461
+ // the number of bytes in the UTF-8 character (between 1 and 4) is returned by reference in width_out
462
+ // raises a RangeError if the supplied character is invalid UTF-8
463
+ // (in which case it also frees the block of memory indicated by dest_ptr if it is non-NULL)
464
+ inline uint32_t _Wikitext_utf8_to_utf32(char *src, char *end, long *width_out, void *dest_ptr)
465
+ {
466
+ uint32_t dest;
467
+ if ((unsigned char)src[0] <= 0x7f) // ASCII
468
+ {
469
+ dest = src[0];
470
+ *width_out = 1;
471
+ }
472
+ else if ((src[0] & 0xe0) == 0xc0) // byte starts with 110..... : this should be a two-byte sequence
473
+ {
474
+ if (src + 1 >= end)
475
+ INVALID_ENCODING("truncated byte sequence"); // no second byte
476
+ else if (((unsigned char)src[0] == 0xc0) || ((unsigned char)src[0] == 0xc1))
477
+ INVALID_ENCODING("overlong encoding"); // overlong encoding: lead byte of 110..... but code point <= 127
478
+ else if ((src[1] & 0xc0) != 0x80 )
479
+ INVALID_ENCODING("malformed byte sequence"); // should have second byte starting with 10......
480
+ dest = ((uint32_t)(src[0] & 0x1f)) << 6 | (src[1] & 0x3f);
481
+ *width_out = 2;
482
+ }
483
+ else if ((src[0] & 0xf0) == 0xe0) // byte starts with 1110.... : this should be a three-byte sequence
484
+ {
485
+ if (src + 2 >= end)
486
+ INVALID_ENCODING("truncated byte sequence"); // missing second or third byte
487
+ else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ))
488
+ INVALID_ENCODING("malformed byte sequence"); // should have second and third bytes starting with 10......
489
+ dest = ((uint32_t)(src[0] & 0x0f)) << 12 | ((uint32_t)(src[1] & 0x3f)) << 6 | (src[2] & 0x3f);
490
+ *width_out = 3;
491
+ }
492
+ else if ((src[0] & 0xf8) == 0xf0) // byte starts with 11110... : this should be a four-byte sequence
493
+ {
494
+ if (src + 3 >= end)
495
+ INVALID_ENCODING("truncated byte sequence"); // missing second, third, or fourth byte
496
+ else if ((unsigned char)src[0] >= 0xf5 && (unsigned char)src[0] <= 0xf7)
497
+ INVALID_ENCODING("overlong encoding"); // disallowed by RFC 3629 (codepoints above 0x10ffff)
498
+ else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ) || ((src[3] & 0xc0) != 0x80 ))
499
+ INVALID_ENCODING("malformed byte sequence"); // should have second and third bytes starting with 10......
500
+ dest = ((uint32_t)(src[0] & 0x07)) << 18 | ((uint32_t)(src[1] & 0x3f)) << 12 | ((uint32_t)(src[2] & 0x3f)) << 6 | (src[3] & 0x3f);
501
+ *width_out = 4;
502
+ }
503
+ else // invalid input
504
+ INVALID_ENCODING("unexpected byte");
505
+ return dest;
506
+ }
507
+
508
+ inline VALUE _Wikitext_utf32_char_to_entity(uint32_t character)
509
+ {
510
+ // TODO: consider special casing some entities (ie. quot, amp, lt, gt etc)?
511
+ char hex_string[8] = { '&', '#', 'x', 0, 0, 0, 0, ';' };
512
+ char scratch = (character & 0xf000) >> 12;
513
+ hex_string[3] = (scratch <= 9 ? scratch + 48 : scratch + 87);
514
+ scratch = (character & 0x0f00) >> 8;
515
+ hex_string[4] = (scratch <= 9 ? scratch + 48 : scratch + 87);
516
+ scratch = (character & 0x00f0) >> 4;
517
+ hex_string[5] = (scratch <= 9 ? scratch + 48 : scratch + 87);
518
+ scratch = character & 0x000f;
519
+ hex_string[6] = (scratch <= 9 ? scratch + 48 : scratch + 87);
520
+ return rb_str_new((const char *)hex_string, sizeof(hex_string));
521
+ }
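The two functions above decode a single UTF-8 character into a codepoint and re-emit it as a hexadecimal numeric character reference. A standalone illustration of the same bit arithmetic for the two-byte character U+00E9 ("é", bytes 0xC3 0xA9); the sample value is illustrative, not from the gem's tests:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned char src[] = { 0xc3, 0xa9 }; // "é" in UTF-8

        // two-byte sequence 110xxxxx 10yyyyyy -> codepoint 00000xxxxxyyyyyy
        uint32_t codepoint = ((uint32_t)(src[0] & 0x1f)) << 6 | (src[1] & 0x3f);
        printf("U+%04X\n", codepoint); // prints U+00E9

        // same nibble-to-hex conversion as _Wikitext_utf32_char_to_entity
        // (48 is '0', 87 is 'a' - 10)
        char entity[9] = { '&', '#', 'x', 0, 0, 0, 0, ';', '\0' };
        for (int shift = 12, i = 3; shift >= 0; shift -= 4, i++)
        {
            char nibble = (codepoint >> shift) & 0xf;
            entity[i] = nibble <= 9 ? nibble + 48 : nibble + 87;
        }
        printf("%s\n", entity); // prints &#x00e9;
        return 0;
    }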
522
+
523
+ inline VALUE _Wikitext_parser_trim_link_target(VALUE string)
524
+ {
525
+ string = StringValue(string);
526
+ char *src = RSTRING_PTR(string);
527
+ char *start = src; // remember this so we can check if we're at the start
528
+ char *left = src;
529
+ char *non_space = src; // remember last non-space character output
530
+ long len = RSTRING_LEN(string);
531
+ char *end = src + len;
532
+ while (src < end)
533
+ {
534
+ if (*src == ' ')
535
+ {
536
+ if (src == left)
537
+ left++;
538
+ }
539
+ else
540
+ non_space = src;
541
+ src++;
542
+ }
543
+ if (left == start && non_space + 1 == end)
544
+ return string;
545
+ else
546
+ return rb_str_new(left, (non_space + 1) - left);
547
+ }
548
+
549
+ // - non-printable (non-ASCII) characters converted to numeric entities
550
+ // - QUOT and AMP characters converted to named entities
551
+ // - leading and trailing whitespace trimmed if trim is Qtrue
552
+ inline VALUE _Wikitext_parser_sanitize_link_target(VALUE string, VALUE trim)
553
+ {
554
+ string = StringValue(string); // raises if string is nil or doesn't quack like a string
555
+ char *src = RSTRING_PTR(string);
556
+ char *start = src; // remember this so we can check if we're at the start
557
+ long len = RSTRING_LEN(string);
558
+ char *end = src + len;
559
+
560
+ // start with a destination buffer twice the size of the source, will realloc if necessary
561
+ // slop = (len / 8) * 8 (ie. one in every 8 characters can be converted into an entity, each entity requires 8 bytes)
562
+ // this efficiently handles the most common case (where the size of the buffer doesn't change much)
563
+ char *dest = ALLOC_N(char, len * 2);
564
+ char *dest_ptr = dest; // hang on to this so we can pass it to free() later
565
+ char *non_space = dest; // remember last non-space character output
566
+ while (src < end)
567
+ {
568
+ // need at most 8 characters (8 bytes) to display each character
569
+ if (dest + 8 > dest_ptr + len) // outgrowing buffer, must reallocate
570
+ {
571
+ char *old_dest = dest;
572
+ char *old_dest_ptr = dest_ptr;
573
+ len = len + (end - src) * 8; // allocate enough for worst case
574
+ dest = realloc(dest_ptr, len); // will never have to realloc more than once
575
+ if (dest == NULL)
576
+ {
577
+ // would have used reallocf, but this has to run on Linux too, not just Darwin
578
+ free(dest_ptr);
579
+ rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
580
+ }
581
+ dest_ptr = dest;
582
+ dest = dest_ptr + (old_dest - old_dest_ptr);
583
+ non_space = dest_ptr + (non_space - old_dest_ptr);
584
+ }
585
+
586
+ if (*src == '"') // QUOT
587
+ {
588
+ char quot_entity_literal[] = { '&', 'q', 'u', 'o', 't', ';' }; // no trailing NUL
589
+ memcpy(dest, quot_entity_literal, sizeof(quot_entity_literal));
590
+ dest += sizeof(quot_entity_literal);
591
+ }
592
+ else if (*src == '&') // AMP
593
+ {
594
+ char amp_entity_literal[] = { '&', 'a', 'm', 'p', ';' }; // no trailing NUL
595
+ memcpy(dest, amp_entity_literal, sizeof(amp_entity_literal));
596
+ dest += sizeof(amp_entity_literal);
597
+ }
598
+ else if (*src == '<') // LESS_THAN
599
+ {
600
+ free(dest_ptr);
601
+ rb_raise(rb_eRangeError, "invalid link text (\"<\" may not appear in link text)");
602
+ }
603
+ else if (*src == '>') // GREATER_THAN
604
+ {
605
+ free(dest_ptr);
606
+ rb_raise(rb_eRangeError, "invalid link text (\">\" may not appear in link text)");
607
+ }
608
+ else if (*src == ' ' && src == start && trim == Qtrue)
609
+ start++; // we eat leading space
610
+ else if (*src >= 0x20 && *src <= 0x7e) // printable ASCII
611
+ {
612
+ *dest = *src;
613
+ dest++;
614
+ }
615
+ else // all others: must convert to entities
616
+ {
617
+ long width;
618
+ VALUE entity = _Wikitext_utf32_char_to_entity(_Wikitext_utf8_to_utf32(src, end, &width, dest_ptr));
619
+ char *entity_src = RSTRING_PTR(entity);
620
+ long entity_len = RSTRING_LEN(entity); // should always be 8 characters (8 bytes)
621
+ memcpy(dest, entity_src, entity_len);
622
+ dest += entity_len;
623
+ src += width;
624
+ non_space = dest;
625
+ continue;
626
+ }
627
+ if (*src != ' ')
628
+ non_space = dest;
629
+ src++;
630
+ }
631
+
632
+ // trim trailing space if necessary
633
+ if (trim == Qtrue && non_space > dest_ptr && dest != non_space)
634
+ len = non_space - dest_ptr;
635
+ else
636
+ len = dest - dest_ptr;
637
+ VALUE out = rb_str_new(dest_ptr, len);
638
+ free(dest_ptr);
639
+ return out;
640
+ }
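In other words: printable ASCII passes through, double quotes and ampersands become named entities, other non-ASCII characters become hexadecimal numeric entities, "<" and ">" raise, and surrounding whitespace is dropped when trim is Qtrue. A few hypothetical before/after pairs illustrating those rules (not taken from the gem's spec suite):

    #include <stdio.h>

    int main(void)
    {
        // before / after pairs for the sanitization rules described above (with trimming)
        const char *pairs[][2] = {
            { "foo \"bar\"",     "foo &quot;bar&quot;" }, // QUOT -> named entity
            { "fish & chips",    "fish &amp; chips"    }, // AMP  -> named entity
            { "  caf\xc3\xa9  ", "caf&#x00e9;"         }, // trimmed; U+00E9 -> numeric entity
        };
        for (int i = 0; i < 3; i++)
            printf("%-18s -> %s\n", pairs[i][0], pairs[i][1]);
        return 0;
    }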
641
+
642
+ VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string)
643
+ {
644
+ return (_Wikitext_parser_sanitize_link_target(string, Qtrue));
645
+ }
646
+
647
+ // encodes the input string according to RFCs 2396 and 2718
648
+ // leading and trailing whitespace trimmed
649
+ // note that the first character of the target link is not case-sensitive
650
+ // (this is a recommended application-level constraint; it is not imposed at this level)
651
+ // this is to allow links like:
652
+ // ...the [[foo]] is...
653
+ // to be equivalent to:
654
+ // thing. [[Foo]] was...
655
+ // this is also where we check treat_slash_as_special is true and act accordingly
656
+ // basically any link target matching /\A[a-z]+\/\d+\z/ is flagged as special
657
+ inline static void _Wikitext_parser_encode_link_target(parser_t *parser)
658
+ {
659
+ VALUE in = StringValue(parser->link_target);
660
+ char *input = RSTRING_PTR(in);
661
+ char *start = input; // remember this so we can check if we're at the start
662
+ long len = RSTRING_LEN(in);
663
+ if (!(len > 0))
664
+ return;
665
+ char *end = input + len;
666
+ static char hex[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
667
+
668
+ // this potential shortcut requires an (admittedly cheap) prescan, so only do it when treat_slash_as_special is true
669
+ parser->special_link = Qfalse;
670
+ if (parser->treat_slash_as_special == Qtrue)
671
+ {
672
+ char *c = input; // \A
673
+ while (c < end && *c >= 'a' && *c <= 'z') // [a-z]
674
+ c++; // +
675
+ if (c > start && c < end && *c++ == '/') // \/
676
+ {
677
+ while (c < end && *c >= '0' && *c <= '9') // \d
678
+ {
679
+ c++; // +
680
+ if (c == end) // \z
681
+ {
682
+ // matches /\A[a-z]+\/\d+\z/ so no transformation required
683
+ parser->special_link = Qtrue;
684
+ return;
685
+ }
686
+ }
687
+ }
688
+ }
689
+
690
+ // to avoid most reallocations start with a destination buffer twice the size of the source
691
+ // this handles the most common case (where most chars are in the ASCII range and don't require more storage, but there are
692
+ // often quite a few spaces, which are encoded as "%20" and occupy 3 bytes)
693
+ // the worst case is where _every_ byte must be written out using 3 bytes
694
+ long dest_len = len * 2;
695
+ char *dest = ALLOC_N(char, dest_len);
696
+ char *dest_ptr = dest; // hang on to this so we can pass it to free() later
697
+ char *non_space = dest; // remember last non-space character output
698
+ for (; input < end; input++)
699
+ {
700
+ if ((dest + 3) > (dest_ptr + dest_len)) // worst case: a single character may grow to 3 characters once encoded
701
+ {
702
+ // outgrowing buffer, must reallocate
703
+ char *old_dest = dest;
704
+ char *old_dest_ptr = dest_ptr;
705
+ dest_len += len;
706
+ dest = realloc(dest_ptr, dest_len);
707
+ if (dest == NULL)
708
+ {
709
+ // would have used reallocf, but this has to run on Linux too, not just Darwin
710
+ free(dest_ptr);
711
+ rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
712
+ }
713
+ dest_ptr = dest;
714
+ dest = dest_ptr + (old_dest - old_dest_ptr);
715
+ non_space = dest_ptr + (non_space - old_dest_ptr);
716
+ }
717
+
718
+ // pass through unreserved characters
719
+ if (((*input >= 'a') && (*input <= 'z')) ||
720
+ ((*input >= 'A') && (*input <= 'Z')) ||
721
+ ((*input >= '0') && (*input <= '9')) ||
722
+ (*input == '-') ||
723
+ (*input == '_') ||
724
+ (*input == '.') ||
725
+ (*input == '~'))
726
+ {
727
+ *dest++ = *input;
728
+ non_space = dest;
729
+ }
730
+ else if (*input == ' ' && input == start)
731
+ start++; // we eat leading space
732
+ else // everything else gets URL-encoded
733
+ {
734
+ *dest++ = '%';
735
+ *dest++ = hex[(unsigned char)(*input) / 16]; // left
736
+ *dest++ = hex[(unsigned char)(*input) % 16]; // right
737
+ if (*input != ' ')
738
+ non_space = dest;
739
+ }
740
+ }
741
+
742
+ // trim trailing space if necessary
743
+ if (non_space > dest_ptr && dest - 1 != non_space)
744
+ dest_len = non_space - dest_ptr;
745
+ else
746
+ dest_len = dest - dest_ptr;
747
+ parser->link_target = rb_str_new(dest_ptr, dest_len);
748
+ free(dest_ptr);
749
+ }
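The comment above describes two behaviours: when treat_slash_as_special is enabled, targets matching /\A[a-z]+\/\d+\z/ are emitted untouched, and everything else is percent-encoded with surrounding spaces trimmed. A standalone sketch of that prescan and of the encoding of a single byte (the helper name and the sample targets are hypothetical):

    #include <stdio.h>

    // does the target match /\A[a-z]+\/\d+\z/ ? (same shape as the prescan above)
    static int is_special_link(const char *s)
    {
        const char *c = s;
        while (*c >= 'a' && *c <= 'z')
            c++;
        if (c == s || *c != '/') // need at least one letter, then a slash
            return 0;
        c++;
        if (*c < '0' || *c > '9') // need at least one digit after the slash
            return 0;
        while (*c >= '0' && *c <= '9')
            c++;
        return *c == '\0'; // and nothing after the digits
    }

    int main(void)
    {
        static const char hex[] = "0123456789abcdef";
        printf("%d\n", is_special_link("issue/400"));   // 1: passed through as-is
        printf("%d\n", is_special_link("hello world")); // 0: must be URL-encoded

        // a space is not an unreserved character, so it becomes "%20"
        unsigned char space = ' ';
        printf("%%%c%c\n", hex[space / 16], hex[space % 16]); // prints %20
        return 0;
    }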
750
+
751
+ VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in)
752
+ {
753
+ parser_t parser;
754
+ parser.link_target = in;
755
+ parser.treat_slash_as_special = Qfalse;
756
+ _Wikitext_parser_encode_link_target(&parser);
757
+ return parser.link_target;
758
+ }
759
+
760
+ // this method exposed for testing only
761
+ VALUE Wikitext_parser_encode_special_link_target(VALUE self, VALUE in)
762
+ {
763
+ parser_t parser;
764
+ parser.link_target = in;
765
+ parser.treat_slash_as_special = Qtrue;
766
+ _Wikitext_parser_encode_link_target(&parser);
767
+ return parser.link_target;
768
+ }
769
+
770
+ // not sure whether these rollback functions should be inline: could refactor them into a single non-inlined function
771
+ inline void _Wikitext_rollback_failed_link(parser_t *parser)
772
+ {
773
+ if (!IN(LINK_START))
774
+ return; // nothing to do!
775
+ int scope_includes_separator = IN(SEPARATOR);
776
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, LINK_START, Qtrue);
777
+ rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
778
+ if (!NIL_P(parser->link_target))
779
+ {
780
+ VALUE sanitized = _Wikitext_parser_sanitize_link_target(parser->link_target, Qfalse);
781
+ rb_str_append(parser->output, sanitized);
782
+ if (scope_includes_separator)
783
+ {
784
+ rb_str_cat(parser->output, separator, sizeof(separator) - 1);
785
+ if (!NIL_P(parser->link_text))
786
+ rb_str_append(parser->output, parser->link_text);
787
+ }
788
+ }
789
+ parser->capture = Qnil;
790
+ parser->link_target = Qnil;
791
+ parser->link_text = Qnil;
792
+ }
793
+
794
+ inline void _Wikitext_rollback_failed_external_link(parser_t *parser)
795
+ {
796
+ if (!IN(EXT_LINK_START))
797
+ return; // nothing to do!
798
+ int scope_includes_space = IN(SPACE);
799
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, EXT_LINK_START, Qtrue);
800
+ rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
801
+ if (!NIL_P(parser->link_target))
802
+ {
803
+ if (parser->autolink == Qtrue)
804
+ parser->link_target = _Wikitext_hyperlink(Qnil, parser->link_target, parser->link_target, parser->external_link_class);
805
+ rb_str_append(parser->output, parser->link_target);
806
+ if (scope_includes_space)
807
+ {
808
+ rb_str_cat(parser->output, space, sizeof(space) - 1);
809
+ if (!NIL_P(parser->link_text))
810
+ rb_str_append(parser->output, parser->link_text);
811
+ }
812
+ }
813
+ parser->capture = Qnil;
814
+ parser->link_target = Qnil;
815
+ parser->link_text = Qnil;
816
+ }
817
+
818
+ VALUE Wikitext_parser_initialize(VALUE self)
819
+ {
820
+ // no need to call super here (ie. no need for rb_call_super())
821
+ rb_iv_set(self, "@autolink", Qtrue);
822
+ rb_iv_set(self, "@line_ending", rb_str_new2("\n"));
823
+ rb_iv_set(self, "@external_link_class", rb_str_new2("external"));
824
+ rb_iv_set(self, "@mailto_class", rb_str_new2("mailto"));
825
+ rb_iv_set(self, "@internal_link_prefix", rb_str_new2("/wiki/"));
826
+ rb_iv_set(self, "@treat_slash_as_special", Qtrue);
827
+ return self;
828
+ }
829
+
830
+ VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string)
831
+ {
832
+ for (int i = 0; i < 100000; i++)
833
+ Wikitext_parser_parse(1, &string, self);
835
+ return Qnil;
+ }
835
+
836
+ VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self)
837
+ {
838
+ // process arguments
839
+ VALUE string, options;
840
+ if (rb_scan_args(argc, argv, "11", &string, &options) == 1) // 1 mandatory argument, 1 optional argument
841
+ options = Qnil;
842
+ if (NIL_P(string))
843
+ return Qnil;
844
+ string = StringValue(string);
845
+
846
+ // process options hash
847
+ int base_indent = 0;
848
+ VALUE indent = Qnil;
849
+ if (!NIL_P(options) && TYPE(options) == T_HASH)
850
+ {
851
+ indent = rb_hash_aref(options, ID2SYM(rb_intern("indent")));
852
+ if (!NIL_P(indent))
+ base_indent = NUM2INT(indent);
853
+ if (base_indent < 0)
854
+ base_indent = 0;
855
+ }
856
+
857
+ // set up scanner
858
+ char *p = RSTRING_PTR(string);
859
+ long len = RSTRING_LEN(string);
860
+ char *pe = p + len;
861
+
862
+ // access these once per parse
863
+ VALUE line_ending = rb_iv_get(self, "@line_ending");
864
+ line_ending = StringValue(line_ending);
865
+ VALUE link_class = rb_iv_get(self, "@external_link_class");
866
+ link_class = NIL_P(link_class) ? Qnil : StringValue(link_class);
867
+ VALUE mailto_class = rb_iv_get(self, "@mailto_class");
868
+ mailto_class = NIL_P(mailto_class) ? Qnil : StringValue(mailto_class);
869
+ VALUE prefix = rb_iv_get(self, "@internal_link_prefix");
870
+
871
+ // set up parser struct to make passing parameters a little easier
872
+ // eventually this will encapsulate most or all of the variables above
873
+ parser_t _parser;
874
+ parser_t *parser = &_parser;
875
+ parser->output = rb_str_new2("");
876
+ parser->capture = Qnil;
877
+ parser->link_target = Qnil;
878
+ parser->link_text = Qnil;
879
+ parser->external_link_class = link_class;
880
+ parser->scope = ary_new();
881
+ parser->line = ary_new();
882
+ parser->line_buffer = ary_new();
883
+ parser->pending_crlf = Qfalse;
884
+ parser->autolink = rb_iv_get(self, "@autolink");
885
+ parser->treat_slash_as_special = rb_iv_get(self, "@treat_slash_as_special");
886
+ parser->special_link = Qfalse;
887
+ parser->line_ending = str_new_from_string(line_ending);
888
+ parser->base_indent = base_indent;
889
+ parser->current_indent = 0;
890
+ parser->tabulation = NULL;
891
+
892
+ token_t _token;
893
+ _token.type = NO_TOKEN;
894
+ token_t *token = NULL;
895
+ do
896
+ {
897
+ // note that whenever we grab a token we push it into the line buffer
898
+ // this provides us with context-sensitive "memory" of what's been seen so far on this line
899
+ #define NEXT_TOKEN() token = &_token, next_token(token, token, NULL, pe), ary_push(parser->line_buffer, token->type)
900
+
901
+ // check to see if we have a token hanging around from a previous iteration of this loop
902
+ if (token == NULL)
903
+ {
904
+ if (_token.type == NO_TOKEN)
905
+ {
906
+ // first time here (haven't started scanning yet)
907
+ token = &_token;
908
+ next_token(token, NULL, p, pe);
909
+ ary_push(parser->line_buffer, token->type);
910
+ }
911
+ else
912
+ // already scanning
913
+ NEXT_TOKEN();
914
+ }
915
+ int type = token->type;
916
+
917
+ // many restrictions depend on what is at the top of the stack
918
+ int top = ary_entry(parser->scope, -1);
919
+
920
+ // can't declare new variables inside a switch statement, so predeclare them here
921
+ long remove_strong = -1;
922
+ long remove_em = -1;
923
+
924
+ // general purpose counters and flags
925
+ long i = 0;
926
+ long j = 0;
927
+ long k = 0;
928
+
929
+ // The following giant switch statement contains cases for all the possible token types.
930
+ // In the most basic sense we are emitting the HTML that corresponds to each token,
931
+ // but some tokens require context information in order to decide what to output.
932
+ // For example, does the STRONG token (''') translate to <strong> or </strong>?
933
+ // So when looking at any given token we have three state-maintaining variables which give us a notion of "where we are":
934
+ //
935
+ // - the "scope" stack (indicates what HTML DOM structures we are currently nested inside, similar to a CSS selector)
936
+ // - the line buffer (records tokens seen so far on the current line)
937
+ // - the line "scope" stack (indicates what the scope should be based only on what is visible on the line so far)
938
+ //
939
+ // Although this is fairly complicated, there is one key simplifying factor:
940
+ // The translator continuously performs auto-correction, and this means that we always have a guarantee that the
941
+ // scope stack (up to the current token) is valid; our translator can take this as a given.
942
+ // Auto-correction basically consists of inserting missing tokens (preventing subsequent HTML from being messed up),
943
+ // or converting illegal (unexpected) tokens to their plain text equivalents (providing visual feedback to Wikitext author).
944
+ switch (type)
945
+ {
946
+ case PRE:
947
+ if (IN(NO_WIKI_START) || IN(PRE_START))
948
+ {
949
+ rb_str_cat(parser->output, space, sizeof(space) - 1);
950
+ break;
951
+ }
952
+ else if (IN(BLOCKQUOTE_START))
953
+ {
954
+ // this kind of nesting not allowed (to avoid user confusion)
955
+ _Wikitext_pop_excess_elements(parser);
956
+ _Wikitext_start_para_if_necessary(parser);
957
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
958
+ rb_str_cat(i, space, sizeof(space) - 1);
959
+ break;
960
+ }
961
+
962
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
963
+ ary_push(parser->line, PRE);
964
+ i = ary_count(parser->line, BLOCKQUOTE);
965
+ j = ary_count(parser->scope, BLOCKQUOTE);
966
+ if (i < j)
967
+ {
968
+ // must pop (reduce nesting level)
969
+ for (i = j - i; i > 0; i--)
970
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
971
+ }
972
+
973
+ if (!IN(PRE))
974
+ {
975
+ parser->pending_crlf = Qfalse;
976
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
977
+ _Wikitext_indent(parser);
978
+ rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
979
+ ary_push(parser->scope, PRE);
980
+ }
981
+ break;
982
+
983
+ case PRE_START:
984
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
985
+ rb_str_cat(parser->output, escaped_pre_start, sizeof(escaped_pre_start) - 1);
986
+ else if (IN(BLOCKQUOTE_START))
987
+ {
988
+ _Wikitext_rollback_failed_link(parser); // if any
989
+ _Wikitext_rollback_failed_external_link(parser); // if any
990
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
991
+ _Wikitext_indent(parser);
992
+ rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
993
+ ary_push(parser->scope, PRE_START);
994
+ ary_push(parser->line, PRE_START);
995
+ }
996
+ else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
997
+ {
998
+ // would be nice to eliminate the repetition here but it's probably the clearest way
999
+ _Wikitext_rollback_failed_link(parser); // if any
1000
+ _Wikitext_rollback_failed_external_link(parser); // if any
1001
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
1002
+ _Wikitext_indent(parser);
1003
+ rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
1004
+ ary_push(parser->scope, PRE_START);
1005
+ ary_push(parser->line, PRE_START);
1006
+ }
1007
+ else
1008
+ {
1009
+ // everywhere else, PRE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
1010
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1011
+ _Wikitext_pop_excess_elements(parser);
1012
+ _Wikitext_start_para_if_necessary(parser);
1013
+ rb_str_cat(i, escaped_pre_start, sizeof(escaped_pre_start) - 1);
1014
+ }
1015
+ break;
1016
+
1017
+ case PRE_END:
1018
+ if (IN(NO_WIKI_START) || IN(PRE))
1019
+ rb_str_cat(parser->output, escaped_pre_end, sizeof(escaped_pre_end) - 1);
1020
+ else
1021
+ {
1022
+ if (IN(PRE_START))
1023
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, PRE_START, Qtrue);
1024
+ else
1025
+ {
1026
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1027
+ _Wikitext_pop_excess_elements(parser);
1028
+ _Wikitext_start_para_if_necessary(parser);
1029
+ rb_str_cat(i, escaped_pre_end, sizeof(escaped_pre_end) - 1);
1030
+ }
1031
+ }
1032
+ break;
1033
+
1034
+ case BLOCKQUOTE:
1035
+ if (IN(NO_WIKI_START) || IN(PRE_START))
1036
+ // no need to check for <pre>; can never appear inside it
1037
+ rb_str_cat(parser->output, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit "&gt;" or "&gt; "
1038
+ else if (IN(BLOCKQUOTE_START))
1039
+ {
1040
+ // this kind of nesting not allowed (to avoid user confusion)
1041
+ _Wikitext_pop_excess_elements(parser);
1042
+ _Wikitext_start_para_if_necessary(parser);
1043
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1044
+ rb_str_cat(i, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit "&gt;" or "&gt; "
1045
+ break;
1046
+ }
1047
+ else
1048
+ {
1049
+ ary_push(parser->line, BLOCKQUOTE);
1050
+
1051
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1052
+ i = ary_count(parser->line, BLOCKQUOTE);
1053
+ j = ary_count(parser->scope, BLOCKQUOTE);
1054
+
1055
+ // given that BLOCKQUOTE tokens can be nested, peek ahead and see if there are any more which might affect the decision to push or pop
1056
+ while (NEXT_TOKEN(), (token->type == BLOCKQUOTE))
1057
+ {
1058
+ ary_push(parser->line, BLOCKQUOTE);
1059
+ i++;
1060
+ }
1061
+
1062
+ // now decide whether to push, pop or do nothing
1063
+ if (i > j)
1064
+ {
1065
+ // must push (increase nesting level)
1066
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
1067
+ for (i = i - j; i > 0; i--)
1068
+ {
1069
+ _Wikitext_indent(parser);
1070
+ rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1071
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1072
+ ary_push(parser->scope, BLOCKQUOTE);
1073
+ }
1074
+ }
1075
+ else if (i < j)
1076
+ {
1077
+ // must pop (reduce nesting level)
1078
+ for (i = j - i; i > 0; i--)
1079
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
1080
+ }
1081
+
1082
+ // jump to top of the loop to process token we scanned during lookahead
1083
+ continue;
1084
+ }
1085
+ break;
1086
+
1087
+ case BLOCKQUOTE_START:
1088
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1089
+ rb_str_cat(parser->output, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
1090
+ else if (IN(BLOCKQUOTE_START))
1091
+ {
1092
+ // nesting is fine here
1093
+ _Wikitext_rollback_failed_link(parser); // if any
1094
+ _Wikitext_rollback_failed_external_link(parser); // if any
1095
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
1096
+ _Wikitext_indent(parser);
1097
+ rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1098
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1099
+ ary_push(parser->scope, BLOCKQUOTE_START);
1100
+ ary_push(parser->line, BLOCKQUOTE_START);
1101
+ }
1102
+ else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
1103
+ {
1104
+ // would be nice to eliminate the repetition here but it's probably the clearest way
1105
+ _Wikitext_rollback_failed_link(parser); // if any
1106
+ _Wikitext_rollback_failed_external_link(parser); // if any
1107
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
1108
+ _Wikitext_indent(parser);
1109
+ rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1110
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1111
+ ary_push(parser->scope, BLOCKQUOTE_START);
1112
+ ary_push(parser->line, BLOCKQUOTE_START);
1113
+ }
1114
+ else
1115
+ {
1116
+ // everywhere else, BLOCKQUOTE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
1117
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1118
+ _Wikitext_pop_excess_elements(parser);
1119
+ _Wikitext_start_para_if_necessary(parser);
1120
+ rb_str_cat(i, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
1121
+ }
1122
+ break;
1123
+
1124
+ case BLOCKQUOTE_END:
1125
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1126
+ rb_str_cat(parser->output, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
1127
+ else
1128
+ {
1129
+ if (IN(BLOCKQUOTE_START))
1130
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, BLOCKQUOTE_START, Qtrue);
1131
+ else
1132
+ {
1133
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1134
+ _Wikitext_pop_excess_elements(parser);
1135
+ _Wikitext_start_para_if_necessary(parser);
1136
+ rb_str_cat(i, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
1137
+ }
1138
+ }
1139
+ break;
1140
+
1141
+ case NO_WIKI_START:
1142
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1143
+ rb_str_cat(parser->output, escaped_no_wiki_start, sizeof(escaped_no_wiki_start) - 1);
1144
+ else
1145
+ {
1146
+ _Wikitext_pop_excess_elements(parser);
1147
+ _Wikitext_start_para_if_necessary(parser);
1148
+ ary_push(parser->scope, NO_WIKI_START);
1149
+ ary_push(parser->line, NO_WIKI_START);
1150
+ }
1151
+ break;
1152
+
1153
+ case NO_WIKI_END:
1154
+ if (IN(NO_WIKI_START))
1155
+ // <nowiki> should always only ever be the last item in the stack, but use the helper routine just in case
1156
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, NO_WIKI_START, Qtrue);
1157
+ else
1158
+ {
1159
+ _Wikitext_pop_excess_elements(parser);
1160
+ _Wikitext_start_para_if_necessary(parser);
1161
+ rb_str_cat(parser->output, escaped_no_wiki_end, sizeof(escaped_no_wiki_end) - 1);
1162
+ }
1163
+ break;
1164
+
1165
+ case STRONG_EM:
1166
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1167
+ {
1168
+ rb_str_cat(parser->output, literal_strong_em, sizeof(literal_strong_em) - 1);
1169
+ break;
1170
+ }
1171
+
1172
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1173
+ _Wikitext_pop_excess_elements(parser);
1174
+
1175
+ // if you've seen STRONG/STRONG_START or EM/EM_START, must close them in the reverse order that you saw them!
1176
+ // otherwise, must open them
1177
+ remove_strong = -1;
1178
+ remove_em = -1;
1179
+ j = parser->scope->count;
1180
+ for (j = j - 1; j >= 0; j--)
1181
+ {
1182
+ int val = ary_entry(parser->scope, j);
1183
+ if (val == STRONG || val == STRONG_START)
1184
+ {
1185
+ rb_str_cat(i, strong_end, sizeof(strong_end) - 1);
1186
+ remove_strong = j;
1187
+ }
1188
+ else if (val == EM || val == EM_START)
1189
+ {
1190
+ rb_str_cat(i, em_end, sizeof(em_end) - 1);
1191
+ remove_em = j;
1192
+ }
1193
+ }
1194
+
1195
+ if (remove_strong > remove_em) // must remove strong first
1196
+ {
1197
+ ary_pop(parser->scope);
1198
+ if (remove_em > -1)
1199
+ ary_pop(parser->scope);
1200
+ else // there was no em to remove!, so consider this an opening em tag
1201
+ {
1202
+ rb_str_cat(i, em_start, sizeof(em_start) - 1);
1203
+ ary_push(parser->scope, EM);
1204
+ ary_push(parser->line, EM);
1205
+ }
1206
+ }
1207
+ else if (remove_em > remove_strong) // must remove em first
1208
+ {
1209
+ ary_pop(parser->scope);
1210
+ if (remove_strong > -1)
1211
+ ary_pop(parser->scope);
1212
+ else // there was no strong to remove!, so consider this an opening strong tag
1213
+ {
1214
+ rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
1215
+ ary_push(parser->scope, STRONG);
1216
+ ary_push(parser->line, STRONG);
1217
+ }
1218
+ }
1219
+ else // no strong or em to remove, so this must be a new opening of both
1220
+ {
1221
+ _Wikitext_start_para_if_necessary(parser);
1222
+ rb_str_cat(i, strong_em_start, sizeof(strong_em_start) - 1);
1223
+ ary_push(parser->scope, STRONG);
1224
+ ary_push(parser->line, STRONG);
1225
+ ary_push(parser->scope, EM);
1226
+ ary_push(parser->line, EM);
1227
+ }
1228
+ break;
1229
+
1230
+ case STRONG:
1231
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1232
+ rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
1233
+ else
1234
+ {
1235
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1236
+ if (IN(STRONG_START))
1237
+ // already in span started with <strong>, no choice but to emit this literally
1238
+ rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
1239
+ else if (IN(STRONG))
1240
+ // STRONG already seen, this is a closing tag
1241
+ _Wikitext_pop_from_stack_up_to(parser, i, STRONG, Qtrue);
1242
+ else
1243
+ {
1244
+ // this is a new opening
1245
+ _Wikitext_pop_excess_elements(parser);
1246
+ _Wikitext_start_para_if_necessary(parser);
1247
+ rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
1248
+ ary_push(parser->scope, STRONG);
1249
+ ary_push(parser->line, STRONG);
1250
+ }
1251
+ }
1252
+ break;
1253
+
1254
+ case STRONG_START:
1255
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1256
+ rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
1257
+ else
1258
+ {
1259
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1260
+ if (IN(STRONG_START) || IN(STRONG))
1261
+ rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
1262
+ else
1263
+ {
1264
+ _Wikitext_pop_excess_elements(parser);
1265
+ _Wikitext_start_para_if_necessary(parser);
1266
+ rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
1267
+ ary_push(parser->scope, STRONG_START);
1268
+ ary_push(parser->line, STRONG_START);
1269
+ }
1270
+ }
1271
+ break;
1272
+
1273
+ case STRONG_END:
1274
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1275
+ rb_str_cat(parser->output, escaped_strong_end, sizeof(escaped_strong_end) - 1);
1276
+ else
1277
+ {
1278
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1279
+ if (IN(STRONG_START))
1280
+ _Wikitext_pop_from_stack_up_to(parser, i, STRONG_START, Qtrue);
1281
+ else
1282
+ {
1283
+ // no STRONG_START in scope, so must interpret the STRONG_END without any special meaning
1284
+ _Wikitext_pop_excess_elements(parser);
1285
+ _Wikitext_start_para_if_necessary(parser);
1286
+ rb_str_cat(i, escaped_strong_end, sizeof(escaped_strong_end) - 1);
1287
+ }
1288
+ }
1289
+ break;
1290
+
1291
+ case EM:
1292
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1293
+ rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
1294
+ else
1295
+ {
1296
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1297
+ if (IN(EM_START))
1298
+ // already in span started with <em>, no choice but to emit this literally
1299
+ rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
1300
+ else if (IN(EM))
1301
+ // EM already seen, this is a closing tag
1302
+ _Wikitext_pop_from_stack_up_to(parser, i, EM, Qtrue);
1303
+ else
1304
+ {
1305
+ // this is a new opening
1306
+ _Wikitext_pop_excess_elements(parser);
1307
+ _Wikitext_start_para_if_necessary(parser);
1308
+ rb_str_cat(i, em_start, sizeof(em_start) - 1);
1309
+ ary_push(parser->scope, EM);
1310
+ ary_push(parser->line, EM);
1311
+ }
1312
+ }
1313
+ break;
1314
+
1315
+ case EM_START:
1316
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1317
+ rb_str_cat(parser->output, escaped_em_start, sizeof(escaped_em_start) - 1);
1318
+ else
1319
+ {
1320
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1321
+ if (IN(EM_START) || IN(EM))
1322
+ rb_str_cat(i, escaped_em_start, sizeof(escaped_em_start) - 1);
1323
+ else
1324
+ {
1325
+ _Wikitext_pop_excess_elements(parser);
1326
+ _Wikitext_start_para_if_necessary(parser);
1327
+ rb_str_cat(i, em_start, sizeof(em_start) - 1);
1328
+ ary_push(parser->scope, EM_START);
1329
+ ary_push(parser->line, EM_START);
1330
+ }
1331
+ }
1332
+ break;
1333
+
1334
+ case EM_END:
1335
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1336
+ rb_str_cat(parser->output, escaped_em_end, sizeof(escaped_em_end) - 1);
1337
+ else
1338
+ {
1339
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1340
+ if (IN(EM_START))
1341
+ _Wikitext_pop_from_stack_up_to(parser, i, EM_START, Qtrue);
1342
+ else
1343
+ {
1344
+ // no EM_START in scope, so must interpret the EM_END without any special meaning
1345
+ _Wikitext_pop_excess_elements(parser);
1346
+ _Wikitext_start_para_if_necessary(parser);
1347
+ rb_str_cat(i, escaped_em_end, sizeof(escaped_em_end) - 1);
1348
+ }
1349
+ }
1350
+ break;
1351
+
1352
+ case TT:
1353
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1354
+ rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
1355
+ else
1356
+ {
1357
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1358
+ if (IN(TT_START))
1359
+ // already in span started with <tt>, no choice but to emit this literally
1360
+ rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
1361
+ else if (IN(TT))
1362
+ // TT (`) already seen, this is a closing tag
1363
+ _Wikitext_pop_from_stack_up_to(parser, i, TT, Qtrue);
1364
+ else
1365
+ {
1366
+ // this is a new opening
1367
+ _Wikitext_pop_excess_elements(parser);
1368
+ _Wikitext_start_para_if_necessary(parser);
1369
+ rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
1370
+ ary_push(parser->scope, TT);
1371
+ ary_push(parser->line, TT);
1372
+ }
1373
+ }
1374
+ break;
1375
+
1376
+ case TT_START:
1377
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1378
+ rb_str_cat(parser->output, escaped_tt_start, sizeof(escaped_tt_start) - 1);
1379
+ else
1380
+ {
1381
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1382
+ if (IN(TT_START) || IN(TT))
1383
+ rb_str_cat(i, escaped_tt_start, sizeof(escaped_tt_start) - 1);
1384
+ else
1385
+ {
1386
+ _Wikitext_pop_excess_elements(parser);
1387
+ _Wikitext_start_para_if_necessary(parser);
1388
+ rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
1389
+ ary_push(parser->scope, TT_START);
1390
+ ary_push(parser->line, TT_START);
1391
+ }
1392
+ }
1393
+ break;
1394
+
1395
+ case TT_END:
1396
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1397
+ rb_str_cat(parser->output, escaped_tt_end, sizeof(escaped_tt_end) - 1);
1398
+ else
1399
+ {
1400
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1401
+ if (IN(TT_START))
1402
+ _Wikitext_pop_from_stack_up_to(parser, i, TT_START, Qtrue);
1403
+ else
1404
+ {
1405
+ // no TT_START in scope, so must interpret the TT_END without any special meaning
1406
+ _Wikitext_pop_excess_elements(parser);
1407
+ _Wikitext_start_para_if_necessary(parser);
1408
+ rb_str_cat(i, escaped_tt_end, sizeof(escaped_tt_end) - 1);
1409
+ }
1410
+ }
1411
+ break;
1412
+
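// Editor's note: an illustrative sketch, not part of the diffed package source.
// The STRONG, EM and TT cases above all follow one pattern: if the marker is
// already on the scope stack, the token closes the span; otherwise it opens a
// new one. Reduced to a self-contained helper over a plain int stack (the
// helper name is hypothetical):
static int span_already_open(const int *scope, int count, int type)
{
    for (int i = 0; i < count; i++)
        if (scope[i] == type)
            return 1;   // marker seen before: this occurrence closes the span
    return 0;           // not seen yet: this occurrence opens it (<strong>, <em> or <tt>)
}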
1413
+ case OL:
1414
+ case UL:
1415
+ if (IN(NO_WIKI_START) || IN(PRE_START))
1416
+ {
1417
+ // no need to check for PRE; can never appear inside it
1418
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1419
+ break;
1420
+ }
1421
+
1422
+ // count number of tokens in line and scope stacks
1423
+ int bq_count = ary_count(parser->scope, BLOCKQUOTE_START);
1424
+ i = parser->line->count - ary_count(parser->line, BLOCKQUOTE_START);
1425
+ j = parser->scope->count - bq_count;
1426
+ k = i;
1427
+
1428
+ // list tokens can be nested so look ahead for any more which might affect the decision to push or pop
1429
+ for (;;)
1430
+ {
1431
+ type = token->type;
1432
+ if (type == OL || type == UL)
1433
+ {
1434
+ token = NULL;
1435
+ if (i - k >= 2) // already seen at least one OL or UL
1436
+ {
1437
+ ary_push(parser->line, NESTED_LIST); // which means this is a nested list
1438
+ i += 3;
1439
+ }
1440
+ else
1441
+ i += 2;
1442
+ ary_push(parser->line, type);
1443
+ ary_push(parser->line, LI);
1444
+
1445
+ // want to compare line with scope but can only do so if scope has enough items on it
1446
+ if (j >= i)
1447
+ {
1448
+ if (ary_entry(parser->scope, i + bq_count - 2) == type && ary_entry(parser->scope, i + bq_count - 1) == LI)
1449
+ {
1450
+ // line and scope match at this point: do nothing yet
1451
+ }
1452
+ else
1453
+ {
1454
+ // item just pushed onto line does not match corresponding slot of scope!
1455
+ for (; j >= i - 2; j--)
1456
+ // must pop back before emitting
1457
+ _Wikitext_pop_from_stack(parser, Qnil);
1458
+
1459
+ // will emit UL or OL, then LI
1460
+ break;
1461
+ }
1462
+ }
1463
+ else // line stack size now exceeds scope stack size: must increase nesting level
1464
+ break; // will emit UL or OL, then LI
1465
+ }
1466
+ else
1467
+ {
1468
+ // not an OL or UL token!
1469
+ if (j == i)
1470
+ // must close existing LI and re-open new one
1471
+ _Wikitext_pop_from_stack(parser, Qnil);
1472
+ else if (j > i)
1473
+ {
1474
+ // item just pushed onto line does not match corresponding slot of scope!
1475
+ for (; j >= i; j--)
1476
+ // must pop back before emitting
1477
+ _Wikitext_pop_from_stack(parser, Qnil);
1478
+ }
1479
+ break;
1480
+ }
1481
+ NEXT_TOKEN();
1482
+ }
1483
+
1484
+ // will emit
1485
+ if (type == OL || type == UL)
1486
+ {
1487
+ // if LI is at the top of the scope stack, this is the start of a nested list
1488
+ if (j > 0 && ary_entry(parser->scope, -1) == LI)
1489
+ {
1490
+ // so we should precede it with a CRLF, and indicate that it's a nested list
1491
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1492
+ ary_push(parser->scope, NESTED_LIST);
1493
+ }
1494
+ else
1495
+ {
1496
+ // this is a new list
1497
+ if (IN(BLOCKQUOTE_START))
1498
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
1499
+ else
1500
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
1501
+ }
1502
+
1503
+ // emit
1504
+ _Wikitext_indent(parser);
1505
+ if (type == OL)
1506
+ rb_str_cat(parser->output, ol_start, sizeof(ol_start) - 1);
1507
+ else if (type == UL)
1508
+ rb_str_cat(parser->output, ul_start, sizeof(ul_start) - 1);
1509
+ ary_push(parser->scope, type);
1510
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1511
+ }
1512
+ else if (type == SPACE)
1513
+ // silently throw away the optional SPACE token after final list marker
1514
+ token = NULL;
1515
+
1516
+ _Wikitext_indent(parser);
1517
+ rb_str_cat(parser->output, li_start, sizeof(li_start) - 1);
1518
+ ary_push(parser->scope, LI);
1519
+
1520
+ // any subsequent UL or OL tokens on this line are syntax errors and must be emitted literally
1521
+ if (type == OL || type == UL)
1522
+ {
1523
+ k = 0;
1524
+ while (k++, NEXT_TOKEN(), (type = token->type))
1525
+ {
1526
+ if (type == OL || type == UL)
1527
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1528
+ else if (type == SPACE && k == 1)
1529
+ {
1530
+ // silently throw away the optional SPACE token after final list marker
1531
+ token = NULL;
1532
+ break;
1533
+ }
1534
+ else
1535
+ break;
1536
+ }
1537
+ }
1538
+
1539
+ // jump to top of the loop to process token we scanned during lookahead
1540
+ continue;
1541
+
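// Editor's note: an illustrative sketch, not part of the diffed package source.
// The OL/UL case above decides whether to push or pop by comparing the run of
// list tokens on the current line with what is already on the scope stack.
// Assuming MediaWiki-style markers ('#' for ordered, '*' for unordered), the
// nesting depth of a raw line is just the length of its leading marker run:
static int list_marker_depth(const char *line)
{
    int depth = 0;
    while (line[depth] == '#' || line[depth] == '*')
        depth++;
    return depth;   // e.g. "## item" -> 2: an <ol> nested inside an <ol>
}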
1542
+ case H6_START:
1543
+ case H5_START:
1544
+ case H4_START:
1545
+ case H3_START:
1546
+ case H2_START:
1547
+ case H1_START:
1548
+ if (IN(NO_WIKI_START) || IN(PRE_START))
1549
+ {
1550
+ // no need to check for PRE; can never appear inside it
1551
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1552
+ break;
1553
+ }
1554
+
1555
+ // pop up to but not including the last BLOCKQUOTE on the scope stack
1556
+ if (IN(BLOCKQUOTE_START))
1557
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
1558
+ else
1559
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
1560
+
1561
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1562
+ ary_push(parser->line, type);
1563
+ i = ary_count(parser->line, BLOCKQUOTE);
1564
+ j = ary_count(parser->scope, BLOCKQUOTE);
1565
+
1566
+ // decide whether we need to pop off excess BLOCKQUOTE tokens (will never need to push; that is handled above in the BLOCKQUOTE case itself)
1567
+ if (i < j)
1568
+ {
1569
+ // must pop (reduce nesting level)
1570
+ for (i = j - i; i > 0; i--)
1571
+ _Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
1572
+ }
1573
+
1574
+ // discard any whitespace here (so that "== foo ==" will be translated to "<h2>foo</h2>" rather than "<h2> foo </h2>")
1575
+ while (NEXT_TOKEN(), (token->type == SPACE))
1576
+ ; // discard
1577
+
1578
+ ary_push(parser->scope, type);
1579
+ _Wikitext_indent(parser);
1580
+
1581
+ // rather than repeat all that code for each kind of heading, share it and use a conditional here
1582
+ if (type == H6_START)
1583
+ rb_str_cat(parser->output, h6_start, sizeof(h6_start) - 1);
1584
+ else if (type == H5_START)
1585
+ rb_str_cat(parser->output, h5_start, sizeof(h5_start) - 1);
1586
+ else if (type == H4_START)
1587
+ rb_str_cat(parser->output, h4_start, sizeof(h4_start) - 1);
1588
+ else if (type == H3_START)
1589
+ rb_str_cat(parser->output, h3_start, sizeof(h3_start) - 1);
1590
+ else if (type == H2_START)
1591
+ rb_str_cat(parser->output, h2_start, sizeof(h2_start) - 1);
1592
+ else if (type == H1_START)
1593
+ rb_str_cat(parser->output, h1_start, sizeof(h1_start) - 1);
1594
+
1595
+ // jump to top of the loop to process token we scanned during lookahead
1596
+ continue;
1597
+
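// Editor's note: an illustrative sketch, not part of the diffed package source.
// The heading cases above (together with the SPACE lookahead further down)
// implement the usual wikitext rule that "== foo ==" becomes "<h2>foo</h2>"
// with the surrounding whitespace discarded. The heading level is simply the
// length of the leading '=' run, capped at six:
static int heading_level(const char *line)
{
    int level = 0;
    while (line[level] == '=' && level < 6)
        level++;
    return level;   // "== foo ==" -> 2, "====== bar ======" -> 6
}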
1598
+ case H6_END:
1599
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1600
+ rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
1601
+ else
1602
+ {
1603
+ _Wikitext_rollback_failed_external_link(parser); // if any
1604
+ if (!IN(H6_START))
1605
+ {
1606
+ // literal output only if not in h6 scope (we stay silent in that case)
1607
+ _Wikitext_start_para_if_necessary(parser);
1608
+ rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
1609
+ }
1610
+ }
1611
+ break;
1612
+
1613
+ case H5_END:
1614
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1615
+ rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
1616
+ else
1617
+ {
1618
+ _Wikitext_rollback_failed_external_link(parser); // if any
1619
+ if (!IN(H5_START))
1620
+ {
1621
+ // literal output only if not in h5 scope (we stay silent in that case)
1622
+ _Wikitext_start_para_if_necessary(parser);
1623
+ rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
1624
+ }
1625
+ }
1626
+ break;
1627
+
1628
+ case H4_END:
1629
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1630
+ rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
1631
+ else
1632
+ {
1633
+ _Wikitext_rollback_failed_external_link(parser); // if any
1634
+ if (!IN(H4_START))
1635
+ {
1636
+ // literal output only if not in h4 scope (we stay silent in that case)
1637
+ _Wikitext_start_para_if_necessary(parser);
1638
+ rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
1639
+ }
1640
+ }
1641
+ break;
1642
+
1643
+ case H3_END:
1644
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1645
+ rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
1646
+ else
1647
+ {
1648
+ _Wikitext_rollback_failed_external_link(parser); // if any
1649
+ if (!IN(H3_START))
1650
+ {
1651
+ // literal output only if not in h3 scope (we stay silent in that case)
1652
+ _Wikitext_start_para_if_necessary(parser);
1653
+ rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
1654
+ }
1655
+ }
1656
+ break;
1657
+
1658
+ case H2_END:
1659
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1660
+ rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
1661
+ else
1662
+ {
1663
+ _Wikitext_rollback_failed_external_link(parser); // if any
1664
+ if (!IN(H2_START))
1665
+ {
1666
+ // literal output only if not in h2 scope (we stay silent in that case)
1667
+ _Wikitext_start_para_if_necessary(parser);
1668
+ rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
1669
+ }
1670
+ }
1671
+ break;
1672
+
1673
+ case H1_END:
1674
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1675
+ rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
1676
+ else
1677
+ {
1678
+ _Wikitext_rollback_failed_external_link(parser); // if any
1679
+ if (!IN(H1_START))
1680
+ {
1681
+ // literal output only if not in h1 scope (we stay silent in that case)
1682
+ _Wikitext_start_para_if_necessary(parser);
1683
+ rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
1684
+ }
1685
+ }
1686
+ break;
1687
+
1688
+ case MAIL:
1689
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1690
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1691
+ else
1692
+ {
1693
+ // in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
1694
+ _Wikitext_pop_excess_elements(parser);
1695
+ _Wikitext_start_para_if_necessary(parser);
1696
+ i = TOKEN_TEXT(token);
1697
+ if (parser->autolink == Qtrue)
1698
+ i = _Wikitext_hyperlink(rb_str_new2("mailto:"), i, i, mailto_class);
1699
+ rb_str_append(parser->output, i);
1700
+ }
1701
+ break;
1702
+
1703
+ case URI:
1704
+ if (IN(NO_WIKI_START))
1705
+ // user can temporarily suppress autolinking by using <nowiki></nowiki>
1706
+ // note that unlike MediaWiki, we do allow autolinking inside PRE blocks
1707
+ rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
1708
+ else if (IN(LINK_START))
1709
+ {
1710
+ // if the URI were allowed it would have been handled already in LINK_START
1711
+ _Wikitext_rollback_failed_link(parser);
1712
+ i = TOKEN_TEXT(token);
1713
+ if (parser->autolink == Qtrue)
1714
+ i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
1715
+ rb_str_append(parser->output, i);
1716
+ }
1717
+ else if (IN(EXT_LINK_START))
1718
+ {
1719
+ if (NIL_P(parser->link_target))
1720
+ {
1721
+ // this must be our link target: look ahead to make sure the space we're expecting actually follows
1722
+ i = TOKEN_TEXT(token);
1723
+ NEXT_TOKEN();
1724
+ if (token->type == SPACE)
1725
+ {
1726
+ ary_push(parser->scope, SPACE);
1727
+ parser->link_target = i;
1728
+ parser->link_text = rb_str_new2("");
1729
+ parser->capture = parser->link_text;
1730
+ token = NULL; // silently consume space
1731
+ }
1732
+ else
1733
+ {
1734
+ // didn't see the space! this must be an error
1735
+ _Wikitext_pop_from_stack(parser, Qnil);
1736
+ _Wikitext_pop_excess_elements(parser);
1737
+ _Wikitext_start_para_if_necessary(parser);
1738
+ rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
1739
+ if (parser->autolink == Qtrue)
1740
+ i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
1741
+ rb_str_append(parser->output, i);
1742
+ }
1743
+ }
1744
+ else
1745
+ {
1746
+ if (NIL_P(parser->link_text))
1747
+ // this must be the first part of our link text
1748
+ parser->link_text = TOKEN_TEXT(token);
1749
+ else
1750
+ // add to existing link text
1751
+ rb_str_cat(parser->link_text, token->start, TOKEN_LEN(token));
1752
+ }
1753
+ }
1754
+ else
1755
+ {
1756
+ // in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
1757
+ _Wikitext_pop_excess_elements(parser);
1758
+ _Wikitext_start_para_if_necessary(parser);
1759
+ i = TOKEN_TEXT(token);
1760
+ if (parser->autolink == Qtrue)
1761
+ i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
1762
+ rb_str_append(parser->output, i);
1763
+ }
1764
+ break;
1765
+
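// Editor's note: an illustrative sketch, not part of the diffed package source.
// With autolink enabled, the MAIL and URI cases above wrap the matched text in
// an anchor via _Wikitext_hyperlink (defined elsewhere in this file), using
// mailto_class or external_link_class for the CSS class. The generated markup
// is roughly the following (hypothetical buffer-based helper; needs <stdio.h>):
static void sketch_autolink(char *buf, size_t len,
                            const char *href, const char *text, const char *css_class)
{
    // e.g. href = "mailto:user@example.com", text = "user@example.com"
    snprintf(buf, len, "<a href=\"%s\" class=\"%s\">%s</a>", href, css_class, text);
}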
1766
+ // internal links (links to other wiki articles) look like this:
1767
+ // [[another article]] (would point at, for example, "/wiki/another_article")
1768
+ // [[the other article|the link text we'll use for it]]
1769
+ // [[the other article | the link text we'll use for it]]
1770
+ // note that the forward slash is a reserved character which changes the meaning of an internal link;
1771
+ // this is a link that is external to the wiki but internal to the site as a whole:
1772
+ // [[bug/12]] (a relative link to "/bug/12")
1773
+ // MediaWiki has strict requirements about what it will accept as a link target:
1774
+ // all wikitext markup is disallowed:
1775
+ // example [[foo ''bar'' baz]]
1776
+ // renders [[foo <em>bar</em> baz]] (ie. not a link)
1777
+ // example [[foo <em>bar</em> baz]]
1778
+ // renders [[foo <em>bar</em> baz]] (ie. not a link)
1779
+ // example [[foo <nowiki>''</nowiki> baz]]
1780
+ // renders [[foo '' baz]] (ie. not a link)
1781
+ // example [[foo <bar> baz]]
1782
+ // renders [[foo &lt;bar&gt; baz]] (ie. not a link)
1783
+ // HTML entities and non-ASCII, however, make it through:
1784
+ // example [[foo &euro;]]
1785
+ // renders <a href="/wiki/Foo_%E2%82%AC">foo &euro;</a>
1786
+ // example [[foo €]]
1787
+ // renders <a href="/wiki/Foo_%E2%82%AC">foo €</a>
1788
+ // we'll impose similar restrictions here for the link target; allowed tokens will be:
1789
+ // SPACE, PRINTABLE, DEFAULT, QUOT and AMP
1790
+ // everything else will be rejected
1791
+ case LINK_START:
1792
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1793
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1794
+ rb_str_cat(i, link_start, sizeof(link_start) - 1);
1795
+ else if (IN(EXT_LINK_START))
1796
+ // already in external link scope! (and in fact, must be capturing link_text right now)
1797
+ rb_str_cat(i, link_start, sizeof(link_start) - 1);
1798
+ else if (IN(LINK_START))
1799
+ {
1800
+ // already in internal link scope! this is a syntax error
1801
+ _Wikitext_rollback_failed_link(parser);
1802
+ rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
1803
+ }
1804
+ else if (IN(SEPARATOR))
1805
+ {
1806
+ // scanning internal link text
1807
+ }
1808
+ else // not in internal link scope yet
1809
+ {
1810
+ // will either emit a link, or the rollback of a failed link, so start the para now
1811
+ _Wikitext_pop_excess_elements(parser);
1812
+ _Wikitext_start_para_if_necessary(parser);
1813
+ ary_push(parser->scope, LINK_START);
1814
+
1815
+ // look ahead and try to gobble up link target
1816
+ while (NEXT_TOKEN(), (type = token->type))
1817
+ {
1818
+ if (type == SPACE ||
1819
+ type == PRINTABLE ||
1820
+ type == DEFAULT ||
1821
+ type == QUOT ||
1822
+ type == QUOT_ENTITY ||
1823
+ type == AMP ||
1824
+ type == AMP_ENTITY)
1825
+ {
1826
+ // accumulate these tokens into link_target
1827
+ if (NIL_P(parser->link_target))
1828
+ {
1829
+ parser->link_target = rb_str_new2("");
1830
+ parser->capture = parser->link_target;
1831
+ }
1832
+ if (type == QUOT_ENTITY)
1833
+ // don't insert the entity, insert the literal quote
1834
+ rb_str_cat(parser->link_target, quote, sizeof(quote) - 1);
1835
+ else if (type == AMP_ENTITY)
1836
+ // don't insert the entity, insert the literal ampersand
1837
+ rb_str_cat(parser->link_target, ampersand, sizeof(ampersand) - 1);
1838
+ else
1839
+ rb_str_cat(parser->link_target, token->start, TOKEN_LEN(token));
1840
+ }
1841
+ else if (type == LINK_END)
1842
+ break; // jump back to top of loop (will handle this in LINK_END case below)
1843
+ else if (type == SEPARATOR)
1844
+ {
1845
+ ary_push(parser->scope, SEPARATOR);
1846
+ parser->link_text = rb_str_new2("");
1847
+ parser->capture = parser->link_text;
1848
+ token = NULL;
1849
+ break;
1850
+ }
1851
+ else // unexpected token (syntax error)
1852
+ {
1853
+ _Wikitext_rollback_failed_link(parser);
1854
+ break; // jump back to top of loop to handle unexpected token
1855
+ }
1856
+ }
1857
+
1858
+ // jump to top of the loop to process token we scanned during lookahead (if any)
1859
+ continue;
1860
+ }
1861
+ break;
1862
+
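// Editor's note: an illustrative sketch, not part of the diffed package source.
// As the comment block above explains, the link target accumulated by the
// lookahead is sanitized and percent-encoded in the LINK_END case below (via
// _Wikitext_parser_sanitize_link_target and _Wikitext_parser_encode_link_target,
// both defined elsewhere in this file). Assuming MediaWiki-like normalization,
// the space-to-underscore step on its own would look like this (hypothetical
// helper):
static void spaces_to_underscores(char *target)
{
    for (char *p = target; *p; p++)
        if (*p == ' ')
            *p = '_';   // "another article" -> "another_article"
}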
1863
+ case LINK_END:
1864
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1865
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1866
+ rb_str_cat(i, link_end, sizeof(link_end) - 1);
1867
+ else if (IN(EXT_LINK_START))
1868
+ // already in external link scope! (and in fact, must be capturing link_text right now)
1869
+ rb_str_cat(i, link_end, sizeof(link_end) - 1);
1870
+ else if (IN(LINK_START))
1871
+ {
1872
+ // in internal link scope!
1873
+ if (NIL_P(parser->link_text) || RSTRING_LEN(parser->link_text) == 0)
1874
+ // use link target as link text
1875
+ parser->link_text = _Wikitext_parser_sanitize_link_target(parser->link_target, Qtrue);
1876
+ else
1877
+ parser->link_text = _Wikitext_parser_trim_link_target(parser->link_text);
1878
+ _Wikitext_parser_encode_link_target(parser);
1879
+ _Wikitext_pop_from_stack_up_to(parser, i, LINK_START, Qtrue);
1880
+ parser->capture = Qnil;
1881
+ if (parser->special_link)
1882
+ i = _Wikitext_hyperlink(rb_str_new2("/"), parser->link_target, parser->link_text, Qnil);
1883
+ else
1884
+ i = _Wikitext_hyperlink(prefix, parser->link_target, parser->link_text, Qnil);
1885
+ rb_str_append(parser->output, i);
1886
+ parser->link_target = Qnil;
1887
+ parser->link_text = Qnil;
1888
+ }
1889
+ else // wasn't in internal link scope
1890
+ {
1891
+ _Wikitext_pop_excess_elements(parser);
1892
+ _Wikitext_start_para_if_necessary(parser);
1893
+ rb_str_cat(i, link_end, sizeof(link_end) - 1);
1894
+ }
1895
+ break;
1896
+
1897
+ // external links look like this:
1898
+ // [http://google.com/ the link text]
1899
+ // strings in square brackets which don't match this syntax get passed through literally; eg:
1900
+ // he was very angery [sic] about the turn of events
1901
+ case EXT_LINK_START:
1902
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1903
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1904
+ rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
1905
+ else if (IN(EXT_LINK_START))
1906
+ // already in external link scope! (and in fact, must be capturing link_text right now)
1907
+ rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
1908
+ else if (IN(LINK_START))
1909
+ {
1910
+ // already in internal link scope!
1911
+ i = rb_str_new(ext_link_start, sizeof(ext_link_start) - 1);
1912
+ if (NIL_P(parser->link_target))
1913
+ // this must be the first character of our link target
1914
+ parser->link_target = i;
1915
+ else if (IN(SPACE))
1916
+ {
1917
+ // link target has already been scanned
1918
+ if (NIL_P(parser->link_text))
1919
+ // this must be the first character of our link text
1920
+ parser->link_text = i;
1921
+ else
1922
+ // add to existing link text
1923
+ rb_str_append(parser->link_text, i);
1924
+ }
1925
+ else
1926
+ // add to existing link target
1927
+ rb_str_append(parser->link_target, i);
1928
+ }
1929
+ else // not in external link scope yet
1930
+ {
1931
+ // will either emit a link, or the rollback of a failed link, so start the para now
1932
+ _Wikitext_pop_excess_elements(parser);
1933
+ _Wikitext_start_para_if_necessary(parser);
1934
+
1935
+ // look ahead: expect a URI
1936
+ NEXT_TOKEN();
1937
+ if (token->type == URI)
1938
+ ary_push(parser->scope, EXT_LINK_START); // so far so good, jump back to the top of the loop
1939
+ else
1940
+ // only get here if there was a syntax error (missing URI)
1941
+ rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
1942
+ continue; // jump back to top of loop to handle token (either URI or whatever it is)
1943
+ }
1944
+ break;
1945
+
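// Editor's note: an illustrative sketch, not part of the diffed package source.
// Taken together, the EXT_LINK_START case above and the EXT_LINK_END case below
// accept only the form "[URI SPACE link text]"; anything else is rolled back
// out as literal text. The acceptance test reduces to (hypothetical helper):
static int ext_link_is_complete(int saw_uri, int saw_space, long text_len)
{
    // "[http://google.com/ the link text]" -> 1 (emit an anchor element)
    // "[http://google.com/]"               -> 0 (no link text: rolled back)
    // "[sic]"                              -> 0 (no URI: passed through literally)
    return saw_uri && saw_space && text_len > 0;
}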
1946
+ case EXT_LINK_END:
1947
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1948
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1949
+ rb_str_cat(i, ext_link_end, sizeof(ext_link_end) - 1);
1950
+ else if (IN(EXT_LINK_START))
1951
+ {
1952
+ if (NIL_P(parser->link_text))
1953
+ // syntax error: external link with no link text
1954
+ _Wikitext_rollback_failed_external_link(parser);
1955
+ else
1956
+ {
1957
+ // success!
1958
+ _Wikitext_pop_from_stack_up_to(parser, i, EXT_LINK_START, Qtrue);
1959
+ parser->capture = Qnil;
1960
+ i = _Wikitext_hyperlink(Qnil, parser->link_target, parser->link_text, parser->external_link_class);
1961
+ rb_str_append(parser->output, i);
1962
+ }
1963
+ parser->link_target = Qnil;
1964
+ parser->link_text = Qnil;
1965
+ }
1966
+ else
1967
+ {
1968
+ _Wikitext_pop_excess_elements(parser);
1969
+ _Wikitext_start_para_if_necessary(parser);
1970
+ rb_str_cat(parser->output, ext_link_end, sizeof(ext_link_end) - 1);
1971
+ }
1972
+ break;
1973
+
1974
+ case SEPARATOR:
1975
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1976
+ _Wikitext_pop_excess_elements(parser);
1977
+ _Wikitext_start_para_if_necessary(parser);
1978
+ rb_str_cat(i, separator, sizeof(separator) - 1);
1979
+ break;
1980
+
1981
+ case SPACE:
1982
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
1983
+ if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
1984
+ rb_str_cat(i, token->start, TOKEN_LEN(token));
1985
+ else
1986
+ {
1987
+ // peek ahead to see next token
1988
+ char *token_ptr = token->start;
1989
+ int token_len = TOKEN_LEN(token);
1990
+ NEXT_TOKEN();
1991
+ type = token->type;
1992
+ if (((type == H6_END) && IN(H6_START)) ||
1993
+ ((type == H5_END) && IN(H5_START)) ||
1994
+ ((type == H4_END) && IN(H4_START)) ||
1995
+ ((type == H3_END) && IN(H3_START)) ||
1996
+ ((type == H2_END) && IN(H2_START)) ||
1997
+ ((type == H1_END) && IN(H1_START)))
1998
+ {
1999
+ // will suppress emission of the space (discard) if the next token is an H6_END, H5_END etc. and we are in the corresponding scope
2000
+ }
2001
+ else
2002
+ {
2003
+ // emit the space
2004
+ _Wikitext_pop_excess_elements(parser);
2005
+ _Wikitext_start_para_if_necessary(parser);
2006
+ rb_str_cat(i, token_ptr, token_len);
2007
+ }
2008
+
2009
+ // jump to top of the loop to process token we scanned during lookahead
2010
+ continue;
2011
+ }
2012
+ break;
2013
+
2014
+ case QUOT_ENTITY:
2015
+ case AMP_ENTITY:
2016
+ case NAMED_ENTITY:
2017
+ case DECIMAL_ENTITY:
2018
+ // pass these through unaltered as they are case sensitive
2019
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2020
+ _Wikitext_pop_excess_elements(parser);
2021
+ _Wikitext_start_para_if_necessary(parser);
2022
+ rb_str_cat(i, token->start, TOKEN_LEN(token));
2023
+ break;
2024
+
2025
+ case HEX_ENTITY:
2026
+ // normalize hex entities (downcase them)
2027
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2028
+ _Wikitext_pop_excess_elements(parser);
2029
+ _Wikitext_start_para_if_necessary(parser);
2030
+ rb_str_append(i, _Wikitext_downcase(TOKEN_TEXT(token)));
2031
+ break;
2032
+
2033
+ case QUOT:
2034
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2035
+ _Wikitext_pop_excess_elements(parser);
2036
+ _Wikitext_start_para_if_necessary(parser);
2037
+ rb_str_cat(i, quot_entity, sizeof(quot_entity) - 1);
2038
+ break;
2039
+
2040
+ case AMP:
2041
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2042
+ _Wikitext_pop_excess_elements(parser);
2043
+ _Wikitext_start_para_if_necessary(parser);
2044
+ rb_str_cat(i, amp_entity, sizeof(amp_entity) - 1);
2045
+ break;
2046
+
2047
+ case LESS:
2048
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2049
+ _Wikitext_pop_excess_elements(parser);
2050
+ _Wikitext_start_para_if_necessary(parser);
2051
+ rb_str_cat(i, lt_entity, sizeof(lt_entity) - 1);
2052
+ break;
2053
+
2054
+ case GREATER:
2055
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2056
+ _Wikitext_pop_excess_elements(parser);
2057
+ _Wikitext_start_para_if_necessary(parser);
2058
+ rb_str_cat(i, gt_entity, sizeof(gt_entity) - 1);
2059
+ break;
2060
+
2061
+ case CRLF:
2062
+ parser->pending_crlf = Qfalse;
2063
+ _Wikitext_rollback_failed_link(parser); // if any
2064
+ _Wikitext_rollback_failed_external_link(parser); // if any
2065
+ if (IN(NO_WIKI_START) || IN(PRE_START))
2066
+ {
2067
+ ary_clear(parser->line_buffer);
2068
+ rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
2069
+ break;
2070
+ }
2071
+ else if (IN(PRE))
2072
+ {
2073
+ // beware when nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, that must be end of PRE block
2074
+ if (NO_ITEM(ary_entry(parser->line_buffer, -2)) || ary_entry(parser->line_buffer, -2) == BLOCKQUOTE)
2075
+ // don't emit in this case
2076
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
2077
+ else
2078
+ {
2079
+ // peek ahead to see if this is definitely the end of the PRE block
2080
+ NEXT_TOKEN();
2081
+ type = token->type;
2082
+ if (type != BLOCKQUOTE && type != PRE)
2083
+ {
2084
+ // this is definitely the end of the block, so don't emit
2085
+ _Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
2086
+ }
2087
+ else
2088
+ // potentially will emit
2089
+ parser->pending_crlf = Qtrue;
2090
+
2091
+ // delete the entire contents of the line scope stack and buffer
2092
+ ary_clear(parser->line);
2093
+ ary_clear(parser->line_buffer);
2094
+ continue; // jump back to top of loop to handle token grabbed via lookahead
2095
+ }
2096
+ }
2097
+ else
2098
+ {
2099
+ parser->pending_crlf = Qtrue;
2100
+
2101
+ // count number of BLOCKQUOTE tokens in line buffer (can be zero) and pop back to that level
2102
+ // as a side effect, this handles any open span-level elements and unclosed blocks
2103
+ // (with special handling for P blocks and LI elements)
2104
+ i = ary_count(parser->line, BLOCKQUOTE) + ary_count(parser->scope, BLOCKQUOTE_START);
2105
+ for (j = parser->scope->count; j > i; j--)
2106
+ {
2107
+ if (parser->scope->count > 0 && ary_entry(parser->scope, -1) == LI)
2108
+ {
2109
+ parser->pending_crlf = Qfalse;
2110
+ break;
2111
+ }
2112
+
2113
+ // special handling on last iteration through the loop if the top item on the scope is a P block
2114
+ if ((j - i == 1) && ary_entry(parser->scope, -1) == P)
2115
+ {
2116
+ // if nothing (or only a BLOCKQUOTE) is on the line buffer (not the line stack!) prior to the CRLF, this must be a paragraph break
2117
+ // (note that we have to make sure we're not inside a BLOCKQUOTE_START block
2118
+ // because in those blocks BLOCKQUOTE tokens have no special meaning)
2119
+ if (NO_ITEM(ary_entry(parser->line_buffer, -2)) ||
2120
+ (ary_entry(parser->line_buffer, -2) == BLOCKQUOTE && !IN(BLOCKQUOTE_START)))
2121
+ // paragraph break
2122
+ parser->pending_crlf = Qfalse;
2123
+ else
2124
+ // not a paragraph break!
2125
+ continue;
2126
+ }
2127
+ _Wikitext_pop_from_stack(parser, Qnil);
2128
+ }
2129
+ }
2130
+
2131
+ // delete the entire contents of the line scope stack and buffer
2132
+ ary_clear(parser->line);
2133
+ ary_clear(parser->line_buffer);
2134
+ break;
2135
+
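// Editor's note: an illustrative sketch, not part of the diffed package source.
// The CRLF case above records a single newline as pending_crlf and only breaks
// the paragraph when the line contributed nothing, i.e. when it was blank:
//
//   "foo\nbar"    -> one <p> element
//   "foo\n\nbar"  -> two <p> elements
//
// The blank-line test reduces roughly to checking whether anything other than
// the trailing CRLF is left on the line buffer (hypothetical helper over a
// plain int array):
static int line_was_blank(const int *line_buffer, int count, int crlf_type)
{
    return count == 0 || (count == 1 && line_buffer[0] == crlf_type);
}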
2136
+ case PRINTABLE:
2137
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2138
+ _Wikitext_pop_excess_elements(parser);
2139
+ _Wikitext_start_para_if_necessary(parser);
2140
+ rb_str_cat(i, token->start, TOKEN_LEN(token));
2141
+ break;
2142
+
2143
+ case DEFAULT:
2144
+ i = NIL_P(parser->capture) ? parser->output : parser->capture;
2145
+ _Wikitext_pop_excess_elements(parser);
2146
+ _Wikitext_start_para_if_necessary(parser);
2147
+ rb_str_append(i, _Wikitext_utf32_char_to_entity(token->code_point)); // convert to entity
2148
+ break;
2149
+
2150
+ case END_OF_FILE:
2151
+ // close any open scopes on hitting EOF
2152
+ _Wikitext_rollback_failed_external_link(parser); // if any
2153
+ _Wikitext_rollback_failed_link(parser); // if any
2154
+ for (i = 0, j = parser->scope->count; i < j; i++)
2155
+ _Wikitext_pop_from_stack(parser, Qnil);
2156
+ goto return_output; // break not enough here (want to break out of outer while loop, not inner switch statement)
2157
+
2158
+ default:
2159
+ break;
2160
+ }
2161
+
2162
+ // reset the current token, forcing the lexer to return another token at the top of the loop
2163
+ token = NULL;
2164
+ } while (1);
2165
+ return_output:
2166
+ // BUG: these will leak if we exit this function by raising an exception; need to investigate using Data_Wrap_Struct
2167
+ ary_free(parser->scope);
2168
+ ary_free(parser->line);
2169
+ ary_free(parser->line_buffer);
2170
+ str_free(parser->line_ending);
2171
+ if (parser->tabulation)
2172
+ str_free(parser->tabulation);
2173
+ return parser->output;
2174
+ }
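// Editor's note: an illustrative sketch, not part of the diffed package source.
// The BUG comment above already names the likely fix: wrap the scratch
// allocations in a GC-managed object via Data_Wrap_Struct so they are released
// even when an exception unwinds the function before return_output is reached.
// Assuming a hypothetical scratch_t struct that owns the stacks:
static void scratch_free(void *ptr)
{
    scratch_t *scratch = (scratch_t *)ptr;
    ary_free(scratch->scope);
    ary_free(scratch->line);
    ary_free(scratch->line_buffer);
    free(scratch);
}
// At allocation time, before anything that can raise:
//     VALUE holder = Data_Wrap_Struct(rb_cObject, NULL, scratch_free, scratch);
// Keeping "holder" referenced on the C stack for the duration of the parse lets
// the garbage collector call scratch_free exactly once, whether the parse
// returns normally or raises.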