wikitext 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/ary.h +99 -0
- data/ext/depend +22 -0
- data/ext/extconf.rb +23 -0
- data/ext/parser.c +2174 -0
- data/ext/parser.h +31 -0
- data/ext/str.h +135 -0
- data/ext/token.c +109 -0
- data/ext/token.h +95 -0
- data/ext/wikitext.c +60 -0
- data/ext/wikitext.h +30 -0
- data/ext/wikitext_ragel.c +3354 -0
- data/ext/wikitext_ragel.h +17 -0
- data/spec/autolinking_spec.rb +122 -0
- data/spec/blockquote_spec.rb +570 -0
- data/spec/em_spec.rb +97 -0
- data/spec/encoding_spec.rb +124 -0
- data/spec/entity_spec.rb +40 -0
- data/spec/external_link_spec.rb +289 -0
- data/spec/h1_spec.rb +59 -0
- data/spec/h2_spec.rb +59 -0
- data/spec/h3_spec.rb +59 -0
- data/spec/h4_spec.rb +59 -0
- data/spec/h5_spec.rb +59 -0
- data/spec/h6_spec.rb +59 -0
- data/spec/indentation_spec.rb +70 -0
- data/spec/integration_spec.rb +265 -0
- data/spec/internal_link_spec.rb +445 -0
- data/spec/line_endings_spec.rb +81 -0
- data/spec/link_encoding_spec.rb +132 -0
- data/spec/link_sanitizing_spec.rb +228 -0
- data/spec/nowiki_spec.rb +155 -0
- data/spec/p_spec.rb +44 -0
- data/spec/pre_spec.rb +411 -0
- data/spec/regressions_spec.rb +45 -0
- data/spec/spec_helper.rb +77 -0
- data/spec/strong_em_spec.rb +89 -0
- data/spec/strong_spec.rb +99 -0
- data/spec/tokenizing_spec.rb +190 -0
- data/spec/tt_spec.rb +100 -0
- data/spec/ul_spec.rb +307 -0
- data/spec/wikitext_spec.rb +50 -0
- metadata +93 -0
data/ext/ary.h
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
// Copyright 2008 Wincent Colaiuta
|
2
|
+
// This program is free software: you can redistribute it and/or modify
|
3
|
+
// it under the terms of the GNU General Public License as published by
|
4
|
+
// the Free Software Foundation, either version 3 of the License, or
|
5
|
+
// (at your option) any later version.
|
6
|
+
//
|
7
|
+
// This program is distributed in the hope that it will be useful,
|
8
|
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
+
// GNU General Public License for more details.
|
11
|
+
//
|
12
|
+
// You should have received a copy of the GNU General Public License
|
13
|
+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
14
|
+
|
15
|
+
#include <ruby/ruby.h>
|
16
|
+
|
17
|
+
// Minimal growable stack of ints.
typedef struct
{
    int count;      // number of entries currently in use
    int max;        // number of entries that fit in the current allocation
    int *entries;   // backing storage
} ary_t;

// initial capacity of a new array, and the step by which ary_push() enlarges a full one
// (in the test suite array count goes no higher than 25 or 26)
#define DEFAULT_ENTRY_COUNT 64

// ary_entry() returns INT_MAX for an out-of-range index; this macro tests for that sentinel.
// The argument is parenthesized so that compound expressions (eg. a ternary) are compared as a whole.
#define NO_ITEM(item) ((item) == INT_MAX)
|
28
|
+
|
29
|
+
inline ary_t *ary_new(void)
|
30
|
+
{
|
31
|
+
ary_t *ary = ALLOC_N(ary_t, 1);
|
32
|
+
ary->count = 0;
|
33
|
+
ary->max = DEFAULT_ENTRY_COUNT;
|
34
|
+
ary->entries = ALLOC_N(int, DEFAULT_ENTRY_COUNT);
|
35
|
+
return ary;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Releases an array: first the entry storage, then the struct itself.
// The memory is obtained through ALLOC_N/REALLOC_N (Ruby's allocator), so it is handed
// back with xfree() rather than the C library's free().
// Declared static so that each including translation unit has a usable definition.
static inline void ary_free(ary_t *ary)
{
    xfree(ary->entries);
    xfree(ary);
}
|
43
|
+
|
44
|
+
// Returns the entry at idx. A negative idx counts back from the end (-1 is the last entry).
// Returns INT_MAX (see NO_ITEM) when the resolved index falls outside 0..count-1.
// Declared static so that each including translation unit has a usable definition.
static inline int ary_entry(ary_t *ary, int idx)
{
    int pos = (idx < 0) ? ary->count + idx : idx;
    if (pos < 0 || pos >= ary->count)
        return INT_MAX;
    return ary->entries[pos];
}
|
50
|
+
|
51
|
+
// Empties the array by resetting its count; the allocated storage is kept for reuse.
// Declared static so that each including translation unit has a usable definition.
static inline void ary_clear(ary_t *ary)
{
    ary->count = 0;
}
|
55
|
+
|
56
|
+
// Drops the last entry, if there is one.
// Returns 1 if an entry was removed and 0 if the array was already empty;
// the removed value itself is not returned.
// Declared static so that each including translation unit has a usable definition.
static inline int ary_pop(ary_t *ary)
{
    if (ary->count <= 0)
        return 0;
    ary->count--;
    return 1;
}
|
65
|
+
|
66
|
+
// Appends val to the end of the array.
// When the storage is full it is first enlarged by DEFAULT_ENTRY_COUNT entries using REALLOC_N.
// Declared static so that each including translation unit has a usable definition.
static inline void ary_push(ary_t *ary, int val)
{
    if (ary->count == ary->max)
    {
        ary->max += DEFAULT_ENTRY_COUNT;
        REALLOC_N(ary->entries, int, ary->max);
    }
    ary->entries[ary->count++] = val;
}
|
76
|
+
|
77
|
+
// Returns 1 if val occurs anywhere among the entries in use, 0 otherwise.
// Declared static so that each including translation unit has a usable definition.
static inline int ary_includes(ary_t *ary, int val)
{
    int remaining = ary->count;
    const int *entry = ary->entries;
    while (remaining-- > 0)
    {
        if (*entry++ == val)
            return 1;
    }
    return 0;
}
|
86
|
+
|
87
|
+
// returns a count indicating the number of times the value appears in the collection
|
88
|
+
// refactored from _Wikitext_count()
|
89
|
+
inline int ary_count(ary_t *ary, int item)
|
90
|
+
{
|
91
|
+
int count = 0;
|
92
|
+
for (int i = 0, max = ary->count; i < max; i++)
|
93
|
+
{
|
94
|
+
if (ary->entries[i] == item)
|
95
|
+
count++;
|
96
|
+
}
|
97
|
+
return count;
|
98
|
+
}
|
99
|
+
|
data/ext/depend
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# depend
|
2
|
+
# Additional material for Makefile
|
3
|
+
# Copyright 2008 Wincent Colaiuta
|
4
|
+
# This program is free software: you can redistribute it and/or modify
|
5
|
+
# it under the terms of the GNU General Public License as published by
|
6
|
+
# the Free Software Foundation, either version 3 of the License, or
|
7
|
+
# (at your option) any later version.
|
8
|
+
#
|
9
|
+
# This program is distributed in the hope that it will be useful,
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12
|
+
# GNU General Public License for more details.
|
13
|
+
#
|
14
|
+
# You should have received a copy of the GNU General Public License
|
15
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
16
|
+
|
17
|
+
# the C sources use C99 constructs (eg. loop-scoped declarations and // comments), hence gnu99
CFLAGS += -std=gnu99
|
18
|
+
|
19
|
+
# per-object prerequisites: each object is rebuilt when any listed source or header changes
parser.o : ary.h parser.c parser.h token.h str.h wikitext.h wikitext_ragel.h
|
20
|
+
token.o : token.c token.h wikitext.h
|
21
|
+
wikitext.o : parser.h token.h wikitext.c wikitext.h wikitext_ragel.h
|
22
|
+
wikitext_ragel.o : token.h wikitext.h wikitext_ragel.h wikitext_ragel.c
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Copyright 2008 Wincent Colaiuta
|
2
|
+
# This program is free software: you can redistribute it and/or modify
|
3
|
+
# it under the terms of the GNU General Public License as published by
|
4
|
+
# the Free Software Foundation, either version 3 of the License, or
|
5
|
+
# (at your option) any later version.
|
6
|
+
#
|
7
|
+
# This program is distributed in the hope that it will be useful,
|
8
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
+
# GNU General Public License for more details.
|
11
|
+
#
|
12
|
+
# You should have received a copy of the GNU General Public License
|
13
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
14
|
+
|
15
|
+
require 'mkmf'

# Prints a message naming the missing prerequisite and terminates with exit status 1.
def missing(item)
  message = "couldn't find #{item} (required)"
  puts message
  exit 1
end

# ruby.h is mandatory: only generate the Makefile once it has been found
missing('ruby.h') unless have_header('ruby.h')
create_makefile('wikitext')
|
data/ext/parser.c
ADDED
@@ -0,0 +1,2174 @@
|
|
1
|
+
// Copyright 2007-2008 Wincent Colaiuta
|
2
|
+
// This program is free software: you can redistribute it and/or modify
|
3
|
+
// it under the terms of the GNU General Public License as published by
|
4
|
+
// the Free Software Foundation, either version 3 of the License, or
|
5
|
+
// (at your option) any later version.
|
6
|
+
//
|
7
|
+
// This program is distributed in the hope that it will be useful,
|
8
|
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
+
// GNU General Public License for more details.
|
11
|
+
//
|
12
|
+
// You should have received a copy of the GNU General Public License
|
13
|
+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
14
|
+
|
15
|
+
#include "parser.h"
|
16
|
+
#include "ary.h"
|
17
|
+
#include "str.h"
|
18
|
+
#include "wikitext.h"
|
19
|
+
#include "wikitext_ragel.h"
|
20
|
+
|
21
|
+
#define IN(type) ary_includes(parser->scope, type)
|
22
|
+
|
23
|
+
// poor man's object orientation in C:
|
24
|
+
// instead of parsing around multiple parameters between functions in the parser
|
25
|
+
// we pack everything into a struct and pass around only a pointer to that
|
26
|
+
typedef struct
|
27
|
+
{
|
28
|
+
VALUE output; // for accumulating output to be returned
|
29
|
+
VALUE capture; // for capturing substrings
|
30
|
+
VALUE link_target; // short term "memory" for parsing links
|
31
|
+
VALUE link_text; // short term "memory" for parsing links
|
32
|
+
VALUE external_link_class; // CSS class applied to external links
|
33
|
+
ary_t *scope; // stack for tracking scope
|
34
|
+
ary_t *line; // stack for tracking scope as implied by current line
|
35
|
+
ary_t *line_buffer; // stack for tracking raw tokens (not scope) on current line
|
36
|
+
VALUE pending_crlf; // boolean (Qtrue or Qfalse)
|
37
|
+
VALUE autolink; // boolean (Qtrue or Qfalse)
|
38
|
+
VALUE treat_slash_as_special; // boolean (Qtrue or Qfalse)
|
39
|
+
VALUE special_link; // boolean (Qtrue or Qfalse): is the current link_target a "special" link?
|
40
|
+
str_t *line_ending;
|
41
|
+
int base_indent; // controlled by the :indent option to Wikitext::Parser#parse
|
42
|
+
int current_indent; // fluctuates according to currently nested structures
|
43
|
+
str_t *tabulation; // caching buffer for emitting indentation
|
44
|
+
} parser_t;
|
45
|
+
|
46
|
+
// Fixed strings emitted by (or compared against in) the parser.
// Naming convention:
//   literal_*   wikitext markup reproduced verbatim
//   escaped_*   an HTML tag rendered harmless by entity-encoding its angle brackets
//   *_start/end real HTML tags written to the output
//   *_entity    named HTML entities
// Lengths are taken with sizeof(name) - 1 at the point of use, so these must remain arrays (not pointers).
const char escaped_no_wiki_start[]      = "&lt;nowiki&gt;";
const char escaped_no_wiki_end[]        = "&lt;/nowiki&gt;";
const char literal_strong_em[]          = "'''''";
const char literal_strong[]             = "'''";
const char literal_em[]                 = "''";
const char escaped_em_start[]           = "&lt;em&gt;";
const char escaped_em_end[]             = "&lt;/em&gt;";
const char escaped_strong_start[]       = "&lt;strong&gt;";
const char escaped_strong_end[]         = "&lt;/strong&gt;";
const char escaped_tt_start[]           = "&lt;tt&gt;";
const char escaped_tt_end[]             = "&lt;/tt&gt;";
const char literal_h6[]                 = "======";
const char literal_h5[]                 = "=====";
const char literal_h4[]                 = "====";
const char literal_h3[]                 = "===";
const char literal_h2[]                 = "==";
const char literal_h1[]                 = "=";
const char pre_start[]                  = "<pre>";
const char pre_end[]                    = "</pre>";
const char escaped_pre_start[]          = "&lt;pre&gt;";
const char escaped_pre_end[]            = "&lt;/pre&gt;";
const char blockquote_start[]           = "<blockquote>";
const char blockquote_end[]             = "</blockquote>";
const char escaped_blockquote_start[]   = "&lt;blockquote&gt;";
const char escaped_blockquote_end[]     = "&lt;/blockquote&gt;";
const char strong_em_start[]            = "<strong><em>";
const char strong_start[]               = "<strong>";
const char strong_end[]                 = "</strong>";
const char em_start[]                   = "<em>";
const char em_end[]                     = "</em>";
const char tt_start[]                   = "<tt>";
const char tt_end[]                     = "</tt>";
const char ol_start[]                   = "<ol>";
const char ol_end[]                     = "</ol>";
const char ul_start[]                   = "<ul>";
const char ul_end[]                     = "</ul>";
const char li_start[]                   = "<li>";
const char li_end[]                     = "</li>";
const char h6_start[]                   = "<h6>";
const char h6_end[]                     = "</h6>";
const char h5_start[]                   = "<h5>";
const char h5_end[]                     = "</h5>";
const char h4_start[]                   = "<h4>";
const char h4_end[]                     = "</h4>";
const char h3_start[]                   = "<h3>";
const char h3_end[]                     = "</h3>";
const char h2_start[]                   = "<h2>";
const char h2_end[]                     = "</h2>";
const char h1_start[]                   = "<h1>";
const char h1_end[]                     = "</h1>";
const char p_start[]                    = "<p>";
const char p_end[]                      = "</p>";
const char space[]                      = " ";
const char a_start[]                    = "<a href=\"";
const char a_class[]                    = "\" class=\"";
const char a_start_close[]              = "\">";
const char a_end[]                      = "</a>";
const char link_start[]                 = "[[";
const char link_end[]                   = "]]";
const char separator[]                  = "|";
const char ext_link_start[]             = "[";
const char backtick[]                   = "`";
const char quote[]                      = "\"";
const char ampersand[]                  = "&";
const char quot_entity[]                = "&quot;";
const char amp_entity[]                 = "&amp;";
const char lt_entity[]                  = "&lt;";
const char gt_entity[]                  = "&gt;";
const char escaped_blockquote[]         = "&gt; ";
const char ext_link_end[]               = "]";
|
116
|
+
|
117
|
+
// for testing and debugging only
|
118
|
+
VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
|
119
|
+
{
|
120
|
+
if (NIL_P(string))
|
121
|
+
return Qnil;
|
122
|
+
string = StringValue(string);
|
123
|
+
VALUE tokens = rb_ary_new();
|
124
|
+
char *p = RSTRING_PTR(string);
|
125
|
+
long len = RSTRING_LEN(string);
|
126
|
+
char *pe = p + len;
|
127
|
+
token_t token;
|
128
|
+
next_token(&token, NULL, p, pe);
|
129
|
+
rb_ary_push(tokens, _Wikitext_token(&token));
|
130
|
+
while (token.type != END_OF_FILE)
|
131
|
+
{
|
132
|
+
next_token(&token, &token, NULL, pe);
|
133
|
+
rb_ary_push(tokens, _Wikitext_token(&token));
|
134
|
+
}
|
135
|
+
return tokens;
|
136
|
+
}
|
137
|
+
|
138
|
+
// for benchmarking raw tokenization speed only
|
139
|
+
VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
|
140
|
+
{
|
141
|
+
if (NIL_P(string))
|
142
|
+
return Qnil;
|
143
|
+
string = StringValue(string);
|
144
|
+
char *p = RSTRING_PTR(string);
|
145
|
+
long len = RSTRING_LEN(string);
|
146
|
+
char *pe = p + len;
|
147
|
+
token_t token;
|
148
|
+
next_token(&token, NULL, p, pe);
|
149
|
+
while (token.type != END_OF_FILE)
|
150
|
+
next_token(&token, &token, NULL, pe);
|
151
|
+
return Qnil;
|
152
|
+
}
|
153
|
+
|
154
|
+
// we downcase "in place", overwriting the original contents of the buffer and returning the same string
|
155
|
+
inline VALUE _Wikitext_downcase(VALUE string)
|
156
|
+
{
|
157
|
+
char *ptr = RSTRING_PTR(string);
|
158
|
+
long len = RSTRING_LEN(string);
|
159
|
+
for (long i = 0; i < len; i++)
|
160
|
+
{
|
161
|
+
if (ptr[i] >= 'A' && ptr[i] <= 'Z')
|
162
|
+
ptr[i] += 32;
|
163
|
+
}
|
164
|
+
return string;
|
165
|
+
}
|
166
|
+
|
167
|
+
// Builds and returns a new string of the form:
//
//      <a href="PREFIX TARGET" class="CLASS">TEXT</a>
//
// (without the space between prefix and target). The prefix is omitted when link_prefix
// is nil, and the class attribute is omitted when link_class is nil.
// The pieces are appended exactly as passed in; no escaping is performed here.
inline VALUE _Wikitext_hyperlink(VALUE link_prefix, VALUE link_target, VALUE link_text, VALUE link_class)
{
    VALUE anchor = rb_str_new(a_start, sizeof(a_start) - 1);        // <a href="
    if (!NIL_P(link_prefix))
        rb_str_append(anchor, link_prefix);
    rb_str_append(anchor, link_target);
    if (!NIL_P(link_class))
    {
        rb_str_cat(anchor, a_class, sizeof(a_class) - 1);           // " class="
        rb_str_append(anchor, link_class);
    }
    rb_str_cat(anchor, a_start_close, sizeof(a_start_close) - 1);   // ">
    rb_str_append(anchor, link_text);
    rb_str_cat(anchor, a_end, sizeof(a_end) - 1);                   // </a>
    return anchor;
}
|
183
|
+
|
184
|
+
// will emit indentation only if we are about to emit any of:
|
185
|
+
// <blockquote>, <p>, <ul>, <ol>, <li>, <h1> etc, <pre>
|
186
|
+
// each time we enter one of those spans must ++ the indentation level
|
187
|
+
inline void _Wikitext_indent(parser_t *parser)
|
188
|
+
{
|
189
|
+
int space_count = parser->current_indent + parser->base_indent;
|
190
|
+
if (space_count > 0)
|
191
|
+
{
|
192
|
+
char *old_end, *new_end;
|
193
|
+
if (!parser->tabulation)
|
194
|
+
{
|
195
|
+
parser->tabulation = str_new_size(space_count);
|
196
|
+
old_end = parser->tabulation->ptr;
|
197
|
+
}
|
198
|
+
else if (parser->tabulation->len < space_count)
|
199
|
+
{
|
200
|
+
old_end = parser->tabulation->ptr;
|
201
|
+
str_grow(parser->tabulation, space_count);
|
202
|
+
}
|
203
|
+
else
|
204
|
+
old_end = parser->tabulation->ptr;
|
205
|
+
new_end = parser->tabulation->ptr + space_count;
|
206
|
+
while (old_end < new_end)
|
207
|
+
*old_end++ = ' ';
|
208
|
+
rb_str_cat(parser->output, parser->tabulation->ptr, space_count);
|
209
|
+
}
|
210
|
+
parser->current_indent += 2;
|
211
|
+
}
|
212
|
+
|
213
|
+
// Decreases current_indent by 2.
// If emit is Qtrue, additionally writes the resulting indentation (current_indent + base_indent
// spaces, taken from the parser->tabulation buffer) to the output; for any other value of emit
// nothing is written.
inline void _Wikitext_dedent(parser_t *parser, VALUE emit)
{
    parser->current_indent -= 2;
    if (emit == Qtrue)
    {
        int width = parser->current_indent + parser->base_indent;
        if (width > 0)
            rb_str_cat(parser->output, parser->tabulation->ptr, width);
    }
}
|
222
|
+
|
223
|
+
// Pops a single item off the parser's scope stack.
|
224
|
+
// A corresponding closing tag is written to the target string.
|
225
|
+
// The target string may be the main output buffer, or a substring capturing buffer if a link is being scanned.
|
226
|
+
void _Wikitext_pop_from_stack(parser_t *parser, VALUE target)
|
227
|
+
{
|
228
|
+
int top = ary_entry(parser->scope, -1);
|
229
|
+
if (NO_ITEM(top))
|
230
|
+
return;
|
231
|
+
if (NIL_P(target))
|
232
|
+
target = parser->output;
|
233
|
+
switch (top)
|
234
|
+
{
|
235
|
+
case PRE:
|
236
|
+
case PRE_START:
|
237
|
+
rb_str_cat(target, pre_end, sizeof(pre_end) - 1);
|
238
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
239
|
+
_Wikitext_dedent(parser, Qfalse);
|
240
|
+
break;
|
241
|
+
|
242
|
+
case BLOCKQUOTE:
|
243
|
+
case BLOCKQUOTE_START:
|
244
|
+
_Wikitext_dedent(parser, Qtrue);
|
245
|
+
rb_str_cat(target, blockquote_end, sizeof(blockquote_end) - 1);
|
246
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
247
|
+
break;
|
248
|
+
|
249
|
+
case NO_WIKI_START:
|
250
|
+
// not a real HTML tag; so nothing to pop
|
251
|
+
break;
|
252
|
+
|
253
|
+
case STRONG:
|
254
|
+
case STRONG_START:
|
255
|
+
rb_str_cat(target, strong_end, sizeof(strong_end) - 1);
|
256
|
+
break;
|
257
|
+
|
258
|
+
case EM:
|
259
|
+
case EM_START:
|
260
|
+
rb_str_cat(target, em_end, sizeof(em_end) - 1);
|
261
|
+
break;
|
262
|
+
|
263
|
+
case TT:
|
264
|
+
case TT_START:
|
265
|
+
rb_str_cat(target, tt_end, sizeof(tt_end) - 1);
|
266
|
+
break;
|
267
|
+
|
268
|
+
case OL:
|
269
|
+
_Wikitext_dedent(parser, Qtrue);
|
270
|
+
rb_str_cat(target, ol_end, sizeof(ol_end) - 1);
|
271
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
272
|
+
break;
|
273
|
+
|
274
|
+
case UL:
|
275
|
+
_Wikitext_dedent(parser, Qtrue);
|
276
|
+
rb_str_cat(target, ul_end, sizeof(ul_end) - 1);
|
277
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
278
|
+
break;
|
279
|
+
|
280
|
+
case NESTED_LIST:
|
281
|
+
// next token to pop will be a LI
|
282
|
+
// LI is an interesting token because sometimes we want it to behave like P (ie. do a non-emitting indent)
|
283
|
+
// and other times we want it to behave like BLOCKQUOTE (ie. when it has a nested list inside)
|
284
|
+
// hence this hack: we do an emitting dedent on behalf of the LI that we know must be coming
|
285
|
+
// and then when we pop the actual LI itself (below) we do the standard non-emitting indent
|
286
|
+
_Wikitext_dedent(parser, Qtrue); // we really only want to emit the spaces
|
287
|
+
parser->current_indent += 2; // we don't want to decrement the actual indent level, so put it back
|
288
|
+
break;
|
289
|
+
|
290
|
+
case LI:
|
291
|
+
rb_str_cat(target, li_end, sizeof(li_end) - 1);
|
292
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
293
|
+
_Wikitext_dedent(parser, Qfalse);
|
294
|
+
break;
|
295
|
+
|
296
|
+
case H6_START:
|
297
|
+
rb_str_cat(target, h6_end, sizeof(h6_end) - 1);
|
298
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
299
|
+
_Wikitext_dedent(parser, Qfalse);
|
300
|
+
break;
|
301
|
+
|
302
|
+
case H5_START:
|
303
|
+
rb_str_cat(target, h5_end, sizeof(h5_end) - 1);
|
304
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
305
|
+
_Wikitext_dedent(parser, Qfalse);
|
306
|
+
break;
|
307
|
+
|
308
|
+
case H4_START:
|
309
|
+
rb_str_cat(target, h4_end, sizeof(h4_end) - 1);
|
310
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
311
|
+
_Wikitext_dedent(parser, Qfalse);
|
312
|
+
break;
|
313
|
+
|
314
|
+
case H3_START:
|
315
|
+
rb_str_cat(target, h3_end, sizeof(h3_end) - 1);
|
316
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
317
|
+
_Wikitext_dedent(parser, Qfalse);
|
318
|
+
break;
|
319
|
+
|
320
|
+
case H2_START:
|
321
|
+
rb_str_cat(target, h2_end, sizeof(h2_end) - 1);
|
322
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
323
|
+
_Wikitext_dedent(parser, Qfalse);
|
324
|
+
break;
|
325
|
+
|
326
|
+
case H1_START:
|
327
|
+
rb_str_cat(target, h1_end, sizeof(h1_end) - 1);
|
328
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
329
|
+
_Wikitext_dedent(parser, Qfalse);
|
330
|
+
break;
|
331
|
+
|
332
|
+
case LINK_START:
|
333
|
+
// not an HTML tag; so nothing to emit
|
334
|
+
break;
|
335
|
+
|
336
|
+
case EXT_LINK_START:
|
337
|
+
// not an HTML tag; so nothing to emit
|
338
|
+
break;
|
339
|
+
|
340
|
+
case SPACE:
|
341
|
+
// not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
|
342
|
+
break;
|
343
|
+
|
344
|
+
case SEPARATOR:
|
345
|
+
// not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
|
346
|
+
break;
|
347
|
+
|
348
|
+
case P:
|
349
|
+
rb_str_cat(target, p_end, sizeof(p_end) - 1);
|
350
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
351
|
+
_Wikitext_dedent(parser, Qfalse);
|
352
|
+
break;
|
353
|
+
|
354
|
+
case END_OF_FILE:
|
355
|
+
// nothing to do
|
356
|
+
break;
|
357
|
+
|
358
|
+
default:
|
359
|
+
// should probably raise an exception here
|
360
|
+
break;
|
361
|
+
}
|
362
|
+
ary_pop(parser->scope);
|
363
|
+
}
|
364
|
+
|
365
|
+
// Pops items off the top of parser's scope stack, accumulating closing tags for them into the target string, until item is reached.
|
366
|
+
// If including is Qtrue then the item itself is also popped.
|
367
|
+
// The target string may be the main output buffer, or a substring capturing buffer when scanning links.
|
368
|
+
void _Wikitext_pop_from_stack_up_to(parser_t *parser, VALUE target, int item, VALUE including)
|
369
|
+
{
|
370
|
+
int continue_looping = 1;
|
371
|
+
do
|
372
|
+
{
|
373
|
+
int top = ary_entry(parser->scope, -1);
|
374
|
+
if (NO_ITEM(top))
|
375
|
+
return;
|
376
|
+
if (top == item)
|
377
|
+
{
|
378
|
+
if (including != Qtrue)
|
379
|
+
return;
|
380
|
+
continue_looping = 0;
|
381
|
+
}
|
382
|
+
_Wikitext_pop_from_stack(parser, target);
|
383
|
+
} while (continue_looping);
|
384
|
+
}
|
385
|
+
|
386
|
+
// Opens a paragraph if no suitable block is open yet, otherwise flushes any pending line break.
//
// - in capturing mode: returns immediately without touching anything
// - scope empty, or top of scope is BLOCKQUOTE/BLOCKQUOTE_START (with nothing in it yet):
//   emits indentation and <p>, and records P on both the scope and the line stacks
// - otherwise, if a CRLF is pending: inside a P it becomes a single space, inside a PRE it is
//   emitted as a real line ending, and anywhere else it is dropped
//
// In every non-capturing case the pending CRLF flag is cleared on the way out.
inline void _Wikitext_start_para_if_necessary(parser_t *parser)
{
    if (!NIL_P(parser->capture))
        return;

    int top = ary_entry(parser->scope, -1);     // INT_MAX if the scope is empty
    int block_needed = (parser->scope->count == 0 || top == BLOCKQUOTE || top == BLOCKQUOTE_START);
    if (block_needed)
    {
        _Wikitext_indent(parser);
        rb_str_cat(parser->output, p_start, sizeof(p_start) - 1);
        ary_push(parser->scope, P);
        ary_push(parser->line, P);
    }
    else if (parser->pending_crlf == Qtrue)
    {
        if (ary_includes(parser->scope, P))
        {
            // already in a paragraph block; convert pending CRLF into a space
            rb_str_cat(parser->output, space, sizeof(space) - 1);
        }
        else if (ary_includes(parser->scope, PRE))
        {
            // PRE blocks can have pending CRLF too (helps us avoid emitting the trailing newline)
            rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
        }
    }
    parser->pending_crlf = Qfalse;
}
|
412
|
+
|
413
|
+
// Helper function that pops any excess elements off scope (pushing is already handled in the respective rules).
|
414
|
+
// For example, given input like:
|
415
|
+
//
|
416
|
+
// > > foo
|
417
|
+
// bar
|
418
|
+
//
|
419
|
+
// Upon seeing "bar", we want to pop two BLOCKQUOTE elements from the scope.
|
420
|
+
// The reverse case (shown below) is handled from inside the BLOCKQUOTE rule itself:
|
421
|
+
//
|
422
|
+
// foo
|
423
|
+
// > > bar
|
424
|
+
//
|
425
|
+
// Things are made slightly more complicated by the fact that there is one block-level tag that can be on the scope
|
426
|
+
// but not on the line scope:
|
427
|
+
//
|
428
|
+
// <blockquote>foo
|
429
|
+
// bar</blockquote>
|
430
|
+
//
|
431
|
+
// Here on seeing "bar" we have one item on the scope (BLOCKQUOTE_START) which we don't want to pop, but we have nothing
|
432
|
+
// on the line scope.
|
433
|
+
// Luckily, BLOCKQUOTE_START tokens can only appear at the start of the scope array, so we can check for them first before
|
434
|
+
// entering the for loop.
|
435
|
+
void inline _Wikitext_pop_excess_elements(parser_t *parser)
|
436
|
+
{
|
437
|
+
if (!NIL_P(parser->capture)) // we don't pop anything if in capturing mode
|
438
|
+
return;
|
439
|
+
for (int i = parser->scope->count - ary_count(parser->scope, BLOCKQUOTE_START), j = parser->line->count; i > j; i--)
|
440
|
+
{
|
441
|
+
// special case for last item on scope
|
442
|
+
if (i - j == 1)
|
443
|
+
{
|
444
|
+
// don't auto-pop P if it is only item on scope
|
445
|
+
if (ary_entry(parser->scope, -1) == P)
|
446
|
+
{
|
447
|
+
// add P to the line scope to prevent us entering the loop at all next time around
|
448
|
+
ary_push(parser->line, P);
|
449
|
+
continue;
|
450
|
+
}
|
451
|
+
}
|
452
|
+
_Wikitext_pop_from_stack(parser, parser->output);
|
453
|
+
}
|
454
|
+
}
|
455
|
+
|
456
|
+
#define INVALID_ENCODING(msg) do { if (dest_ptr) free(dest_ptr); rb_raise(eWikitextParserError, "invalid encoding: " msg); } while(0)
|
457
|
+
|
458
|
+
// convert a single UTF-8 codepoint to UTF-32
|
459
|
+
// expects an input buffer, src, containing a UTF-8 encoded character (which may be multi-byte)
|
460
|
+
// the end of the input buffer, end, is also passed in to allow the detection of invalidly truncated codepoints
|
461
|
+
// the number of bytes in the UTF-8 character (between 1 and 4) is returned by reference in width_out
|
462
|
+
// raises a RangeError if the supplied character is invalid UTF-8
|
463
|
+
// (in which case it also frees the block of memory indicated by dest_ptr if it is non-NULL)
|
464
|
+
inline uint32_t _Wikitext_utf8_to_utf32(char *src, char *end, long *width_out, void *dest_ptr)
|
465
|
+
{
|
466
|
+
uint32_t dest;
|
467
|
+
if ((unsigned char)src[0] <= 0x7f) // ASCII
|
468
|
+
{
|
469
|
+
dest = src[0];
|
470
|
+
*width_out = 1;
|
471
|
+
}
|
472
|
+
else if ((src[0] & 0xe0) == 0xc0) // byte starts with 110..... : this should be a two-byte sequence
|
473
|
+
{
|
474
|
+
if (src + 1 >= end)
|
475
|
+
INVALID_ENCODING("truncated byte sequence"); // no second byte
|
476
|
+
else if (((unsigned char)src[0] == 0xc0) || ((unsigned char)src[0] == 0xc1))
|
477
|
+
INVALID_ENCODING("overlong encoding"); // overlong encoding: lead byte of 110..... but code point <= 127
|
478
|
+
else if ((src[1] & 0xc0) != 0x80 )
|
479
|
+
INVALID_ENCODING("malformed byte sequence"); // should have second byte starting with 10......
|
480
|
+
dest = ((uint32_t)(src[0] & 0x1f)) << 6 | (src[1] & 0x3f);
|
481
|
+
*width_out = 2;
|
482
|
+
}
|
483
|
+
else if ((src[0] & 0xf0) == 0xe0) // byte starts with 1110.... : this should be a three-byte sequence
|
484
|
+
{
|
485
|
+
if (src + 2 >= end)
|
486
|
+
INVALID_ENCODING("truncated byte sequence"); // missing second or third byte
|
487
|
+
else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ))
|
488
|
+
INVALID_ENCODING("malformed byte sequence"); // should have second and third bytes starting with 10......
|
489
|
+
dest = ((uint32_t)(src[0] & 0x0f)) << 12 | ((uint32_t)(src[1] & 0x3f)) << 6 | (src[2] & 0x3f);
|
490
|
+
*width_out = 3;
|
491
|
+
}
|
492
|
+
else if ((src[0] & 0xf8) == 0xf0) // bytes starts with 11110... : this should be a four-byte sequence
|
493
|
+
{
|
494
|
+
if (src + 3 >= end)
|
495
|
+
INVALID_ENCODING("truncated byte sequence"); // missing second, third, or fourth byte
|
496
|
+
else if ((unsigned char)src[0] >= 0xf5 && (unsigned char)src[0] <= 0xf7)
|
497
|
+
INVALID_ENCODING("overlong encoding"); // disallowed by RFC 3629 (codepoints above 0x10ffff)
|
498
|
+
else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ) || ((src[3] & 0xc0) != 0x80 ))
|
499
|
+
INVALID_ENCODING("malformed byte sequence"); // should have second and third bytes starting with 10......
|
500
|
+
dest = ((uint32_t)(src[0] & 0x07)) << 18 | ((uint32_t)(src[1] & 0x3f)) << 12 | ((uint32_t)(src[1] & 0x3f)) << 6 | (src[2] & 0x3f);
|
501
|
+
*width_out = 4;
|
502
|
+
}
|
503
|
+
else // invalid input
|
504
|
+
INVALID_ENCODING("unexpected byte");
|
505
|
+
return dest;
|
506
|
+
}
|
507
|
+
|
508
|
+
// Returns a new 8-byte string holding a hexadecimal numeric entity of the form "&#xhhhh;"
// for the given character, using four lower-case hex digits.
// Only the low 16 bits of character are rendered; any higher bits are ignored.
// TODO: consider special casing some entities (ie. quot, amp, lt, gt etc)?
inline VALUE _Wikitext_utf32_char_to_entity(uint32_t character)
{
    char entity[8] = { '&', '#', 'x', 0, 0, 0, 0, ';' };
    for (int digit = 0; digit < 4; digit++)
    {
        // most significant nibble first: shifts of 12, 8, 4, 0
        char nibble = (character >> (12 - 4 * digit)) & 0xf;
        entity[3 + digit] = (nibble <= 9) ? nibble + '0' : nibble - 10 + 'a';
    }
    return rb_str_new((const char *)entity, sizeof(entity));
}
|
522
|
+
|
523
|
+
// Strips leading and trailing space characters (' ' only) from string.
// Returns string itself when there is nothing to strip (this includes the empty string),
// otherwise a new string holding the trimmed contents; a string consisting solely of
// spaces yields a new empty string.
// Raises (via StringValue) if string cannot be converted to a String.
inline VALUE _Wikitext_parser_trim_link_target(VALUE string)
{
    string = StringValue(string);
    char *start = RSTRING_PTR(string);
    char *end = start + RSTRING_LEN(string);

    // skip over leading spaces
    char *left = start;
    while (left < end && *left == ' ')
        left++;

    // back up over trailing spaces; never crosses left, so the length below cannot go negative
    char *right = end;
    while (right > left && right[-1] == ' ')
        right--;

    if (left == start && right == end)
        return string;
    return rb_str_new(left, right - left);
}
|
548
|
+
|
549
|
+
// - non-printable (non-ASCII) characters converted to numeric entities
|
550
|
+
// - QUOT and AMP characters converted to named entities
|
551
|
+
// - leading and trailing whitespace trimmed if trim is Qtrue
|
552
|
+
inline VALUE _Wikitext_parser_sanitize_link_target(VALUE string, VALUE trim)
|
553
|
+
{
|
554
|
+
string = StringValue(string); // raises if string is nil or doesn't quack like a string
|
555
|
+
char *src = RSTRING_PTR(string);
|
556
|
+
char *start = src; // remember this so we can check if we're at the start
|
557
|
+
long len = RSTRING_LEN(string);
|
558
|
+
char *end = src + len;
|
559
|
+
|
560
|
+
// start with a destination buffer twice the size of the source, will realloc if necessary
|
561
|
+
// slop = (len / 8) * 8 (ie. one in every 8 characters can be converted into an entity, each entity requires 8 bytes)
|
562
|
+
// this efficiently handles the most common case (where the size of the buffer doesn't change much)
|
563
|
+
char *dest = ALLOC_N(char, len * 2);
|
564
|
+
char *dest_ptr = dest; // hang on to this so we can pass it to free() later
|
565
|
+
char *non_space = dest; // remember last non-space character output
|
566
|
+
while (src < end)
|
567
|
+
{
|
568
|
+
// need at most 8 characters (8 bytes) to display each character
|
569
|
+
if (dest + 8 > dest_ptr + len) // outgrowing buffer, must reallocate
|
570
|
+
{
|
571
|
+
char *old_dest = dest;
|
572
|
+
char *old_dest_ptr = dest_ptr;
|
573
|
+
len = len + (end - src) * 8; // allocate enough for worst case
|
574
|
+
dest = realloc(dest_ptr, len); // will never have to realloc more than once
|
575
|
+
if (dest == NULL)
|
576
|
+
{
|
577
|
+
// would have used reallocf, but this has to run on Linux too, not just Darwin
|
578
|
+
free(dest_ptr);
|
579
|
+
rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
|
580
|
+
}
|
581
|
+
dest_ptr = dest;
|
582
|
+
dest = dest_ptr + (old_dest - old_dest_ptr);
|
583
|
+
non_space = dest_ptr + (non_space - old_dest_ptr);
|
584
|
+
}
|
585
|
+
|
586
|
+
if (*src == '"') // QUOT
|
587
|
+
{
|
588
|
+
char quot_entity_literal[] = { '&', 'q', 'u', 'o', 't', ';' }; // no trailing NUL
|
589
|
+
memcpy(dest, quot_entity_literal, sizeof(quot_entity_literal));
|
590
|
+
dest += sizeof(quot_entity_literal);
|
591
|
+
}
|
592
|
+
else if (*src == '&') // AMP
|
593
|
+
{
|
594
|
+
char amp_entity_literal[] = { '&', 'a', 'm', 'p', ';' }; // no trailing NUL
|
595
|
+
memcpy(dest, amp_entity_literal, sizeof(amp_entity_literal));
|
596
|
+
dest += sizeof(amp_entity_literal);
|
597
|
+
}
|
598
|
+
else if (*src == '<') // LESS_THAN
|
599
|
+
{
|
600
|
+
free(dest_ptr);
|
601
|
+
rb_raise(rb_eRangeError, "invalid link text (\"<\" may not appear in link text)");
|
602
|
+
}
|
603
|
+
else if (*src == '>') // GREATER_THAN
|
604
|
+
{
|
605
|
+
free(dest_ptr);
|
606
|
+
rb_raise(rb_eRangeError, "invalid link text (\">\" may not appear in link text)");
|
607
|
+
}
|
608
|
+
else if (*src == ' ' && src == start && trim == Qtrue)
|
609
|
+
start++; // we eat leading space
|
610
|
+
else if (*src >= 0x20 && *src <= 0x7e) // printable ASCII
|
611
|
+
{
|
612
|
+
*dest = *src;
|
613
|
+
dest++;
|
614
|
+
}
|
615
|
+
else // all others: must convert to entities
|
616
|
+
{
|
617
|
+
long width;
|
618
|
+
VALUE entity = _Wikitext_utf32_char_to_entity(_Wikitext_utf8_to_utf32(src, end, &width, dest_ptr));
|
619
|
+
char *entity_src = RSTRING_PTR(entity);
|
620
|
+
long entity_len = RSTRING_LEN(entity); // should always be 8 characters (8 bytes)
|
621
|
+
memcpy(dest, entity_src, entity_len);
|
622
|
+
dest += entity_len;
|
623
|
+
src += width;
|
624
|
+
non_space = dest;
|
625
|
+
continue;
|
626
|
+
}
|
627
|
+
if (*src != ' ')
|
628
|
+
non_space = dest;
|
629
|
+
src++;
|
630
|
+
}
|
631
|
+
|
632
|
+
// trim trailing space if necessary
|
633
|
+
if (trim == Qtrue && non_space > dest_ptr && dest != non_space)
|
634
|
+
len = non_space - dest_ptr;
|
635
|
+
else
|
636
|
+
len = dest - dest_ptr;
|
637
|
+
VALUE out = rb_str_new(dest_ptr, len);
|
638
|
+
free(dest_ptr);
|
639
|
+
return out;
|
640
|
+
}
|
641
|
+
|
642
|
+
// Method-style entry point (takes a receiver, which is ignored): sanitizes the passed string with trimming enabled.
VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string)
{
    VALUE sanitized = _Wikitext_parser_sanitize_link_target(string, Qtrue);
    return sanitized;
}
|
646
|
+
|
647
|
+
// encodes the input string according to RFCs 2396 and 2718
|
648
|
+
// leading and trailing whitespace trimmed
|
649
|
+
// note that the first character of the target link is not case-sensitive
|
650
|
+
// (this is a recommended application-level constraint; it is not imposed at this level)
|
651
|
+
// this is to allow links like:
|
652
|
+
// ...the [[foo]] is...
|
653
|
+
// to be equivalent to:
|
654
|
+
// thing. [[Foo]] was...
|
655
|
+
// this is also where we check treat_slash_as_special is true and act accordingly
|
656
|
+
// basically any link target matching /\A[a-z]+\/\d+\z/ is flagged as special
|
657
|
+
inline static void _Wikitext_parser_encode_link_target(parser_t *parser)
|
658
|
+
{
|
659
|
+
VALUE in = StringValue(parser->link_target);
|
660
|
+
char *input = RSTRING_PTR(in);
|
661
|
+
char *start = input; // remember this so we can check if we're at the start
|
662
|
+
long len = RSTRING_LEN(in);
|
663
|
+
if (!(len > 0))
|
664
|
+
return;
|
665
|
+
char *end = input + len;
|
666
|
+
static char hex[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
|
667
|
+
|
668
|
+
// this potential shortcut requires an (admittedly cheap) prescan, so only do it when treat_slash_as_special is true
|
669
|
+
parser->special_link = Qfalse;
|
670
|
+
if (parser->treat_slash_as_special == Qtrue)
|
671
|
+
{
|
672
|
+
char *c = input; // \A
|
673
|
+
while (c < end && *c >= 'a' && *c <= 'z') // [a-z]
|
674
|
+
c++; // +
|
675
|
+
if (c > start && c < end && *c++ == '/') // \/
|
676
|
+
{
|
677
|
+
while (c < end && *c >= '0' && *c <= '9') // \d
|
678
|
+
{
|
679
|
+
c++; // +
|
680
|
+
if (c == end) // \z
|
681
|
+
{
|
682
|
+
// matches /\A[a-z]+\/\d+\z/ so no transformation required
|
683
|
+
parser->special_link = Qtrue;
|
684
|
+
return;
|
685
|
+
}
|
686
|
+
}
|
687
|
+
}
|
688
|
+
}
|
689
|
+
|
690
|
+
// to avoid most reallocations start with a destination buffer twice the size of the source
|
691
|
+
// this handles the most common case (where most chars are in the ASCII range and don't require more storage, but there are
|
692
|
+
// often quite a few spaces, which are encoded as "%20" and occupy 3 bytes)
|
693
|
+
// the worst case is where _every_ byte must be written out using 3 bytes
|
694
|
+
long dest_len = len * 2;
|
695
|
+
char *dest = ALLOC_N(char, dest_len);
|
696
|
+
char *dest_ptr = dest; // hang on to this so we can pass it to free() later
|
697
|
+
char *non_space = dest; // remember last non-space character output
|
698
|
+
for (; input < end; input++)
|
699
|
+
{
|
700
|
+
if ((dest + 3) > (dest_ptr + dest_len)) // worst case: a single character may grow to 3 characters once encoded
|
701
|
+
{
|
702
|
+
// outgrowing buffer, must reallocate
|
703
|
+
char *old_dest = dest;
|
704
|
+
char *old_dest_ptr = dest_ptr;
|
705
|
+
dest_len += len;
|
706
|
+
dest = realloc(dest_ptr, dest_len);
|
707
|
+
if (dest == NULL)
|
708
|
+
{
|
709
|
+
// would have used reallocf, but this has to run on Linux too, not just Darwin
|
710
|
+
free(dest_ptr);
|
711
|
+
rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
|
712
|
+
}
|
713
|
+
dest_ptr = dest;
|
714
|
+
dest = dest_ptr + (old_dest - old_dest_ptr);
|
715
|
+
non_space = dest_ptr + (non_space - old_dest_ptr);
|
716
|
+
}
|
717
|
+
|
718
|
+
// pass through unreserved characters
|
719
|
+
if (((*input >= 'a') && (*input <= 'z')) ||
|
720
|
+
((*input >= 'A') && (*input <= 'Z')) ||
|
721
|
+
((*input >= '0') && (*input <= '9')) ||
|
722
|
+
(*input == '-') ||
|
723
|
+
(*input == '_') ||
|
724
|
+
(*input == '.') ||
|
725
|
+
(*input == '~'))
|
726
|
+
{
|
727
|
+
*dest++ = *input;
|
728
|
+
non_space = dest;
|
729
|
+
}
|
730
|
+
else if (*input == ' ' && input == start)
|
731
|
+
start++; // we eat leading space
|
732
|
+
else // everything else gets URL-encoded
|
733
|
+
{
|
734
|
+
*dest++ = '%';
|
735
|
+
*dest++ = hex[(unsigned char)(*input) / 16]; // left
|
736
|
+
*dest++ = hex[(unsigned char)(*input) % 16]; // right
|
737
|
+
if (*input != ' ')
|
738
|
+
non_space = dest;
|
739
|
+
}
|
740
|
+
}
|
741
|
+
|
742
|
+
// trim trailing space if necessary
|
743
|
+
if (non_space > dest_ptr && dest - 1 != non_space)
|
744
|
+
dest_len = non_space - dest_ptr;
|
745
|
+
else
|
746
|
+
dest_len = dest - dest_ptr;
|
747
|
+
parser->link_target = rb_str_new(dest_ptr, dest_len);
|
748
|
+
free(dest_ptr);
|
749
|
+
}
|
750
|
+
|
751
|
+
// Method-style entry point (the receiver is ignored): URL-encodes the passed link target with special treatment of
// slashes switched off, and returns the encoded target.
VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in)
{
    // only the fields set here are initialized in this throwaway parser struct
    parser_t scratch;
    parser_t *p                 = &scratch;
    p->treat_slash_as_special   = Qfalse;
    p->link_target              = in;
    _Wikitext_parser_encode_link_target(p);
    return p->link_target;
}
|
759
|
+
|
760
|
+
// this method exposed for testing only
|
761
|
+
VALUE Wikitext_parser_encode_special_link_target(VALUE self, VALUE in)
|
762
|
+
{
|
763
|
+
parser_t parser;
|
764
|
+
parser.link_target = in;
|
765
|
+
parser.treat_slash_as_special = Qtrue;
|
766
|
+
_Wikitext_parser_encode_link_target(&parser);
|
767
|
+
return parser.link_target;
|
768
|
+
}
|
769
|
+
|
770
|
+
// not sure whether these rollback functions should be inline: could refactor them into a single non-inlined function
|
771
|
+
// Abandons an internal link that could not be completed: re-emits the literal link-start markup, followed by whatever
// target, separator and link text had been collected so far, and then clears the capture/link state.
// Does nothing unless IN(LINK_START) holds (presumably: LINK_START is on the scope stack -- confirm against the IN macro).
inline void _Wikitext_rollback_failed_link(parser_t *parser)
{
    if (IN(LINK_START))
    {
        // must be sampled before popping, as the pop below may remove the separator from the scope
        int saw_separator = IN(SEPARATOR);
        _Wikitext_pop_from_stack_up_to(parser, Qnil, LINK_START, Qtrue);
        rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);

        VALUE target = parser->link_target;
        if (!NIL_P(target))
        {
            // the target is sanitized but not trimmed
            rb_str_append(parser->output, _Wikitext_parser_sanitize_link_target(target, Qfalse));
            if (saw_separator)
            {
                rb_str_cat(parser->output, separator, sizeof(separator) - 1);
                if (!NIL_P(parser->link_text))
                    rb_str_append(parser->output, parser->link_text);
            }
        }

        // forget everything about the abandoned link
        parser->capture     = Qnil;
        parser->link_target = Qnil;
        parser->link_text   = Qnil;
    }
}
|
793
|
+
|
794
|
+
// Abandons an external link that could not be completed: re-emits the literal external-link-start markup, followed by
// the target (turned into a hyperlink when autolinking is on), and the space and link text if they had been seen, and
// then clears the capture/link state.
// Does nothing unless IN(EXT_LINK_START) holds (presumably: EXT_LINK_START is on the scope stack -- confirm against the
// IN macro).
inline void _Wikitext_rollback_failed_external_link(parser_t *parser)
{
    if (IN(EXT_LINK_START))
    {
        // must be sampled before popping, as the pop below may remove the space from the scope
        int saw_space = IN(SPACE);
        _Wikitext_pop_from_stack_up_to(parser, Qnil, EXT_LINK_START, Qtrue);
        rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);

        if (!NIL_P(parser->link_target))
        {
            if (parser->autolink == Qtrue)
            {
                // the target doubles as the link text of the generated hyperlink
                VALUE target        = parser->link_target;
                parser->link_target = _Wikitext_hyperlink(Qnil, target, target, parser->external_link_class);
            }
            rb_str_append(parser->output, parser->link_target);
            if (saw_space)
            {
                rb_str_cat(parser->output, space, sizeof(space) - 1);
                if (!NIL_P(parser->link_text))
                    rb_str_append(parser->output, parser->link_text);
            }
        }

        // forget everything about the abandoned link
        parser->capture     = Qnil;
        parser->link_target = Qnil;
        parser->link_text   = Qnil;
    }
}
|
817
|
+
|
818
|
+
// Initializer: gives a new parser instance its default settings by assigning instance variables, and returns self.
VALUE Wikitext_parser_initialize(VALUE self)
{
    // no need to call super here; rb_call_super()

    // string-valued defaults; each instance gets its own freshly created string objects
    static const struct
    {
        const char *ivar;
        const char *value;
    } string_defaults[] = {
        { "@line_ending",           "\n"        },
        { "@external_link_class",   "external"  },
        { "@mailto_class",          "mailto"    },
        { "@internal_link_prefix",  "/wiki/"    }
    };

    // instance variables are assigned in the same order as before: autolink, the strings, treat_slash_as_special
    rb_iv_set(self, "@autolink", Qtrue);
    for (unsigned long n = 0; n < sizeof(string_defaults) / sizeof(string_defaults[0]); n++)
        rb_iv_set(self, string_defaults[n].ivar, rb_str_new2(string_defaults[n].value));
    rb_iv_set(self, "@treat_slash_as_special", Qtrue);
    return self;
}
|
829
|
+
|
830
|
+
// Profiling aid: parses the passed string 100,000 times in a row, discarding the output of every run.
// Always returns nil (previously control fell off the end of this non-void function, so the caller received an
// indeterminate VALUE).
VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string)
{
    for (int i = 0; i < 100000; i++)
        Wikitext_parser_parse(1, &string, self);
    return Qnil;
}
|
835
|
+
|
836
|
+
VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self)
|
837
|
+
{
|
838
|
+
// process arguments
|
839
|
+
VALUE string, options;
|
840
|
+
if (rb_scan_args(argc, argv, "11", &string, &options) == 1) // 1 mandatory argument, 1 optional argument
|
841
|
+
options = Qnil;
|
842
|
+
if (NIL_P(string))
|
843
|
+
return Qnil;
|
844
|
+
string = StringValue(string);
|
845
|
+
|
846
|
+
// process options hash
|
847
|
+
int base_indent = 0;
|
848
|
+
VALUE indent = Qnil;
|
849
|
+
if (!NIL_P(options) && TYPE(options) == T_HASH)
|
850
|
+
{
|
851
|
+
indent = rb_hash_aref(options, ID2SYM(rb_intern("indent")));
|
852
|
+
base_indent = NUM2INT(indent);
|
853
|
+
if (base_indent < 0)
|
854
|
+
base_indent = 0;
|
855
|
+
}
|
856
|
+
|
857
|
+
// set up scanner
|
858
|
+
char *p = RSTRING_PTR(string);
|
859
|
+
long len = RSTRING_LEN(string);
|
860
|
+
char *pe = p + len;
|
861
|
+
|
862
|
+
// access these once per parse
|
863
|
+
VALUE line_ending = rb_iv_get(self, "@line_ending");
|
864
|
+
line_ending = StringValue(line_ending);
|
865
|
+
VALUE link_class = rb_iv_get(self, "@external_link_class");
|
866
|
+
link_class = NIL_P(link_class) ? Qnil : StringValue(link_class);
|
867
|
+
VALUE mailto_class = rb_iv_get(self, "@mailto_class");
|
868
|
+
mailto_class = NIL_P(mailto_class) ? Qnil : StringValue(mailto_class);
|
869
|
+
VALUE prefix = rb_iv_get(self, "@internal_link_prefix");
|
870
|
+
|
871
|
+
// set up parser struct to make passing parameters a little easier
|
872
|
+
// eventually this will encapsulate most or all of the variables above
|
873
|
+
parser_t _parser;
|
874
|
+
parser_t *parser = &_parser;
|
875
|
+
parser->output = rb_str_new2("");
|
876
|
+
parser->capture = Qnil;
|
877
|
+
parser->link_target = Qnil;
|
878
|
+
parser->link_text = Qnil;
|
879
|
+
parser->external_link_class = link_class;
|
880
|
+
parser->scope = ary_new();
|
881
|
+
parser->line = ary_new();
|
882
|
+
parser->line_buffer = ary_new();
|
883
|
+
parser->pending_crlf = Qfalse;
|
884
|
+
parser->autolink = rb_iv_get(self, "@autolink");
|
885
|
+
parser->treat_slash_as_special = rb_iv_get(self, "@treat_slash_as_special");
|
886
|
+
parser->special_link = Qfalse;
|
887
|
+
parser->line_ending = str_new_from_string(line_ending);
|
888
|
+
parser->base_indent = base_indent;
|
889
|
+
parser->current_indent = 0;
|
890
|
+
parser->tabulation = NULL;
|
891
|
+
|
892
|
+
token_t _token;
|
893
|
+
_token.type = NO_TOKEN;
|
894
|
+
token_t *token = NULL;
|
895
|
+
do
|
896
|
+
{
|
897
|
+
// note that whenever we grab a token we push it into the line buffer
|
898
|
+
// this provides us with context-sensitive "memory" of what's been seen so far on this line
|
899
|
+
#define NEXT_TOKEN() token = &_token, next_token(token, token, NULL, pe), ary_push(parser->line_buffer, token->type)
|
900
|
+
|
901
|
+
// check to see if we have a token hanging around from a previous iteration of this loop
|
902
|
+
if (token == NULL)
|
903
|
+
{
|
904
|
+
if (_token.type == NO_TOKEN)
|
905
|
+
{
|
906
|
+
// first time here (haven't started scanning yet)
|
907
|
+
token = &_token;
|
908
|
+
next_token(token, NULL, p, pe);
|
909
|
+
ary_push(parser->line_buffer, token->type);
|
910
|
+
}
|
911
|
+
else
|
912
|
+
// already scanning
|
913
|
+
NEXT_TOKEN();
|
914
|
+
}
|
915
|
+
int type = token->type;
|
916
|
+
|
917
|
+
// many restrictions depend on what is at the top of the stack
|
918
|
+
int top = ary_entry(parser->scope, -1);
|
919
|
+
|
920
|
+
// can't declare new variables inside a switch statement, so predeclare them here
|
921
|
+
long remove_strong = -1;
|
922
|
+
long remove_em = -1;
|
923
|
+
|
924
|
+
// general purpose counters and flags
|
925
|
+
long i = 0;
|
926
|
+
long j = 0;
|
927
|
+
long k = 0;
|
928
|
+
|
929
|
+
// The following giant switch statement contains cases for all the possible token types.
|
930
|
+
// In the most basic sense we are emitting the HTML that corresponds to each token,
|
931
|
+
// but some tokens require context information in order to decide what to output.
|
932
|
+
// For example, does the STRONG token (''') translate to <strong> or </strong>?
|
933
|
+
// So when looking at any given token we have three state-maintaining variables which gives us a notion of "where we are":
|
934
|
+
//
|
935
|
+
// - the "scope" stack (indicates what HTML DOM structures we are currently nested inside, similar to a CSS selector)
|
936
|
+
// - the line buffer (records tokens seen so far on the current line)
|
937
|
+
// - the line "scope" stack (indicates what the scope should be based only on what is visible on the line so far)
|
938
|
+
//
|
939
|
+
// Although this is fairly complicated, there is one key simplifying factor:
|
940
|
+
// The translator continuously performs auto-correction, and this means that we always have a guarantee that the
|
941
|
+
// scope stack (up to the current token) is valid; our translator can take this as a given.
|
942
|
+
// Auto-correction basically consists of inserting missing tokens (preventing subsequent HTML from being messed up),
|
943
|
+
// or converting illegal (unexpected) tokens to their plain text equivalents (providing visual feedback to Wikitext author).
|
944
|
+
switch (type)
|
945
|
+
{
|
946
|
+
case PRE:
|
947
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
948
|
+
{
|
949
|
+
rb_str_cat(parser->output, space, sizeof(space) - 1);
|
950
|
+
break;
|
951
|
+
}
|
952
|
+
else if (IN(BLOCKQUOTE_START))
|
953
|
+
{
|
954
|
+
// this kind of nesting not allowed (to avoid user confusion)
|
955
|
+
_Wikitext_pop_excess_elements(parser);
|
956
|
+
_Wikitext_start_para_if_necessary(parser);
|
957
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
958
|
+
rb_str_cat(i, space, sizeof(space) - 1);
|
959
|
+
break;
|
960
|
+
}
|
961
|
+
|
962
|
+
// count number of BLOCKQUOTE tokens in line buffer and in scope stack
|
963
|
+
ary_push(parser->line, PRE);
|
964
|
+
i = ary_count(parser->line, BLOCKQUOTE);
|
965
|
+
j = ary_count(parser->scope, BLOCKQUOTE);
|
966
|
+
if (i < j)
|
967
|
+
{
|
968
|
+
// must pop (reduce nesting level)
|
969
|
+
for (i = j - i; i > 0; i--)
|
970
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
|
971
|
+
}
|
972
|
+
|
973
|
+
if (!IN(PRE))
|
974
|
+
{
|
975
|
+
parser->pending_crlf = Qfalse;
|
976
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
977
|
+
_Wikitext_indent(parser);
|
978
|
+
rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
|
979
|
+
ary_push(parser->scope, PRE);
|
980
|
+
}
|
981
|
+
break;
|
982
|
+
|
983
|
+
case PRE_START:
|
984
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
985
|
+
rb_str_cat(parser->output, escaped_pre_start, sizeof(escaped_pre_start) - 1);
|
986
|
+
else if (IN(BLOCKQUOTE_START))
|
987
|
+
{
|
988
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
989
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
990
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
991
|
+
_Wikitext_indent(parser);
|
992
|
+
rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
|
993
|
+
ary_push(parser->scope, PRE_START);
|
994
|
+
ary_push(parser->line, PRE_START);
|
995
|
+
}
|
996
|
+
else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
|
997
|
+
{
|
998
|
+
// would be nice to eliminate the repetition here but it's probably the clearest way
|
999
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
1000
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1001
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
|
1002
|
+
_Wikitext_indent(parser);
|
1003
|
+
rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
|
1004
|
+
ary_push(parser->scope, PRE_START);
|
1005
|
+
ary_push(parser->line, PRE_START);
|
1006
|
+
}
|
1007
|
+
else
|
1008
|
+
{
|
1009
|
+
// everywhere else, PRE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
|
1010
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1011
|
+
_Wikitext_pop_excess_elements(parser);
|
1012
|
+
_Wikitext_start_para_if_necessary(parser);
|
1013
|
+
rb_str_cat(i, escaped_pre_start, sizeof(escaped_pre_start) - 1);
|
1014
|
+
}
|
1015
|
+
break;
|
1016
|
+
|
1017
|
+
case PRE_END:
|
1018
|
+
if (IN(NO_WIKI_START) || IN(PRE))
|
1019
|
+
rb_str_cat(parser->output, escaped_pre_end, sizeof(escaped_pre_end) - 1);
|
1020
|
+
else
|
1021
|
+
{
|
1022
|
+
if (IN(PRE_START))
|
1023
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, PRE_START, Qtrue);
|
1024
|
+
else
|
1025
|
+
{
|
1026
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1027
|
+
_Wikitext_pop_excess_elements(parser);
|
1028
|
+
_Wikitext_start_para_if_necessary(parser);
|
1029
|
+
rb_str_cat(i, escaped_pre_end, sizeof(escaped_pre_end) - 1);
|
1030
|
+
}
|
1031
|
+
}
|
1032
|
+
break;
|
1033
|
+
|
1034
|
+
case BLOCKQUOTE:
|
1035
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
1036
|
+
// no need to check for <pre>; can never appear inside it
|
1037
|
+
rb_str_cat(parser->output, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit ">" or "> "
|
1038
|
+
else if (IN(BLOCKQUOTE_START))
|
1039
|
+
{
|
1040
|
+
// this kind of nesting not allowed (to avoid user confusion)
|
1041
|
+
_Wikitext_pop_excess_elements(parser);
|
1042
|
+
_Wikitext_start_para_if_necessary(parser);
|
1043
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1044
|
+
rb_str_cat(i, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit ">" or "> "
|
1045
|
+
break;
|
1046
|
+
}
|
1047
|
+
else
|
1048
|
+
{
|
1049
|
+
ary_push(parser->line, BLOCKQUOTE);
|
1050
|
+
|
1051
|
+
// count number of BLOCKQUOTE tokens in line buffer and in scope stack
|
1052
|
+
i = ary_count(parser->line, BLOCKQUOTE);
|
1053
|
+
j = ary_count(parser->scope, BLOCKQUOTE);
|
1054
|
+
|
1055
|
+
// given that BLOCKQUOTE tokens can be nested, peek ahead and see if there are any more which might affect the decision to push or pop
|
1056
|
+
while (NEXT_TOKEN(), (token->type == BLOCKQUOTE))
|
1057
|
+
{
|
1058
|
+
ary_push(parser->line, BLOCKQUOTE);
|
1059
|
+
i++;
|
1060
|
+
}
|
1061
|
+
|
1062
|
+
// now decide whether to push, pop or do nothing
|
1063
|
+
if (i > j)
|
1064
|
+
{
|
1065
|
+
// must push (increase nesting level)
|
1066
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
1067
|
+
for (i = i - j; i > 0; i--)
|
1068
|
+
{
|
1069
|
+
_Wikitext_indent(parser);
|
1070
|
+
rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
|
1071
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1072
|
+
ary_push(parser->scope, BLOCKQUOTE);
|
1073
|
+
}
|
1074
|
+
}
|
1075
|
+
else if (i < j)
|
1076
|
+
{
|
1077
|
+
// must pop (reduce nesting level)
|
1078
|
+
for (i = j - i; i > 0; i--)
|
1079
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
|
1080
|
+
}
|
1081
|
+
|
1082
|
+
// jump to top of the loop to process token we scanned during lookahead
|
1083
|
+
continue;
|
1084
|
+
}
|
1085
|
+
break;
|
1086
|
+
|
1087
|
+
case BLOCKQUOTE_START:
|
1088
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1089
|
+
rb_str_cat(parser->output, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
|
1090
|
+
else if (IN(BLOCKQUOTE_START))
|
1091
|
+
{
|
1092
|
+
// nesting is fine here
|
1093
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
1094
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1095
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
1096
|
+
_Wikitext_indent(parser);
|
1097
|
+
rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
|
1098
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1099
|
+
ary_push(parser->scope, BLOCKQUOTE_START);
|
1100
|
+
ary_push(parser->line, BLOCKQUOTE_START);
|
1101
|
+
}
|
1102
|
+
else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
|
1103
|
+
{
|
1104
|
+
// would be nice to eliminate the repetition here but it's probably the clearest way
|
1105
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
1106
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1107
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
|
1108
|
+
_Wikitext_indent(parser);
|
1109
|
+
rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
|
1110
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1111
|
+
ary_push(parser->scope, BLOCKQUOTE_START);
|
1112
|
+
ary_push(parser->line, BLOCKQUOTE_START);
|
1113
|
+
}
|
1114
|
+
else
|
1115
|
+
{
|
1116
|
+
// everywhere else, BLOCKQUOTE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
|
1117
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1118
|
+
_Wikitext_pop_excess_elements(parser);
|
1119
|
+
_Wikitext_start_para_if_necessary(parser);
|
1120
|
+
rb_str_cat(i, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
|
1121
|
+
}
|
1122
|
+
break;
|
1123
|
+
|
1124
|
+
case BLOCKQUOTE_END:
|
1125
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1126
|
+
rb_str_cat(parser->output, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
|
1127
|
+
else
|
1128
|
+
{
|
1129
|
+
if (IN(BLOCKQUOTE_START))
|
1130
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, BLOCKQUOTE_START, Qtrue);
|
1131
|
+
else
|
1132
|
+
{
|
1133
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1134
|
+
_Wikitext_pop_excess_elements(parser);
|
1135
|
+
_Wikitext_start_para_if_necessary(parser);
|
1136
|
+
rb_str_cat(i, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
|
1137
|
+
}
|
1138
|
+
}
|
1139
|
+
break;
|
1140
|
+
|
1141
|
+
case NO_WIKI_START:
|
1142
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1143
|
+
rb_str_cat(parser->output, escaped_no_wiki_start, sizeof(escaped_no_wiki_start) - 1);
|
1144
|
+
else
|
1145
|
+
{
|
1146
|
+
_Wikitext_pop_excess_elements(parser);
|
1147
|
+
_Wikitext_start_para_if_necessary(parser);
|
1148
|
+
ary_push(parser->scope, NO_WIKI_START);
|
1149
|
+
ary_push(parser->line, NO_WIKI_START);
|
1150
|
+
}
|
1151
|
+
break;
|
1152
|
+
|
1153
|
+
case NO_WIKI_END:
|
1154
|
+
if (IN(NO_WIKI_START))
|
1155
|
+
// <nowiki> should always only ever be the last item in the stack, but use the helper routine just in case
|
1156
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, NO_WIKI_START, Qtrue);
|
1157
|
+
else
|
1158
|
+
{
|
1159
|
+
_Wikitext_pop_excess_elements(parser);
|
1160
|
+
_Wikitext_start_para_if_necessary(parser);
|
1161
|
+
rb_str_cat(parser->output, escaped_no_wiki_end, sizeof(escaped_no_wiki_end) - 1);
|
1162
|
+
}
|
1163
|
+
break;
|
1164
|
+
|
1165
|
+
case STRONG_EM:
|
1166
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1167
|
+
{
|
1168
|
+
rb_str_cat(parser->output, literal_strong_em, sizeof(literal_strong_em) - 1);
|
1169
|
+
break;
|
1170
|
+
}
|
1171
|
+
|
1172
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1173
|
+
_Wikitext_pop_excess_elements(parser);
|
1174
|
+
|
1175
|
+
// if you've seen STRONG/STRONG_START or EM/EM_START, must close them in the reverse order that you saw them!
|
1176
|
+
// otherwise, must open them
|
1177
|
+
remove_strong = -1;
|
1178
|
+
remove_em = -1;
|
1179
|
+
j = parser->scope->count;
|
1180
|
+
for (j = j - 1; j >= 0; j--)
|
1181
|
+
{
|
1182
|
+
int val = ary_entry(parser->scope, j);
|
1183
|
+
if (val == STRONG || val == STRONG_START)
|
1184
|
+
{
|
1185
|
+
rb_str_cat(i, strong_end, sizeof(strong_end) - 1);
|
1186
|
+
remove_strong = j;
|
1187
|
+
}
|
1188
|
+
else if (val == EM || val == EM_START)
|
1189
|
+
{
|
1190
|
+
rb_str_cat(i, em_end, sizeof(em_end) - 1);
|
1191
|
+
remove_em = j;
|
1192
|
+
}
|
1193
|
+
}
|
1194
|
+
|
1195
|
+
if (remove_strong > remove_em) // must remove strong first
|
1196
|
+
{
|
1197
|
+
ary_pop(parser->scope);
|
1198
|
+
if (remove_em > -1)
|
1199
|
+
ary_pop(parser->scope);
|
1200
|
+
else // there was no em to remove!, so consider this an opening em tag
|
1201
|
+
{
|
1202
|
+
rb_str_cat(i, em_start, sizeof(em_start) - 1);
|
1203
|
+
ary_push(parser->scope, EM);
|
1204
|
+
ary_push(parser->line, EM);
|
1205
|
+
}
|
1206
|
+
}
|
1207
|
+
else if (remove_em > remove_strong) // must remove em first
|
1208
|
+
{
|
1209
|
+
ary_pop(parser->scope);
|
1210
|
+
if (remove_strong > -1)
|
1211
|
+
ary_pop(parser->scope);
|
1212
|
+
else // there was no strong to remove!, so consider this an opening strong tag
|
1213
|
+
{
|
1214
|
+
rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
|
1215
|
+
ary_push(parser->scope, STRONG);
|
1216
|
+
ary_push(parser->line, STRONG);
|
1217
|
+
}
|
1218
|
+
}
|
1219
|
+
else // no strong or em to remove, so this must be a new opening of both
|
1220
|
+
{
|
1221
|
+
_Wikitext_start_para_if_necessary(parser);
|
1222
|
+
rb_str_cat(i, strong_em_start, sizeof(strong_em_start) - 1);
|
1223
|
+
ary_push(parser->scope, STRONG);
|
1224
|
+
ary_push(parser->line, STRONG);
|
1225
|
+
ary_push(parser->scope, EM);
|
1226
|
+
ary_push(parser->line, EM);
|
1227
|
+
}
|
1228
|
+
break;
|
1229
|
+
|
1230
|
+
case STRONG:
|
1231
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1232
|
+
rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
|
1233
|
+
else
|
1234
|
+
{
|
1235
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1236
|
+
if (IN(STRONG_START))
|
1237
|
+
// already in span started with <strong>, no choice but to emit this literally
|
1238
|
+
rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
|
1239
|
+
else if (IN(STRONG))
|
1240
|
+
// STRONG already seen, this is a closing tag
|
1241
|
+
_Wikitext_pop_from_stack_up_to(parser, i, STRONG, Qtrue);
|
1242
|
+
else
|
1243
|
+
{
|
1244
|
+
// this is a new opening
|
1245
|
+
_Wikitext_pop_excess_elements(parser);
|
1246
|
+
_Wikitext_start_para_if_necessary(parser);
|
1247
|
+
rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
|
1248
|
+
ary_push(parser->scope, STRONG);
|
1249
|
+
ary_push(parser->line, STRONG);
|
1250
|
+
}
|
1251
|
+
}
|
1252
|
+
break;
|
1253
|
+
|
1254
|
+
case STRONG_START:
|
1255
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1256
|
+
rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
|
1257
|
+
else
|
1258
|
+
{
|
1259
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1260
|
+
if (IN(STRONG_START) || IN(STRONG))
|
1261
|
+
rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
|
1262
|
+
else
|
1263
|
+
{
|
1264
|
+
_Wikitext_pop_excess_elements(parser);
|
1265
|
+
_Wikitext_start_para_if_necessary(parser);
|
1266
|
+
rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
|
1267
|
+
ary_push(parser->scope, STRONG_START);
|
1268
|
+
ary_push(parser->line, STRONG_START);
|
1269
|
+
}
|
1270
|
+
}
|
1271
|
+
break;
|
1272
|
+
|
1273
|
+
case STRONG_END:
|
1274
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1275
|
+
rb_str_cat(parser->output, escaped_strong_end, sizeof(escaped_strong_end) - 1);
|
1276
|
+
else
|
1277
|
+
{
|
1278
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1279
|
+
if (IN(STRONG_START))
|
1280
|
+
_Wikitext_pop_from_stack_up_to(parser, i, STRONG_START, Qtrue);
|
1281
|
+
else
|
1282
|
+
{
|
1283
|
+
// no STRONG_START in scope, so must interpret the STRONG_END without any special meaning
|
1284
|
+
_Wikitext_pop_excess_elements(parser);
|
1285
|
+
_Wikitext_start_para_if_necessary(parser);
|
1286
|
+
rb_str_cat(i, escaped_strong_end, sizeof(escaped_strong_end) - 1);
|
1287
|
+
}
|
1288
|
+
}
|
1289
|
+
break;
|
1290
|
+
|
1291
|
+
case EM:
|
1292
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1293
|
+
rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
|
1294
|
+
else
|
1295
|
+
{
|
1296
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1297
|
+
if (IN(EM_START))
|
1298
|
+
// already in span started with <em>, no choice but to emit this literally
|
1299
|
+
rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
|
1300
|
+
else if (IN(EM))
|
1301
|
+
// EM already seen, this is a closing tag
|
1302
|
+
_Wikitext_pop_from_stack_up_to(parser, i, EM, Qtrue);
|
1303
|
+
else
|
1304
|
+
{
|
1305
|
+
// this is a new opening
|
1306
|
+
_Wikitext_pop_excess_elements(parser);
|
1307
|
+
_Wikitext_start_para_if_necessary(parser);
|
1308
|
+
rb_str_cat(i, em_start, sizeof(em_start) - 1);
|
1309
|
+
ary_push(parser->scope, EM);
|
1310
|
+
ary_push(parser->line, EM);
|
1311
|
+
}
|
1312
|
+
}
|
1313
|
+
break;
|
1314
|
+
|
1315
|
+
case EM_START:
|
1316
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1317
|
+
rb_str_cat(parser->output, escaped_em_start, sizeof(escaped_em_start) - 1);
|
1318
|
+
else
|
1319
|
+
{
|
1320
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1321
|
+
if (IN(EM_START) || IN(EM))
|
1322
|
+
rb_str_cat(i, escaped_em_start, sizeof(escaped_em_start) - 1);
|
1323
|
+
else
|
1324
|
+
{
|
1325
|
+
_Wikitext_pop_excess_elements(parser);
|
1326
|
+
_Wikitext_start_para_if_necessary(parser);
|
1327
|
+
rb_str_cat(i, em_start, sizeof(em_start) - 1);
|
1328
|
+
ary_push(parser->scope, EM_START);
|
1329
|
+
ary_push(parser->line, EM_START);
|
1330
|
+
}
|
1331
|
+
}
|
1332
|
+
break;
|
1333
|
+
|
1334
|
+
case EM_END:
|
1335
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1336
|
+
rb_str_cat(parser->output, escaped_em_end, sizeof(escaped_em_end) - 1);
|
1337
|
+
else
|
1338
|
+
{
|
1339
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1340
|
+
if (IN(EM_START))
|
1341
|
+
_Wikitext_pop_from_stack_up_to(parser, i, EM_START, Qtrue);
|
1342
|
+
else
|
1343
|
+
{
|
1344
|
+
// no EM_START in scope, so must interpret the EM_END without any special meaning
|
1345
|
+
_Wikitext_pop_excess_elements(parser);
|
1346
|
+
_Wikitext_start_para_if_necessary(parser);
|
1347
|
+
rb_str_cat(i, escaped_em_end, sizeof(escaped_em_end) - 1);
|
1348
|
+
}
|
1349
|
+
}
|
1350
|
+
break;
|
1351
|
+
|
1352
|
+
case TT:
|
1353
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1354
|
+
rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
|
1355
|
+
else
|
1356
|
+
{
|
1357
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1358
|
+
if (IN(TT_START))
|
1359
|
+
// already in span started with <tt>, no choice but to emit this literally
|
1360
|
+
rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
|
1361
|
+
else if (IN(TT))
|
1362
|
+
// TT (`) already seen, this is a closing tag
|
1363
|
+
_Wikitext_pop_from_stack_up_to(parser, i, TT, Qtrue);
|
1364
|
+
else
|
1365
|
+
{
|
1366
|
+
// this is a new opening
|
1367
|
+
_Wikitext_pop_excess_elements(parser);
|
1368
|
+
_Wikitext_start_para_if_necessary(parser);
|
1369
|
+
rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
|
1370
|
+
ary_push(parser->scope, TT);
|
1371
|
+
ary_push(parser->line, TT);
|
1372
|
+
}
|
1373
|
+
}
|
1374
|
+
break;
|
1375
|
+
|
1376
|
+
case TT_START:
|
1377
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1378
|
+
rb_str_cat(parser->output, escaped_tt_start, sizeof(escaped_tt_start) - 1);
|
1379
|
+
else
|
1380
|
+
{
|
1381
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1382
|
+
if (IN(TT_START) || IN(TT))
|
1383
|
+
rb_str_cat(i, escaped_tt_start, sizeof(escaped_tt_start) - 1);
|
1384
|
+
else
|
1385
|
+
{
|
1386
|
+
_Wikitext_pop_excess_elements(parser);
|
1387
|
+
_Wikitext_start_para_if_necessary(parser);
|
1388
|
+
rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
|
1389
|
+
ary_push(parser->scope, TT_START);
|
1390
|
+
ary_push(parser->line, TT_START);
|
1391
|
+
}
|
1392
|
+
}
|
1393
|
+
break;
|
1394
|
+
|
1395
|
+
case TT_END:
|
1396
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1397
|
+
rb_str_cat(parser->output, escaped_tt_end, sizeof(escaped_tt_end) - 1);
|
1398
|
+
else
|
1399
|
+
{
|
1400
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1401
|
+
if (IN(TT_START))
|
1402
|
+
_Wikitext_pop_from_stack_up_to(parser, i, TT_START, Qtrue);
|
1403
|
+
else
|
1404
|
+
{
|
1405
|
+
// no TT_START in scope, so must interpret the TT_END without any special meaning
|
1406
|
+
_Wikitext_pop_excess_elements(parser);
|
1407
|
+
_Wikitext_start_para_if_necessary(parser);
|
1408
|
+
rb_str_cat(i, escaped_tt_end, sizeof(escaped_tt_end) - 1);
|
1409
|
+
}
|
1410
|
+
}
|
1411
|
+
break;
|
1412
|
+
|
1413
|
+
case OL:
|
1414
|
+
case UL:
|
1415
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
1416
|
+
{
|
1417
|
+
// no need to check for PRE; can never appear inside it
|
1418
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1419
|
+
break;
|
1420
|
+
}
|
1421
|
+
|
1422
|
+
// count number of tokens in line and scope stacks
|
1423
|
+
int bq_count = ary_count(parser->scope, BLOCKQUOTE_START);
|
1424
|
+
i = parser->line->count - ary_count(parser->line, BLOCKQUOTE_START);
|
1425
|
+
j = parser->scope->count - bq_count;
|
1426
|
+
k = i;
|
1427
|
+
|
1428
|
+
// list tokens can be nested so look ahead for any more which might affect the decision to push or pop
|
1429
|
+
for (;;)
|
1430
|
+
{
|
1431
|
+
type = token->type;
|
1432
|
+
if (type == OL || type == UL)
|
1433
|
+
{
|
1434
|
+
token = NULL;
|
1435
|
+
if (i - k >= 2) // already seen at least one OL or UL
|
1436
|
+
{
|
1437
|
+
ary_push(parser->line, NESTED_LIST); // which means this is a nested list
|
1438
|
+
i += 3;
|
1439
|
+
}
|
1440
|
+
else
|
1441
|
+
i += 2;
|
1442
|
+
ary_push(parser->line, type);
|
1443
|
+
ary_push(parser->line, LI);
|
1444
|
+
|
1445
|
+
// want to compare line with scope but can only do so if scope has enough items on it
|
1446
|
+
if (j >= i)
|
1447
|
+
{
|
1448
|
+
if (ary_entry(parser->scope, i + bq_count - 2) == type && ary_entry(parser->scope, i + bq_count - 1) == LI)
|
1449
|
+
{
|
1450
|
+
// line and scope match at this point: do nothing yet
|
1451
|
+
}
|
1452
|
+
else
|
1453
|
+
{
|
1454
|
+
// item just pushed onto line does not match corresponding slot of scope!
|
1455
|
+
for (; j >= i - 2; j--)
|
1456
|
+
// must pop back before emitting
|
1457
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1458
|
+
|
1459
|
+
// will emit UL or OL, then LI
|
1460
|
+
break;
|
1461
|
+
}
|
1462
|
+
}
|
1463
|
+
else // line stack size now exceeds scope stack size: must increase nesting level
|
1464
|
+
break; // will emit UL or OL, then LI
|
1465
|
+
}
|
1466
|
+
else
|
1467
|
+
{
|
1468
|
+
// not an OL or UL token!
|
1469
|
+
if (j == i)
|
1470
|
+
// must close existing LI and re-open new one
|
1471
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1472
|
+
else if (j > i)
|
1473
|
+
{
|
1474
|
+
// item just pushed onto line does not match corresponding slot of scope!
|
1475
|
+
for (; j >= i; j--)
|
1476
|
+
// must pop back before emitting
|
1477
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1478
|
+
}
|
1479
|
+
break;
|
1480
|
+
}
|
1481
|
+
NEXT_TOKEN();
|
1482
|
+
}
|
1483
|
+
|
1484
|
+
// will emit
|
1485
|
+
if (type == OL || type == UL)
|
1486
|
+
{
|
1487
|
+
// if LI is at the top of a stack this is the start of a nested list
|
1488
|
+
if (j > 0 && ary_entry(parser->scope, -1) == LI)
|
1489
|
+
{
|
1490
|
+
// so we should precede it with a CRLF, and indicate that it's a nested list
|
1491
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1492
|
+
ary_push(parser->scope, NESTED_LIST);
|
1493
|
+
}
|
1494
|
+
else
|
1495
|
+
{
|
1496
|
+
// this is a new list
|
1497
|
+
if (IN(BLOCKQUOTE_START))
|
1498
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
1499
|
+
else
|
1500
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
1501
|
+
}
|
1502
|
+
|
1503
|
+
// emit
|
1504
|
+
_Wikitext_indent(parser);
|
1505
|
+
if (type == OL)
|
1506
|
+
rb_str_cat(parser->output, ol_start, sizeof(ol_start) - 1);
|
1507
|
+
else if (type == UL)
|
1508
|
+
rb_str_cat(parser->output, ul_start, sizeof(ul_start) - 1);
|
1509
|
+
ary_push(parser->scope, type);
|
1510
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1511
|
+
}
|
1512
|
+
else if (type == SPACE)
|
1513
|
+
// silently throw away the optional SPACE token after final list marker
|
1514
|
+
token = NULL;
|
1515
|
+
|
1516
|
+
_Wikitext_indent(parser);
|
1517
|
+
rb_str_cat(parser->output, li_start, sizeof(li_start) - 1);
|
1518
|
+
ary_push(parser->scope, LI);
|
1519
|
+
|
1520
|
+
// any subsequent UL or OL tokens on this line are syntax errors and must be emitted literally
|
1521
|
+
if (type == OL || type == UL)
|
1522
|
+
{
|
1523
|
+
k = 0;
|
1524
|
+
while (k++, NEXT_TOKEN(), (type = token->type))
|
1525
|
+
{
|
1526
|
+
if (type == OL || type == UL)
|
1527
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1528
|
+
else if (type == SPACE && k == 1)
|
1529
|
+
{
|
1530
|
+
// silently throw away the optional SPACE token after final list marker
|
1531
|
+
token = NULL;
|
1532
|
+
break;
|
1533
|
+
}
|
1534
|
+
else
|
1535
|
+
break;
|
1536
|
+
}
|
1537
|
+
}
|
1538
|
+
|
1539
|
+
// jump to top of the loop to process token we scanned during lookahead
|
1540
|
+
continue;
|
1541
|
+
|
1542
|
+
case H6_START:
|
1543
|
+
case H5_START:
|
1544
|
+
case H4_START:
|
1545
|
+
case H3_START:
|
1546
|
+
case H2_START:
|
1547
|
+
case H1_START:
|
1548
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
1549
|
+
{
|
1550
|
+
// no need to check for PRE; can never appear inside it
|
1551
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1552
|
+
break;
|
1553
|
+
}
|
1554
|
+
|
1555
|
+
// pop up to but not including the last BLOCKQUOTE on the scope stack
|
1556
|
+
if (IN(BLOCKQUOTE_START))
|
1557
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
1558
|
+
else
|
1559
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
1560
|
+
|
1561
|
+
// count number of BLOCKQUOTE tokens in line buffer and in scope stack
|
1562
|
+
ary_push(parser->line, type);
|
1563
|
+
i = ary_count(parser->line, BLOCKQUOTE);
|
1564
|
+
j = ary_count(parser->scope, BLOCKQUOTE);
|
1565
|
+
|
1566
|
+
// decide whether we need to pop off excess BLOCKQUOTE tokens (will never need to push; that is handled above in the BLOCKQUOTE case itself)
|
1567
|
+
if (i < j)
|
1568
|
+
{
|
1569
|
+
// must pop (reduce nesting level)
|
1570
|
+
for (i = j - i; i > 0; i--)
|
1571
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
|
1572
|
+
}
|
1573
|
+
|
1574
|
+
// discard any whitespace here (so that "== foo ==" will be translated to "<h2>foo</h2>" rather than "<h2> foo </h2>")
|
1575
|
+
while (NEXT_TOKEN(), (token->type == SPACE))
|
1576
|
+
; // discard
|
1577
|
+
|
1578
|
+
ary_push(parser->scope, type);
|
1579
|
+
_Wikitext_indent(parser);
|
1580
|
+
|
1581
|
+
// rather than repeat all that code for each kind of heading, share it and use a conditional here
|
1582
|
+
if (type == H6_START)
|
1583
|
+
rb_str_cat(parser->output, h6_start, sizeof(h6_start) - 1);
|
1584
|
+
else if (type == H5_START)
|
1585
|
+
rb_str_cat(parser->output, h5_start, sizeof(h5_start) - 1);
|
1586
|
+
else if (type == H4_START)
|
1587
|
+
rb_str_cat(parser->output, h4_start, sizeof(h4_start) - 1);
|
1588
|
+
else if (type == H3_START)
|
1589
|
+
rb_str_cat(parser->output, h3_start, sizeof(h3_start) - 1);
|
1590
|
+
else if (type == H2_START)
|
1591
|
+
rb_str_cat(parser->output, h2_start, sizeof(h2_start) - 1);
|
1592
|
+
else if (type == H1_START)
|
1593
|
+
rb_str_cat(parser->output, h1_start, sizeof(h1_start) - 1);
|
1594
|
+
|
1595
|
+
// jump to top of the loop to process token we scanned during lookahead
|
1596
|
+
continue;
|
1597
|
+
|
1598
|
+
case H6_END:
|
1599
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1600
|
+
rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
|
1601
|
+
else
|
1602
|
+
{
|
1603
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1604
|
+
if (!IN(H6_START))
|
1605
|
+
{
|
1606
|
+
// literal output only if not in h6 scope (we stay silent in that case)
|
1607
|
+
_Wikitext_start_para_if_necessary(parser);
|
1608
|
+
rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
|
1609
|
+
}
|
1610
|
+
}
|
1611
|
+
break;
|
1612
|
+
|
1613
|
+
case H5_END:
|
1614
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1615
|
+
rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
|
1616
|
+
else
|
1617
|
+
{
|
1618
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1619
|
+
if (!IN(H5_START))
|
1620
|
+
{
|
1621
|
+
// literal output only if not in h5 scope (we stay silent in that case)
|
1622
|
+
_Wikitext_start_para_if_necessary(parser);
|
1623
|
+
rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
|
1624
|
+
}
|
1625
|
+
}
|
1626
|
+
break;
|
1627
|
+
|
1628
|
+
case H4_END:
|
1629
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1630
|
+
rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
|
1631
|
+
else
|
1632
|
+
{
|
1633
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1634
|
+
if (!IN(H4_START))
|
1635
|
+
{
|
1636
|
+
// literal output only if not in h4 scope (we stay silent in that case)
|
1637
|
+
_Wikitext_start_para_if_necessary(parser);
|
1638
|
+
rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
|
1639
|
+
}
|
1640
|
+
}
|
1641
|
+
break;
|
1642
|
+
|
1643
|
+
case H3_END:
|
1644
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1645
|
+
rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
|
1646
|
+
else
|
1647
|
+
{
|
1648
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1649
|
+
if (!IN(H3_START))
|
1650
|
+
{
|
1651
|
+
// literal output only if not in h3 scope (we stay silent in that case)
|
1652
|
+
_Wikitext_start_para_if_necessary(parser);
|
1653
|
+
rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
|
1654
|
+
}
|
1655
|
+
}
|
1656
|
+
break;
|
1657
|
+
|
1658
|
+
case H2_END:
|
1659
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1660
|
+
rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
|
1661
|
+
else
|
1662
|
+
{
|
1663
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1664
|
+
if (!IN(H2_START))
|
1665
|
+
{
|
1666
|
+
// literal output only if not in h2 scope (we stay silent in that case)
|
1667
|
+
_Wikitext_start_para_if_necessary(parser);
|
1668
|
+
rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
|
1669
|
+
}
|
1670
|
+
}
|
1671
|
+
break;
|
1672
|
+
|
1673
|
+
case H1_END:
|
1674
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1675
|
+
rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
|
1676
|
+
else
|
1677
|
+
{
|
1678
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1679
|
+
if (!IN(H1_START))
|
1680
|
+
{
|
1681
|
+
// literal output only if not in h1 scope (we stay silent in that case)
|
1682
|
+
_Wikitext_start_para_if_necessary(parser);
|
1683
|
+
rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
|
1684
|
+
}
|
1685
|
+
}
|
1686
|
+
break;
|
1687
|
+
|
1688
|
+
case MAIL:
|
1689
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1690
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1691
|
+
else
|
1692
|
+
{
|
1693
|
+
// in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
|
1694
|
+
_Wikitext_pop_excess_elements(parser);
|
1695
|
+
_Wikitext_start_para_if_necessary(parser);
|
1696
|
+
i = TOKEN_TEXT(token);
|
1697
|
+
if (parser->autolink == Qtrue)
|
1698
|
+
i = _Wikitext_hyperlink(rb_str_new2("mailto:"), i, i, mailto_class);
|
1699
|
+
rb_str_append(parser->output, i);
|
1700
|
+
}
|
1701
|
+
break;
|
1702
|
+
|
1703
|
+
case URI:
|
1704
|
+
if (IN(NO_WIKI_START))
|
1705
|
+
// user can temporarily suppress autolinking by using <nowiki></nowiki>
|
1706
|
+
// note that unlike MediaWiki, we do allow autolinking inside PRE blocks
|
1707
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1708
|
+
else if (IN(LINK_START))
|
1709
|
+
{
|
1710
|
+
// if the URI were allowed it would have been handled already in LINK_START
|
1711
|
+
_Wikitext_rollback_failed_link(parser);
|
1712
|
+
i = TOKEN_TEXT(token);
|
1713
|
+
if (parser->autolink == Qtrue)
|
1714
|
+
i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
|
1715
|
+
rb_str_append(parser->output, i);
|
1716
|
+
}
|
1717
|
+
else if (IN(EXT_LINK_START))
|
1718
|
+
{
|
1719
|
+
if (NIL_P(parser->link_target))
|
1720
|
+
{
|
1721
|
+
// this must be our link target: look ahead to make sure we see the space we're expecting to see
|
1722
|
+
i = TOKEN_TEXT(token);
|
1723
|
+
NEXT_TOKEN();
|
1724
|
+
if (token->type == SPACE)
|
1725
|
+
{
|
1726
|
+
ary_push(parser->scope, SPACE);
|
1727
|
+
parser->link_target = i;
|
1728
|
+
parser->link_text = rb_str_new2("");
|
1729
|
+
parser->capture = parser->link_text;
|
1730
|
+
token = NULL; // silently consume space
|
1731
|
+
}
|
1732
|
+
else
|
1733
|
+
{
|
1734
|
+
// didn't see the space! this must be an error
|
1735
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1736
|
+
_Wikitext_pop_excess_elements(parser);
|
1737
|
+
_Wikitext_start_para_if_necessary(parser);
|
1738
|
+
rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
|
1739
|
+
if (parser->autolink == Qtrue)
|
1740
|
+
i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
|
1741
|
+
rb_str_append(parser->output, i);
|
1742
|
+
}
|
1743
|
+
}
|
1744
|
+
else
|
1745
|
+
{
|
1746
|
+
if (NIL_P(parser->link_text))
|
1747
|
+
// this must be the first part of our link text
|
1748
|
+
parser->link_text = TOKEN_TEXT(token);
|
1749
|
+
else
|
1750
|
+
// add to existing link text
|
1751
|
+
rb_str_cat(parser->link_text, token->start, TOKEN_LEN(token));
|
1752
|
+
}
|
1753
|
+
}
|
1754
|
+
else
|
1755
|
+
{
|
1756
|
+
// in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
|
1757
|
+
_Wikitext_pop_excess_elements(parser);
|
1758
|
+
_Wikitext_start_para_if_necessary(parser);
|
1759
|
+
i = TOKEN_TEXT(token);
|
1760
|
+
if (parser->autolink == Qtrue)
|
1761
|
+
i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
|
1762
|
+
rb_str_append(parser->output, i);
|
1763
|
+
}
|
1764
|
+
break;
|
1765
|
+
|
1766
|
+
// internal links (links to other wiki articles) look like this:
|
1767
|
+
// [[another article]] (would point at, for example, "/wiki/another_article")
|
1768
|
+
// [[the other article|the link text we'll use for it]]
|
1769
|
+
// [[the other article | the link text we'll use for it]]
|
1770
|
+
// note that the forward slash is a reserved character which changes the meaning of an internal link;
|
1771
|
+
// this is a link that is external to the wiki but internal to the site as a whole:
|
1772
|
+
// [[bug/12]] (a relative link to "/bug/12")
|
1773
|
+
// MediaWiki has strict requirements about what it will accept as a link target:
|
1774
|
+
// all wikitext markup is disallowed:
|
1775
|
+
// example [[foo ''bar'' baz]]
|
1776
|
+
// renders [[foo <em>bar</em> baz]] (ie. not a link)
|
1777
|
+
// example [[foo <em>bar</em> baz]]
|
1778
|
+
// renders [[foo <em>bar</em> baz]] (ie. not a link)
|
1779
|
+
// example [[foo <nowiki>''</nowiki> baz]]
|
1780
|
+
// renders [[foo '' baz]] (ie. not a link)
|
1781
|
+
// example [[foo <bar> baz]]
|
1782
|
+
// renders [[foo <bar> baz]] (ie. not a link)
|
1783
|
+
// HTML entities and non-ASCII, however, make it through:
|
1784
|
+
// example [[foo &euro;]]
|
1785
|
+
// renders <a href="/wiki/Foo_%E2%82%AC">foo &euro;</a>
|
1786
|
+
// example [[foo €]]
|
1787
|
+
// renders <a href="/wiki/Foo_%E2%82%AC">foo €</a>
|
1788
|
+
// we'll impose similar restrictions here for the link target; allowed tokens will be:
|
1789
|
+
// SPACE, PRINTABLE, DEFAULT, QUOT, QUOT_ENTITY, AMP and AMP_ENTITY
|
1790
|
+
// everything else will be rejected
|
1791
|
+
case LINK_START:
|
1792
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1793
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1794
|
+
rb_str_cat(i, link_start, sizeof(link_start) - 1);
|
1795
|
+
else if (IN(EXT_LINK_START))
|
1796
|
+
// already in external link scope! (and in fact, must be capturing link_text right now)
|
1797
|
+
rb_str_cat(i, link_start, sizeof(link_start) - 1);
|
1798
|
+
else if (IN(LINK_START))
|
1799
|
+
{
|
1800
|
+
// already in internal link scope! this is a syntax error
|
1801
|
+
_Wikitext_rollback_failed_link(parser);
|
1802
|
+
rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
|
1803
|
+
}
|
1804
|
+
else if (IN(SEPARATOR))
|
1805
|
+
{
|
1806
|
+
// scanning internal link text
|
1807
|
+
}
|
1808
|
+
else // not in internal link scope yet
|
1809
|
+
{
|
1810
|
+
// will either emit a link, or the rollback of a failed link, so start the para now
|
1811
|
+
_Wikitext_pop_excess_elements(parser);
|
1812
|
+
_Wikitext_start_para_if_necessary(parser);
|
1813
|
+
ary_push(parser->scope, LINK_START);
|
1814
|
+
|
1815
|
+
// look ahead and try to gobble up link target
|
1816
|
+
while (NEXT_TOKEN(), (type = token->type))
|
1817
|
+
{
|
1818
|
+
if (type == SPACE ||
|
1819
|
+
type == PRINTABLE ||
|
1820
|
+
type == DEFAULT ||
|
1821
|
+
type == QUOT ||
|
1822
|
+
type == QUOT_ENTITY ||
|
1823
|
+
type == AMP ||
|
1824
|
+
type == AMP_ENTITY)
|
1825
|
+
{
|
1826
|
+
// accumulate these tokens into link_target
|
1827
|
+
if (NIL_P(parser->link_target))
|
1828
|
+
{
|
1829
|
+
parser->link_target = rb_str_new2("");
|
1830
|
+
parser->capture = parser->link_target;
|
1831
|
+
}
|
1832
|
+
if (type == QUOT_ENTITY)
|
1833
|
+
// don't insert the entity, insert the literal quote
|
1834
|
+
rb_str_cat(parser->link_target, quote, sizeof(quote) - 1);
|
1835
|
+
else if (type == AMP_ENTITY)
|
1836
|
+
// don't insert the entity, insert the literal ampersand
|
1837
|
+
rb_str_cat(parser->link_target, ampersand, sizeof(ampersand) - 1);
|
1838
|
+
else
|
1839
|
+
rb_str_cat(parser->link_target, token->start, TOKEN_LEN(token));
|
1840
|
+
}
|
1841
|
+
else if (type == LINK_END)
|
1842
|
+
break; // jump back to top of loop (will handle this in LINK_END case below)
|
1843
|
+
else if (type == SEPARATOR)
|
1844
|
+
{
|
1845
|
+
ary_push(parser->scope, SEPARATOR);
|
1846
|
+
parser->link_text = rb_str_new2("");
|
1847
|
+
parser->capture = parser->link_text;
|
1848
|
+
token = NULL;
|
1849
|
+
break;
|
1850
|
+
}
|
1851
|
+
else // unexpected token (syntax error)
|
1852
|
+
{
|
1853
|
+
_Wikitext_rollback_failed_link(parser);
|
1854
|
+
break; // jump back to top of loop to handle unexpected token
|
1855
|
+
}
|
1856
|
+
}
|
1857
|
+
|
1858
|
+
// jump to top of the loop to process token we scanned during lookahead (if any)
|
1859
|
+
continue;
|
1860
|
+
}
|
1861
|
+
break;
|
1862
|
+
|
1863
|
+
case LINK_END:
|
1864
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1865
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1866
|
+
rb_str_cat(i, link_end, sizeof(link_end) - 1);
|
1867
|
+
else if (IN(EXT_LINK_START))
|
1868
|
+
// already in external link scope! (and in fact, must be capturing link_text right now)
|
1869
|
+
rb_str_cat(i, link_end, sizeof(link_end) - 1);
|
1870
|
+
else if (IN(LINK_START))
|
1871
|
+
{
|
1872
|
+
// in internal link scope!
|
1873
|
+
if (NIL_P(parser->link_text) || RSTRING_LEN(parser->link_text) == 0)
|
1874
|
+
// use link target as link text
|
1875
|
+
parser->link_text = _Wikitext_parser_sanitize_link_target(parser->link_target, Qtrue);
|
1876
|
+
else
|
1877
|
+
parser->link_text = _Wikitext_parser_trim_link_target(parser->link_text);
|
1878
|
+
_Wikitext_parser_encode_link_target(parser);
|
1879
|
+
_Wikitext_pop_from_stack_up_to(parser, i, LINK_START, Qtrue);
|
1880
|
+
parser->capture = Qnil;
|
1881
|
+
if (parser->special_link)
|
1882
|
+
i = _Wikitext_hyperlink(rb_str_new2("/"), parser->link_target, parser->link_text, Qnil);
|
1883
|
+
else
|
1884
|
+
i = _Wikitext_hyperlink(prefix, parser->link_target, parser->link_text, Qnil);
|
1885
|
+
rb_str_append(parser->output, i);
|
1886
|
+
parser->link_target = Qnil;
|
1887
|
+
parser->link_text = Qnil;
|
1888
|
+
}
|
1889
|
+
else // wasn't in internal link scope
|
1890
|
+
{
|
1891
|
+
_Wikitext_pop_excess_elements(parser);
|
1892
|
+
_Wikitext_start_para_if_necessary(parser);
|
1893
|
+
rb_str_cat(i, link_end, sizeof(link_end) - 1);
|
1894
|
+
}
|
1895
|
+
break;
|
1896
|
+
|
1897
|
+
// external links look like this:
|
1898
|
+
// [http://google.com/ the link text]
|
1899
|
+
// strings in square brackets which don't match this syntax get passed through literally; eg:
|
1900
|
+
// he was very angery [sic] about the turn of events
|
1901
|
+
case EXT_LINK_START:
|
1902
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1903
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1904
|
+
rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
|
1905
|
+
else if (IN(EXT_LINK_START))
|
1906
|
+
// already in external link scope! (and in fact, must be capturing link_text right now)
|
1907
|
+
rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
|
1908
|
+
else if (IN(LINK_START))
|
1909
|
+
{
|
1910
|
+
// already in internal link scope!
|
1911
|
+
i = rb_str_new(ext_link_start, sizeof(ext_link_start) - 1);
|
1912
|
+
if (NIL_P(parser->link_target))
|
1913
|
+
// this must be the first character of our link target
|
1914
|
+
parser->link_target = i;
|
1915
|
+
else if (IN(SPACE))
|
1916
|
+
{
|
1917
|
+
// link target has already been scanned
|
1918
|
+
if (NIL_P(parser->link_text))
|
1919
|
+
// this must be the first character of our link text
|
1920
|
+
parser->link_text = i;
|
1921
|
+
else
|
1922
|
+
// add to existing link text
|
1923
|
+
rb_str_append(parser->link_text, i);
|
1924
|
+
}
|
1925
|
+
else
|
1926
|
+
// add to existing link target
|
1927
|
+
rb_str_append(parser->link_target, i);
|
1928
|
+
}
|
1929
|
+
else // not in external link scope yet
|
1930
|
+
{
|
1931
|
+
// will either emit a link, or the rollback of a failed link, so start the para now
|
1932
|
+
_Wikitext_pop_excess_elements(parser);
|
1933
|
+
_Wikitext_start_para_if_necessary(parser);
|
1934
|
+
|
1935
|
+
// look ahead: expect a URI
|
1936
|
+
NEXT_TOKEN();
|
1937
|
+
if (token->type == URI)
|
1938
|
+
ary_push(parser->scope, EXT_LINK_START); // so far so good, jump back to the top of the loop
|
1939
|
+
else
|
1940
|
+
// only get here if there was a syntax error (missing URI)
|
1941
|
+
rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
|
1942
|
+
continue; // jump back to top of loop to handle token (either URI or whatever it is)
|
1943
|
+
}
|
1944
|
+
break;
|
1945
|
+
|
1946
|
+
case EXT_LINK_END:
|
1947
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1948
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1949
|
+
rb_str_cat(i, ext_link_end, sizeof(ext_link_end) - 1);
|
1950
|
+
else if (IN(EXT_LINK_START))
|
1951
|
+
{
|
1952
|
+
if (NIL_P(parser->link_text))
|
1953
|
+
// syntax error: external link with no link text
|
1954
|
+
_Wikitext_rollback_failed_external_link(parser);
|
1955
|
+
else
|
1956
|
+
{
|
1957
|
+
// success!
|
1958
|
+
_Wikitext_pop_from_stack_up_to(parser, i, EXT_LINK_START, Qtrue);
|
1959
|
+
parser->capture = Qnil;
|
1960
|
+
i = _Wikitext_hyperlink(Qnil, parser->link_target, parser->link_text, parser->external_link_class);
|
1961
|
+
rb_str_append(parser->output, i);
|
1962
|
+
}
|
1963
|
+
parser->link_target = Qnil;
|
1964
|
+
parser->link_text = Qnil;
|
1965
|
+
}
|
1966
|
+
else
|
1967
|
+
{
|
1968
|
+
_Wikitext_pop_excess_elements(parser);
|
1969
|
+
_Wikitext_start_para_if_necessary(parser);
|
1970
|
+
rb_str_cat(parser->output, ext_link_end, sizeof(ext_link_end) - 1);
|
1971
|
+
}
|
1972
|
+
break;
|
1973
|
+
|
1974
|
+
case SEPARATOR:
|
1975
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1976
|
+
_Wikitext_pop_excess_elements(parser);
|
1977
|
+
_Wikitext_start_para_if_necessary(parser);
|
1978
|
+
rb_str_cat(i, separator, sizeof(separator) - 1);
|
1979
|
+
break;
|
1980
|
+
|
1981
|
+
case SPACE:
|
1982
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1983
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1984
|
+
rb_str_cat(i, token->start, TOKEN_LEN(token));
|
1985
|
+
else
|
1986
|
+
{
|
1987
|
+
// peek ahead to see next token
|
1988
|
+
char *token_ptr = token->start;
|
1989
|
+
int token_len = TOKEN_LEN(token);
|
1990
|
+
NEXT_TOKEN();
|
1991
|
+
type = token->type;
|
1992
|
+
if (((type == H6_END) && IN(H6_START)) ||
|
1993
|
+
((type == H5_END) && IN(H5_START)) ||
|
1994
|
+
((type == H4_END) && IN(H4_START)) ||
|
1995
|
+
((type == H3_END) && IN(H3_START)) ||
|
1996
|
+
((type == H2_END) && IN(H2_START)) ||
|
1997
|
+
((type == H1_END) && IN(H1_START)))
|
1998
|
+
{
|
1999
|
+
// will suppress emission of space (discard) if next token is a H6_END, H5_END etc and we are in the corresponding scope
|
2000
|
+
}
|
2001
|
+
else
|
2002
|
+
{
|
2003
|
+
// emit the space
|
2004
|
+
_Wikitext_pop_excess_elements(parser);
|
2005
|
+
_Wikitext_start_para_if_necessary(parser);
|
2006
|
+
rb_str_cat(i, token_ptr, token_len);
|
2007
|
+
}
|
2008
|
+
|
2009
|
+
// jump to top of the loop to process token we scanned during lookahead
|
2010
|
+
continue;
|
2011
|
+
}
|
2012
|
+
break;
|
2013
|
+
|
2014
|
+
case QUOT_ENTITY:
|
2015
|
+
case AMP_ENTITY:
|
2016
|
+
case NAMED_ENTITY:
|
2017
|
+
case DECIMAL_ENTITY:
|
2018
|
+
// pass these through unaltered as they are case sensitive
|
2019
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2020
|
+
_Wikitext_pop_excess_elements(parser);
|
2021
|
+
_Wikitext_start_para_if_necessary(parser);
|
2022
|
+
rb_str_cat(i, token->start, TOKEN_LEN(token));
|
2023
|
+
break;
|
2024
|
+
|
2025
|
+
case HEX_ENTITY:
|
2026
|
+
// normalize hex entities (downcase them)
|
2027
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2028
|
+
_Wikitext_pop_excess_elements(parser);
|
2029
|
+
_Wikitext_start_para_if_necessary(parser);
|
2030
|
+
rb_str_append(i, _Wikitext_downcase(TOKEN_TEXT(token)));
|
2031
|
+
break;
|
2032
|
+
|
2033
|
+
case QUOT:
|
2034
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2035
|
+
_Wikitext_pop_excess_elements(parser);
|
2036
|
+
_Wikitext_start_para_if_necessary(parser);
|
2037
|
+
rb_str_cat(i, quot_entity, sizeof(quot_entity) - 1);
|
2038
|
+
break;
|
2039
|
+
|
2040
|
+
case AMP:
|
2041
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2042
|
+
_Wikitext_pop_excess_elements(parser);
|
2043
|
+
_Wikitext_start_para_if_necessary(parser);
|
2044
|
+
rb_str_cat(i, amp_entity, sizeof(amp_entity) - 1);
|
2045
|
+
break;
|
2046
|
+
|
2047
|
+
case LESS:
|
2048
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2049
|
+
_Wikitext_pop_excess_elements(parser);
|
2050
|
+
_Wikitext_start_para_if_necessary(parser);
|
2051
|
+
rb_str_cat(i, lt_entity, sizeof(lt_entity) - 1);
|
2052
|
+
break;
|
2053
|
+
|
2054
|
+
case GREATER:
|
2055
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2056
|
+
_Wikitext_pop_excess_elements(parser);
|
2057
|
+
_Wikitext_start_para_if_necessary(parser);
|
2058
|
+
rb_str_cat(i, gt_entity, sizeof(gt_entity) - 1);
|
2059
|
+
break;
|
2060
|
+
|
2061
|
+
case CRLF:
|
2062
|
+
parser->pending_crlf = Qfalse;
|
2063
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
2064
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
2065
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
2066
|
+
{
|
2067
|
+
ary_clear(parser->line_buffer);
|
2068
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
2069
|
+
break;
|
2070
|
+
}
|
2071
|
+
else if (IN(PRE))
|
2072
|
+
{
|
2073
|
+
// beware when nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, that must be end of PRE block
|
2074
|
+
if (NO_ITEM(ary_entry(parser->line_buffer, -2)) || ary_entry(parser->line_buffer, -2) == BLOCKQUOTE)
|
2075
|
+
// don't emit in this case
|
2076
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
|
2077
|
+
else
|
2078
|
+
{
|
2079
|
+
// peek ahead to see if this is definitely the end of the PRE block
|
2080
|
+
NEXT_TOKEN();
|
2081
|
+
type = token->type;
|
2082
|
+
if (type != BLOCKQUOTE && type != PRE)
|
2083
|
+
{
|
2084
|
+
// this is definitely the end of the block, so don't emit
|
2085
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
|
2086
|
+
}
|
2087
|
+
else
|
2088
|
+
// potentially will emit
|
2089
|
+
parser->pending_crlf = Qtrue;
|
2090
|
+
|
2091
|
+
// delete the entire contents of the line scope stack and buffer
|
2092
|
+
ary_clear(parser->line);
|
2093
|
+
ary_clear(parser->line_buffer);
|
2094
|
+
continue; // jump back to top of loop to handle token grabbed via lookahead
|
2095
|
+
}
|
2096
|
+
}
|
2097
|
+
else
|
2098
|
+
{
|
2099
|
+
parser->pending_crlf = Qtrue;
|
2100
|
+
|
2101
|
+
// count number of BLOCKQUOTE tokens in line buffer (can be zero) and pop back to that level
|
2102
|
+
// as a side effect, this handles any open span-level elements and unclosed blocks
|
2103
|
+
// (with special handling for P blocks and LI elements)
|
2104
|
+
i = ary_count(parser->line, BLOCKQUOTE) + ary_count(parser->scope, BLOCKQUOTE_START);
|
2105
|
+
for (j = parser->scope->count; j > i; j--)
|
2106
|
+
{
|
2107
|
+
if (parser->scope->count > 0 && ary_entry(parser->scope, -1) == LI)
|
2108
|
+
{
|
2109
|
+
parser->pending_crlf = Qfalse;
|
2110
|
+
break;
|
2111
|
+
}
|
2112
|
+
|
2113
|
+
// special handling on last iteration through the loop if the top item on the scope is a P block
|
2114
|
+
if ((j - i == 1) && ary_entry(parser->scope, -1) == P)
|
2115
|
+
{
|
2116
|
+
// if nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, this must be a paragraph break
|
2117
|
+
// (note that we have to make sure we're not inside a BLOCKQUOTE_START block
|
2118
|
+
// because in those blocks BLOCKQUOTE tokens have no special meaning)
|
2119
|
+
if (NO_ITEM(ary_entry(parser->line_buffer, -2)) ||
|
2120
|
+
(ary_entry(parser->line_buffer, -2) == BLOCKQUOTE && !IN(BLOCKQUOTE_START)))
|
2121
|
+
// paragraph break
|
2122
|
+
parser->pending_crlf = Qfalse;
|
2123
|
+
else
|
2124
|
+
// not a paragraph break!
|
2125
|
+
continue;
|
2126
|
+
}
|
2127
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
2128
|
+
}
|
2129
|
+
}
|
2130
|
+
|
2131
|
+
// delete the entire contents of the line scope stack and buffer
|
2132
|
+
ary_clear(parser->line);
|
2133
|
+
ary_clear(parser->line_buffer);
|
2134
|
+
break;
|
2135
|
+
|
2136
|
+
case PRINTABLE:
|
2137
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2138
|
+
_Wikitext_pop_excess_elements(parser);
|
2139
|
+
_Wikitext_start_para_if_necessary(parser);
|
2140
|
+
rb_str_cat(i, token->start, TOKEN_LEN(token));
|
2141
|
+
break;
|
2142
|
+
|
2143
|
+
case DEFAULT:
|
2144
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2145
|
+
_Wikitext_pop_excess_elements(parser);
|
2146
|
+
_Wikitext_start_para_if_necessary(parser);
|
2147
|
+
rb_str_append(i, _Wikitext_utf32_char_to_entity(token->code_point)); // convert to entity
|
2148
|
+
break;
|
2149
|
+
|
2150
|
+
case END_OF_FILE:
|
2151
|
+
// close any open scopes on hitting EOF
|
2152
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
2153
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
2154
|
+
for (i = 0, j = parser->scope->count; i < j; i++)
|
2155
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
2156
|
+
goto return_output; // break not enough here (want to break out of outer while loop, not inner switch statement)
|
2157
|
+
|
2158
|
+
default:
|
2159
|
+
break;
|
2160
|
+
}
|
2161
|
+
|
2162
|
+
// reset current token; forcing lexer to return another token at the top of the loop
|
2163
|
+
token = NULL;
|
2164
|
+
} while (1);
|
2165
|
+
return_output:
|
2166
|
+
// BUG: these will leak if we exit this function by raising an exception; need to investigate using Data_Wrap_Struct
|
2167
|
+
ary_free(parser->scope);
|
2168
|
+
ary_free(parser->line);
|
2169
|
+
ary_free(parser->line_buffer);
|
2170
|
+
str_free(parser->line_ending);
|
2171
|
+
if (parser->tabulation)
|
2172
|
+
str_free(parser->tabulation);
|
2173
|
+
return parser->output;
|
2174
|
+
}
|