wikitext 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/ary.h +99 -0
- data/ext/depend +22 -0
- data/ext/extconf.rb +23 -0
- data/ext/parser.c +2174 -0
- data/ext/parser.h +31 -0
- data/ext/str.h +135 -0
- data/ext/token.c +109 -0
- data/ext/token.h +95 -0
- data/ext/wikitext.c +60 -0
- data/ext/wikitext.h +30 -0
- data/ext/wikitext_ragel.c +3354 -0
- data/ext/wikitext_ragel.h +17 -0
- data/spec/autolinking_spec.rb +122 -0
- data/spec/blockquote_spec.rb +570 -0
- data/spec/em_spec.rb +97 -0
- data/spec/encoding_spec.rb +124 -0
- data/spec/entity_spec.rb +40 -0
- data/spec/external_link_spec.rb +289 -0
- data/spec/h1_spec.rb +59 -0
- data/spec/h2_spec.rb +59 -0
- data/spec/h3_spec.rb +59 -0
- data/spec/h4_spec.rb +59 -0
- data/spec/h5_spec.rb +59 -0
- data/spec/h6_spec.rb +59 -0
- data/spec/indentation_spec.rb +70 -0
- data/spec/integration_spec.rb +265 -0
- data/spec/internal_link_spec.rb +445 -0
- data/spec/line_endings_spec.rb +81 -0
- data/spec/link_encoding_spec.rb +132 -0
- data/spec/link_sanitizing_spec.rb +228 -0
- data/spec/nowiki_spec.rb +155 -0
- data/spec/p_spec.rb +44 -0
- data/spec/pre_spec.rb +411 -0
- data/spec/regressions_spec.rb +45 -0
- data/spec/spec_helper.rb +77 -0
- data/spec/strong_em_spec.rb +89 -0
- data/spec/strong_spec.rb +99 -0
- data/spec/tokenizing_spec.rb +190 -0
- data/spec/tt_spec.rb +100 -0
- data/spec/ul_spec.rb +307 -0
- data/spec/wikitext_spec.rb +50 -0
- metadata +93 -0
data/ext/ary.h
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
// Copyright 2008 Wincent Colaiuta
|
2
|
+
// This program is free software: you can redistribute it and/or modify
|
3
|
+
// it under the terms of the GNU General Public License as published by
|
4
|
+
// the Free Software Foundation, either version 3 of the License, or
|
5
|
+
// (at your option) any later version.
|
6
|
+
//
|
7
|
+
// This program is distributed in the hope that it will be useful,
|
8
|
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
+
// GNU General Public License for more details.
|
11
|
+
//
|
12
|
+
// You should have received a copy of the GNU General Public License
|
13
|
+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
14
|
+
|
15
|
+
#include <ruby/ruby.h>
|
16
|
+
|
17
|
+
// Growable array of ints; the functions in this header treat it as a stack.
typedef struct
{
    int count;      // number of entries currently stored
    int max;        // number of entries the current allocation can hold
    int *entries;   // storage for the entries themselves
} ary_t;
|
23
|
+
|
24
|
+
// Number of entries allocated up front, and the step by which the storage grows;
// in the test suite array count goes no higher than 25 or 26
#define DEFAULT_ENTRY_COUNT 64

// True when item is the INT_MAX sentinel used to signal "no such entry".
// The argument is parenthesized so that expressions such as NO_ITEM(a | b) compare the whole expression.
#define NO_ITEM(item) ((item) == INT_MAX)
|
28
|
+
|
29
|
+
inline ary_t *ary_new(void)
|
30
|
+
{
|
31
|
+
ary_t *ary = ALLOC_N(ary_t, 1);
|
32
|
+
ary->count = 0;
|
33
|
+
ary->max = DEFAULT_ENTRY_COUNT;
|
34
|
+
ary->entries = ALLOC_N(int, DEFAULT_ENTRY_COUNT);
|
35
|
+
return ary;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Releases the entry storage and then the array itself.
// The memory is obtained through Ruby's ALLOC_N/REALLOC_N wrappers elsewhere in this header,
// so it is handed back through the matching xfree() rather than the C library's free().
// Declared "static inline" so that a non-inlined call still has a definition to link against.
static inline void ary_free(ary_t *ary)
{
    xfree(ary->entries);
    xfree(ary);
}
|
43
|
+
|
44
|
+
// Returns the entry at idx; a negative idx counts back from the end (-1 is the last entry).
// Returns INT_MAX when the resolved index falls outside the stored entries.
// Declared "static inline" so that a non-inlined call still has a definition to link against.
static inline int ary_entry(ary_t *ary, int idx)
{
    int position = (idx < 0) ? ary->count + idx : idx;
    if (position < 0 || position >= ary->count)
        return INT_MAX;
    return ary->entries[position];
}
|
50
|
+
|
51
|
+
// Empties the array by resetting the count; the allocated storage is kept for reuse.
// Declared "static inline" so that a non-inlined call still has a definition to link against.
static inline void ary_clear(ary_t *ary)
{
    ary->count = 0;
}
|
55
|
+
|
56
|
+
// Discards the last entry, if there is one.
// Returns 1 if an entry was removed and 0 if the array was already empty; the removed value itself is not returned.
// Declared "static inline" so that a non-inlined call still has a definition to link against.
static inline int ary_pop(ary_t *ary)
{
    if (ary->count <= 0)
        return 0;
    ary->count--;
    return 1;
}
|
65
|
+
|
66
|
+
// Appends val to the end of the array, growing the storage by DEFAULT_ENTRY_COUNT entries when it is full.
// Declared "static inline" so that a non-inlined call still has a definition to link against.
static inline void ary_push(ary_t *ary, int val)
{
    if (ary->count == ary->max)
    {
        // record the new capacity only once the reallocation has gone through,
        // so that max never claims more room than entries really has
        int grown = ary->max + DEFAULT_ENTRY_COUNT;
        REALLOC_N(ary->entries, int, grown);
        ary->max = grown;
    }
    ary->entries[ary->count++] = val;
}
|
76
|
+
|
77
|
+
// Returns 1 if val appears anywhere among the stored entries, 0 otherwise.
// Declared "static inline" so that a non-inlined call still has a definition to link against.
static inline int ary_includes(ary_t *ary, int val)
{
    const int *entry    = ary->entries;
    const int *limit    = entry + ary->count;
    for (; entry < limit; entry++)
    {
        if (*entry == val)
            return 1;
    }
    return 0;
}
|
86
|
+
|
87
|
+
// returns a count indicating the number of times the value appears in the collection
|
88
|
+
// refactored from _Wikitext_count()
|
89
|
+
inline int ary_count(ary_t *ary, int item)
|
90
|
+
{
|
91
|
+
int count = 0;
|
92
|
+
for (int i = 0, max = ary->count; i < max; i++)
|
93
|
+
{
|
94
|
+
if (ary->entries[i] == item)
|
95
|
+
count++;
|
96
|
+
}
|
97
|
+
return count;
|
98
|
+
}
|
99
|
+
|
data/ext/depend
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# depend
|
2
|
+
# Additional material for Makefile
|
3
|
+
# Copyright 2008 Wincent Colaiuta
|
4
|
+
# This program is free software: you can redistribute it and/or modify
|
5
|
+
# it under the terms of the GNU General Public License as published by
|
6
|
+
# the Free Software Foundation, either version 3 of the License, or
|
7
|
+
# (at your option) any later version.
|
8
|
+
#
|
9
|
+
# This program is distributed in the hope that it will be useful,
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12
|
+
# GNU General Public License for more details.
|
13
|
+
#
|
14
|
+
# You should have received a copy of the GNU General Public License
|
15
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
16
|
+
|
17
|
+
CFLAGS += -std=gnu99
|
18
|
+
|
19
|
+
parser.o : ary.h parser.c parser.h token.h str.h wikitext.h wikitext_ragel.h
|
20
|
+
token.o : token.c token.h wikitext.h
|
21
|
+
wikitext.o : parser.h token.h wikitext.c wikitext.h wikitext_ragel.h
|
22
|
+
wikitext_ragel.o : token.h wikitext.h wikitext_ragel.h wikitext_ragel.c
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Copyright 2008 Wincent Colaiuta
|
2
|
+
# This program is free software: you can redistribute it and/or modify
|
3
|
+
# it under the terms of the GNU General Public License as published by
|
4
|
+
# the Free Software Foundation, either version 3 of the License, or
|
5
|
+
# (at your option) any later version.
|
6
|
+
#
|
7
|
+
# This program is distributed in the hope that it will be useful,
|
8
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
+
# GNU General Public License for more details.
|
11
|
+
#
|
12
|
+
# You should have received a copy of the GNU General Public License
|
13
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
14
|
+
|
15
|
+
require 'mkmf'
|
16
|
+
|
17
|
+
# Reports that a required build prerequisite could not be found and
# terminates the configuration run with exit status 1.
def missing(item)
  $stdout.puts "couldn't find #{item} (required)"
  exit(1)
end
|
21
|
+
|
22
|
+
# Bail out unless the Ruby header is available, then generate the Makefile for the extension.
missing('ruby.h') unless have_header('ruby.h')
create_makefile('wikitext')
|
data/ext/parser.c
ADDED
@@ -0,0 +1,2174 @@
|
|
1
|
+
// Copyright 2007-2008 Wincent Colaiuta
|
2
|
+
// This program is free software: you can redistribute it and/or modify
|
3
|
+
// it under the terms of the GNU General Public License as published by
|
4
|
+
// the Free Software Foundation, either version 3 of the License, or
|
5
|
+
// (at your option) any later version.
|
6
|
+
//
|
7
|
+
// This program is distributed in the hope that it will be useful,
|
8
|
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
9
|
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
10
|
+
// GNU General Public License for more details.
|
11
|
+
//
|
12
|
+
// You should have received a copy of the GNU General Public License
|
13
|
+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
14
|
+
|
15
|
+
#include "parser.h"
|
16
|
+
#include "ary.h"
|
17
|
+
#include "str.h"
|
18
|
+
#include "wikitext.h"
|
19
|
+
#include "wikitext_ragel.h"
|
20
|
+
|
21
|
+
// True if the given token type is anywhere on the scope stack of the "parser" variable in the enclosing function.
#define IN(type) (ary_includes(parser->scope, (type)))
|
22
|
+
|
23
|
+
// poor man's object orientation in C:
|
24
|
+
// instead of passing multiple parameters around between functions in the parser
|
25
|
+
// we pack everything into a struct and pass around only a pointer to that
|
26
|
+
typedef struct
|
27
|
+
{
|
28
|
+
VALUE output; // for accumulating output to be returned
|
29
|
+
VALUE capture; // for capturing substrings
|
30
|
+
VALUE link_target; // short term "memory" for parsing links
|
31
|
+
VALUE link_text; // short term "memory" for parsing links
|
32
|
+
VALUE external_link_class; // CSS class applied to external links
|
33
|
+
ary_t *scope; // stack for tracking scope
|
34
|
+
ary_t *line; // stack for tracking scope as implied by current line
|
35
|
+
ary_t *line_buffer; // stack for tracking raw tokens (not scope) on current line
|
36
|
+
VALUE pending_crlf; // boolean (Qtrue or Qfalse)
|
37
|
+
VALUE autolink; // boolean (Qtrue or Qfalse)
|
38
|
+
VALUE treat_slash_as_special; // boolean (Qtrue or Qfalse)
|
39
|
+
VALUE special_link; // boolean (Qtrue or Qfalse): is the current link_target a "special" link?
|
40
|
+
str_t *line_ending;
|
41
|
+
int base_indent; // controlled by the :indent option to Wikitext::Parser#parse
|
42
|
+
int current_indent; // fluctuates according to currently nested structures
|
43
|
+
str_t *tabulation; // caching buffer for emitting indentation
|
44
|
+
} parser_t;
|
45
|
+
|
46
|
+
// Fixed strings emitted by the parser.
// "escaped_*" constants and "*_entity" constants hold HTML-escaped text (using &lt; &gt; &amp; &quot;),
// "literal_*" constants hold wikitext markup reproduced verbatim, and the rest hold real HTML tags.
const char escaped_no_wiki_start[]      = "&lt;nowiki&gt;";
const char escaped_no_wiki_end[]        = "&lt;/nowiki&gt;";
const char literal_strong_em[]          = "'''''";
const char literal_strong[]             = "'''";
const char literal_em[]                 = "''";
const char escaped_em_start[]           = "&lt;em&gt;";
const char escaped_em_end[]             = "&lt;/em&gt;";
const char escaped_strong_start[]       = "&lt;strong&gt;";
const char escaped_strong_end[]         = "&lt;/strong&gt;";
const char escaped_tt_start[]           = "&lt;tt&gt;";
const char escaped_tt_end[]             = "&lt;/tt&gt;";
const char literal_h6[]                 = "======";
const char literal_h5[]                 = "=====";
const char literal_h4[]                 = "====";
const char literal_h3[]                 = "===";
const char literal_h2[]                 = "==";
const char literal_h1[]                 = "=";
const char pre_start[]                  = "<pre>";
const char pre_end[]                    = "</pre>";
const char escaped_pre_start[]          = "&lt;pre&gt;";
const char escaped_pre_end[]            = "&lt;/pre&gt;";
const char blockquote_start[]           = "<blockquote>";
const char blockquote_end[]             = "</blockquote>";
const char escaped_blockquote_start[]   = "&lt;blockquote&gt;";
const char escaped_blockquote_end[]     = "&lt;/blockquote&gt;";
const char strong_em_start[]            = "<strong><em>";
const char strong_start[]               = "<strong>";
const char strong_end[]                 = "</strong>";
const char em_start[]                   = "<em>";
const char em_end[]                     = "</em>";
const char tt_start[]                   = "<tt>";
const char tt_end[]                     = "</tt>";
const char ol_start[]                   = "<ol>";
const char ol_end[]                     = "</ol>";
const char ul_start[]                   = "<ul>";
const char ul_end[]                     = "</ul>";
const char li_start[]                   = "<li>";
const char li_end[]                     = "</li>";
const char h6_start[]                   = "<h6>";
const char h6_end[]                     = "</h6>";
const char h5_start[]                   = "<h5>";
const char h5_end[]                     = "</h5>";
const char h4_start[]                   = "<h4>";
const char h4_end[]                     = "</h4>";
const char h3_start[]                   = "<h3>";
const char h3_end[]                     = "</h3>";
const char h2_start[]                   = "<h2>";
const char h2_end[]                     = "</h2>";
const char h1_start[]                   = "<h1>";
const char h1_end[]                     = "</h1>";
const char p_start[]                    = "<p>";
const char p_end[]                      = "</p>";
const char space[]                      = " ";
const char a_start[]                    = "<a href=\"";
const char a_class[]                    = "\" class=\"";
const char a_start_close[]              = "\">";
const char a_end[]                      = "</a>";
const char link_start[]                 = "[[";
const char link_end[]                   = "]]";
const char separator[]                  = "|";
const char ext_link_start[]             = "[";
const char backtick[]                   = "`";
const char quote[]                      = "\"";
const char ampersand[]                  = "&";
const char quot_entity[]                = "&quot;";
const char amp_entity[]                 = "&amp;";
const char lt_entity[]                  = "&lt;";
const char gt_entity[]                  = "&gt;";
const char escaped_blockquote[]         = "&gt; ";
const char ext_link_end[]               = "]";
|
116
|
+
|
117
|
+
// for testing and debugging only
|
118
|
+
VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
|
119
|
+
{
|
120
|
+
if (NIL_P(string))
|
121
|
+
return Qnil;
|
122
|
+
string = StringValue(string);
|
123
|
+
VALUE tokens = rb_ary_new();
|
124
|
+
char *p = RSTRING_PTR(string);
|
125
|
+
long len = RSTRING_LEN(string);
|
126
|
+
char *pe = p + len;
|
127
|
+
token_t token;
|
128
|
+
next_token(&token, NULL, p, pe);
|
129
|
+
rb_ary_push(tokens, _Wikitext_token(&token));
|
130
|
+
while (token.type != END_OF_FILE)
|
131
|
+
{
|
132
|
+
next_token(&token, &token, NULL, pe);
|
133
|
+
rb_ary_push(tokens, _Wikitext_token(&token));
|
134
|
+
}
|
135
|
+
return tokens;
|
136
|
+
}
|
137
|
+
|
138
|
+
// for benchmarking raw tokenization speed only
|
139
|
+
VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
|
140
|
+
{
|
141
|
+
if (NIL_P(string))
|
142
|
+
return Qnil;
|
143
|
+
string = StringValue(string);
|
144
|
+
char *p = RSTRING_PTR(string);
|
145
|
+
long len = RSTRING_LEN(string);
|
146
|
+
char *pe = p + len;
|
147
|
+
token_t token;
|
148
|
+
next_token(&token, NULL, p, pe);
|
149
|
+
while (token.type != END_OF_FILE)
|
150
|
+
next_token(&token, &token, NULL, pe);
|
151
|
+
return Qnil;
|
152
|
+
}
|
153
|
+
|
154
|
+
// we downcase "in place", overwriting the original contents of the buffer and returning the same string
|
155
|
+
inline VALUE _Wikitext_downcase(VALUE string)
|
156
|
+
{
|
157
|
+
char *ptr = RSTRING_PTR(string);
|
158
|
+
long len = RSTRING_LEN(string);
|
159
|
+
for (long i = 0; i < len; i++)
|
160
|
+
{
|
161
|
+
if (ptr[i] >= 'A' && ptr[i] <= 'Z')
|
162
|
+
ptr[i] += 32;
|
163
|
+
}
|
164
|
+
return string;
|
165
|
+
}
|
166
|
+
|
167
|
+
// Builds and returns a new string of the form:
//
//      <a href="PREFIX TARGET" class="CLASS">TEXT</a>
//
// (with no space between prefix and target); the prefix is omitted when link_prefix is nil,
// and the whole class attribute is omitted when link_class is nil.
// The pieces are appended exactly as given: no escaping is performed here.
inline VALUE _Wikitext_hyperlink(VALUE link_prefix, VALUE link_target, VALUE link_text, VALUE link_class)
{
    VALUE anchor = rb_str_new(a_start, sizeof(a_start) - 1);            // <a href="
    if (!NIL_P(link_prefix))
        rb_str_append(anchor, link_prefix);
    rb_str_append(anchor, link_target);
    if (!NIL_P(link_class))
    {
        rb_str_cat(anchor, a_class, sizeof(a_class) - 1);               // " class="
        rb_str_append(anchor, link_class);
    }
    rb_str_cat(anchor, a_start_close, sizeof(a_start_close) - 1);       // ">
    rb_str_append(anchor, link_text);
    rb_str_cat(anchor, a_end, sizeof(a_end) - 1);                       // </a>
    return anchor;
}
|
183
|
+
|
184
|
+
// will emit indentation only if we are about to emit any of:
|
185
|
+
// <blockquote>, <p>, <ul>, <ol>, <li>, <h1> etc, <pre>
|
186
|
+
// each time we enter one of those spans must ++ the indentation level
|
187
|
+
inline void _Wikitext_indent(parser_t *parser)
|
188
|
+
{
|
189
|
+
int space_count = parser->current_indent + parser->base_indent;
|
190
|
+
if (space_count > 0)
|
191
|
+
{
|
192
|
+
char *old_end, *new_end;
|
193
|
+
if (!parser->tabulation)
|
194
|
+
{
|
195
|
+
parser->tabulation = str_new_size(space_count);
|
196
|
+
old_end = parser->tabulation->ptr;
|
197
|
+
}
|
198
|
+
else if (parser->tabulation->len < space_count)
|
199
|
+
{
|
200
|
+
old_end = parser->tabulation->ptr;
|
201
|
+
str_grow(parser->tabulation, space_count);
|
202
|
+
}
|
203
|
+
else
|
204
|
+
old_end = parser->tabulation->ptr;
|
205
|
+
new_end = parser->tabulation->ptr + space_count;
|
206
|
+
while (old_end < new_end)
|
207
|
+
*old_end++ = ' ';
|
208
|
+
rb_str_cat(parser->output, parser->tabulation->ptr, space_count);
|
209
|
+
}
|
210
|
+
parser->current_indent += 2;
|
211
|
+
}
|
212
|
+
|
213
|
+
// Lowers current_indent by 2.
// If emit is Qtrue, also appends the resulting indentation (current_indent + base_indent spaces, when positive)
// to the output, taking the spaces from the parser's tabulation buffer.
inline void _Wikitext_dedent(parser_t *parser, VALUE emit)
{
    parser->current_indent -= 2;
    if (emit == Qtrue)
    {
        int width = parser->current_indent + parser->base_indent;
        if (width > 0)
            rb_str_cat(parser->output, parser->tabulation->ptr, width);
    }
}
|
222
|
+
|
223
|
+
// Pops a single item off the parser's scope stack.
|
224
|
+
// A corresponding closing tag is written to the target string.
|
225
|
+
// The target string may be the main output buffer, or a substring capturing buffer if a link is being scanned.
|
226
|
+
void _Wikitext_pop_from_stack(parser_t *parser, VALUE target)
|
227
|
+
{
|
228
|
+
int top = ary_entry(parser->scope, -1);
|
229
|
+
if (NO_ITEM(top))
|
230
|
+
return;
|
231
|
+
if (NIL_P(target))
|
232
|
+
target = parser->output;
|
233
|
+
switch (top)
|
234
|
+
{
|
235
|
+
case PRE:
|
236
|
+
case PRE_START:
|
237
|
+
rb_str_cat(target, pre_end, sizeof(pre_end) - 1);
|
238
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
239
|
+
_Wikitext_dedent(parser, Qfalse);
|
240
|
+
break;
|
241
|
+
|
242
|
+
case BLOCKQUOTE:
|
243
|
+
case BLOCKQUOTE_START:
|
244
|
+
_Wikitext_dedent(parser, Qtrue);
|
245
|
+
rb_str_cat(target, blockquote_end, sizeof(blockquote_end) - 1);
|
246
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
247
|
+
break;
|
248
|
+
|
249
|
+
case NO_WIKI_START:
|
250
|
+
// not a real HTML tag; so nothing to pop
|
251
|
+
break;
|
252
|
+
|
253
|
+
case STRONG:
|
254
|
+
case STRONG_START:
|
255
|
+
rb_str_cat(target, strong_end, sizeof(strong_end) - 1);
|
256
|
+
break;
|
257
|
+
|
258
|
+
case EM:
|
259
|
+
case EM_START:
|
260
|
+
rb_str_cat(target, em_end, sizeof(em_end) - 1);
|
261
|
+
break;
|
262
|
+
|
263
|
+
case TT:
|
264
|
+
case TT_START:
|
265
|
+
rb_str_cat(target, tt_end, sizeof(tt_end) - 1);
|
266
|
+
break;
|
267
|
+
|
268
|
+
case OL:
|
269
|
+
_Wikitext_dedent(parser, Qtrue);
|
270
|
+
rb_str_cat(target, ol_end, sizeof(ol_end) - 1);
|
271
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
272
|
+
break;
|
273
|
+
|
274
|
+
case UL:
|
275
|
+
_Wikitext_dedent(parser, Qtrue);
|
276
|
+
rb_str_cat(target, ul_end, sizeof(ul_end) - 1);
|
277
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
278
|
+
break;
|
279
|
+
|
280
|
+
case NESTED_LIST:
|
281
|
+
// next token to pop will be a LI
|
282
|
+
// LI is an interesting token because sometimes we want it to behave like P (ie. do a non-emitting indent)
|
283
|
+
// and other times we want it to behave like BLOCKQUOTE (ie. when it has a nested list inside)
|
284
|
+
// hence this hack: we do an emitting dedent on behalf of the LI that we know must be coming
|
285
|
+
// and then when we pop the actual LI itself (below) we do the standard non-emitting indent
|
286
|
+
_Wikitext_dedent(parser, Qtrue); // we really only want to emit the spaces
|
287
|
+
parser->current_indent += 2; // we don't want to decrement the actual indent level, so put it back
|
288
|
+
break;
|
289
|
+
|
290
|
+
case LI:
|
291
|
+
rb_str_cat(target, li_end, sizeof(li_end) - 1);
|
292
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
293
|
+
_Wikitext_dedent(parser, Qfalse);
|
294
|
+
break;
|
295
|
+
|
296
|
+
case H6_START:
|
297
|
+
rb_str_cat(target, h6_end, sizeof(h6_end) - 1);
|
298
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
299
|
+
_Wikitext_dedent(parser, Qfalse);
|
300
|
+
break;
|
301
|
+
|
302
|
+
case H5_START:
|
303
|
+
rb_str_cat(target, h5_end, sizeof(h5_end) - 1);
|
304
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
305
|
+
_Wikitext_dedent(parser, Qfalse);
|
306
|
+
break;
|
307
|
+
|
308
|
+
case H4_START:
|
309
|
+
rb_str_cat(target, h4_end, sizeof(h4_end) - 1);
|
310
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
311
|
+
_Wikitext_dedent(parser, Qfalse);
|
312
|
+
break;
|
313
|
+
|
314
|
+
case H3_START:
|
315
|
+
rb_str_cat(target, h3_end, sizeof(h3_end) - 1);
|
316
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
317
|
+
_Wikitext_dedent(parser, Qfalse);
|
318
|
+
break;
|
319
|
+
|
320
|
+
case H2_START:
|
321
|
+
rb_str_cat(target, h2_end, sizeof(h2_end) - 1);
|
322
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
323
|
+
_Wikitext_dedent(parser, Qfalse);
|
324
|
+
break;
|
325
|
+
|
326
|
+
case H1_START:
|
327
|
+
rb_str_cat(target, h1_end, sizeof(h1_end) - 1);
|
328
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
329
|
+
_Wikitext_dedent(parser, Qfalse);
|
330
|
+
break;
|
331
|
+
|
332
|
+
case LINK_START:
|
333
|
+
// not an HTML tag; so nothing to emit
|
334
|
+
break;
|
335
|
+
|
336
|
+
case EXT_LINK_START:
|
337
|
+
// not an HTML tag; so nothing to emit
|
338
|
+
break;
|
339
|
+
|
340
|
+
case SPACE:
|
341
|
+
// not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
|
342
|
+
break;
|
343
|
+
|
344
|
+
case SEPARATOR:
|
345
|
+
// not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
|
346
|
+
break;
|
347
|
+
|
348
|
+
case P:
|
349
|
+
rb_str_cat(target, p_end, sizeof(p_end) - 1);
|
350
|
+
rb_str_cat(target, parser->line_ending->ptr, parser->line_ending->len);
|
351
|
+
_Wikitext_dedent(parser, Qfalse);
|
352
|
+
break;
|
353
|
+
|
354
|
+
case END_OF_FILE:
|
355
|
+
// nothing to do
|
356
|
+
break;
|
357
|
+
|
358
|
+
default:
|
359
|
+
// should probably raise an exception here
|
360
|
+
break;
|
361
|
+
}
|
362
|
+
ary_pop(parser->scope);
|
363
|
+
}
|
364
|
+
|
365
|
+
// Pops items off the top of parser's scope stack, accumulating closing tags for them into the target string, until item is reached.
|
366
|
+
// If including is Qtrue then the item itself is also popped.
|
367
|
+
// The target string may be the main output buffer, or a substring capturing buffer when scanning links.
|
368
|
+
void _Wikitext_pop_from_stack_up_to(parser_t *parser, VALUE target, int item, VALUE including)
|
369
|
+
{
|
370
|
+
int continue_looping = 1;
|
371
|
+
do
|
372
|
+
{
|
373
|
+
int top = ary_entry(parser->scope, -1);
|
374
|
+
if (NO_ITEM(top))
|
375
|
+
return;
|
376
|
+
if (top == item)
|
377
|
+
{
|
378
|
+
if (including != Qtrue)
|
379
|
+
return;
|
380
|
+
continue_looping = 0;
|
381
|
+
}
|
382
|
+
_Wikitext_pop_from_stack(parser, target);
|
383
|
+
} while (continue_looping);
|
384
|
+
}
|
385
|
+
|
386
|
+
// Opens a paragraph (indentation, "<p>", and a P pushed onto both the scope and the line scope) when no block
// is open yet or when the innermost open item is a blockquote with nothing in it yet.
// Otherwise, emits any pending CRLF: as a single space inside a paragraph, or as a line ending inside a PRE block.
// In both cases the pending CRLF flag is cleared afterwards. Does nothing at all while in capturing mode.
inline void _Wikitext_start_para_if_necessary(parser_t *parser)
{
    if (!NIL_P(parser->capture))    // we don't do anything if in capturing mode
        return;

    int innermost       = ary_entry(parser->scope, -1);
    int needs_paragraph = parser->scope->count == 0 || innermost == BLOCKQUOTE || innermost == BLOCKQUOTE_START;
    if (needs_paragraph)
    {
        _Wikitext_indent(parser);
        rb_str_cat(parser->output, p_start, sizeof(p_start) - 1);
        ary_push(parser->scope, P);
        ary_push(parser->line, P);
    }
    else if (parser->pending_crlf == Qtrue)
    {
        if (IN(P))
        {
            // already in a paragraph block; convert pending CRLF into a space
            rb_str_cat(parser->output, space, sizeof(space) - 1);
        }
        else if (IN(PRE))
        {
            // PRE blocks can have pending CRLF too (helps us avoid emitting the trailing newline)
            rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
        }
    }
    parser->pending_crlf = Qfalse;
}
|
412
|
+
|
413
|
+
// Helper function that pops any excess elements off scope (pushing is already handled in the respective rules).
|
414
|
+
// For example, given input like:
|
415
|
+
//
|
416
|
+
// > > foo
|
417
|
+
// bar
|
418
|
+
//
|
419
|
+
// Upon seeing "bar", we want to pop two BLOCKQUOTE elements from the scope.
|
420
|
+
// The reverse case (shown below) is handled from inside the BLOCKQUOTE rule itself:
|
421
|
+
//
|
422
|
+
// foo
|
423
|
+
// > > bar
|
424
|
+
//
|
425
|
+
// Things are made slightly more complicated by the fact that there is one block-level tag that can be on the scope
|
426
|
+
// but not on the line scope:
|
427
|
+
//
|
428
|
+
// <blockquote>foo
|
429
|
+
// bar</blockquote>
|
430
|
+
//
|
431
|
+
// Here on seeing "bar" we have one item on the scope (BLOCKQUOTE_START) which we don't want to pop, but we have nothing
|
432
|
+
// on the line scope.
|
433
|
+
// Luckily, BLOCKQUOTE_START tokens can only appear at the start of the scope array, so we can check for them first before
|
434
|
+
// entering the for loop.
|
435
|
+
// Pops items off the scope stack until its depth (not counting BLOCKQUOTE_START items) is down to
// the depth that the line scope had on entry. Does nothing while in capturing mode.
void inline _Wikitext_pop_excess_elements(parser_t *parser)
{
    if (!NIL_P(parser->capture))    // we don't pop anything if in capturing mode
        return;

    int wanted = parser->line->count;
    int excess = parser->scope->count - ary_count(parser->scope, BLOCKQUOTE_START) - wanted;
    for (; excess > 0; excess--)
    {
        // special case for the last excess item: don't auto-pop it if it is a P
        if (excess == 1 && ary_entry(parser->scope, -1) == P)
        {
            // add P to the line scope instead, to prevent us popping anything at all next time around
            ary_push(parser->line, P);
            break;
        }
        _Wikitext_pop_from_stack(parser, parser->output);
    }
}
|
455
|
+
|
456
|
+
// Frees the block pointed to by the dest_ptr variable of the enclosing function (a NULL dest_ptr is fine, as
// free(NULL) does nothing) and then raises eWikitextParserError with the message "invalid encoding: " + msg.
// msg must be a string literal because it is pasted onto the prefix at compile time.
#define INVALID_ENCODING(msg) \
    do { \
        free(dest_ptr); \
        rb_raise(eWikitextParserError, "invalid encoding: " msg); \
    } while (0)
|
457
|
+
|
458
|
+
// convert a single UTF-8 codepoint to UTF-32
// expects an input buffer, src, containing a UTF-8 encoded character (which may be multi-byte)
// the end of the input buffer, end, is also passed in to allow the detection of invalidly truncated codepoints
// the number of bytes in the UTF-8 character (between 1 and 4) is returned by reference in width_out
// raises (via the INVALID_ENCODING macro) if the supplied character is invalid UTF-8
// (in which case it also frees the block of memory indicated by dest_ptr if it is non-NULL)
inline uint32_t _Wikitext_utf8_to_utf32(char *src, char *end, long *width_out, void *dest_ptr)
{
    uint32_t dest = 0;  // only ever returned after being assigned below; initialized because the raising paths leave it untouched
    if ((unsigned char)src[0] <= 0x7f)                      // ASCII
    {
        dest = src[0];
        *width_out = 1;
    }
    else if ((src[0] & 0xe0) == 0xc0)                       // byte starts with 110..... : this should be a two-byte sequence
    {
        if (src + 1 >= end)
            INVALID_ENCODING("truncated byte sequence");    // no second byte
        else if (((unsigned char)src[0] == 0xc0) || ((unsigned char)src[0] == 0xc1))
            INVALID_ENCODING("overlong encoding");          // overlong encoding: lead byte of 110..... but code point <= 127
        else if ((src[1] & 0xc0) != 0x80 )
            INVALID_ENCODING("malformed byte sequence");    // should have second byte starting with 10......
        dest = ((uint32_t)(src[0] & 0x1f)) << 6 | (src[1] & 0x3f);
        *width_out = 2;
    }
    else if ((src[0] & 0xf0) == 0xe0)                       // byte starts with 1110.... : this should be a three-byte sequence
    {
        if (src + 2 >= end)
            INVALID_ENCODING("truncated byte sequence");    // missing second or third byte
        else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ))
            INVALID_ENCODING("malformed byte sequence");    // should have second and third bytes starting with 10......
        dest = ((uint32_t)(src[0] & 0x0f)) << 12 | ((uint32_t)(src[1] & 0x3f)) << 6 | (src[2] & 0x3f);
        *width_out = 3;
    }
    else if ((src[0] & 0xf8) == 0xf0)                       // byte starts with 11110... : this should be a four-byte sequence
    {
        if (src + 3 >= end)
            INVALID_ENCODING("truncated byte sequence");    // missing second, third, or fourth byte
        else if ((unsigned char)src[0] >= 0xf5 && (unsigned char)src[0] <= 0xf7)
            INVALID_ENCODING("overlong encoding");          // disallowed by RFC 3629 (codepoints above 0x10ffff)
        else if (((src[1] & 0xc0) != 0x80 ) || ((src[2] & 0xc0) != 0x80 ) || ((src[3] & 0xc0) != 0x80 ))
            INVALID_ENCODING("malformed byte sequence");    // should have second, third and fourth bytes starting with 10......
        // 3 bits from the lead byte, then 6 bits from each of the three continuation bytes
        dest = ((uint32_t)(src[0] & 0x07)) << 18 |
               ((uint32_t)(src[1] & 0x3f)) << 12 |
               ((uint32_t)(src[2] & 0x3f)) << 6  |
               (src[3] & 0x3f);
        *width_out = 4;
    }
    else                                                    // invalid input
        INVALID_ENCODING("unexpected byte");
    return dest;
}
|
507
|
+
|
508
|
+
// Returns a new 8-byte string holding a hexadecimal numeric entity of the form "&#xhhhh;"
// (always four lower-case hex digits) for the given character.
// NOTE(review): only the low 16 bits of character are represented, so anything above 0xffff comes out truncated;
// the fixed 8-byte width looks like something callers size their buffers around, so confirm before widening it.
inline VALUE _Wikitext_utf32_char_to_entity(uint32_t character)
{
    // TODO: consider special casing some entities (ie. quot, amp, lt, gt etc)?
    const char  hex_digits[]    = "0123456789abcdef";
    char        entity[8]       = { '&', '#', 'x', 0, 0, 0, 0, ';' };
    for (int nibble = 0; nibble < 4; nibble++)
    {
        // most significant nibble first: shifts of 12, 8, 4 and 0 bits
        int shift = 12 - 4 * nibble;
        entity[3 + nibble] = hex_digits[(character >> shift) & 0xf];
    }
    return rb_str_new((const char *)entity, sizeof(entity));
}
|
522
|
+
|
523
|
+
// Strips leading and trailing spaces (the ' ' character only) from string.
// Returns string itself when there is nothing to strip (this includes the empty string),
// otherwise a new string holding the trimmed content (which is empty if string consisted solely of spaces).
// Raises, via StringValue, if string is nil or cannot be converted to a string.
inline VALUE _Wikitext_parser_trim_link_target(VALUE string)
{
    string          = StringValue(string);
    char    *start  = RSTRING_PTR(string);
    char    *end    = start + RSTRING_LEN(string);
    char    *left   = start;    // will point at the first character to keep
    char    *right  = end;      // will point just past the last character to keep

    while (left < end && *left == ' ')
        left++;
    // stopping at "left" keeps the length non-negative even when the whole string is spaces
    while (right > left && right[-1] == ' ')
        right--;

    if (left == start && right == end)
        return string;
    return rb_str_new(left, right - left);
}
|
548
|
+
|
549
|
+
// - non-printable (non-ASCII) characters converted to numeric entities
|
550
|
+
// - QUOT and AMP characters converted to named entities
|
551
|
+
// - leading and trailing whitespace trimmed if trim is Qtrue
|
552
|
+
inline VALUE _Wikitext_parser_sanitize_link_target(VALUE string, VALUE trim)
|
553
|
+
{
|
554
|
+
string = StringValue(string); // raises if string is nil or doesn't quack like a string
|
555
|
+
char *src = RSTRING_PTR(string);
|
556
|
+
char *start = src; // remember this so we can check if we're at the start
|
557
|
+
long len = RSTRING_LEN(string);
|
558
|
+
char *end = src + len;
|
559
|
+
|
560
|
+
// start with a destination buffer twice the size of the source, will realloc if necessary
|
561
|
+
// slop = (len / 8) * 8 (ie. one in every 8 characters can be converted into an entity, each entity requires 8 bytes)
|
562
|
+
// this efficiently handles the most common case (where the size of the buffer doesn't change much)
|
563
|
+
char *dest = ALLOC_N(char, len * 2);
|
564
|
+
char *dest_ptr = dest; // hang on to this so we can pass it to free() later
|
565
|
+
char *non_space = dest; // remember last non-space character output
|
566
|
+
while (src < end)
|
567
|
+
{
|
568
|
+
// need at most 8 characters (8 bytes) to display each character
|
569
|
+
if (dest + 8 > dest_ptr + len) // outgrowing buffer, must reallocate
|
570
|
+
{
|
571
|
+
char *old_dest = dest;
|
572
|
+
char *old_dest_ptr = dest_ptr;
|
573
|
+
len = len + (end - src) * 8; // allocate enough for worst case
|
574
|
+
dest = realloc(dest_ptr, len); // will never have to realloc more than once
|
575
|
+
if (dest == NULL)
|
576
|
+
{
|
577
|
+
// would have used reallocf, but this has to run on Linux too, not just Darwin
|
578
|
+
free(dest_ptr);
|
579
|
+
rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
|
580
|
+
}
|
581
|
+
dest_ptr = dest;
|
582
|
+
dest = dest_ptr + (old_dest - old_dest_ptr);
|
583
|
+
non_space = dest_ptr + (non_space - old_dest_ptr);
|
584
|
+
}
|
585
|
+
|
586
|
+
if (*src == '"') // QUOT
|
587
|
+
{
|
588
|
+
char quot_entity_literal[] = { '&', 'q', 'u', 'o', 't', ';' }; // no trailing NUL
|
589
|
+
memcpy(dest, quot_entity_literal, sizeof(quot_entity_literal));
|
590
|
+
dest += sizeof(quot_entity_literal);
|
591
|
+
}
|
592
|
+
else if (*src == '&') // AMP
|
593
|
+
{
|
594
|
+
char amp_entity_literal[] = { '&', 'a', 'm', 'p', ';' }; // no trailing NUL
|
595
|
+
memcpy(dest, amp_entity_literal, sizeof(amp_entity_literal));
|
596
|
+
dest += sizeof(amp_entity_literal);
|
597
|
+
}
|
598
|
+
else if (*src == '<') // LESS_THAN
|
599
|
+
{
|
600
|
+
free(dest_ptr);
|
601
|
+
rb_raise(rb_eRangeError, "invalid link text (\"<\" may not appear in link text)");
|
602
|
+
}
|
603
|
+
else if (*src == '>') // GREATER_THAN
|
604
|
+
{
|
605
|
+
free(dest_ptr);
|
606
|
+
rb_raise(rb_eRangeError, "invalid link text (\">\" may not appear in link text)");
|
607
|
+
}
|
608
|
+
else if (*src == ' ' && src == start && trim == Qtrue)
|
609
|
+
start++; // we eat leading space
|
610
|
+
else if (*src >= 0x20 && *src <= 0x7e) // printable ASCII
|
611
|
+
{
|
612
|
+
*dest = *src;
|
613
|
+
dest++;
|
614
|
+
}
|
615
|
+
else // all others: must convert to entities
|
616
|
+
{
|
617
|
+
long width;
|
618
|
+
VALUE entity = _Wikitext_utf32_char_to_entity(_Wikitext_utf8_to_utf32(src, end, &width, dest_ptr));
|
619
|
+
char *entity_src = RSTRING_PTR(entity);
|
620
|
+
long entity_len = RSTRING_LEN(entity); // should always be 8 characters (8 bytes)
|
621
|
+
memcpy(dest, entity_src, entity_len);
|
622
|
+
dest += entity_len;
|
623
|
+
src += width;
|
624
|
+
non_space = dest;
|
625
|
+
continue;
|
626
|
+
}
|
627
|
+
if (*src != ' ')
|
628
|
+
non_space = dest;
|
629
|
+
src++;
|
630
|
+
}
|
631
|
+
|
632
|
+
// trim trailing space if necessary
|
633
|
+
if (trim == Qtrue && non_space > dest_ptr && dest != non_space)
|
634
|
+
len = non_space - dest_ptr;
|
635
|
+
else
|
636
|
+
len = dest - dest_ptr;
|
637
|
+
VALUE out = rb_str_new(dest_ptr, len);
|
638
|
+
free(dest_ptr);
|
639
|
+
return out;
|
640
|
+
}
|
641
|
+
|
642
|
+
// Sanitizes string as a link target with trimming of leading and trailing spaces switched on.
// self is not used.
VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string)
{
    VALUE sanitized = _Wikitext_parser_sanitize_link_target(string, Qtrue);
    return sanitized;
}
|
646
|
+
|
647
|
+
// encodes the input string according to RFCs 2396 and 2718
|
648
|
+
// leading and trailing whitespace trimmed
|
649
|
+
// note that the first character of the target link is not case-sensitive
|
650
|
+
// (this is a recommended application-level constraint; it is not imposed at this level)
|
651
|
+
// this is to allow links like:
|
652
|
+
// ...the [[foo]] is...
|
653
|
+
// to be equivalent to:
|
654
|
+
// thing. [[Foo]] was...
|
655
|
+
// this is also where we check treat_slash_as_special is true and act accordingly
|
656
|
+
// basically any link target matching /\A[a-z]+\/\d+\z/ is flagged as special
|
657
|
+
inline static void _Wikitext_parser_encode_link_target(parser_t *parser)
|
658
|
+
{
|
659
|
+
VALUE in = StringValue(parser->link_target);
|
660
|
+
char *input = RSTRING_PTR(in);
|
661
|
+
char *start = input; // remember this so we can check if we're at the start
|
662
|
+
long len = RSTRING_LEN(in);
|
663
|
+
if (!(len > 0))
|
664
|
+
return;
|
665
|
+
char *end = input + len;
|
666
|
+
static char hex[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
|
667
|
+
|
668
|
+
// this potential shortcut requires an (admittedly cheap) prescan, so only do it when treat_slash_as_special is true
|
669
|
+
parser->special_link = Qfalse;
|
670
|
+
if (parser->treat_slash_as_special == Qtrue)
|
671
|
+
{
|
672
|
+
char *c = input; // \A
|
673
|
+
while (c < end && *c >= 'a' && *c <= 'z') // [a-z]
|
674
|
+
c++; // +
|
675
|
+
if (c > start && c < end && *c++ == '/') // \/
|
676
|
+
{
|
677
|
+
while (c < end && *c >= '0' && *c <= '9') // \d
|
678
|
+
{
|
679
|
+
c++; // +
|
680
|
+
if (c == end) // \z
|
681
|
+
{
|
682
|
+
// matches /\A[a-z]+\/\d+\z/ so no transformation required
|
683
|
+
parser->special_link = Qtrue;
|
684
|
+
return;
|
685
|
+
}
|
686
|
+
}
|
687
|
+
}
|
688
|
+
}
|
689
|
+
|
690
|
+
// to avoid most reallocations start with a destination buffer twice the size of the source
|
691
|
+
// this handles the most common case (where most chars are in the ASCII range and don't require more storage, but there are
|
692
|
+
// often quite a few spaces, which are encoded as "%20" and occupy 3 bytes)
|
693
|
+
// the worst case is where _every_ byte must be written out using 3 bytes
|
694
|
+
long dest_len = len * 2;
|
695
|
+
char *dest = ALLOC_N(char, dest_len);
|
696
|
+
char *dest_ptr = dest; // hang on to this so we can pass it to free() later
|
697
|
+
char *non_space = dest; // remember last non-space character output
|
698
|
+
for (; input < end; input++)
|
699
|
+
{
|
700
|
+
if ((dest + 3) > (dest_ptr + dest_len)) // worst case: a single character may grow to 3 characters once encoded
|
701
|
+
{
|
702
|
+
// outgrowing buffer, must reallocate
|
703
|
+
char *old_dest = dest;
|
704
|
+
char *old_dest_ptr = dest_ptr;
|
705
|
+
dest_len += len;
|
706
|
+
dest = realloc(dest_ptr, dest_len);
|
707
|
+
if (dest == NULL)
|
708
|
+
{
|
709
|
+
// would have used reallocf, but this has to run on Linux too, not just Darwin
|
710
|
+
free(dest_ptr);
|
711
|
+
rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
|
712
|
+
}
|
713
|
+
dest_ptr = dest;
|
714
|
+
dest = dest_ptr + (old_dest - old_dest_ptr);
|
715
|
+
non_space = dest_ptr + (non_space - old_dest_ptr);
|
716
|
+
}
|
717
|
+
|
718
|
+
// pass through unreserved characters
|
719
|
+
if (((*input >= 'a') && (*input <= 'z')) ||
|
720
|
+
((*input >= 'A') && (*input <= 'Z')) ||
|
721
|
+
((*input >= '0') && (*input <= '9')) ||
|
722
|
+
(*input == '-') ||
|
723
|
+
(*input == '_') ||
|
724
|
+
(*input == '.') ||
|
725
|
+
(*input == '~'))
|
726
|
+
{
|
727
|
+
*dest++ = *input;
|
728
|
+
non_space = dest;
|
729
|
+
}
|
730
|
+
else if (*input == ' ' && input == start)
|
731
|
+
start++; // we eat leading space
|
732
|
+
else // everything else gets URL-encoded
|
733
|
+
{
|
734
|
+
*dest++ = '%';
|
735
|
+
*dest++ = hex[(unsigned char)(*input) / 16]; // left
|
736
|
+
*dest++ = hex[(unsigned char)(*input) % 16]; // right
|
737
|
+
if (*input != ' ')
|
738
|
+
non_space = dest;
|
739
|
+
}
|
740
|
+
}
|
741
|
+
|
742
|
+
// trim trailing space if necessary
|
743
|
+
if (non_space > dest_ptr && dest - 1 != non_space)
|
744
|
+
dest_len = non_space - dest_ptr;
|
745
|
+
else
|
746
|
+
dest_len = dest - dest_ptr;
|
747
|
+
parser->link_target = rb_str_new(dest_ptr, dest_len);
|
748
|
+
free(dest_ptr);
|
749
|
+
}
|
750
|
+
|
751
|
+
// Runs _Wikitext_parser_encode_link_target over in using a throwaway parser struct with treat_slash_as_special
// turned off, and returns whatever the struct's link_target holds afterwards. self is not used.
VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in)
{
    parser_t scratch;
    parser_t *p = &scratch;
    p->treat_slash_as_special   = Qfalse;
    p->link_target              = in;
    _Wikitext_parser_encode_link_target(p);
    return p->link_target;
}
|
759
|
+
|
760
|
+
// this method exposed for testing only
|
761
|
+
VALUE Wikitext_parser_encode_special_link_target(VALUE self, VALUE in)
|
762
|
+
{
|
763
|
+
parser_t parser;
|
764
|
+
parser.link_target = in;
|
765
|
+
parser.treat_slash_as_special = Qtrue;
|
766
|
+
_Wikitext_parser_encode_link_target(&parser);
|
767
|
+
return parser.link_target;
|
768
|
+
}
|
769
|
+
|
770
|
+
// not sure whether these rollback functions should be inline: could refactor them into a single non-inlined function
|
771
|
+
inline void _Wikitext_rollback_failed_link(parser_t *parser)
|
772
|
+
{
|
773
|
+
if (!IN(LINK_START))
|
774
|
+
return; // nothing to do!
|
775
|
+
int scope_includes_separator = IN(SEPARATOR);
|
776
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, LINK_START, Qtrue);
|
777
|
+
rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
|
778
|
+
if (!NIL_P(parser->link_target))
|
779
|
+
{
|
780
|
+
VALUE sanitized = _Wikitext_parser_sanitize_link_target(parser->link_target, Qfalse);
|
781
|
+
rb_str_append(parser->output, sanitized);
|
782
|
+
if (scope_includes_separator)
|
783
|
+
{
|
784
|
+
rb_str_cat(parser->output, separator, sizeof(separator) - 1);
|
785
|
+
if (!NIL_P(parser->link_text))
|
786
|
+
rb_str_append(parser->output, parser->link_text);
|
787
|
+
}
|
788
|
+
}
|
789
|
+
parser->capture = Qnil;
|
790
|
+
parser->link_target = Qnil;
|
791
|
+
parser->link_text = Qnil;
|
792
|
+
}
|
793
|
+
|
794
|
+
// Abandons an external link that is still open. Does nothing unless EXT_LINK_START is in scope. Otherwise pops the
// scope stack up to EXT_LINK_START and appends to the output: the ext_link_start text, the link target (first
// replaced by the result of _Wikitext_hyperlink when autolink is Qtrue), and -- only if SPACE was in scope before
// the pop -- the space text followed by the link text.
// The capture, link_target and link_text fields are then reset to nil.
inline void _Wikitext_rollback_failed_external_link(parser_t *parser)
{
    if (IN(EXT_LINK_START))
    {
        // must be sampled before the stack is popped
        int had_space = IN(SPACE);
        _Wikitext_pop_from_stack_up_to(parser, Qnil, EXT_LINK_START, Qtrue);
        rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);

        if (!NIL_P(parser->link_target))
        {
            if (parser->autolink == Qtrue)
            {
                VALUE target = parser->link_target;
                parser->link_target = _Wikitext_hyperlink(Qnil, target, target, parser->external_link_class);
            }
            rb_str_append(parser->output, parser->link_target);

            if (had_space)
            {
                rb_str_cat(parser->output, space, sizeof(space) - 1);
                VALUE text = parser->link_text;
                if (!NIL_P(text))
                    rb_str_append(parser->output, text);
            }
        }

        parser->capture     = Qnil;
        parser->link_target = Qnil;
        parser->link_text   = Qnil;
    }
}
|
817
|
+
|
818
|
+
// Sets the default values of the parser's instance variables and returns self.
VALUE Wikitext_parser_initialize(VALUE self)
{
    // no need to call super here; rb_call_super()

    // string-valued defaults, listed in the order in which they are assigned
    static const struct
    {
        const char *ivar;
        const char *value;
    } string_defaults[] =
    {
        { "@line_ending",           "\n"        },
        { "@external_link_class",   "external"  },
        { "@mailto_class",          "mailto"    },
        { "@internal_link_prefix",  "/wiki/"    }
    };
    const int string_default_count = (int)(sizeof(string_defaults) / sizeof(string_defaults[0]));

    rb_iv_set(self, "@autolink", Qtrue);
    for (int idx = 0; idx < string_default_count; idx++)
        rb_iv_set(self, string_defaults[idx].ivar, rb_str_new2(string_defaults[idx].value));
    rb_iv_set(self, "@treat_slash_as_special", Qtrue);
    return self;
}
|
829
|
+
|
830
|
+
// Parses string 100000 times in a row, discarding the output each time, so that the parser can be profiled.
// Always returns nil.
VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string)
{
    for (int i = 0; i < 100000; i++)
        Wikitext_parser_parse(1, &string, self);

    // a function returning VALUE must return one: falling off the end would hand an indeterminate value to the caller
    return Qnil;
}
|
835
|
+
|
836
|
+
VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self)
|
837
|
+
{
|
838
|
+
// process arguments
|
839
|
+
VALUE string, options;
|
840
|
+
if (rb_scan_args(argc, argv, "11", &string, &options) == 1) // 1 mandatory argument, 1 optional argument
|
841
|
+
options = Qnil;
|
842
|
+
if (NIL_P(string))
|
843
|
+
return Qnil;
|
844
|
+
string = StringValue(string);
|
845
|
+
|
846
|
+
// process options hash
|
847
|
+
int base_indent = 0;
|
848
|
+
VALUE indent = Qnil;
|
849
|
+
if (!NIL_P(options) && TYPE(options) == T_HASH)
|
850
|
+
{
|
851
|
+
indent = rb_hash_aref(options, ID2SYM(rb_intern("indent")));
|
852
|
+
base_indent = NUM2INT(indent);
|
853
|
+
if (base_indent < 0)
|
854
|
+
base_indent = 0;
|
855
|
+
}
|
856
|
+
|
857
|
+
// set up scanner
|
858
|
+
char *p = RSTRING_PTR(string);
|
859
|
+
long len = RSTRING_LEN(string);
|
860
|
+
char *pe = p + len;
|
861
|
+
|
862
|
+
// access these once per parse
|
863
|
+
VALUE line_ending = rb_iv_get(self, "@line_ending");
|
864
|
+
line_ending = StringValue(line_ending);
|
865
|
+
VALUE link_class = rb_iv_get(self, "@external_link_class");
|
866
|
+
link_class = NIL_P(link_class) ? Qnil : StringValue(link_class);
|
867
|
+
VALUE mailto_class = rb_iv_get(self, "@mailto_class");
|
868
|
+
mailto_class = NIL_P(mailto_class) ? Qnil : StringValue(mailto_class);
|
869
|
+
VALUE prefix = rb_iv_get(self, "@internal_link_prefix");
|
870
|
+
|
871
|
+
// set up parser struct to make passing parameters a little easier
|
872
|
+
// eventually this will encapsulate most or all of the variables above
|
873
|
+
parser_t _parser;
|
874
|
+
parser_t *parser = &_parser;
|
875
|
+
parser->output = rb_str_new2("");
|
876
|
+
parser->capture = Qnil;
|
877
|
+
parser->link_target = Qnil;
|
878
|
+
parser->link_text = Qnil;
|
879
|
+
parser->external_link_class = link_class;
|
880
|
+
parser->scope = ary_new();
|
881
|
+
parser->line = ary_new();
|
882
|
+
parser->line_buffer = ary_new();
|
883
|
+
parser->pending_crlf = Qfalse;
|
884
|
+
parser->autolink = rb_iv_get(self, "@autolink");
|
885
|
+
parser->treat_slash_as_special = rb_iv_get(self, "@treat_slash_as_special");
|
886
|
+
parser->special_link = Qfalse;
|
887
|
+
parser->line_ending = str_new_from_string(line_ending);
|
888
|
+
parser->base_indent = base_indent;
|
889
|
+
parser->current_indent = 0;
|
890
|
+
parser->tabulation = NULL;
|
891
|
+
|
892
|
+
token_t _token;
|
893
|
+
_token.type = NO_TOKEN;
|
894
|
+
token_t *token = NULL;
|
895
|
+
do
|
896
|
+
{
|
897
|
+
// note that whenever we grab a token we push it into the line buffer
|
898
|
+
// this provides us with context-sensitive "memory" of what's been seen so far on this line
|
899
|
+
#define NEXT_TOKEN() token = &_token, next_token(token, token, NULL, pe), ary_push(parser->line_buffer, token->type)
|
900
|
+
|
901
|
+
// check to see if we have a token hanging around from a previous iteration of this loop
|
902
|
+
if (token == NULL)
|
903
|
+
{
|
904
|
+
if (_token.type == NO_TOKEN)
|
905
|
+
{
|
906
|
+
// first time here (haven't started scanning yet)
|
907
|
+
token = &_token;
|
908
|
+
next_token(token, NULL, p, pe);
|
909
|
+
ary_push(parser->line_buffer, token->type);
|
910
|
+
}
|
911
|
+
else
|
912
|
+
// already scanning
|
913
|
+
NEXT_TOKEN();
|
914
|
+
}
|
915
|
+
int type = token->type;
|
916
|
+
|
917
|
+
// many restrictions depend on what is at the top of the stack
|
918
|
+
int top = ary_entry(parser->scope, -1);
|
919
|
+
|
920
|
+
// can't declare new variables inside a switch statement, so predeclare them here
|
921
|
+
long remove_strong = -1;
|
922
|
+
long remove_em = -1;
|
923
|
+
|
924
|
+
// general purpose counters and flags
|
925
|
+
long i = 0;
|
926
|
+
long j = 0;
|
927
|
+
long k = 0;
|
928
|
+
|
929
|
+
// The following giant switch statement contains cases for all the possible token types.
|
930
|
+
// In the most basic sense we are emitting the HTML that corresponds to each token,
|
931
|
+
// but some tokens require context information in order to decide what to output.
|
932
|
+
// For example, does the STRONG token (''') translate to <strong> or </strong>?
|
933
|
+
// So when looking at any given token we have three state-maintaining variables which gives us a notion of "where we are":
|
934
|
+
//
|
935
|
+
// - the "scope" stack (indicates what HTML DOM structures we are currently nested inside, similar to a CSS selector)
|
936
|
+
// - the line buffer (records tokens seen so far on the current line)
|
937
|
+
// - the line "scope" stack (indicates what the scope should be based only on what is visible on the line so far)
|
938
|
+
//
|
939
|
+
// Although this is fairly complicated, there is one key simplifying factor:
|
940
|
+
// The translator continuously performs auto-correction, and this means that we always have a guarantee that the
|
941
|
+
// scope stack (up to the current token) is valid; our translator can take this as a given.
|
942
|
+
// Auto-correction basically consists of inserting missing tokens (preventing subsequent HTML from being messed up),
|
943
|
+
// or converting illegal (unexpected) tokens to their plain text equivalents (providing visual feedback to Wikitext author).
|
944
|
+
switch (type)
|
945
|
+
{
|
946
|
+
case PRE:
|
947
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
948
|
+
{
|
949
|
+
rb_str_cat(parser->output, space, sizeof(space) - 1);
|
950
|
+
break;
|
951
|
+
}
|
952
|
+
else if (IN(BLOCKQUOTE_START))
|
953
|
+
{
|
954
|
+
// this kind of nesting not allowed (to avoid user confusion)
|
955
|
+
_Wikitext_pop_excess_elements(parser);
|
956
|
+
_Wikitext_start_para_if_necessary(parser);
|
957
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
958
|
+
rb_str_cat(i, space, sizeof(space) - 1);
|
959
|
+
break;
|
960
|
+
}
|
961
|
+
|
962
|
+
// count number of BLOCKQUOTE tokens in line buffer and in scope stack
|
963
|
+
ary_push(parser->line, PRE);
|
964
|
+
i = ary_count(parser->line, BLOCKQUOTE);
|
965
|
+
j = ary_count(parser->scope, BLOCKQUOTE);
|
966
|
+
if (i < j)
|
967
|
+
{
|
968
|
+
// must pop (reduce nesting level)
|
969
|
+
for (i = j - i; i > 0; i--)
|
970
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
|
971
|
+
}
|
972
|
+
|
973
|
+
if (!IN(PRE))
|
974
|
+
{
|
975
|
+
parser->pending_crlf = Qfalse;
|
976
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
977
|
+
_Wikitext_indent(parser);
|
978
|
+
rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
|
979
|
+
ary_push(parser->scope, PRE);
|
980
|
+
}
|
981
|
+
break;
|
982
|
+
|
983
|
+
case PRE_START:
|
984
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
985
|
+
rb_str_cat(parser->output, escaped_pre_start, sizeof(escaped_pre_start) - 1);
|
986
|
+
else if (IN(BLOCKQUOTE_START))
|
987
|
+
{
|
988
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
989
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
990
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
991
|
+
_Wikitext_indent(parser);
|
992
|
+
rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
|
993
|
+
ary_push(parser->scope, PRE_START);
|
994
|
+
ary_push(parser->line, PRE_START);
|
995
|
+
}
|
996
|
+
else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
|
997
|
+
{
|
998
|
+
// would be nice to eliminate the repetition here but it's probably the clearest way
|
999
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
1000
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1001
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
|
1002
|
+
_Wikitext_indent(parser);
|
1003
|
+
rb_str_cat(parser->output, pre_start, sizeof(pre_start) - 1);
|
1004
|
+
ary_push(parser->scope, PRE_START);
|
1005
|
+
ary_push(parser->line, PRE_START);
|
1006
|
+
}
|
1007
|
+
else
|
1008
|
+
{
|
1009
|
+
// everywhere else, PRE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
|
1010
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1011
|
+
_Wikitext_pop_excess_elements(parser);
|
1012
|
+
_Wikitext_start_para_if_necessary(parser);
|
1013
|
+
rb_str_cat(i, escaped_pre_start, sizeof(escaped_pre_start) - 1);
|
1014
|
+
}
|
1015
|
+
break;
|
1016
|
+
|
1017
|
+
case PRE_END:
|
1018
|
+
if (IN(NO_WIKI_START) || IN(PRE))
|
1019
|
+
rb_str_cat(parser->output, escaped_pre_end, sizeof(escaped_pre_end) - 1);
|
1020
|
+
else
|
1021
|
+
{
|
1022
|
+
if (IN(PRE_START))
|
1023
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, PRE_START, Qtrue);
|
1024
|
+
else
|
1025
|
+
{
|
1026
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1027
|
+
_Wikitext_pop_excess_elements(parser);
|
1028
|
+
_Wikitext_start_para_if_necessary(parser);
|
1029
|
+
rb_str_cat(i, escaped_pre_end, sizeof(escaped_pre_end) - 1);
|
1030
|
+
}
|
1031
|
+
}
|
1032
|
+
break;
|
1033
|
+
|
1034
|
+
case BLOCKQUOTE:
|
1035
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
1036
|
+
// no need to check for <pre>; can never appear inside it
|
1037
|
+
rb_str_cat(parser->output, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit ">" or "> "
|
1038
|
+
else if (IN(BLOCKQUOTE_START))
|
1039
|
+
{
|
1040
|
+
// this kind of nesting not allowed (to avoid user confusion)
|
1041
|
+
_Wikitext_pop_excess_elements(parser);
|
1042
|
+
_Wikitext_start_para_if_necessary(parser);
|
1043
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1044
|
+
rb_str_cat(i, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit ">" or "> "
|
1045
|
+
break;
|
1046
|
+
}
|
1047
|
+
else
|
1048
|
+
{
|
1049
|
+
ary_push(parser->line, BLOCKQUOTE);
|
1050
|
+
|
1051
|
+
// count number of BLOCKQUOTE tokens in line buffer and in scope stack
|
1052
|
+
i = ary_count(parser->line, BLOCKQUOTE);
|
1053
|
+
j = ary_count(parser->scope, BLOCKQUOTE);
|
1054
|
+
|
1055
|
+
// given that BLOCKQUOTE tokens can be nested, peek ahead and see if there are any more which might affect the decision to push or pop
|
1056
|
+
while (NEXT_TOKEN(), (token->type == BLOCKQUOTE))
|
1057
|
+
{
|
1058
|
+
ary_push(parser->line, BLOCKQUOTE);
|
1059
|
+
i++;
|
1060
|
+
}
|
1061
|
+
|
1062
|
+
// now decide whether to push, pop or do nothing
|
1063
|
+
if (i > j)
|
1064
|
+
{
|
1065
|
+
// must push (increase nesting level)
|
1066
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
1067
|
+
for (i = i - j; i > 0; i--)
|
1068
|
+
{
|
1069
|
+
_Wikitext_indent(parser);
|
1070
|
+
rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
|
1071
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1072
|
+
ary_push(parser->scope, BLOCKQUOTE);
|
1073
|
+
}
|
1074
|
+
}
|
1075
|
+
else if (i < j)
|
1076
|
+
{
|
1077
|
+
// must pop (reduce nesting level)
|
1078
|
+
for (i = j - i; i > 0; i--)
|
1079
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
|
1080
|
+
}
|
1081
|
+
|
1082
|
+
// jump to top of the loop to process token we scanned during lookahead
|
1083
|
+
continue;
|
1084
|
+
}
|
1085
|
+
break;
|
1086
|
+
|
1087
|
+
case BLOCKQUOTE_START:
|
1088
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1089
|
+
rb_str_cat(parser->output, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
|
1090
|
+
else if (IN(BLOCKQUOTE_START))
|
1091
|
+
{
|
1092
|
+
// nesting is fine here
|
1093
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
1094
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1095
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
1096
|
+
_Wikitext_indent(parser);
|
1097
|
+
rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
|
1098
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1099
|
+
ary_push(parser->scope, BLOCKQUOTE_START);
|
1100
|
+
ary_push(parser->line, BLOCKQUOTE_START);
|
1101
|
+
}
|
1102
|
+
else if (parser->scope->count == 0 || (IN(P) && !IN(BLOCKQUOTE)))
|
1103
|
+
{
|
1104
|
+
// would be nice to eliminate the repetition here but it's probably the clearest way
|
1105
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
1106
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1107
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, P, Qtrue);
|
1108
|
+
_Wikitext_indent(parser);
|
1109
|
+
rb_str_cat(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
|
1110
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1111
|
+
ary_push(parser->scope, BLOCKQUOTE_START);
|
1112
|
+
ary_push(parser->line, BLOCKQUOTE_START);
|
1113
|
+
}
|
1114
|
+
else
|
1115
|
+
{
|
1116
|
+
// everywhere else, BLOCKQUOTE_START is illegal (in LI, BLOCKQUOTE, H1_START etc)
|
1117
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1118
|
+
_Wikitext_pop_excess_elements(parser);
|
1119
|
+
_Wikitext_start_para_if_necessary(parser);
|
1120
|
+
rb_str_cat(i, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
|
1121
|
+
}
|
1122
|
+
break;
|
1123
|
+
|
1124
|
+
case BLOCKQUOTE_END:
|
1125
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1126
|
+
rb_str_cat(parser->output, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
|
1127
|
+
else
|
1128
|
+
{
|
1129
|
+
if (IN(BLOCKQUOTE_START))
|
1130
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, BLOCKQUOTE_START, Qtrue);
|
1131
|
+
else
|
1132
|
+
{
|
1133
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1134
|
+
_Wikitext_pop_excess_elements(parser);
|
1135
|
+
_Wikitext_start_para_if_necessary(parser);
|
1136
|
+
rb_str_cat(i, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
|
1137
|
+
}
|
1138
|
+
}
|
1139
|
+
break;
|
1140
|
+
|
1141
|
+
case NO_WIKI_START:
|
1142
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1143
|
+
rb_str_cat(parser->output, escaped_no_wiki_start, sizeof(escaped_no_wiki_start) - 1);
|
1144
|
+
else
|
1145
|
+
{
|
1146
|
+
_Wikitext_pop_excess_elements(parser);
|
1147
|
+
_Wikitext_start_para_if_necessary(parser);
|
1148
|
+
ary_push(parser->scope, NO_WIKI_START);
|
1149
|
+
ary_push(parser->line, NO_WIKI_START);
|
1150
|
+
}
|
1151
|
+
break;
|
1152
|
+
|
1153
|
+
case NO_WIKI_END:
|
1154
|
+
if (IN(NO_WIKI_START))
|
1155
|
+
// <nowiki> should always only ever be the last item in the stack, but use the helper routine just in case
|
1156
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, NO_WIKI_START, Qtrue);
|
1157
|
+
else
|
1158
|
+
{
|
1159
|
+
_Wikitext_pop_excess_elements(parser);
|
1160
|
+
_Wikitext_start_para_if_necessary(parser);
|
1161
|
+
rb_str_cat(parser->output, escaped_no_wiki_end, sizeof(escaped_no_wiki_end) - 1);
|
1162
|
+
}
|
1163
|
+
break;
|
1164
|
+
|
1165
|
+
case STRONG_EM:
|
1166
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1167
|
+
{
|
1168
|
+
rb_str_cat(parser->output, literal_strong_em, sizeof(literal_strong_em) - 1);
|
1169
|
+
break;
|
1170
|
+
}
|
1171
|
+
|
1172
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1173
|
+
_Wikitext_pop_excess_elements(parser);
|
1174
|
+
|
1175
|
+
// if you've seen STRONG/STRONG_START or EM/EM_START, must close them in the reverse order that you saw them!
|
1176
|
+
// otherwise, must open them
|
1177
|
+
remove_strong = -1;
|
1178
|
+
remove_em = -1;
|
1179
|
+
j = parser->scope->count;
|
1180
|
+
for (j = j - 1; j >= 0; j--)
|
1181
|
+
{
|
1182
|
+
int val = ary_entry(parser->scope, j);
|
1183
|
+
if (val == STRONG || val == STRONG_START)
|
1184
|
+
{
|
1185
|
+
rb_str_cat(i, strong_end, sizeof(strong_end) - 1);
|
1186
|
+
remove_strong = j;
|
1187
|
+
}
|
1188
|
+
else if (val == EM || val == EM_START)
|
1189
|
+
{
|
1190
|
+
rb_str_cat(i, em_end, sizeof(em_end) - 1);
|
1191
|
+
remove_em = j;
|
1192
|
+
}
|
1193
|
+
}
|
1194
|
+
|
1195
|
+
if (remove_strong > remove_em) // must remove strong first
|
1196
|
+
{
|
1197
|
+
ary_pop(parser->scope);
|
1198
|
+
if (remove_em > -1)
|
1199
|
+
ary_pop(parser->scope);
|
1200
|
+
else // there was no em to remove!, so consider this an opening em tag
|
1201
|
+
{
|
1202
|
+
rb_str_cat(i, em_start, sizeof(em_start) - 1);
|
1203
|
+
ary_push(parser->scope, EM);
|
1204
|
+
ary_push(parser->line, EM);
|
1205
|
+
}
|
1206
|
+
}
|
1207
|
+
else if (remove_em > remove_strong) // must remove em first
|
1208
|
+
{
|
1209
|
+
ary_pop(parser->scope);
|
1210
|
+
if (remove_strong > -1)
|
1211
|
+
ary_pop(parser->scope);
|
1212
|
+
else // there was no strong to remove!, so consider this an opening strong tag
|
1213
|
+
{
|
1214
|
+
rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
|
1215
|
+
ary_push(parser->scope, STRONG);
|
1216
|
+
ary_push(parser->line, STRONG);
|
1217
|
+
}
|
1218
|
+
}
|
1219
|
+
else // no strong or em to remove, so this must be a new opening of both
|
1220
|
+
{
|
1221
|
+
_Wikitext_start_para_if_necessary(parser);
|
1222
|
+
rb_str_cat(i, strong_em_start, sizeof(strong_em_start) - 1);
|
1223
|
+
ary_push(parser->scope, STRONG);
|
1224
|
+
ary_push(parser->line, STRONG);
|
1225
|
+
ary_push(parser->scope, EM);
|
1226
|
+
ary_push(parser->line, EM);
|
1227
|
+
}
|
1228
|
+
break;
|
1229
|
+
|
1230
|
+
case STRONG:
|
1231
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1232
|
+
rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
|
1233
|
+
else
|
1234
|
+
{
|
1235
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1236
|
+
if (IN(STRONG_START))
|
1237
|
+
// already in span started with <strong>, no choice but to emit this literally
|
1238
|
+
rb_str_cat(parser->output, literal_strong, sizeof(literal_strong) - 1);
|
1239
|
+
else if (IN(STRONG))
|
1240
|
+
// STRONG already seen, this is a closing tag
|
1241
|
+
_Wikitext_pop_from_stack_up_to(parser, i, STRONG, Qtrue);
|
1242
|
+
else
|
1243
|
+
{
|
1244
|
+
// this is a new opening
|
1245
|
+
_Wikitext_pop_excess_elements(parser);
|
1246
|
+
_Wikitext_start_para_if_necessary(parser);
|
1247
|
+
rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
|
1248
|
+
ary_push(parser->scope, STRONG);
|
1249
|
+
ary_push(parser->line, STRONG);
|
1250
|
+
}
|
1251
|
+
}
|
1252
|
+
break;
|
1253
|
+
|
1254
|
+
case STRONG_START:
|
1255
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1256
|
+
rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
|
1257
|
+
else
|
1258
|
+
{
|
1259
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1260
|
+
if (IN(STRONG_START) || IN(STRONG))
|
1261
|
+
rb_str_cat(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
|
1262
|
+
else
|
1263
|
+
{
|
1264
|
+
_Wikitext_pop_excess_elements(parser);
|
1265
|
+
_Wikitext_start_para_if_necessary(parser);
|
1266
|
+
rb_str_cat(i, strong_start, sizeof(strong_start) - 1);
|
1267
|
+
ary_push(parser->scope, STRONG_START);
|
1268
|
+
ary_push(parser->line, STRONG_START);
|
1269
|
+
}
|
1270
|
+
}
|
1271
|
+
break;
|
1272
|
+
|
1273
|
+
case STRONG_END:
|
1274
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1275
|
+
rb_str_cat(parser->output, escaped_strong_end, sizeof(escaped_strong_end) - 1);
|
1276
|
+
else
|
1277
|
+
{
|
1278
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1279
|
+
if (IN(STRONG_START))
|
1280
|
+
_Wikitext_pop_from_stack_up_to(parser, i, STRONG_START, Qtrue);
|
1281
|
+
else
|
1282
|
+
{
|
1283
|
+
// no STRONG_START in scope, so must interpret the STRONG_END without any special meaning
|
1284
|
+
_Wikitext_pop_excess_elements(parser);
|
1285
|
+
_Wikitext_start_para_if_necessary(parser);
|
1286
|
+
rb_str_cat(i, escaped_strong_end, sizeof(escaped_strong_end) - 1);
|
1287
|
+
}
|
1288
|
+
}
|
1289
|
+
break;
|
1290
|
+
|
1291
|
+
case EM:
|
1292
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1293
|
+
rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
|
1294
|
+
else
|
1295
|
+
{
|
1296
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1297
|
+
if (IN(EM_START))
|
1298
|
+
// already in span started with <em>, no choice but to emit this literally
|
1299
|
+
rb_str_cat(parser->output, literal_em, sizeof(literal_em) - 1);
|
1300
|
+
else if (IN(EM))
|
1301
|
+
// EM already seen, this is a closing tag
|
1302
|
+
_Wikitext_pop_from_stack_up_to(parser, i, EM, Qtrue);
|
1303
|
+
else
|
1304
|
+
{
|
1305
|
+
// this is a new opening
|
1306
|
+
_Wikitext_pop_excess_elements(parser);
|
1307
|
+
_Wikitext_start_para_if_necessary(parser);
|
1308
|
+
rb_str_cat(i, em_start, sizeof(em_start) - 1);
|
1309
|
+
ary_push(parser->scope, EM);
|
1310
|
+
ary_push(parser->line, EM);
|
1311
|
+
}
|
1312
|
+
}
|
1313
|
+
break;
|
1314
|
+
|
1315
|
+
case EM_START:
|
1316
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1317
|
+
rb_str_cat(parser->output, escaped_em_start, sizeof(escaped_em_start) - 1);
|
1318
|
+
else
|
1319
|
+
{
|
1320
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1321
|
+
if (IN(EM_START) || IN(EM))
|
1322
|
+
rb_str_cat(i, escaped_em_start, sizeof(escaped_em_start) - 1);
|
1323
|
+
else
|
1324
|
+
{
|
1325
|
+
_Wikitext_pop_excess_elements(parser);
|
1326
|
+
_Wikitext_start_para_if_necessary(parser);
|
1327
|
+
rb_str_cat(i, em_start, sizeof(em_start) - 1);
|
1328
|
+
ary_push(parser->scope, EM_START);
|
1329
|
+
ary_push(parser->line, EM_START);
|
1330
|
+
}
|
1331
|
+
}
|
1332
|
+
break;
|
1333
|
+
|
1334
|
+
case EM_END:
|
1335
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1336
|
+
rb_str_cat(parser->output, escaped_em_end, sizeof(escaped_em_end) - 1);
|
1337
|
+
else
|
1338
|
+
{
|
1339
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1340
|
+
if (IN(EM_START))
|
1341
|
+
_Wikitext_pop_from_stack_up_to(parser, i, EM_START, Qtrue);
|
1342
|
+
else
|
1343
|
+
{
|
1344
|
+
// no EM_START in scope, so must interpret the EM_END without any special meaning
|
1345
|
+
_Wikitext_pop_excess_elements(parser);
|
1346
|
+
_Wikitext_start_para_if_necessary(parser);
|
1347
|
+
rb_str_cat(i, escaped_em_end, sizeof(escaped_em_end) - 1);
|
1348
|
+
}
|
1349
|
+
}
|
1350
|
+
break;
|
1351
|
+
|
1352
|
+
case TT:
|
1353
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1354
|
+
rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
|
1355
|
+
else
|
1356
|
+
{
|
1357
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1358
|
+
if (IN(TT_START))
|
1359
|
+
// already in span started with <tt>, no choice but to emit this literally
|
1360
|
+
rb_str_cat(parser->output, backtick, sizeof(backtick) - 1);
|
1361
|
+
else if (IN(TT))
|
1362
|
+
// TT (`) already seen, this is a closing tag
|
1363
|
+
_Wikitext_pop_from_stack_up_to(parser, i, TT, Qtrue);
|
1364
|
+
else
|
1365
|
+
{
|
1366
|
+
// this is a new opening
|
1367
|
+
_Wikitext_pop_excess_elements(parser);
|
1368
|
+
_Wikitext_start_para_if_necessary(parser);
|
1369
|
+
rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
|
1370
|
+
ary_push(parser->scope, TT);
|
1371
|
+
ary_push(parser->line, TT);
|
1372
|
+
}
|
1373
|
+
}
|
1374
|
+
break;
|
1375
|
+
|
1376
|
+
case TT_START:
|
1377
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1378
|
+
rb_str_cat(parser->output, escaped_tt_start, sizeof(escaped_tt_start) - 1);
|
1379
|
+
else
|
1380
|
+
{
|
1381
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1382
|
+
if (IN(TT_START) || IN(TT))
|
1383
|
+
rb_str_cat(i, escaped_tt_start, sizeof(escaped_tt_start) - 1);
|
1384
|
+
else
|
1385
|
+
{
|
1386
|
+
_Wikitext_pop_excess_elements(parser);
|
1387
|
+
_Wikitext_start_para_if_necessary(parser);
|
1388
|
+
rb_str_cat(i, tt_start, sizeof(tt_start) - 1);
|
1389
|
+
ary_push(parser->scope, TT_START);
|
1390
|
+
ary_push(parser->line, TT_START);
|
1391
|
+
}
|
1392
|
+
}
|
1393
|
+
break;
|
1394
|
+
|
1395
|
+
case TT_END:
|
1396
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1397
|
+
rb_str_cat(parser->output, escaped_tt_end, sizeof(escaped_tt_end) - 1);
|
1398
|
+
else
|
1399
|
+
{
|
1400
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1401
|
+
if (IN(TT_START))
|
1402
|
+
_Wikitext_pop_from_stack_up_to(parser, i, TT_START, Qtrue);
|
1403
|
+
else
|
1404
|
+
{
|
1405
|
+
// no TT_START in scope, so must interpret the TT_END without any special meaning
|
1406
|
+
_Wikitext_pop_excess_elements(parser);
|
1407
|
+
_Wikitext_start_para_if_necessary(parser);
|
1408
|
+
rb_str_cat(i, escaped_tt_end, sizeof(escaped_tt_end) - 1);
|
1409
|
+
}
|
1410
|
+
}
|
1411
|
+
break;
|
1412
|
+
|
1413
|
+
case OL:
|
1414
|
+
case UL:
|
1415
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
1416
|
+
{
|
1417
|
+
// no need to check for PRE; can never appear inside it
|
1418
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1419
|
+
break;
|
1420
|
+
}
|
1421
|
+
|
1422
|
+
// count number of tokens in line and scope stacks
|
1423
|
+
int bq_count = ary_count(parser->scope, BLOCKQUOTE_START);
|
1424
|
+
i = parser->line->count - ary_count(parser->line, BLOCKQUOTE_START);
|
1425
|
+
j = parser->scope->count - bq_count;
|
1426
|
+
k = i;
|
1427
|
+
|
1428
|
+
// list tokens can be nested so look ahead for any more which might affect the decision to push or pop
|
1429
|
+
for (;;)
|
1430
|
+
{
|
1431
|
+
type = token->type;
|
1432
|
+
if (type == OL || type == UL)
|
1433
|
+
{
|
1434
|
+
token = NULL;
|
1435
|
+
if (i - k >= 2) // already seen at least one OL or UL
|
1436
|
+
{
|
1437
|
+
ary_push(parser->line, NESTED_LIST); // which means this is a nested list
|
1438
|
+
i += 3;
|
1439
|
+
}
|
1440
|
+
else
|
1441
|
+
i += 2;
|
1442
|
+
ary_push(parser->line, type);
|
1443
|
+
ary_push(parser->line, LI);
|
1444
|
+
|
1445
|
+
// want to compare line with scope but can only do so if scope has enough items on it
|
1446
|
+
if (j >= i)
|
1447
|
+
{
|
1448
|
+
if (ary_entry(parser->scope, i + bq_count - 2) == type && ary_entry(parser->scope, i + bq_count - 1) == LI)
|
1449
|
+
{
|
1450
|
+
// line and scope match at this point: do nothing yet
|
1451
|
+
}
|
1452
|
+
else
|
1453
|
+
{
|
1454
|
+
// item just pushed onto line does not match corresponding slot of scope!
|
1455
|
+
for (; j >= i - 2; j--)
|
1456
|
+
// must pop back before emitting
|
1457
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1458
|
+
|
1459
|
+
// will emit UL or OL, then LI
|
1460
|
+
break;
|
1461
|
+
}
|
1462
|
+
}
|
1463
|
+
else // line stack size now exceeds scope stack size: must increase nesting level
|
1464
|
+
break; // will emit UL or OL, then LI
|
1465
|
+
}
|
1466
|
+
else
|
1467
|
+
{
|
1468
|
+
// not an OL or UL token!
|
1469
|
+
if (j == i)
|
1470
|
+
// must close existing LI and re-open new one
|
1471
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1472
|
+
else if (j > i)
|
1473
|
+
{
|
1474
|
+
// item just pushed onto line does not match corresponding slot of scope!
|
1475
|
+
for (; j >= i; j--)
|
1476
|
+
// must pop back before emitting
|
1477
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1478
|
+
}
|
1479
|
+
break;
|
1480
|
+
}
|
1481
|
+
NEXT_TOKEN();
|
1482
|
+
}
|
1483
|
+
|
1484
|
+
// will emit
|
1485
|
+
if (type == OL || type == UL)
|
1486
|
+
{
|
1487
|
+
// if LI is at the top of a stack this is the start of a nested list
|
1488
|
+
if (j > 0 && ary_entry(parser->scope, -1) == LI)
|
1489
|
+
{
|
1490
|
+
// so we should precede it with a CRLF, and indicate that it's a nested list
|
1491
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1492
|
+
ary_push(parser->scope, NESTED_LIST);
|
1493
|
+
}
|
1494
|
+
else
|
1495
|
+
{
|
1496
|
+
// this is a new list
|
1497
|
+
if (IN(BLOCKQUOTE_START))
|
1498
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
1499
|
+
else
|
1500
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
1501
|
+
}
|
1502
|
+
|
1503
|
+
// emit
|
1504
|
+
_Wikitext_indent(parser);
|
1505
|
+
if (type == OL)
|
1506
|
+
rb_str_cat(parser->output, ol_start, sizeof(ol_start) - 1);
|
1507
|
+
else if (type == UL)
|
1508
|
+
rb_str_cat(parser->output, ul_start, sizeof(ul_start) - 1);
|
1509
|
+
ary_push(parser->scope, type);
|
1510
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
1511
|
+
}
|
1512
|
+
else if (type == SPACE)
|
1513
|
+
// silently throw away the optional SPACE token after final list marker
|
1514
|
+
token = NULL;
|
1515
|
+
|
1516
|
+
_Wikitext_indent(parser);
|
1517
|
+
rb_str_cat(parser->output, li_start, sizeof(li_start) - 1);
|
1518
|
+
ary_push(parser->scope, LI);
|
1519
|
+
|
1520
|
+
// any subsequent UL or OL tokens on this line are syntax errors and must be emitted literally
|
1521
|
+
if (type == OL || type == UL)
|
1522
|
+
{
|
1523
|
+
k = 0;
|
1524
|
+
while (k++, NEXT_TOKEN(), (type = token->type))
|
1525
|
+
{
|
1526
|
+
if (type == OL || type == UL)
|
1527
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1528
|
+
else if (type == SPACE && k == 1)
|
1529
|
+
{
|
1530
|
+
// silently throw away the optional SPACE token after final list marker
|
1531
|
+
token = NULL;
|
1532
|
+
break;
|
1533
|
+
}
|
1534
|
+
else
|
1535
|
+
break;
|
1536
|
+
}
|
1537
|
+
}
|
1538
|
+
|
1539
|
+
// jump to top of the loop to process token we scanned during lookahead
|
1540
|
+
continue;
|
1541
|
+
|
1542
|
+
case H6_START:
|
1543
|
+
case H5_START:
|
1544
|
+
case H4_START:
|
1545
|
+
case H3_START:
|
1546
|
+
case H2_START:
|
1547
|
+
case H1_START:
|
1548
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
1549
|
+
{
|
1550
|
+
// no need to check for PRE; can never appear inside it
|
1551
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1552
|
+
break;
|
1553
|
+
}
|
1554
|
+
|
1555
|
+
// pop up to but not including the last BLOCKQUOTE on the scope stack
|
1556
|
+
if (IN(BLOCKQUOTE_START))
|
1557
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE_START, Qfalse);
|
1558
|
+
else
|
1559
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qfalse);
|
1560
|
+
|
1561
|
+
// count number of BLOCKQUOTE tokens in line buffer and in scope stack
|
1562
|
+
ary_push(parser->line, type);
|
1563
|
+
i = ary_count(parser->line, BLOCKQUOTE);
|
1564
|
+
j = ary_count(parser->scope, BLOCKQUOTE);
|
1565
|
+
|
1566
|
+
// decide whether we need to pop off excess BLOCKQUOTE tokens (will never need to push; that is handled above in the BLOCKQUOTE case itself)
|
1567
|
+
if (i < j)
|
1568
|
+
{
|
1569
|
+
// must pop (reduce nesting level)
|
1570
|
+
for (i = j - i; i > 0; i--)
|
1571
|
+
_Wikitext_pop_from_stack_up_to(parser, Qnil, BLOCKQUOTE, Qtrue);
|
1572
|
+
}
|
1573
|
+
|
1574
|
+
// discard any whitespace here (so that "== foo ==" will be translated to "<h2>foo</h2>" rather than "<h2> foo </h2>")
|
1575
|
+
while (NEXT_TOKEN(), (token->type == SPACE))
|
1576
|
+
; // discard
|
1577
|
+
|
1578
|
+
ary_push(parser->scope, type);
|
1579
|
+
_Wikitext_indent(parser);
|
1580
|
+
|
1581
|
+
// rather than repeat all that code for each kind of heading, share it and use a conditional here
|
1582
|
+
if (type == H6_START)
|
1583
|
+
rb_str_cat(parser->output, h6_start, sizeof(h6_start) - 1);
|
1584
|
+
else if (type == H5_START)
|
1585
|
+
rb_str_cat(parser->output, h5_start, sizeof(h5_start) - 1);
|
1586
|
+
else if (type == H4_START)
|
1587
|
+
rb_str_cat(parser->output, h4_start, sizeof(h4_start) - 1);
|
1588
|
+
else if (type == H3_START)
|
1589
|
+
rb_str_cat(parser->output, h3_start, sizeof(h3_start) - 1);
|
1590
|
+
else if (type == H2_START)
|
1591
|
+
rb_str_cat(parser->output, h2_start, sizeof(h2_start) - 1);
|
1592
|
+
else if (type == H1_START)
|
1593
|
+
rb_str_cat(parser->output, h1_start, sizeof(h1_start) - 1);
|
1594
|
+
|
1595
|
+
// jump to top of the loop to process token we scanned during lookahead
|
1596
|
+
continue;
|
1597
|
+
|
1598
|
+
case H6_END:
|
1599
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1600
|
+
rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
|
1601
|
+
else
|
1602
|
+
{
|
1603
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1604
|
+
if (!IN(H6_START))
|
1605
|
+
{
|
1606
|
+
// literal output only if not in h6 scope (we stay silent in that case)
|
1607
|
+
_Wikitext_start_para_if_necessary(parser);
|
1608
|
+
rb_str_cat(parser->output, literal_h6, sizeof(literal_h6) - 1);
|
1609
|
+
}
|
1610
|
+
}
|
1611
|
+
break;
|
1612
|
+
|
1613
|
+
case H5_END:
|
1614
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1615
|
+
rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
|
1616
|
+
else
|
1617
|
+
{
|
1618
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1619
|
+
if (!IN(H5_START))
|
1620
|
+
{
|
1621
|
+
// literal output only if not in h5 scope (we stay silent in that case)
|
1622
|
+
_Wikitext_start_para_if_necessary(parser);
|
1623
|
+
rb_str_cat(parser->output, literal_h5, sizeof(literal_h5) - 1);
|
1624
|
+
}
|
1625
|
+
}
|
1626
|
+
break;
|
1627
|
+
|
1628
|
+
case H4_END:
|
1629
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1630
|
+
rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
|
1631
|
+
else
|
1632
|
+
{
|
1633
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1634
|
+
if (!IN(H4_START))
|
1635
|
+
{
|
1636
|
+
// literal output only if not in h4 scope (we stay silent in that case)
|
1637
|
+
_Wikitext_start_para_if_necessary(parser);
|
1638
|
+
rb_str_cat(parser->output, literal_h4, sizeof(literal_h4) - 1);
|
1639
|
+
}
|
1640
|
+
}
|
1641
|
+
break;
|
1642
|
+
|
1643
|
+
case H3_END:
|
1644
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1645
|
+
rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
|
1646
|
+
else
|
1647
|
+
{
|
1648
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1649
|
+
if (!IN(H3_START))
|
1650
|
+
{
|
1651
|
+
// literal output only if not in h3 scope (we stay silent in that case)
|
1652
|
+
_Wikitext_start_para_if_necessary(parser);
|
1653
|
+
rb_str_cat(parser->output, literal_h3, sizeof(literal_h3) - 1);
|
1654
|
+
}
|
1655
|
+
}
|
1656
|
+
break;
|
1657
|
+
|
1658
|
+
case H2_END:
|
1659
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1660
|
+
rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
|
1661
|
+
else
|
1662
|
+
{
|
1663
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1664
|
+
if (!IN(H2_START))
|
1665
|
+
{
|
1666
|
+
// literal output only if not in h2 scope (we stay silent in that case)
|
1667
|
+
_Wikitext_start_para_if_necessary(parser);
|
1668
|
+
rb_str_cat(parser->output, literal_h2, sizeof(literal_h2) - 1);
|
1669
|
+
}
|
1670
|
+
}
|
1671
|
+
break;
|
1672
|
+
|
1673
|
+
case H1_END:
|
1674
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1675
|
+
rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
|
1676
|
+
else
|
1677
|
+
{
|
1678
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
1679
|
+
if (!IN(H1_START))
|
1680
|
+
{
|
1681
|
+
// literal output only if not in h1 scope (we stay silent in that case)
|
1682
|
+
_Wikitext_start_para_if_necessary(parser);
|
1683
|
+
rb_str_cat(parser->output, literal_h1, sizeof(literal_h1) - 1);
|
1684
|
+
}
|
1685
|
+
}
|
1686
|
+
break;
|
1687
|
+
|
1688
|
+
case MAIL:
|
1689
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1690
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1691
|
+
else
|
1692
|
+
{
|
1693
|
+
// in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
|
1694
|
+
_Wikitext_pop_excess_elements(parser);
|
1695
|
+
_Wikitext_start_para_if_necessary(parser);
|
1696
|
+
i = TOKEN_TEXT(token);
|
1697
|
+
if (parser->autolink == Qtrue)
|
1698
|
+
i = _Wikitext_hyperlink(rb_str_new2("mailto:"), i, i, mailto_class);
|
1699
|
+
rb_str_append(parser->output, i);
|
1700
|
+
}
|
1701
|
+
break;
|
1702
|
+
|
1703
|
+
case URI:
|
1704
|
+
if (IN(NO_WIKI_START))
|
1705
|
+
// user can temporarily suppress autolinking by using <nowiki></nowiki>
|
1706
|
+
// note that unlike MediaWiki, we do allow autolinking inside PRE blocks
|
1707
|
+
rb_str_cat(parser->output, token->start, TOKEN_LEN(token));
|
1708
|
+
else if (IN(LINK_START))
|
1709
|
+
{
|
1710
|
+
// if the URI were allowed it would have been handled already in LINK_START
|
1711
|
+
_Wikitext_rollback_failed_link(parser);
|
1712
|
+
i = TOKEN_TEXT(token);
|
1713
|
+
if (parser->autolink == Qtrue)
|
1714
|
+
i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
|
1715
|
+
rb_str_append(parser->output, i);
|
1716
|
+
}
|
1717
|
+
else if (IN(EXT_LINK_START))
|
1718
|
+
{
|
1719
|
+
if (NIL_P(parser->link_target))
|
1720
|
+
{
|
1721
|
+
// this must be our link target: look ahead to make sure we see the space we're expecting to see
|
1722
|
+
i = TOKEN_TEXT(token);
|
1723
|
+
NEXT_TOKEN();
|
1724
|
+
if (token->type == SPACE)
|
1725
|
+
{
|
1726
|
+
ary_push(parser->scope, SPACE);
|
1727
|
+
parser->link_target = i;
|
1728
|
+
parser->link_text = rb_str_new2("");
|
1729
|
+
parser->capture = parser->link_text;
|
1730
|
+
token = NULL; // silently consume space
|
1731
|
+
}
|
1732
|
+
else
|
1733
|
+
{
|
1734
|
+
// didn't see the space! this must be an error
|
1735
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
1736
|
+
_Wikitext_pop_excess_elements(parser);
|
1737
|
+
_Wikitext_start_para_if_necessary(parser);
|
1738
|
+
rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
|
1739
|
+
if (parser->autolink == Qtrue)
|
1740
|
+
i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
|
1741
|
+
rb_str_append(parser->output, i);
|
1742
|
+
}
|
1743
|
+
}
|
1744
|
+
else
|
1745
|
+
{
|
1746
|
+
if (NIL_P(parser->link_text))
|
1747
|
+
// this must be the first part of our link text
|
1748
|
+
parser->link_text = TOKEN_TEXT(token);
|
1749
|
+
else
|
1750
|
+
// add to existing link text
|
1751
|
+
rb_str_cat(parser->link_text, token->start, TOKEN_LEN(token));
|
1752
|
+
}
|
1753
|
+
}
|
1754
|
+
else
|
1755
|
+
{
|
1756
|
+
// in plain scope, will turn into autolink (with appropriate, user-configurable CSS)
|
1757
|
+
_Wikitext_pop_excess_elements(parser);
|
1758
|
+
_Wikitext_start_para_if_necessary(parser);
|
1759
|
+
i = TOKEN_TEXT(token);
|
1760
|
+
if (parser->autolink == Qtrue)
|
1761
|
+
i = _Wikitext_hyperlink(Qnil, i, i, parser->external_link_class); // link target, link text
|
1762
|
+
rb_str_append(parser->output, i);
|
1763
|
+
}
|
1764
|
+
break;
|
1765
|
+
|
1766
|
+
// internal links (links to other wiki articles) look like this:
|
1767
|
+
// [[another article]] (would point at, for example, "/wiki/another_article")
|
1768
|
+
// [[the other article|the link text we'll use for it]]
|
1769
|
+
// [[the other article | the link text we'll use for it]]
|
1770
|
+
// note that the forward slash is a reserved character which changes the meaning of an internal link;
|
1771
|
+
// this is a link that is external to the wiki but internal to the site as a whole:
|
1772
|
+
// [[bug/12]] (a relative link to "/bug/12")
|
1773
|
+
// MediaWiki has strict requirements about what it will accept as a link target:
|
1774
|
+
// all wikitext markup is disallowed:
|
1775
|
+
// example [[foo ''bar'' baz]]
|
1776
|
+
// renders [[foo <em>bar</em> baz]] (ie. not a link)
|
1777
|
+
// example [[foo <em>bar</em> baz]]
|
1778
|
+
// renders [[foo <em>bar</em> baz]] (ie. not a link)
|
1779
|
+
// example [[foo <nowiki>''</nowiki> baz]]
|
1780
|
+
// renders [[foo '' baz]] (ie. not a link)
|
1781
|
+
// example [[foo <bar> baz]]
|
1782
|
+
// renders [[foo <bar> baz]] (ie. not a link)
|
1783
|
+
// HTML entities and non-ASCII, however, make it through:
|
1784
|
+
// example [[foo &euro;]]
|
1785
|
+
// renders <a href="/wiki/Foo_%E2%82%AC">foo &euro;</a>
|
1786
|
+
// example [[foo €]]
|
1787
|
+
// renders <a href="/wiki/Foo_%E2%82%AC">foo €</a>
|
1788
|
+
// we'll impose similar restrictions here for the link target; allowed tokens will be:
|
1789
|
+
// SPACE, PRINTABLE, DEFAULT, QUOT and AMP
|
1790
|
+
// everything else will be rejected
|
1791
|
+
case LINK_START:
|
1792
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1793
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1794
|
+
rb_str_cat(i, link_start, sizeof(link_start) - 1);
|
1795
|
+
else if (IN(EXT_LINK_START))
|
1796
|
+
// already in external link scope! (and in fact, must be capturing link_text right now)
|
1797
|
+
rb_str_cat(i, link_start, sizeof(link_start) - 1);
|
1798
|
+
else if (IN(LINK_START))
|
1799
|
+
{
|
1800
|
+
// already in internal link scope! this is a syntax error
|
1801
|
+
_Wikitext_rollback_failed_link(parser);
|
1802
|
+
rb_str_cat(parser->output, link_start, sizeof(link_start) - 1);
|
1803
|
+
}
|
1804
|
+
else if (IN(SEPARATOR))
|
1805
|
+
{
|
1806
|
+
// scanning internal link text
|
1807
|
+
}
|
1808
|
+
else // not in internal link scope yet
|
1809
|
+
{
|
1810
|
+
// will either emit a link, or the rollback of a failed link, so start the para now
|
1811
|
+
_Wikitext_pop_excess_elements(parser);
|
1812
|
+
_Wikitext_start_para_if_necessary(parser);
|
1813
|
+
ary_push(parser->scope, LINK_START);
|
1814
|
+
|
1815
|
+
// look ahead and try to gobble up link target
|
1816
|
+
while (NEXT_TOKEN(), (type = token->type))
|
1817
|
+
{
|
1818
|
+
if (type == SPACE ||
|
1819
|
+
type == PRINTABLE ||
|
1820
|
+
type == DEFAULT ||
|
1821
|
+
type == QUOT ||
|
1822
|
+
type == QUOT_ENTITY ||
|
1823
|
+
type == AMP ||
|
1824
|
+
type == AMP_ENTITY)
|
1825
|
+
{
|
1826
|
+
// accumulate these tokens into link_target
|
1827
|
+
if (NIL_P(parser->link_target))
|
1828
|
+
{
|
1829
|
+
parser->link_target = rb_str_new2("");
|
1830
|
+
parser->capture = parser->link_target;
|
1831
|
+
}
|
1832
|
+
if (type == QUOT_ENTITY)
|
1833
|
+
// don't insert the entity, insert the literal quote
|
1834
|
+
rb_str_cat(parser->link_target, quote, sizeof(quote) - 1);
|
1835
|
+
else if (type == AMP_ENTITY)
|
1836
|
+
// don't insert the entity, insert the literal ampersand
|
1837
|
+
rb_str_cat(parser->link_target, ampersand, sizeof(ampersand) - 1);
|
1838
|
+
else
|
1839
|
+
rb_str_cat(parser->link_target, token->start, TOKEN_LEN(token));
|
1840
|
+
}
|
1841
|
+
else if (type == LINK_END)
|
1842
|
+
break; // jump back to top of loop (will handle this in LINK_END case below)
|
1843
|
+
else if (type == SEPARATOR)
|
1844
|
+
{
|
1845
|
+
ary_push(parser->scope, SEPARATOR);
|
1846
|
+
parser->link_text = rb_str_new2("");
|
1847
|
+
parser->capture = parser->link_text;
|
1848
|
+
token = NULL;
|
1849
|
+
break;
|
1850
|
+
}
|
1851
|
+
else // unexpected token (syntax error)
|
1852
|
+
{
|
1853
|
+
_Wikitext_rollback_failed_link(parser);
|
1854
|
+
break; // jump back to top of loop to handle unexpected token
|
1855
|
+
}
|
1856
|
+
}
|
1857
|
+
|
1858
|
+
// jump to top of the loop to process token we scanned during lookahead (if any)
|
1859
|
+
continue;
|
1860
|
+
}
|
1861
|
+
break;
|
1862
|
+
|
1863
|
+
case LINK_END:
|
1864
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1865
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1866
|
+
rb_str_cat(i, link_end, sizeof(link_end) - 1);
|
1867
|
+
else if (IN(EXT_LINK_START))
|
1868
|
+
// already in external link scope! (and in fact, must be capturing link_text right now)
|
1869
|
+
rb_str_cat(i, link_end, sizeof(link_end) - 1);
|
1870
|
+
else if (IN(LINK_START))
|
1871
|
+
{
|
1872
|
+
// in internal link scope!
|
1873
|
+
if (NIL_P(parser->link_text) || RSTRING_LEN(parser->link_text) == 0)
|
1874
|
+
// use link target as link text
|
1875
|
+
parser->link_text = _Wikitext_parser_sanitize_link_target(parser->link_target, Qtrue);
|
1876
|
+
else
|
1877
|
+
parser->link_text = _Wikitext_parser_trim_link_target(parser->link_text);
|
1878
|
+
_Wikitext_parser_encode_link_target(parser);
|
1879
|
+
_Wikitext_pop_from_stack_up_to(parser, i, LINK_START, Qtrue);
|
1880
|
+
parser->capture = Qnil;
|
1881
|
+
if (parser->special_link)
|
1882
|
+
i = _Wikitext_hyperlink(rb_str_new2("/"), parser->link_target, parser->link_text, Qnil);
|
1883
|
+
else
|
1884
|
+
i = _Wikitext_hyperlink(prefix, parser->link_target, parser->link_text, Qnil);
|
1885
|
+
rb_str_append(parser->output, i);
|
1886
|
+
parser->link_target = Qnil;
|
1887
|
+
parser->link_text = Qnil;
|
1888
|
+
}
|
1889
|
+
else // wasn't in internal link scope
|
1890
|
+
{
|
1891
|
+
_Wikitext_pop_excess_elements(parser);
|
1892
|
+
_Wikitext_start_para_if_necessary(parser);
|
1893
|
+
rb_str_cat(i, link_end, sizeof(link_end) - 1);
|
1894
|
+
}
|
1895
|
+
break;
|
1896
|
+
|
1897
|
+
// external links look like this:
|
1898
|
+
// [http://google.com/ the link text]
|
1899
|
+
// strings in square brackets which don't match this syntax get passed through literally; eg:
|
1900
|
+
// he was very angery [sic] about the turn of events
|
1901
|
+
case EXT_LINK_START:
|
1902
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1903
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1904
|
+
rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
|
1905
|
+
else if (IN(EXT_LINK_START))
|
1906
|
+
// already in external link scope! (and in fact, must be capturing link_text right now)
|
1907
|
+
rb_str_cat(i, ext_link_start, sizeof(ext_link_start) - 1);
|
1908
|
+
else if (IN(LINK_START))
|
1909
|
+
{
|
1910
|
+
// already in internal link scope!
|
1911
|
+
i = rb_str_new(ext_link_start, sizeof(ext_link_start) - 1);
|
1912
|
+
if (NIL_P(parser->link_target))
|
1913
|
+
// this must be the first character of our link target
|
1914
|
+
parser->link_target = i;
|
1915
|
+
else if (IN(SPACE))
|
1916
|
+
{
|
1917
|
+
// link target has already been scanned
|
1918
|
+
if (NIL_P(parser->link_text))
|
1919
|
+
// this must be the first character of our link text
|
1920
|
+
parser->link_text = i;
|
1921
|
+
else
|
1922
|
+
// add to existing link text
|
1923
|
+
rb_str_append(parser->link_text, i);
|
1924
|
+
}
|
1925
|
+
else
|
1926
|
+
// add to existing link target
|
1927
|
+
rb_str_append(parser->link_target, i);
|
1928
|
+
}
|
1929
|
+
else // not in external link scope yet
|
1930
|
+
{
|
1931
|
+
// will either emit a link, or the rollback of a failed link, so start the para now
|
1932
|
+
_Wikitext_pop_excess_elements(parser);
|
1933
|
+
_Wikitext_start_para_if_necessary(parser);
|
1934
|
+
|
1935
|
+
// look ahead: expect a URI
|
1936
|
+
NEXT_TOKEN();
|
1937
|
+
if (token->type == URI)
|
1938
|
+
ary_push(parser->scope, EXT_LINK_START); // so far so good, jump back to the top of the loop
|
1939
|
+
else
|
1940
|
+
// only get here if there was a syntax error (missing URI)
|
1941
|
+
rb_str_cat(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
|
1942
|
+
continue; // jump back to top of loop to handle token (either URI or whatever it is)
|
1943
|
+
}
|
1944
|
+
break;
|
1945
|
+
|
1946
|
+
case EXT_LINK_END:
|
1947
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1948
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1949
|
+
rb_str_cat(i, ext_link_end, sizeof(ext_link_end) - 1);
|
1950
|
+
else if (IN(EXT_LINK_START))
|
1951
|
+
{
|
1952
|
+
if (NIL_P(parser->link_text))
|
1953
|
+
// syntax error: external link with no link text
|
1954
|
+
_Wikitext_rollback_failed_external_link(parser);
|
1955
|
+
else
|
1956
|
+
{
|
1957
|
+
// success!
|
1958
|
+
_Wikitext_pop_from_stack_up_to(parser, i, EXT_LINK_START, Qtrue);
|
1959
|
+
parser->capture = Qnil;
|
1960
|
+
i = _Wikitext_hyperlink(Qnil, parser->link_target, parser->link_text, parser->external_link_class);
|
1961
|
+
rb_str_append(parser->output, i);
|
1962
|
+
}
|
1963
|
+
parser->link_target = Qnil;
|
1964
|
+
parser->link_text = Qnil;
|
1965
|
+
}
|
1966
|
+
else
|
1967
|
+
{
|
1968
|
+
_Wikitext_pop_excess_elements(parser);
|
1969
|
+
_Wikitext_start_para_if_necessary(parser);
|
1970
|
+
rb_str_cat(parser->output, ext_link_end, sizeof(ext_link_end) - 1);
|
1971
|
+
}
|
1972
|
+
break;
|
1973
|
+
|
1974
|
+
case SEPARATOR:
|
1975
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1976
|
+
_Wikitext_pop_excess_elements(parser);
|
1977
|
+
_Wikitext_start_para_if_necessary(parser);
|
1978
|
+
rb_str_cat(i, separator, sizeof(separator) - 1);
|
1979
|
+
break;
|
1980
|
+
|
1981
|
+
case SPACE:
|
1982
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
1983
|
+
if (IN(NO_WIKI_START) || IN(PRE) || IN(PRE_START))
|
1984
|
+
rb_str_cat(i, token->start, TOKEN_LEN(token));
|
1985
|
+
else
|
1986
|
+
{
|
1987
|
+
// peek ahead to see next token
|
1988
|
+
char *token_ptr = token->start;
|
1989
|
+
int token_len = TOKEN_LEN(token);
|
1990
|
+
NEXT_TOKEN();
|
1991
|
+
type = token->type;
|
1992
|
+
if (((type == H6_END) && IN(H6_START)) ||
|
1993
|
+
((type == H5_END) && IN(H5_START)) ||
|
1994
|
+
((type == H4_END) && IN(H4_START)) ||
|
1995
|
+
((type == H3_END) && IN(H3_START)) ||
|
1996
|
+
((type == H2_END) && IN(H2_START)) ||
|
1997
|
+
((type == H1_END) && IN(H1_START)))
|
1998
|
+
{
|
1999
|
+
// will suppress emission of space (discard) if next token is a H6_END, H5_END etc and we are in the corresponding scope
|
2000
|
+
}
|
2001
|
+
else
|
2002
|
+
{
|
2003
|
+
// emit the space
|
2004
|
+
_Wikitext_pop_excess_elements(parser);
|
2005
|
+
_Wikitext_start_para_if_necessary(parser);
|
2006
|
+
rb_str_cat(i, token_ptr, token_len);
|
2007
|
+
}
|
2008
|
+
|
2009
|
+
// jump to top of the loop to process token we scanned during lookahead
|
2010
|
+
continue;
|
2011
|
+
}
|
2012
|
+
break;
|
2013
|
+
|
2014
|
+
case QUOT_ENTITY:
|
2015
|
+
case AMP_ENTITY:
|
2016
|
+
case NAMED_ENTITY:
|
2017
|
+
case DECIMAL_ENTITY:
|
2018
|
+
// pass these through unaltered as they are case sensitive
|
2019
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2020
|
+
_Wikitext_pop_excess_elements(parser);
|
2021
|
+
_Wikitext_start_para_if_necessary(parser);
|
2022
|
+
rb_str_cat(i, token->start, TOKEN_LEN(token));
|
2023
|
+
break;
|
2024
|
+
|
2025
|
+
case HEX_ENTITY:
|
2026
|
+
// normalize hex entities (downcase them)
|
2027
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2028
|
+
_Wikitext_pop_excess_elements(parser);
|
2029
|
+
_Wikitext_start_para_if_necessary(parser);
|
2030
|
+
rb_str_append(i, _Wikitext_downcase(TOKEN_TEXT(token)));
|
2031
|
+
break;
|
2032
|
+
|
2033
|
+
case QUOT:
|
2034
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2035
|
+
_Wikitext_pop_excess_elements(parser);
|
2036
|
+
_Wikitext_start_para_if_necessary(parser);
|
2037
|
+
rb_str_cat(i, quot_entity, sizeof(quot_entity) - 1);
|
2038
|
+
break;
|
2039
|
+
|
2040
|
+
case AMP:
|
2041
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2042
|
+
_Wikitext_pop_excess_elements(parser);
|
2043
|
+
_Wikitext_start_para_if_necessary(parser);
|
2044
|
+
rb_str_cat(i, amp_entity, sizeof(amp_entity) - 1);
|
2045
|
+
break;
|
2046
|
+
|
2047
|
+
case LESS:
|
2048
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2049
|
+
_Wikitext_pop_excess_elements(parser);
|
2050
|
+
_Wikitext_start_para_if_necessary(parser);
|
2051
|
+
rb_str_cat(i, lt_entity, sizeof(lt_entity) - 1);
|
2052
|
+
break;
|
2053
|
+
|
2054
|
+
case GREATER:
|
2055
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2056
|
+
_Wikitext_pop_excess_elements(parser);
|
2057
|
+
_Wikitext_start_para_if_necessary(parser);
|
2058
|
+
rb_str_cat(i, gt_entity, sizeof(gt_entity) - 1);
|
2059
|
+
break;
|
2060
|
+
|
2061
|
+
case CRLF:
|
2062
|
+
parser->pending_crlf = Qfalse;
|
2063
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
2064
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
2065
|
+
if (IN(NO_WIKI_START) || IN(PRE_START))
|
2066
|
+
{
|
2067
|
+
ary_clear(parser->line_buffer);
|
2068
|
+
rb_str_cat(parser->output, parser->line_ending->ptr, parser->line_ending->len);
|
2069
|
+
break;
|
2070
|
+
}
|
2071
|
+
else if (IN(PRE))
|
2072
|
+
{
|
2073
|
+
// beware when nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, that must be end of PRE block
|
2074
|
+
if (NO_ITEM(ary_entry(parser->line_buffer, -2)) || ary_entry(parser->line_buffer, -2) == BLOCKQUOTE)
|
2075
|
+
// don't emit in this case
|
2076
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
|
2077
|
+
else
|
2078
|
+
{
|
2079
|
+
// peek ahead to see if this is definitely the end of the PRE block
|
2080
|
+
NEXT_TOKEN();
|
2081
|
+
type = token->type;
|
2082
|
+
if (type != BLOCKQUOTE && type != PRE)
|
2083
|
+
{
|
2084
|
+
// this is definitely the end of the block, so don't emit
|
2085
|
+
_Wikitext_pop_from_stack_up_to(parser, parser->output, PRE, Qtrue);
|
2086
|
+
}
|
2087
|
+
else
|
2088
|
+
// potentially will emit
|
2089
|
+
parser->pending_crlf = Qtrue;
|
2090
|
+
|
2091
|
+
// delete the entire contents of the line scope stack and buffer
|
2092
|
+
ary_clear(parser->line);
|
2093
|
+
ary_clear(parser->line_buffer);
|
2094
|
+
continue; // jump back to top of loop to handle token grabbed via lookahead
|
2095
|
+
}
|
2096
|
+
}
|
2097
|
+
else
|
2098
|
+
{
|
2099
|
+
parser->pending_crlf = Qtrue;
|
2100
|
+
|
2101
|
+
// count number of BLOCKQUOTE tokens in line buffer (can be zero) and pop back to that level
|
2102
|
+
// as a side effect, this handles any open span-level elements and unclosed blocks
|
2103
|
+
// (with special handling for P blocks and LI elements)
|
2104
|
+
i = ary_count(parser->line, BLOCKQUOTE) + ary_count(parser->scope, BLOCKQUOTE_START);
|
2105
|
+
for (j = parser->scope->count; j > i; j--)
|
2106
|
+
{
|
2107
|
+
if (parser->scope->count > 0 && ary_entry(parser->scope, -1) == LI)
|
2108
|
+
{
|
2109
|
+
parser->pending_crlf = Qfalse;
|
2110
|
+
break;
|
2111
|
+
}
|
2112
|
+
|
2113
|
+
// special handling on last iteration through the loop if the top item on the scope is a P block
|
2114
|
+
if ((j - i == 1) && ary_entry(parser->scope, -1) == P)
|
2115
|
+
{
|
2116
|
+
// if nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, this must be a paragraph break
|
2117
|
+
// (note that we have to make sure we're not inside a BLOCKQUOTE_START block
|
2118
|
+
// because in those blocks BLOCKQUOTE tokens have no special meaning)
|
2119
|
+
if (NO_ITEM(ary_entry(parser->line_buffer, -2)) ||
|
2120
|
+
(ary_entry(parser->line_buffer, -2) == BLOCKQUOTE && !IN(BLOCKQUOTE_START)))
|
2121
|
+
// paragraph break
|
2122
|
+
parser->pending_crlf = Qfalse;
|
2123
|
+
else
|
2124
|
+
// not a paragraph break!
|
2125
|
+
continue;
|
2126
|
+
}
|
2127
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
2128
|
+
}
|
2129
|
+
}
|
2130
|
+
|
2131
|
+
// delete the entire contents of the line scope stack and buffer
|
2132
|
+
ary_clear(parser->line);
|
2133
|
+
ary_clear(parser->line_buffer);
|
2134
|
+
break;
|
2135
|
+
|
2136
|
+
case PRINTABLE:
|
2137
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2138
|
+
_Wikitext_pop_excess_elements(parser);
|
2139
|
+
_Wikitext_start_para_if_necessary(parser);
|
2140
|
+
rb_str_cat(i, token->start, TOKEN_LEN(token));
|
2141
|
+
break;
|
2142
|
+
|
2143
|
+
case DEFAULT:
|
2144
|
+
i = NIL_P(parser->capture) ? parser->output : parser->capture;
|
2145
|
+
_Wikitext_pop_excess_elements(parser);
|
2146
|
+
_Wikitext_start_para_if_necessary(parser);
|
2147
|
+
rb_str_append(i, _Wikitext_utf32_char_to_entity(token->code_point)); // convert to entity
|
2148
|
+
break;
|
2149
|
+
|
2150
|
+
case END_OF_FILE:
|
2151
|
+
// close any open scopes on hitting EOF
|
2152
|
+
_Wikitext_rollback_failed_external_link(parser); // if any
|
2153
|
+
_Wikitext_rollback_failed_link(parser); // if any
|
2154
|
+
for (i = 0, j = parser->scope->count; i < j; i++)
|
2155
|
+
_Wikitext_pop_from_stack(parser, Qnil);
|
2156
|
+
goto return_output; // break not enough here (want to break out of outer while loop, not inner switch statement)
|
2157
|
+
|
2158
|
+
default:
|
2159
|
+
break;
|
2160
|
+
}
|
2161
|
+
|
2162
|
+
// reset current token; forcing lexer to return another token at the top of the loop
|
2163
|
+
token = NULL;
|
2164
|
+
} while (1);
|
2165
|
+
return_output:
|
2166
|
+
// BUG: these will leak if we exit this function by raising an exception; need to investigate using Data_Wrap_Struct
|
2167
|
+
ary_free(parser->scope);
|
2168
|
+
ary_free(parser->line);
|
2169
|
+
ary_free(parser->line_buffer);
|
2170
|
+
str_free(parser->line_ending);
|
2171
|
+
if (parser->tabulation)
|
2172
|
+
str_free(parser->tabulation);
|
2173
|
+
return parser->output;
|
2174
|
+
}
|