wikitext 4.0.1 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1e73f2d1a7097dd2a9a1e0264f395ad0e331e291
4
- data.tar.gz: 2b4a73fe78d5ac9906ec32cece324f62a6142ed4
3
+ metadata.gz: c61ab6467120d8def1be560fcb4604ee7c90454b
4
+ data.tar.gz: 1c7be94a73a1d038a18744775054910772a964f0
5
5
  SHA512:
6
- metadata.gz: cc07aaadf9d10dcdf5c0f5cc1abf0e1336717881cd7385b9c2394ac1e4bf7c6bd6e0f1837b4a821d0c9ec13f5a2f17ff764c8416d34f5806ade940452ef3dca9
7
- data.tar.gz: 216d912ae01cfefc7abb1f8d700003d58d61c2757e2044ffc9d28f65b584f59d26ae21e0afffc20891d64733b19613fa1491a621b73db6f252c585ac90f9fd24
6
+ metadata.gz: df484b7d09e76c9b01a53cf4c8cb7b7bcea8fba56d6e7d1443e01259b3b41934e4f517a87cf0a4593ac33646c47a9b9b4c96accf4f810db83013d32a248dd3f7
7
+ data.tar.gz: 66d3a468c2d787a2ec572b24ff53e8210cc520f37fd4caab68fc3536c0640f01deaddbe59e22f892eea8f2872788ef2f50bc66bba363298c1e5cee7c655cd444
data/bin/wikitext CHANGED
@@ -1,116 +1,16 @@
1
1
  #!/usr/bin/env ruby
2
- # Copyright 2008-2013 Wincent Colaiuta. All rights reserved.
3
2
  #
4
- # Redistribution and use in source and binary forms, with or without
5
- # modification, are permitted provided that the following conditions are met:
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'wikitext' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
6
7
  #
7
- # 1. Redistributions of source code must retain the above copyright notice,
8
- # this list of conditions and the following disclaimer.
9
- # 2. Redistributions in binary form must reproduce the above copyright notice,
10
- # this list of conditions and the following disclaimer in the documentation
11
- # and/or other materials provided with the distribution.
12
-
13
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16
- # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
17
- # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18
- # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19
- # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20
- # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21
- # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22
- # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
23
- # POSSIBILITY OF SUCH DAMAGE.
24
-
25
- require 'wikitext'
26
- require 'wikitext/version'
27
-
28
- module Wikitext
29
- module Tool
30
- # Simple substitute for the HighLine library if it is not available.
31
- class FakeHighLine
32
- def color(str, _)
33
- str
34
- end
35
-
36
- def output_cols
37
- 80
38
- end
39
- end
40
-
41
- INPUT_FILES = []
42
-
43
- def self.interactive?
44
- STDOUT.tty? && STDIN.tty? && INPUT_FILES.empty?
45
- end
46
-
47
- def self.pretty_print tokens
48
- tokens.each do |token|
49
- puts <<-END
50
- Token: type: #{token.token_type}
51
- line: #{token.line_start}..#{token.line_stop} column: #{token.column_start}..#{token.column_stop}
52
- pointer: #{token.start}..#{token.stop}
53
- code_point: #{token.code_point}
54
- string_value: #{token.string_value.inspect}
55
-
56
- END
57
- end
58
- end
59
8
 
60
- method = :parse
61
- ARGV.each do |arg|
62
- if arg =~ /\A--tok/
63
- method = :tokenize
64
- else
65
- INPUT_FILES << arg
66
- end
67
- end
9
+ require 'pathname'
10
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
11
+ Pathname.new(__FILE__).realpath)
68
12
 
69
- if interactive?
70
- begin
71
- require 'highline'
72
- rescue LoadError
73
- begin
74
- require 'rubygems'
75
- require 'highline'
76
- rescue LoadError
77
- end
78
- end
79
- puts "wikitext #{Wikitext::VERSION}"
80
- highline = (defined?(HighLine) ? HighLine : FakeHighLine).new
81
- end
13
+ require 'rubygems'
14
+ require 'bundler/setup'
82
15
 
83
- parser = Parser.new
84
- if INPUT_FILES.empty?
85
- begin
86
- while true
87
- puts highline.color('(Ctrl+D to process, Ctrl+C to exit)>>', :bold) if interactive?
88
- input = STDIN.read
89
- puts '-' * highline.output_cols if interactive?
90
- if method == :tokenize
91
- pretty_print parser.tokenize(input)
92
- else
93
- puts parser.parse(input)
94
- end
95
- puts '-' * highline.output_cols if interactive?
96
- exit unless interactive?
97
- end
98
- rescue Interrupt
99
- end
100
- else # we have INPUT_FILES
101
- exit_status = 0
102
- INPUT_FILES.each do |file|
103
- begin
104
- puts parser.parse(File.new(file).read)
105
- rescue Errno::ENOENT
106
- STDERR.puts "error: no such file or directory: #{file}"
107
- exit_status |= 1
108
- rescue Errno::EACCES
109
- STDERR.puts "error: permission denied: #{file}"
110
- exit_status |= 2
111
- end
112
- end
113
- exit exit_status
114
- end
115
- end # module Tool
116
- end # module Wikitext
16
+ load Gem.bin_path('wikitext', 'wikitext')
@@ -0,0 +1,116 @@
1
+ // Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
2
+ //
3
+ // Redistribution and use in source and binary forms, with or without
4
+ // modification, are permitted provided that the following conditions are met:
5
+ //
6
+ // 1. Redistributions of source code must retain the above copyright notice,
7
+ // this list of conditions and the following disclaimer.
8
+ // 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ // this list of conditions and the following disclaimer in the documentation
10
+ // and/or other materials provided with the distribution.
11
+ //
12
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
16
+ // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ // POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ #include "ary.h"
25
+
26
+ ary_t *ary_new(void)
27
+ {
28
+ ary_t *ary = ALLOC_N(ary_t, 1);
29
+ ary->count = 0;
30
+ ary->max = DEFAULT_ENTRY_COUNT;
31
+ ary->entries = ALLOC_N(int, DEFAULT_ENTRY_COUNT);
32
+ return ary;
33
+ }
34
+
35
+ int ary_entry(ary_t *ary, int idx)
36
+ {
37
+ if (idx < 0)
38
+ idx = ary->count + idx;
39
+ return (idx >= 0 && ary->count > idx) ? ary->entries[idx] : INT_MAX;
40
+ }
41
+
42
+ void ary_clear(ary_t *ary)
43
+ {
44
+ ary->count = 0;
45
+ }
46
+
47
+ int ary_pop(ary_t *ary)
48
+ {
49
+ if (ary->count > 0)
50
+ {
51
+ ary->count--;
52
+ return 1;
53
+ }
54
+ return 0;
55
+ }
56
+
57
+ void ary_push(ary_t *ary, int val)
58
+ {
59
+ if (ary->count == ary->max)
60
+ {
61
+ ary->max += DEFAULT_ENTRY_COUNT;
62
+ REALLOC_N(ary->entries, int, ary->max);
63
+ }
64
+ ary->entries[ary->count] = val;
65
+ ary->count++;
66
+ }
67
+
68
+ int ary_includes(ary_t *ary, int val)
69
+ {
70
+ for (int i = 0, max = ary->count; i < max; i++)
71
+ {
72
+ if (ary->entries[i] == val)
73
+ return 1;
74
+ }
75
+ return 0;
76
+ }
77
+
78
+ int ary_includes2(ary_t *ary, int val1, int val2)
79
+ {
80
+ for (int i = 0, max = ary->count; i < max; i++)
81
+ {
82
+ if (ary->entries[i] == val1 ||
83
+ ary->entries[i] == val2)
84
+ return 1;
85
+ }
86
+ return 0;
87
+ }
88
+
89
+ int ary_includes3(ary_t *ary, int val1, int val2, int val3)
90
+ {
91
+ for (int i = 0, max = ary->count; i < max; i++)
92
+ {
93
+ if (ary->entries[i] == val1 ||
94
+ ary->entries[i] == val2 ||
95
+ ary->entries[i] == val3)
96
+ return 1;
97
+ }
98
+ return 0;
99
+ }
100
+
101
+ int ary_count(ary_t *ary, int item)
102
+ {
103
+ int count = 0;
104
+ for (int i = 0, max = ary->count; i < max; i++)
105
+ {
106
+ if (ary->entries[i] == item)
107
+ count++;
108
+ }
109
+ return count;
110
+ }
111
+
112
+ void ary_free(ary_t *ary)
113
+ {
114
+ free(ary->entries);
115
+ free(ary);
116
+ }
@@ -0,0 +1,50 @@
1
+ // Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
2
+ //
3
+ // Redistribution and use in source and binary forms, with or without
4
+ // modification, are permitted provided that the following conditions are met:
5
+ //
6
+ // 1. Redistributions of source code must retain the above copyright notice,
7
+ // this list of conditions and the following disclaimer.
8
+ // 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ // this list of conditions and the following disclaimer in the documentation
10
+ // and/or other materials provided with the distribution.
11
+ //
12
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
16
+ // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ // POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ #include "ruby_compat.h"
25
+
26
+ typedef struct
27
+ {
28
+ int count;
29
+ int max;
30
+ int *entries;
31
+ } ary_t;
32
+
33
+ // in the test suite array count goes no higher than 25 or 26
34
+ #define DEFAULT_ENTRY_COUNT 64
35
+
36
+ #define NO_ITEM(item) (item == INT_MAX)
37
+
38
+ ary_t *ary_new(void);
39
+ int ary_entry(ary_t *ary, int idx);
40
+ void ary_clear(ary_t *ary);
41
+ int ary_pop(ary_t *ary);
42
+ void ary_push(ary_t *ary, int val);
43
+ int ary_includes(ary_t *ary, int val);
44
+ int ary_includes2(ary_t *ary, int val1, int val2);
45
+ int ary_includes3(ary_t *ary, int val1, int val2, int val3);
46
+
47
+ // returns a count indicating the number of times the value appears in the collection
48
+ int ary_count(ary_t *ary, int item);
49
+
50
+ void ary_free(ary_t *ary);
@@ -0,0 +1,32 @@
1
+ # Copyright 2008-2010 Wincent Colaiuta. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions are met:
5
+ #
6
+ # 1. Redistributions of source code must retain the above copyright notice,
7
+ # this list of conditions and the following disclaimer.
8
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ # this list of conditions and the following disclaimer in the documentation
10
+ # and/or other materials provided with the distribution.
11
+ #
12
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
16
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ # POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ # don't warn about unused params because many Ruby methods accept "self" but don't use it
25
+ CFLAGS += -std=gnu99 -Wall -Wextra -Wno-unused-parameter
26
+
27
+ ary.o : ary.c ary.h ruby_compat.h
28
+ parser.o : ary.c ary.h parser.c parser.h ruby_compat.h str.c str.h token.h wikitext.h wikitext_ragel.h
29
+ str.o : ruby_compat.h str.c str.h
30
+ token.o : ruby_compat.h token.c token.h wikitext.h
31
+ wikitext.o : parser.h ruby_compat.h token.h wikitext.c wikitext.h wikitext_ragel.h
32
+ wikitext_ragel.o : ruby_compat.h token.h wikitext.h wikitext_ragel.h wikitext_ragel.c
@@ -0,0 +1,2595 @@
1
+ // Copyright 2007-2013 Wincent Colaiuta. All rights reserved.
2
+ //
3
+ // Redistribution and use in source and binary forms, with or without
4
+ // modification, are permitted provided that the following conditions are met:
5
+ //
6
+ // 1. Redistributions of source code must retain the above copyright notice,
7
+ // this list of conditions and the following disclaimer.
8
+ // 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ // this list of conditions and the following disclaimer in the documentation
10
+ // and/or other materials provided with the distribution.
11
+ //
12
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
16
+ // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ // POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ #include <stdbool.h>
25
+
26
+ #include "parser.h"
27
+ #include "ary.h"
28
+ #include "str.h"
29
+ #include "wikitext.h"
30
+ #include "wikitext_ragel.h"
31
+
32
+ #define IN(type) ary_includes(parser->scope, type)
33
+ #define IN_EITHER_OF(type1, type2) ary_includes2(parser->scope, type1, type2)
34
+ #define IN_ANY_OF(type1, type2, type3) ary_includes3(parser->scope, type1, type2, type3)
35
+
36
+ // output styles
37
+ enum { HTML_OUTPUT, XML_OUTPUT };
38
+
39
+ // poor man's object orientation in C:
40
+ // instead of passing around multiple parameters between functions in the parser
41
+ // we pack everything into a struct and pass around only a pointer to that
42
+ typedef struct
43
+ {
44
+ str_t *capture; // capturing to link_target, link_text, or NULL (direct to output, not capturing)
45
+ str_t *output; // for accumulating output to be returned
46
+ str_t *link_target; // short term "memory" for parsing links
47
+ str_t *link_text; // short term "memory" for parsing links
48
+ str_t *line_ending;
49
+ str_t *tabulation; // caching buffer for emitting indentation
50
+ ary_t *scope; // stack for tracking scope
51
+ ary_t *line; // stack for tracking scope as implied by current line
52
+ ary_t *line_buffer; // stack for tracking raw tokens (not scope) on current line
53
+ VALUE external_link_class; // CSS class applied to external links
54
+ VALUE external_link_rel; // rel attribute applied to external links
55
+ VALUE mailto_class; // CSS class applied to email (mailto) links
56
+ VALUE img_prefix; // path prepended when emitting img tags
57
+ int output_style; // HTML_OUTPUT (default) or XML_OUTPUT
58
+ int base_indent; // controlled by the :indent option to Wikitext::Parser#parse
59
+ int current_indent; // fluctuates according to currently nested structures
60
+ int base_heading_level;
61
+ bool pending_crlf;
62
+ bool autolink;
63
+ bool space_to_underscore;
64
+ } parser_t;
65
+
66
+ const char null_str[] = { 0 };
67
+ const char escaped_no_wiki_start[] = "&lt;nowiki&gt;";
68
+ const char escaped_no_wiki_end[] = "&lt;/nowiki&gt;";
69
+ const char literal_strong_em[] = "'''''";
70
+ const char literal_strong[] = "'''";
71
+ const char literal_em[] = "''";
72
+ const char escaped_em_start[] = "&lt;em&gt;";
73
+ const char escaped_em_end[] = "&lt;/em&gt;";
74
+ const char escaped_strong_start[] = "&lt;strong&gt;";
75
+ const char escaped_strong_end[] = "&lt;/strong&gt;";
76
+ const char escaped_tt_start[] = "&lt;tt&gt;";
77
+ const char escaped_tt_end[] = "&lt;/tt&gt;";
78
+ const char pre_start[] = "<pre>";
79
+ const char pre_end[] = "</pre>";
80
+ const char escaped_pre_start[] = "&lt;pre&gt;";
81
+ const char escaped_pre_end[] = "&lt;/pre&gt;";
82
+ const char blockquote_start[] = "<blockquote>";
83
+ const char blockquote_end[] = "</blockquote>";
84
+ const char escaped_blockquote_start[] = "&lt;blockquote&gt;";
85
+ const char escaped_blockquote_end[] = "&lt;/blockquote&gt;";
86
+ const char strong_em_start[] = "<strong><em>";
87
+ const char strong_start[] = "<strong>";
88
+ const char strong_end[] = "</strong>";
89
+ const char em_start[] = "<em>";
90
+ const char em_end[] = "</em>";
91
+ const char code_start[] = "<code>";
92
+ const char code_end[] = "</code>";
93
+ const char ol_start[] = "<ol>";
94
+ const char ol_end[] = "</ol>";
95
+ const char ul_start[] = "<ul>";
96
+ const char ul_end[] = "</ul>";
97
+ const char li_start[] = "<li>";
98
+ const char li_end[] = "</li>";
99
+ const char h6_start[] = "<h6>";
100
+ const char h6_end[] = "</h6>";
101
+ const char h5_start[] = "<h5>";
102
+ const char h5_end[] = "</h5>";
103
+ const char h4_start[] = "<h4>";
104
+ const char h4_end[] = "</h4>";
105
+ const char h3_start[] = "<h3>";
106
+ const char h3_end[] = "</h3>";
107
+ const char h2_start[] = "<h2>";
108
+ const char h2_end[] = "</h2>";
109
+ const char h1_start[] = "<h1>";
110
+ const char h1_end[] = "</h1>";
111
+ const char p_start[] = "<p>";
112
+ const char p_end[] = "</p>";
113
+ const char space[] = " ";
114
+ const char a_start[] = "<a href=\"";
115
+ const char a_class[] = "\" class=\"";
116
+ const char a_rel[] = "\" rel=\"";
117
+ const char a_start_close[] = "\">";
118
+ const char a_end[] = "</a>";
119
+ const char link_start[] = "[[";
120
+ const char link_end[] = "]]";
121
+ const char separator[] = "|";
122
+ const char ext_link_start[] = "[";
123
+ const char backtick[] = "`";
124
+ const char quote[] = "\"";
125
+ const char ampersand[] = "&";
126
+ const char quot_entity[] = "&quot;";
127
+ const char amp_entity[] = "&amp;";
128
+ const char lt_entity[] = "&lt;";
129
+ const char gt_entity[] = "&gt;";
130
+ const char escaped_blockquote[] = "&gt; ";
131
+ const char ext_link_end[] = "]";
132
+ const char literal_img_start[] = "{{";
133
+ const char img_start[] = "<img src=\"";
134
+ const char img_end_xml[] = "\" />";
135
+ const char img_end_html[] = "\">";
136
+ const char img_alt[] = "\" alt=\"";
137
+ const char pre_class_start[] = "<pre class=\"";
138
+ const char pre_class_end[] = "-syntax\">";
139
+
140
+ // Mark the parser struct designated by ptr as a participant in Ruby's
141
+ // mark-and-sweep garbage collection scheme. A variable named name is placed on
142
+ // the C stack to prevent the structure from being prematurely collected.
143
+ #define GC_WRAP_PARSER(ptr, name) volatile VALUE name __attribute__((unused)) = Data_Wrap_Struct(rb_cObject, 0, parser_free, ptr)
144
+
145
+ parser_t *parser_new(void)
146
+ {
147
+ parser_t *parser = ALLOC_N(parser_t, 1);
148
+ parser->capture = NULL; // not a real instance, pointer to other member's instance
149
+ parser->output = str_new();
150
+ parser->link_target = str_new();
151
+ parser->link_text = str_new();
152
+ parser->line_ending = NULL; // caller should set up
153
+ parser->tabulation = str_new();
154
+ parser->scope = ary_new();
155
+ parser->line = ary_new();
156
+ parser->line_buffer = ary_new();
157
+ parser->external_link_class = Qnil; // caller should set up
158
+ parser->external_link_rel = Qnil; // caller should set up
159
+ parser->mailto_class = Qnil; // caller should set up
160
+ parser->img_prefix = Qnil; // caller should set up
161
+ parser->output_style = HTML_OUTPUT;
162
+ parser->base_indent = 0;
163
+ parser->current_indent = 0;
164
+ parser->base_heading_level = 0;
165
+ parser->pending_crlf = false;
166
+ parser->autolink = true;
167
+ parser->space_to_underscore = true;
168
+ return parser;
169
+ }
170
+
171
+ void parser_free(parser_t *parser)
172
+ {
173
+ // we don't free parser->capture; it's just a redundant pointer
174
+ if (parser->output) str_free(parser->output);
175
+ if (parser->link_target) str_free(parser->link_target);
176
+ if (parser->link_text) str_free(parser->link_text);
177
+ if (parser->line_ending) str_free(parser->line_ending);
178
+ if (parser->tabulation) str_free(parser->tabulation);
179
+ if (parser->scope) ary_free(parser->scope);
180
+ if (parser->line) ary_free(parser->line);
181
+ if (parser->line_buffer) ary_free(parser->line_buffer);
182
+ free(parser);
183
+ }
184
+
185
+ // for testing and debugging only
186
+ VALUE Wikitext_parser_tokenize(VALUE self, VALUE string)
187
+ {
188
+ if (NIL_P(string))
189
+ return Qnil;
190
+ string = StringValue(string);
191
+ VALUE tokens = rb_ary_new();
192
+ char *p = RSTRING_PTR(string);
193
+ long len = RSTRING_LEN(string);
194
+ char *pe = p + len;
195
+ token_t token;
196
+ next_token(&token, NULL, p, pe);
197
+ rb_ary_push(tokens, wiki_token(&token));
198
+ while (token.type != END_OF_FILE)
199
+ {
200
+ next_token(&token, &token, NULL, pe);
201
+ rb_ary_push(tokens, wiki_token(&token));
202
+ }
203
+ return tokens;
204
+ }
205
+
206
+ // for benchmarking raw tokenization speed only
207
+ VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string)
208
+ {
209
+ if (NIL_P(string))
210
+ return Qnil;
211
+ string = StringValue(string);
212
+ char *p = RSTRING_PTR(string);
213
+ long len = RSTRING_LEN(string);
214
+ char *pe = p + len;
215
+ token_t token;
216
+ next_token(&token, NULL, p, pe);
217
+ while (token.type != END_OF_FILE)
218
+ next_token(&token, &token, NULL, pe);
219
+ return Qnil;
220
+ }
221
+
222
+ VALUE Wikitext_parser_fulltext_tokenize(int argc, VALUE *argv, VALUE self)
223
+ {
224
+ // process arguments
225
+ VALUE string, options;
226
+ if (rb_scan_args(argc, argv, "11", &string, &options) == 1) // 1 mandatory argument, 1 optional argument
227
+ options = Qnil;
228
+ if (NIL_P(string))
229
+ return Qnil;
230
+ string = StringValue(string);
231
+ VALUE tokens = rb_ary_new();
232
+
233
+ // check instance variables
234
+ VALUE min = rb_iv_get(self, "@minimum_fulltext_token_length");
235
+
236
+ // process options hash (can override instance variables)
237
+ if (!NIL_P(options) && TYPE(options) == T_HASH)
238
+ {
239
+ if (rb_funcall(options, rb_intern("has_key?"), 1, ID2SYM(rb_intern("minimum"))) == Qtrue)
240
+ min = rb_hash_aref(options, ID2SYM(rb_intern("minimum")));
241
+ }
242
+ int min_len = NIL_P(min) ? 3 : NUM2INT(min);
243
+ if (min_len < 0)
244
+ min_len = 0;
245
+
246
+ // set up scanner
247
+ char *p = RSTRING_PTR(string);
248
+ long len = RSTRING_LEN(string);
249
+ char *pe = p + len;
250
+ token_t token;
251
+ token_t *_token = &token;
252
+ next_token(&token, NULL, p, pe);
253
+ while (token.type != END_OF_FILE)
254
+ {
255
+ switch (token.type)
256
+ {
257
+ case URI:
258
+ case MAIL:
259
+ case ALNUM:
260
+ if (TOKEN_LEN(_token) >= min_len)
261
+ rb_ary_push(tokens, TOKEN_TEXT(_token));
262
+ break;
263
+ default:
264
+ // ignore everything else
265
+ break;
266
+ }
267
+ next_token(&token, &token, NULL, pe);
268
+ }
269
+ return tokens;
270
+ }
271
+
272
// Lower-case the ASCII letters 'A'..'Z' in the first len bytes of ptr.
//
// The conversion happens "in place", overwriting the original contents of the
// buffer; every other byte is left untouched.
void wiki_downcase_bang(char *ptr, long len)
{
    while (len-- > 0)
    {
        if (*ptr >= 'A' && *ptr <= 'Z')
            *ptr += 'a' - 'A';
        ptr++;
    }
}
281
+
282
+ void wiki_append_entity_from_utf32_char(str_t *output, uint32_t character)
283
+ {
284
+ char hex_string[8] = { '&', '#', 'x', 0, 0, 0, 0, ';' };
285
+ char scratch = (character & 0xf000) >> 12;
286
+ hex_string[3] = (scratch <= 9 ? scratch + 48 : scratch + 87);
287
+ scratch = (character & 0x0f00) >> 8;
288
+ hex_string[4] = (scratch <= 9 ? scratch + 48 : scratch + 87);
289
+ scratch = (character & 0x00f0) >> 4;
290
+ hex_string[5] = (scratch <= 9 ? scratch + 48 : scratch + 87);
291
+ scratch = character & 0x000f;
292
+ hex_string[6] = (scratch <= 9 ? scratch + 48 : scratch + 87);
293
+ str_append(output, hex_string, sizeof(hex_string));
294
+ }
295
+
296
+ // Convert a single UTF-8 codepoint to UTF-32
297
+ //
298
+ // Expects an input buffer, src, containing a UTF-8 encoded character (which
299
+ // may be multi-byte). The end of the input buffer, end, is also passed in to
300
+ // allow the detection of invalidly truncated codepoints. The number of bytes
301
+ // in the UTF-8 character (between 1 and 4) is returned by reference in
302
+ // width_out.
303
+ //
304
+ // Raises a RangeError if the supplied character is invalid UTF-8.
305
+ uint32_t wiki_utf8_to_utf32(char *src, char *end, long *width_out)
306
+ {
307
+ uint32_t dest = 0;
308
+ if ((unsigned char)src[0] <= 0x7f)
309
+ {
310
+ // ASCII
311
+ dest = src[0];
312
+ *width_out = 1;
313
+ }
314
+ else if ((src[0] & 0xe0) == 0xc0)
315
+ {
316
+ // byte starts with 110..... : this should be a two-byte sequence
317
+ if (src + 1 >= end)
318
+ // no second byte
319
+ rb_raise(eWikitextParserError, "invalid encoding: truncated byte sequence");
320
+ else if (((unsigned char)src[0] == 0xc0) ||
321
+ ((unsigned char)src[0] == 0xc1))
322
+ // overlong encoding: lead byte of 110..... but code point <= 127
323
+ rb_raise(eWikitextParserError, "invalid encoding: overlong encoding");
324
+ else if ((src[1] & 0xc0) != 0x80 )
325
+ // should have second byte starting with 10......
326
+ rb_raise(eWikitextParserError, "invalid encoding: malformed byte sequence");
327
+
328
+ dest =
329
+ ((uint32_t)(src[0] & 0x1f)) << 6 |
330
+ (src[1] & 0x3f);
331
+ *width_out = 2;
332
+ }
333
+ else if ((src[0] & 0xf0) == 0xe0)
334
+ {
335
+ // byte starts with 1110.... : this should be a three-byte sequence
336
+ if (src + 2 >= end)
337
+ // missing second or third byte
338
+ rb_raise(eWikitextParserError, "invalid encoding: truncated byte sequence");
339
+ else if (((src[1] & 0xc0) != 0x80 ) ||
340
+ ((src[2] & 0xc0) != 0x80 ))
341
+ // should have second and third bytes starting with 10......
342
+ rb_raise(eWikitextParserError, "invalid encoding: malformed byte sequence");
343
+
344
+ dest =
345
+ ((uint32_t)(src[0] & 0x0f)) << 12 |
346
+ ((uint32_t)(src[1] & 0x3f)) << 6 |
347
+ (src[2] & 0x3f);
348
+ *width_out = 3;
349
+ }
350
+ else if ((src[0] & 0xf8) == 0xf0)
351
+ {
352
+ // bytes starts with 11110... : this should be a four-byte sequence
353
+ if (src + 3 >= end)
354
+ // missing second, third, or fourth byte
355
+ rb_raise(eWikitextParserError, "invalid encoding: truncated byte sequence");
356
+ else if ((unsigned char)src[0] >= 0xf5 &&
357
+ (unsigned char)src[0] <= 0xf7)
358
+ // disallowed by RFC 3629 (codepoints above 0x10ffff)
359
+ rb_raise(eWikitextParserError, "invalid encoding: overlong encoding");
360
+ else if (((src[1] & 0xc0) != 0x80 ) ||
361
+ ((src[2] & 0xc0) != 0x80 ) ||
362
+ ((src[3] & 0xc0) != 0x80 ))
363
+ // should have second and third bytes starting with 10......
364
+ rb_raise(eWikitextParserError, "invalid encoding: malformed byte sequence");
365
+
366
+ dest =
367
+ ((uint32_t)(src[0] & 0x07)) << 18 |
368
+ ((uint32_t)(src[1] & 0x3f)) << 12 |
369
+ ((uint32_t)(src[1] & 0x3f)) << 6 |
370
+ (src[2] & 0x3f);
371
+ *width_out = 4;
372
+ }
373
+ else
374
+ rb_raise(eWikitextParserError, "invalid encoding: unexpected byte");
375
+ return dest;
376
+ }
377
+
378
+ // - non-printable (non-ASCII) characters converted to numeric entities
379
+ // - QUOT and AMP characters converted to named entities
380
+ // - if trim is true, leading and trailing whitespace trimmed
381
+ // - if trim is false, there is no special treatment of spaces
382
+ void wiki_append_sanitized_link_target(str_t *link_target, str_t *output, bool trim)
383
+ {
384
+ char *src = link_target->ptr;
385
+ char *start = src; // remember this so we can check if we're at the start
386
+ char *non_space = output->ptr + output->len; // remember last non-space character output
387
+ char *end = src + link_target->len;
388
+ while (src < end)
389
+ {
390
+ // need at most 8 bytes to display each input character (&#x0000;)
391
+ if (output->ptr + output->len + 8 > output->ptr + output->capacity) // outgrowing buffer, must grow
392
+ {
393
+ char *old_ptr = output->ptr;
394
+ str_grow(output, output->len + (end - src) * 8); // allocate enough for worst case
395
+ if (old_ptr != output->ptr) // may have moved
396
+ non_space += output->ptr - old_ptr;
397
+ }
398
+
399
+ if (*src == '"')
400
+ {
401
+ char quot_entity_literal[] = { '&', 'q', 'u', 'o', 't', ';' }; // no trailing NUL
402
+ str_append(output, quot_entity_literal, sizeof(quot_entity_literal));
403
+ }
404
+ else if (*src == '&')
405
+ {
406
+ char amp_entity_literal[] = { '&', 'a', 'm', 'p', ';' }; // no trailing NUL
407
+ str_append(output, amp_entity_literal, sizeof(amp_entity_literal));
408
+ }
409
+ else if (*src == '<' || *src == '>')
410
+ rb_raise(rb_eRangeError, "invalid link text (\"%c\" may not appear in link text)", *src);
411
+ else if (*src == ' ' && src == start && trim)
412
+ start++; // we eat leading space
413
+ else if (*src >= 0x20 && *src <= 0x7e) // printable ASCII
414
+ {
415
+ *(output->ptr + output->len) = *src;
416
+ output->len++;
417
+ }
418
+ else // all others: must convert to entities
419
+ {
420
+ long width;
421
+ wiki_append_entity_from_utf32_char(output, wiki_utf8_to_utf32(src, end, &width));
422
+ src += width;
423
+ non_space = output->ptr + output->len;
424
+ continue;
425
+ }
426
+ if (*src != ' ')
427
+ non_space = output->ptr + output->len;
428
+ src++;
429
+ }
430
+
431
+ // trim trailing space if necessary
432
+ if (trim && output->ptr + output->len != non_space)
433
+ output->len -= (output->ptr + output->len) - non_space;
434
+ }
435
+
436
+ // prepare hyperlink and append it to parser->output
437
+ // if check_autolink is true, checks parser->autolink to decide whether to emit a real hyperlink
438
+ // or merely the literal link target
439
+ // if link_text is Qnil, the link_target is re-used for the link text
440
+ void wiki_append_hyperlink(parser_t *parser, VALUE link_prefix, str_t *link_target, str_t *link_text, VALUE link_class, VALUE link_rel, bool check_autolink)
441
+ {
442
+ if (check_autolink && !parser->autolink)
443
+ wiki_append_sanitized_link_target(link_target, parser->output, true);
444
+ else
445
+ {
446
+ str_append(parser->output, a_start, sizeof(a_start) - 1); // <a href="
447
+ if (!NIL_P(link_prefix))
448
+ str_append_string(parser->output, link_prefix);
449
+ wiki_append_sanitized_link_target(link_target, parser->output, true);
450
+
451
+ // special handling for mailto URIs
452
+ const char *mailto = "mailto:";
453
+ long mailto_len = (long)sizeof(mailto) - 1; // don't count NUL byte
454
+ if ((link_target->len >= mailto_len &&
455
+ strncmp(mailto, link_target->ptr, mailto_len) == 0) ||
456
+ (!NIL_P(link_prefix) &&
457
+ RSTRING_LEN(link_prefix) >= mailto_len &&
458
+ strncmp(mailto, RSTRING_PTR(link_prefix), mailto_len) == 0))
459
+ link_class = parser->mailto_class; // use mailto_class from parser
460
+ if (link_class != Qnil)
461
+ {
462
+ str_append(parser->output, a_class, sizeof(a_class) - 1); // " class="
463
+ str_append_string(parser->output, link_class);
464
+ }
465
+ if (link_rel != Qnil)
466
+ {
467
+ str_append(parser->output, a_rel, sizeof(a_rel) - 1); // " rel="
468
+ str_append_string(parser->output, link_rel);
469
+ }
470
+ str_append(parser->output, a_start_close, sizeof(a_start_close) - 1); // ">
471
+ if (!link_text || link_text->len == 0) // re-use link_target
472
+ wiki_append_sanitized_link_target(link_target, parser->output, true);
473
+ else
474
+ str_append_str(parser->output, link_text);
475
+ str_append(parser->output, a_end, sizeof(a_end) - 1); // </a>
476
+ }
477
+ }
478
+
479
+ void wiki_append_img(parser_t *parser, char *token_ptr, long token_len)
480
+ {
481
+ str_append(parser->output, img_start, sizeof(img_start) - 1); // <img src="
482
+ if (!NIL_P(parser->img_prefix) && *token_ptr != '/') // len always > 0
483
+ str_append_string(parser->output, parser->img_prefix);
484
+ str_append(parser->output, token_ptr, token_len);
485
+ str_append(parser->output, img_alt, sizeof(img_alt) - 1); // " alt="
486
+ str_append(parser->output, token_ptr, token_len);
487
+ if (parser->output_style == XML_OUTPUT)
488
+ str_append(parser->output, img_end_xml, sizeof(img_end_xml) - 1); // " />
489
+ else
490
+ str_append(parser->output, img_end_html, sizeof(img_end_html) - 1); // ">
491
+ }
492
+
493
+ // will emit indentation only if we are about to emit any of:
494
+ // <blockquote>, <p>, <ul>, <ol>, <li>, <h1> etc, <pre>
495
+ // each time we enter one of those spans must ++ the indentation level
496
+ void wiki_indent(parser_t *parser)
497
+ {
498
+ if (parser->base_indent == -1) // indentation disabled
499
+ return;
500
+ int space_count = parser->current_indent + parser->base_indent;
501
+ if (space_count > 0)
502
+ {
503
+ char *old_end, *new_end;
504
+ if (parser->tabulation->len < space_count)
505
+ str_grow(parser->tabulation, space_count); // reallocates if necessary
506
+ old_end = parser->tabulation->ptr + parser->tabulation->len;
507
+ new_end = parser->tabulation->ptr + space_count;
508
+ while (old_end < new_end)
509
+ *old_end++ = ' ';
510
+ if (space_count > parser->tabulation->len)
511
+ parser->tabulation->len = space_count;
512
+ str_append(parser->output, parser->tabulation->ptr, space_count);
513
+ }
514
+ parser->current_indent += 2;
515
+ }
516
+
517
+ void wiki_append_pre_start(parser_t *parser, token_t *token)
518
+ {
519
+ wiki_indent(parser);
520
+ if ((size_t)TOKEN_LEN(token) > sizeof(pre_start) - 1)
521
+ {
522
+ str_append(parser->output, pre_class_start, sizeof(pre_class_start) - 1); // <pre class="
523
+ str_append(parser->output, token->start + 11, TOKEN_LEN(token) - 13); // (the "lang" substring)
524
+ str_append(parser->output, pre_class_end, sizeof(pre_class_end) - 1); // -syntax">
525
+ }
526
+ else
527
+ str_append(parser->output, pre_start, sizeof(pre_start) - 1);
528
+ ary_push(parser->scope, PRE_START);
529
+ ary_push(parser->line, PRE_START);
530
+ }
531
+
532
+ void wiki_dedent(parser_t *parser, bool emit)
533
+ {
534
+ if (parser->base_indent == -1) // indentation disabled
535
+ return;
536
+ parser->current_indent -= 2;
537
+ if (!emit)
538
+ return;
539
+ int space_count = parser->current_indent + parser->base_indent;
540
+ if (space_count > 0)
541
+ str_append(parser->output, parser->tabulation->ptr, space_count);
542
+ }
543
+
544
+ // Pops a single item off the parser's scope stack.
545
+ // A corresponding closing tag is written to the target string.
546
+ // The target string may be the main output buffer, or a substring capturing buffer if a link is being scanned.
547
+ void wiki_pop_from_stack(parser_t *parser, str_t *target)
548
+ {
549
+ int top = ary_entry(parser->scope, -1);
550
+ if (NO_ITEM(top))
551
+ return;
552
+ if (!target)
553
+ target = parser->output;
554
+
555
+ // for headings, take base_heading_level into account
556
+ if (top >= H1_START && top <= H6_START)
557
+ {
558
+ top += parser->base_heading_level;
559
+ // no need to check for underflow (base_heading_level is never negative)
560
+ if (top > H6_START)
561
+ top = H6_START;
562
+ }
563
+
564
+ switch (top)
565
+ {
566
+ case PRE:
567
+ case PRE_START:
568
+ str_append(target, pre_end, sizeof(pre_end) - 1);
569
+ str_append_str(target, parser->line_ending);
570
+ wiki_dedent(parser, false);
571
+ break;
572
+
573
+ case BLOCKQUOTE:
574
+ case BLOCKQUOTE_START:
575
+ wiki_dedent(parser, true);
576
+ str_append(target, blockquote_end, sizeof(blockquote_end) - 1);
577
+ str_append_str(target, parser->line_ending);
578
+ break;
579
+
580
+ case NO_WIKI_START:
581
+ // not a real HTML tag; so nothing to pop
582
+ break;
583
+
584
+ case STRONG:
585
+ case STRONG_START:
586
+ str_append(target, strong_end, sizeof(strong_end) - 1);
587
+ break;
588
+
589
+ case EM:
590
+ case EM_START:
591
+ str_append(target, em_end, sizeof(em_end) - 1);
592
+ break;
593
+
594
+ case TT:
595
+ case TT_START:
596
+ str_append(target, code_end, sizeof(code_end) - 1);
597
+ break;
598
+
599
+ case OL:
600
+ wiki_dedent(parser, true);
601
+ str_append(target, ol_end, sizeof(ol_end) - 1);
602
+ str_append_str(target, parser->line_ending);
603
+ break;
604
+
605
+ case UL:
606
+ wiki_dedent(parser, true);
607
+ str_append(target, ul_end, sizeof(ul_end) - 1);
608
+ str_append_str(target, parser->line_ending);
609
+ break;
610
+
611
+ case NESTED_LIST:
612
+ // next token to pop will be a LI
613
+ // LI is an interesting token because sometimes we want it to behave like P (ie. do a non-emitting indent)
614
+ // and other times we want it to behave like BLOCKQUOTE (ie. when it has a nested list inside)
615
+ // hence this hack: we do an emitting dedent on behalf of the LI that we know must be coming
616
+ // and then when we pop the actual LI itself (below) we do the standard non-emitting indent
617
+ wiki_dedent(parser, true); // we really only want to emit the spaces
618
+ parser->current_indent += 2; // we don't want to decrement the actual indent level, so put it back
619
+ break;
620
+
621
+ case LI:
622
+ str_append(target, li_end, sizeof(li_end) - 1);
623
+ str_append_str(target, parser->line_ending);
624
+ wiki_dedent(parser, false);
625
+ break;
626
+
627
+ case H6_START:
628
+ str_append(target, h6_end, sizeof(h6_end) - 1);
629
+ str_append_str(target, parser->line_ending);
630
+ wiki_dedent(parser, false);
631
+ break;
632
+
633
+ case H5_START:
634
+ str_append(target, h5_end, sizeof(h5_end) - 1);
635
+ str_append_str(target, parser->line_ending);
636
+ wiki_dedent(parser, false);
637
+ break;
638
+
639
+ case H4_START:
640
+ str_append(target, h4_end, sizeof(h4_end) - 1);
641
+ str_append_str(target, parser->line_ending);
642
+ wiki_dedent(parser, false);
643
+ break;
644
+
645
+ case H3_START:
646
+ str_append(target, h3_end, sizeof(h3_end) - 1);
647
+ str_append_str(target, parser->line_ending);
648
+ wiki_dedent(parser, false);
649
+ break;
650
+
651
+ case H2_START:
652
+ str_append(target, h2_end, sizeof(h2_end) - 1);
653
+ str_append_str(target, parser->line_ending);
654
+ wiki_dedent(parser, false);
655
+ break;
656
+
657
+ case H1_START:
658
+ str_append(target, h1_end, sizeof(h1_end) - 1);
659
+ str_append_str(target, parser->line_ending);
660
+ wiki_dedent(parser, false);
661
+ break;
662
+
663
+ case LINK_START:
664
+ // not an HTML tag; so nothing to emit
665
+ break;
666
+
667
+ case EXT_LINK_START:
668
+ // not an HTML tag; so nothing to emit
669
+ break;
670
+
671
+ case PATH:
672
+ // not an HTML tag; so nothing to emit
673
+ break;
674
+
675
+ case SPACE:
676
+ // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
677
+ break;
678
+
679
+ case SEPARATOR:
680
+ // not an HTML tag (only used to separate an external link target from the link text); so nothing to emit
681
+ break;
682
+
683
+ case P:
684
+ str_append(target, p_end, sizeof(p_end) - 1);
685
+ str_append_str(target, parser->line_ending);
686
+ wiki_dedent(parser, false);
687
+ break;
688
+
689
+ case END_OF_FILE:
690
+ // nothing to do
691
+ break;
692
+
693
+ default:
694
+ // should probably raise an exception here
695
+ break;
696
+ }
697
+ ary_pop(parser->scope);
698
+ }
699
+
700
+ // Pops items off the top of parser's scope stack, accumulating closing tags for them into the target string, until item is reached.
701
+ // If including is true then the item itself is also popped.
702
+ // The target string may be the main output buffer, or a substring capturing buffer when scanning links.
703
+ void wiki_pop_from_stack_up_to(parser_t *parser, str_t *target, int item, bool including)
704
+ {
705
+ int continue_looping = 1;
706
+ do
707
+ {
708
+ int top = ary_entry(parser->scope, -1);
709
+ if (NO_ITEM(top))
710
+ return;
711
+ if (top == item)
712
+ {
713
+ if (!including)
714
+ return;
715
+ continue_looping = 0;
716
+ }
717
+ wiki_pop_from_stack(parser, target);
718
+ } while (continue_looping);
719
+ }
720
+
721
+ void wiki_pop_all_from_stack(parser_t *parser)
722
+ {
723
+ for (int i = 0, max = parser->scope->count; i < max; i++)
724
+ wiki_pop_from_stack(parser, NULL);
725
+ }
726
+
727
+ void wiki_start_para_if_necessary(parser_t *parser)
728
+ {
729
+ if (parser->capture)
730
+ return;
731
+
732
+ // if no block open yet, or top of stack is BLOCKQUOTE/BLOCKQUOTE_START (with nothing in it yet)
733
+ if (parser->scope->count == 0 ||
734
+ ary_entry(parser->scope, -1) == BLOCKQUOTE ||
735
+ ary_entry(parser->scope, -1) == BLOCKQUOTE_START)
736
+ {
737
+ wiki_indent(parser);
738
+ str_append(parser->output, p_start, sizeof(p_start) - 1);
739
+ ary_push(parser->scope, P);
740
+ ary_push(parser->line, P);
741
+ }
742
+ else if (parser->pending_crlf)
743
+ {
744
+ if (IN(P))
745
+ // already in a paragraph block; convert pending CRLF into a space
746
+ str_append(parser->output, space, sizeof(space) - 1);
747
+ else if (IN(PRE))
748
+ // PRE blocks can have pending CRLF too (helps us avoid emitting the trailing newline)
749
+ str_append_str(parser->output, parser->line_ending);
750
+ }
751
+ parser->pending_crlf = false;
752
+ }
753
+
754
+ void wiki_emit_pending_crlf_if_necessary(parser_t *parser)
755
+ {
756
+ if (parser->pending_crlf)
757
+ {
758
+ str_append_str(parser->output, parser->line_ending);
759
+ parser->pending_crlf = false;
760
+ }
761
+ }
762
+
763
+ // Helper function that pops any excess elements off scope (pushing is already handled in the respective rules).
764
+ // For example, given input like:
765
+ //
766
+ // > > foo
767
+ // bar
768
+ //
769
+ // Upon seeing "bar", we want to pop two BLOCKQUOTE elements from the scope.
770
+ // The reverse case (shown below) is handled from inside the BLOCKQUOTE rule itself:
771
+ //
772
+ // foo
773
+ // > > bar
774
+ //
775
+ // Things are made slightly more complicated by the fact that there is one block-level tag that can be on the scope
776
+ // but not on the line scope:
777
+ //
778
+ // <blockquote>foo
779
+ // bar</blockquote>
780
+ //
781
+ // Here on seeing "bar" we have one item on the scope (BLOCKQUOTE_START) which we don't want to pop, but we have nothing
782
+ // on the line scope.
783
+ // Luckily, BLOCKQUOTE_START tokens can only appear at the start of the scope array, so we can check for them first before
784
+ // entering the for loop.
785
+ void wiki_pop_excess_elements(parser_t *parser)
786
+ {
787
+ if (parser->capture)
788
+ return;
789
+ for (int i = parser->scope->count - ary_count(parser->scope, BLOCKQUOTE_START), j = parser->line->count; i > j; i--)
790
+ {
791
+ // special case for last item on scope
792
+ if (i - j == 1)
793
+ {
794
+ // don't auto-pop P if it is only item on scope
795
+ if (ary_entry(parser->scope, -1) == P)
796
+ {
797
+ // add P to the line scope to prevent us entering the loop at all next time around
798
+ ary_push(parser->line, P);
799
+ continue;
800
+ }
801
+ }
802
+ wiki_pop_from_stack(parser, NULL);
803
+ }
804
+ }
805
+
806
+ // trim parser->link_text in place
807
+ void wiki_trim_link_text(parser_t *parser)
808
+ {
809
+ char *src = parser->link_text->ptr;
810
+ char *start = src; // remember this so we can check if we're at the start
811
+ char *left = src;
812
+ char *non_space = src; // remember last non-space character output
813
+ char *end = src + parser->link_text->len;
814
+ while (src < end)
815
+ {
816
+ if (*src == ' ')
817
+ {
818
+ if (src == left)
819
+ left++;
820
+ }
821
+ else
822
+ non_space = src;
823
+ src++;
824
+ }
825
+ if (left != start || non_space + 1 != end)
826
+ {
827
+ // TODO: could potentially avoid this memmove by extending the str_t struct with an "offset" or "free" member
828
+ parser->link_text->len = (non_space + 1) - left;
829
+ memmove(parser->link_text->ptr, left, parser->link_text->len);
830
+ }
831
+ }
832
+
833
+ VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string)
834
+ {
835
+ str_t *link_target = str_new_from_string(string);
836
+ GC_WRAP_STR(link_target, link_target_gc);
837
+ str_t *output = str_new();
838
+ GC_WRAP_STR(output, output_gc);
839
+ wiki_append_sanitized_link_target(link_target, output, true);
840
+ return string_from_str(output);
841
+ }
842
+
843
+ // Encodes the parser link_target member (in-place) according to RFCs 2396 and 2718
844
+ //
845
+ // Leading and trailing whitespace trimmed. Spaces are converted to
846
+ // underscores if the parser space_to_underscore member is true.
847
+ static void wiki_encode_link_target(parser_t *parser)
848
+ {
849
+ char *src = parser->link_target->ptr;
850
+ char *start = src; // remember this so we can check if we're at the start
851
+ long len = parser->link_target->len;
852
+ if (!(len > 0))
853
+ return;
854
+ char *end = src + len;
855
+ long dest_len = len * 2;
856
+ char *dest = ALLOC_N(char, dest_len);
857
+ char *dest_ptr = dest; // hang on to this so we can pass it to free() later
858
+ char *non_space = dest; // remember last non-space character output
859
+ static char hex[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
860
+ for (; src < end; src++)
861
+ {
862
+ // worst case: a single character may grow to 3 characters once encoded
863
+ if ((dest + 3) > (dest_ptr + dest_len))
864
+ {
865
+ // outgrowing buffer, must reallocate
866
+ char *old_dest = dest;
867
+ char *old_dest_ptr = dest_ptr;
868
+ dest_len += len;
869
+ dest = realloc(dest_ptr, dest_len);
870
+ if (dest == NULL)
871
+ {
872
+ // would have used reallocf, but this has to run on Linux too, not just Darwin
873
+ free(dest_ptr);
874
+ rb_raise(rb_eNoMemError, "failed to re-allocate temporary storage (memory allocation error)");
875
+ }
876
+ dest_ptr = dest;
877
+ dest = dest_ptr + (old_dest - old_dest_ptr);
878
+ non_space = dest_ptr + (non_space - old_dest_ptr);
879
+ }
880
+
881
+ // pass through unreserved characters
882
+ if ((*src >= 'a' && *src <= 'z') ||
883
+ (*src >= 'A' && *src <= 'Z') ||
884
+ (*src >= '0' && *src <= '9') ||
885
+ *src == '-' ||
886
+ *src == '_' ||
887
+ *src == '.' ||
888
+ *src == '~')
889
+ {
890
+ *dest++ = *src;
891
+ non_space = dest;
892
+ }
893
+ else if (*src == ' ' && src == start)
894
+ start++; // we eat leading space
895
+ else if (*src == ' ' && parser->space_to_underscore)
896
+ *dest++ = '_';
897
+ else // everything else gets URL-encoded
898
+ {
899
+ *dest++ = '%';
900
+ *dest++ = hex[(unsigned char)(*src) / 16]; // left
901
+ *dest++ = hex[(unsigned char)(*src) % 16]; // right
902
+ if (*src != ' ')
903
+ non_space = dest;
904
+ }
905
+ }
906
+
907
+ // trim trailing space if necessary
908
+ if (non_space > dest_ptr && dest != non_space)
909
+ dest_len = non_space - dest_ptr;
910
+ else
911
+ dest_len = dest - dest_ptr;
912
+ str_clear(parser->link_target);
913
+ str_append(parser->link_target, dest_ptr, dest_len);
914
+ free(dest_ptr);
915
+ }
916
+
917
+ VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in)
918
+ {
919
+ parser_t parser;
920
+ parser.space_to_underscore = false;
921
+ parser.link_target = str_new_from_string(in);
922
+ GC_WRAP_STR(parser.link_target, link_target_gc);
923
+ wiki_encode_link_target(&parser);
924
+ return string_from_str(parser.link_target);
925
+ }
926
+
927
+ // returns 1 (true) if supplied string is blank (nil, empty, or all whitespace)
928
+ // returns 0 (false) otherwise
929
+ bool wiki_blank(str_t *str)
930
+ {
931
+ if (str->len == 0)
932
+ return true;
933
+ for (char *ptr = str->ptr,
934
+ *end = str->ptr + str->len;
935
+ ptr < end; ptr++)
936
+ {
937
+ if (*ptr != ' ')
938
+ return false;
939
+ }
940
+ return true;
941
+ }
942
+
943
+ void wiki_rollback_failed_internal_link(parser_t *parser)
944
+ {
945
+ if (!IN(LINK_START))
946
+ return; // nothing to do!
947
+ int scope_includes_separator = IN(SEPARATOR);
948
+ wiki_pop_from_stack_up_to(parser, NULL, LINK_START, true);
949
+ str_append(parser->output, link_start, sizeof(link_start) - 1);
950
+ if (parser->link_target->len > 0)
951
+ {
952
+ wiki_append_sanitized_link_target(parser->link_target, parser->output, false);
953
+ if (scope_includes_separator)
954
+ {
955
+ str_append(parser->output, separator, sizeof(separator) - 1);
956
+ if (parser->link_text->len > 0)
957
+ str_append_str(parser->output, parser->link_text);
958
+ }
959
+ }
960
+ parser->capture = NULL;
961
+ str_clear(parser->link_target);
962
+ str_clear(parser->link_text);
963
+ }
964
+
965
+ void wiki_rollback_failed_external_link(parser_t *parser)
966
+ {
967
+ if (!IN(EXT_LINK_START))
968
+ return; // nothing to do!
969
+
970
+ // store a couple of values before popping
971
+ int scope_includes_space = IN(SPACE);
972
+ VALUE link_class = IN(PATH) ? Qnil : parser->external_link_class;
973
+ VALUE link_rel = IN(PATH) ? Qnil : parser->external_link_rel;
974
+ wiki_pop_from_stack_up_to(parser, NULL, EXT_LINK_START, true);
975
+
976
+ str_append(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
977
+ if (parser->link_target->len > 0)
978
+ {
979
+ wiki_append_hyperlink(parser, Qnil, parser->link_target, NULL, link_class, link_rel, true);
980
+ if (scope_includes_space)
981
+ {
982
+ str_append(parser->output, space, sizeof(space) - 1);
983
+ if (parser->link_text->len > 0)
984
+ str_append_str(parser->output, parser->link_text);
985
+ }
986
+ }
987
+ parser->capture = NULL;
988
+ str_clear(parser->link_target);
989
+ str_clear(parser->link_text);
990
+ }
991
+
992
+ void wiki_rollback_failed_link(parser_t *parser)
993
+ {
994
+ wiki_rollback_failed_internal_link(parser);
995
+ wiki_rollback_failed_external_link(parser);
996
+ }
997
+
998
+ VALUE Wikitext_parser_initialize(int argc, VALUE *argv, VALUE self)
999
+ {
1000
+ // process arguments
1001
+ VALUE options;
1002
+ if (rb_scan_args(argc, argv, "01", &options) == 0) // 0 mandatory arguments, 1 optional argument
1003
+ options = Qnil;
1004
+
1005
+ // defaults
1006
+ VALUE autolink = Qtrue;
1007
+ VALUE line_ending = rb_str_new2("\n");
1008
+ VALUE external_link_class = rb_str_new2("external");
1009
+ VALUE external_link_rel = Qnil;
1010
+ VALUE mailto_class = rb_str_new2("mailto");
1011
+ VALUE link_proc = Qnil;
1012
+ VALUE internal_link_prefix = rb_str_new2("/wiki/");
1013
+ VALUE img_prefix = rb_str_new2("/images/");
1014
+ VALUE output_style = ID2SYM(rb_intern("html"));
1015
+ VALUE space_to_underscore = Qtrue;
1016
+ VALUE minimum_fulltext_token_length = INT2NUM(3);
1017
+ VALUE base_heading_level = INT2NUM(0);
1018
+
1019
+ // process options hash (override defaults)
1020
+ if (!NIL_P(options) && TYPE(options) == T_HASH)
1021
+ {
1022
+ #define OVERRIDE_IF_SET(name) rb_funcall(options, rb_intern("has_key?"), 1, ID2SYM(rb_intern(#name))) == Qtrue ? \
1023
+ rb_hash_aref(options, ID2SYM(rb_intern(#name))) : name
1024
+ autolink = OVERRIDE_IF_SET(autolink);
1025
+ line_ending = OVERRIDE_IF_SET(line_ending);
1026
+ external_link_class = OVERRIDE_IF_SET(external_link_class);
1027
+ external_link_rel = OVERRIDE_IF_SET(external_link_rel);
1028
+ mailto_class = OVERRIDE_IF_SET(mailto_class);
1029
+ link_proc = OVERRIDE_IF_SET(link_proc);
1030
+ internal_link_prefix = OVERRIDE_IF_SET(internal_link_prefix);
1031
+ img_prefix = OVERRIDE_IF_SET(img_prefix);
1032
+ output_style = OVERRIDE_IF_SET(output_style);
1033
+ space_to_underscore = OVERRIDE_IF_SET(space_to_underscore);
1034
+ minimum_fulltext_token_length = OVERRIDE_IF_SET(minimum_fulltext_token_length);
1035
+ base_heading_level = OVERRIDE_IF_SET(base_heading_level);
1036
+ }
1037
+
1038
+ // no need to call super here; rb_call_super()
1039
+ rb_iv_set(self, "@autolink", autolink);
1040
+ rb_iv_set(self, "@line_ending", line_ending);
1041
+ rb_iv_set(self, "@external_link_class", external_link_class);
1042
+ rb_iv_set(self, "@external_link_rel", external_link_rel);
1043
+ rb_iv_set(self, "@mailto_class", mailto_class);
1044
+ rb_iv_set(self, "@link_proc", link_proc);
1045
+ rb_iv_set(self, "@internal_link_prefix", internal_link_prefix);
1046
+ rb_iv_set(self, "@img_prefix", img_prefix);
1047
+ rb_iv_set(self, "@output_style", output_style);
1048
+ rb_iv_set(self, "@space_to_underscore", space_to_underscore);
1049
+ rb_iv_set(self, "@minimum_fulltext_token_length", minimum_fulltext_token_length);
1050
+ rb_iv_set(self, "@base_heading_level", base_heading_level);
1051
+ return self;
1052
+ }
1053
+
1054
+ VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string)
1055
+ {
1056
+ for (int i = 0; i < 100000; i++)
1057
+ Wikitext_parser_parse(1, &string, self);
1058
+ return Qnil;
1059
+ }
1060
+
1061
+ // convert a Ruby object (:xml, :html etc) into an int output style
1062
+ int Wikitext_output_style(VALUE output)
1063
+ {
1064
+ if (TYPE(output) == T_SYMBOL)
1065
+ {
1066
+ if (SYM2ID(output) == rb_intern("xml"))
1067
+ return XML_OUTPUT;
1068
+ }
1069
+ return HTML_OUTPUT; // fall back to default
1070
+ }
1071
+
1072
+ VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self)
1073
+ {
1074
+ // process arguments
1075
+ VALUE string, options;
1076
+ if (rb_scan_args(argc, argv, "11", &string, &options) == 1) // 1 mandatory argument, 1 optional argument
1077
+ options = Qnil;
1078
+ if (NIL_P(string))
1079
+ return Qnil;
1080
+ string = StringValue(string);
1081
+
1082
+ // access these once per parse
1083
+ VALUE line_ending = rb_iv_get(self, "@line_ending");
1084
+ line_ending = StringValue(line_ending);
1085
+ VALUE link_class = rb_iv_get(self, "@external_link_class");
1086
+ link_class = NIL_P(link_class) ? Qnil : StringValue(link_class);
1087
+ VALUE link_rel = rb_iv_get(self, "@external_link_rel");
1088
+ link_rel = NIL_P(link_rel) ? Qnil : StringValue(link_rel);
1089
+ VALUE link_proc = rb_iv_get(self, "@link_proc");
1090
+ VALUE mailto_class = rb_iv_get(self, "@mailto_class");
1091
+ mailto_class = NIL_P(mailto_class) ? Qnil : StringValue(mailto_class);
1092
+ VALUE prefix = rb_iv_get(self, "@internal_link_prefix");
1093
+ int output_style = Wikitext_output_style(rb_iv_get(self, "@output_style"));
1094
+
1095
+ // process options hash
1096
+ int base_indent = 0;
1097
+ int base_heading_level = NUM2INT(rb_iv_get(self, "@base_heading_level"));
1098
+ if (!NIL_P(options) && TYPE(options) == T_HASH)
1099
+ {
1100
+ // :indent => 0 (or more)
1101
+ ID has_key = rb_intern("has_key?");
1102
+ ID id = ID2SYM(rb_intern("indent"));
1103
+ if (rb_funcall(options, has_key, 1, id) == Qtrue)
1104
+ {
1105
+ VALUE indent = rb_hash_aref(options, id);
1106
+ if (indent == Qfalse)
1107
+ base_indent = -1; // indentation disabled
1108
+ else
1109
+ {
1110
+ base_indent = NUM2INT(indent);
1111
+ if (base_indent < 0)
1112
+ base_indent = 0;
1113
+ }
1114
+ }
1115
+
1116
+ // :base_heading_level => 0/1/2/3/4/5/6
1117
+ id = ID2SYM(rb_intern("base_heading_level"));
1118
+ if (rb_funcall(options, has_key, 1, id) == Qtrue)
1119
+ base_heading_level = NUM2INT(rb_hash_aref(options, id));
1120
+
1121
+ // :external_link_rel => 'nofollow'
1122
+ id = ID2SYM(rb_intern("external_link_rel"));
1123
+ if (rb_funcall(options, has_key, 1, id) == Qtrue)
1124
+ {
1125
+ link_rel = rb_hash_aref(options, id);
1126
+ link_rel = NIL_P(link_rel) ? Qnil : StringValue(link_rel);
1127
+ }
1128
+
1129
+ // :output_style => :html/:xml
1130
+ id = ID2SYM(rb_intern("output_style"));
1131
+ if (rb_funcall(options, has_key, 1, id) == Qtrue)
1132
+ output_style = Wikitext_output_style(rb_hash_aref(options, id));
1133
+
1134
+ // :link_proc => lambda { |link_target| ... }
1135
+ id = ID2SYM(rb_intern("link_proc"));
1136
+ if (rb_funcall(options, has_key, 1, id) == Qtrue)
1137
+ link_proc = rb_hash_aref(options, id);
1138
+ }
1139
+
1140
+ // normalize, regardless of whether this came from instance variable or override
1141
+ if (base_heading_level < 0)
1142
+ base_heading_level = 0;
1143
+ if (base_heading_level > 6)
1144
+ base_heading_level = 6;
1145
+
1146
+ // set up scanner
1147
+ char *p = RSTRING_PTR(string);
1148
+ long len = RSTRING_LEN(string);
1149
+ char *pe = p + len;
1150
+
1151
+ // set up parser struct to make passing parameters a little easier
1152
+ parser_t *parser = parser_new();
1153
+ GC_WRAP_PARSER(parser, parser_gc);
1154
+ parser->external_link_class = link_class;
1155
+ parser->external_link_rel = link_rel;
1156
+ parser->mailto_class = mailto_class;
1157
+ parser->img_prefix = rb_iv_get(self, "@img_prefix");
1158
+ parser->autolink = rb_iv_get(self, "@autolink") == Qtrue ? true : false;
1159
+ parser->space_to_underscore = rb_iv_get(self, "@space_to_underscore") == Qtrue ? true : false;
1160
+ parser->line_ending = str_new_from_string(line_ending);
1161
+ parser->base_indent = base_indent;
1162
+ parser->base_heading_level = base_heading_level;
1163
+ parser->output_style = output_style;
1164
+
1165
+ // this simple looping design leads to a single enormous function,
1166
+ // but it's faster than doing actual recursive descent and also secure in the face of
1167
+ // malicious input that seeks to overflow the stack
1168
+ // (with "<blockquote><blockquote><blockquote>..." times by 10,000, for example)
1169
+ // given that we expect to deal with a lot of malformed input, a recursive descent design is less appropriate
1170
+ // than a straightforward looping translator like this one anyway
1171
+ token_t _token;
1172
+ _token.type = NO_TOKEN;
1173
+ token_t *token = NULL;
1174
+ do
1175
+ {
1176
+ // note that whenever we grab a token we push it into the line buffer
1177
+ // this provides us with context-sensitive "memory" of what's been seen so far on this line
1178
+ #define NEXT_TOKEN() token = &_token, next_token(token, token, NULL, pe), ary_push(parser->line_buffer, token->type)
1179
+
1180
+ // check to see if we have a token hanging around from a previous iteration of this loop
1181
+ if (token == NULL)
1182
+ {
1183
+ if (_token.type == NO_TOKEN)
1184
+ {
1185
+ // first time here (haven't started scanning yet)
1186
+ token = &_token;
1187
+ next_token(token, NULL, p, pe);
1188
+ ary_push(parser->line_buffer, token->type);
1189
+ }
1190
+ else
1191
+ // already scanning
1192
+ NEXT_TOKEN();
1193
+ }
1194
+ int type = token->type;
1195
+
1196
+ // can't declare new variables inside a switch statement, so predeclare them here
1197
+ long remove_strong = -1;
1198
+ long remove_em = -1;
1199
+
1200
+ // general purpose counters, flags and pointers
1201
+ long i = 0;
1202
+ long j = 0;
1203
+ long k = 0;
1204
+ str_t *output = NULL;
1205
+ str_t _token_str;
1206
+ str_t *token_str = &_token_str;
1207
+
1208
+ // The following giant switch statement contains cases for all the possible token types.
1209
+ // In the most basic sense we are emitting the HTML that corresponds to each token,
1210
+ // but some tokens require context information in order to decide what to output.
1211
+ // For example, does the STRONG token (''') translate to <strong> or </strong>?
1212
+ // So when looking at any given token we have three state-maintaining variables which gives us a notion of "where we are":
1213
+ //
1214
+ // - the "scope" stack (indicates what HTML DOM structures we are currently nested inside, similar to a CSS selector)
1215
+ // - the line buffer (records tokens seen so far on the current line)
1216
+ // - the line "scope" stack (indicates what the scope should be based only on what is visible on the line so far)
1217
+ //
1218
+ // Although this is fairly complicated, there is one key simplifying factor:
1219
+ // The translator continuously performs auto-correction, and this means that we always have a guarantee that the
1220
+ // scope stack (up to the current token) is valid; our translator can take this as a given.
1221
+ // Auto-correction basically consists of inserting missing tokens (preventing subsequent HTML from being messed up),
1222
+ // or converting illegal (unexpected) tokens to their plain text equivalents (providing visual feedback to Wikitext author).
1223
+ switch (type)
1224
+ {
1225
+ case PRE:
1226
+ if (IN_EITHER_OF(NO_WIKI_START, PRE_START))
1227
+ {
1228
+ str_append(parser->output, space, sizeof(space) - 1);
1229
+ break;
1230
+ }
1231
+ else if (IN(BLOCKQUOTE_START))
1232
+ {
1233
+ // this kind of nesting not allowed (to avoid user confusion)
1234
+ wiki_pop_excess_elements(parser);
1235
+ wiki_start_para_if_necessary(parser);
1236
+ output = parser->capture ? parser->capture : parser->output;
1237
+ str_append(output, space, sizeof(space) - 1);
1238
+ break;
1239
+ }
1240
+
1241
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1242
+ ary_push(parser->line, PRE);
1243
+ i = ary_count(parser->line, BLOCKQUOTE);
1244
+ j = ary_count(parser->scope, BLOCKQUOTE);
1245
+ if (i < j)
1246
+ {
1247
+ // must pop (reduce nesting level)
1248
+ for (i = j - i; i > 0; i--)
1249
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, true);
1250
+ }
1251
+
1252
+ if (!IN(PRE))
1253
+ {
1254
+ parser->pending_crlf = false;
1255
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, false);
1256
+ wiki_indent(parser);
1257
+ str_append(parser->output, pre_start, sizeof(pre_start) - 1);
1258
+ ary_push(parser->scope, PRE);
1259
+ }
1260
+ break;
1261
+
1262
+ case PRE_START:
1263
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1264
+ {
1265
+ wiki_emit_pending_crlf_if_necessary(parser);
1266
+ str_append(parser->output, escaped_pre_start, sizeof(escaped_pre_start) - 1);
1267
+ }
1268
+ else if (IN(BLOCKQUOTE_START))
1269
+ {
1270
+ wiki_rollback_failed_link(parser); // if any
1271
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE_START, false);
1272
+ wiki_append_pre_start(parser, token);
1273
+ }
1274
+ else if (IN(BLOCKQUOTE))
1275
+ {
1276
+ if (token->column_start == 1) // only allowed in first column
1277
+ {
1278
+ wiki_rollback_failed_link(parser); // if any
1279
+ wiki_pop_all_from_stack(parser);
1280
+ wiki_append_pre_start(parser, token);
1281
+ }
1282
+ else // PRE_START illegal here
1283
+ {
1284
+ output = parser->capture ? parser->capture : parser->output;
1285
+ wiki_pop_excess_elements(parser);
1286
+ wiki_start_para_if_necessary(parser);
1287
+ str_append(output, escaped_pre_start, sizeof(escaped_pre_start) - 1);
1288
+ }
1289
+ }
1290
+ else
1291
+ {
1292
+ wiki_rollback_failed_link(parser); // if any
1293
+ wiki_pop_from_stack_up_to(parser, NULL, P, true);
1294
+ wiki_append_pre_start(parser, token);
1295
+ }
1296
+ break;
1297
+
1298
+ case PRE_END:
1299
+ if (IN_EITHER_OF(NO_WIKI_START, PRE))
1300
+ {
1301
+ wiki_emit_pending_crlf_if_necessary(parser);
1302
+ str_append(parser->output, escaped_pre_end, sizeof(escaped_pre_end) - 1);
1303
+ }
1304
+ else
1305
+ {
1306
+ if (IN(PRE_START))
1307
+ wiki_pop_from_stack_up_to(parser, parser->output, PRE_START, true);
1308
+ else
1309
+ {
1310
+ output = parser->capture ? parser->capture : parser->output;
1311
+ wiki_pop_excess_elements(parser);
1312
+ wiki_start_para_if_necessary(parser);
1313
+ str_append(output, escaped_pre_end, sizeof(escaped_pre_end) - 1);
1314
+ }
1315
+ }
1316
+ break;
1317
+
1318
+ case BLOCKQUOTE:
1319
+ if (IN_EITHER_OF(NO_WIKI_START, PRE_START))
1320
+ // no need to check for <pre>; can never appear inside it
1321
+ str_append(parser->output, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit "&gt;" or "&gt; "
1322
+ else if (IN(BLOCKQUOTE_START))
1323
+ {
1324
+ // this kind of nesting not allowed (to avoid user confusion)
1325
+ wiki_pop_excess_elements(parser);
1326
+ wiki_start_para_if_necessary(parser);
1327
+ output = parser->capture ? parser->capture : parser->output;
1328
+ str_append(output, escaped_blockquote, TOKEN_LEN(token) + 3); // will either emit "&gt;" or "&gt; "
1329
+ break;
1330
+ }
1331
+ else
1332
+ {
1333
+ ary_push(parser->line, BLOCKQUOTE);
1334
+
1335
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1336
+ i = ary_count(parser->line, BLOCKQUOTE);
1337
+ j = ary_count(parser->scope, BLOCKQUOTE);
1338
+
1339
+ // given that BLOCKQUOTE tokens can be nested, peek ahead and see if there are any more which might affect the decision to push or pop
1340
+ while (NEXT_TOKEN(), (token->type == BLOCKQUOTE))
1341
+ {
1342
+ ary_push(parser->line, BLOCKQUOTE);
1343
+ i++;
1344
+ }
1345
+
1346
+ // now decide whether to push, pop or do nothing
1347
+ if (i > j)
1348
+ {
1349
+ // must push (increase nesting level)
1350
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, false);
1351
+ for (i = i - j; i > 0; i--)
1352
+ {
1353
+ wiki_indent(parser);
1354
+ str_append(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1355
+ str_append_str(parser->output, parser->line_ending);
1356
+ ary_push(parser->scope, BLOCKQUOTE);
1357
+ }
1358
+ }
1359
+ else if (i < j)
1360
+ {
1361
+ // must pop (reduce nesting level)
1362
+ for (i = j - i; i > 0; i--)
1363
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, true);
1364
+ }
1365
+
1366
+ // jump to top of the loop to process token we scanned during lookahead
1367
+ continue;
1368
+ }
1369
+ break;
1370
+
1371
+ case BLOCKQUOTE_START:
1372
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1373
+ {
1374
+ wiki_emit_pending_crlf_if_necessary(parser);
1375
+ str_append(parser->output, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
1376
+ }
1377
+ else if (IN(BLOCKQUOTE_START))
1378
+ {
1379
+ // nesting is fine here
1380
+ wiki_rollback_failed_link(parser); // if any
1381
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE_START, false);
1382
+ wiki_indent(parser);
1383
+ str_append(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1384
+ str_append_str(parser->output, parser->line_ending);
1385
+ ary_push(parser->scope, BLOCKQUOTE_START);
1386
+ ary_push(parser->line, BLOCKQUOTE_START);
1387
+ }
1388
+ else if (IN(BLOCKQUOTE))
1389
+ {
1390
+ if (token->column_start == 1) // only allowed in first column
1391
+ {
1392
+ wiki_rollback_failed_link(parser); // if any
1393
+ wiki_pop_all_from_stack(parser);
1394
+ wiki_indent(parser);
1395
+ str_append(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1396
+ str_append_str(parser->output, parser->line_ending);
1397
+ ary_push(parser->scope, BLOCKQUOTE_START);
1398
+ ary_push(parser->line, BLOCKQUOTE_START);
1399
+ }
1400
+ else // BLOCKQUOTE_START illegal here
1401
+ {
1402
+ output = parser->capture ? parser->capture : parser->output;
1403
+ wiki_pop_excess_elements(parser);
1404
+ wiki_start_para_if_necessary(parser);
1405
+ str_append(output, escaped_blockquote_start, sizeof(escaped_blockquote_start) - 1);
1406
+ }
1407
+ }
1408
+ else
1409
+ {
1410
+ // would be nice to eliminate the repetition here but it's probably the clearest way
1411
+ wiki_rollback_failed_link(parser); // if any
1412
+ wiki_pop_from_stack_up_to(parser, NULL, P, true);
1413
+ wiki_indent(parser);
1414
+ str_append(parser->output, blockquote_start, sizeof(blockquote_start) - 1);
1415
+ str_append_str(parser->output, parser->line_ending);
1416
+ ary_push(parser->scope, BLOCKQUOTE_START);
1417
+ ary_push(parser->line, BLOCKQUOTE_START);
1418
+ }
1419
+ break;
1420
+
1421
+ case BLOCKQUOTE_END:
1422
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1423
+ {
1424
+ wiki_emit_pending_crlf_if_necessary(parser);
1425
+ str_append(parser->output, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
1426
+ }
1427
+ else
1428
+ {
1429
+ if (IN(BLOCKQUOTE_START))
1430
+ wiki_pop_from_stack_up_to(parser, parser->output, BLOCKQUOTE_START, true);
1431
+ else
1432
+ {
1433
+ output = parser->capture ? parser->capture : parser->output;
1434
+ wiki_pop_excess_elements(parser);
1435
+ wiki_start_para_if_necessary(parser);
1436
+ str_append(output, escaped_blockquote_end, sizeof(escaped_blockquote_end) - 1);
1437
+ }
1438
+ }
1439
+ break;
1440
+
1441
+ case NO_WIKI_START:
1442
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1443
+ {
1444
+ wiki_emit_pending_crlf_if_necessary(parser);
1445
+ str_append(parser->output, escaped_no_wiki_start, sizeof(escaped_no_wiki_start) - 1);
1446
+ }
1447
+ else
1448
+ {
1449
+ wiki_pop_excess_elements(parser);
1450
+ wiki_start_para_if_necessary(parser);
1451
+ ary_push(parser->scope, NO_WIKI_START);
1452
+ ary_push(parser->line, NO_WIKI_START);
1453
+ }
1454
+ break;
1455
+
1456
+ case NO_WIKI_END:
1457
+ if (IN(NO_WIKI_START))
1458
+ // <nowiki> should always only ever be the last item in the stack, but use the helper routine just in case
1459
+ wiki_pop_from_stack_up_to(parser, NULL, NO_WIKI_START, true);
1460
+ else
1461
+ {
1462
+ wiki_pop_excess_elements(parser);
1463
+ wiki_start_para_if_necessary(parser);
1464
+ str_append(parser->output, escaped_no_wiki_end, sizeof(escaped_no_wiki_end) - 1);
1465
+ }
1466
+ break;
1467
+
1468
+ case STRONG_EM:
1469
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1470
+ {
1471
+ wiki_emit_pending_crlf_if_necessary(parser);
1472
+ str_append(parser->output, literal_strong_em, sizeof(literal_strong_em) - 1);
1473
+ break;
1474
+ }
1475
+
1476
+ output = parser->capture ? parser->capture : parser->output;
1477
+ wiki_pop_excess_elements(parser);
1478
+
1479
+ // if you've seen STRONG/STRONG_START or EM/EM_START, must close them in the reverse order that you saw them!
1480
+ // otherwise, must open them
1481
+ remove_strong = -1;
1482
+ remove_em = -1;
1483
+ j = parser->scope->count;
1484
+ for (j = j - 1; j >= 0; j--)
1485
+ {
1486
+ int val = ary_entry(parser->scope, (int)j);
1487
+ if (val == STRONG || val == STRONG_START)
1488
+ {
1489
+ str_append(output, strong_end, sizeof(strong_end) - 1);
1490
+ remove_strong = j;
1491
+ }
1492
+ else if (val == EM || val == EM_START)
1493
+ {
1494
+ str_append(output, em_end, sizeof(em_end) - 1);
1495
+ remove_em = j;
1496
+ }
1497
+ }
1498
+
1499
+ if (remove_strong > remove_em) // must remove strong first
1500
+ {
1501
+ ary_pop(parser->scope);
1502
+ if (remove_em > -1)
1503
+ ary_pop(parser->scope);
1504
+ else // there was no em to remove!, so consider this an opening em tag
1505
+ {
1506
+ str_append(output, em_start, sizeof(em_start) - 1);
1507
+ ary_push(parser->scope, EM);
1508
+ ary_push(parser->line, EM);
1509
+ }
1510
+ }
1511
+ else if (remove_em > remove_strong) // must remove em first
1512
+ {
1513
+ ary_pop(parser->scope);
1514
+ if (remove_strong > -1)
1515
+ ary_pop(parser->scope);
1516
+ else // there was no strong to remove!, so consider this an opening strong tag
1517
+ {
1518
+ str_append(output, strong_start, sizeof(strong_start) - 1);
1519
+ ary_push(parser->scope, STRONG);
1520
+ ary_push(parser->line, STRONG);
1521
+ }
1522
+ }
1523
+ else // no strong or em to remove, so this must be a new opening of both
1524
+ {
1525
+ wiki_start_para_if_necessary(parser);
1526
+ str_append(output, strong_em_start, sizeof(strong_em_start) - 1);
1527
+ ary_push(parser->scope, STRONG);
1528
+ ary_push(parser->line, STRONG);
1529
+ ary_push(parser->scope, EM);
1530
+ ary_push(parser->line, EM);
1531
+ }
1532
+ break;
1533
+
1534
+ case STRONG:
1535
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1536
+ {
1537
+ wiki_emit_pending_crlf_if_necessary(parser);
1538
+ str_append(parser->output, literal_strong, sizeof(literal_strong) - 1);
1539
+ }
1540
+ else
1541
+ {
1542
+ output = parser->capture ? parser->capture : parser->output;
1543
+ if (IN(STRONG_START))
1544
+ // already in span started with <strong>, no choice but to emit this literally
1545
+ str_append(output, literal_strong, sizeof(literal_strong) - 1);
1546
+ else if (IN(STRONG))
1547
+ // STRONG already seen, this is a closing tag
1548
+ wiki_pop_from_stack_up_to(parser, output, STRONG, true);
1549
+ else
1550
+ {
1551
+ // this is a new opening
1552
+ wiki_pop_excess_elements(parser);
1553
+ wiki_start_para_if_necessary(parser);
1554
+ str_append(output, strong_start, sizeof(strong_start) - 1);
1555
+ ary_push(parser->scope, STRONG);
1556
+ ary_push(parser->line, STRONG);
1557
+ }
1558
+ }
1559
+ break;
1560
+
1561
+ case STRONG_START:
1562
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1563
+ {
1564
+ wiki_emit_pending_crlf_if_necessary(parser);
1565
+ str_append(parser->output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
1566
+ }
1567
+ else
1568
+ {
1569
+ output = parser->capture ? parser->capture : parser->output;
1570
+ if (IN_EITHER_OF(STRONG_START, STRONG))
1571
+ str_append(output, escaped_strong_start, sizeof(escaped_strong_start) - 1);
1572
+ else
1573
+ {
1574
+ wiki_pop_excess_elements(parser);
1575
+ wiki_start_para_if_necessary(parser);
1576
+ str_append(output, strong_start, sizeof(strong_start) - 1);
1577
+ ary_push(parser->scope, STRONG_START);
1578
+ ary_push(parser->line, STRONG_START);
1579
+ }
1580
+ }
1581
+ break;
1582
+
1583
+ case STRONG_END:
1584
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1585
+ {
1586
+ wiki_emit_pending_crlf_if_necessary(parser);
1587
+ str_append(parser->output, escaped_strong_end, sizeof(escaped_strong_end) - 1);
1588
+ }
1589
+ else
1590
+ {
1591
+ output = parser->capture ? parser->capture : parser->output;
1592
+ if (IN(STRONG_START))
1593
+ wiki_pop_from_stack_up_to(parser, output, STRONG_START, true);
1594
+ else
1595
+ {
1596
+ // no STRONG_START in scope, so must interpret the STRONG_END without any special meaning
1597
+ wiki_pop_excess_elements(parser);
1598
+ wiki_start_para_if_necessary(parser);
1599
+ str_append(output, escaped_strong_end, sizeof(escaped_strong_end) - 1);
1600
+ }
1601
+ }
1602
+ break;
1603
+
1604
+ case EM:
1605
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1606
+ {
1607
+ wiki_emit_pending_crlf_if_necessary(parser);
1608
+ str_append(parser->output, literal_em, sizeof(literal_em) - 1);
1609
+ }
1610
+ else
1611
+ {
1612
+ output = parser->capture ? parser->capture : parser->output;
1613
+ if (IN(EM_START))
1614
+ // already in span started with <em>, no choice but to emit this literally
1615
+ str_append(output, literal_em, sizeof(literal_em) - 1);
1616
+ else if (IN(EM))
1617
+ // EM already seen, this is a closing tag
1618
+ wiki_pop_from_stack_up_to(parser, output, EM, true);
1619
+ else
1620
+ {
1621
+ // this is a new opening
1622
+ wiki_pop_excess_elements(parser);
1623
+ wiki_start_para_if_necessary(parser);
1624
+ str_append(output, em_start, sizeof(em_start) - 1);
1625
+ ary_push(parser->scope, EM);
1626
+ ary_push(parser->line, EM);
1627
+ }
1628
+ }
1629
+ break;
1630
+
1631
+ case EM_START:
1632
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1633
+ {
1634
+ wiki_emit_pending_crlf_if_necessary(parser);
1635
+ str_append(parser->output, escaped_em_start, sizeof(escaped_em_start) - 1);
1636
+ }
1637
+ else
1638
+ {
1639
+ output = parser->capture ? parser->capture : parser->output;
1640
+ if (IN_EITHER_OF(EM_START, EM))
1641
+ str_append(output, escaped_em_start, sizeof(escaped_em_start) - 1);
1642
+ else
1643
+ {
1644
+ wiki_pop_excess_elements(parser);
1645
+ wiki_start_para_if_necessary(parser);
1646
+ str_append(output, em_start, sizeof(em_start) - 1);
1647
+ ary_push(parser->scope, EM_START);
1648
+ ary_push(parser->line, EM_START);
1649
+ }
1650
+ }
1651
+ break;
1652
+
1653
+ case EM_END:
1654
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1655
+ {
1656
+ wiki_emit_pending_crlf_if_necessary(parser);
1657
+ str_append(parser->output, escaped_em_end, sizeof(escaped_em_end) - 1);
1658
+ }
1659
+ else
1660
+ {
1661
+ output = parser->capture ? parser->capture : parser->output;
1662
+ if (IN(EM_START))
1663
+ wiki_pop_from_stack_up_to(parser, output, EM_START, true);
1664
+ else
1665
+ {
1666
+ // no EM_START in scope, so must interpret the EM_END without any special meaning
1667
+ wiki_pop_excess_elements(parser);
1668
+ wiki_start_para_if_necessary(parser);
1669
+ str_append(output, escaped_em_end, sizeof(escaped_em_end) - 1);
1670
+ }
1671
+ }
1672
+ break;
1673
+
1674
+ case TT:
1675
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1676
+ {
1677
+ wiki_emit_pending_crlf_if_necessary(parser);
1678
+ str_append(parser->output, backtick, sizeof(backtick) - 1);
1679
+ }
1680
+ else
1681
+ {
1682
+ output = parser->capture ? parser->capture : parser->output;
1683
+ if (IN(TT_START))
1684
+ // already in span started with <tt>, no choice but to emit this literally
1685
+ str_append(output, backtick, sizeof(backtick) - 1);
1686
+ else if (IN(TT))
1687
+ // TT (`) already seen, this is a closing tag
1688
+ wiki_pop_from_stack_up_to(parser, output, TT, true);
1689
+ else
1690
+ {
1691
+ // this is a new opening
1692
+ wiki_pop_excess_elements(parser);
1693
+ wiki_start_para_if_necessary(parser);
1694
+ str_append(output, code_start, sizeof(code_start) - 1);
1695
+ ary_push(parser->scope, TT);
1696
+ ary_push(parser->line, TT);
1697
+ }
1698
+ }
1699
+ break;
1700
+
1701
+ case TT_START:
1702
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1703
+ {
1704
+ wiki_emit_pending_crlf_if_necessary(parser);
1705
+ str_append(parser->output, escaped_tt_start, sizeof(escaped_tt_start) - 1);
1706
+ }
1707
+ else
1708
+ {
1709
+ output = parser->capture ? parser->capture : parser->output;
1710
+ if (IN_EITHER_OF(TT_START, TT))
1711
+ str_append(output, escaped_tt_start, sizeof(escaped_tt_start) - 1);
1712
+ else
1713
+ {
1714
+ wiki_pop_excess_elements(parser);
1715
+ wiki_start_para_if_necessary(parser);
1716
+ str_append(output, code_start, sizeof(code_start) - 1);
1717
+ ary_push(parser->scope, TT_START);
1718
+ ary_push(parser->line, TT_START);
1719
+ }
1720
+ }
1721
+ break;
1722
+
1723
+ case TT_END:
1724
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1725
+ {
1726
+ wiki_emit_pending_crlf_if_necessary(parser);
1727
+ str_append(parser->output, escaped_tt_end, sizeof(escaped_tt_end) - 1);
1728
+ }
1729
+ else
1730
+ {
1731
+ output = parser->capture ? parser->capture : parser->output;
1732
+ if (IN(TT_START))
1733
+ wiki_pop_from_stack_up_to(parser, output, TT_START, true);
1734
+ else
1735
+ {
1736
+ // no TT_START in scope, so must interpret the TT_END without any special meaning
1737
+ wiki_pop_excess_elements(parser);
1738
+ wiki_start_para_if_necessary(parser);
1739
+ str_append(output, escaped_tt_end, sizeof(escaped_tt_end) - 1);
1740
+ }
1741
+ }
1742
+ break;
1743
+
1744
+ case OL:
1745
+ case UL:
1746
+ if (IN_EITHER_OF(NO_WIKI_START, PRE_START))
1747
+ {
1748
+ // no need to check for PRE; can never appear inside it
1749
+ str_append(parser->output, token->start, TOKEN_LEN(token));
1750
+ break;
1751
+ }
1752
+
1753
+ // count number of tokens in line and scope stacks
1754
+ int bq_count = ary_count(parser->scope, BLOCKQUOTE_START);
1755
+ i = parser->line->count - ary_count(parser->line, BLOCKQUOTE_START);
1756
+ j = parser->scope->count - bq_count;
1757
+ k = i;
1758
+
1759
+ // list tokens can be nested so look ahead for any more which might affect the decision to push or pop
1760
+ for (;;)
1761
+ {
1762
+ type = token->type;
1763
+ if (type == OL || type == UL)
1764
+ {
1765
+ token = NULL;
1766
+ if (i - k >= 2) // already seen at least one OL or UL
1767
+ {
1768
+ ary_push(parser->line, NESTED_LIST); // which means this is a nested list
1769
+ i += 3;
1770
+ }
1771
+ else
1772
+ i += 2;
1773
+ ary_push(parser->line, type);
1774
+ ary_push(parser->line, LI);
1775
+
1776
+ // want to compare line with scope but can only do so if scope has enough items on it
1777
+ if (j >= i)
1778
+ {
1779
+ if (ary_entry(parser->scope, (int)(i + bq_count - 2)) == type &&
1780
+ ary_entry(parser->scope, (int)(i + bq_count - 1)) == LI)
1781
+ {
1782
+ // line and scope match at this point: do nothing yet
1783
+ }
1784
+ else
1785
+ {
1786
+ // item just pushed onto line does not match corresponding slot of scope!
1787
+ for (; j >= i - 2; j--)
1788
+ // must pop back before emitting
1789
+ wiki_pop_from_stack(parser, NULL);
1790
+
1791
+ // will emit UL or OL, then LI
1792
+ break;
1793
+ }
1794
+ }
1795
+ else // line stack size now exceeds scope stack size: must increase nesting level
1796
+ break; // will emit UL or OL, then LI
1797
+ }
1798
+ else
1799
+ {
1800
+ // not an OL or UL token!
1801
+ if (j == i)
1802
+ // must close existing LI and re-open new one
1803
+ wiki_pop_from_stack(parser, NULL);
1804
+ else if (j > i)
1805
+ {
1806
+ // item just pushed onto line does not match corresponding slot of scope!
1807
+ for (; j >= i; j--)
1808
+ // must pop back before emitting
1809
+ wiki_pop_from_stack(parser, NULL);
1810
+ }
1811
+ break;
1812
+ }
1813
+ NEXT_TOKEN();
1814
+ }
1815
+
1816
+ // will emit
1817
+ if (type == OL || type == UL)
1818
+ {
1819
+ // if LI is at the top of a stack this is the start of a nested list
1820
+ if (j > 0 && ary_entry(parser->scope, -1) == LI)
1821
+ {
1822
+ // so we should precede it with a CRLF, and indicate that it's a nested list
1823
+ str_append(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1824
+ ary_push(parser->scope, NESTED_LIST);
1825
+ }
1826
+ else
1827
+ {
1828
+ // this is a new list
1829
+ if (IN(BLOCKQUOTE_START))
1830
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE_START, false);
1831
+ else
1832
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, false);
1833
+ }
1834
+
1835
+ // emit
1836
+ wiki_indent(parser);
1837
+ if (type == OL)
1838
+ str_append(parser->output, ol_start, sizeof(ol_start) - 1);
1839
+ else if (type == UL)
1840
+ str_append(parser->output, ul_start, sizeof(ul_start) - 1);
1841
+ ary_push(parser->scope, type);
1842
+ str_append(parser->output, parser->line_ending->ptr, parser->line_ending->len);
1843
+ }
1844
+ else if (type == SPACE)
1845
+ // silently throw away the optional SPACE token after final list marker
1846
+ token = NULL;
1847
+
1848
+ wiki_indent(parser);
1849
+ str_append(parser->output, li_start, sizeof(li_start) - 1);
1850
+ ary_push(parser->scope, LI);
1851
+
1852
+ // any subsequent UL or OL tokens on this line are syntax errors and must be emitted literally
1853
+ if (type == OL || type == UL)
1854
+ {
1855
+ k = 0;
1856
+ while (k++, NEXT_TOKEN(), (type = token->type))
1857
+ {
1858
+ if (type == OL || type == UL)
1859
+ str_append(parser->output, token->start, TOKEN_LEN(token));
1860
+ else if (type == SPACE && k == 1)
1861
+ {
1862
+ // silently throw away the optional SPACE token after final list marker
1863
+ token = NULL;
1864
+ break;
1865
+ }
1866
+ else
1867
+ break;
1868
+ }
1869
+ }
1870
+
1871
+ // jump to top of the loop to process token we scanned during lookahead
1872
+ continue;
1873
+
1874
+ case H6_START:
1875
+ case H5_START:
1876
+ case H4_START:
1877
+ case H3_START:
1878
+ case H2_START:
1879
+ case H1_START:
1880
+ if (IN_EITHER_OF(NO_WIKI_START, PRE_START))
1881
+ {
1882
+ // no need to check for PRE; can never appear inside it
1883
+ str_append(parser->output, token->start, TOKEN_LEN(token));
1884
+ break;
1885
+ }
1886
+
1887
+ // pop up to but not including the last BLOCKQUOTE on the scope stack
1888
+ if (IN(BLOCKQUOTE_START))
1889
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE_START, false);
1890
+ else
1891
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, false);
1892
+
1893
+ // count number of BLOCKQUOTE tokens in line buffer and in scope stack
1894
+ ary_push(parser->line, type);
1895
+ i = ary_count(parser->line, BLOCKQUOTE);
1896
+ j = ary_count(parser->scope, BLOCKQUOTE);
1897
+
1898
+ // decide whether we need to pop off excess BLOCKQUOTE tokens (will never need to push; that is handled above in the BLOCKQUOTE case itself)
1899
+ if (i < j)
1900
+ {
1901
+ // must pop (reduce nesting level)
1902
+ for (i = j - i; i > 0; i--)
1903
+ wiki_pop_from_stack_up_to(parser, NULL, BLOCKQUOTE, true);
1904
+ }
1905
+
1906
+ // discard any whitespace here (so that "== foo ==" will be translated to "<h2>foo</h2>" rather than "<h2> foo </h2>")
1907
+ while (NEXT_TOKEN(), (token->type == SPACE))
1908
+ ; // discard
1909
+
1910
+ ary_push(parser->scope, type);
1911
+ wiki_indent(parser);
1912
+
1913
+ // take base_heading_level into account
1914
+ type += base_heading_level;
1915
+ if (type > H6_START) // no need to check for underflow (base_heading_level never negative)
1916
+ type = H6_START;
1917
+
1918
+ // rather than repeat all that code for each kind of heading, share it and use a conditional here
1919
+ if (type == H6_START)
1920
+ str_append(parser->output, h6_start, sizeof(h6_start) - 1);
1921
+ else if (type == H5_START)
1922
+ str_append(parser->output, h5_start, sizeof(h5_start) - 1);
1923
+ else if (type == H4_START)
1924
+ str_append(parser->output, h4_start, sizeof(h4_start) - 1);
1925
+ else if (type == H3_START)
1926
+ str_append(parser->output, h3_start, sizeof(h3_start) - 1);
1927
+ else if (type == H2_START)
1928
+ str_append(parser->output, h2_start, sizeof(h2_start) - 1);
1929
+ else if (type == H1_START)
1930
+ str_append(parser->output, h1_start, sizeof(h1_start) - 1);
1931
+
1932
+ // jump to top of the loop to process token we scanned during lookahead
1933
+ continue;
1934
+
1935
+ case H6_END:
1936
+ case H5_END:
1937
+ case H4_END:
1938
+ case H3_END:
1939
+ case H2_END:
1940
+ case H1_END:
1941
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1942
+ {
1943
+ wiki_emit_pending_crlf_if_necessary(parser);
1944
+ str_append(parser->output, token->start, TOKEN_LEN(token));
1945
+ }
1946
+ else
1947
+ {
1948
+ wiki_rollback_failed_external_link(parser); // if any
1949
+ if ((type == H6_END && !IN(H6_START)) ||
1950
+ (type == H5_END && !IN(H5_START)) ||
1951
+ (type == H4_END && !IN(H4_START)) ||
1952
+ (type == H3_END && !IN(H3_START)) ||
1953
+ (type == H2_END && !IN(H2_START)) ||
1954
+ (type == H1_END && !IN(H1_START)))
1955
+ {
1956
+ // literal output only if not in appropriate scope (we stay silent in that case)
1957
+ wiki_start_para_if_necessary(parser);
1958
+ str_append(parser->output, token->start, TOKEN_LEN(token));
1959
+ }
1960
+ }
1961
+ break;
1962
+
1963
+ case MAIL:
1964
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
1965
+ {
1966
+ wiki_emit_pending_crlf_if_necessary(parser);
1967
+ str_append(parser->output, token->start, TOKEN_LEN(token));
1968
+ }
1969
+ else if (IN(EXT_LINK_START))
1970
+ // must be capturing and this must be part of the link text
1971
+ str_append(parser->capture, token->start, TOKEN_LEN(token));
1972
+ else
1973
+ {
1974
+ wiki_pop_excess_elements(parser);
1975
+ wiki_start_para_if_necessary(parser);
1976
+ token_str->ptr = token->start;
1977
+ token_str->len = TOKEN_LEN(token);
1978
+ wiki_append_hyperlink(parser, rb_str_new2("mailto:"), token_str, NULL, mailto_class, Qnil, true);
1979
+ }
1980
+ break;
1981
+
1982
+ case URI:
1983
+ if (IN(NO_WIKI_START))
1984
+ {
1985
+ // user can temporarily suppress autolinking by using <nowiki></nowiki>
1986
+ // note that unlike MediaWiki, we do allow autolinking inside PRE blocks
1987
+ token_str->ptr = token->start;
1988
+ token_str->len = TOKEN_LEN(token);
1989
+ wiki_append_sanitized_link_target(token_str, parser->output, false);
1990
+ }
1991
+ else if (IN(LINK_START))
1992
+ {
1993
+ // if the URI were allowed it would have been handled already in LINK_START
1994
+ wiki_rollback_failed_internal_link(parser);
1995
+ token_str->ptr = token->start;
1996
+ token_str->len = TOKEN_LEN(token);
1997
+ wiki_append_hyperlink(parser, Qnil, token_str, NULL, parser->external_link_class, parser->external_link_rel, true);
1998
+ }
1999
+ else if (IN(EXT_LINK_START))
2000
+ {
2001
+ if (parser->link_target->len == 0)
2002
+ {
2003
+ // this must be our link target: look ahead to make sure we see the space we're expecting to see
2004
+ token_str->ptr = token->start;
2005
+ token_str->len = TOKEN_LEN(token);
2006
+ NEXT_TOKEN();
2007
+ if (token->type == SPACE)
2008
+ {
2009
+ ary_push(parser->scope, SPACE);
2010
+ str_append_str(parser->link_target, token_str);
2011
+ str_clear(parser->link_text);
2012
+ parser->capture = parser->link_text;
2013
+ token = NULL; // silently consume space
2014
+ }
2015
+ else
2016
+ {
2017
+ // didn't see the space! this must be an error
2018
+ wiki_pop_from_stack(parser, NULL);
2019
+ wiki_pop_excess_elements(parser);
2020
+ wiki_start_para_if_necessary(parser);
2021
+ str_append(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
2022
+ wiki_append_hyperlink(parser, Qnil, token_str, NULL, parser->external_link_class, parser->external_link_rel, true);
2023
+ continue;
2024
+ }
2025
+ }
2026
+ else
2027
+ {
2028
+ token_str->ptr = token->start;
2029
+ token_str->len = TOKEN_LEN(token);
2030
+ wiki_append_sanitized_link_target(token_str, parser->link_text, false);
2031
+ }
2032
+ }
2033
+ else
2034
+ {
2035
+ wiki_pop_excess_elements(parser);
2036
+ wiki_start_para_if_necessary(parser);
2037
+ token_str->ptr = token->start;
2038
+ token_str->len = TOKEN_LEN(token);
2039
+ wiki_append_hyperlink(parser, Qnil, token_str, NULL, parser->external_link_class, parser->external_link_rel, true);
2040
+ }
2041
+ break;
2042
+
2043
+ case PATH:
2044
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2045
+ {
2046
+ wiki_emit_pending_crlf_if_necessary(parser);
2047
+ str_append(parser->output, token->start, TOKEN_LEN(token));
2048
+ }
2049
+ else if (IN(EXT_LINK_START))
2050
+ {
2051
+ if (parser->link_target->len == 0)
2052
+ {
2053
+ // this must be our link target: look ahead to make sure we see the space we're expecting to see
2054
+ token_str->ptr = token->start;
2055
+ token_str->len = TOKEN_LEN(token);
2056
+ NEXT_TOKEN();
2057
+ if (token->type == SPACE)
2058
+ {
2059
+ ary_push(parser->scope, PATH);
2060
+ ary_push(parser->scope, SPACE);
2061
+ str_append_str(parser->link_target, token_str);
2062
+ str_clear(parser->link_text);
2063
+ parser->capture = parser->link_text;
2064
+ token = NULL; // silently consume space
2065
+ }
2066
+ else
2067
+ {
2068
+ // didn't see the space! this must be an error
2069
+ wiki_pop_from_stack(parser, NULL);
2070
+ wiki_pop_excess_elements(parser);
2071
+ wiki_start_para_if_necessary(parser);
2072
+ str_append(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
2073
+ str_append_str(parser->output, token_str);
2074
+ continue;
2075
+ }
2076
+ }
2077
+ else
2078
+ str_append(parser->link_text, token->start, TOKEN_LEN(token));
2079
+ }
2080
+ else
2081
+ {
2082
+ output = parser->capture ? parser->capture : parser->output;
2083
+ wiki_pop_excess_elements(parser);
2084
+ wiki_start_para_if_necessary(parser);
2085
+ str_append(output, token->start, TOKEN_LEN(token));
2086
+ }
2087
+ break;
2088
+
2089
+ // internal links (links to other wiki articles) look like this:
2090
+ // [[another article]] (would point at, for example, "/wiki/another_article")
2091
+ // [[the other article|the link text we'll use for it]]
2092
+ // [[the other article | the link text we'll use for it]]
2093
+ // MediaWiki has strict requirements about what it will accept as a link target:
2094
+ // all wikitext markup is disallowed:
2095
+ // example [[foo ''bar'' baz]]
2096
+ // renders [[foo <em>bar</em> baz]] (ie. not a link)
2097
+ // example [[foo <em>bar</em> baz]]
2098
+ // renders [[foo <em>bar</em> baz]] (ie. not a link)
2099
+ // example [[foo <nowiki>''</nowiki> baz]]
2100
+ // renders [[foo '' baz]] (ie. not a link)
2101
+ // example [[foo <bar> baz]]
2102
+ // renders [[foo &lt;bar&gt; baz]] (ie. not a link)
2103
+ // HTML entities and non-ASCII, however, make it through:
2104
+ // example [[foo &euro;]]
2105
+ // renders <a href="/wiki/Foo_%E2%82%AC">foo &euro;</a>
2106
+ // example [[foo €]]
2107
+ // renders <a href="/wiki/Foo_%E2%82%AC">foo €</a>
2108
+ // we'll impose similar restrictions here for the link target; allowed tokens will be:
2109
+ // SPACE, SPECIAL_URI_CHARS, PRINTABLE, PATH, ALNUM, DEFAULT, QUOT and AMP
2110
+ // everything else will be rejected
2111
+ case LINK_START:
2112
+ output = parser->capture ? parser->capture : parser->output;
2113
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2114
+ {
2115
+ wiki_emit_pending_crlf_if_necessary(parser);
2116
+ str_append(output, link_start, sizeof(link_start) - 1);
2117
+ }
2118
+ else if (IN(EXT_LINK_START))
2119
+ // already in external link scope! (and in fact, must be capturing link_text right now)
2120
+ str_append(output, link_start, sizeof(link_start) - 1);
2121
+ else if (IN(LINK_START))
2122
+ {
2123
+ // already in internal link scope! this is a syntax error
2124
+ wiki_rollback_failed_internal_link(parser);
2125
+ str_append(parser->output, link_start, sizeof(link_start) - 1);
2126
+ }
2127
+ else if (IN(SEPARATOR))
2128
+ {
2129
+ // scanning internal link text
2130
+ }
2131
+ else // not in internal link scope yet
2132
+ {
2133
+ // will either emit a link, or the rollback of a failed link, so start the para now
2134
+ wiki_pop_excess_elements(parser);
2135
+ wiki_start_para_if_necessary(parser);
2136
+ ary_push(parser->scope, LINK_START);
2137
+
2138
+ // look ahead and try to gobble up link target
2139
+ while (NEXT_TOKEN(), (type = token->type))
2140
+ {
2141
+ if (type == SPACE ||
2142
+ type == SPECIAL_URI_CHARS ||
2143
+ type == PATH ||
2144
+ type == PRINTABLE ||
2145
+ type == ALNUM ||
2146
+ type == DEFAULT ||
2147
+ type == QUOT ||
2148
+ type == QUOT_ENTITY ||
2149
+ type == AMP ||
2150
+ type == AMP_ENTITY ||
2151
+ type == IMG_START ||
2152
+ type == IMG_END ||
2153
+ type == LEFT_CURLY ||
2154
+ type == RIGHT_CURLY)
2155
+ {
2156
+ // accumulate these tokens into link_target
2157
+ if (parser->link_target->len == 0)
2158
+ {
2159
+ str_clear(parser->link_target);
2160
+ parser->capture = parser->link_target;
2161
+ }
2162
+ if (type == QUOT_ENTITY)
2163
+ // don't insert the entity, insert the literal quote
2164
+ str_append(parser->link_target, quote, sizeof(quote) - 1);
2165
+ else if (type == AMP_ENTITY)
2166
+ // don't insert the entity, insert the literal ampersand
2167
+ str_append(parser->link_target, ampersand, sizeof(ampersand) - 1);
2168
+ else
2169
+ str_append(parser->link_target, token->start, TOKEN_LEN(token));
2170
+ }
2171
+ else if (type == LINK_END)
2172
+ {
2173
+ if (parser->link_target->len == 0) // bail for inputs like "[[]]"
2174
+ wiki_rollback_failed_internal_link(parser);
2175
+ break; // jump back to top of loop (will handle this in LINK_END case below)
2176
+ }
2177
+ else if (type == SEPARATOR)
2178
+ {
2179
+ if (parser->link_target->len == 0) // bail for inputs like "[[|"
2180
+ wiki_rollback_failed_internal_link(parser);
2181
+ else
2182
+ {
2183
+ ary_push(parser->scope, SEPARATOR);
2184
+ str_clear(parser->link_text);
2185
+ parser->capture = parser->link_text;
2186
+ token = NULL;
2187
+ }
2188
+ break;
2189
+ }
2190
+ else // unexpected token (syntax error)
2191
+ {
2192
+ wiki_rollback_failed_internal_link(parser);
2193
+ break; // jump back to top of loop to handle unexpected token
2194
+ }
2195
+ }
2196
+
2197
+ // jump to top of the loop to process token we scanned during lookahead (if any)
2198
+ continue;
2199
+ }
2200
+ break;
2201
+
2202
+ case LINK_END:
2203
+ output = parser->capture ? parser->capture : parser->output;
2204
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2205
+ {
2206
+ wiki_emit_pending_crlf_if_necessary(parser);
2207
+ str_append(output, link_end, sizeof(link_end) - 1);
2208
+ }
2209
+ else if (IN(EXT_LINK_START))
2210
+ // already in external link scope! (and in fact, must be capturing link_text right now)
2211
+ str_append(output, link_end, sizeof(link_end) - 1);
2212
+ else if (IN(LINK_START)) // in internal link scope!
2213
+ {
2214
+ if (wiki_blank(parser->link_target))
2215
+ {
2216
+ // special case for inputs like "[[ ]]"
2217
+ wiki_rollback_failed_internal_link(parser);
2218
+ str_append(parser->output, link_end, sizeof(link_end) - 1);
2219
+ break;
2220
+ }
2221
+ if (parser->link_text->len == 0 ||
2222
+ wiki_blank(parser->link_text))
2223
+ {
2224
+ // use link target as link text
2225
+ str_clear(parser->link_text);
2226
+ wiki_append_sanitized_link_target(parser->link_target, parser->link_text, true);
2227
+ }
2228
+ else
2229
+ wiki_trim_link_text(parser);
2230
+
2231
+ // perform "redlink" check before manipulating link_target
2232
+ if (NIL_P(link_proc))
2233
+ j = Qnil;
2234
+ else
2235
+ {
2236
+ j = rb_funcall(link_proc, rb_intern("call"), 1, string_from_str(parser->link_target));
2237
+ if (!NIL_P(j))
2238
+ {
2239
+ VALUE l = j; // can't cast inside StringValue macro
2240
+ j = StringValue(l);
2241
+ }
2242
+ }
2243
+ wiki_encode_link_target(parser);
2244
+ wiki_pop_from_stack_up_to(parser, output, LINK_START, true);
2245
+ parser->capture = NULL;
2246
+ wiki_append_hyperlink(parser, prefix, parser->link_target, parser->link_text, j, Qnil, false);
2247
+ str_clear(parser->link_target);
2248
+ str_clear(parser->link_text);
2249
+ }
2250
+ else // wasn't in internal link scope
2251
+ {
2252
+ wiki_pop_excess_elements(parser);
2253
+ wiki_start_para_if_necessary(parser);
2254
+ str_append(output, link_end, sizeof(link_end) - 1);
2255
+ }
2256
+ break;
2257
+
2258
+ // external links look like this:
2259
+ // [http://google.com/ the link text]
2260
+ // [/other/page/on/site see this page]
2261
+ // strings in square brackets which don't match this syntax get passed through literally; eg:
2262
+ // he was very angery [sic] about the turn of events
2263
+ case EXT_LINK_START:
2264
+ output = parser->capture ? parser->capture : parser->output;
2265
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2266
+ {
2267
+ wiki_emit_pending_crlf_if_necessary(parser);
2268
+ str_append(output, ext_link_start, sizeof(ext_link_start) - 1);
2269
+ }
2270
+ else if (IN(EXT_LINK_START))
2271
+ // already in external link scope! (and in fact, must be capturing link_text right now)
2272
+ str_append(output, ext_link_start, sizeof(ext_link_start) - 1);
2273
+ else if (IN(LINK_START))
2274
+ {
2275
+ // already in internal link scope!
2276
+ if (parser->link_target->len == 0 || !IN(SPACE))
2277
+ str_append(parser->link_target, ext_link_start, sizeof(ext_link_start) - 1);
2278
+ else // link target has already been scanned
2279
+ str_append(parser->link_text, ext_link_start, sizeof(ext_link_start) - 1);
2280
+ }
2281
+ else // not in external link scope yet
2282
+ {
2283
+ // will either emit a link, or the rollback of a failed link, so start the para now
2284
+ wiki_pop_excess_elements(parser);
2285
+ wiki_start_para_if_necessary(parser);
2286
+
2287
+ // look ahead: expect an absolute URI (with protocol) or "relative" (path) URI
2288
+ NEXT_TOKEN();
2289
+ if (token->type == URI || token->type == PATH)
2290
+ ary_push(parser->scope, EXT_LINK_START); // so far so good, jump back to the top of the loop
2291
+ else
2292
+ // only get here if there was a syntax error (missing URI)
2293
+ str_append(parser->output, ext_link_start, sizeof(ext_link_start) - 1);
2294
+ continue; // jump back to top of loop to handle token (either URI or whatever it is)
2295
+ }
2296
+ break;
2297
+
2298
+ case EXT_LINK_END:
2299
+ output = parser->capture ? parser->capture : parser->output;
2300
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2301
+ {
2302
+ wiki_emit_pending_crlf_if_necessary(parser);
2303
+ str_append(output, ext_link_end, sizeof(ext_link_end) - 1);
2304
+ }
2305
+ else if (IN(EXT_LINK_START))
2306
+ {
2307
+ if (parser->link_text->len == 0)
2308
+ // syntax error: external link with no link text
2309
+ wiki_rollback_failed_external_link(parser);
2310
+ else
2311
+ {
2312
+ // success!
2313
+ j = IN(PATH) ? Qnil : parser->external_link_class;
2314
+ k = IN(PATH) ? Qnil : parser->external_link_rel;
2315
+ wiki_pop_from_stack_up_to(parser, output, EXT_LINK_START, true);
2316
+ parser->capture = NULL;
2317
+ wiki_append_hyperlink(parser, Qnil, parser->link_target, parser->link_text, j, k, false);
2318
+ }
2319
+ str_clear(parser->link_target);
2320
+ str_clear(parser->link_text);
2321
+ }
2322
+ else
2323
+ {
2324
+ wiki_pop_excess_elements(parser);
2325
+ wiki_start_para_if_necessary(parser);
2326
+ str_append(parser->output, ext_link_end, sizeof(ext_link_end) - 1);
2327
+ }
2328
+ break;
2329
+
2330
+ case SEPARATOR:
2331
+ output = parser->capture ? parser->capture : parser->output;
2332
+ wiki_pop_excess_elements(parser);
2333
+ wiki_start_para_if_necessary(parser);
2334
+ str_append(output, separator, sizeof(separator) - 1);
2335
+ break;
2336
+
2337
+ case SPACE:
2338
+ output = parser->capture ? parser->capture : parser->output;
2339
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2340
+ {
2341
+ wiki_emit_pending_crlf_if_necessary(parser);
2342
+ str_append(output, token->start, TOKEN_LEN(token));
2343
+ }
2344
+ else
2345
+ {
2346
+ // peek ahead to see next token
2347
+ char *token_ptr = token->start;
2348
+ long token_len = TOKEN_LEN(token);
2349
+ NEXT_TOKEN();
2350
+ type = token->type;
2351
+ if ((type == H6_END && IN(H6_START)) ||
2352
+ (type == H5_END && IN(H5_START)) ||
2353
+ (type == H4_END && IN(H4_START)) ||
2354
+ (type == H3_END && IN(H3_START)) ||
2355
+ (type == H2_END && IN(H2_START)) ||
2356
+ (type == H1_END && IN(H1_START)))
2357
+ {
2358
+ // will suppress emission of space (discard) if next token is a H6_END, H5_END etc and we are in the corresponding scope
2359
+ }
2360
+ else
2361
+ {
2362
+ // emit the space
2363
+ wiki_pop_excess_elements(parser);
2364
+ wiki_start_para_if_necessary(parser);
2365
+ str_append(output, token_ptr, token_len);
2366
+ }
2367
+
2368
+ // jump to top of the loop to process token we scanned during lookahead
2369
+ continue;
2370
+ }
2371
+ break;
2372
+
2373
+ case QUOT_ENTITY:
2374
+ case AMP_ENTITY:
2375
+ case NAMED_ENTITY:
2376
+ case DECIMAL_ENTITY:
2377
+ // pass these through unaltered as they are case sensitive
2378
+ output = parser->capture ? parser->capture : parser->output;
2379
+ wiki_pop_excess_elements(parser);
2380
+ wiki_start_para_if_necessary(parser);
2381
+ str_append(output, token->start, TOKEN_LEN(token));
2382
+ break;
2383
+
2384
+ case HEX_ENTITY:
2385
+ // normalize hex entities (downcase them)
2386
+ output = parser->capture ? parser->capture : parser->output;
2387
+ wiki_pop_excess_elements(parser);
2388
+ wiki_start_para_if_necessary(parser);
2389
+ str_append(output, token->start, TOKEN_LEN(token));
2390
+ wiki_downcase_bang(output->ptr + output->len - TOKEN_LEN(token), TOKEN_LEN(token));
2391
+ break;
2392
+
2393
+ case QUOT:
2394
+ output = parser->capture ? parser->capture : parser->output;
2395
+ wiki_pop_excess_elements(parser);
2396
+ wiki_start_para_if_necessary(parser);
2397
+ str_append(output, quot_entity, sizeof(quot_entity) - 1);
2398
+ break;
2399
+
2400
+ case AMP:
2401
+ output = parser->capture ? parser->capture : parser->output;
2402
+ wiki_pop_excess_elements(parser);
2403
+ wiki_start_para_if_necessary(parser);
2404
+ str_append(output, amp_entity, sizeof(amp_entity) - 1);
2405
+ break;
2406
+
2407
+ case LESS:
2408
+ output = parser->capture ? parser->capture : parser->output;
2409
+ wiki_pop_excess_elements(parser);
2410
+ wiki_start_para_if_necessary(parser);
2411
+ str_append(output, lt_entity, sizeof(lt_entity) - 1);
2412
+ break;
2413
+
2414
+ case GREATER:
2415
+ output = parser->capture ? parser->capture : parser->output;
2416
+ wiki_pop_excess_elements(parser);
2417
+ wiki_start_para_if_necessary(parser);
2418
+ str_append(output, gt_entity, sizeof(gt_entity) - 1);
2419
+ break;
2420
+
2421
+ case IMG_START:
2422
+ if (IN_ANY_OF(NO_WIKI_START, PRE, PRE_START))
2423
+ {
2424
+ wiki_emit_pending_crlf_if_necessary(parser);
2425
+ str_append(parser->output, token->start, TOKEN_LEN(token));
2426
+ }
2427
+ else if (parser->capture)
2428
+ str_append(parser->capture, token->start, TOKEN_LEN(token));
2429
+ else
2430
+ {
2431
+ // not currently capturing: will be emitting something on success or failure, so get ready
2432
+ wiki_pop_excess_elements(parser);
2433
+ wiki_start_para_if_necessary(parser);
2434
+
2435
+ // scan ahead consuming PATH, PRINTABLE, ALNUM and SPECIAL_URI_CHARS tokens
2436
+ // will cheat here and abuse the link_target capture buffer to accumulate text
2437
+ while (NEXT_TOKEN(), (type = token->type))
2438
+ {
2439
+ if (type == PATH || type == PRINTABLE || type == ALNUM || type == SPECIAL_URI_CHARS)
2440
+ str_append(parser->link_target, token->start, TOKEN_LEN(token));
2441
+ else if (type == IMG_END && parser->link_target->len > 0)
2442
+ {
2443
+ // success
2444
+ wiki_append_img(parser, parser->link_target->ptr, parser->link_target->len);
2445
+ token = NULL;
2446
+ break;
2447
+ }
2448
+ else // unexpected token or zero-length target (syntax error)
2449
+ {
2450
+ // rollback
2451
+ str_append(parser->output, literal_img_start, sizeof(literal_img_start) - 1);
2452
+ if (parser->link_target->len > 0)
2453
+ str_append(parser->output, parser->link_target->ptr, parser->link_target->len);
2454
+ break;
2455
+ }
2456
+ }
2457
+
2458
+ // jump to top of the loop to process token we scanned during lookahead
2459
+ str_clear(parser->link_target);
2460
+ continue;
2461
+ }
2462
+ break;
2463
+
2464
+ case CRLF:
2465
+ i = parser->pending_crlf;
2466
+ parser->pending_crlf = false;
2467
+ wiki_rollback_failed_link(parser); // if any
2468
+ if (IN_EITHER_OF(NO_WIKI_START, PRE_START))
2469
+ {
2470
+ ary_clear(parser->line_buffer);
2471
+ str_append_str(parser->output, parser->line_ending);
2472
+ break;
2473
+ }
2474
+ else if (IN(PRE))
2475
+ {
2476
+ // beware when BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, that must be end of PRE block
2477
+ if (ary_entry(parser->line_buffer, -2) == BLOCKQUOTE)
2478
+ // don't emit in this case
2479
+ wiki_pop_from_stack_up_to(parser, parser->output, PRE, true);
2480
+ else
2481
+ {
2482
+ if (ary_entry(parser->line_buffer, -2) == PRE)
2483
+ {
2484
+ // only thing on line is the PRE: emit pending line ending (if we had one)
2485
+ if (i)
2486
+ str_append_str(parser->output, parser->line_ending);
2487
+ }
2488
+
2489
+ // clear these _before_ calling NEXT_TOKEN (NEXT_TOKEN adds to the line_buffer)
2490
+ ary_clear(parser->line);
2491
+ ary_clear(parser->line_buffer);
2492
+
2493
+ // peek ahead to see if this is definitely the end of the PRE block
2494
+ NEXT_TOKEN();
2495
+ type = token->type;
2496
+ if (type != BLOCKQUOTE && type != PRE)
2497
+ // this is definitely the end of the block, so don't emit
2498
+ wiki_pop_from_stack_up_to(parser, parser->output, PRE, true);
2499
+ else
2500
+ // potentially will emit
2501
+ parser->pending_crlf = true;
2502
+
2503
+ continue; // jump back to top of loop to handle token grabbed via lookahead
2504
+ }
2505
+ }
2506
+ else
2507
+ {
2508
+ parser->pending_crlf = true;
2509
+
2510
+ // count number of BLOCKQUOTE tokens on the line scope stack plus BLOCKQUOTE_START entries in the scope (sum can be zero) and pop back to that level
2511
+ // as a side effect, this handles any open span-level elements and unclosed blocks
2512
+ // (with special handling for P blocks and LI elements)
2513
+ i = ary_count(parser->line, BLOCKQUOTE) + ary_count(parser->scope, BLOCKQUOTE_START);
2514
+ for (j = parser->scope->count; j > i; j--)
2515
+ {
2516
+ if (parser->scope->count > 0 && ary_entry(parser->scope, -1) == LI)
2517
+ {
2518
+ parser->pending_crlf = false;
2519
+ break;
2520
+ }
2521
+
2522
+ // special handling on last iteration through the loop if the top item on the scope is a P block
2523
+ if ((j - i == 1) && ary_entry(parser->scope, -1) == P)
2524
+ {
2525
+ // if nothing or BLOCKQUOTE on line buffer (not line stack!) prior to CRLF, this must be a paragraph break
2526
+ // (note that we have to make sure we're not inside a BLOCKQUOTE_START block
2527
+ // because in those blocks BLOCKQUOTE tokens have no special meaning)
2528
+ if (NO_ITEM(ary_entry(parser->line_buffer, -2)) ||
2529
+ (ary_entry(parser->line_buffer, -2) == BLOCKQUOTE && !IN(BLOCKQUOTE_START)))
2530
+ // paragraph break
2531
+ parser->pending_crlf = false;
2532
+ else
2533
+ // not a paragraph break!
2534
+ continue;
2535
+ }
2536
+ wiki_pop_from_stack(parser, NULL);
2537
+ }
2538
+ }
2539
+
2540
+ // delete the entire contents of the line scope stack and buffer
2541
+ ary_clear(parser->line);
2542
+ ary_clear(parser->line_buffer);
2543
+ break;
2544
+
2545
+ case SPECIAL_URI_CHARS:
2546
+ case PRINTABLE:
2547
+ case ALNUM:
2548
+ case IMG_END:
2549
+ case LEFT_CURLY:
2550
+ case RIGHT_CURLY:
2551
+ output = parser->capture ? parser->capture : parser->output;
2552
+ wiki_pop_excess_elements(parser);
2553
+ wiki_start_para_if_necessary(parser);
2554
+ str_append(output, token->start, TOKEN_LEN(token));
2555
+ break;
2556
+
2557
+ case DEFAULT:
2558
+ output = parser->capture ? parser->capture : parser->output;
2559
+ wiki_pop_excess_elements(parser);
2560
+ wiki_start_para_if_necessary(parser);
2561
+ wiki_append_entity_from_utf32_char(output, token->code_point);
2562
+ break;
2563
+
2564
+ case END_OF_FILE:
2565
+ // special case for input like " foo\n " (see pre_spec.rb)
2566
+ if (IN(PRE) &&
2567
+ ary_entry(parser->line_buffer, -2) == PRE &&
2568
+ parser->pending_crlf)
2569
+ str_append(parser->output, parser->line_ending->ptr, parser->line_ending->len);
2570
+
2571
+ // close any open scopes on hitting EOF
2572
+ wiki_rollback_failed_link(parser); // if any
2573
+ wiki_pop_all_from_stack(parser);
2574
+ goto return_output; // break not enough here (want to break out of outer while loop, not inner switch statement)
2575
+
2576
+ default:
2577
+ break;
2578
+ }
2579
+
2580
+ // reset current token; forcing lexer to return another token at the top of the loop
2581
+ token = NULL;
2582
+ } while (1);
2583
+ return_output:
2584
+ // nasty hack to avoid re-allocating our return value
2585
+ str_append(parser->output, null_str, 1); // null-terminate
2586
+ len = parser->output->len - 1; // don't count null termination
2587
+
2588
+ VALUE out = rb_str_buf_new(RSTRING_EMBED_LEN_MAX + 1);
2589
+ free(RSTRING_PTR(out));
2590
+ RSTRING(out)->as.heap.aux.capa = len;
2591
+ RSTRING(out)->as.heap.ptr = parser->output->ptr;
2592
+ RSTRING(out)->as.heap.len = len;
2593
+ parser->output->ptr = NULL; // don't double-free
2594
+ return out;
2595
+ }