wikitext 4.0.1 → 4.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wikitext +10 -110
- data/ext/wikitext/ary.c +116 -0
- data/ext/wikitext/ary.h +50 -0
- data/ext/wikitext/depend +32 -0
- data/ext/wikitext/parser.c +2595 -0
- data/ext/wikitext/parser.h +40 -0
- data/ext/wikitext/ruby_compat.h +34 -0
- data/ext/wikitext/str.c +109 -0
- data/ext/wikitext/str.h +64 -0
- data/ext/wikitext/token.c +125 -0
- data/ext/wikitext/token.h +117 -0
- data/ext/wikitext/wikitext.c +125 -0
- data/ext/wikitext/wikitext.h +39 -0
- data/ext/wikitext/wikitext_ragel.c +3211 -0
- data/ext/wikitext/wikitext_ragel.h +26 -0
- data/lib/wikitext/version.rb +1 -1
- metadata +17 -3
@@ -0,0 +1,40 @@
|
|
1
|
+
// Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are met:
|
5
|
+
//
|
6
|
+
// 1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
// this list of conditions and the following disclaimer.
|
8
|
+
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
9
|
+
// this list of conditions and the following disclaimer in the documentation
|
10
|
+
// and/or other materials provided with the distribution.
|
11
|
+
//
|
12
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
13
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
14
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
15
|
+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
16
|
+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
17
|
+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
18
|
+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
19
|
+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
20
|
+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
21
|
+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
22
|
+
// POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
#include "ruby_compat.h"
|
25
|
+
|
26
|
+
// Entry points of the Wikitext parser, declared here for use by other
// translation units. Every function takes the receiver as `self` and returns
// a Ruby VALUE. The (int argc, VALUE *argv, VALUE self) form is the signature
// Ruby uses for C methods that accept a variable number of arguments; the
// remaining functions take exactly one argument.
// NOTE(review): the definitions live outside this header (presumably in
// parser.c) -- consult them for the exact argument and return contracts.

// variable-arity: constructor
VALUE Wikitext_parser_initialize(int argc, VALUE *argv, VALUE self);

// single-argument tokenizers
VALUE Wikitext_parser_tokenize(VALUE self, VALUE string);
VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string);

// variable-arity tokenizer
VALUE Wikitext_parser_fulltext_tokenize(int argc, VALUE *argv, VALUE self);

// link-target helpers
VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string);
VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in);

// parsing entry points
VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self);
VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string);
|
@@ -0,0 +1,34 @@
|
|
1
|
+
// Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are met:
|
5
|
+
//
|
6
|
+
// 1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
// this list of conditions and the following disclaimer.
|
8
|
+
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
9
|
+
// this list of conditions and the following disclaimer in the documentation
|
10
|
+
// and/or other materials provided with the distribution.
|
11
|
+
//
|
12
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
13
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
14
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
15
|
+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
16
|
+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
17
|
+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
18
|
+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
19
|
+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
20
|
+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
21
|
+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
22
|
+
// POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
#include <ruby.h>
|
25
|
+
|
26
|
+
// Ruby 1.8.5 does not declare the RSTRING_PTR and RSTRING_LEN accessor
// macros, so supply equivalents built on RSTRING() whenever <ruby.h> has not
// already provided them.
#if !defined(RSTRING_PTR)
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
#endif

#if !defined(RSTRING_LEN)
#define RSTRING_LEN(s) (RSTRING(s)->len)
#endif
|
data/ext/wikitext/str.c
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
// Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are met:
|
5
|
+
//
|
6
|
+
// 1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
// this list of conditions and the following disclaimer.
|
8
|
+
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
9
|
+
// this list of conditions and the following disclaimer in the documentation
|
10
|
+
// and/or other materials provided with the distribution.
|
11
|
+
//
|
12
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
13
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
14
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
15
|
+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
16
|
+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
17
|
+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
18
|
+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
19
|
+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
20
|
+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
21
|
+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
22
|
+
// POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
#include "str.h"
|
25
|
+
|
26
|
+
// Number of spare bytes added to every buffer allocation on top of the
// requested size; the slack lets later appends fit without another
// (re)allocation.
#define STR_OVERALLOC 256
|
29
|
+
|
30
|
+
str_t *str_new(void)
|
31
|
+
{
|
32
|
+
str_t *str = ALLOC_N(str_t, 1);
|
33
|
+
str->ptr = NULL;
|
34
|
+
str->len = 0;
|
35
|
+
str->capacity = 0;
|
36
|
+
return str;
|
37
|
+
}
|
38
|
+
|
39
|
+
str_t *str_new_copy(const char *src, long len)
|
40
|
+
{
|
41
|
+
str_t *str = ALLOC_N(str_t, 1);
|
42
|
+
str->ptr = ALLOC_N(char, len + STR_OVERALLOC);
|
43
|
+
memcpy(str->ptr, src, len);
|
44
|
+
str->len = len;
|
45
|
+
str->capacity = len + STR_OVERALLOC;
|
46
|
+
return str;
|
47
|
+
}
|
48
|
+
|
49
|
+
// Builds a str_t from a Ruby object: the argument is first coerced with
// StringValue(), then its bytes are copied via str_new_copy().
str_t *str_new_from_string(VALUE string)
{
    VALUE coerced = StringValue(string);
    const char *bytes = RSTRING_PTR(coerced);
    long count = RSTRING_LEN(coerced);
    return str_new_copy(bytes, count);
}
|
54
|
+
|
55
|
+
// Creates a Ruby String containing a copy of the bytes held by `str`, then
// sends it `force_encoding("UTF-8")` before returning it.
VALUE string_from_str(str_t *str)
{
    VALUE result        = rb_str_new(str->ptr, str->len);
    VALUE encoding_name = rb_str_new2("UTF-8");
    rb_funcall(result, rb_intern("force_encoding"), 1, encoding_name);
    return result;
}
|
61
|
+
|
62
|
+
// Ensures that `str` can hold at least `len` bytes. When the current capacity
// is already sufficient nothing happens; otherwise the buffer is allocated
// (first use) or reallocated to len + STR_OVERALLOC bytes. The stored length
// is left untouched.
void str_grow(str_t *str, long len)
{
    if (str->capacity >= len)
        return; // already big enough

    long wanted = len + STR_OVERALLOC;
    if (str->ptr == NULL)
        str->ptr = ALLOC_N(char, wanted);
    else
        REALLOC_N(str->ptr, char, wanted);
    str->capacity = wanted;
}
|
73
|
+
|
74
|
+
// Appends the `len` bytes starting at `src` to the end of `str`, enlarging the
// buffer through str_grow() when the combined length exceeds the current
// capacity.
//
// A non-positive `len` is a no-op. This matters for zero-length appends: both
// the destination (a fresh str_new() has a NULL buffer) and the source (an
// empty str_t passed via str_append_str()) may be NULL, and handing NULL to
// memcpy is undefined behaviour even when the byte count is zero.
void str_append(str_t *str, const char *src, long len)
{
    if (len <= 0)
        return;

    long new_len = str->len + len;
    str_grow(str, new_len); // allocates/reallocates only when capacity < new_len
    memcpy(str->ptr + str->len, src, len);
    str->len = new_len;
}
|
88
|
+
|
89
|
+
// Appends the full contents of `other` (other->len bytes from other->ptr) to
// `str`; `other` itself is not modified.
void str_append_str(str_t *str, str_t *other)
{
    const char *bytes = other->ptr;
    long count = other->len;
    str_append(str, bytes, count);
}
|
93
|
+
|
94
|
+
// Appends the bytes of the Ruby String `other` to `str`. No type coercion is
// performed here, so `other` must already be a String.
void str_append_string(str_t *str, VALUE other)
{
    const char *bytes = RSTRING_PTR(other);
    long count = RSTRING_LEN(other);
    str_append(str, bytes, count);
}
|
98
|
+
|
99
|
+
// Resets the logical length of `str` to zero. The buffer and its capacity are
// kept, so the struct can be refilled without reallocating.
void str_clear(str_t *str)
{
    str->len = 0L;
}
|
103
|
+
|
104
|
+
// Releases a str_t created by str_new()/str_new_copy(): first its buffer, then
// the struct itself. Passing NULL is a no-op.
//
// Both allocations come from ALLOC_N/REALLOC_N (Ruby's xmalloc family), so
// they are returned through xfree() rather than libc free(); the two are not
// guaranteed to be interchangeable. xfree() accepts NULL, so the buffer needs
// no separate check.
void str_free(str_t *str)
{
    if (str == NULL)
        return;
    xfree(str->ptr);
    xfree(str);
}
|
data/ext/wikitext/str.h
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
// Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are met:
|
5
|
+
//
|
6
|
+
// 1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
// this list of conditions and the following disclaimer.
|
8
|
+
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
9
|
+
// this list of conditions and the following disclaimer in the documentation
|
10
|
+
// and/or other materials provided with the distribution.
|
11
|
+
//
|
12
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
13
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
14
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
15
|
+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
16
|
+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
17
|
+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
18
|
+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
19
|
+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
20
|
+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
21
|
+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
22
|
+
// POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
#include "ruby_compat.h"
|
25
|
+
|
26
|
+
// Growable byte buffer with an explicit length (not NUL-terminated).
typedef struct
{
    char    *ptr;       // start of the buffer; NULL until something is allocated
    long    len;        // number of bytes currently in use
    long    capacity;   // number of bytes allocated at ptr
} str_t;
|
32
|
+
|
33
|
+
// Hands ownership of the str_t designated by ptr to Ruby's garbage collector:
// the struct is wrapped in a plain Object (no mark function) with str_free
// registered as its free function. The wrapper is stored in a volatile local
// variable called `name`, keeping a reference on the C stack so the struct is
// not collected prematurely.
#define GC_WRAP_STR(ptr, name) \
    volatile VALUE name __attribute__((unused)) = \
        Data_Wrap_Struct(rb_cObject, 0, str_free, ptr)
|
36
|
+
|
37
|
+
// create a new, empty string struct
|
38
|
+
str_t *str_new(void);
|
39
|
+
|
40
|
+
// create a new string struct and initialize it with a copy of the buffer of length len pointed to by src
|
41
|
+
str_t *str_new_copy(const char *src, long len);
|
42
|
+
|
43
|
+
// convenience method for testing
|
44
|
+
str_t *str_new_from_string(VALUE string);
|
45
|
+
|
46
|
+
// convenience method for testing
|
47
|
+
VALUE string_from_str(str_t *str);
|
48
|
+
|
49
|
+
// grows a string's capacity to the specified length
|
50
|
+
void str_grow(str_t *str, long len);
|
51
|
+
|
52
|
+
void str_append(str_t *str, const char *src, long len);
|
53
|
+
|
54
|
+
// appends the "other" string struct onto str
|
55
|
+
void str_append_str(str_t *str, str_t *other);
|
56
|
+
|
57
|
+
// appends the "other" string (a Ruby String) onto str
|
58
|
+
void str_append_string(str_t *str, VALUE other);
|
59
|
+
|
60
|
+
// don't actually free the memory yet
|
61
|
+
// this makes str structs very useful when reusing buffers because it avoids reallocation
|
62
|
+
void str_clear(str_t *str);
|
63
|
+
|
64
|
+
void str_free(str_t *str);
|
@@ -0,0 +1,125 @@
|
|
1
|
+
// Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are met:
|
5
|
+
//
|
6
|
+
// 1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
// this list of conditions and the following disclaimer.
|
8
|
+
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
9
|
+
// this list of conditions and the following disclaimer in the documentation
|
10
|
+
// and/or other materials provided with the distribution.
|
11
|
+
//
|
12
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
13
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
14
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
15
|
+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
16
|
+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
17
|
+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
18
|
+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
19
|
+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
20
|
+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
21
|
+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
22
|
+
// POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
#include "token.h"
|
25
|
+
#include "wikitext.h"
|
26
|
+
|
27
|
+
// return a hash of token types
|
28
|
+
// we make this available for unit testing purposes
|
29
|
+
|
30
|
+
VALUE Wikitext_parser_token_types(VALUE self)
|
31
|
+
{
|
32
|
+
VALUE hash = rb_hash_new();
|
33
|
+
|
34
|
+
#define SET_TOKEN_TYPE(identifier) (void)rb_hash_aset(hash, INT2FIX(identifier), \
|
35
|
+
rb_funcall(rb_funcall(rb_str_new2(#identifier), rb_intern("downcase"), 0), rb_intern("to_sym"), 0))
|
36
|
+
|
37
|
+
SET_TOKEN_TYPE(NO_TOKEN);
|
38
|
+
SET_TOKEN_TYPE(P);
|
39
|
+
SET_TOKEN_TYPE(LI);
|
40
|
+
SET_TOKEN_TYPE(NESTED_LIST);
|
41
|
+
SET_TOKEN_TYPE(PRE);
|
42
|
+
SET_TOKEN_TYPE(PRE_START);
|
43
|
+
SET_TOKEN_TYPE(PRE_END);
|
44
|
+
SET_TOKEN_TYPE(NO_WIKI_START);
|
45
|
+
SET_TOKEN_TYPE(NO_WIKI_END);
|
46
|
+
SET_TOKEN_TYPE(BLOCKQUOTE);
|
47
|
+
SET_TOKEN_TYPE(BLOCKQUOTE_START);
|
48
|
+
SET_TOKEN_TYPE(BLOCKQUOTE_END);
|
49
|
+
SET_TOKEN_TYPE(STRONG_EM);
|
50
|
+
SET_TOKEN_TYPE(STRONG_START);
|
51
|
+
SET_TOKEN_TYPE(STRONG_END);
|
52
|
+
SET_TOKEN_TYPE(STRONG);
|
53
|
+
SET_TOKEN_TYPE(EM_START);
|
54
|
+
SET_TOKEN_TYPE(EM_END);
|
55
|
+
SET_TOKEN_TYPE(EM);
|
56
|
+
SET_TOKEN_TYPE(TT_START);
|
57
|
+
SET_TOKEN_TYPE(TT_END);
|
58
|
+
SET_TOKEN_TYPE(TT);
|
59
|
+
SET_TOKEN_TYPE(OL);
|
60
|
+
SET_TOKEN_TYPE(UL);
|
61
|
+
SET_TOKEN_TYPE(H1_START);
|
62
|
+
SET_TOKEN_TYPE(H2_START);
|
63
|
+
SET_TOKEN_TYPE(H3_START);
|
64
|
+
SET_TOKEN_TYPE(H4_START);
|
65
|
+
SET_TOKEN_TYPE(H5_START);
|
66
|
+
SET_TOKEN_TYPE(H6_START);
|
67
|
+
SET_TOKEN_TYPE(H1_END);
|
68
|
+
SET_TOKEN_TYPE(H2_END);
|
69
|
+
SET_TOKEN_TYPE(H3_END);
|
70
|
+
SET_TOKEN_TYPE(H4_END);
|
71
|
+
SET_TOKEN_TYPE(H5_END);
|
72
|
+
SET_TOKEN_TYPE(H6_END);
|
73
|
+
SET_TOKEN_TYPE(URI);
|
74
|
+
SET_TOKEN_TYPE(MAIL);
|
75
|
+
SET_TOKEN_TYPE(PATH);
|
76
|
+
SET_TOKEN_TYPE(LINK_START);
|
77
|
+
SET_TOKEN_TYPE(LINK_END);
|
78
|
+
SET_TOKEN_TYPE(EXT_LINK_START);
|
79
|
+
SET_TOKEN_TYPE(EXT_LINK_END);
|
80
|
+
SET_TOKEN_TYPE(SEPARATOR);
|
81
|
+
SET_TOKEN_TYPE(SPACE);
|
82
|
+
SET_TOKEN_TYPE(QUOT_ENTITY);
|
83
|
+
SET_TOKEN_TYPE(AMP_ENTITY);
|
84
|
+
SET_TOKEN_TYPE(NAMED_ENTITY);
|
85
|
+
SET_TOKEN_TYPE(HEX_ENTITY);
|
86
|
+
SET_TOKEN_TYPE(DECIMAL_ENTITY);
|
87
|
+
SET_TOKEN_TYPE(QUOT);
|
88
|
+
SET_TOKEN_TYPE(AMP);
|
89
|
+
SET_TOKEN_TYPE(LESS);
|
90
|
+
SET_TOKEN_TYPE(GREATER);
|
91
|
+
SET_TOKEN_TYPE(IMG_START);
|
92
|
+
SET_TOKEN_TYPE(IMG_END);
|
93
|
+
SET_TOKEN_TYPE(LEFT_CURLY);
|
94
|
+
SET_TOKEN_TYPE(RIGHT_CURLY);
|
95
|
+
SET_TOKEN_TYPE(CRLF);
|
96
|
+
SET_TOKEN_TYPE(SPECIAL_URI_CHARS);
|
97
|
+
SET_TOKEN_TYPE(PRINTABLE);
|
98
|
+
SET_TOKEN_TYPE(ALNUM);
|
99
|
+
SET_TOKEN_TYPE(DEFAULT);
|
100
|
+
SET_TOKEN_TYPE(END_OF_FILE);
|
101
|
+
|
102
|
+
#undef SET_TOKEN_TYPE
|
103
|
+
|
104
|
+
return hash;
|
105
|
+
}
|
106
|
+
|
107
|
+
// for testing and debugging only
|
108
|
+
VALUE wiki_token(token_t *token)
|
109
|
+
{
|
110
|
+
VALUE object = rb_class_new_instance(0, NULL, cWikitextParserToken);
|
111
|
+
(void)rb_iv_set(object, "@start", LONG2NUM((long)token->start));
|
112
|
+
(void)rb_iv_set(object, "@stop", LONG2NUM((long)token->stop));
|
113
|
+
(void)rb_iv_set(object, "@line_start", LONG2NUM(token->line_start));
|
114
|
+
(void)rb_iv_set(object, "@line_stop", LONG2NUM(token->line_stop));
|
115
|
+
(void)rb_iv_set(object, "@column_start", LONG2NUM(token->column_start));
|
116
|
+
(void)rb_iv_set(object, "@column_stop", LONG2NUM(token->column_stop));
|
117
|
+
(void)rb_iv_set(object, "@code_point", INT2NUM(token->code_point));
|
118
|
+
|
119
|
+
// look-up the token type
|
120
|
+
VALUE types = Wikitext_parser_token_types(Qnil);
|
121
|
+
VALUE type = rb_hash_aref(types, INT2FIX(token->type));
|
122
|
+
(void)rb_iv_set(object, "@token_type", type);
|
123
|
+
(void)rb_iv_set(object, "@string_value", rb_str_new(token->start, token->stop - token->start));
|
124
|
+
return object;
|
125
|
+
}
|
@@ -0,0 +1,117 @@
|
|
1
|
+
// Copyright 2008-2009 Wincent Colaiuta. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are met:
|
5
|
+
//
|
6
|
+
// 1. Redistributions of source code must retain the above copyright notice,
|
7
|
+
// this list of conditions and the following disclaimer.
|
8
|
+
// 2. Redistributions in binary form must reproduce the above copyright notice,
|
9
|
+
// this list of conditions and the following disclaimer in the documentation
|
10
|
+
// and/or other materials provided with the distribution.
|
11
|
+
//
|
12
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
13
|
+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
14
|
+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
15
|
+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
|
16
|
+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
17
|
+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
18
|
+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
19
|
+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
20
|
+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
21
|
+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
22
|
+
// POSSIBILITY OF SUCH DAMAGE.
|
23
|
+
|
24
|
+
#include "ruby_compat.h"
|
25
|
+
#include <stdint.h> /* uint32_t */
|
26
|
+
|
27
|
+
// Number of bytes spanned by a token (stop - start). The argument is
// parenthesized so that any pointer expression, such as `&tok` or `tokens + i`,
// can be passed safely.
#define TOKEN_LEN(token)    ((token)->stop - (token)->start)

// New Ruby String holding a copy of the bytes spanned by a token.
#define TOKEN_TEXT(token)   rb_str_new((const char *)(token)->start, TOKEN_LEN(token))
|
29
|
+
|
30
|
+
// A single token: the span it covers plus position and type information.
typedef struct
{
    char        *start;         // first byte of the token
    char        *stop;          // end of the token; stop - start is its length (see TOKEN_LEN)
    size_t      line_start;     // line on which the token starts
    size_t      line_stop;      // line on which the token stops
    size_t      column_start;   // column at which the token starts
    size_t      column_stop;    // column at which the token stops
    uint32_t    code_point;     // code point associated with the token
    int         type;           // one of the token_types enum values
} token_t;
|
41
|
+
|
42
|
+
// Numeric identifiers for every kind of token. Values are assigned
// sequentially starting from zero, so the order of the enumerators matters.
enum token_types {
    NO_TOKEN = 0,

    // imaginary tokens (never explicitly marked up)
    P,
    LI,
    NESTED_LIST,

    // block-level and inline markup
    PRE,
    PRE_START,
    PRE_END,
    NO_WIKI_START,
    NO_WIKI_END,
    BLOCKQUOTE,
    BLOCKQUOTE_START,
    BLOCKQUOTE_END,
    STRONG_EM,
    STRONG_START,
    STRONG_END,
    STRONG,
    EM_START,
    EM_END,
    EM,
    TT_START,
    TT_END,
    TT,
    OL,
    UL,

    // the H*_START tokens must stay consecutive and in ascending order
    // (the arithmetic for the base_heading_level feature assumes this)
    H1_START,
    H2_START,
    H3_START,
    H4_START,
    H5_START,
    H6_START,

    // the same constraint applies to the H*_END tokens
    H1_END,
    H2_END,
    H3_END,
    H4_END,
    H5_END,
    H6_END,

    // links and addresses
    URI,
    MAIL,
    PATH,
    LINK_START,
    LINK_END,
    EXT_LINK_START,
    EXT_LINK_END,
    SEPARATOR,
    SPACE,

    // entities and special characters
    QUOT_ENTITY,
    AMP_ENTITY,
    NAMED_ENTITY,
    HEX_ENTITY,
    DECIMAL_ENTITY,
    QUOT,
    AMP,
    LESS,
    GREATER,

    IMG_START,
    IMG_END,
    LEFT_CURLY,
    RIGHT_CURLY,
    CRLF,
    SPECIAL_URI_CHARS,
    PRINTABLE,
    ALNUM,
    DEFAULT,
    END_OF_FILE
};
|
114
|
+
|
115
|
+
VALUE Wikitext_parser_token_types(VALUE self);
|
116
|
+
|
117
|
+
VALUE wiki_token(token_t *token);
|