wikitext 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/ext/parser.h ADDED
@@ -0,0 +1,31 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include <ruby/ruby.h>
16
+
17
+ VALUE Wikitext_parser_initialize(VALUE self); // registered in wikitext.c as Wikitext::Parser#initialize (arity 0)
18
+
19
+ VALUE Wikitext_parser_tokenize(VALUE self, VALUE string); // registered as Wikitext::Parser#tokenize (arity 1)
20
+
21
+ VALUE Wikitext_parser_benchmarking_tokenize(VALUE self, VALUE string); // registered as Wikitext::Parser#benchmarking_tokenize (arity 1)
22
+
23
+ VALUE Wikitext_parser_sanitize_link_target(VALUE self, VALUE string); // registered as singleton method Wikitext::Parser.sanitize_link_target (arity 1)
24
+
25
+ VALUE Wikitext_parser_encode_link_target(VALUE self, VALUE in); // registered as singleton method Wikitext::Parser.encode_link_target (arity 1)
26
+
27
+ VALUE Wikitext_parser_encode_special_link_target(VALUE self, VALUE in); // registered as singleton method Wikitext::Parser.encode_special_link_target (arity 1)
28
+
29
+ VALUE Wikitext_parser_parse(int argc, VALUE *argv, VALUE self); // registered as Wikitext::Parser#parse with arity -1 (variable argument list)
30
+
31
+ VALUE Wikitext_parser_profiling_parse(VALUE self, VALUE string); // registered as Wikitext::Parser#profiling_parse (arity 1)
data/ext/str.h ADDED
@@ -0,0 +1,135 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include <ruby/ruby.h>
16
+
17
// Growable byte buffer used by the str_* helpers below.
// The contents are length-delimited: the helpers copy exactly "len" bytes and
// never write a terminating NUL.
typedef struct {
    char *ptr;      // start of the buffer; NULL until something has been allocated
    long len;       // number of bytes currently in use
    long capacity;  // number of bytes allocated at ptr
} str_t;
23
+
24
+ // create a new, empty string struct
25
+ inline str_t *str_new(void)
26
+ {
27
+ str_t *str = ALLOC_N(str_t, 1);
28
+ str->ptr = NULL;
29
+ str->len = 0;
30
+ str->capacity = 0;
31
+ return str;
32
+ }
33
+
34
+ // create a new, empty string struct with capacity len
35
+ inline str_t *str_new_size(long len)
36
+ {
37
+ str_t *str = ALLOC_N(str_t, 1);
38
+ str->ptr = ALLOC_N(char, len);
39
+ str->len = 0;
40
+ str->capacity = len;
41
+ return str;
42
+ }
43
+
44
+ // create a new string struct and initialize it with a copy of the buffer of length len pointed to by src
45
+ inline str_t *str_new_copy(char *src, long len)
46
+ {
47
+ str_t *str = ALLOC_N(str_t, 1);
48
+ str->ptr = ALLOC_N(char, len);
49
+ memcpy(str->ptr, src, len);
50
+ str->len = len;
51
+ str->capacity = len;
52
+ return str;
53
+ }
54
+
55
+ // create a new string struct and initialize it with the buffer of length len pointed to by src
56
+ // no copy is made; the struct takes ownership of the buffer and will free it when the struct is disposed of
57
+ inline str_t *str_new_no_copy(char *src, long len)
58
+ {
59
+ str_t *str = ALLOC_N(str_t, 1);
60
+ str->ptr = src;
61
+ str->len = len;
62
+ str->capacity = len;
63
+ return str;
64
+ }
65
+
66
+ // convenience method for testing
67
+ inline str_t *str_new_from_string(VALUE string)
68
+ {
69
+ string = StringValue(string);
70
+ return str_new_copy(RSTRING_PTR(string), RSTRING_LEN(string));
71
+ }
72
+
73
+ // convenience method for testing
74
+ inline VALUE string_from_str(str_t *str)
75
+ {
76
+ return rb_str_new(str->ptr, str->len);
77
+ }
78
+
79
+ // grows a string's capacity to the specified length
80
+ inline void str_grow(str_t *str, long len)
81
+ {
82
+ if (str->capacity < len)
83
+ {
84
+ if (str->ptr)
85
+ REALLOC_N(str->ptr, char, len);
86
+ else
87
+ str->ptr = ALLOC_N(char, len);
88
+ str->capacity = len;
89
+ }
90
+ }
91
+
92
+ inline void str_append(str_t *str, char *src, long len)
93
+ {
94
+ long new_len = str->len + len;
95
+ if (str->capacity < new_len)
96
+ {
97
+ if (str->ptr)
98
+ REALLOC_N(str->ptr, char, new_len);
99
+ else
100
+ str->ptr = ALLOC_N(char, new_len);
101
+ str->capacity = new_len;
102
+ }
103
+ memcpy(str->ptr + str->len, src, len);
104
+ str->len = new_len;
105
+ }
106
+
107
+ // appends the "other" string struct onto str
108
+ inline void str_append_str(str_t *str, str_t *other)
109
+ {
110
+ str_append(str, other->ptr, other->len);
111
+ }
112
+
113
+ // this is a temporary convenience measure
114
+ // later on if I develop in-place variants of some functions this won't be needed
115
+ inline void str_swap(str_t **a, str_t **b)
116
+ {
117
+ str_t *c;
118
+ c = *a;
119
+ *a = *b;
120
+ *b = c;
121
+ }
122
+
123
+ // don't actually free the memory yet
124
+ // this makes str structs very useful when reusing buffers because it avoids reallocation
125
+ inline void str_clear(str_t *str)
126
+ {
127
+ str->len = 0;
128
+ }
129
+
130
+ inline void str_free(str_t *str)
131
+ {
132
+ if (str->ptr)
133
+ free(str->ptr);
134
+ free(str);
135
+ }
data/ext/token.c ADDED
@@ -0,0 +1,109 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include "token.h"
16
+ #include "wikitext.h"
17
+
18
+ // return a hash of token types
19
+ // we make this available for unit testing purposes
20
+
21
+ VALUE Wikitext_parser_token_types(VALUE self)
22
+ {
23
+ VALUE hash = rb_hash_new();
24
+
25
+ #define SET_TOKEN_TYPE(identifier) (void)rb_hash_aset(hash, INT2FIX(identifier), \
26
+ rb_funcall(rb_funcall(rb_str_new2(#identifier), rb_intern("downcase"), 0), rb_intern("to_sym"), 0))
27
+
28
+ SET_TOKEN_TYPE(NO_TOKEN);
29
+ SET_TOKEN_TYPE(P);
30
+ SET_TOKEN_TYPE(LI);
31
+ SET_TOKEN_TYPE(NESTED_LIST);
32
+ SET_TOKEN_TYPE(PRE);
33
+ SET_TOKEN_TYPE(PRE_START);
34
+ SET_TOKEN_TYPE(PRE_END);
35
+ SET_TOKEN_TYPE(NO_WIKI_START);
36
+ SET_TOKEN_TYPE(NO_WIKI_END);
37
+ SET_TOKEN_TYPE(BLOCKQUOTE);
38
+ SET_TOKEN_TYPE(BLOCKQUOTE_START);
39
+ SET_TOKEN_TYPE(BLOCKQUOTE_END);
40
+ SET_TOKEN_TYPE(STRONG_EM);
41
+ SET_TOKEN_TYPE(STRONG_START);
42
+ SET_TOKEN_TYPE(STRONG_END);
43
+ SET_TOKEN_TYPE(STRONG);
44
+ SET_TOKEN_TYPE(EM_START);
45
+ SET_TOKEN_TYPE(EM_END);
46
+ SET_TOKEN_TYPE(EM);
47
+ SET_TOKEN_TYPE(TT_START);
48
+ SET_TOKEN_TYPE(TT_END);
49
+ SET_TOKEN_TYPE(TT);
50
+ SET_TOKEN_TYPE(OL);
51
+ SET_TOKEN_TYPE(UL);
52
+ SET_TOKEN_TYPE(H6_START);
53
+ SET_TOKEN_TYPE(H5_START);
54
+ SET_TOKEN_TYPE(H4_START);
55
+ SET_TOKEN_TYPE(H3_START);
56
+ SET_TOKEN_TYPE(H2_START);
57
+ SET_TOKEN_TYPE(H1_START);
58
+ SET_TOKEN_TYPE(H6_END);
59
+ SET_TOKEN_TYPE(H5_END);
60
+ SET_TOKEN_TYPE(H4_END);
61
+ SET_TOKEN_TYPE(H3_END);
62
+ SET_TOKEN_TYPE(H2_END);
63
+ SET_TOKEN_TYPE(H1_END);
64
+ SET_TOKEN_TYPE(URI);
65
+ SET_TOKEN_TYPE(MAIL);
66
+ SET_TOKEN_TYPE(LINK_START);
67
+ SET_TOKEN_TYPE(LINK_END);
68
+ SET_TOKEN_TYPE(EXT_LINK_START);
69
+ SET_TOKEN_TYPE(EXT_LINK_END);
70
+ SET_TOKEN_TYPE(SEPARATOR);
71
+ SET_TOKEN_TYPE(SPACE);
72
+ SET_TOKEN_TYPE(QUOT_ENTITY);
73
+ SET_TOKEN_TYPE(AMP_ENTITY);
74
+ SET_TOKEN_TYPE(NAMED_ENTITY);
75
+ SET_TOKEN_TYPE(HEX_ENTITY);
76
+ SET_TOKEN_TYPE(DECIMAL_ENTITY);
77
+ SET_TOKEN_TYPE(QUOT);
78
+ SET_TOKEN_TYPE(AMP);
79
+ SET_TOKEN_TYPE(LESS);
80
+ SET_TOKEN_TYPE(GREATER);
81
+ SET_TOKEN_TYPE(CRLF);
82
+ SET_TOKEN_TYPE(PRINTABLE);
83
+ SET_TOKEN_TYPE(DEFAULT);
84
+ SET_TOKEN_TYPE(END_OF_FILE);
85
+
86
+ #undef SET_TOKEN_TYPE
87
+
88
+ return hash;
89
+ }
90
+
91
+ // for testing and debugging only
92
+ VALUE _Wikitext_token(token_t *token)
93
+ {
94
+ VALUE object = rb_class_new_instance(0, NULL, cWikitextParserToken);
95
+ (void)rb_iv_set(object, "@start", LONG2NUM((long)token->start));
96
+ (void)rb_iv_set(object, "@stop", LONG2NUM((long)token->stop));
97
+ (void)rb_iv_set(object, "@line_start", LONG2NUM(token->line_start));
98
+ (void)rb_iv_set(object, "@line_stop", LONG2NUM(token->line_stop));
99
+ (void)rb_iv_set(object, "@column_start", LONG2NUM(token->column_start));
100
+ (void)rb_iv_set(object, "@column_stop", LONG2NUM(token->column_stop));
101
+ (void)rb_iv_set(object, "@code_point", INT2NUM(token->code_point));
102
+
103
+ // look-up the token type
104
+ VALUE types = Wikitext_parser_token_types(Qnil);
105
+ VALUE type = rb_hash_aref(types, INT2FIX(token->type));
106
+ (void)rb_iv_set(object, "@token_type", type);
107
+ (void)rb_iv_set(object, "@string_value", rb_str_new(token->start, token->stop - token->start));
108
+ return object;
109
+ }
data/ext/token.h ADDED
@@ -0,0 +1,95 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include <ruby/ruby.h>
16
+ #include <stdint.h> /* uint32_t */
17
+
18
// TOKEN_TEXT: new Ruby String holding a copy of the bytes between token->start and token->stop
// TOKEN_LEN:  number of bytes between token->start and token->stop
// "token" is a pointer to a token_t; it is parenthesized in the expansions so
// that expressions such as &tok or tokens + 1 can be passed, and it is
// evaluated more than once, so it must not have side effects.
#define TOKEN_TEXT(token) rb_str_new((const char *)(token)->start, ((token)->stop - (token)->start))
#define TOKEN_LEN(token) ((token)->stop - (token)->start)
20
+
21
// A single token. The token's text is the byte range [start, stop): its
// length is stop - start (see TOKEN_LEN).
typedef struct {
    char *start;            // first byte of the token's text
    char *stop;             // one past the last byte of the token's text
    size_t line_start;      // line/column position fields
    size_t line_stop;
    size_t column_start;
    size_t column_stop;
    uint32_t code_point;    // code point value associated with the token
    int type;               // token type; looked up against enum token_types values
} token_t;
32
+
33
// Token type identifiers. Values are sequential starting from zero; the order
// matters because the integer values are used as hash keys by
// Wikitext_parser_token_types().
enum token_types
{
    NO_TOKEN = 0,
    P,                  // imaginary token (never explicitly marked up)
    LI,                 // imaginary token (never explicitly marked up)
    NESTED_LIST,        // imaginary token (never explicitly marked up)
    PRE,
    PRE_START,
    PRE_END,
    NO_WIKI_START,
    NO_WIKI_END,
    BLOCKQUOTE,
    BLOCKQUOTE_START,
    BLOCKQUOTE_END,
    STRONG_EM,
    STRONG_START,
    STRONG_END,
    STRONG,
    EM_START,
    EM_END,
    EM,
    TT_START,
    TT_END,
    TT,
    OL,
    UL,
    H6_START,
    H5_START,
    H4_START,
    H3_START,
    H2_START,
    H1_START,
    H6_END,
    H5_END,
    H4_END,
    H3_END,
    H2_END,
    H1_END,
    URI,
    MAIL,
    LINK_START,
    LINK_END,
    EXT_LINK_START,
    EXT_LINK_END,
    SEPARATOR,
    SPACE,
    QUOT_ENTITY,
    AMP_ENTITY,
    NAMED_ENTITY,
    HEX_ENTITY,
    DECIMAL_ENTITY,
    QUOT,
    AMP,
    LESS,
    GREATER,
    CRLF,
    PRINTABLE,
    DEFAULT,
    END_OF_FILE
};
92
+
93
+ VALUE Wikitext_parser_token_types(VALUE self); // returns a Hash of token type value => downcased name Symbol; registered in wikitext.c as Wikitext::Parser::Token.types
94
+
95
+ VALUE _Wikitext_token(token_t *token); // for testing and debugging only: builds a Wikitext::Parser::Token instance from a token_t
data/ext/wikitext.c ADDED
@@ -0,0 +1,60 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include "wikitext_ragel.h"
16
+ #include "parser.h"
17
+
18
+ VALUE mWikitext = 0; // module Wikitext; assigned in Init_wikitext()
19
+ VALUE cWikitextParser = 0; // class Wikitext::Parser; assigned in Init_wikitext()
20
+ VALUE eWikitextParserError = 0; // class Wikitext::Parser::Error; assigned in Init_wikitext()
21
+ VALUE cWikitextParserToken = 0; // class Wikitext::Parser::Token; assigned in Init_wikitext()
22
+
23
+ void Init_wikitext()
24
+ {
25
+ // Wikitext
26
+ mWikitext = rb_define_module("Wikitext");
27
+
28
+ // Wikitext::Parser
29
+ cWikitextParser = rb_define_class_under(mWikitext, "Parser", rb_cObject);
30
+ rb_define_method(cWikitextParser, "initialize", Wikitext_parser_initialize, 0);
31
+ rb_define_method(cWikitextParser, "parse", Wikitext_parser_parse, -1);
32
+ rb_define_method(cWikitextParser, "profiling_parse", Wikitext_parser_profiling_parse, 1);
33
+ rb_define_method(cWikitextParser, "tokenize", Wikitext_parser_tokenize, 1);
34
+ rb_define_method(cWikitextParser, "benchmarking_tokenize", Wikitext_parser_benchmarking_tokenize, 1);
35
+ rb_define_singleton_method(cWikitextParser, "sanitize_link_target", Wikitext_parser_sanitize_link_target, 1);
36
+ rb_define_singleton_method(cWikitextParser, "encode_link_target", Wikitext_parser_encode_link_target, 1);
37
+ rb_define_singleton_method(cWikitextParser, "encode_special_link_target", Wikitext_parser_encode_special_link_target, 1);
38
+ rb_define_attr(cWikitextParser, "line_ending", Qtrue, Qtrue);
39
+ rb_define_attr(cWikitextParser, "internal_link_prefix", Qtrue, Qtrue);
40
+ rb_define_attr(cWikitextParser, "external_link_class", Qtrue, Qtrue);
41
+ rb_define_attr(cWikitextParser, "mailto_class", Qtrue, Qtrue);
42
+ rb_define_attr(cWikitextParser, "autolink", Qtrue, Qtrue);
43
+ rb_define_attr(cWikitextParser, "treat_slash_as_special", Qtrue, Qtrue);
44
+
45
+ // Wikitext::Parser::Error
46
+ eWikitextParserError = rb_define_class_under(cWikitextParser, "Error", rb_eException);
47
+
48
+ // Wikitext::Parser::Token
49
+ cWikitextParserToken = rb_define_class_under(cWikitextParser, "Token", rb_cObject);
50
+ rb_define_singleton_method(cWikitextParserToken, "types", Wikitext_parser_token_types, 0);
51
+ rb_define_attr(cWikitextParserToken, "start", Qtrue, Qfalse);
52
+ rb_define_attr(cWikitextParserToken, "stop", Qtrue, Qfalse);
53
+ rb_define_attr(cWikitextParserToken, "line_start", Qtrue, Qfalse);
54
+ rb_define_attr(cWikitextParserToken, "line_stop", Qtrue, Qfalse);
55
+ rb_define_attr(cWikitextParserToken, "column_start", Qtrue, Qfalse);
56
+ rb_define_attr(cWikitextParserToken, "column_stop", Qtrue, Qfalse);
57
+ rb_define_attr(cWikitextParserToken, "code_point", Qtrue, Qfalse);
58
+ rb_define_attr(cWikitextParserToken, "token_type", Qtrue, Qfalse);
59
+ rb_define_attr(cWikitextParserToken, "string_value", Qtrue, Qfalse);
60
+ }
data/ext/wikitext.h ADDED
@@ -0,0 +1,30 @@
1
+ // Copyright 2008 Wincent Colaiuta
2
+ // This program is free software: you can redistribute it and/or modify
3
+ // it under the terms of the GNU General Public License as published by
4
+ // the Free Software Foundation, either version 3 of the License, or
5
+ // (at your option) any later version.
6
+ //
7
+ // This program is distributed in the hope that it will be useful,
8
+ // but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
+ // GNU General Public License for more details.
11
+ //
12
+ // You should have received a copy of the GNU General Public License
13
+ // along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+
15
+ #include <ruby/ruby.h>
16
+ #include <stdint.h>
17
+
18
+ #define ruby_inspect(obj) rb_funcall(rb_mKernel, rb_intern("p"), 1, obj) // calls Kernel#p with obj (presumably a debugging aid)
19
+
20
+ // module Wikitext (defined in wikitext.c, assigned in Init_wikitext)
21
+ extern VALUE mWikitext;
22
+
23
+ // class Wikitext::Parser (defined in wikitext.c, assigned in Init_wikitext)
24
+ extern VALUE cWikitextParser;
25
+
26
+ // class Wikitext::Parser::Error (defined in wikitext.c, assigned in Init_wikitext)
27
+ extern VALUE eWikitextParserError;
28
+
29
+ // class Wikitext::Parser::Token (defined in wikitext.c, assigned in Init_wikitext)
30
+ extern VALUE cWikitextParserToken;