ruxml 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+
5
+ #include "str.hpp"
6
+
7
// Packs the first two characters of `a` into one TokenType value:
// a[0] is added as-is and a[1] is shifted left by 7 bits.
#define TOKEN2(a) ((TokenType)((uint16_t)((a)[0]) + ((uint16_t)((a)[1]) << 7)))

// Token kinds produced by the lexer.
// Single-character tokens use the character's own code, two-character
// tokens use the TOKEN2 packing, and the remaining kinds start at 20000.
enum TokenType : uint16_t {
    TOK_INVALID = 0,

    // Single-character tokens.
    TOK_L_ANGLED = '<',
    TOK_R_ANGLED = '>',
    TOK_EQUALS   = '=',
    TOK_COLON    = ':',
    TOK_SLASH    = '/',
    TOK_QUESTION = '?',
    TOK_BANG     = '!',
    TOK_HYPHEN   = '-',
    TOK_ONE_CHAR = 127,  // marker value placed after the single-character tokens

    // Two-character tokens.
    TOK_TAG_START_CLOSE = TOKEN2("</"),
    TOK_TAG_XML_START   = TOKEN2("<?"),
    TOK_TAG_XML_END     = TOKEN2("?>"),
    TOK_TAG_SELF_CLOSE  = TOKEN2("/>"),
    TOK_TWO_CHAR        = 128 * 127,  // marker value placed after the two-character tokens

    // Remaining token kinds; values continue sequentially from 20000.
    TOK_IDENTIFIER = 20000,
    TOK_VALUE,
    TOK_TEXT,
    TOK_COMMENT_START,
    TOK_COMMENT_END,
};
33
+
34
+ struct Token {
35
+ TokenType type;
36
+ int64_t line;
37
+ int64_t c0;
38
+ int64_t c1;
39
+ int64_t offset;
40
+ String text;
41
+ };
42
+
43
// Action codes stored per byte value in Parser::tag_initial_map.
enum LexerAction : uint8_t {
    LA_INVALID = 0,
    LA_ONE_CHAR,
    LA_TWO_CHAR_END,
    LA_WHITESPACE,
    LA_NEWLINE,
    LA_IDENTIFIER,
    LA_VALUE
};
52
+
53
// Lexer state kept in Parser::mode.
enum LexerMode : uint8_t {
    LM_OUT = 0,
    LM_TAG,
    LM_COMMENT
};
58
+
59
// Identifies what kind of input a Parser currently has open.
enum ParserSourceType {
    PST_NONE = 0,  // nothing opened
    PST_MEMORY,
    PST_MMAP
};
64
+
65
// Kinds of node reported by the parser.
enum NodeType {
    NODE_INVALID = 0,
    NODE_ELEMENT_BEGIN,
    NODE_ELEMENT_END,
    NODE_TEXT,
    NODE_XML_HEADER,
    NODE_COMMENT,

    // Number of node types; used to size lookup tables indexed by NodeType.
    MAX_NODE_TYPES
};
75
+
76
+ struct Node {
77
+ NodeType type;
78
+ int64_t line;
79
+ int64_t c0;
80
+ int64_t c1;
81
+ int64_t offset;
82
+ int64_t depth;
83
+
84
+ bool self_closing;
85
+ String text;
86
+ };
87
+
88
+ struct Parser {
89
+ String source;
90
+ ParserSourceType source_type;
91
+ char *buffer;
92
+ int64_t length;
93
+
94
+ char *ptr;
95
+ char *end_ptr;
96
+
97
+ int64_t line;
98
+ int64_t col;
99
+
100
+ bool done;
101
+ bool errored;
102
+ LexerMode mode;
103
+
104
+ LexerAction tag_initial_map[256];
105
+
106
+ uint8_t identifier_map[256];
107
+
108
+ Token token;
109
+ Node node;
110
+
111
+ int64_t depth;
112
+ };
113
+
114
+ void parser_init(Parser *parser);
115
+ bool parser_open_memory(Parser *parser, String name, const char *memory, int64_t offset = 0, int64_t length = 0);
116
+ bool parser_open_file_mmap(Parser *parser, String filename, int64_t offset = 0, int64_t length = 0);
117
+ void parser_destroy(Parser *parser);
118
+
119
+ void print_error_start(Parser *parser, Token token);
120
+ bool expect_type(Parser *parser, TokenType type);
121
+
122
+ Token read_token(Parser *parser); // Internal only: use get_token instead
123
+ Node read_node(Parser *parser); // Internal only: use get_node instead
124
+
125
+ inline Token peek_token(Parser *parser) { return parser->token; }
126
+
127
+ inline Token get_token(Parser *parser) {
128
+ parser->token = read_token(parser);
129
+ if (!parser->token.type) parser->done = true;
130
+ return parser->token;
131
+ }
132
+
133
+ inline Node get_node(Parser *parser) {
134
+ parser->node = read_node(parser);
135
+ return parser->node;
136
+ }
137
+
138
+ void print_token(Token token);
139
+
140
+ void print_node(Node node);
@@ -0,0 +1,273 @@
1
+ #include "parser.hpp"
2
+ #include <ruby/ruby.h>
3
+
4
+ extern "C"
5
+ {
6
+
7
+ VALUE ruxmlModule;
8
+ VALUE ruxmlParser;
9
+ VALUE ruxmlNode;
10
+
11
+ ID node_type_ids[MAX_NODE_TYPES];
12
+
13
+ //
14
+ // Helpers
15
+ //
16
+
17
+ VALUE rbstr_from_str(String str) { return rb_str_export_locale(rb_str_new(str.data, str.length)); }
18
+
19
+ String str_from_rbstr(VALUE rbstr) { return String{(int) RSTRING_LEN(rbstr), StringValuePtr(rbstr)}; }
20
+
21
+ //
22
+ // Node
23
+ //
24
+
25
+ static Node *Node_instance(VALUE self) {
26
+ return (Node *) RDATA(self)->data;
27
+ }
28
+
29
+ static size_t Node_size(const void *data) {
30
+ return sizeof(Node);
31
+ }
32
+
33
// dfree callback for Node_data_type: releases the native Node.
// NOTE(review): nodes are allocated both by TypedData_Make_Struct (Ruby's
// allocator) and by raw_allocate_type in Parser_node - confirm that free()
// is the matching deallocator for both.
static void Node_free(void *data) {
    free(data);
}
36
+
37
+ rb_data_type_t Node_data_type = {
38
+ "Node",
39
+ {NULL, Node_free, Node_size},
40
+ 0, 0,
41
+ RUBY_TYPED_FREE_IMMEDIATELY
42
+ };
43
+
44
+ static VALUE Node_allocate(VALUE self) {
45
+ Node *node;
46
+ return TypedData_Make_Struct(self, Node, &Node_data_type, node);
47
+ }
48
+
49
+ static VALUE Node_initialize(VALUE self) {
50
+ Node *node;
51
+ TypedData_Get_Struct(self, Node, &Node_data_type, node);
52
+ *node = Node{};
53
+ return self;
54
+ }
55
+
56
+ static VALUE Node_column_start(VALUE self) {
57
+ auto node = Node_instance(self);
58
+ return INT2NUM(node->c0);
59
+ }
60
+
61
+ static VALUE Node_line(VALUE self) {
62
+ auto node = Node_instance(self);
63
+ return INT2NUM(node->line);
64
+ }
65
+
66
+ static VALUE Node_offset(VALUE self) {
67
+ auto node = Node_instance(self);
68
+ return INT2NUM(node->offset);
69
+ }
70
+
71
+ static VALUE Node_text(VALUE self) {
72
+ auto node = Node_instance(self);
73
+ return rbstr_from_str(node->text);
74
+ }
75
+
76
+ static VALUE Node_type(VALUE self) {
77
+ auto node = Node_instance(self);
78
+ return ID2SYM(node_type_ids[node->type]);
79
+ }
80
+
81
+ static VALUE Node_self_closing(VALUE self) {
82
+ auto node = Node_instance(self);
83
+ return node->self_closing ? Qtrue : Qfalse;
84
+ }
85
+
86
+ //
87
+ // Parser
88
+ //
89
+
90
+ static Parser *Parser_instance(VALUE self) {
91
+ return (Parser *) RDATA(self)->data;
92
+ }
93
+
94
+ static size_t Parser_size(const void *data) {
95
+ return sizeof(Parser);
96
+ }
97
+
98
+ static void Parser_free(void *data) {
99
+ parser_destroy((Parser *) data);
100
+ free(data);
101
+ }
102
+
103
+ rb_data_type_t Parser_data_type = {
104
+ "Parser",
105
+ {NULL, Parser_free, Parser_size},
106
+ 0, 0,
107
+ RUBY_TYPED_FREE_IMMEDIATELY
108
+ };
109
+
110
+ static VALUE Parser_allocate(VALUE self) {
111
+ Parser *parser;
112
+ return TypedData_Make_Struct(self, Parser, &Parser_data_type, parser);
113
+ }
114
+
115
+ static VALUE Parser_initialize(VALUE self) {
116
+ Parser *parser;
117
+ TypedData_Get_Struct(self, Parser, &Parser_data_type, parser);
118
+
119
+ *parser = Parser{};
120
+ parser_init(parser);
121
+ return self;
122
+ }
123
+
124
+ static VALUE Parser_open_string(int argc, VALUE* argv, VALUE self) {
125
+ VALUE name;
126
+ VALUE data;
127
+ VALUE offset;
128
+ VALUE length;
129
+ rb_scan_args(argc, argv, "22", &name, &data, &offset, &length);
130
+
131
+ Check_Type(name, T_STRING);
132
+ Check_Type(data, T_STRING);
133
+
134
+ int64_t data_offset = 0;
135
+ if (!NIL_P(offset)) {
136
+ Check_Type(offset, T_FIXNUM);
137
+ data_offset = NUM2INT(offset);
138
+ }
139
+
140
+ int64_t data_length;
141
+ if (NIL_P(length)) {
142
+ data_length = RSTRING_LEN(data);
143
+ } else {
144
+ Check_Type(length, T_FIXNUM);
145
+ data_length = NUM2INT(length);
146
+ }
147
+
148
+ auto parser = Parser_instance(self);
149
+ auto success = parser_open_memory(parser, str_from_rbstr(name), StringValuePtr(data), data_offset, data_length);
150
+ return success ? Qtrue : Qfalse;
151
+ }
152
+
153
+ static VALUE Parser_open_file(int argc, VALUE* argv, VALUE self) {
154
+ VALUE filename;
155
+ VALUE offset;
156
+ VALUE length;
157
+ rb_scan_args(argc, argv, "12", &filename, &offset, &length);
158
+
159
+ Check_Type(filename, T_STRING);
160
+
161
+ int64_t data_offset = 0;
162
+ if (!NIL_P(offset)) {
163
+ Check_Type(offset, T_FIXNUM);
164
+ data_offset = NUM2INT(offset);
165
+ }
166
+
167
+ int64_t data_length = 0;
168
+ if (!NIL_P(length)) {
169
+ Check_Type(length, T_FIXNUM);
170
+ data_length = NUM2INT(length);
171
+ }
172
+
173
+ auto parser = Parser_instance(self);
174
+ auto success = parser_open_file_mmap(parser, str_from_rbstr(filename), data_offset, data_length);
175
+ return success ? Qtrue : Qfalse;
176
+ }
177
+
178
+ static VALUE Parser_node(VALUE self) {
179
+ auto parser = Parser_instance(self);
180
+ auto node_ptr = raw_allocate_type(Node);
181
+ *node_ptr = parser->node;
182
+ return TypedData_Wrap_Struct(ruxmlNode, &Node_data_type, node_ptr);
183
+ }
184
+
185
+ static VALUE Parser_next_node(VALUE self) {
186
+ auto parser = Parser_instance(self);
187
+ get_node(parser);
188
+ return parser->done ? Qfalse : Qtrue;
189
+ }
190
+
191
+ static VALUE Parser_done(VALUE self) {
192
+ auto parser = Parser_instance(self);
193
+ return parser->done ? Qtrue : Qfalse;
194
+ }
195
+
196
+ static VALUE Parser_errored(VALUE self) {
197
+ auto parser = Parser_instance(self);
198
+ return parser->errored ? Qtrue : Qfalse;
199
+ }
200
+
201
+ static VALUE Parser_node_column_start(VALUE self) {
202
+ auto parser = Parser_instance(self);
203
+ return INT2NUM(parser->node.c0);
204
+ }
205
+
206
+ static VALUE Parser_node_line(VALUE self) {
207
+ auto parser = Parser_instance(self);
208
+ return INT2NUM(parser->node.line);
209
+ }
210
+
211
+ static VALUE Parser_node_offset(VALUE self) {
212
+ auto parser = Parser_instance(self);
213
+ return INT2NUM(parser->node.offset);
214
+ }
215
+
216
+ static VALUE Parser_node_text(VALUE self) {
217
+ auto parser = Parser_instance(self);
218
+ return rbstr_from_str(parser->node.text);
219
+ }
220
+
221
+ static VALUE Parser_node_type(VALUE self) {
222
+ auto parser = Parser_instance(self);
223
+ return ID2SYM(node_type_ids[parser->node.type]);
224
+ }
225
+
226
+ static VALUE Parser_node_self_closing(VALUE self) {
227
+ auto parser = Parser_instance(self);
228
+ return parser->node.self_closing ? Qtrue : Qfalse;
229
+ }
230
+
231
+ //
232
+ // Init
233
+ //
234
+
235
+ void Init_ruxml() {
236
+ node_type_ids[NODE_INVALID] = rb_intern("invalid");
237
+ node_type_ids[NODE_ELEMENT_BEGIN] = rb_intern("begin");
238
+ node_type_ids[NODE_ELEMENT_END] = rb_intern("end");
239
+ node_type_ids[NODE_TEXT] = rb_intern("text");
240
+ node_type_ids[NODE_XML_HEADER] = rb_intern("xml_header");
241
+ node_type_ids[NODE_COMMENT] = rb_intern("comment");
242
+
243
+ ruxmlModule = rb_define_module("RUXML");
244
+
245
+ ruxmlNode = rb_define_class_under(ruxmlModule, "Node", rb_cData);
246
+ rb_define_alloc_func(ruxmlNode, Node_allocate);
247
+ rb_define_method(ruxmlNode, "initialize", reinterpret_cast<VALUE (*)(...)>(Node_initialize), 0);
248
+ rb_define_method(ruxmlNode, "column_start", reinterpret_cast<VALUE (*)(...)>(Node_column_start), 0);
249
+ rb_define_method(ruxmlNode, "line", reinterpret_cast<VALUE (*)(...)>(Node_line), 0);
250
+ rb_define_method(ruxmlNode, "offset", reinterpret_cast<VALUE (*)(...)>(Node_offset), 0);
251
+ rb_define_method(ruxmlNode, "text", reinterpret_cast<VALUE (*)(...)>(Node_text), 0);
252
+ rb_define_method(ruxmlNode, "type", reinterpret_cast<VALUE (*)(...)>(Node_type), 0);
253
+ rb_define_method(ruxmlNode, "self_closing", reinterpret_cast<VALUE (*)(...)>(Node_self_closing), 0);
254
+
255
+ ruxmlParser = rb_define_class_under(ruxmlModule, "Parser", rb_cData);
256
+ rb_define_alloc_func(ruxmlParser, Parser_allocate);
257
+ rb_define_method(ruxmlParser, "initialize", reinterpret_cast<VALUE (*)(...)>(Parser_initialize), 0);
258
+ rb_define_method(ruxmlParser, "open_string", reinterpret_cast<VALUE (*)(...)>(Parser_open_string), -1);
259
+ rb_define_method(ruxmlParser, "open_file", reinterpret_cast<VALUE (*)(...)>(Parser_open_file), -1);
260
+ rb_define_method(ruxmlParser, "node", reinterpret_cast<VALUE (*)(...)>(Parser_node), 0);
261
+ rb_define_method(ruxmlParser, "next_node", reinterpret_cast<VALUE (*)(...)>(Parser_next_node), 0);
262
+ rb_define_method(ruxmlParser, "done", reinterpret_cast<VALUE (*)(...)>(Parser_done), 0);
263
+ rb_define_method(ruxmlParser, "errored", reinterpret_cast<VALUE (*)(...)>(Parser_errored), 0);
264
+
265
+ rb_define_method(ruxmlParser, "node_column_start", reinterpret_cast<VALUE (*)(...)>(Parser_node_column_start), 0);
266
+ rb_define_method(ruxmlParser, "node_line", reinterpret_cast<VALUE (*)(...)>(Parser_node_line), 0);
267
+ rb_define_method(ruxmlParser, "node_offset", reinterpret_cast<VALUE (*)(...)>(Parser_node_offset), 0);
268
+ rb_define_method(ruxmlParser, "node_text", reinterpret_cast<VALUE (*)(...)>(Parser_node_text), 0);
269
+ rb_define_method(ruxmlParser, "node_type", reinterpret_cast<VALUE (*)(...)>(Parser_node_type), 0);
270
+ rb_define_method(ruxmlParser, "node_self_closing", reinterpret_cast<VALUE (*)(...)>(Parser_node_self_closing), 0);
271
+ }
272
+
273
+ }
data/ext/ruxml/str.cpp ADDED
@@ -0,0 +1,216 @@
1
+ //
2
+ // Created by divan on 27/12/18.
3
+ //
4
+
5
+ #include <cstdarg>
6
+ #include <cstdio>
7
+ #include "str.hpp"
8
+
9
// Returns the number of characters in the NUL-terminated string `str`,
// not counting the terminator.
uint64_t zstr_length(const char *str) {
    const char *cursor = str;
    while (*cursor != 0) {
        cursor++;
    }
    return (uint64_t) (cursor - str);
}
14
+
15
+ char *zstr_dup(const char *str, int64_t size) {
16
+ char *result = raw_allocate_string_zt(size);
17
+ memcpy(result, str, sizeof(char) * size);
18
+ return result;
19
+ }
20
+
21
+ char *zstr_dup(const char *str) {
22
+ return zstr_dup(str, zstr_length(str));
23
+ }
24
+
25
+ char *str_to_zstr(Allocator *allocator, String s) {
26
+ char *result = (char *) allocate_size(allocator, (size_t) (s.length + 1));
27
+ memcpy(result, s.data, sizeof(char) * s.length);
28
+ result[s.length] = 0;
29
+ return result;
30
+ }
31
+
32
+ char *str_to_zstr(String s) { return str_to_zstr(temp_allocator, s); }
33
+
34
+ String str_dup(Allocator *allocator, String s) {
35
+ String result;
36
+ result.length = s.length;
37
+ result.data = (char *) allocate_size(allocator, (size_t) (s.length + 1));
38
+ memcpy(result.data, s.data, sizeof(char) * s.length);
39
+ result.data[s.length] = 0;
40
+ return result;
41
+ }
42
+
43
+ String str_dup(String s) { return str_dup(temp_allocator, s); }
44
+
45
+ String str_dup(Allocator *allocator, const char *str) {
46
+ return str_dup(allocator, str, static_cast<int>(zstr_length(str)));
47
+ }
48
+
49
+ String str_dup(const char *str) { return str_dup(temp_allocator, str); }
50
+
51
+ String str_dup(Allocator *allocator, const char *str, int length) {
52
+ String result;
53
+ result.length = length;
54
+ result.data = allocate_zstring(allocator, length);
55
+ memcpy(result.data, str, sizeof(char) * length);
56
+ result.data[length] = 0;
57
+ return result;
58
+ }
59
+
60
+ String str_dup(const char *str, int length) { return str_dup(temp_allocator, str, length); }
61
+
62
// Returns the index of the last occurrence of `c` in the NUL-terminated
// string `str`, or -1 when it does not occur. The terminator itself is
// never matched.
int zstr_find_last(const char *str, char c) {
    int last = -1;
    for (int i = 0; str[i] != 0; i++) {
        if (str[i] == c) {
            last = i;
        }
    }
    return last;
}
72
+
73
+ int str_find_last(String s, char c, int after) {
74
+ int result = -1;
75
+ int index = 0;
76
+ char *str = s.data + after;
77
+ for (int i = after; i < s.length; i++) {
78
+ if (*str == c) result = index;
79
+ str++;
80
+ index++;
81
+ }
82
+ return result;
83
+ }
84
+
85
+ int str_find_first(String s, char c, int after) {
86
+ char *str = s.data + after;
87
+ for (int i = after; i < s.length; i++) {
88
+ if (*str == c) return i;
89
+ str++;
90
+ }
91
+ return -1;
92
+ }
93
+
94
+ bool str_equal(String a, String b) {
95
+ if (a.length != b.length) return false;
96
+ return memcmp(a.data, b.data, a.length) == 0;
97
+ }
98
+
99
+ bool str_equal(String a, const char *b) {
100
+ return str_equal(a, b, zstr_length(b));
101
+ }
102
+
103
+ bool str_equal(String a, const char *b_data, int b_length) {
104
+ if (a.length != b_length) return false;
105
+ return memcmp(a.data, b_data, a.length) == 0;
106
+ }
107
+
108
+ int str_compare(String a, String b) {
109
+ if (a.length < b.length) return -1;
110
+ if (b.length > a.length) return 1;
111
+ return memcmp(a.data, b.data, a.length);
112
+ }
113
+
114
+ bool str_empty(String s) {
115
+ return s.length == 0 || !s.data;
116
+ }
117
+
118
+ bool parse_int(String string, int32_t *result_ptr) {
119
+ bool valid = false;
120
+ int result = 0;
121
+
122
+ auto buffer = string.data;
123
+
124
+ for (int i = 0; i < string.length; i++) {
125
+ char c = buffer[0];
126
+ if (c >= '0' && c <= '9') {
127
+ result *= 10;
128
+ result += c - '0';
129
+ valid = true;
130
+ } else {
131
+ valid = false;
132
+ break;
133
+ }
134
+ buffer++;
135
+ }
136
+
137
+ *result_ptr = result;
138
+ return valid;
139
+ }
140
+
141
+ bool parse_int64(String string, int64_t *result_ptr) {
142
+ bool valid = false;
143
+ int result = 0;
144
+
145
+ auto buffer = string.data;
146
+
147
+ for (int i = 0; i < string.length; i++) {
148
+ char c = buffer[0];
149
+ if (c >= '0' && c <= '9') {
150
+ result *= 10;
151
+ result += c - '0';
152
+ valid = true;
153
+ } else {
154
+ valid = false;
155
+ break;
156
+ }
157
+ buffer++;
158
+ }
159
+
160
+ *result_ptr = result;
161
+ return valid;
162
+ }
163
+
164
+ String str_print(Allocator *allocator, const char *fmt, ...) {
165
+ char buffer[1024];
166
+
167
+ va_list v, v2;
168
+ va_start(v, fmt);
169
+ va_copy(v2, v);
170
+
171
+ auto res = vsnprintf(buffer, sizeof(buffer), fmt, v);
172
+ va_end(v);
173
+
174
+ if (res <= array_size(buffer)) {
175
+ return str_dup(allocator, buffer, res);
176
+ } else {
177
+ int big_size = res + 1;
178
+
179
+ String result;
180
+ result.length = res;
181
+ result.data = allocate_zstring(allocator, res);
182
+ res = vsnprintf(result.data, static_cast<size_t>(big_size), fmt, v2);
183
+
184
+ va_end(v2);
185
+ assert(res >= 0);
186
+
187
+ return result;
188
+ }
189
+ }
190
+
191
+ String str_print(const char *fmt, ...) {
192
+ char buffer[1024];
193
+
194
+ va_list v, v2;
195
+ va_start(v, fmt);
196
+ va_copy(v2, v);
197
+
198
+ auto res = vsnprintf(buffer, sizeof(buffer), fmt, v);
199
+ va_end(v);
200
+
201
+ if (res <= array_size(buffer)) {
202
+ return str_dup(temp_allocator, buffer, res);
203
+ } else {
204
+ int big_size = res + 1;
205
+
206
+ String result;
207
+ result.length = res;
208
+ result.data = allocate_zstring(temp_allocator, res);
209
+ res = vsnprintf(result.data, static_cast<size_t>(big_size), fmt, v2);
210
+
211
+ va_end(v2);
212
+ assert(res >= 0);
213
+
214
+ return result;
215
+ }
216
+ }