html_tokenizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.autotest +3 -0
- data/.gitignore +35 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +24 -0
- data/LICENSE +21 -0
- data/Manifest.txt +8 -0
- data/README.md +2 -0
- data/Rakefile +20 -0
- data/bin/html_tokenizer +3 -0
- data/ext/html_tokenizer_ext/extconf.rb +6 -0
- data/ext/html_tokenizer_ext/html_tokenizer.c +12 -0
- data/ext/html_tokenizer_ext/html_tokenizer.h +7 -0
- data/ext/html_tokenizer_ext/parser.c +767 -0
- data/ext/html_tokenizer_ext/parser.h +87 -0
- data/ext/html_tokenizer_ext/tokenizer.c +682 -0
- data/ext/html_tokenizer_ext/tokenizer.h +74 -0
- data/html_tokenizer.gemspec +19 -0
- data/lib/html_tokenizer.rb +12 -0
- data/test/unit/parser_test.rb +575 -0
- data/test/unit/tokenizer_test.rb +337 -0
- metadata +109 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a3d58539284af566692b81cc4633af1137baabea
|
4
|
+
data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
|
7
|
+
data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
|
data/.autotest
ADDED
data/.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
*.bundle
|
2
|
+
tmp/
|
3
|
+
|
4
|
+
# Object files
|
5
|
+
*.o
|
6
|
+
*.ko
|
7
|
+
*.obj
|
8
|
+
*.elf
|
9
|
+
|
10
|
+
# Precompiled Headers
|
11
|
+
*.gch
|
12
|
+
*.pch
|
13
|
+
|
14
|
+
# Libraries
|
15
|
+
*.lib
|
16
|
+
*.a
|
17
|
+
*.la
|
18
|
+
*.lo
|
19
|
+
|
20
|
+
# Shared objects (inc. Windows DLLs)
|
21
|
+
*.dll
|
22
|
+
*.so
|
23
|
+
*.so.*
|
24
|
+
*.dylib
|
25
|
+
|
26
|
+
# Executables
|
27
|
+
*.exe
|
28
|
+
*.out
|
29
|
+
*.app
|
30
|
+
*.i*86
|
31
|
+
*.x86_64
|
32
|
+
*.hex
|
33
|
+
|
34
|
+
# Debug files
|
35
|
+
*.dSYM/
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
html_tokenizer (0.0.1)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
minitest (5.9.0)
|
10
|
+
rake (11.1.2)
|
11
|
+
rake-compiler (0.9.9)
|
12
|
+
rake
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
ruby
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
html_tokenizer!
|
19
|
+
minitest
|
20
|
+
rake
|
21
|
+
rake-compiler
|
22
|
+
|
23
|
+
BUNDLED WITH
|
24
|
+
1.12.3
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 Francois Chagnon
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/Manifest.txt
ADDED
data/README.md
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require "rubygems"
|
4
|
+
require 'rake'
|
5
|
+
require 'rake/testtask'
|
6
|
+
require 'bundler/gem_tasks'
|
7
|
+
require 'rake/extensiontask'
|
8
|
+
|
9
|
+
Rake::ExtensionTask.new("html_tokenizer_ext")
|
10
|
+
|
11
|
+
task :default => :test
|
12
|
+
|
13
|
+
task :test => ['test:unit']
|
14
|
+
|
15
|
+
namespace :test do
|
16
|
+
Rake::TestTask.new(:unit => :compile) do |t|
|
17
|
+
t.libs << 'lib' << 'test'
|
18
|
+
t.test_files = FileList['test/unit/**/*_test.rb']
|
19
|
+
end
|
20
|
+
end
|
data/bin/html_tokenizer
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include "tokenizer.h"
|
3
|
+
#include "parser.h"
|
4
|
+
|
5
|
+
static VALUE mHtmlTokenizer = Qnil;
|
6
|
+
|
7
|
+
void Init_html_tokenizer_ext()
|
8
|
+
{
|
9
|
+
mHtmlTokenizer = rb_define_module("HtmlTokenizer");
|
10
|
+
Init_html_tokenizer_tokenizer(mHtmlTokenizer);
|
11
|
+
Init_html_tokenizer_parser(mHtmlTokenizer);
|
12
|
+
}
|
@@ -0,0 +1,767 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include "html_tokenizer.h"
|
3
|
+
#include "parser.h"
|
4
|
+
|
5
|
+
static VALUE cParser = Qnil;
|
6
|
+
|
7
|
+
/*
 * GC mark hook for the parser wrapper. Nothing is marked here: the function
 * body is intentionally empty.
 */
static void parser_mark(void *ptr)
{
  (void)ptr;
}
|
9
|
+
|
10
|
+
static void parser_free(void *ptr)
|
11
|
+
{
|
12
|
+
struct parser_t *parser = ptr;
|
13
|
+
size_t i;
|
14
|
+
|
15
|
+
if(parser) {
|
16
|
+
if(parser->doc.data) {
|
17
|
+
DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
|
18
|
+
xfree(parser->doc.data);
|
19
|
+
parser->doc.data = NULL;
|
20
|
+
}
|
21
|
+
if(parser->errors_count && parser->errors) {
|
22
|
+
for(i=0; i<parser->errors_count; i++) {
|
23
|
+
if(!parser->errors[i].message)
|
24
|
+
continue;
|
25
|
+
DBG_PRINT("parser=%p xfree(parser->errors.messages[%u]) %p", parser, i, parser->errors[i].message);
|
26
|
+
xfree(parser->errors[i].message);
|
27
|
+
parser->errors[i].message = NULL;
|
28
|
+
}
|
29
|
+
DBG_PRINT("parser=%p xfree(parser->errors.messages) %p", parser, parser->errors);
|
30
|
+
xfree(parser->errors);
|
31
|
+
parser->errors = NULL;
|
32
|
+
parser->errors_count = 0;
|
33
|
+
}
|
34
|
+
DBG_PRINT("parser=%p xfree(parser)", parser);
|
35
|
+
xfree(parser);
|
36
|
+
}
|
37
|
+
}
|
38
|
+
|
39
|
+
static size_t parser_memsize(const void *ptr)
|
40
|
+
{
|
41
|
+
return ptr ? sizeof(struct parser_t) : 0;
|
42
|
+
}
|
43
|
+
|
44
|
+
const rb_data_type_t ht_parser_data_type = {
|
45
|
+
"ht_parser_data_type",
|
46
|
+
{ parser_mark, parser_free, parser_memsize, },
|
47
|
+
#if defined(RUBY_TYPED_FREE_IMMEDIATELY)
|
48
|
+
NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
|
49
|
+
#endif
|
50
|
+
};
|
51
|
+
|
52
|
+
/*
 * Allocation function for the Parser class: creates the Ruby object together
 * with its backing parser_t and returns the wrapping VALUE.
 */
static VALUE parser_allocate(VALUE klass)
{
  struct parser_t *parser = NULL;
  VALUE wrapped = TypedData_Make_Struct(klass, struct parser_t, &ht_parser_data_type, parser);

  DBG_PRINT("parser=%p allocate", parser);
  return wrapped;
}
|
62
|
+
|
63
|
+
static inline void parser_append_ref(struct token_reference_t *dest, struct token_reference_t *src)
|
64
|
+
{
|
65
|
+
if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
|
66
|
+
dest->type = src->type;
|
67
|
+
dest->start = src->start;
|
68
|
+
dest->length = src->length;
|
69
|
+
dest->line_number = src->line_number;
|
70
|
+
dest->column_number = src->column_number;
|
71
|
+
}
|
72
|
+
else {
|
73
|
+
dest->type = src->type;
|
74
|
+
dest->length += src->length;
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
static void parser_add_error(struct parser_t *parser, const char *message)
|
79
|
+
{
|
80
|
+
REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
|
81
|
+
parser->errors[parser->errors_count].message = strdup(message);
|
82
|
+
parser->errors[parser->errors_count].line_number = parser->doc.line_number;
|
83
|
+
parser->errors[parser->errors_count].column_number = parser->doc.column_number;
|
84
|
+
parser->errors_count += 1;
|
85
|
+
return;
|
86
|
+
}
|
87
|
+
|
88
|
+
static int parse_none(struct parser_t *parser, struct token_reference_t *ref)
|
89
|
+
{
|
90
|
+
if(ref->type == TOKEN_TAG_START) {
|
91
|
+
parser->tag.self_closing = 0;
|
92
|
+
parser->context = PARSER_SOLIDUS_OR_TAG_NAME;
|
93
|
+
parser->tag.name.type = TOKEN_NONE;
|
94
|
+
}
|
95
|
+
else if(ref->type == TOKEN_COMMENT_START) {
|
96
|
+
parser->context = PARSER_COMMENT;
|
97
|
+
parser->comment.text.type = TOKEN_NONE;
|
98
|
+
}
|
99
|
+
else if(ref->type == TOKEN_CDATA_START) {
|
100
|
+
parser->context = PARSER_CDATA;
|
101
|
+
parser->cdata.text.type = TOKEN_NONE;
|
102
|
+
}
|
103
|
+
PARSE_DONE;
|
104
|
+
}
|
105
|
+
|
106
|
+
static int parse_rawtext(struct parser_t *parser, struct token_reference_t *ref)
|
107
|
+
{
|
108
|
+
if(ref->type == TOKEN_TEXT) {
|
109
|
+
parser_append_ref(&parser->rawtext.text, ref);
|
110
|
+
}
|
111
|
+
else {
|
112
|
+
parser->context = PARSER_NONE;
|
113
|
+
parse_none(parser, ref);
|
114
|
+
}
|
115
|
+
PARSE_DONE;
|
116
|
+
}
|
117
|
+
|
118
|
+
static int parse_comment(struct parser_t *parser, struct token_reference_t *ref)
|
119
|
+
{
|
120
|
+
if(ref->type == TOKEN_COMMENT_END) {
|
121
|
+
parser->context = PARSER_NONE;
|
122
|
+
}
|
123
|
+
else if(ref->type == TOKEN_TEXT) {
|
124
|
+
parser_append_ref(&parser->comment.text, ref);
|
125
|
+
}
|
126
|
+
PARSE_DONE;
|
127
|
+
}
|
128
|
+
|
129
|
+
static int parse_cdata(struct parser_t *parser, struct token_reference_t *ref)
|
130
|
+
{
|
131
|
+
if(ref->type == TOKEN_CDATA_END) {
|
132
|
+
parser->context = PARSER_NONE;
|
133
|
+
}
|
134
|
+
else if(ref->type == TOKEN_TEXT) {
|
135
|
+
parser_append_ref(&parser->cdata.text, ref);
|
136
|
+
}
|
137
|
+
PARSE_DONE;
|
138
|
+
}
|
139
|
+
|
140
|
+
static int parse_solidus_or_tag_name(struct parser_t *parser, struct token_reference_t *ref)
|
141
|
+
{
|
142
|
+
if(ref->type == TOKEN_SOLIDUS) {
|
143
|
+
// ignore solidus before tag name
|
144
|
+
parser->context = PARSER_TAG_NAME;
|
145
|
+
}
|
146
|
+
else if(ref->type == TOKEN_TAG_NAME) {
|
147
|
+
parser->context = PARSER_TAG_NAME;
|
148
|
+
PARSE_AGAIN;
|
149
|
+
}
|
150
|
+
else {
|
151
|
+
parser_add_error(parser, "expected '/' or tag name");
|
152
|
+
parser->context = PARSER_TAG;
|
153
|
+
PARSE_AGAIN;
|
154
|
+
}
|
155
|
+
PARSE_DONE;
|
156
|
+
}
|
157
|
+
|
158
|
+
static int parse_tag_name(struct parser_t *parser, struct token_reference_t *ref)
|
159
|
+
{
|
160
|
+
if(ref->type == TOKEN_TAG_NAME) {
|
161
|
+
parser_append_ref(&parser->tag.name, ref);
|
162
|
+
}
|
163
|
+
else if(ref->type == TOKEN_WHITESPACE) {
|
164
|
+
parser->context = PARSER_TAG;
|
165
|
+
}
|
166
|
+
else if(ref->type == TOKEN_TAG_END) {
|
167
|
+
parser->context = PARSER_NONE;
|
168
|
+
}
|
169
|
+
else if(ref->type == TOKEN_SOLIDUS) {
|
170
|
+
parser->context = PARSER_TAG;
|
171
|
+
PARSE_AGAIN;
|
172
|
+
}
|
173
|
+
else {
|
174
|
+
// not reachable
|
175
|
+
rb_raise(rb_eArgError, "expected whitespace, '/' or '>' after tag name");
|
176
|
+
}
|
177
|
+
PARSE_DONE;
|
178
|
+
}
|
179
|
+
|
180
|
+
static int parse_tag(struct parser_t *parser, struct token_reference_t *ref)
|
181
|
+
{
|
182
|
+
if(ref->type == TOKEN_TAG_END) {
|
183
|
+
parser->context = PARSER_NONE;
|
184
|
+
}
|
185
|
+
else if(ref->type == TOKEN_WHITESPACE) {
|
186
|
+
// ignore whitespaces
|
187
|
+
}
|
188
|
+
else if(ref->type == TOKEN_SOLIDUS) {
|
189
|
+
parser->context = PARSER_TAG_END;
|
190
|
+
}
|
191
|
+
else if(ref->type == TOKEN_ATTRIBUTE_NAME) {
|
192
|
+
parser->context = PARSER_ATTRIBUTE_NAME;
|
193
|
+
parser->attribute.name.type = TOKEN_NONE;
|
194
|
+
parser->attribute.value.type = TOKEN_NONE;
|
195
|
+
parser->attribute.is_quoted = 0;
|
196
|
+
PARSE_AGAIN;
|
197
|
+
}
|
198
|
+
else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
|
199
|
+
parser->context = PARSER_ATTRIBUTE_QUOTED_VALUE;
|
200
|
+
parser->attribute.name.type = TOKEN_NONE;
|
201
|
+
parser->attribute.value.type = TOKEN_NONE;
|
202
|
+
parser->attribute.is_quoted = 1;
|
203
|
+
}
|
204
|
+
else {
|
205
|
+
// unexpected
|
206
|
+
parser_add_error(parser, "expected whitespace, '>', attribute name or value");
|
207
|
+
}
|
208
|
+
PARSE_DONE;
|
209
|
+
}
|
210
|
+
|
211
|
+
static int parse_tag_end(struct parser_t *parser, struct token_reference_t *ref)
|
212
|
+
{
|
213
|
+
if(ref->type == TOKEN_TAG_END) {
|
214
|
+
parser->tag.self_closing = 1;
|
215
|
+
parser->context = PARSER_NONE;
|
216
|
+
}
|
217
|
+
else {
|
218
|
+
parser_add_error(parser, "expected '>' after '/'");
|
219
|
+
parser->context = PARSER_TAG;
|
220
|
+
PARSE_AGAIN;
|
221
|
+
}
|
222
|
+
PARSE_DONE;
|
223
|
+
}
|
224
|
+
|
225
|
+
static int parse_attribute_name(struct parser_t *parser, struct token_reference_t *ref)
|
226
|
+
{
|
227
|
+
if(ref->type == TOKEN_ATTRIBUTE_NAME) {
|
228
|
+
parser_append_ref(&parser->attribute.name, ref);
|
229
|
+
}
|
230
|
+
else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
|
231
|
+
parser->context = PARSER_TAG;
|
232
|
+
PARSE_AGAIN;
|
233
|
+
}
|
234
|
+
else if(ref->type == TOKEN_WHITESPACE) {
|
235
|
+
parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL;
|
236
|
+
PARSE_AGAIN;
|
237
|
+
}
|
238
|
+
else if(ref->type == TOKEN_EQUAL) {
|
239
|
+
parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE;
|
240
|
+
}
|
241
|
+
else {
|
242
|
+
parser_add_error(parser, "expected whitespace, '>' or '=' after attribute name");
|
243
|
+
parser->context = PARSER_TAG;
|
244
|
+
PARSE_AGAIN;
|
245
|
+
}
|
246
|
+
PARSE_DONE;
|
247
|
+
}
|
248
|
+
|
249
|
+
static int parse_attribute_whitespace_or_equal(struct parser_t *parser, struct token_reference_t *ref)
|
250
|
+
{
|
251
|
+
if(ref->type == TOKEN_WHITESPACE) {
|
252
|
+
// swallow whitespace after attribute name
|
253
|
+
}
|
254
|
+
else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
|
255
|
+
parser->context = PARSER_TAG;
|
256
|
+
PARSE_AGAIN;
|
257
|
+
}
|
258
|
+
else if(ref->type == TOKEN_EQUAL) {
|
259
|
+
parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE;
|
260
|
+
}
|
261
|
+
else if(ref->type == TOKEN_ATTRIBUTE_NAME) {
|
262
|
+
// start new attribute after whitespace
|
263
|
+
parser->context = PARSER_TAG;
|
264
|
+
PARSE_AGAIN;
|
265
|
+
}
|
266
|
+
else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
|
267
|
+
// start quoted value after whitespace
|
268
|
+
parser->context = PARSER_TAG;
|
269
|
+
PARSE_AGAIN;
|
270
|
+
}
|
271
|
+
else {
|
272
|
+
parser_add_error(parser, "expected '/', '>', \", ' or '=' after attribute name");
|
273
|
+
parser->context = PARSER_TAG;
|
274
|
+
PARSE_AGAIN;
|
275
|
+
}
|
276
|
+
|
277
|
+
PARSE_DONE;
|
278
|
+
}
|
279
|
+
|
280
|
+
static int parse_attribute_whitespace_or_value(struct parser_t *parser, struct token_reference_t *ref)
|
281
|
+
{
|
282
|
+
if(ref->type == TOKEN_WHITESPACE) {
|
283
|
+
// swallow whitespace after equal sign
|
284
|
+
}
|
285
|
+
else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
|
286
|
+
parser->context = PARSER_ATTRIBUTE_QUOTED_VALUE;
|
287
|
+
parser->attribute.is_quoted = 1;
|
288
|
+
}
|
289
|
+
else if(ref->type == TOKEN_ATTRIBUTE_UNQUOTED_VALUE) {
|
290
|
+
parser->context = PARSER_ATTRIBUTE_UNQUOTED_VALUE;
|
291
|
+
PARSE_AGAIN;
|
292
|
+
}
|
293
|
+
else {
|
294
|
+
parser_add_error(parser, "expected attribute value after '='");
|
295
|
+
parser->context = PARSER_TAG;
|
296
|
+
PARSE_AGAIN;
|
297
|
+
}
|
298
|
+
|
299
|
+
PARSE_DONE;
|
300
|
+
}
|
301
|
+
|
302
|
+
static int parse_attribute_quoted_value(struct parser_t *parser, struct token_reference_t *ref)
|
303
|
+
{
|
304
|
+
if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE) {
|
305
|
+
parser_append_ref(&parser->attribute.value, ref);
|
306
|
+
}
|
307
|
+
else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_END) {
|
308
|
+
parser->context = PARSER_SPACE_AFTER_ATTRIBUTE;
|
309
|
+
}
|
310
|
+
else {
|
311
|
+
// not reachable
|
312
|
+
rb_raise(rb_eArgError, "expected end-quote after quoted value");
|
313
|
+
}
|
314
|
+
|
315
|
+
PARSE_DONE;
|
316
|
+
}
|
317
|
+
|
318
|
+
static int parse_space_after_attribute(struct parser_t *parser, struct token_reference_t *ref)
|
319
|
+
{
|
320
|
+
if(ref->type == TOKEN_WHITESPACE) {
|
321
|
+
parser->context = PARSER_TAG;
|
322
|
+
}
|
323
|
+
else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
|
324
|
+
parser->context = PARSER_TAG;
|
325
|
+
PARSE_AGAIN;
|
326
|
+
}
|
327
|
+
else {
|
328
|
+
parser_add_error(parser, "expected space after attribute value");
|
329
|
+
parser->context = PARSER_TAG;
|
330
|
+
PARSE_AGAIN;
|
331
|
+
}
|
332
|
+
|
333
|
+
PARSE_DONE;
|
334
|
+
}
|
335
|
+
|
336
|
+
static int parse_attribute_unquoted_value(struct parser_t *parser, struct token_reference_t *ref)
|
337
|
+
{
|
338
|
+
if(ref->type == TOKEN_ATTRIBUTE_UNQUOTED_VALUE) {
|
339
|
+
parser_append_ref(&parser->attribute.value, ref);
|
340
|
+
}
|
341
|
+
else if(ref->type == TOKEN_WHITESPACE) {
|
342
|
+
parser->context = PARSER_TAG;
|
343
|
+
}
|
344
|
+
else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
|
345
|
+
parser->context = PARSER_TAG;
|
346
|
+
PARSE_AGAIN;
|
347
|
+
}
|
348
|
+
else {
|
349
|
+
// not reachable
|
350
|
+
rb_raise(rb_eArgError, "expected space or end-of-tag after unquoted value");
|
351
|
+
}
|
352
|
+
|
353
|
+
PARSE_DONE;
|
354
|
+
}
|
355
|
+
|
356
|
+
static inline int rawtext_context(struct parser_t *parser)
|
357
|
+
{
|
358
|
+
enum tokenizer_context ctx = parser->tk.context[parser->tk.current_context];
|
359
|
+
return (ctx == TOKENIZER_RCDATA || ctx == TOKENIZER_RAWTEXT ||
|
360
|
+
ctx == TOKENIZER_SCRIPT_DATA || ctx == TOKENIZER_PLAINTEXT);
|
361
|
+
}
|
362
|
+
|
363
|
+
static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
|
364
|
+
{
|
365
|
+
long unsigned int i;
|
366
|
+
|
367
|
+
for(i = start;i < (start + length); i++) {
|
368
|
+
if(parser->doc.data[i] == '\n') {
|
369
|
+
parser->doc.column_number = 0;
|
370
|
+
parser->doc.line_number += 1;
|
371
|
+
}
|
372
|
+
else {
|
373
|
+
parser->doc.column_number += 1;
|
374
|
+
}
|
375
|
+
}
|
376
|
+
|
377
|
+
return;
|
378
|
+
}
|
379
|
+
|
380
|
+
static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type type, unsigned long int length, void *data)
|
381
|
+
{
|
382
|
+
struct parser_t *parser = (struct parser_t *)data;
|
383
|
+
struct token_reference_t ref = {
|
384
|
+
.type = type,
|
385
|
+
.start = tk->scan.cursor,
|
386
|
+
.length = length,
|
387
|
+
.line_number = parser->doc.line_number,
|
388
|
+
.column_number = parser->doc.column_number,
|
389
|
+
};
|
390
|
+
int parse_again = 1;
|
391
|
+
|
392
|
+
while(parse_again) {
|
393
|
+
switch(parser->context)
|
394
|
+
{
|
395
|
+
case PARSER_NONE:
|
396
|
+
if(rawtext_context(parser))
|
397
|
+
parse_again = parse_rawtext(parser, &ref);
|
398
|
+
else
|
399
|
+
parse_again = parse_none(parser, &ref);
|
400
|
+
break;
|
401
|
+
case PARSER_SOLIDUS_OR_TAG_NAME:
|
402
|
+
parse_again = parse_solidus_or_tag_name(parser, &ref);
|
403
|
+
break;
|
404
|
+
case PARSER_TAG_NAME:
|
405
|
+
parse_again = parse_tag_name(parser, &ref);
|
406
|
+
break;
|
407
|
+
case PARSER_TAG:
|
408
|
+
parse_again = parse_tag(parser, &ref);
|
409
|
+
break;
|
410
|
+
case PARSER_ATTRIBUTE_NAME:
|
411
|
+
parse_again = parse_attribute_name(parser, &ref);
|
412
|
+
break;
|
413
|
+
case PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL:
|
414
|
+
parse_again = parse_attribute_whitespace_or_equal(parser, &ref);
|
415
|
+
break;
|
416
|
+
case PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE:
|
417
|
+
parse_again = parse_attribute_whitespace_or_value(parser, &ref);
|
418
|
+
break;
|
419
|
+
case PARSER_ATTRIBUTE_QUOTED_VALUE:
|
420
|
+
parse_again = parse_attribute_quoted_value(parser, &ref);
|
421
|
+
break;
|
422
|
+
case PARSER_SPACE_AFTER_ATTRIBUTE:
|
423
|
+
parse_again = parse_space_after_attribute(parser, &ref);
|
424
|
+
break;
|
425
|
+
case PARSER_ATTRIBUTE_UNQUOTED_VALUE:
|
426
|
+
parse_again = parse_attribute_unquoted_value(parser, &ref);
|
427
|
+
break;
|
428
|
+
case PARSER_TAG_END:
|
429
|
+
parse_again = parse_tag_end(parser, &ref);
|
430
|
+
break;
|
431
|
+
case PARSER_CDATA:
|
432
|
+
parse_again = parse_cdata(parser, &ref);
|
433
|
+
break;
|
434
|
+
case PARSER_COMMENT:
|
435
|
+
parse_again = parse_comment(parser, &ref);
|
436
|
+
break;
|
437
|
+
}
|
438
|
+
}
|
439
|
+
|
440
|
+
if(rb_block_given_p()) {
|
441
|
+
rb_yield_values(5, token_type_to_symbol(type),
|
442
|
+
INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
|
443
|
+
INT2NUM(ref.line_number), INT2NUM(ref.column_number));
|
444
|
+
}
|
445
|
+
|
446
|
+
parser_adjust_line_number(parser, ref.start, ref.length);
|
447
|
+
|
448
|
+
return;
|
449
|
+
}
|
450
|
+
|
451
|
+
/*
 * Parser#initialize: resets the parser to an empty document at line 1,
 * column 0, in the PARSER_NONE context, and hooks the tokenizer callback up
 * to this parser.
 *
 * Buffers owned from a previous initialization (document data, error
 * messages, errors array) are released before the struct is zeroed, so
 * calling initialize more than once on the same object does not leak them.
 */
static VALUE parser_initialize_method(VALUE self)
{
  struct parser_t *parser = NULL;
  size_t i;

  Parser_Get_Struct(self, parser);
  DBG_PRINT("parser=%p initialize", parser);

  /* the struct starts zero-filled, so these are NULL on first use */
  if(parser->doc.data)
    xfree(parser->doc.data);
  if(parser->errors) {
    for(i = 0; i < parser->errors_count; i++) {
      if(parser->errors[i].message)
        xfree(parser->errors[i].message);
    }
    xfree(parser->errors);
  }

  memset(parser, 0, sizeof(struct parser_t));

  parser->context = PARSER_NONE;

  tokenizer_init(&parser->tk);
  parser->tk.callback_data = parser;
  parser->tk.f_callback = parser_tokenize_callback;

  parser->doc.length = 0;
  parser->doc.data = NULL;

  parser->doc.line_number = 1;
  parser->doc.column_number = 0;

  parser->errors_count = 0;
  parser->errors = NULL;

  return Qnil;
}
|
477
|
+
|
478
|
+
static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
|
479
|
+
{
|
480
|
+
void *old = parser->doc.data;
|
481
|
+
REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
|
482
|
+
DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
|
483
|
+
parser->doc.data, parser->doc.length + length + 1);
|
484
|
+
strcpy(parser->doc.data+parser->doc.length, string);
|
485
|
+
parser->doc.length += length;
|
486
|
+
return 1;
|
487
|
+
}
|
488
|
+
|
489
|
+
/*
 * Shared implementation of Parser#parse and Parser#append_placeholder.
 *
 * self           - the Parser object.
 * source         - String to append; nil is a no-op returning nil.
 * is_placeholder - non-zero: only advance line/column over the new text;
 *                  zero: tokenize the new text.
 *
 * Returns Qtrue after appending, Qnil for nil input or a failed append.
 */
static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
{
  struct parser_t *parser = NULL;
  char *text = NULL;
  long unsigned int text_length = 0, offset = 0;

  if(NIL_P(source))
    return Qnil;

  Check_Type(source, T_STRING);
  Parser_Get_Struct(self, parser);

  text = StringValueCStr(source);
  text_length = strlen(text);

  /* where the new text will begin inside the document */
  offset = parser->doc.length;

  if(!parser_document_append(parser, text, text_length)) {
    // error
    return Qnil;
  }

  if(!is_placeholder) {
    /* point the scanner at the (possibly moved) buffer, starting at the new text */
    parser->tk.scan.cursor = offset;
    parser->tk.scan.string = parser->doc.data;
    parser->tk.scan.length = parser->doc.length;

    tokenizer_scan_all(&parser->tk);
  }
  else {
    parser_adjust_line_number(parser, offset, text_length);
  }

  return Qtrue;
}
|
524
|
+
|
525
|
+
/* Parser#parse(source): appends `source` to the document and tokenizes it. */
static VALUE parser_parse_method(VALUE self, VALUE source)
{
  const int is_placeholder = 0;
  return parser_append_data(self, source, is_placeholder);
}
|
529
|
+
|
530
|
+
/*
 * Parser#append_placeholder(source): appends `source` to the document
 * without tokenizing it; only line/column counters are advanced.
 */
static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
{
  const int is_placeholder = 1;
  return parser_append_data(self, source, is_placeholder);
}
|
534
|
+
|
535
|
+
/*
 * Parser#document: returns a new String holding the accumulated document,
 * or nil when nothing has been appended yet.
 */
static VALUE parser_document_method(VALUE self)
{
  struct parser_t *parser = NULL;

  Parser_Get_Struct(self, parser);
  return parser->doc.data ? rb_str_new(parser->doc.data, parser->doc.length) : Qnil;
}
|
543
|
+
|
544
|
+
/* Parser#document_length: length of the accumulated document. */
static VALUE parser_document_length_method(VALUE self)
{
  struct parser_t *parser = NULL;
  VALUE length;

  Parser_Get_Struct(self, parser);
  length = ULONG2NUM(parser->doc.length);
  return length;
}
|
550
|
+
|
551
|
+
/*
 * Parser#context: returns the current parser context as a Symbol, or nil if
 * the context value is not one of the known states. PARSER_NONE is reported
 * as :rawtext while the tokenizer is in a raw-text style context and as
 * :none otherwise.
 */
static VALUE parser_context_method(VALUE self)
{
  struct parser_t *parser = NULL;
  const char *name = NULL;

  Parser_Get_Struct(self, parser);

  switch(parser->context) {
  case PARSER_NONE:
    name = rawtext_context(parser) ? "rawtext" : "none";
    break;
  case PARSER_SOLIDUS_OR_TAG_NAME:
    name = "solidus_or_tag_name";
    break;
  case PARSER_TAG_NAME:
    name = "tag_name";
    break;
  case PARSER_TAG:
    name = "tag";
    break;
  case PARSER_ATTRIBUTE_NAME:
    name = "attribute_name";
    break;
  case PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL:
    name = "after_attribute_name";
    break;
  case PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE:
    name = "after_equal";
    break;
  case PARSER_ATTRIBUTE_QUOTED_VALUE:
    name = "quoted_value";
    break;
  case PARSER_SPACE_AFTER_ATTRIBUTE:
    name = "space_after_attribute";
    break;
  case PARSER_ATTRIBUTE_UNQUOTED_VALUE:
    name = "unquoted_value";
    break;
  case PARSER_TAG_END:
    name = "tag_end";
    break;
  case PARSER_COMMENT:
    name = "comment";
    break;
  case PARSER_CDATA:
    name = "cdata";
    break;
  }

  if(name == NULL)
    return Qnil;

  return ID2SYM(rb_intern(name));
}
|
588
|
+
|
589
|
+
static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
|
590
|
+
{
|
591
|
+
if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
|
592
|
+
return Qnil;
|
593
|
+
return rb_str_new(parser->doc.data+ref->start, ref->length);
|
594
|
+
}
|
595
|
+
|
596
|
+
/* Parser#tag_name: text of the current tag name, or nil. */
static VALUE parser_tag_name_method(VALUE self)
{
  struct parser_t *parser = NULL;
  struct token_reference_t *name_ref;

  Parser_Get_Struct(self, parser);
  name_ref = &parser->tag.name;
  return ref_to_str(parser, name_ref);
}
|
602
|
+
|
603
|
+
/* Parser#closing_tag?: true when the tokenizer's is_closing_tag flag is set. */
static VALUE parser_closing_tag_method(VALUE self)
{
  struct parser_t *parser = NULL;

  Parser_Get_Struct(self, parser);
  if(parser->tk.is_closing_tag)
    return Qtrue;
  return Qfalse;
}
|
609
|
+
|
610
|
+
/* Parser#self_closing_tag?: true when the current tag's self_closing flag is set. */
static VALUE parser_self_closing_tag_method(VALUE self)
{
  struct parser_t *parser = NULL;

  Parser_Get_Struct(self, parser);
  if(parser->tag.self_closing)
    return Qtrue;
  return Qfalse;
}
|
616
|
+
|
617
|
+
/* Parser#attribute_name: text of the current attribute name, or nil. */
static VALUE parser_attribute_name_method(VALUE self)
{
  struct parser_t *parser = NULL;
  struct token_reference_t *name_ref;

  Parser_Get_Struct(self, parser);
  name_ref = &parser->attribute.name;
  return ref_to_str(parser, name_ref);
}
|
623
|
+
|
624
|
+
/* Parser#attribute_value: text of the current attribute value, or nil. */
static VALUE parser_attribute_value_method(VALUE self)
{
  struct parser_t *parser = NULL;
  struct token_reference_t *value_ref;

  Parser_Get_Struct(self, parser);
  value_ref = &parser->attribute.value;
  return ref_to_str(parser, value_ref);
}
|
630
|
+
|
631
|
+
/*
 * Parser#quote_character: one-character String taken from the tokenizer's
 * attribute_value_start when the current attribute is quoted, nil otherwise.
 */
static VALUE parser_quote_character_method(VALUE self)
{
  struct parser_t *parser = NULL;

  Parser_Get_Struct(self, parser);
  if(!parser->attribute.is_quoted)
    return Qnil;

  return rb_str_new(&parser->tk.attribute_value_start, 1);
}
|
639
|
+
|
640
|
+
/* Parser#attribute_quoted?: true when the current attribute's is_quoted flag is set. */
static VALUE parser_attribute_is_quoted_method(VALUE self)
{
  struct parser_t *parser = NULL;

  Parser_Get_Struct(self, parser);
  if(parser->attribute.is_quoted)
    return Qtrue;
  return Qfalse;
}
|
646
|
+
|
647
|
+
/* Parser#comment_text: text collected for the current comment, or nil. */
static VALUE parser_comment_text_method(VALUE self)
{
  struct parser_t *parser = NULL;
  struct token_reference_t *text_ref;

  Parser_Get_Struct(self, parser);
  text_ref = &parser->comment.text;
  return ref_to_str(parser, text_ref);
}
|
653
|
+
|
654
|
+
/* Parser#cdata_text: text collected for the current CDATA section, or nil. */
static VALUE parser_cdata_text_method(VALUE self)
{
  struct parser_t *parser = NULL;
  struct token_reference_t *text_ref;

  Parser_Get_Struct(self, parser);
  text_ref = &parser->cdata.text;
  return ref_to_str(parser, text_ref);
}
|
660
|
+
|
661
|
+
/* Parser#rawtext_text: text collected for the current raw-text run, or nil. */
static VALUE parser_rawtext_text_method(VALUE self)
{
  struct parser_t *parser = NULL;
  struct token_reference_t *text_ref;

  Parser_Get_Struct(self, parser);
  text_ref = &parser->rawtext.text;
  return ref_to_str(parser, text_ref);
}
|
667
|
+
|
668
|
+
/*
 * Parser#extract(start, end): returns the document bytes in [start, end) as
 * a String (nil when there is no document).
 *
 * Raises ArgumentError when end < start or when end lies past the end of the
 * document.
 */
static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
{
  struct parser_t *parser = NULL;
  struct token_reference_t span;
  unsigned long int first, last;

  Parser_Get_Struct(self, parser);

  first = NUM2ULONG(start_p);
  last = NUM2ULONG(end_p);

  if(last < first) {
    rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
  }
  if(last > parser->doc.length) {
    rb_raise(rb_eArgError, "'end' argument not in range of document");
  }

  span.type = TOKEN_TEXT; // anything not NONE
  span.start = first;
  span.length = last - first;

  return ref_to_str(parser, &span);
}
|
690
|
+
|
691
|
+
/* Parser#errors_count: number of parse errors recorded so far. */
static VALUE parser_errors_count_method(VALUE self)
{
  struct parser_t *parser = NULL;
  VALUE count;

  Parser_Get_Struct(self, parser);
  count = INT2NUM(parser->errors_count);
  return count;
}
|
697
|
+
|
698
|
+
static VALUE create_parser_error(struct parser_document_error_t *error)
|
699
|
+
{
|
700
|
+
VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
|
701
|
+
VALUE klass = rb_const_get(module, rb_intern("ParserError"));
|
702
|
+
VALUE args[3] = {
|
703
|
+
rb_str_new2(error->message),
|
704
|
+
ULONG2NUM(error->line_number),
|
705
|
+
ULONG2NUM(error->column_number),
|
706
|
+
};
|
707
|
+
return rb_class_new_instance(3, args, klass);
|
708
|
+
}
|
709
|
+
|
710
|
+
/*
 * Parser#errors: returns an Array with one ParserError per recorded error
 * that has a message.
 *
 * The method is registered with arity 0, so the C function takes only
 * `self`; the former unused second parameter made the definition disagree
 * with how it is registered.
 */
static VALUE parser_errors_method(VALUE self)
{
  struct parser_t *parser = NULL;
  VALUE list;
  size_t i;
  Parser_Get_Struct(self, parser);

  list = rb_ary_new();
  for(i=0; i<parser->errors_count; i++) {
    if(parser->errors[i].message) {
      rb_ary_push(list, create_parser_error(&parser->errors[i]));
    }
  }

  return list;
}
|
726
|
+
|
727
|
+
/* Parser#line_number: the parser's current line counter. */
static VALUE parser_line_number_method(VALUE self)
{
  struct parser_t *parser = NULL;
  VALUE line;

  Parser_Get_Struct(self, parser);
  line = ULONG2NUM(parser->doc.line_number);
  return line;
}
|
733
|
+
|
734
|
+
/* Parser#column_number: the parser's current column counter. */
static VALUE parser_column_number_method(VALUE self)
{
  struct parser_t *parser = NULL;
  VALUE column;

  Parser_Get_Struct(self, parser);
  column = ULONG2NUM(parser->doc.column_number);
  return column;
}
|
740
|
+
|
741
|
+
/*
 * Defines HtmlTokenizer::Parser under the given module and registers its
 * allocator and instance methods.
 */
void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
{
  cParser = rb_define_class_under(mHtmlTokenizer, "Parser", rb_cObject);
  rb_define_alloc_func(cParser, parser_allocate);
  rb_define_method(cParser, "initialize", parser_initialize_method, 0);

  /* feeding input */
  rb_define_method(cParser, "parse", parser_parse_method, 1);
  rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);

  /* document and position */
  rb_define_method(cParser, "document", parser_document_method, 0);
  rb_define_method(cParser, "document_length", parser_document_length_method, 0);
  rb_define_method(cParser, "line_number", parser_line_number_method, 0);
  rb_define_method(cParser, "column_number", parser_column_number_method, 0);
  rb_define_method(cParser, "extract", parser_extract_method, 2);

  /* parser state */
  rb_define_method(cParser, "context", parser_context_method, 0);
  rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
  rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
  rb_define_method(cParser, "self_closing_tag?", parser_self_closing_tag_method, 0);
  rb_define_method(cParser, "attribute_name", parser_attribute_name_method, 0);
  rb_define_method(cParser, "attribute_value", parser_attribute_value_method, 0);
  rb_define_method(cParser, "quote_character", parser_quote_character_method, 0);
  rb_define_method(cParser, "attribute_quoted?", parser_attribute_is_quoted_method, 0);
  rb_define_method(cParser, "comment_text", parser_comment_text_method, 0);
  rb_define_method(cParser, "cdata_text", parser_cdata_text_method, 0);
  rb_define_method(cParser, "rawtext_text", parser_rawtext_text_method, 0);

  /* error reporting */
  rb_define_method(cParser, "errors_count", parser_errors_count_method, 0);
  rb_define_method(cParser, "errors", parser_errors_method, 0);
}
|