nokogumbo 1.1.12 → 1.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/nokogumboc/extconf.rb +1 -1
- data/gumbo-parser/src/char_ref.c +22828 -2291
- data/gumbo-parser/src/char_ref.rl +2548 -0
- data/gumbo-parser/src/error.c +21 -0
- data/gumbo-parser/src/parser.c +109 -105
- data/gumbo-parser/src/tokenizer.c +103 -103
- data/gumbo-parser/src/utf8.c +114 -120
- data/gumbo-parser/src/utf8.h +6 -0
- metadata +3 -2
data/gumbo-parser/src/utf8.c
CHANGED
@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
|
|
33
33
|
// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
|
34
34
|
// RFC 3629: http://tools.ietf.org/html/rfc3629
|
35
35
|
// HTML5 Unicode handling:
|
36
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/
|
36
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
|
37
|
+
//
|
38
|
+
// This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
|
39
|
+
// <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
|
40
|
+
// own handling for newlines, tabs, invalid continuation bytes, and other
|
41
|
+
// conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
|
42
|
+
// not handle.
|
43
|
+
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
|
44
|
+
// the license agreement and code follows.
|
45
|
+
|
46
|
+
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
47
|
+
|
48
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
49
|
+
// of this software and associated documentation files (the "Software"), to deal
|
50
|
+
// in the Software without restriction, including without limitation the rights to
|
51
|
+
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
52
|
+
// of the Software, and to permit persons to whom the Software is furnished to do
|
53
|
+
// so, subject to the following conditions:
|
54
|
+
|
55
|
+
// The above copyright notice and this permission notice shall be included in
|
56
|
+
// all copies or substantial portions of the Software.
|
57
|
+
|
58
|
+
#define UTF8_ACCEPT 0
|
59
|
+
#define UTF8_REJECT 12
|
60
|
+
|
61
|
+
static const uint8_t utf8d[] = {
|
62
|
+
// The first part of the table maps bytes to character classes that
|
63
|
+
// reduce the size of the transition table and create bitmasks.
|
64
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
65
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
66
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
67
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
68
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
69
|
+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
70
|
+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
71
|
+
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
72
|
+
|
73
|
+
// The second part is a transition table that maps a combination
|
74
|
+
// of a state of the automaton and a character class to a state.
|
75
|
+
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
76
|
+
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
77
|
+
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
78
|
+
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
79
|
+
12,36,12,12,12,12,12,12,12,12,12,12,
|
80
|
+
};
|
81
|
+
|
82
|
+
uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
83
|
+
uint32_t type = utf8d[byte];
|
84
|
+
|
85
|
+
*codep = (*state != UTF8_ACCEPT) ?
|
86
|
+
(byte & 0x3fu) | (*codep << 6) :
|
87
|
+
(0xff >> type) & (byte);
|
88
|
+
|
89
|
+
*state = utf8d[256 + *state + type];
|
90
|
+
return *state;
|
91
|
+
}
|
92
|
+
|
93
|
+
// END COPIED CODE.
|
37
94
|
|
38
95
|
// Adds a decoding error to the parser's error list, based on the current state
|
39
96
|
// of the Utf8Iterator.
|
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
|
|
63
120
|
// When this method returns, iter->_width and iter->_current will be set
|
64
121
|
// appropriately, as well as any error flags.
|
65
122
|
static void read_char(Utf8Iterator* iter) {
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
if (c < 0x80) {
|
72
|
-
// Valid one-byte sequence.
|
73
|
-
iter->_width = 1;
|
74
|
-
mask = 0xFF;
|
75
|
-
} else if (c < 0xC0) {
|
76
|
-
// Continuation character not following a multibyte sequence.
|
77
|
-
// The HTML5 spec here says to consume the byte and output a replacement
|
78
|
-
// character.
|
79
|
-
iter->_width = 1;
|
80
|
-
is_bad_char = true;
|
81
|
-
} else if (c < 0xE0) {
|
82
|
-
iter->_width = 2;
|
83
|
-
mask = 0x1F; // 00011111 in binary.
|
84
|
-
if (c < 0xC2) {
|
85
|
-
// Overlong encoding; error according to UTF8/HTML5 spec.
|
86
|
-
is_bad_char = true;
|
87
|
-
}
|
88
|
-
} else if (c < 0xF0) {
|
89
|
-
iter->_width = 3;
|
90
|
-
mask = 0xF; // 00001111 in binary.
|
91
|
-
} else if (c < 0xF5) {
|
92
|
-
iter->_width = 4;
|
93
|
-
mask = 0x7; // 00000111 in binary.
|
94
|
-
} else if (c < 0xF8) {
|
95
|
-
// The following cases are all errors, but we need to handle them separately
|
96
|
-
// so that we consume the proper number of bytes from the input stream
|
97
|
-
// before replacing them with the replacement char. The HTML5 spec
|
98
|
-
// specifies that we should consume the shorter of the length specified by
|
99
|
-
// the first bit or the run leading up to the first non-continuation
|
100
|
-
// character.
|
101
|
-
iter->_width = 5;
|
102
|
-
is_bad_char = true;
|
103
|
-
} else if (c < 0xFC) {
|
104
|
-
iter->_width = 6;
|
105
|
-
is_bad_char = true;
|
106
|
-
} else if (c < 0xFE) {
|
107
|
-
iter->_width = 7;
|
108
|
-
is_bad_char = true;
|
109
|
-
} else {
|
110
|
-
iter->_width = 1;
|
111
|
-
is_bad_char = true;
|
112
|
-
}
|
113
|
-
|
114
|
-
// Check to make sure we have enough bytes left in the iter to read all that
|
115
|
-
// we want. If not, we set the iter_truncated flag, mark this as a bad
|
116
|
-
// character, and adjust the current width so that it consumes the rest of the
|
117
|
-
// iter.
|
118
|
-
uint64_t code_point = c & mask;
|
119
|
-
if (iter->_start + iter->_width > iter->_end) {
|
120
|
-
iter->_width = iter->_end - iter->_start;
|
121
|
-
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
122
|
-
is_bad_char = true;
|
123
|
-
}
|
124
|
-
|
125
|
-
// Now we decode continuation bytes, shift them appropriately, and build up
|
126
|
-
// the appropriate code point.
|
127
|
-
assert(iter->_width < 8);
|
128
|
-
for (int i = 1; i < iter->_width; ++i) {
|
129
|
-
c = (unsigned char) iter->_start[i];
|
130
|
-
if (c < 0x80 || c > 0xBF) {
|
131
|
-
// Per HTML5 spec, we don't include the invalid continuation char in the
|
132
|
-
// run that we consume here.
|
133
|
-
iter->_width = i;
|
134
|
-
is_bad_char = true;
|
135
|
-
break;
|
136
|
-
}
|
137
|
-
code_point = (code_point << 6) | (c & ~0x80);
|
138
|
-
}
|
139
|
-
if (code_point > 0x10FFFF) is_bad_char = true;
|
140
|
-
|
141
|
-
// If we had a decode error, set the current code point to the replacement
|
142
|
-
// character and flip the flag indicating that a decode error occurred.
|
143
|
-
// Ditto if we have a code point that is explicitly on the list of characters
|
144
|
-
// prohibited by the HTML5 spec, such as control characters.
|
145
|
-
if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
|
146
|
-
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
147
|
-
code_point = kUtf8ReplacementChar;
|
123
|
+
if (iter->_start >= iter->_end) {
|
124
|
+
// No input left to consume; emit an EOF and set width = 0.
|
125
|
+
iter->_current = -1;
|
126
|
+
iter->_width = 0;
|
127
|
+
return;
|
148
128
|
}
|
149
129
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
//
|
159
|
-
|
160
|
-
//
|
161
|
-
|
162
|
-
|
130
|
+
uint32_t code_point = 0;
|
131
|
+
uint32_t state = UTF8_ACCEPT;
|
132
|
+
for (const char* c = iter->_start; c < iter->_end; ++c) {
|
133
|
+
decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
|
134
|
+
if (state == UTF8_ACCEPT) {
|
135
|
+
iter->_width = c - iter->_start + 1;
|
136
|
+
// This is the special handling for carriage returns that is mandated by the
|
137
|
+
// HTML5 spec. Since we're looking for particular 7-bit literal characters,
|
138
|
+
// we operate in terms of chars and only need a check for iter overrun,
|
139
|
+
// instead of having to read in a full next code point.
|
140
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
|
141
|
+
if (code_point == '\r') {
|
142
|
+
assert(iter->_width == 1);
|
143
|
+
const char* next = c + 1;
|
144
|
+
if (next < iter->_end && *next == '\n') {
|
145
|
+
// Advance the iter, as if the carriage return didn't exist.
|
146
|
+
++iter->_start;
|
147
|
+
// Preserve the true offset, since other tools that look at it may be
|
148
|
+
// unaware of HTML5's rules for converting \r into \n.
|
149
|
+
++iter->_pos.offset;
|
150
|
+
}
|
151
|
+
code_point = '\n';
|
152
|
+
}
|
153
|
+
if (utf8_is_invalid_code_point(code_point)) {
|
154
|
+
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
155
|
+
code_point = kUtf8ReplacementChar;
|
156
|
+
}
|
157
|
+
iter->_current = code_point;
|
158
|
+
return;
|
159
|
+
} else if (state == UTF8_REJECT) {
|
160
|
+
// We don't want to consume the invalid continuation byte of a multi-byte
|
161
|
+
// run, but we do want to skip past an invalid first byte.
|
162
|
+
iter->_width = c - iter->_start + (c == iter->_start);
|
163
|
+
iter->_current = kUtf8ReplacementChar;
|
164
|
+
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
165
|
+
return;
|
163
166
|
}
|
164
|
-
code_point = '\n';
|
165
167
|
}
|
166
|
-
|
167
|
-
//
|
168
|
-
//
|
169
|
-
|
168
|
+
// If we got here without exiting early, then we've reached the end of the iterator.
|
169
|
+
// Add an error for truncated input, set the width to consume the rest of the
|
170
|
+
// iterator, and emit a replacement character. The next time we enter this method,
|
171
|
+
// it will detect that there's no input to consume and emit an EOF.
|
172
|
+
iter->_current = kUtf8ReplacementChar;
|
173
|
+
iter->_width = iter->_end - iter->_start;
|
174
|
+
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
170
175
|
}
|
171
176
|
|
172
177
|
static void update_position(Utf8Iterator* iter) {
|
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
|
|
177
182
|
} else if(iter->_current == '\t') {
|
178
183
|
int tab_stop = iter->_parser->_options->tab_stop;
|
179
184
|
iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
|
180
|
-
} else {
|
185
|
+
} else if(iter->_current != -1) {
|
181
186
|
++iter->_pos.column;
|
182
187
|
}
|
183
188
|
}
|
@@ -195,34 +200,19 @@ void utf8iterator_init(
|
|
195
200
|
Utf8Iterator* iter) {
|
196
201
|
iter->_start = source;
|
197
202
|
iter->_end = source + source_length;
|
198
|
-
iter->_width = 0;
|
199
203
|
iter->_pos.line = 1;
|
200
204
|
iter->_pos.column = 1;
|
201
205
|
iter->_pos.offset = 0;
|
202
206
|
iter->_parser = parser;
|
203
|
-
|
204
|
-
read_char(iter);
|
205
|
-
} else {
|
206
|
-
iter->_current = -1;
|
207
|
-
}
|
207
|
+
read_char(iter);
|
208
208
|
}
|
209
209
|
|
210
210
|
void utf8iterator_next(Utf8Iterator* iter) {
|
211
|
-
if (iter->_current == -1) {
|
212
|
-
// If we're already at EOF, bail out before advancing anything to avoid
|
213
|
-
// reading past the end of the buffer. It's easier to catch this case here
|
214
|
-
// than litter the code with lots of individual checks for EOF.
|
215
|
-
return;
|
216
|
-
}
|
217
|
-
iter->_start += iter->_width;
|
218
211
|
// We update positions based on the *last* character read, so that the first
|
219
212
|
// character following a newline is at column 1 in the next line.
|
220
213
|
update_position(iter);
|
221
|
-
|
222
|
-
|
223
|
-
} else { // EOF
|
224
|
-
iter->_current = -1;
|
225
|
-
}
|
214
|
+
iter->_start += iter->_width;
|
215
|
+
read_char(iter);
|
226
216
|
}
|
227
217
|
|
228
218
|
int utf8iterator_current(const Utf8Iterator* iter) {
|
@@ -238,6 +228,10 @@ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
|
|
238
228
|
return iter->_start;
|
239
229
|
}
|
240
230
|
|
231
|
+
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
232
|
+
return iter->_end;
|
233
|
+
}
|
234
|
+
|
241
235
|
bool utf8iterator_maybe_consume_match(
|
242
236
|
Utf8Iterator* iter, const char* prefix, size_t length,
|
243
237
|
bool case_sensitive) {
|
data/gumbo-parser/src/utf8.h
CHANGED
@@ -98,6 +98,12 @@ void utf8iterator_get_position(
|
|
98
98
|
// Retrieves a character pointer to the start of the current character.
|
99
99
|
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
|
100
100
|
|
101
|
+
// Retrieves a character pointer to 1 past the end of the buffer. This is
|
102
|
+
// necessary for certain state machines and string comparisons that would like
|
103
|
+
// to look directly for ASCII text in the buffer without going through the
|
104
|
+
// decoder.
|
105
|
+
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
|
106
|
+
|
101
107
|
// If the upcoming text in the buffer matches the specified prefix (which has
|
102
108
|
// length 'length'), consume it and return true. Otherwise, return false with
|
103
109
|
// no other effects. If the length of the string would overflow the buffer,
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.12
|
4
|
+
version: 1.1.13
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-09
|
12
|
+
date: 2014-10-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -44,6 +44,7 @@ files:
|
|
44
44
|
- gumbo-parser/src/attribute.h
|
45
45
|
- gumbo-parser/src/char_ref.c
|
46
46
|
- gumbo-parser/src/char_ref.h
|
47
|
+
- gumbo-parser/src/char_ref.rl
|
47
48
|
- gumbo-parser/src/error.c
|
48
49
|
- gumbo-parser/src/error.h
|
49
50
|
- gumbo-parser/src/gumbo.h
|