nokogumbo 1.1.12 → 1.1.13
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/nokogumboc/extconf.rb +1 -1
- data/gumbo-parser/src/char_ref.c +22828 -2291
- data/gumbo-parser/src/char_ref.rl +2548 -0
- data/gumbo-parser/src/error.c +21 -0
- data/gumbo-parser/src/parser.c +109 -105
- data/gumbo-parser/src/tokenizer.c +103 -103
- data/gumbo-parser/src/utf8.c +114 -120
- data/gumbo-parser/src/utf8.h +6 -0
- metadata +3 -2
data/gumbo-parser/src/utf8.c
CHANGED
@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
|
|
33
33
|
// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
|
34
34
|
// RFC 3629: http://tools.ietf.org/html/rfc3629
|
35
35
|
// HTML5 Unicode handling:
|
36
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/
|
36
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
|
37
|
+
//
|
38
|
+
// This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
|
39
|
+
// <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
|
40
|
+
// own handling for newlines, tabs, invalid continuation bytes, and other
|
41
|
+
// conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
|
42
|
+
// not handle.
|
43
|
+
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
|
44
|
+
// the license agreement and code follows.
|
45
|
+
|
46
|
+
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
47
|
+
|
48
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
49
|
+
// of this software and associated documentation files (the "Software"), to deal
|
50
|
+
// in the Software without restriction, including without limitation the rights to
|
51
|
+
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
52
|
+
// of the Software, and to permit persons to whom the Software is furnished to do
|
53
|
+
// so, subject to the following conditions:
|
54
|
+
|
55
|
+
// The above copyright notice and this permission notice shall be included in
|
56
|
+
// all copies or substantial portions of the Software.
|
57
|
+
|
58
|
+
#define UTF8_ACCEPT 0
|
59
|
+
#define UTF8_REJECT 12
|
60
|
+
|
61
|
+
static const uint8_t utf8d[] = {
|
62
|
+
// The first part of the table maps bytes to character classes, which serve
|
63
|
+
// to reduce the size of the transition table and create bitmasks.
|
64
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
65
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
66
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
67
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
68
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
69
|
+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
70
|
+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
71
|
+
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
72
|
+
|
73
|
+
// The second part is a transition table that maps a combination
|
74
|
+
// of a state of the automaton and a character class to a state.
|
75
|
+
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
76
|
+
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
77
|
+
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
78
|
+
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
79
|
+
12,36,12,12,12,12,12,12,12,12,12,12,
|
80
|
+
};
|
81
|
+
|
82
|
+
uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
83
|
+
uint32_t type = utf8d[byte];
|
84
|
+
|
85
|
+
*codep = (*state != UTF8_ACCEPT) ?
|
86
|
+
(byte & 0x3fu) | (*codep << 6) :
|
87
|
+
(0xff >> type) & (byte);
|
88
|
+
|
89
|
+
*state = utf8d[256 + *state + type];
|
90
|
+
return *state;
|
91
|
+
}
|
92
|
+
|
93
|
+
// END COPIED CODE.
|
37
94
|
|
38
95
|
// Adds a decoding error to the parser's error list, based on the current state
|
39
96
|
// of the Utf8Iterator.
|
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
|
|
63
120
|
// When this method returns, iter->_width and iter->_current will be set
|
64
121
|
// appropriately, as well as any error flags.
|
65
122
|
static void read_char(Utf8Iterator* iter) {
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
if (c < 0x80) {
|
72
|
-
// Valid one-byte sequence.
|
73
|
-
iter->_width = 1;
|
74
|
-
mask = 0xFF;
|
75
|
-
} else if (c < 0xC0) {
|
76
|
-
// Continuation character not following a multibyte sequence.
|
77
|
-
// The HTML5 spec here says to consume the byte and output a replacement
|
78
|
-
// character.
|
79
|
-
iter->_width = 1;
|
80
|
-
is_bad_char = true;
|
81
|
-
} else if (c < 0xE0) {
|
82
|
-
iter->_width = 2;
|
83
|
-
mask = 0x1F; // 00011111 in binary.
|
84
|
-
if (c < 0xC2) {
|
85
|
-
// Overlong encoding; error according to UTF8/HTML5 spec.
|
86
|
-
is_bad_char = true;
|
87
|
-
}
|
88
|
-
} else if (c < 0xF0) {
|
89
|
-
iter->_width = 3;
|
90
|
-
mask = 0xF; // 00001111 in binary.
|
91
|
-
} else if (c < 0xF5) {
|
92
|
-
iter->_width = 4;
|
93
|
-
mask = 0x7; // 00000111 in binary.
|
94
|
-
} else if (c < 0xF8) {
|
95
|
-
// The following cases are all errors, but we need to handle them separately
|
96
|
-
// so that we consume the proper number of bytes from the input stream
|
97
|
-
// before replacing them with the replacement char. The HTML5 spec
|
98
|
-
// specifies that we should consume the shorter of the length specified by
|
99
|
-
// the first bit or the run leading up to the first non-continuation
|
100
|
-
// character.
|
101
|
-
iter->_width = 5;
|
102
|
-
is_bad_char = true;
|
103
|
-
} else if (c < 0xFC) {
|
104
|
-
iter->_width = 6;
|
105
|
-
is_bad_char = true;
|
106
|
-
} else if (c < 0xFE) {
|
107
|
-
iter->_width = 7;
|
108
|
-
is_bad_char = true;
|
109
|
-
} else {
|
110
|
-
iter->_width = 1;
|
111
|
-
is_bad_char = true;
|
112
|
-
}
|
113
|
-
|
114
|
-
// Check to make sure we have enough bytes left in the iter to read all that
|
115
|
-
// we want. If not, we set the iter_truncated flag, mark this as a bad
|
116
|
-
// character, and adjust the current width so that it consumes the rest of the
|
117
|
-
// iter.
|
118
|
-
uint64_t code_point = c & mask;
|
119
|
-
if (iter->_start + iter->_width > iter->_end) {
|
120
|
-
iter->_width = iter->_end - iter->_start;
|
121
|
-
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
122
|
-
is_bad_char = true;
|
123
|
-
}
|
124
|
-
|
125
|
-
// Now we decode continuation bytes, shift them appropriately, and build up
|
126
|
-
// the appropriate code point.
|
127
|
-
assert(iter->_width < 8);
|
128
|
-
for (int i = 1; i < iter->_width; ++i) {
|
129
|
-
c = (unsigned char) iter->_start[i];
|
130
|
-
if (c < 0x80 || c > 0xBF) {
|
131
|
-
// Per HTML5 spec, we don't include the invalid continuation char in the
|
132
|
-
// run that we consume here.
|
133
|
-
iter->_width = i;
|
134
|
-
is_bad_char = true;
|
135
|
-
break;
|
136
|
-
}
|
137
|
-
code_point = (code_point << 6) | (c & ~0x80);
|
138
|
-
}
|
139
|
-
if (code_point > 0x10FFFF) is_bad_char = true;
|
140
|
-
|
141
|
-
// If we had a decode error, set the current code point to the replacement
|
142
|
-
// character and flip the flag indicating that a decode error occurred.
|
143
|
-
// Ditto if we have a code point that is explicitly on the list of characters
|
144
|
-
// prohibited by the HTML5 spec, such as control characters.
|
145
|
-
if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
|
146
|
-
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
147
|
-
code_point = kUtf8ReplacementChar;
|
123
|
+
if (iter->_start >= iter->_end) {
|
124
|
+
// No input left to consume; emit an EOF and set width = 0.
|
125
|
+
iter->_current = -1;
|
126
|
+
iter->_width = 0;
|
127
|
+
return;
|
148
128
|
}
|
149
129
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
//
|
159
|
-
|
160
|
-
//
|
161
|
-
|
162
|
-
|
130
|
+
uint32_t code_point = 0;
|
131
|
+
uint32_t state = UTF8_ACCEPT;
|
132
|
+
for (const char* c = iter->_start; c < iter->_end; ++c) {
|
133
|
+
decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
|
134
|
+
if (state == UTF8_ACCEPT) {
|
135
|
+
iter->_width = c - iter->_start + 1;
|
136
|
+
// This is the special handling for carriage returns that is mandated by the
|
137
|
+
// HTML5 spec. Since we're looking for particular 7-bit literal characters,
|
138
|
+
// we operate in terms of chars and only need a check for iter overrun,
|
139
|
+
// instead of having to read in a full next code point.
|
140
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
|
141
|
+
if (code_point == '\r') {
|
142
|
+
assert(iter->_width == 1);
|
143
|
+
const char* next = c + 1;
|
144
|
+
if (next < iter->_end && *next == '\n') {
|
145
|
+
// Advance the iter, as if the carriage return didn't exist.
|
146
|
+
++iter->_start;
|
147
|
+
// Preserve the true offset, since other tools that look at it may be
|
148
|
+
// unaware of HTML5's rules for converting \r into \n.
|
149
|
+
++iter->_pos.offset;
|
150
|
+
}
|
151
|
+
code_point = '\n';
|
152
|
+
}
|
153
|
+
if (utf8_is_invalid_code_point(code_point)) {
|
154
|
+
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
155
|
+
code_point = kUtf8ReplacementChar;
|
156
|
+
}
|
157
|
+
iter->_current = code_point;
|
158
|
+
return;
|
159
|
+
} else if (state == UTF8_REJECT) {
|
160
|
+
// We don't want to consume the invalid continuation byte of a multi-byte
|
161
|
+
// run, but we do want to skip past an invalid first byte.
|
162
|
+
iter->_width = c - iter->_start + (c == iter->_start);
|
163
|
+
iter->_current = kUtf8ReplacementChar;
|
164
|
+
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
165
|
+
return;
|
163
166
|
}
|
164
|
-
code_point = '\n';
|
165
167
|
}
|
166
|
-
|
167
|
-
//
|
168
|
-
//
|
169
|
-
|
168
|
+
// If we got here without exiting early, then we've reached the end of the iterator.
|
169
|
+
// Add an error for truncated input, set the width to consume the rest of the
|
170
|
+
// iterator, and emit a replacement character. The next time we enter this method,
|
171
|
+
// it will detect that there's no input to consume and emit an EOF.
|
172
|
+
iter->_current = kUtf8ReplacementChar;
|
173
|
+
iter->_width = iter->_end - iter->_start;
|
174
|
+
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
170
175
|
}
|
171
176
|
|
172
177
|
static void update_position(Utf8Iterator* iter) {
|
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
|
|
177
182
|
} else if(iter->_current == '\t') {
|
178
183
|
int tab_stop = iter->_parser->_options->tab_stop;
|
179
184
|
iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
|
180
|
-
} else {
|
185
|
+
} else if(iter->_current != -1) {
|
181
186
|
++iter->_pos.column;
|
182
187
|
}
|
183
188
|
}
|
@@ -195,34 +200,19 @@ void utf8iterator_init(
|
|
195
200
|
Utf8Iterator* iter) {
|
196
201
|
iter->_start = source;
|
197
202
|
iter->_end = source + source_length;
|
198
|
-
iter->_width = 0;
|
199
203
|
iter->_pos.line = 1;
|
200
204
|
iter->_pos.column = 1;
|
201
205
|
iter->_pos.offset = 0;
|
202
206
|
iter->_parser = parser;
|
203
|
-
|
204
|
-
read_char(iter);
|
205
|
-
} else {
|
206
|
-
iter->_current = -1;
|
207
|
-
}
|
207
|
+
read_char(iter);
|
208
208
|
}
|
209
209
|
|
210
210
|
void utf8iterator_next(Utf8Iterator* iter) {
|
211
|
-
if (iter->_current == -1) {
|
212
|
-
// If we're already at EOF, bail out before advancing anything to avoid
|
213
|
-
// reading past the end of the buffer. It's easier to catch this case here
|
214
|
-
// than litter the code with lots of individual checks for EOF.
|
215
|
-
return;
|
216
|
-
}
|
217
|
-
iter->_start += iter->_width;
|
218
211
|
// We update positions based on the *last* character read, so that the first
|
219
212
|
// character following a newline is at column 1 in the next line.
|
220
213
|
update_position(iter);
|
221
|
-
|
222
|
-
|
223
|
-
} else { // EOF
|
224
|
-
iter->_current = -1;
|
225
|
-
}
|
214
|
+
iter->_start += iter->_width;
|
215
|
+
read_char(iter);
|
226
216
|
}
|
227
217
|
|
228
218
|
int utf8iterator_current(const Utf8Iterator* iter) {
|
@@ -238,6 +228,10 @@ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
|
|
238
228
|
return iter->_start;
|
239
229
|
}
|
240
230
|
|
231
|
+
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
232
|
+
return iter->_end;
|
233
|
+
}
|
234
|
+
|
241
235
|
bool utf8iterator_maybe_consume_match(
|
242
236
|
Utf8Iterator* iter, const char* prefix, size_t length,
|
243
237
|
bool case_sensitive) {
|
data/gumbo-parser/src/utf8.h
CHANGED
@@ -98,6 +98,12 @@ void utf8iterator_get_position(
|
|
98
98
|
// Retrieves a character pointer to the start of the current character.
|
99
99
|
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
|
100
100
|
|
101
|
+
// Retrieves a character pointer to 1 past the end of the buffer. This is
|
102
|
+
// necessary for certain state machines and string comparisons that would like
|
103
|
+
// to look directly for ASCII text in the buffer without going through the
|
104
|
+
// decoder.
|
105
|
+
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
|
106
|
+
|
101
107
|
// If the upcoming text in the buffer matches the specified prefix (which has
|
102
108
|
// length 'length'), consume it and return true. Otherwise, return false with
|
103
109
|
// no other effects. If the length of the string would overflow the buffer,
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.12
|
4
|
+
version: 1.1.13
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-09
|
12
|
+
date: 2014-10-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -44,6 +44,7 @@ files:
|
|
44
44
|
- gumbo-parser/src/attribute.h
|
45
45
|
- gumbo-parser/src/char_ref.c
|
46
46
|
- gumbo-parser/src/char_ref.h
|
47
|
+
- gumbo-parser/src/char_ref.rl
|
47
48
|
- gumbo-parser/src/error.c
|
48
49
|
- gumbo-parser/src/error.h
|
49
50
|
- gumbo-parser/src/gumbo.h
|