nokogumbo 1.1.12 → 1.1.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
33
33
  // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
34
  // RFC 3629: http://tools.ietf.org/html/rfc3629
35
35
  // HTML5 Unicode handling:
36
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
36
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37
+ //
38
+ // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39
+ // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40
+ // own handling for newlines, tabs, invalid continuation bytes, and other
41
+ // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42
+ // not handle.
43
+ // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44
+ // the license agreement and code follows.
45
+
46
+ // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47
+
48
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
49
+ // of this software and associated documentation files (the "Software"), to deal
50
+ // in the Software without restriction, including without limitation the rights to
51
+ // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
52
+ // of the Software, and to permit persons to whom the Software is furnished to do
53
+ // so, subject to the following conditions:
54
+
55
+ // The above copyright notice and this permission notice shall be included in
56
+ // all copies or substantial portions of the Software.
57
+
58
+ #define UTF8_ACCEPT 0
59
+ #define UTF8_REJECT 12
60
+
61
+ static const uint8_t utf8d[] = {
62
+ // The first part of the table maps bytes to character classes
63
+ // to reduce the size of the transition table and create bitmasks.
64
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
66
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
69
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
70
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
71
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
72
+
73
+ // The second part is a transition table that maps a combination
74
+ // of a state of the automaton and a character class to a state.
75
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
76
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
77
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
78
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
79
+ 12,36,12,12,12,12,12,12,12,12,12,12,
80
+ };
81
+
82
+ uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
83
+ uint32_t type = utf8d[byte];
84
+
85
+ *codep = (*state != UTF8_ACCEPT) ?
86
+ (byte & 0x3fu) | (*codep << 6) :
87
+ (0xff >> type) & (byte);
88
+
89
+ *state = utf8d[256 + *state + type];
90
+ return *state;
91
+ }
92
+
93
+ // END COPIED CODE.
37
94
 
38
95
  // Adds a decoding error to the parser's error list, based on the current state
39
96
  // of the Utf8Iterator.
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
63
120
  // When this method returns, iter->_width and iter->_current will be set
64
121
  // appropriately, as well as any error flags.
65
122
  static void read_char(Utf8Iterator* iter) {
66
- unsigned char c;
67
- unsigned char mask = '\0';
68
- int is_bad_char = false;
69
-
70
- c = (unsigned char) *iter->_start;
71
- if (c < 0x80) {
72
- // Valid one-byte sequence.
73
- iter->_width = 1;
74
- mask = 0xFF;
75
- } else if (c < 0xC0) {
76
- // Continuation character not following a multibyte sequence.
77
- // The HTML5 spec here says to consume the byte and output a replacement
78
- // character.
79
- iter->_width = 1;
80
- is_bad_char = true;
81
- } else if (c < 0xE0) {
82
- iter->_width = 2;
83
- mask = 0x1F; // 00011111 in binary.
84
- if (c < 0xC2) {
85
- // Overlong encoding; error according to UTF8/HTML5 spec.
86
- is_bad_char = true;
87
- }
88
- } else if (c < 0xF0) {
89
- iter->_width = 3;
90
- mask = 0xF; // 00001111 in binary.
91
- } else if (c < 0xF5) {
92
- iter->_width = 4;
93
- mask = 0x7; // 00000111 in binary.
94
- } else if (c < 0xF8) {
95
- // The following cases are all errors, but we need to handle them separately
96
- // so that we consume the proper number of bytes from the input stream
97
- // before replacing them with the replacement char. The HTML5 spec
98
- // specifies that we should consume the shorter of the length specified by
99
- // the first bit or the run leading up to the first non-continuation
100
- // character.
101
- iter->_width = 5;
102
- is_bad_char = true;
103
- } else if (c < 0xFC) {
104
- iter->_width = 6;
105
- is_bad_char = true;
106
- } else if (c < 0xFE) {
107
- iter->_width = 7;
108
- is_bad_char = true;
109
- } else {
110
- iter->_width = 1;
111
- is_bad_char = true;
112
- }
113
-
114
- // Check to make sure we have enough bytes left in the iter to read all that
115
- // we want. If not, we set the iter_truncated flag, mark this as a bad
116
- // character, and adjust the current width so that it consumes the rest of the
117
- // iter.
118
- uint64_t code_point = c & mask;
119
- if (iter->_start + iter->_width > iter->_end) {
120
- iter->_width = iter->_end - iter->_start;
121
- add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
122
- is_bad_char = true;
123
- }
124
-
125
- // Now we decode continuation bytes, shift them appropriately, and build up
126
- // the appropriate code point.
127
- assert(iter->_width < 8);
128
- for (int i = 1; i < iter->_width; ++i) {
129
- c = (unsigned char) iter->_start[i];
130
- if (c < 0x80 || c > 0xBF) {
131
- // Per HTML5 spec, we don't include the invalid continuation char in the
132
- // run that we consume here.
133
- iter->_width = i;
134
- is_bad_char = true;
135
- break;
136
- }
137
- code_point = (code_point << 6) | (c & ~0x80);
138
- }
139
- if (code_point > 0x10FFFF) is_bad_char = true;
140
-
141
- // If we had a decode error, set the current code point to the replacement
142
- // character and flip the flag indicating that a decode error occurred.
143
- // Ditto if we have a code point that is explicitly on the list of characters
144
- // prohibited by the HTML5 spec, such as control characters.
145
- if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
146
- add_error(iter, GUMBO_ERR_UTF8_INVALID);
147
- code_point = kUtf8ReplacementChar;
123
+ if (iter->_start >= iter->_end) {
124
+ // No input left to consume; emit an EOF and set width = 0.
125
+ iter->_current = -1;
126
+ iter->_width = 0;
127
+ return;
148
128
  }
149
129
 
150
- // This is the special handling for carriage returns that is mandated by the
151
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
152
- // we operate in terms of chars and only need a check for iter overrun,
153
- // instead of having to read in a full next code point.
154
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
155
- if (code_point == '\r') {
156
- const char* next = iter->_start + iter->_width;
157
- if (next < iter->_end && *next == '\n') {
158
- // Advance the iter, as if the carriage return didn't exist.
159
- ++iter->_start;
160
- // Preserve the true offset, since other tools that look at it may be
161
- // unaware of HTML5's rules for converting \r into \n.
162
- ++iter->_pos.offset;
130
+ uint32_t code_point = 0;
131
+ uint32_t state = UTF8_ACCEPT;
132
+ for (const char* c = iter->_start; c < iter->_end; ++c) {
133
+ decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
134
+ if (state == UTF8_ACCEPT) {
135
+ iter->_width = c - iter->_start + 1;
136
+ // This is the special handling for carriage returns that is mandated by the
137
+ // HTML5 spec. Since we're looking for particular 7-bit literal characters,
138
+ // we operate in terms of chars and only need a check for iter overrun,
139
+ // instead of having to read in a full next code point.
140
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
141
+ if (code_point == '\r') {
142
+ assert(iter->_width == 1);
143
+ const char* next = c + 1;
144
+ if (next < iter->_end && *next == '\n') {
145
+ // Advance the iter, as if the carriage return didn't exist.
146
+ ++iter->_start;
147
+ // Preserve the true offset, since other tools that look at it may be
148
+ // unaware of HTML5's rules for converting \r into \n.
149
+ ++iter->_pos.offset;
150
+ }
151
+ code_point = '\n';
152
+ }
153
+ if (utf8_is_invalid_code_point(code_point)) {
154
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
155
+ code_point = kUtf8ReplacementChar;
156
+ }
157
+ iter->_current = code_point;
158
+ return;
159
+ } else if (state == UTF8_REJECT) {
160
+ // We don't want to consume the invalid continuation byte of a multi-byte
161
+ // run, but we do want to skip past an invalid first byte.
162
+ iter->_width = c - iter->_start + (c == iter->_start);
163
+ iter->_current = kUtf8ReplacementChar;
164
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
165
+ return;
163
166
  }
164
- code_point = '\n';
165
167
  }
166
-
167
- // At this point, we know we have a valid character as the code point, so we
168
- // set it, and we're done.
169
- iter->_current = code_point;
168
+ // If we got here without exiting early, then we've reached the end of the iterator.
169
+ // Add an error for truncated input, set the width to consume the rest of the
170
+ // iterator, and emit a replacement character. The next time we enter this method,
171
+ // it will detect that there's no input to consume and emit an EOF.
172
+ iter->_current = kUtf8ReplacementChar;
173
+ iter->_width = iter->_end - iter->_start;
174
+ add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
170
175
  }
171
176
 
172
177
  static void update_position(Utf8Iterator* iter) {
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
177
182
  } else if(iter->_current == '\t') {
178
183
  int tab_stop = iter->_parser->_options->tab_stop;
179
184
  iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
180
- } else {
185
+ } else if(iter->_current != -1) {
181
186
  ++iter->_pos.column;
182
187
  }
183
188
  }
@@ -195,34 +200,19 @@ void utf8iterator_init(
195
200
  Utf8Iterator* iter) {
196
201
  iter->_start = source;
197
202
  iter->_end = source + source_length;
198
- iter->_width = 0;
199
203
  iter->_pos.line = 1;
200
204
  iter->_pos.column = 1;
201
205
  iter->_pos.offset = 0;
202
206
  iter->_parser = parser;
203
- if (source_length) {
204
- read_char(iter);
205
- } else {
206
- iter->_current = -1;
207
- }
207
+ read_char(iter);
208
208
  }
209
209
 
210
210
  void utf8iterator_next(Utf8Iterator* iter) {
211
- if (iter->_current == -1) {
212
- // If we're already at EOF, bail out before advancing anything to avoid
213
- // reading past the end of the buffer. It's easier to catch this case here
214
- // than litter the code with lots of individual checks for EOF.
215
- return;
216
- }
217
- iter->_start += iter->_width;
218
211
  // We update positions based on the *last* character read, so that the first
219
212
  // character following a newline is at column 1 in the next line.
220
213
  update_position(iter);
221
- if (iter->_start < iter->_end) {
222
- read_char(iter);
223
- } else { // EOF
224
- iter->_current = -1;
225
- }
214
+ iter->_start += iter->_width;
215
+ read_char(iter);
226
216
  }
227
217
 
228
218
  int utf8iterator_current(const Utf8Iterator* iter) {
@@ -238,6 +228,10 @@ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
238
228
  return iter->_start;
239
229
  }
240
230
 
231
+ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
232
+ return iter->_end;
233
+ }
234
+
241
235
  bool utf8iterator_maybe_consume_match(
242
236
  Utf8Iterator* iter, const char* prefix, size_t length,
243
237
  bool case_sensitive) {
@@ -98,6 +98,12 @@ void utf8iterator_get_position(
98
98
  // Retrieves a character pointer to the start of the current character.
99
99
  const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
100
100
 
101
+ // Retrieves a character pointer to 1 past the end of the buffer. This is
102
+ // necessary for certain state machines and string comparisons that would like
103
+ // to look directly for ASCII text in the buffer without going through the
104
+ // decoder.
105
+ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
106
+
101
107
  // If the upcoming text in the buffer matches the specified prefix (which has
102
108
  // length 'length'), consume it and return true. Otherwise, return false with
103
109
  // no other effects. If the length of the string would overflow the buffer,
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.12
4
+ version: 1.1.13
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-09-02 00:00:00.000000000 Z
12
+ date: 2014-10-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -44,6 +44,7 @@ files:
44
44
  - gumbo-parser/src/attribute.h
45
45
  - gumbo-parser/src/char_ref.c
46
46
  - gumbo-parser/src/char_ref.h
47
+ - gumbo-parser/src/char_ref.rl
47
48
  - gumbo-parser/src/error.c
48
49
  - gumbo-parser/src/error.h
49
50
  - gumbo-parser/src/gumbo.h