nokogumbo 1.1.12 → 1.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,7 +33,64 @@ const int kUtf8ReplacementChar = 0xFFFD;
33
33
  // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
34
  // RFC 3629: http://tools.ietf.org/html/rfc3629
35
35
  // HTML5 Unicode handling:
36
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
36
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37
+ //
38
+ // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39
+ // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40
+ // own handling for newlines, tabs, invalid continuation bytes, and other
41
+ // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42
+ // not handle.
43
+ // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44
+ // the license agreement and code follows.
45
+
46
+ // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47
+
48
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
49
+ // of this software and associated documentation files (the "Software"), to deal
50
+ // in the Software without restriction, including without limitation the rights to
51
+ // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
52
+ // of the Software, and to permit persons to whom the Software is furnished to do
53
+ // so, subject to the following conditions:
54
+
55
+ // The above copyright notice and this permission notice shall be included in
56
+ // all copies or substantial portions of the Software.
57
+
58
+ #define UTF8_ACCEPT 0
59
+ #define UTF8_REJECT 12
60
+
61
+ static const uint8_t utf8d[] = {
62
+ // The first part of the table maps bytes to character classes, which are used
63
+ // to reduce the size of the transition table and create bitmasks.
64
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
66
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
69
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
70
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
71
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
72
+
73
+ // The second part is a transition table that maps a combination
74
+ // of a state of the automaton and a character class to a state.
75
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
76
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
77
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
78
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
79
+ 12,36,12,12,12,12,12,12,12,12,12,12,
80
+ };
81
+
82
+ uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
83
+ uint32_t type = utf8d[byte];
84
+
85
+ *codep = (*state != UTF8_ACCEPT) ?
86
+ (byte & 0x3fu) | (*codep << 6) :
87
+ (0xff >> type) & (byte);
88
+
89
+ *state = utf8d[256 + *state + type];
90
+ return *state;
91
+ }
92
+
93
+ // END COPIED CODE.
37
94
 
38
95
  // Adds a decoding error to the parser's error list, based on the current state
39
96
  // of the Utf8Iterator.
@@ -63,110 +120,58 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
63
120
  // When this method returns, iter->_width and iter->_current will be set
64
121
  // appropriately, as well as any error flags.
65
122
  static void read_char(Utf8Iterator* iter) {
66
- unsigned char c;
67
- unsigned char mask = '\0';
68
- int is_bad_char = false;
69
-
70
- c = (unsigned char) *iter->_start;
71
- if (c < 0x80) {
72
- // Valid one-byte sequence.
73
- iter->_width = 1;
74
- mask = 0xFF;
75
- } else if (c < 0xC0) {
76
- // Continuation character not following a multibyte sequence.
77
- // The HTML5 spec here says to consume the byte and output a replacement
78
- // character.
79
- iter->_width = 1;
80
- is_bad_char = true;
81
- } else if (c < 0xE0) {
82
- iter->_width = 2;
83
- mask = 0x1F; // 00011111 in binary.
84
- if (c < 0xC2) {
85
- // Overlong encoding; error according to UTF8/HTML5 spec.
86
- is_bad_char = true;
87
- }
88
- } else if (c < 0xF0) {
89
- iter->_width = 3;
90
- mask = 0xF; // 00001111 in binary.
91
- } else if (c < 0xF5) {
92
- iter->_width = 4;
93
- mask = 0x7; // 00000111 in binary.
94
- } else if (c < 0xF8) {
95
- // The following cases are all errors, but we need to handle them separately
96
- // so that we consume the proper number of bytes from the input stream
97
- // before replacing them with the replacement char. The HTML5 spec
98
- // specifies that we should consume the shorter of the length specified by
99
- // the first bit or the run leading up to the first non-continuation
100
- // character.
101
- iter->_width = 5;
102
- is_bad_char = true;
103
- } else if (c < 0xFC) {
104
- iter->_width = 6;
105
- is_bad_char = true;
106
- } else if (c < 0xFE) {
107
- iter->_width = 7;
108
- is_bad_char = true;
109
- } else {
110
- iter->_width = 1;
111
- is_bad_char = true;
112
- }
113
-
114
- // Check to make sure we have enough bytes left in the iter to read all that
115
- // we want. If not, we set the iter_truncated flag, mark this as a bad
116
- // character, and adjust the current width so that it consumes the rest of the
117
- // iter.
118
- uint64_t code_point = c & mask;
119
- if (iter->_start + iter->_width > iter->_end) {
120
- iter->_width = iter->_end - iter->_start;
121
- add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
122
- is_bad_char = true;
123
- }
124
-
125
- // Now we decode continuation bytes, shift them appropriately, and build up
126
- // the appropriate code point.
127
- assert(iter->_width < 8);
128
- for (int i = 1; i < iter->_width; ++i) {
129
- c = (unsigned char) iter->_start[i];
130
- if (c < 0x80 || c > 0xBF) {
131
- // Per HTML5 spec, we don't include the invalid continuation char in the
132
- // run that we consume here.
133
- iter->_width = i;
134
- is_bad_char = true;
135
- break;
136
- }
137
- code_point = (code_point << 6) | (c & ~0x80);
138
- }
139
- if (code_point > 0x10FFFF) is_bad_char = true;
140
-
141
- // If we had a decode error, set the current code point to the replacement
142
- // character and flip the flag indicating that a decode error occurred.
143
- // Ditto if we have a code point that is explicitly on the list of characters
144
- // prohibited by the HTML5 spec, such as control characters.
145
- if (is_bad_char || utf8_is_invalid_code_point(code_point)) {
146
- add_error(iter, GUMBO_ERR_UTF8_INVALID);
147
- code_point = kUtf8ReplacementChar;
123
+ if (iter->_start >= iter->_end) {
124
+ // No input left to consume; emit an EOF and set width = 0.
125
+ iter->_current = -1;
126
+ iter->_width = 0;
127
+ return;
148
128
  }
149
129
 
150
- // This is the special handling for carriage returns that is mandated by the
151
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
152
- // we operate in terms of chars and only need a check for iter overrun,
153
- // instead of having to read in a full next code point.
154
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
155
- if (code_point == '\r') {
156
- const char* next = iter->_start + iter->_width;
157
- if (next < iter->_end && *next == '\n') {
158
- // Advance the iter, as if the carriage return didn't exist.
159
- ++iter->_start;
160
- // Preserve the true offset, since other tools that look at it may be
161
- // unaware of HTML5's rules for converting \r into \n.
162
- ++iter->_pos.offset;
130
+ uint32_t code_point = 0;
131
+ uint32_t state = UTF8_ACCEPT;
132
+ for (const char* c = iter->_start; c < iter->_end; ++c) {
133
+ decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
134
+ if (state == UTF8_ACCEPT) {
135
+ iter->_width = c - iter->_start + 1;
136
+ // This is the special handling for carriage returns that is mandated by the
137
+ // HTML5 spec. Since we're looking for particular 7-bit literal characters,
138
+ // we operate in terms of chars and only need a check for iter overrun,
139
+ // instead of having to read in a full next code point.
140
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
141
+ if (code_point == '\r') {
142
+ assert(iter->_width == 1);
143
+ const char* next = c + 1;
144
+ if (next < iter->_end && *next == '\n') {
145
+ // Advance the iter, as if the carriage return didn't exist.
146
+ ++iter->_start;
147
+ // Preserve the true offset, since other tools that look at it may be
148
+ // unaware of HTML5's rules for converting \r into \n.
149
+ ++iter->_pos.offset;
150
+ }
151
+ code_point = '\n';
152
+ }
153
+ if (utf8_is_invalid_code_point(code_point)) {
154
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
155
+ code_point = kUtf8ReplacementChar;
156
+ }
157
+ iter->_current = code_point;
158
+ return;
159
+ } else if (state == UTF8_REJECT) {
160
+ // We don't want to consume the invalid continuation byte of a multi-byte
161
+ // run, but we do want to skip past an invalid first byte.
162
+ iter->_width = c - iter->_start + (c == iter->_start);
163
+ iter->_current = kUtf8ReplacementChar;
164
+ add_error(iter, GUMBO_ERR_UTF8_INVALID);
165
+ return;
163
166
  }
164
- code_point = '\n';
165
167
  }
166
-
167
- // At this point, we know we have a valid character as the code point, so we
168
- // set it, and we're done.
169
- iter->_current = code_point;
168
+ // If we got here without exiting early, then we've reached the end of the iterator.
169
+ // Add an error for truncated input, set the width to consume the rest of the
170
+ // iterator, and emit a replacement character. The next time we enter this method,
171
+ // it will detect that there's no input to consume and emit an EOF.
172
+ iter->_current = kUtf8ReplacementChar;
173
+ iter->_width = iter->_end - iter->_start;
174
+ add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
170
175
  }
171
176
 
172
177
  static void update_position(Utf8Iterator* iter) {
@@ -177,7 +182,7 @@ static void update_position(Utf8Iterator* iter) {
177
182
  } else if(iter->_current == '\t') {
178
183
  int tab_stop = iter->_parser->_options->tab_stop;
179
184
  iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
180
- } else {
185
+ } else if(iter->_current != -1) {
181
186
  ++iter->_pos.column;
182
187
  }
183
188
  }
@@ -195,34 +200,19 @@ void utf8iterator_init(
195
200
  Utf8Iterator* iter) {
196
201
  iter->_start = source;
197
202
  iter->_end = source + source_length;
198
- iter->_width = 0;
199
203
  iter->_pos.line = 1;
200
204
  iter->_pos.column = 1;
201
205
  iter->_pos.offset = 0;
202
206
  iter->_parser = parser;
203
- if (source_length) {
204
- read_char(iter);
205
- } else {
206
- iter->_current = -1;
207
- }
207
+ read_char(iter);
208
208
  }
209
209
 
210
210
  void utf8iterator_next(Utf8Iterator* iter) {
211
- if (iter->_current == -1) {
212
- // If we're already at EOF, bail out before advancing anything to avoid
213
- // reading past the end of the buffer. It's easier to catch this case here
214
- // than litter the code with lots of individual checks for EOF.
215
- return;
216
- }
217
- iter->_start += iter->_width;
218
211
  // We update positions based on the *last* character read, so that the first
219
212
  // character following a newline is at column 1 in the next line.
220
213
  update_position(iter);
221
- if (iter->_start < iter->_end) {
222
- read_char(iter);
223
- } else { // EOF
224
- iter->_current = -1;
225
- }
214
+ iter->_start += iter->_width;
215
+ read_char(iter);
226
216
  }
227
217
 
228
218
  int utf8iterator_current(const Utf8Iterator* iter) {
@@ -238,6 +228,10 @@ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
238
228
  return iter->_start;
239
229
  }
240
230
 
231
+ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
232
+ return iter->_end;
233
+ }
234
+
241
235
  bool utf8iterator_maybe_consume_match(
242
236
  Utf8Iterator* iter, const char* prefix, size_t length,
243
237
  bool case_sensitive) {
@@ -98,6 +98,12 @@ void utf8iterator_get_position(
98
98
  // Retrieves a character pointer to the start of the current character.
99
99
  const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
100
100
 
101
+ // Retrieves a character pointer to 1 past the end of the buffer. This is
102
+ // necessary for certain state machines and string comparisons that would like
103
+ // to look directly for ASCII text in the buffer without going through the
104
+ // decoder.
105
+ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
106
+
101
107
  // If the upcoming text in the buffer matches the specified prefix (which has
102
108
  // length 'length'), consume it and return true. Otherwise, return false with
103
109
  // no other effects. If the length of the string would overflow the buffer,
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.12
4
+ version: 1.1.13
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-09-02 00:00:00.000000000 Z
12
+ date: 2014-10-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -44,6 +44,7 @@ files:
44
44
  - gumbo-parser/src/attribute.h
45
45
  - gumbo-parser/src/char_ref.c
46
46
  - gumbo-parser/src/char_ref.h
47
+ - gumbo-parser/src/char_ref.rl
47
48
  - gumbo-parser/src/error.c
48
49
  - gumbo-parser/src/error.h
49
50
  - gumbo-parser/src/gumbo.h