nokogumbo 1.5.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,59 +1,51 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
16
17
 
17
18
  #include "utf8.h"
18
19
 
19
20
  #include <assert.h>
20
21
  #include <stdint.h>
21
22
  #include <string.h>
22
- #include <strings.h> // For strncasecmp.
23
23
 
24
24
  #include "error.h"
25
25
  #include "gumbo.h"
26
26
  #include "parser.h"
27
- #include "util.h"
27
+ #include "ascii.h"
28
28
  #include "vector.h"
29
29
 
30
- const int kUtf8ReplacementChar = 0xFFFD;
30
+ // References:
31
+ // * https://tools.ietf.org/html/rfc3629
32
+ // * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
31
33
 
32
- // Reference material:
33
- // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
- // RFC 3629: http://tools.ietf.org/html/rfc3629
35
- // HTML5 Unicode handling:
36
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37
- //
38
- // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39
- // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40
- // own handling for newlines, tabs, invalid continuation bytes, and other
41
- // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42
- // not handle.
43
- // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44
- // the license agreement and code follows.
34
+ // The following code is a DFA-based UTF-8 decoder by Bjoern Hoehrmann.
35
+ // We wrap the inner table-based decoder routine in our own handling for
36
+ // newlines, tabs, invalid continuation bytes, and other conditions that
37
+ // the HTML5 spec fully specifies but normal UTF-8 decoders do not handle.
38
+ // See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
45
39
 
46
40
  // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47
-
48
- // Permission is hereby granted, free of charge, to any person obtaining a copy
49
- // of this software and associated documentation files (the "Software"), to deal
50
- // in the Software without restriction, including without limitation the rights
51
- // to
52
- // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
53
- // of the Software, and to permit persons to whom the Software is furnished to
54
- // do
55
- // so, subject to the following conditions:
56
-
41
+ //
42
+ // Permission is hereby granted, free of charge, to any person obtaining a
43
+ // copy of this software and associated documentation files (the "Software"),
44
+ // to deal in the Software without restriction, including without limitation
45
+ // the rights to use, copy, modify, merge, publish, distribute, sublicense,
46
+ // and/or sell copies of the Software, and to permit persons to whom the
47
+ // Software is furnished to do so, subject to the following conditions:
48
+ //
57
49
  // The above copyright notice and this permission notice shall be included in
58
50
  // all copies or substantial portions of the Software.
59
51
 
@@ -61,35 +53,33 @@ const int kUtf8ReplacementChar = 0xFFFD;
61
53
  #define UTF8_REJECT 12
62
54
 
63
55
  static const uint8_t utf8d[] = {
64
- // The first part of the table maps bytes to character classes that
65
- // to reduce the size of the transition table and create bitmasks.
66
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71
- 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9,
72
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
73
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2,
74
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10,
75
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8,
76
- 8, 8, 8, 8, 8, 8,
77
-
78
- // The second part is a transition table that maps a combination
79
- // of a state of the automaton and a character class to a state.
80
- 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12,
81
- 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12,
82
- 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
83
- 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
84
- 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
85
- 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
56
+ // The first part of the table maps bytes to character classes
57
+ // to reduce the size of the transition table and create bitmasks.
58
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
62
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
63
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
64
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
65
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
66
+
67
+ // The second part is a transition table that maps a combination
68
+ // of a state of the automaton and a character class to a state.
69
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
70
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
71
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
72
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
73
+ 12,36,12,12,12,12,12,12,12,12,12,12,
86
74
  };
87
75
 
88
- uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
76
+ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
89
77
  uint32_t type = utf8d[byte];
90
78
 
91
- *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
92
- : (0xff >> type) & (byte);
79
+ *codep =
80
+ (*state != UTF8_ACCEPT)
81
+ ? (byte & 0x3fu) | (*codep << 6)
82
+ : (0xff >> type) & (byte);
93
83
 
94
84
  *state = utf8d[256 + *state + type];
95
85
  return *state;
@@ -108,16 +98,9 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
108
98
  }
109
99
  error->type = type;
110
100
  error->position = iter->_pos;
111
- error->original_text = iter->_start;
112
-
113
- // At the point the error is recorded, the code point hasn't been computed
114
- // yet (and can't be, because it's invalid), so we need to build up the raw
115
- // hex value from the bytes under the cursor.
116
- uint64_t code_point = 0;
117
- for (int i = 0; i < iter->_width; ++i) {
118
- code_point = (code_point << 8) | (unsigned char) iter->_start[i];
119
- }
120
- error->v.codepoint = code_point;
101
+ error->original_text.data = iter->_start;
102
+ error->original_text.length = iter->_width;
103
+ error->v.tokenizer.codepoint = iter->_current;
121
104
  }
122
105
 
123
106
  // Reads the next UTF-8 character in the iter.
@@ -139,10 +122,10 @@ static void read_char(Utf8Iterator* iter) {
139
122
  if (state == UTF8_ACCEPT) {
140
123
  iter->_width = c - iter->_start + 1;
141
124
  // This is the special handling for carriage returns that is mandated by
142
- // the HTML5 spec. Since we're looking for particular 7-bit literal
125
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
143
126
  // characters, we operate in terms of chars and only need a check for iter
144
127
  // overrun, instead of having to read in a full next code point.
145
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
128
+ // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
146
129
  if (code_point == '\r') {
147
130
  assert(iter->_width == 1);
148
131
  const char* next = c + 1;
@@ -155,11 +138,15 @@ static void read_char(Utf8Iterator* iter) {
155
138
  }
156
139
  code_point = '\n';
157
140
  }
158
- if (utf8_is_invalid_code_point(code_point)) {
159
- add_error(iter, GUMBO_ERR_UTF8_INVALID);
160
- code_point = kUtf8ReplacementChar;
161
- }
162
141
  iter->_current = code_point;
142
+ if (utf8_is_surrogate(code_point)) {
143
+ add_error(iter, GUMBO_ERR_SURROGATE_IN_INPUT_STREAM);
144
+ } else if (utf8_is_noncharacter(code_point)) {
145
+ add_error(iter, GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM);
146
+ } else if (utf8_is_control(code_point)
147
+ && !(gumbo_ascii_isspace(code_point) || code_point == 0)) {
148
+ add_error(iter, GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM);
149
+ }
163
150
  return;
164
151
  } else if (state == UTF8_REJECT) {
165
152
  // We don't want to consume the invalid continuation byte of a multi-byte
@@ -171,12 +158,12 @@ static void read_char(Utf8Iterator* iter) {
171
158
  }
172
159
  }
173
160
  // If we got here without exiting early, then we've reached the end of the
174
- // iterator. Add an error for truncated input, set the width to consume the
175
- // rest of the iterator, and emit a replacement character. The next time we
161
+ // iterator. Add an error for truncated input, set the width to consume the
162
+ // rest of the iterator, and emit a replacement character. The next time we
176
163
  // enter this method, it will detect that there's no input to consume and
177
164
  // output an EOF.
178
- iter->_current = kUtf8ReplacementChar;
179
165
  iter->_width = iter->_end - iter->_start;
166
+ iter->_current = kUtf8ReplacementChar;
180
167
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
181
168
  }
182
169
 
@@ -193,16 +180,12 @@ static void update_position(Utf8Iterator* iter) {
193
180
  }
194
181
  }
195
182
 
196
- // Returns true if this Unicode code point is in the list of characters
197
- // forbidden by the HTML5 spec, such as undefined control chars.
198
- bool utf8_is_invalid_code_point(int c) {
199
- return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) ||
200
- (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
201
- ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
202
- }
203
-
204
- void utf8iterator_init(GumboParser* parser, const char* source,
205
- size_t source_length, Utf8Iterator* iter) {
183
+ void utf8iterator_init (
184
+ GumboParser* parser,
185
+ const char* source,
186
+ size_t source_length,
187
+ Utf8Iterator* iter
188
+ ) {
206
189
  iter->_start = source;
207
190
  iter->_end = source + source_length;
208
191
  iter->_pos.line = 1;
@@ -210,6 +193,11 @@ void utf8iterator_init(GumboParser* parser, const char* source,
210
193
  iter->_pos.offset = 0;
211
194
  iter->_parser = parser;
212
195
  read_char(iter);
196
+ if (iter->_current == kUtf8BomChar) {
197
+ iter->_start += iter->_width;
198
+ iter->_pos.offset += iter->_width;
199
+ read_char(iter);
200
+ }
213
201
  }
214
202
 
215
203
  void utf8iterator_next(Utf8Iterator* iter) {
@@ -220,28 +208,22 @@ void utf8iterator_next(Utf8Iterator* iter) {
220
208
  read_char(iter);
221
209
  }
222
210
 
223
- int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
224
-
225
- void utf8iterator_get_position(
226
- const Utf8Iterator* iter, GumboSourcePosition* output) {
227
- *output = iter->_pos;
228
- }
229
-
230
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
231
- return iter->_start;
232
- }
233
-
234
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
235
- return iter->_end;
236
- }
237
-
238
- bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
239
- size_t length, bool case_sensitive) {
240
- bool matched = (iter->_start + length <= iter->_end) &&
241
- (case_sensitive ? !strncmp(iter->_start, prefix, length)
242
- : !strncasecmp(iter->_start, prefix, length));
211
+ bool utf8iterator_maybe_consume_match (
212
+ Utf8Iterator* iter,
213
+ const char* prefix,
214
+ size_t length,
215
+ bool case_sensitive
216
+ ) {
217
+ bool matched =
218
+ (iter->_start + length <= iter->_end)
219
+ && (
220
+ case_sensitive
221
+ ? !strncmp(iter->_start, prefix, length)
222
+ : !gumbo_ascii_strncasecmp(iter->_start, prefix, length)
223
+ )
224
+ ;
243
225
  if (matched) {
244
- for (unsigned int i = 0; i < length; ++i) {
226
+ for (size_t i = 0; i < length; ++i) {
245
227
  utf8iterator_next(iter);
246
228
  }
247
229
  return true;
@@ -261,10 +243,3 @@ void utf8iterator_reset(Utf8Iterator* iter) {
261
243
  iter->_pos = iter->_mark_pos;
262
244
  read_char(iter);
263
245
  }
264
-
265
- // Sets the position and original text fields of an error to the value at the
266
- // mark.
267
- void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
268
- error->position = iter->_mark_pos;
269
- error->original_text = iter->_mark;
270
- }
@@ -1,41 +1,26 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains an implementation of a UTF8 iterator and decoder suitable for
18
- // an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
1
+ #ifndef GUMBO_UTF8_H_
2
+ #define GUMBO_UTF8_H_
3
+
4
+ // This contains an implementation of a UTF-8 iterator and decoder suitable for
5
+ // an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
19
6
  // HTML5 spec specifies that:
20
7
  // 1. Decoding errors are parse errors.
21
- // 2. Certain other codepoints (eg. control characters) are parse errors.
8
+ // 2. Certain other codepoints (e.g. control characters) are parse errors.
22
9
  // 3. Carriage returns and CR/LF groups are converted to line feeds.
23
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
10
+ // https://encoding.spec.whatwg.org/#utf-8-decode
24
11
  //
25
- // Also, we want to keep track of source positions for error handling. As a
12
+ // Also, we want to keep track of source positions for error handling. As a
26
13
  // result, we fold all that functionality into this decoder, and can't use an
27
14
  // off-the-shelf library.
28
15
  //
29
16
  // This header is internal-only, which is why we prefix functions with only
30
17
  // utf8_ or utf8iterator_ instead of gumbo_utf8_.
31
18
 
32
- #ifndef GUMBO_UTF8_H_
33
- #define GUMBO_UTF8_H_
34
-
35
19
  #include <stdbool.h>
36
20
  #include <stddef.h>
37
21
 
38
22
  #include "gumbo.h"
23
+ #include "macros.h"
39
24
 
40
25
  #ifdef __cplusplus
41
26
  extern "C" {
@@ -45,13 +30,15 @@ struct GumboInternalError;
45
30
  struct GumboInternalParser;
46
31
 
47
32
  // Unicode replacement char.
48
- extern const int kUtf8ReplacementChar;
33
+ #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8BomChar 0xFEFF
35
+ #define kUtf8MaxChar 0x10FFFF
49
36
 
50
37
  typedef struct GumboInternalUtf8Iterator {
51
38
  // Points at the start of the code point most recently read into 'current'.
52
39
  const char* _start;
53
40
 
54
- // Points at the mark. The mark is initially set to the beginning of the
41
+ // Points at the mark. The mark is initially set to the beginning of the
55
42
  // input.
56
43
  const char* _mark;
57
44
 
@@ -62,7 +49,7 @@ typedef struct GumboInternalUtf8Iterator {
62
49
  int _current;
63
50
 
64
51
  // The width in bytes of the current code point.
65
- int _width;
52
+ size_t _width;
66
53
 
67
54
  // The SourcePosition for the current location.
68
55
  GumboSourcePosition _pos;
@@ -75,45 +62,94 @@ typedef struct GumboInternalUtf8Iterator {
75
62
  struct GumboInternalParser* _parser;
76
63
  } Utf8Iterator;
77
64
 
78
- // Returns true if this Unicode code point is in the list of characters
79
- // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
80
- bool utf8_is_invalid_code_point(int c);
65
+ // Returns true if this Unicode code point is a surrogate.
66
+ CONST_FN static inline bool utf8_is_surrogate(int c) {
67
+ return c >= 0xD800 && c <= 0xDFFF;
68
+ }
69
+
70
+ // Returns true if this Unicode code point is a noncharacter.
71
+ CONST_FN static inline bool utf8_is_noncharacter(int c) {
72
+ return
73
+ (c >= 0xFDD0 && c <= 0xFDEF)
74
+ || ((c & 0xFFFF) == 0xFFFE)
75
+ || ((c & 0xFFFF) == 0xFFFF);
76
+ }
77
+
78
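+ // Returns true if this Unicode code point is a control. NOTE(review): the `< 0x1F` test below excludes U+001F, although C0 controls span U+0000..U+001F inclusive -- confirm whether `<=` was intended.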
79
+ CONST_FN static inline bool utf8_is_control(int c) {
80
+ return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
81
+ }
81
82
 
82
- // Initializes a new Utf8Iterator from the given byte buffer. The source does
83
+ // Initializes a new Utf8Iterator from the given byte buffer. The source does
83
84
  // not have to be NUL-terminated, but the length must be passed in explicitly.
84
- void utf8iterator_init(struct GumboInternalParser* parser, const char* source,
85
- size_t source_length, Utf8Iterator* iter);
85
+ void utf8iterator_init (
86
+ struct GumboInternalParser* parser,
87
+ const char* source,
88
+ size_t source_length,
89
+ Utf8Iterator* iter
90
+ );
86
91
 
87
92
  // Advances the current position by one code point.
88
93
  void utf8iterator_next(Utf8Iterator* iter);
89
94
 
90
95
  // Returns the current code point as an integer.
91
- int utf8iterator_current(const Utf8Iterator* iter);
96
+ static inline int utf8iterator_current(const Utf8Iterator* iter) {
97
+ return iter->_current;
98
+ }
92
99
 
93
100
  // Retrieves and fills the output parameter with the current source position.
94
- void utf8iterator_get_position(
95
- const Utf8Iterator* iter, GumboSourcePosition* output);
101
+ static inline void utf8iterator_get_position (
102
+ const Utf8Iterator* iter,
103
+ GumboSourcePosition* output
104
+ ) {
105
+ *output = iter->_pos;
106
+ }
107
+
108
+ // Retrieves the marked position.
109
+ static inline GumboSourcePosition utf8iterator_get_mark_position (
110
+ const Utf8Iterator* iter
111
+ ) {
112
+ return iter->_mark_pos;
113
+ }
96
114
 
97
115
  // Retrieves a character pointer to the start of the current character.
98
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
116
+ static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
117
+ return iter->_start;
118
+ }
99
119
 
100
- // Retrieves a character pointer to 1 past the end of the buffer. This is
120
+ // Retrieves the width of the current character.
121
+ static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
122
+ return iter->_width;
123
+ }
124
+
125
+ // Retrieves a character pointer to 1 past the end of the buffer. This is
101
126
  // necessary for certain state machines and string comparisons that would like
102
127
  // to look directly for ASCII text in the buffer without going through the
103
128
  // decoder.
104
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
129
+ static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
130
+ return iter->_end;
131
+ }
132
+
133
+ // Retrieves a character pointer to the marked position.
134
+ static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
135
+ return iter->_mark;
136
+ }
105
137
 
106
138
  // If the upcoming text in the buffer matches the specified prefix (which has
107
- // length 'length'), consume it and return true. Otherwise, return false with
108
- // no other effects. If the length of the string would overflow the buffer,
109
- // this returns false. Note that prefix should not contain null bytes because
110
- // of the use of strncmp/strncasecmp internally. All existing use-cases adhere
139
+ // length 'length'), consume it and return true. Otherwise, return false with
140
+ // no other effects. If the length of the string would overflow the buffer,
141
+ // this returns false. Note that prefix should not contain null bytes because
142
+ // of the use of strncmp/gumbo_ascii_strncasecmp internally. All existing use-cases adhere
111
143
  // to this.
112
- bool utf8iterator_maybe_consume_match(
113
- Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive);
144
+ bool utf8iterator_maybe_consume_match (
145
+ Utf8Iterator* iter,
146
+ const char* prefix,
147
+ size_t length,
148
+ bool case_sensitive
149
+ );
114
150
 
115
151
  // "Marks" a particular location of interest in the input stream, so that it can
116
- // later be reset() to. There's also the ability to record an error at the
152
+ // later be reset() to. The marked position and pointer can also be retrieved, e.g. to report an error at the
117
153
  // point that was marked, as oftentimes that's more useful than the last
118
154
  // character before the error was detected.
119
155
  void utf8iterator_mark(Utf8Iterator* iter);
@@ -121,12 +157,8 @@ void utf8iterator_mark(Utf8Iterator* iter);
121
157
  // Returns the current input stream position to the mark.
122
158
  void utf8iterator_reset(Utf8Iterator* iter);
123
159
 
124
- // Sets the position and original text fields of an error to the value at the
125
- // mark.
126
- void utf8iterator_fill_error_at_mark(
127
- Utf8Iterator* iter, struct GumboInternalError* error);
128
-
129
160
  #ifdef __cplusplus
130
161
  }
131
162
  #endif
132
- #endif // GUMBO_UTF8_H_
163
+
164
+ #endif // GUMBO_UTF8_H_