nokogumbo 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cf20dd502d8ec6022f2c72193bb0c9a908251088
4
- data.tar.gz: 326f85766d0e4f97683f5df026f08f4dc33806e8
3
+ metadata.gz: 2ea9a9bee29cbf479c7afab79a4df38040ed60e7
4
+ data.tar.gz: f7e87af169388de1f0b32fd457e73182169bc503
5
5
  SHA512:
6
- metadata.gz: 800800652a5260bf54399e8cca1fc6e63f7ef53aea489245c5315b6e955b38aa4dfc6d7272b99898ab78150464640ac14c995aa38b9c77644dab5d73fc0e46a5
7
- data.tar.gz: 18ba647671103cfc2853a88935fe91eb965d1e6fbe1aad981438297a5035ec222b5ae6c5ed3ef127429c8b58edd02a6a5a877ba7e7ec3390d05779f7420f1521
6
+ metadata.gz: 6413991d638e6bacc465442546c7ca7756fb0cd8f577ccb9b65d60da87c009b7b2d212bb3fe928b7bbde97ba66f9180d8c5e3efdd02648e33e5d7546b452476f
7
+ data.tar.gz: 4a3486ed36ab147b828e9f79244c36a2d33b2a8dccc61fa8d6abf75556ebb76313c7d009c2502bc3142e38644759c1e6c6bab5d1c1b21b346bf16702802650e4
@@ -27,7 +27,7 @@ struct GumboInternalParser;
27
27
 
28
28
  GumboAttribute* gumbo_get_attribute(
29
29
  const GumboVector* attributes, const char* name) {
30
- for (int i = 0; i < attributes->length; ++i) {
30
+ for (unsigned int i = 0; i < attributes->length; ++i) {
31
31
  GumboAttribute* attr = attributes->data[i];
32
32
  if (!strcasecmp(attr->name, name)) {
33
33
  return attr;
@@ -30,7 +30,7 @@
30
30
  #include <ctype.h>
31
31
  #include <stddef.h>
32
32
  #include <stdio.h>
33
- #include <string.h> // Only for debug assertions at present.
33
+ #include <string.h> // Only for debug assertions at present.
34
34
 
35
35
  #include "error.h"
36
36
  #include "string_piece.h"
@@ -49,44 +49,18 @@ typedef struct {
49
49
  int to_char;
50
50
  } CharReplacement;
51
51
 
52
- static const CharReplacement kCharReplacements[] = {
53
- { 0x00, 0xfffd },
54
- { 0x0d, 0x000d },
55
- { 0x80, 0x20ac },
56
- { 0x81, 0x0081 },
57
- { 0x82, 0x201A },
58
- { 0x83, 0x0192 },
59
- { 0x84, 0x201E },
60
- { 0x85, 0x2026 },
61
- { 0x86, 0x2020 },
62
- { 0x87, 0x2021 },
63
- { 0x88, 0x02C6 },
64
- { 0x89, 0x2030 },
65
- { 0x8A, 0x0160 },
66
- { 0x8B, 0x2039 },
67
- { 0x8C, 0x0152 },
68
- { 0x8D, 0x008D },
69
- { 0x8E, 0x017D },
70
- { 0x8F, 0x008F },
71
- { 0x90, 0x0090 },
72
- { 0x91, 0x2018 },
73
- { 0x92, 0x2019 },
74
- { 0x93, 0x201C },
75
- { 0x94, 0x201D },
76
- { 0x95, 0x2022 },
77
- { 0x96, 0x2013 },
78
- { 0x97, 0x2014 },
79
- { 0x98, 0x02DC },
80
- { 0x99, 0x2122 },
81
- { 0x9A, 0x0161 },
82
- { 0x9B, 0x203A },
83
- { 0x9C, 0x0153 },
84
- { 0x9D, 0x009D },
85
- { 0x9E, 0x017E },
86
- { 0x9F, 0x0178 },
87
- // Terminator.
88
- { -1, -1 }
89
- };
52
+ static const CharReplacement kCharReplacements[] = {{0x00, 0xfffd},
53
+ {0x0d, 0x000d}, {0x80, 0x20ac}, {0x81, 0x0081}, {0x82, 0x201A},
54
+ {0x83, 0x0192}, {0x84, 0x201E}, {0x85, 0x2026}, {0x86, 0x2020},
55
+ {0x87, 0x2021}, {0x88, 0x02C6}, {0x89, 0x2030}, {0x8A, 0x0160},
56
+ {0x8B, 0x2039}, {0x8C, 0x0152}, {0x8D, 0x008D}, {0x8E, 0x017D},
57
+ {0x8F, 0x008F}, {0x90, 0x0090}, {0x91, 0x2018}, {0x92, 0x2019},
58
+ {0x93, 0x201C}, {0x94, 0x201D}, {0x95, 0x2022}, {0x96, 0x2013},
59
+ {0x97, 0x2014}, {0x98, 0x02DC}, {0x99, 0x2122}, {0x9A, 0x0161},
60
+ {0x9B, 0x203A}, {0x9C, 0x0153}, {0x9D, 0x009D}, {0x9E, 0x017E},
61
+ {0x9F, 0x0178},
62
+ // Terminator.
63
+ {-1, -1}};
90
64
 
91
65
  static int parse_digit(int c, bool allow_hex) {
92
66
  if (c >= '0' && c <= '9') {
@@ -111,9 +85,8 @@ static void add_no_digit_error(
111
85
  error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS;
112
86
  }
113
87
 
114
- static void add_codepoint_error(
115
- struct GumboInternalParser* parser, Utf8Iterator* input,
116
- GumboErrorType type, int codepoint) {
88
+ static void add_codepoint_error(struct GumboInternalParser* parser,
89
+ Utf8Iterator* input, GumboErrorType type, int codepoint) {
117
90
  GumboError* error = gumbo_add_error(parser);
118
91
  if (!error) {
119
92
  return;
@@ -123,9 +96,8 @@ static void add_codepoint_error(
123
96
  error->v.codepoint = codepoint;
124
97
  }
125
98
 
126
- static void add_named_reference_error(
127
- struct GumboInternalParser* parser, Utf8Iterator* input,
128
- GumboErrorType type, GumboStringPiece text) {
99
+ static void add_named_reference_error(struct GumboInternalParser* parser,
100
+ Utf8Iterator* input, GumboErrorType type, GumboStringPiece text) {
129
101
  GumboError* error = gumbo_add_error(parser);
130
102
  if (!error) {
131
103
  return;
@@ -211,8 +183,7 @@ static bool maybe_add_invalid_named_reference(
211
183
  // worry about consuming characters.
212
184
  const char* start = utf8iterator_get_char_pointer(input);
213
185
  int c = utf8iterator_current(input);
214
- while ((c >= 'a' && c <= 'z') ||
215
- (c >= 'A' && c <= 'Z') ||
186
+ while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
216
187
  (c >= '0' && c <= '9')) {
217
188
  utf8iterator_next(input);
218
189
  c = utf8iterator_current(input);
@@ -228,12 +199,11 @@ static bool maybe_add_invalid_named_reference(
228
199
  return true;
229
200
  }
230
201
 
231
-
232
202
  #line 2465 "char_ref.rl"
233
203
 
204
+ // clang-format off
234
205
 
235
-
236
- #line 237 "char_ref.c"
206
+ #line 238 "char_ref.c"
237
207
  static const short _char_ref_actions[] = {
238
208
  0, 1, 0, 1, 1, 1, 2, 1,
239
209
  3, 1, 4, 1, 5, 1, 6, 1,
@@ -13960,17 +13930,15 @@ static const short _char_ref_eof_trans[] = {
13960
13930
  };
13961
13931
 
13962
13932
  static const int char_ref_start = 7623;
13963
- static const int char_ref_first_final = 7623;
13964
- static const int char_ref_error = 0;
13965
13933
 
13966
13934
  static const int char_ref_en_valid_named_ref = 7623;
13967
13935
 
13968
13936
 
13969
- #line 2468 "char_ref.rl"
13937
+ #line 2469 "char_ref.rl"
13938
+ // clang-format on
13970
13939
 
13971
- static bool consume_named_ref(
13972
- struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
13973
- OneOrTwoCodepoints* output) {
13940
+ static bool consume_named_ref(struct GumboInternalParser* parser,
13941
+ Utf8Iterator* input, bool is_in_attribute, OneOrTwoCodepoints* output) {
13974
13942
  assert(output->first == kGumboNoChar);
13975
13943
  const char* p = utf8iterator_get_char_pointer(input);
13976
13944
  const char* pe = utf8iterator_get_end_pointer(input);
@@ -13979,8 +13947,9 @@ static bool consume_named_ref(
13979
13947
  const char *ts, *start;
13980
13948
  int cs, act;
13981
13949
 
13950
+ // clang-format off
13982
13951
 
13983
- #line 13984 "char_ref.c"
13952
+ #line 13985 "char_ref.c"
13984
13953
  {
13985
13954
  cs = char_ref_start;
13986
13955
  ts = 0;
@@ -13988,14 +13957,15 @@ static bool consume_named_ref(
13988
13957
  act = 0;
13989
13958
  }
13990
13959
 
13991
- #line 2481 "char_ref.rl"
13960
+ #line 2484 "char_ref.rl"
13992
13961
  // Avoid unused variable warnings.
13993
13962
  (void) act;
13994
13963
  (void) ts;
13964
+ (void) char_ref_en_valid_named_ref;
13995
13965
 
13996
13966
  start = p;
13997
13967
 
13998
- #line 13999 "char_ref.c"
13968
+ #line 14001 "char_ref.c"
13999
13969
  {
14000
13970
  int _slen;
14001
13971
  int _trans;
@@ -14017,7 +13987,7 @@ _resume:
14017
13987
  #line 1 "NONE"
14018
13988
  {ts = p;}
14019
13989
  break;
14020
- #line 14021 "char_ref.c"
13990
+ #line 14023 "char_ref.c"
14021
13991
  }
14022
13992
  }
14023
13993
 
@@ -23000,7 +22970,7 @@ _eof_trans:
23000
22970
  #line 2273 "char_ref.rl"
23001
22971
  {{p = ((te))-1;}{ output->first = 0xd7; {p++; goto _out; } }}
23002
22972
  break;
23003
- #line 23004 "char_ref.c"
22973
+ #line 23006 "char_ref.c"
23004
22974
  }
23005
22975
  }
23006
22976
 
@@ -23013,7 +22983,7 @@ _again:
23013
22983
  #line 1 "NONE"
23014
22984
  {ts = 0;}
23015
22985
  break;
23016
- #line 23017 "char_ref.c"
22986
+ #line 23019 "char_ref.c"
23017
22987
  }
23018
22988
  }
23019
22989
 
@@ -23033,7 +23003,8 @@ _again:
23033
23003
  _out: {}
23034
23004
  }
23035
23005
 
23036
- #line 2487 "char_ref.rl"
23006
+ #line 2491 "char_ref.rl"
23007
+ // clang-format on
23037
23008
 
23038
23009
  if (cs >= 7623) {
23039
23010
  assert(output->first != kGumboNoChar);
@@ -23067,10 +23038,9 @@ _again:
23067
23038
  }
23068
23039
  }
23069
23040
 
23070
- bool consume_char_ref(
23071
- struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
23072
- int additional_allowed_char, bool is_in_attribute,
23073
- OneOrTwoCodepoints* output) {
23041
+ bool consume_char_ref(struct GumboInternalParser* parser,
23042
+ struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
23043
+ bool is_in_attribute, OneOrTwoCodepoints* output) {
23074
23044
  utf8iterator_mark(input);
23075
23045
  utf8iterator_next(input);
23076
23046
  int c = utf8iterator_current(input);
@@ -49,10 +49,9 @@ typedef struct {
49
49
  // errors to the GumboParser's errors vector, if the spec calls for it. Pass a
50
50
  // space for the "additional allowed char" when the spec says "with no
51
51
  // additional allowed char". Returns false on parse error, true otherwise.
52
- bool consume_char_ref(
53
- struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
54
- int additional_allowed_char, bool is_in_attribute,
55
- OneOrTwoCodepoints* output);
52
+ bool consume_char_ref(struct GumboInternalParser* parser,
53
+ struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
54
+ bool is_in_attribute, OneOrTwoCodepoints* output);
56
55
 
57
56
  #ifdef __cplusplus
58
57
  }
@@ -2464,7 +2464,9 @@ valid_named_ref := |*
2464
2464
  *|;
2465
2465
  }%%
2466
2466
 
2467
- %% write data;
2467
+ // clang-format off
2468
+ %% write data noerror nofinal;
2469
+ // clang-format on
2468
2470
 
2469
2471
  static bool consume_named_ref(
2470
2472
  struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
@@ -2477,13 +2479,16 @@ static bool consume_named_ref(
2477
2479
  const char *ts, *start;
2478
2480
  int cs, act;
2479
2481
 
2482
+ // clang-format off
2480
2483
  %% write init;
2481
2484
  // Avoid unused variable warnings.
2482
2485
  (void) act;
2483
2486
  (void) ts;
2487
+ (void) char_ref_en_valid_named_ref;
2484
2488
 
2485
2489
  start = p;
2486
2490
  %% write exec;
2491
+ // clang-format on
2487
2492
 
2488
2493
  if (cs >= %%{ write first_final; }%%) {
2489
2494
  assert(output->first != kGumboNoChar);
@@ -27,18 +27,17 @@
27
27
  #include "util.h"
28
28
  #include "vector.h"
29
29
 
30
- static const size_t kMessageBufferSize = 256;
31
-
32
30
  // Prints a formatted message to a StringBuffer. This automatically resizes the
33
31
  // StringBuffer as necessary to fit the message. Returns the number of bytes
34
32
  // written.
35
- static int print_message(GumboParser* parser, GumboStringBuffer* output,
36
- const char* format, ...) {
33
+ static int print_message(
34
+ GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
37
35
  va_list args;
38
- va_start(args, format);
39
36
  int remaining_capacity = output->capacity - output->length;
40
- int bytes_written = vsnprintf(output->data + output->length,
41
- remaining_capacity, format, args);
37
+ va_start(args, format);
38
+ int bytes_written = vsnprintf(
39
+ output->data + output->length, remaining_capacity, format, args);
40
+ va_end(args);
42
41
  #ifdef _MSC_VER
43
42
  if (bytes_written == -1) {
44
43
  // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
@@ -47,15 +46,15 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
47
46
  // we retry (letting it fail and returning 0 if it doesn't), since there's
48
47
  // no way to smartly resize the buffer.
49
48
  gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
50
- int result = vsnprintf(output->data + output->length,
51
- remaining_capacity, format, args);
49
+ va_start(args, format);
50
+ int result = vsnprintf(
51
+ output->data + output->length, remaining_capacity, format, args);
52
52
  va_end(args);
53
53
  return result == -1 ? 0 : result;
54
54
  }
55
55
  #else
56
56
  // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57
57
  if (bytes_written == -1) {
58
- va_end(args);
59
58
  return 0;
60
59
  }
61
60
  #endif
@@ -64,19 +63,19 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
64
63
  gumbo_string_buffer_reserve(
65
64
  parser, output->capacity + bytes_written, output);
66
65
  remaining_capacity = output->capacity - output->length;
67
- bytes_written = vsnprintf(output->data + output->length,
68
- remaining_capacity, format, args);
66
+ va_start(args, format);
67
+ bytes_written = vsnprintf(
68
+ output->data + output->length, remaining_capacity, format, args);
69
+ va_end(args);
69
70
  }
70
71
  output->length += bytes_written;
71
- va_end(args);
72
72
  return bytes_written;
73
73
  }
74
74
 
75
- static void print_tag_stack(
76
- GumboParser* parser, const GumboParserError* error,
75
+ static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
77
76
  GumboStringBuffer* output) {
78
77
  print_message(parser, output, " Currently open tags: ");
79
- for (int i = 0; i < error->tag_stack.length; ++i) {
78
+ for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
80
79
  if (i) {
81
80
  print_message(parser, output, ", ");
82
81
  }
@@ -87,12 +86,11 @@ static void print_tag_stack(
87
86
  }
88
87
 
89
88
  static void handle_parser_error(GumboParser* parser,
90
- const GumboParserError* error,
91
- GumboStringBuffer* output) {
89
+ const GumboParserError* error, GumboStringBuffer* output) {
92
90
  if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
93
91
  error->input_type != GUMBO_TOKEN_DOCTYPE) {
94
- print_message(parser, output,
95
- "The doctype must be the first token in the document");
92
+ print_message(
93
+ parser, output, "The doctype must be the first token in the document");
96
94
  return;
97
95
  }
98
96
 
@@ -151,13 +149,14 @@ static const char* find_last_newline(
151
149
  static const char* find_next_newline(
152
150
  const char* original_text, const char* error_location) {
153
151
  const char* c = error_location;
154
- for (; *c && *c != '\n'; ++c);
152
+ for (; *c && *c != '\n'; ++c)
153
+ ;
155
154
  return c;
156
155
  }
157
156
 
158
157
  GumboError* gumbo_add_error(GumboParser* parser) {
159
158
  int max_errors = parser->_options->max_errors;
160
- if (max_errors >= 0 && parser->_output->errors.length >= max_errors) {
159
+ if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
161
160
  return NULL;
162
161
  }
163
162
  GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
@@ -167,50 +166,52 @@ GumboError* gumbo_add_error(GumboParser* parser) {
167
166
 
168
167
  void gumbo_error_to_string(
169
168
  GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
170
- print_message(parser, output, "@%d:%d: ",
171
- error->position.line, error->position.column);
169
+ print_message(
170
+ parser, output, "@%d:%d: ", error->position.line, error->position.column);
172
171
  switch (error->type) {
173
172
  case GUMBO_ERR_UTF8_INVALID:
174
- print_message(parser, output, "Invalid UTF8 character 0x%x",
175
- error->v.codepoint);
173
+ print_message(
174
+ parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
176
175
  break;
177
176
  case GUMBO_ERR_UTF8_TRUNCATED:
178
177
  print_message(parser, output,
179
- "Input stream ends with a truncated UTF8 character 0x%x",
180
- error->v.codepoint);
178
+ "Input stream ends with a truncated UTF8 character 0x%x",
179
+ error->v.codepoint);
181
180
  break;
182
181
  case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
183
- print_message(parser, output,
184
- "No digits after &# in numeric character reference");
182
+ print_message(
183
+ parser, output, "No digits after &# in numeric character reference");
185
184
  break;
186
185
  case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
187
186
  print_message(parser, output,
188
- "The numeric character reference &#%d should be followed "
189
- "by a semicolon", error->v.codepoint);
187
+ "The numeric character reference &#%d should be followed "
188
+ "by a semicolon",
189
+ error->v.codepoint);
190
190
  break;
191
191
  case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
192
192
  print_message(parser, output,
193
- "The numeric character reference &#%d; encodes an invalid "
194
- "unicode codepoint", error->v.codepoint);
193
+ "The numeric character reference &#%d; encodes an invalid "
194
+ "unicode codepoint",
195
+ error->v.codepoint);
195
196
  break;
196
197
  case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
197
198
  // The textual data came from one of the literal strings in the table, and
198
199
  // so it'll be null-terminated.
199
200
  print_message(parser, output,
200
- "The named character reference &%.*s should be followed by a "
201
- "semicolon", (int) error->v.text.length, error->v.text.data);
201
+ "The named character reference &%.*s should be followed by a "
202
+ "semicolon",
203
+ (int) error->v.text.length, error->v.text.data);
202
204
  break;
203
205
  case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
204
206
  print_message(parser, output,
205
- "The named character reference &%.*s; is not a valid entity name",
206
- (int) error->v.text.length, error->v.text.data);
207
+ "The named character reference &%.*s; is not a valid entity name",
208
+ (int) error->v.text.length, error->v.text.data);
207
209
  break;
208
210
  case GUMBO_ERR_DUPLICATE_ATTR:
209
211
  print_message(parser, output,
210
- "Attribute %s occurs multiple times, at positions %d and %d",
211
- error->v.duplicate_attr.name,
212
- error->v.duplicate_attr.original_index,
213
- error->v.duplicate_attr.new_index);
212
+ "Attribute %s occurs multiple times, at positions %d and %d",
213
+ error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
214
+ error->v.duplicate_attr.new_index);
214
215
  break;
215
216
  case GUMBO_ERR_PARSER:
216
217
  case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
@@ -218,21 +219,19 @@ void gumbo_error_to_string(
218
219
  break;
219
220
  default:
220
221
  print_message(parser, output,
221
- "Tokenizer error with an unimplemented error message");
222
+ "Tokenizer error with an unimplemented error message");
222
223
  break;
223
224
  }
224
225
  gumbo_string_buffer_append_codepoint(parser, '.', output);
225
226
  }
226
227
 
227
- void gumbo_caret_diagnostic_to_string(
228
- GumboParser* parser, const GumboError* error,
229
- const char* source_text, GumboStringBuffer* output) {
228
+ void gumbo_caret_diagnostic_to_string(GumboParser* parser,
229
+ const GumboError* error, const char* source_text,
230
+ GumboStringBuffer* output) {
230
231
  gumbo_error_to_string(parser, error, output);
231
232
 
232
- const char* line_start =
233
- find_last_newline(source_text, error->original_text);
234
- const char* line_end =
235
- find_next_newline(source_text, error->original_text);
233
+ const char* line_start = find_last_newline(source_text, error->original_text);
234
+ const char* line_end = find_next_newline(source_text, error->original_text);
236
235
  GumboStringPiece original_line;
237
236
  original_line.data = line_start;
238
237
  original_line.length = line_end - line_start;
@@ -273,7 +272,7 @@ void gumbo_init_errors(GumboParser* parser) {
273
272
  }
274
273
 
275
274
  void gumbo_destroy_errors(GumboParser* parser) {
276
- for (int i = 0; i < parser->_output->errors.length; ++i) {
275
+ for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
277
276
  gumbo_error_destroy(parser, parser->_output->errors.data[i]);
278
277
  }
279
278
  gumbo_vector_destroy(parser, &parser->_output->errors);