nokogumbo 1.5.0 → 2.0.0.pre.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
@@ -1,25 +1,9 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains an implementation of a tokenizer for HTML5. It consumes a
18
- // buffer of UTF-8 characters, and then emits a stream of tokens.
19
-
20
1
  #ifndef GUMBO_TOKENIZER_H_
21
2
  #define GUMBO_TOKENIZER_H_
22
3
 
4
+ // This contains an implementation of a tokenizer for HTML5. It consumes a
5
+ // buffer of UTF-8 characters, and then emits a stream of tokens.
6
+
23
7
  #include <stdbool.h>
24
8
  #include <stddef.h>
25
9
 
@@ -49,11 +33,21 @@ typedef struct GumboInternalTokenDocType {
49
33
  // Struct containing all information pertaining to start tag tokens.
50
34
  typedef struct GumboInternalTokenStartTag {
51
35
  GumboTag tag;
36
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
37
+ char *name;
52
38
  GumboVector /* GumboAttribute */ attributes;
53
39
  bool is_self_closing;
54
40
  } GumboTokenStartTag;
55
41
 
56
- // A data structure representing a single token in the input stream. This
42
+ // Struct containing all information pertaining to end tag tokens.
43
+ typedef struct GumboInternalTokenEndTag {
44
+ GumboTag tag;
45
+ // NULL unless tag is GUMBO_TAG_UNKNOWN
46
+ char *name;
47
+ bool is_self_closing;
48
+ } GumboTokenEndTag;
49
+
50
+ // A data structure representing a single token in the input stream. This
57
51
  // contains an enum for the type, the source position, a GumboStringPiece
58
52
  // pointing to the original text, and then a union for any parsed data.
59
53
  typedef struct GumboInternalToken {
@@ -63,7 +57,7 @@ typedef struct GumboInternalToken {
63
57
  union {
64
58
  GumboTokenDocType doc_type;
65
59
  GumboTokenStartTag start_tag;
66
- GumboTag end_tag;
60
+ GumboTokenEndTag end_tag;
67
61
  const char* text; // For comments.
68
62
  int character; // For character, whitespace, null, and EOF tokens.
69
63
  } v;
@@ -71,28 +65,35 @@ typedef struct GumboInternalToken {
71
65
 
72
66
  // Initializes the tokenizer state within the GumboParser object, setting up a
73
67
  // parse of the specified text.
74
- void gumbo_tokenizer_state_init(
75
- struct GumboInternalParser* parser, const char* text, size_t text_length);
68
+ void gumbo_tokenizer_state_init (
69
+ struct GumboInternalParser* parser,
70
+ const char* text,
71
+ size_t text_length
72
+ );
76
73
 
77
74
  // Destroys the tokenizer state within the GumboParser object, freeing any
78
75
  // dynamically-allocated structures within it.
79
76
  void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser);
80
77
 
81
- // Sets the tokenizer state to the specified value. This is needed by some
78
+ // Sets the tokenizer state to the specified value. This is needed by some
82
79
  // parser states, which alter the state of the tokenizer in response to tags
83
80
  // seen.
84
- void gumbo_tokenizer_set_state(
85
- struct GumboInternalParser* parser, GumboTokenizerEnum state);
81
+ void gumbo_tokenizer_set_state (
82
+ struct GumboInternalParser* parser,
83
+ GumboTokenizerEnum state
84
+ );
86
85
 
87
- // Flags whether the current node is a foreign content element. This is
86
+ // Flags whether the current node is a foreign content element. This is
88
87
  // necessary for the markup declaration open state, where the tokenizer must be
89
88
  // aware of the state of the parser to properly tokenize bad comment tags.
90
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
91
- void gumbo_tokenizer_set_is_current_node_foreign(
92
- struct GumboInternalParser* parser, bool is_foreign);
89
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
90
+ void gumbo_tokenizer_set_is_current_node_foreign (
91
+ struct GumboInternalParser* parser,
92
+ bool is_foreign
93
+ );
93
94
 
94
95
  // Lexes a single token from the specified buffer, filling the output with the
95
- // parsed GumboToken data structure. Returns true for a successful
96
+ // parsed GumboToken data structure. Returns true for a successful
96
97
  // tokenization, false if a parse error occurs.
97
98
  //
98
99
  // Example:
@@ -101,23 +102,22 @@ void gumbo_tokenizer_set_is_current_node_foreign(
101
102
  // gumbo_tokenizer_state_init(&parser, text, strlen(text));
102
103
  // while (gumbo_lex(&parser, &output)) {
103
104
  // ...do stuff with output.
104
- // gumbo_token_destroy(&parser, &token);
105
+ // gumbo_token_destroy(&token);
105
106
  // }
106
107
  // gumbo_tokenizer_state_destroy(&parser);
107
108
  bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
108
109
 
109
- // Frees the internally-allocated pointers within an GumboToken. Note that this
110
+ // Frees the internally-allocated pointers within a GumboToken. Note that this
110
111
  // doesn't free the token itself, since oftentimes it will be allocated on the
111
- // stack. A simple call to free() (or GumboParser->deallocator, if
112
- // appropriate) can handle that.
112
+ // stack.
113
113
  //
114
114
  // Note that if you are handing over ownership of the internal strings to some
115
115
  // other data structure - for example, a parse tree - these do not need to be
116
116
  // freed.
117
- void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token);
117
+ void gumbo_token_destroy(GumboToken* token);
118
118
 
119
119
  #ifdef __cplusplus
120
120
  }
121
121
  #endif
122
122
 
123
- #endif // GUMBO_TOKENIZER_H_
123
+ #endif // GUMBO_TOKENIZER_H_
@@ -1,32 +1,16 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains the list of states used in the tokenizer. Although at first
1
+ #ifndef GUMBO_TOKENIZER_STATES_H_
2
+ #define GUMBO_TOKENIZER_STATES_H_
3
+
4
+ // This contains the list of states used in the tokenizer. Although at first
18
5
  // glance it seems like these could be kept internal to the tokenizer, several
19
6
  // of the actions in the parser require that it reach into the tokenizer and
20
- // reset the tokenizer state. For that to work, it needs to have the
7
+ // reset the tokenizer state. For that to work, it needs to have the
21
8
  // definitions of individual states available.
22
9
  //
23
10
  // This may also be useful for providing more detailed error messages for parse
24
11
  // errors, as we can match up states and inputs in a table without having to
25
12
  // clutter the tokenizer code with lots of precise error messages.
26
13
 
27
- #ifndef GUMBO_TOKENIZER_STATES_H_
28
- #define GUMBO_TOKENIZER_STATES_H_
29
-
30
14
  // The ordering of this enum is also used to build the dispatch table for the
31
15
  // tokenizer state machine, so if it is changed, be sure to update that too.
32
16
  typedef enum {
@@ -100,4 +84,4 @@ typedef enum {
100
84
  GUMBO_LEX_CDATA
101
85
  } GumboTokenizerEnum;
102
86
 
103
- #endif // GUMBO_TOKENIZER_STATES_H_
87
+ #endif // GUMBO_TOKENIZER_STATES_H_
@@ -1,59 +1,53 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
16
17
 
17
18
  #include "utf8.h"
18
19
 
19
20
  #include <assert.h>
20
21
  #include <stdint.h>
21
22
  #include <string.h>
22
- #include <strings.h> // For strncasecmp.
23
23
 
24
24
  #include "error.h"
25
25
  #include "gumbo.h"
26
26
  #include "parser.h"
27
- #include "util.h"
27
+ #include "ascii.h"
28
28
  #include "vector.h"
29
29
 
30
30
  const int kUtf8ReplacementChar = 0xFFFD;
31
31
 
32
- // Reference material:
33
- // Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description
34
- // RFC 3629: http://tools.ietf.org/html/rfc3629
35
- // HTML5 Unicode handling:
36
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream
37
- //
38
- // This implementation is based on a DFA-based decoder by Bjoern Hoehrmann
39
- // <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our
40
- // own handling for newlines, tabs, invalid continuation bytes, and other
41
- // conditions that the HTML5 spec fully specifies but normal UTF8 decoders do
42
- // not handle.
43
- // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of
44
- // the license agreement and code follows.
32
+ // References:
33
+ // * https://tools.ietf.org/html/rfc3629
34
+ // * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
45
35
 
46
- // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
47
-
48
- // Permission is hereby granted, free of charge, to any person obtaining a copy
49
- // of this software and associated documentation files (the "Software"), to deal
50
- // in the Software without restriction, including without limitation the rights
51
- // to
52
- // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
53
- // of the Software, and to permit persons to whom the Software is furnished to
54
- // do
55
- // so, subject to the following conditions:
36
+ // The following code is a DFA-based UTF-8 decoder by Bjoern Hoehrmann.
37
+ // We wrap the inner table-based decoder routine in our own handling for
38
+ // newlines, tabs, invalid continuation bytes, and other conditions that
39
+ // the HTML5 spec fully specifies but normal UTF-8 decoders do not handle.
40
+ // See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
56
41
 
42
+ // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
43
+ //
44
+ // Permission is hereby granted, free of charge, to any person obtaining a
45
+ // copy of this software and associated documentation files (the "Software"),
46
+ // to deal in the Software without restriction, including without limitation
47
+ // the rights to use, copy, modify, merge, publish, distribute, sublicense,
48
+ // and/or sell copies of the Software, and to permit persons to whom the
49
+ // Software is furnished to do so, subject to the following conditions:
50
+ //
57
51
  // The above copyright notice and this permission notice shall be included in
58
52
  // all copies or substantial portions of the Software.
59
53
 
@@ -61,35 +55,33 @@ const int kUtf8ReplacementChar = 0xFFFD;
61
55
  #define UTF8_REJECT 12
62
56
 
63
57
  static const uint8_t utf8d[] = {
64
- // The first part of the table maps bytes to character classes that
65
- // to reduce the size of the transition table and create bitmasks.
66
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71
- 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9,
72
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
73
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2,
74
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10,
75
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8,
76
- 8, 8, 8, 8, 8, 8,
77
-
78
- // The second part is a transition table that maps a combination
79
- // of a state of the automaton and a character class to a state.
80
- 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12,
81
- 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12,
82
- 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
83
- 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
84
- 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
85
- 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
58
+ // The first part of the table maps bytes to character classes
59
+ // to reduce the size of the transition table and create bitmasks.
60
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
62
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
63
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
64
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
65
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
66
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
67
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
68
+
69
+ // The second part is a transition table that maps a combination
70
+ // of a state of the automaton and a character class to a state.
71
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
72
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
73
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
74
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
75
+ 12,36,12,12,12,12,12,12,12,12,12,12,
86
76
  };
87
77
 
88
- uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
78
+ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
89
79
  uint32_t type = utf8d[byte];
90
80
 
91
- *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
92
- : (0xff >> type) & (byte);
81
+ *codep =
82
+ (*state != UTF8_ACCEPT)
83
+ ? (byte & 0x3fu) | (*codep << 6)
84
+ : (0xff >> type) & (byte);
93
85
 
94
86
  *state = utf8d[256 + *state + type];
95
87
  return *state;
@@ -113,8 +105,8 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
113
105
  // At the point the error is recorded, the code point hasn't been computed
114
106
  // yet (and can't be, because it's invalid), so we need to build up the raw
115
107
  // hex value from the bytes under the cursor.
116
- uint64_t code_point = 0;
117
- for (int i = 0; i < iter->_width; ++i) {
108
+ uint32_t code_point = 0;
109
+ for (size_t i = 0; i < iter->_width; ++i) {
118
110
  code_point = (code_point << 8) | (unsigned char) iter->_start[i];
119
111
  }
120
112
  error->v.codepoint = code_point;
@@ -139,10 +131,10 @@ static void read_char(Utf8Iterator* iter) {
139
131
  if (state == UTF8_ACCEPT) {
140
132
  iter->_width = c - iter->_start + 1;
141
133
  // This is the special handling for carriage returns that is mandated by
142
- // the HTML5 spec. Since we're looking for particular 7-bit literal
134
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
143
135
  // characters, we operate in terms of chars and only need a check for iter
144
136
  // overrun, instead of having to read in a full next code point.
145
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
137
+ // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
146
138
  if (code_point == '\r') {
147
139
  assert(iter->_width == 1);
148
140
  const char* next = c + 1;
@@ -156,8 +148,10 @@ static void read_char(Utf8Iterator* iter) {
156
148
  code_point = '\n';
157
149
  }
158
150
  if (utf8_is_invalid_code_point(code_point)) {
151
+ // Invalid code points are errors, but they are not replaced by
152
+ // U+FFFD.
153
+ // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
159
154
  add_error(iter, GUMBO_ERR_UTF8_INVALID);
160
- code_point = kUtf8ReplacementChar;
161
155
  }
162
156
  iter->_current = code_point;
163
157
  return;
@@ -171,8 +165,8 @@ static void read_char(Utf8Iterator* iter) {
171
165
  }
172
166
  }
173
167
  // If we got here without exiting early, then we've reached the end of the
174
- // iterator. Add an error for truncated input, set the width to consume the
175
- // rest of the iterator, and emit a replacement character. The next time we
168
+ // iterator. Add an error for truncated input, set the width to consume the
169
+ // rest of the iterator, and emit a replacement character. The next time we
176
170
  // enter this method, it will detect that there's no input to consume and
177
171
  // output an EOF.
178
172
  iter->_current = kUtf8ReplacementChar;
@@ -196,13 +190,23 @@ static void update_position(Utf8Iterator* iter) {
196
190
  // Returns true if this Unicode code point is in the list of characters
197
191
  // forbidden by the HTML5 spec, such as undefined control chars.
198
192
  bool utf8_is_invalid_code_point(int c) {
199
- return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) ||
200
- (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
201
- ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
193
+ return
194
+ (c >= 0x1 && c <= 0x8)
195
+ || c == 0xB
196
+ || (c >= 0xE && c <= 0x1F)
197
+ || (c >= 0x7F && c <= 0x9F)
198
+ || (c >= 0xFDD0 && c <= 0xFDEF)
199
+ || ((c & 0xFFFF) == 0xFFFE)
200
+ || ((c & 0xFFFF) == 0xFFFF)
201
+ ;
202
202
  }
203
203
 
204
- void utf8iterator_init(GumboParser* parser, const char* source,
205
- size_t source_length, Utf8Iterator* iter) {
204
+ void utf8iterator_init (
205
+ GumboParser* parser,
206
+ const char* source,
207
+ size_t source_length,
208
+ Utf8Iterator* iter
209
+ ) {
206
210
  iter->_start = source;
207
211
  iter->_end = source + source_length;
208
212
  iter->_pos.line = 1;
@@ -220,10 +224,14 @@ void utf8iterator_next(Utf8Iterator* iter) {
220
224
  read_char(iter);
221
225
  }
222
226
 
223
- int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
227
+ int utf8iterator_current(const Utf8Iterator* iter) {
228
+ return iter->_current;
229
+ }
224
230
 
225
- void utf8iterator_get_position(
226
- const Utf8Iterator* iter, GumboSourcePosition* output) {
231
+ void utf8iterator_get_position (
232
+ const Utf8Iterator* iter,
233
+ GumboSourcePosition* output
234
+ ) {
227
235
  *output = iter->_pos;
228
236
  }
229
237
 
@@ -235,13 +243,22 @@ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
235
243
  return iter->_end;
236
244
  }
237
245
 
238
- bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
239
- size_t length, bool case_sensitive) {
240
- bool matched = (iter->_start + length <= iter->_end) &&
241
- (case_sensitive ? !strncmp(iter->_start, prefix, length)
242
- : !strncasecmp(iter->_start, prefix, length));
246
+ bool utf8iterator_maybe_consume_match (
247
+ Utf8Iterator* iter,
248
+ const char* prefix,
249
+ size_t length,
250
+ bool case_sensitive
251
+ ) {
252
+ bool matched =
253
+ (iter->_start + length <= iter->_end)
254
+ && (
255
+ case_sensitive
256
+ ? !strncmp(iter->_start, prefix, length)
257
+ : !gumbo_ascii_strncasecmp(iter->_start, prefix, length)
258
+ )
259
+ ;
243
260
  if (matched) {
244
- for (unsigned int i = 0; i < length; ++i) {
261
+ for (size_t i = 0; i < length; ++i) {
245
262
  utf8iterator_next(iter);
246
263
  }
247
264
  return true;
@@ -1,41 +1,26 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains an implementation of a UTF8 iterator and decoder suitable for
18
- // an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
1
+ #ifndef GUMBO_UTF8_H_
2
+ #define GUMBO_UTF8_H_
3
+
4
+ // This contains an implementation of a UTF-8 iterator and decoder suitable for
5
+ // an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
19
6
  // HTML5 spec specifies that:
20
7
  // 1. Decoding errors are parse errors.
21
- // 2. Certain other codepoints (eg. control characters) are parse errors.
8
+ // 2. Certain other codepoints (e.g. control characters) are parse errors.
22
9
  // 3. Carriage returns and CR/LF groups are converted to line feeds.
23
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
10
+ // https://encoding.spec.whatwg.org/#utf-8-decode
24
11
  //
25
- // Also, we want to keep track of source positions for error handling. As a
12
+ // Also, we want to keep track of source positions for error handling. As a
26
13
  // result, we fold all that functionality into this decoder, and can't use an
27
14
  // off-the-shelf library.
28
15
  //
29
16
  // This header is internal-only, which is why we prefix functions with only
30
17
  // utf8_ or utf8iterator_ instead of gumbo_utf8_.
31
18
 
32
- #ifndef GUMBO_UTF8_H_
33
- #define GUMBO_UTF8_H_
34
-
35
19
  #include <stdbool.h>
36
20
  #include <stddef.h>
37
21
 
38
22
  #include "gumbo.h"
23
+ #include "macros.h"
39
24
 
40
25
  #ifdef __cplusplus
41
26
  extern "C" {
@@ -51,7 +36,7 @@ typedef struct GumboInternalUtf8Iterator {
51
36
  // Points at the start of the code point most recently read into 'current'.
52
37
  const char* _start;
53
38
 
54
- // Points at the mark. The mark is initially set to the beginning of the
39
+ // Points at the mark. The mark is initially set to the beginning of the
55
40
  // input.
56
41
  const char* _mark;
57
42
 
@@ -62,7 +47,7 @@ typedef struct GumboInternalUtf8Iterator {
62
47
  int _current;
63
48
 
64
49
  // The width in bytes of the current code point.
65
- int _width;
50
+ size_t _width;
66
51
 
67
52
  // The SourcePosition for the current location.
68
53
  GumboSourcePosition _pos;
@@ -77,12 +62,16 @@ typedef struct GumboInternalUtf8Iterator {
77
62
 
78
63
  // Returns true if this Unicode code point is in the list of characters
79
64
  // forbidden by the HTML5 spec, such as undefined control chars.
80
- bool utf8_is_invalid_code_point(int c);
65
+ bool utf8_is_invalid_code_point(int c) CONST_FN;
81
66
 
82
- // Initializes a new Utf8Iterator from the given byte buffer. The source does
67
+ // Initializes a new Utf8Iterator from the given byte buffer. The source does
83
68
  // not have to be NUL-terminated, but the length must be passed in explicitly.
84
- void utf8iterator_init(struct GumboInternalParser* parser, const char* source,
85
- size_t source_length, Utf8Iterator* iter);
69
+ void utf8iterator_init (
70
+ struct GumboInternalParser* parser,
71
+ const char* source,
72
+ size_t source_length,
73
+ Utf8Iterator* iter
74
+ );
86
75
 
87
76
  // Advances the current position by one code point.
88
77
  void utf8iterator_next(Utf8Iterator* iter);
@@ -97,23 +86,27 @@ void utf8iterator_get_position(
97
86
  // Retrieves a character pointer to the start of the current character.
98
87
  const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
99
88
 
100
- // Retrieves a character pointer to 1 past the end of the buffer. This is
89
+ // Retrieves a character pointer to 1 past the end of the buffer. This is
101
90
  // necessary for certain state machines and string comparisons that would like
102
91
  // to look directly for ASCII text in the buffer without going through the
103
92
  // decoder.
104
93
  const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
105
94
 
106
95
  // If the upcoming text in the buffer matches the specified prefix (which has
107
- // length 'length'), consume it and return true. Otherwise, return false with
108
- // no other effects. If the length of the string would overflow the buffer,
109
- // this returns false. Note that prefix should not contain null bytes because
110
- // of the use of strncmp/strncasecmp internally. All existing use-cases adhere
96
+ // length 'length'), consume it and return true. Otherwise, return false with
97
+ // no other effects. If the length of the string would overflow the buffer,
98
+ // this returns false. Note that prefix should not contain null bytes because
99
+ // of the use of strncmp/gumbo_ascii_strncasecmp internally. All existing use-cases adhere
111
100
  // to this.
112
- bool utf8iterator_maybe_consume_match(
113
- Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive);
101
+ bool utf8iterator_maybe_consume_match (
102
+ Utf8Iterator* iter,
103
+ const char* prefix,
104
+ size_t length,
105
+ bool case_sensitive
106
+ );
114
107
 
115
108
  // "Marks" a particular location of interest in the input stream, so that it can
116
- // later be reset() to. There's also the ability to record an error at the
109
+ // later be reset() to. There's also the ability to record an error at the
117
110
  // point that was marked, as oftentimes that's more useful than the last
118
111
  // character before the error was detected.
119
112
  void utf8iterator_mark(Utf8Iterator* iter);
@@ -123,10 +116,13 @@ void utf8iterator_reset(Utf8Iterator* iter);
123
116
 
124
117
  // Sets the position and original text fields of an error to the value at the
125
118
  // mark.
126
- void utf8iterator_fill_error_at_mark(
127
- Utf8Iterator* iter, struct GumboInternalError* error);
119
+ void utf8iterator_fill_error_at_mark (
120
+ Utf8Iterator* iter,
121
+ struct GumboInternalError* error
122
+ );
128
123
 
129
124
  #ifdef __cplusplus
130
125
  }
131
126
  #endif
132
- #endif // GUMBO_UTF8_H_
127
+
128
+ #endif // GUMBO_UTF8_H_