nokogumbo 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
4
+ data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
5
+ SHA512:
6
+ metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
7
+ data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
data/README.md CHANGED
@@ -36,7 +36,13 @@ Example
36
36
  -----
37
37
  ```ruby
38
38
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
39
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
40
+ ```
41
+
42
+ Use `.to_html` instead of `.to_s` when parsing and serializing multiple times:
43
+ ```ruby
44
+ require 'nokogumbo'
45
+ Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
40
46
  ```
41
47
 
42
48
  Notes
@@ -83,5 +89,5 @@ Installation
83
89
  Related efforts
84
90
  ============
85
91
 
86
- * [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding
92
+ * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
87
93
  for the Gumbo HTML5 parser.
@@ -1,7 +1,7 @@
1
1
  require 'mkmf'
2
2
  $CFLAGS += " -std=c99"
3
3
 
4
- if have_library('xml2', 'xmlNewDoc')
4
+ if have_library('xml2', 'xmlNewDoc')
5
5
  # libxml2 libraries from http://www.xmlsoft.org/
6
6
  pkg_config('libxml-2.0')
7
7
 
@@ -19,11 +19,6 @@ if have_library('xml2', 'xmlNewDoc')
19
19
 
20
20
  # if found, enable direct calls to Nokogiri (and libxml2)
21
21
  $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
22
-
23
- if File.exists?("/etc/gentoo-release")
24
- # link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document'
25
- $LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so"
26
- end
27
22
  end
28
23
  end
29
24
 
@@ -45,4 +40,21 @@ unless have_library('gumbo', 'gumbo_parse')
45
40
  end
46
41
  end
47
42
 
43
+ # We use some Gumbo Internals, and not all distros ship the internal headers.
44
+ header_typedefs = {
45
+ 'error.h' => 'GumboErrorType',
46
+ 'insertion_mode.h' => 'GumboInsertionMode',
47
+ 'parser.h' => 'GumboParser',
48
+ 'string_buffer.h' => 'GumboStringBuffer',
49
+ 'token_type.h' => 'GumboTokenType',
50
+ }
51
+
52
+ header_typedefs.each_pair do |header, type|
53
+ unless find_type(type, header)
54
+ require 'fileutils'
55
+ FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/#{header}"],
56
+ "#{rakehome}/ext/nokogumboc"
57
+ end
58
+ end
59
+
48
60
  create_makefile('nokogumboc')
@@ -19,10 +19,13 @@
19
19
  //
20
20
 
21
21
  #include <ruby.h>
22
- #include <gumbo.h>
22
+ #include "gumbo.h"
23
+ #include "error.h"
24
+ #include "parser.h"
23
25
 
24
26
  // class constants
25
27
  static VALUE Document;
28
+ static VALUE XMLSyntaxError;
26
29
 
27
30
  #ifdef NGLIB
28
31
  #include <nokogiri.h>
@@ -82,8 +85,11 @@ static VALUE xmlNewDoc(char* version) {
82
85
  }
83
86
  #endif
84
87
 
85
- // Build a Nokogiri Element for a given GumboElement (recursively)
86
- static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
88
+ // Build an xmlNodePtr for a given GumboNode (recursively)
89
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
90
+
91
+ // Build an xmlNodePtr for a given GumboElement (recursively)
92
+ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
87
93
  // determine tag name for a given node
88
94
  xmlNodePtr element;
89
95
  if (node->tag != GUMBO_TAG_UNKNOWN) {
@@ -151,55 +157,108 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
151
157
  // add in the children
152
158
  GumboVector* children = &node->children;
153
159
  for (int i=0; i < children->length; i++) {
154
- GumboNode* child = children->data[i];
155
-
156
- xmlNodePtr node = NIL;
157
-
158
- switch (child->type) {
159
- case GUMBO_NODE_ELEMENT:
160
- node = walk_tree(document, &child->v.element);
161
- break;
162
- case GUMBO_NODE_WHITESPACE:
163
- case GUMBO_NODE_TEXT:
164
- node = xmlNewDocText(document, CONST_CAST child->v.text.text);
165
- break;
166
- case GUMBO_NODE_CDATA:
167
- node = xmlNewCDataBlock(document,
168
- CONST_CAST child->v.text.original_text.data,
169
- (int) child->v.text.original_text.length);
170
- break;
171
- case GUMBO_NODE_COMMENT:
172
- node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
173
- break;
174
- case GUMBO_NODE_DOCUMENT:
175
- break; // should never happen -- ignore
176
- }
177
-
160
+ xmlNodePtr node = walk_tree(document, children->data[i]);
178
161
  if (node) xmlAddChild(element, node);
179
162
  }
180
163
 
181
164
  return element;
182
165
  }
183
166
 
167
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
168
+ switch (node->type) {
169
+ case GUMBO_NODE_DOCUMENT:
170
+ return NIL;
171
+ case GUMBO_NODE_ELEMENT:
172
+ case GUMBO_NODE_TEMPLATE:
173
+ return walk_element(document, &node->v.element);
174
+ case GUMBO_NODE_TEXT:
175
+ case GUMBO_NODE_WHITESPACE:
176
+ return xmlNewDocText(document, CONST_CAST node->v.text.text);
177
+ case GUMBO_NODE_CDATA:
178
+ return xmlNewCDataBlock(document,
179
+ CONST_CAST node->v.text.original_text.data,
180
+ (int) node->v.text.original_text.length);
181
+ case GUMBO_NODE_COMMENT:
182
+ return xmlNewDocComment(document, CONST_CAST node->v.text.text);
183
+ }
184
+ }
185
+
184
186
  // Parse a string using gumbo_parse into a Nokogiri document
185
- static VALUE parse(VALUE self, VALUE string) {
186
- GumboOutput *output = gumbo_parse_with_options(
187
- &kGumboDefaultOptions, RSTRING_PTR(string),
188
- (size_t) RSTRING_LEN(string)
189
- );
187
+ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
188
+ GumboOptions options;
189
+ memcpy(&options, &kGumboDefaultOptions, sizeof options);
190
+ options.max_errors = NUM2INT(max_parse_errors);
191
+
192
+ const char *input = RSTRING_PTR(string);
193
+ size_t input_len = RSTRING_LEN(string);
194
+ GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
190
195
  xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
191
- xmlNodePtr root = walk_tree(doc, &output->root->v.element);
192
- xmlDocSetRootElement(doc, root);
196
+ #ifdef NGLIB
197
+ doc->type = XML_HTML_DOCUMENT_NODE;
198
+ #endif
193
199
  if (output->document->v.document.has_doctype) {
200
+ const char *name = output->document->v.document.name;
194
201
  const char *public = output->document->v.document.public_identifier;
195
202
  const char *system = output->document->v.document.system_identifier;
196
- xmlCreateIntSubset(doc, CONST_CAST "html",
197
- (strlen(public) ? CONST_CAST public : NIL),
198
- (strlen(system) ? CONST_CAST system : NIL));
203
+ xmlCreateIntSubset(doc, CONST_CAST name,
204
+ (public[0] ? CONST_CAST public : NIL),
205
+ (system[0] ? CONST_CAST system : NIL));
199
206
  }
200
- gumbo_destroy_output(&kGumboDefaultOptions, output);
201
207
 
202
- return Nokogiri_wrap_xml_document(Document, doc);
208
+ GumboVector *children = &output->document->v.document.children;
209
+ for (int i=0; i < children->length; i++) {
210
+ GumboNode *child = children->data[i];
211
+ xmlNodePtr node = walk_tree(doc, child);
212
+ if (node) {
213
+ if (child == output->root)
214
+ xmlDocSetRootElement(doc, node);
215
+ else
216
+ xmlAddChild((xmlNodePtr)doc, node);
217
+ }
218
+ }
219
+
220
+ VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
221
+
222
+ // Add parse errors to rdoc.
223
+ if (output->errors.length) {
224
+ GumboVector *errors = &output->errors;
225
+ GumboParser parser = { ._options = &options };
226
+ GumboStringBuffer msg;
227
+ VALUE rerrors = rb_ary_new2(errors->length);
228
+
229
+ gumbo_string_buffer_init(&parser, &msg);
230
+ for (int i=0; i < errors->length; i++) {
231
+ GumboError *err = errors->data[i];
232
+ gumbo_string_buffer_clear(&parser, &msg);
233
+ // Work around bug in gumbo_caret_diagnostic_to_string.
234
+ // See https://github.com/google/gumbo-parser/pull/371
235
+ // The bug occurs when the error starts with a newline (unless it's the
236
+ // first character in the input--but that shouldn't cause an error in
237
+ // the first place).
238
+ if (*err->original_text == '\n' && err->original_text != input)
239
+ --err->original_text;
240
+ gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
241
+ VALUE err_str = rb_str_new(msg.data, msg.length);
242
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
243
+ rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
244
+ rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
245
+ rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
246
+ rb_iv_set(syntax_error, "@file", Qnil);
247
+ rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
248
+ rb_iv_set(syntax_error, "@str1", Qnil);
249
+ rb_iv_set(syntax_error, "@str2", Qnil);
250
+ rb_iv_set(syntax_error, "@str3", Qnil);
251
+ rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
252
+ rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
253
+ rb_ary_push(rerrors, syntax_error);
254
+ }
255
+ rb_iv_set(rdoc, "@errors", rerrors);
256
+ gumbo_string_buffer_destroy(&parser, &msg);
257
+ }
258
+
259
+ gumbo_destroy_output(&options, output);
260
+
261
+ return rdoc;
203
262
  }
204
263
 
205
264
  // Initialize the Nokogumbo class and fetch constants we will use later
@@ -211,10 +270,11 @@ void Init_nokogumboc() {
211
270
  VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
212
271
  VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
213
272
  Document = rb_const_get(HTML, rb_intern("Document"));
273
+ VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
274
+ XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
214
275
 
215
276
  #ifndef NGLIB
216
277
  // more class constants
217
- VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
218
278
  Element = rb_const_get(XML, rb_intern("Element"));
219
279
  Text = rb_const_get(XML, rb_intern("Text"));
220
280
  CDATA = rb_const_get(XML, rb_intern("CDATA"));
@@ -223,7 +283,7 @@ void Init_nokogumboc() {
223
283
  // interned symbols
224
284
  new = rb_intern("new");
225
285
  set_attribute = rb_intern("set_attribute");
226
- add_child = rb_intern("add_child");
286
+ add_child = rb_intern("add_child_node_and_reparent_attrs");
227
287
  internal_subset = rb_intern("internal_subset");
228
288
  remove_ = rb_intern("remove");
229
289
  create_internal_subset = rb_intern("create_internal_subset");
@@ -231,5 +291,5 @@ void Init_nokogumboc() {
231
291
 
232
292
  // define Nokogumbo class with a singleton parse method
233
293
  VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
234
- rb_define_singleton_method(Gumbo, "parse", parse, 1);
294
+ rb_define_singleton_method(Gumbo, "parse", parse, 2);
235
295
  }
@@ -27,7 +27,7 @@ struct GumboInternalParser;
27
27
 
28
28
  GumboAttribute* gumbo_get_attribute(
29
29
  const GumboVector* attributes, const char* name) {
30
- for (int i = 0; i < attributes->length; ++i) {
30
+ for (unsigned int i = 0; i < attributes->length; ++i) {
31
31
  GumboAttribute* attr = attributes->data[i];
32
32
  if (!strcasecmp(attr->name, name)) {
33
33
  return attr;
@@ -30,7 +30,7 @@
30
30
  #include <ctype.h>
31
31
  #include <stddef.h>
32
32
  #include <stdio.h>
33
- #include <string.h> // Only for debug assertions at present.
33
+ #include <string.h> // Only for debug assertions at present.
34
34
 
35
35
  #include "error.h"
36
36
  #include "string_piece.h"
@@ -49,44 +49,18 @@ typedef struct {
49
49
  int to_char;
50
50
  } CharReplacement;
51
51
 
52
- static const CharReplacement kCharReplacements[] = {
53
- { 0x00, 0xfffd },
54
- { 0x0d, 0x000d },
55
- { 0x80, 0x20ac },
56
- { 0x81, 0x0081 },
57
- { 0x82, 0x201A },
58
- { 0x83, 0x0192 },
59
- { 0x84, 0x201E },
60
- { 0x85, 0x2026 },
61
- { 0x86, 0x2020 },
62
- { 0x87, 0x2021 },
63
- { 0x88, 0x02C6 },
64
- { 0x89, 0x2030 },
65
- { 0x8A, 0x0160 },
66
- { 0x8B, 0x2039 },
67
- { 0x8C, 0x0152 },
68
- { 0x8D, 0x008D },
69
- { 0x8E, 0x017D },
70
- { 0x8F, 0x008F },
71
- { 0x90, 0x0090 },
72
- { 0x91, 0x2018 },
73
- { 0x92, 0x2019 },
74
- { 0x93, 0x201C },
75
- { 0x94, 0x201D },
76
- { 0x95, 0x2022 },
77
- { 0x96, 0x2013 },
78
- { 0x97, 0x2014 },
79
- { 0x98, 0x02DC },
80
- { 0x99, 0x2122 },
81
- { 0x9A, 0x0161 },
82
- { 0x9B, 0x203A },
83
- { 0x9C, 0x0153 },
84
- { 0x9D, 0x009D },
85
- { 0x9E, 0x017E },
86
- { 0x9F, 0x0178 },
87
- // Terminator.
88
- { -1, -1 }
89
- };
52
+ static const CharReplacement kCharReplacements[] = {{0x00, 0xfffd},
53
+ {0x0d, 0x000d}, {0x80, 0x20ac}, {0x81, 0x0081}, {0x82, 0x201A},
54
+ {0x83, 0x0192}, {0x84, 0x201E}, {0x85, 0x2026}, {0x86, 0x2020},
55
+ {0x87, 0x2021}, {0x88, 0x02C6}, {0x89, 0x2030}, {0x8A, 0x0160},
56
+ {0x8B, 0x2039}, {0x8C, 0x0152}, {0x8D, 0x008D}, {0x8E, 0x017D},
57
+ {0x8F, 0x008F}, {0x90, 0x0090}, {0x91, 0x2018}, {0x92, 0x2019},
58
+ {0x93, 0x201C}, {0x94, 0x201D}, {0x95, 0x2022}, {0x96, 0x2013},
59
+ {0x97, 0x2014}, {0x98, 0x02DC}, {0x99, 0x2122}, {0x9A, 0x0161},
60
+ {0x9B, 0x203A}, {0x9C, 0x0153}, {0x9D, 0x009D}, {0x9E, 0x017E},
61
+ {0x9F, 0x0178},
62
+ // Terminator.
63
+ {-1, -1}};
90
64
 
91
65
  static int parse_digit(int c, bool allow_hex) {
92
66
  if (c >= '0' && c <= '9') {
@@ -111,9 +85,8 @@ static void add_no_digit_error(
111
85
  error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS;
112
86
  }
113
87
 
114
- static void add_codepoint_error(
115
- struct GumboInternalParser* parser, Utf8Iterator* input,
116
- GumboErrorType type, int codepoint) {
88
+ static void add_codepoint_error(struct GumboInternalParser* parser,
89
+ Utf8Iterator* input, GumboErrorType type, int codepoint) {
117
90
  GumboError* error = gumbo_add_error(parser);
118
91
  if (!error) {
119
92
  return;
@@ -123,9 +96,8 @@ static void add_codepoint_error(
123
96
  error->v.codepoint = codepoint;
124
97
  }
125
98
 
126
- static void add_named_reference_error(
127
- struct GumboInternalParser* parser, Utf8Iterator* input,
128
- GumboErrorType type, GumboStringPiece text) {
99
+ static void add_named_reference_error(struct GumboInternalParser* parser,
100
+ Utf8Iterator* input, GumboErrorType type, GumboStringPiece text) {
129
101
  GumboError* error = gumbo_add_error(parser);
130
102
  if (!error) {
131
103
  return;
@@ -211,8 +183,7 @@ static bool maybe_add_invalid_named_reference(
211
183
  // worry about consuming characters.
212
184
  const char* start = utf8iterator_get_char_pointer(input);
213
185
  int c = utf8iterator_current(input);
214
- while ((c >= 'a' && c <= 'z') ||
215
- (c >= 'A' && c <= 'Z') ||
186
+ while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
216
187
  (c >= '0' && c <= '9')) {
217
188
  utf8iterator_next(input);
218
189
  c = utf8iterator_current(input);
@@ -228,12 +199,11 @@ static bool maybe_add_invalid_named_reference(
228
199
  return true;
229
200
  }
230
201
 
231
-
232
202
  #line 2465 "char_ref.rl"
233
203
 
204
+ // clang-format off
234
205
 
235
-
236
- #line 237 "char_ref.c"
206
+ #line 238 "char_ref.c"
237
207
  static const short _char_ref_actions[] = {
238
208
  0, 1, 0, 1, 1, 1, 2, 1,
239
209
  3, 1, 4, 1, 5, 1, 6, 1,
@@ -13960,17 +13930,15 @@ static const short _char_ref_eof_trans[] = {
13960
13930
  };
13961
13931
 
13962
13932
  static const int char_ref_start = 7623;
13963
- static const int char_ref_first_final = 7623;
13964
- static const int char_ref_error = 0;
13965
13933
 
13966
13934
  static const int char_ref_en_valid_named_ref = 7623;
13967
13935
 
13968
13936
 
13969
- #line 2468 "char_ref.rl"
13937
+ #line 2469 "char_ref.rl"
13938
+ // clang-format on
13970
13939
 
13971
- static bool consume_named_ref(
13972
- struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
13973
- OneOrTwoCodepoints* output) {
13940
+ static bool consume_named_ref(struct GumboInternalParser* parser,
13941
+ Utf8Iterator* input, bool is_in_attribute, OneOrTwoCodepoints* output) {
13974
13942
  assert(output->first == kGumboNoChar);
13975
13943
  const char* p = utf8iterator_get_char_pointer(input);
13976
13944
  const char* pe = utf8iterator_get_end_pointer(input);
@@ -13979,8 +13947,9 @@ static bool consume_named_ref(
13979
13947
  const char *ts, *start;
13980
13948
  int cs, act;
13981
13949
 
13950
+ // clang-format off
13982
13951
 
13983
- #line 13984 "char_ref.c"
13952
+ #line 13985 "char_ref.c"
13984
13953
  {
13985
13954
  cs = char_ref_start;
13986
13955
  ts = 0;
@@ -13988,14 +13957,15 @@ static bool consume_named_ref(
13988
13957
  act = 0;
13989
13958
  }
13990
13959
 
13991
- #line 2481 "char_ref.rl"
13960
+ #line 2484 "char_ref.rl"
13992
13961
  // Avoid unused variable warnings.
13993
13962
  (void) act;
13994
13963
  (void) ts;
13964
+ (void) char_ref_en_valid_named_ref;
13995
13965
 
13996
13966
  start = p;
13997
13967
 
13998
- #line 13999 "char_ref.c"
13968
+ #line 14001 "char_ref.c"
13999
13969
  {
14000
13970
  int _slen;
14001
13971
  int _trans;
@@ -14017,7 +13987,7 @@ _resume:
14017
13987
  #line 1 "NONE"
14018
13988
  {ts = p;}
14019
13989
  break;
14020
- #line 14021 "char_ref.c"
13990
+ #line 14023 "char_ref.c"
14021
13991
  }
14022
13992
  }
14023
13993
 
@@ -23000,7 +22970,7 @@ _eof_trans:
23000
22970
  #line 2273 "char_ref.rl"
23001
22971
  {{p = ((te))-1;}{ output->first = 0xd7; {p++; goto _out; } }}
23002
22972
  break;
23003
- #line 23004 "char_ref.c"
22973
+ #line 23006 "char_ref.c"
23004
22974
  }
23005
22975
  }
23006
22976
 
@@ -23013,7 +22983,7 @@ _again:
23013
22983
  #line 1 "NONE"
23014
22984
  {ts = 0;}
23015
22985
  break;
23016
- #line 23017 "char_ref.c"
22986
+ #line 23019 "char_ref.c"
23017
22987
  }
23018
22988
  }
23019
22989
 
@@ -23033,7 +23003,8 @@ _again:
23033
23003
  _out: {}
23034
23004
  }
23035
23005
 
23036
- #line 2487 "char_ref.rl"
23006
+ #line 2491 "char_ref.rl"
23007
+ // clang-format on
23037
23008
 
23038
23009
  if (cs >= 7623) {
23039
23010
  assert(output->first != kGumboNoChar);
@@ -23067,10 +23038,9 @@ _again:
23067
23038
  }
23068
23039
  }
23069
23040
 
23070
- bool consume_char_ref(
23071
- struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
23072
- int additional_allowed_char, bool is_in_attribute,
23073
- OneOrTwoCodepoints* output) {
23041
+ bool consume_char_ref(struct GumboInternalParser* parser,
23042
+ struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
23043
+ bool is_in_attribute, OneOrTwoCodepoints* output) {
23074
23044
  utf8iterator_mark(input);
23075
23045
  utf8iterator_next(input);
23076
23046
  int c = utf8iterator_current(input);