nokogumbo 1.3.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
4
+ data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
5
+ SHA512:
6
+ metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
7
+ data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
data/README.md CHANGED
@@ -36,7 +36,13 @@ Example
36
36
  -----
37
37
  ```ruby
38
38
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
39
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
40
+ ```
41
+
42
+ Use `.to_html` instead of `.to_s` when parsing and serializing multiple times
43
+ ```
44
+ require 'nokogumbo'
45
+ Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
40
46
  ```
41
47
 
42
48
  Notes
@@ -83,5 +89,5 @@ Installation
83
89
  Related efforts
84
90
  ============
85
91
 
86
- * [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding
92
+ * [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
87
93
  for the Gumbo HTML5 parser.
@@ -1,7 +1,7 @@
1
1
  require 'mkmf'
2
2
  $CFLAGS += " -std=c99"
3
3
 
4
- if have_library('xml2', 'xmlNewDoc')
4
+ if have_library('xml2', 'xmlNewDoc')
5
5
  # libxml2 libraries from http://www.xmlsoft.org/
6
6
  pkg_config('libxml-2.0')
7
7
 
@@ -19,11 +19,6 @@ if have_library('xml2', 'xmlNewDoc')
19
19
 
20
20
  # if found, enable direct calls to Nokogiri (and libxml2)
21
21
  $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
22
-
23
- if File.exists?("/etc/gentoo-release")
24
- # link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document'
25
- $LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so"
26
- end
27
22
  end
28
23
  end
29
24
 
@@ -45,4 +40,21 @@ unless have_library('gumbo', 'gumbo_parse')
45
40
  end
46
41
  end
47
42
 
43
+ # We use some Gumbo internals, and not all distros ship the internal headers.
44
+ header_typedefs = {
45
+ 'error.h' => 'GumboErrorType',
46
+ 'insertion_mode.h' => 'GumboInsertionMode',
47
+ 'parser.h' => 'GumboParser',
48
+ 'string_buffer.h' => 'GumboStringBuffer',
49
+ 'token_type.h' => 'GumboTokenType',
50
+ }
51
+
52
+ header_typedefs.each_pair do |header, type|
53
+ unless find_type(type, header)
54
+ require 'fileutils'
55
+ FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/#{header}"],
56
+ "#{rakehome}/ext/nokogumboc"
57
+ end
58
+ end
59
+
48
60
  create_makefile('nokogumboc')
@@ -19,10 +19,13 @@
19
19
  //
20
20
 
21
21
  #include <ruby.h>
22
- #include <gumbo.h>
22
+ #include "gumbo.h"
23
+ #include "error.h"
24
+ #include "parser.h"
23
25
 
24
26
  // class constants
25
27
  static VALUE Document;
28
+ static VALUE XMLSyntaxError;
26
29
 
27
30
  #ifdef NGLIB
28
31
  #include <nokogiri.h>
@@ -82,8 +85,11 @@ static VALUE xmlNewDoc(char* version) {
82
85
  }
83
86
  #endif
84
87
 
85
- // Build a Nokogiri Element for a given GumboElement (recursively)
86
- static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
88
+ // Build a xmlNodePtr for a given GumboNode (recursively)
89
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
90
+
91
+ // Build a xmlNodePtr for a given GumboElement (recursively)
92
+ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
87
93
  // determine tag name for a given node
88
94
  xmlNodePtr element;
89
95
  if (node->tag != GUMBO_TAG_UNKNOWN) {
@@ -151,55 +157,108 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
151
157
  // add in the children
152
158
  GumboVector* children = &node->children;
153
159
  for (int i=0; i < children->length; i++) {
154
- GumboNode* child = children->data[i];
155
-
156
- xmlNodePtr node = NIL;
157
-
158
- switch (child->type) {
159
- case GUMBO_NODE_ELEMENT:
160
- node = walk_tree(document, &child->v.element);
161
- break;
162
- case GUMBO_NODE_WHITESPACE:
163
- case GUMBO_NODE_TEXT:
164
- node = xmlNewDocText(document, CONST_CAST child->v.text.text);
165
- break;
166
- case GUMBO_NODE_CDATA:
167
- node = xmlNewCDataBlock(document,
168
- CONST_CAST child->v.text.original_text.data,
169
- (int) child->v.text.original_text.length);
170
- break;
171
- case GUMBO_NODE_COMMENT:
172
- node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
173
- break;
174
- case GUMBO_NODE_DOCUMENT:
175
- break; // should never happen -- ignore
176
- }
177
-
160
+ xmlNodePtr node = walk_tree(document, children->data[i]);
178
161
  if (node) xmlAddChild(element, node);
179
162
  }
180
163
 
181
164
  return element;
182
165
  }
183
166
 
167
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
168
+ switch (node->type) {
169
+ case GUMBO_NODE_DOCUMENT:
170
+ return NIL;
171
+ case GUMBO_NODE_ELEMENT:
172
+ case GUMBO_NODE_TEMPLATE:
173
+ return walk_element(document, &node->v.element);
174
+ case GUMBO_NODE_TEXT:
175
+ case GUMBO_NODE_WHITESPACE:
176
+ return xmlNewDocText(document, CONST_CAST node->v.text.text);
177
+ case GUMBO_NODE_CDATA:
178
+ return xmlNewCDataBlock(document,
179
+ CONST_CAST node->v.text.original_text.data,
180
+ (int) node->v.text.original_text.length);
181
+ case GUMBO_NODE_COMMENT:
182
+ return xmlNewDocComment(document, CONST_CAST node->v.text.text);
183
+ }
184
+ }
185
+
184
186
  // Parse a string using gumbo_parse into a Nokogiri document
185
- static VALUE parse(VALUE self, VALUE string) {
186
- GumboOutput *output = gumbo_parse_with_options(
187
- &kGumboDefaultOptions, RSTRING_PTR(string),
188
- (size_t) RSTRING_LEN(string)
189
- );
187
+ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
188
+ GumboOptions options;
189
+ memcpy(&options, &kGumboDefaultOptions, sizeof options);
190
+ options.max_errors = NUM2INT(max_parse_errors);
191
+
192
+ const char *input = RSTRING_PTR(string);
193
+ size_t input_len = RSTRING_LEN(string);
194
+ GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
190
195
  xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
191
- xmlNodePtr root = walk_tree(doc, &output->root->v.element);
192
- xmlDocSetRootElement(doc, root);
196
+ #ifdef NGLIB
197
+ doc->type = XML_HTML_DOCUMENT_NODE;
198
+ #endif
193
199
  if (output->document->v.document.has_doctype) {
200
+ const char *name = output->document->v.document.name;
194
201
  const char *public = output->document->v.document.public_identifier;
195
202
  const char *system = output->document->v.document.system_identifier;
196
- xmlCreateIntSubset(doc, CONST_CAST "html",
197
- (strlen(public) ? CONST_CAST public : NIL),
198
- (strlen(system) ? CONST_CAST system : NIL));
203
+ xmlCreateIntSubset(doc, CONST_CAST name,
204
+ (public[0] ? CONST_CAST public : NIL),
205
+ (system[0] ? CONST_CAST system : NIL));
199
206
  }
200
- gumbo_destroy_output(&kGumboDefaultOptions, output);
201
207
 
202
- return Nokogiri_wrap_xml_document(Document, doc);
208
+ GumboVector *children = &output->document->v.document.children;
209
+ for (int i=0; i < children->length; i++) {
210
+ GumboNode *child = children->data[i];
211
+ xmlNodePtr node = walk_tree(doc, child);
212
+ if (node) {
213
+ if (child == output->root)
214
+ xmlDocSetRootElement(doc, node);
215
+ else
216
+ xmlAddChild((xmlNodePtr)doc, node);
217
+ }
218
+ }
219
+
220
+ VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
221
+
222
+ // Add parse errors to rdoc.
223
+ if (output->errors.length) {
224
+ GumboVector *errors = &output->errors;
225
+ GumboParser parser = { ._options = &options };
226
+ GumboStringBuffer msg;
227
+ VALUE rerrors = rb_ary_new2(errors->length);
228
+
229
+ gumbo_string_buffer_init(&parser, &msg);
230
+ for (int i=0; i < errors->length; i++) {
231
+ GumboError *err = errors->data[i];
232
+ gumbo_string_buffer_clear(&parser, &msg);
233
+ // Work around bug in gumbo_caret_diagnostic_to_string.
234
+ // See https://github.com/google/gumbo-parser/pull/371
235
+ // The bug occurs when the error starts with a newline (unless it's the
236
+ // first character in the input--but that shouldn't cause an error in
237
+ // the first place).
238
+ if (*err->original_text == '\n' && err->original_text != input)
239
+ --err->original_text;
240
+ gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
241
+ VALUE err_str = rb_str_new(msg.data, msg.length);
242
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
243
+ rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
244
+ rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
245
+ rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
246
+ rb_iv_set(syntax_error, "@file", Qnil);
247
+ rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
248
+ rb_iv_set(syntax_error, "@str1", Qnil);
249
+ rb_iv_set(syntax_error, "@str2", Qnil);
250
+ rb_iv_set(syntax_error, "@str3", Qnil);
251
+ rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
252
+ rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
253
+ rb_ary_push(rerrors, syntax_error);
254
+ }
255
+ rb_iv_set(rdoc, "@errors", rerrors);
256
+ gumbo_string_buffer_destroy(&parser, &msg);
257
+ }
258
+
259
+ gumbo_destroy_output(&options, output);
260
+
261
+ return rdoc;
203
262
  }
204
263
 
205
264
  // Initialize the Nokogumbo class and fetch constants we will use later
@@ -211,10 +270,11 @@ void Init_nokogumboc() {
211
270
  VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
212
271
  VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
213
272
  Document = rb_const_get(HTML, rb_intern("Document"));
273
+ VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
274
+ XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
214
275
 
215
276
  #ifndef NGLIB
216
277
  // more class constants
217
- VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
218
278
  Element = rb_const_get(XML, rb_intern("Element"));
219
279
  Text = rb_const_get(XML, rb_intern("Text"));
220
280
  CDATA = rb_const_get(XML, rb_intern("CDATA"));
@@ -223,7 +283,7 @@ void Init_nokogumboc() {
223
283
  // interned symbols
224
284
  new = rb_intern("new");
225
285
  set_attribute = rb_intern("set_attribute");
226
- add_child = rb_intern("add_child");
286
+ add_child = rb_intern("add_child_node_and_reparent_attrs");
227
287
  internal_subset = rb_intern("internal_subset");
228
288
  remove_ = rb_intern("remove");
229
289
  create_internal_subset = rb_intern("create_internal_subset");
@@ -231,5 +291,5 @@ void Init_nokogumboc() {
231
291
 
232
292
  // define Nokogumbo class with a singleton parse method
233
293
  VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
234
- rb_define_singleton_method(Gumbo, "parse", parse, 1);
294
+ rb_define_singleton_method(Gumbo, "parse", parse, 2);
235
295
  }
@@ -27,7 +27,7 @@ struct GumboInternalParser;
27
27
 
28
28
  GumboAttribute* gumbo_get_attribute(
29
29
  const GumboVector* attributes, const char* name) {
30
- for (int i = 0; i < attributes->length; ++i) {
30
+ for (unsigned int i = 0; i < attributes->length; ++i) {
31
31
  GumboAttribute* attr = attributes->data[i];
32
32
  if (!strcasecmp(attr->name, name)) {
33
33
  return attr;
@@ -30,7 +30,7 @@
30
30
  #include <ctype.h>
31
31
  #include <stddef.h>
32
32
  #include <stdio.h>
33
- #include <string.h> // Only for debug assertions at present.
33
+ #include <string.h> // Only for debug assertions at present.
34
34
 
35
35
  #include "error.h"
36
36
  #include "string_piece.h"
@@ -49,44 +49,18 @@ typedef struct {
49
49
  int to_char;
50
50
  } CharReplacement;
51
51
 
52
- static const CharReplacement kCharReplacements[] = {
53
- { 0x00, 0xfffd },
54
- { 0x0d, 0x000d },
55
- { 0x80, 0x20ac },
56
- { 0x81, 0x0081 },
57
- { 0x82, 0x201A },
58
- { 0x83, 0x0192 },
59
- { 0x84, 0x201E },
60
- { 0x85, 0x2026 },
61
- { 0x86, 0x2020 },
62
- { 0x87, 0x2021 },
63
- { 0x88, 0x02C6 },
64
- { 0x89, 0x2030 },
65
- { 0x8A, 0x0160 },
66
- { 0x8B, 0x2039 },
67
- { 0x8C, 0x0152 },
68
- { 0x8D, 0x008D },
69
- { 0x8E, 0x017D },
70
- { 0x8F, 0x008F },
71
- { 0x90, 0x0090 },
72
- { 0x91, 0x2018 },
73
- { 0x92, 0x2019 },
74
- { 0x93, 0x201C },
75
- { 0x94, 0x201D },
76
- { 0x95, 0x2022 },
77
- { 0x96, 0x2013 },
78
- { 0x97, 0x2014 },
79
- { 0x98, 0x02DC },
80
- { 0x99, 0x2122 },
81
- { 0x9A, 0x0161 },
82
- { 0x9B, 0x203A },
83
- { 0x9C, 0x0153 },
84
- { 0x9D, 0x009D },
85
- { 0x9E, 0x017E },
86
- { 0x9F, 0x0178 },
87
- // Terminator.
88
- { -1, -1 }
89
- };
52
+ static const CharReplacement kCharReplacements[] = {{0x00, 0xfffd},
53
+ {0x0d, 0x000d}, {0x80, 0x20ac}, {0x81, 0x0081}, {0x82, 0x201A},
54
+ {0x83, 0x0192}, {0x84, 0x201E}, {0x85, 0x2026}, {0x86, 0x2020},
55
+ {0x87, 0x2021}, {0x88, 0x02C6}, {0x89, 0x2030}, {0x8A, 0x0160},
56
+ {0x8B, 0x2039}, {0x8C, 0x0152}, {0x8D, 0x008D}, {0x8E, 0x017D},
57
+ {0x8F, 0x008F}, {0x90, 0x0090}, {0x91, 0x2018}, {0x92, 0x2019},
58
+ {0x93, 0x201C}, {0x94, 0x201D}, {0x95, 0x2022}, {0x96, 0x2013},
59
+ {0x97, 0x2014}, {0x98, 0x02DC}, {0x99, 0x2122}, {0x9A, 0x0161},
60
+ {0x9B, 0x203A}, {0x9C, 0x0153}, {0x9D, 0x009D}, {0x9E, 0x017E},
61
+ {0x9F, 0x0178},
62
+ // Terminator.
63
+ {-1, -1}};
90
64
 
91
65
  static int parse_digit(int c, bool allow_hex) {
92
66
  if (c >= '0' && c <= '9') {
@@ -111,9 +85,8 @@ static void add_no_digit_error(
111
85
  error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS;
112
86
  }
113
87
 
114
- static void add_codepoint_error(
115
- struct GumboInternalParser* parser, Utf8Iterator* input,
116
- GumboErrorType type, int codepoint) {
88
+ static void add_codepoint_error(struct GumboInternalParser* parser,
89
+ Utf8Iterator* input, GumboErrorType type, int codepoint) {
117
90
  GumboError* error = gumbo_add_error(parser);
118
91
  if (!error) {
119
92
  return;
@@ -123,9 +96,8 @@ static void add_codepoint_error(
123
96
  error->v.codepoint = codepoint;
124
97
  }
125
98
 
126
- static void add_named_reference_error(
127
- struct GumboInternalParser* parser, Utf8Iterator* input,
128
- GumboErrorType type, GumboStringPiece text) {
99
+ static void add_named_reference_error(struct GumboInternalParser* parser,
100
+ Utf8Iterator* input, GumboErrorType type, GumboStringPiece text) {
129
101
  GumboError* error = gumbo_add_error(parser);
130
102
  if (!error) {
131
103
  return;
@@ -211,8 +183,7 @@ static bool maybe_add_invalid_named_reference(
211
183
  // worry about consuming characters.
212
184
  const char* start = utf8iterator_get_char_pointer(input);
213
185
  int c = utf8iterator_current(input);
214
- while ((c >= 'a' && c <= 'z') ||
215
- (c >= 'A' && c <= 'Z') ||
186
+ while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
216
187
  (c >= '0' && c <= '9')) {
217
188
  utf8iterator_next(input);
218
189
  c = utf8iterator_current(input);
@@ -228,12 +199,11 @@ static bool maybe_add_invalid_named_reference(
228
199
  return true;
229
200
  }
230
201
 
231
-
232
202
  #line 2465 "char_ref.rl"
233
203
 
204
+ // clang-format off
234
205
 
235
-
236
- #line 237 "char_ref.c"
206
+ #line 238 "char_ref.c"
237
207
  static const short _char_ref_actions[] = {
238
208
  0, 1, 0, 1, 1, 1, 2, 1,
239
209
  3, 1, 4, 1, 5, 1, 6, 1,
@@ -13960,17 +13930,15 @@ static const short _char_ref_eof_trans[] = {
13960
13930
  };
13961
13931
 
13962
13932
  static const int char_ref_start = 7623;
13963
- static const int char_ref_first_final = 7623;
13964
- static const int char_ref_error = 0;
13965
13933
 
13966
13934
  static const int char_ref_en_valid_named_ref = 7623;
13967
13935
 
13968
13936
 
13969
- #line 2468 "char_ref.rl"
13937
+ #line 2469 "char_ref.rl"
13938
+ // clang-format on
13970
13939
 
13971
- static bool consume_named_ref(
13972
- struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
13973
- OneOrTwoCodepoints* output) {
13940
+ static bool consume_named_ref(struct GumboInternalParser* parser,
13941
+ Utf8Iterator* input, bool is_in_attribute, OneOrTwoCodepoints* output) {
13974
13942
  assert(output->first == kGumboNoChar);
13975
13943
  const char* p = utf8iterator_get_char_pointer(input);
13976
13944
  const char* pe = utf8iterator_get_end_pointer(input);
@@ -13979,8 +13947,9 @@ static bool consume_named_ref(
13979
13947
  const char *ts, *start;
13980
13948
  int cs, act;
13981
13949
 
13950
+ // clang-format off
13982
13951
 
13983
- #line 13984 "char_ref.c"
13952
+ #line 13985 "char_ref.c"
13984
13953
  {
13985
13954
  cs = char_ref_start;
13986
13955
  ts = 0;
@@ -13988,14 +13957,15 @@ static bool consume_named_ref(
13988
13957
  act = 0;
13989
13958
  }
13990
13959
 
13991
- #line 2481 "char_ref.rl"
13960
+ #line 2484 "char_ref.rl"
13992
13961
  // Avoid unused variable warnings.
13993
13962
  (void) act;
13994
13963
  (void) ts;
13964
+ (void) char_ref_en_valid_named_ref;
13995
13965
 
13996
13966
  start = p;
13997
13967
 
13998
- #line 13999 "char_ref.c"
13968
+ #line 14001 "char_ref.c"
13999
13969
  {
14000
13970
  int _slen;
14001
13971
  int _trans;
@@ -14017,7 +13987,7 @@ _resume:
14017
13987
  #line 1 "NONE"
14018
13988
  {ts = p;}
14019
13989
  break;
14020
- #line 14021 "char_ref.c"
13990
+ #line 14023 "char_ref.c"
14021
13991
  }
14022
13992
  }
14023
13993
 
@@ -23000,7 +22970,7 @@ _eof_trans:
23000
22970
  #line 2273 "char_ref.rl"
23001
22971
  {{p = ((te))-1;}{ output->first = 0xd7; {p++; goto _out; } }}
23002
22972
  break;
23003
- #line 23004 "char_ref.c"
22973
+ #line 23006 "char_ref.c"
23004
22974
  }
23005
22975
  }
23006
22976
 
@@ -23013,7 +22983,7 @@ _again:
23013
22983
  #line 1 "NONE"
23014
22984
  {ts = 0;}
23015
22985
  break;
23016
- #line 23017 "char_ref.c"
22986
+ #line 23019 "char_ref.c"
23017
22987
  }
23018
22988
  }
23019
22989
 
@@ -23033,7 +23003,8 @@ _again:
23033
23003
  _out: {}
23034
23004
  }
23035
23005
 
23036
- #line 2487 "char_ref.rl"
23006
+ #line 2491 "char_ref.rl"
23007
+ // clang-format on
23037
23008
 
23038
23009
  if (cs >= 7623) {
23039
23010
  assert(output->first != kGumboNoChar);
@@ -23067,10 +23038,9 @@ _again:
23067
23038
  }
23068
23039
  }
23069
23040
 
23070
- bool consume_char_ref(
23071
- struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
23072
- int additional_allowed_char, bool is_in_attribute,
23073
- OneOrTwoCodepoints* output) {
23041
+ bool consume_char_ref(struct GumboInternalParser* parser,
23042
+ struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
23043
+ bool is_in_attribute, OneOrTwoCodepoints* output) {
23074
23044
  utf8iterator_mark(input);
23075
23045
  utf8iterator_next(input);
23076
23046
  int c = utf8iterator_current(input);