nokogumbo 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/gumbo-parser/src/token_type.h CHANGED
@@ -29,6 +29,7 @@ typedef enum {
  GUMBO_TOKEN_COMMENT,
  GUMBO_TOKEN_WHITESPACE,
  GUMBO_TOKEN_CHARACTER,
+ GUMBO_TOKEN_CDATA,
  GUMBO_TOKEN_NULL,
  GUMBO_TOKEN_EOF
  } GumboTokenType;
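The tokenizer gains a dedicated CDATA token type, so any code that switches over GumboTokenType now has one more case to consider when deciding what counts as text. A minimal sketch, assuming gumbo-parser/src is on the include path (these headers are internal to the vendored parser and are not installed by the gem):

#include <stdbool.h>
#include "token_type.h"

/* Text-bearing token types now include the new CDATA member; a switch
   written against 1.3.0 would silently fall through to its default case. */
static bool is_text_token(GumboTokenType type) {
  switch (type) {
    case GUMBO_TOKEN_WHITESPACE:
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_CDATA:   /* new in 1.4.x */
      return true;
    default:
      return false;
  }
}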
data/gumbo-parser/src/tokenizer.c CHANGED
@@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState {
  // markup declaration state.
  bool _is_current_node_foreign;

+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
+ bool _is_in_cdata;
+
  // Certain states (notably character references) may emit two character tokens
  // at once, but the contract for lex() fills in only one token at a time. The
  // extra character is buffered here, and then this is checked on entry to
@@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
  return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
  }

- static GumboTokenType get_char_token_type(int c) {
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
+ if (is_in_cdata && c > 0) {
+ return GUMBO_TOKEN_CDATA;
+ }
+
  switch (c) {
  case '\t':
  case '\n':
@@ -348,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  assert(!tokenizer->_temporary_buffer_emit);
  utf8iterator_mark(&tokenizer->_input);
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
+ gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
  // The temporary buffer and script data buffer are the same object in the
  // spec, so the script data buffer should be cleared as well.
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
  }

  // Appends a codepoint to the temporary buffer.
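gumbo_string_buffer_clear is new in this release and its definition is not part of this hunk, but replacing a destroy-and-reinit pair with a single clear call suggests the buffer's allocation is now reused instead of being freed and reallocated on every markup declaration. A hypothetical illustration of that pattern (not the library's actual string_buffer.c code):

#include <stddef.h>

typedef struct {
  char* data;
  size_t length;
  size_t capacity;
} StringBuffer;  /* stand-in for GumboStringBuffer */

static void string_buffer_clear(StringBuffer* buffer) {
  /* Resetting the length keeps data and capacity, so the next append writes
     into the existing allocation rather than paying for the free() + malloc()
     round trip that destroy-then-init performed. */
  buffer->length = 0;
}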
@@ -475,7 +481,7 @@ static void finish_doctype_system_id(GumboParser* parser) {

  // Writes a single specified character to the output token.
  static void emit_char(GumboParser* parser, int c, GumboToken* output) {
- output->type = get_char_token_type(c);
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
  output->v.character = c;
  finish_token(parser, output);
  }
@@ -689,7 +695,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
  gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);

  assert(tag_state->_attributes.data == NULL);
- gumbo_vector_init(parser, 4, &tag_state->_attributes);
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
+ gumbo_vector_init(parser, 1, &tag_state->_attributes);
  tag_state->_drop_next_attr_value = false;
  tag_state->_is_start_tag = is_start_tag;
  tag_state->_is_self_closing = false;
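The new comment carries the reasoning for shrinking the initial capacity from 4 to 1: almost all tags have zero attributes, so the old four-slot allocation was usually wasted. The sketch below assumes a simple doubling growth policy (Gumbo's real policy lives in vector.c and may differ) and shows that the rare multi-attribute tag costs at most a couple of reallocations while the common case allocates a single slot:

#include <stdio.h>

/* Count reallocations a doubling vector needs to hold n_attrs entries. */
static unsigned reallocs_needed(unsigned initial_capacity, unsigned n_attrs) {
  unsigned capacity = initial_capacity;
  unsigned reallocs = 0;
  while (capacity < n_attrs) {
    capacity *= 2;
    ++reallocs;
  }
  return reallocs;
}

int main(void) {
  for (unsigned n = 0; n <= 4; ++n) {
    printf("%u attribute(s): %u reallocation(s) beyond the initial slot\n",
           n, reallocs_needed(1, n));
  }
  return 0;
}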
@@ -743,11 +753,9 @@ static void finish_tag_name(GumboParser* parser) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  GumboTagState* tag_state = &tokenizer->_tag_state;

- const char* temp;
- copy_over_tag_buffer(parser, &temp);
- tag_state->_tag = gumbo_tag_enum(temp);
+ tag_state->_tag = gumbo_tagn_enum(
+ tag_state->_buffer.data, tag_state->_buffer.length);
  reinitialize_tag_buffer(parser);
- gumbo_parser_deallocate(parser, (void*) temp);
  }

  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
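gumbo_tagn_enum takes an explicit (pointer, length) pair, which is why the copy into a null-terminated temporary and the matching deallocation disappear here and in the is_appropriate_end_tag hunk below. A hypothetical length-aware lookup in the same spirit (the gem's real lookup appears to be gperf-generated, per the tag_gperf.h file added in the metadata at the end of this diff):

#include <string.h>

/* Return the index of the tag name matching an unterminated buffer slice,
   or -1 if it is not in the table. */
static int tag_index(const char* data, size_t length,
                     const char* const* names, size_t n_names) {
  for (size_t i = 0; i < n_names; ++i) {
    if (strlen(names[i]) == length && memcmp(names[i], data, length) == 0) {
      return (int) i;
    }
  }
  return -1;
}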
@@ -833,13 +841,9 @@ static void finish_attribute_value(GumboParser* parser) {
  static bool is_appropriate_end_tag(GumboParser* parser) {
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
  assert(!tag_state->_is_start_tag);
- // Null terminate the current string buffer, so it can be passed to
- // gumbo_tag_enum, but don't increment the length in case we need to dump the
- // buffer as character tokens.
- gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
- --tag_state->_buffer.length;
  return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
- tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
+ tag_state->_last_start_tag ==
+ gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
  }

  void gumbo_tokenizer_state_init(
@@ -850,6 +854,7 @@ void gumbo_tokenizer_state_init(
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
  tokenizer->_reconsume_current_input = false;
  tokenizer->_is_current_node_foreign = false;
+ tokenizer->_is_in_cdata = false;
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;

  tokenizer->_buffered_emit_char = kGumboNoChar;
@@ -1588,8 +1593,7 @@ static StateResult handle_script_double_escaped_lt_state(
  int c, GumboToken* output) {
  if (c == '/') {
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
  return emit_current_char(parser, output);
  } else {
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -2041,6 +2045,7 @@ static StateResult handle_markup_declaration_state(
  utf8iterator_maybe_consume_match(
  &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
+ tokenizer->_is_in_cdata = true;
  tokenizer->_reconsume_current_input = true;
  } else {
  tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
@@ -2568,7 +2573,8 @@ static StateResult handle_after_doctype_public_id_state(
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
  tokenizer->_reconsume_current_input = true;
  tokenizer->_doc_type_state.force_quirks = true;
- return NEXT_CHAR;
+ emit_doctype(parser, output);
+ return RETURN_ERROR;
  default:
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
@@ -2813,6 +2819,7 @@ static StateResult handle_cdata_state(
  tokenizer->_reconsume_current_input = true;
  reset_token_start_point(tokenizer);
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+ tokenizer->_is_in_cdata = false;
  return NEXT_CHAR;
  } else {
  return emit_current_char(parser, output);
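Taken together, the CDATA changes set _is_in_cdata when the markup declaration handler consumes "<![CDATA[", clear it again when the CDATA state returns to the data state, and route every character emitted in between through the new GUMBO_TOKEN_CDATA type. A sketch of how this can surface through the public C API, assuming the vendored gumbo.h and sources are compiled in; whether a given build reports the section as a GUMBO_NODE_CDATA node or as plain text is worth verifying rather than assuming:

#include <stdio.h>
#include "gumbo.h"

/* Recursively print the text of any CDATA nodes in the parse tree. */
static void print_cdata(const GumboNode* node) {
  if (node->type == GUMBO_NODE_CDATA) {
    printf("CDATA: %s\n", node->v.text.text);
    return;
  }
  if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_DOCUMENT) {
    return;
  }
  const GumboVector* children = node->type == GUMBO_NODE_DOCUMENT
      ? &node->v.document.children
      : &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    print_cdata((const GumboNode*) children->data[i]);
  }
}

int main(void) {
  /* CDATA sections are only tokenized as such in foreign content (SVG/MathML). */
  GumboOutput* output =
      gumbo_parse("<!DOCTYPE html><svg><![CDATA[a < b]]></svg>");
  print_cdata(output->root);
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  return 0;
}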
@@ -2929,7 +2936,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  assert(!tokenizer->_temporary_buffer_emit);
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
  int c = utf8iterator_current(&tokenizer->_input);
- gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
+ gumbo_debug("Lexing character '%c' (%d) in state %d.\n",
+ c, c, tokenizer->_state);
  StateResult result =
  dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
  // We need to clear reconsume_current_input before returning to prevent
data/gumbo-parser/src/utf8.c CHANGED
@@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) {
  decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
  if (state == UTF8_ACCEPT) {
  iter->_width = c - iter->_start + 1;
- // This is the special handling for carriage returns that is mandated by the
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
- // we operate in terms of chars and only need a check for iter overrun,
- // instead of having to read in a full next code point.
+ // This is the special handling for carriage returns that is mandated by
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
+ // characters, we operate in terms of chars and only need a check for iter
+ // overrun, instead of having to read in a full next code point.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
  if (code_point == '\r') {
  assert(iter->_width == 1);
@@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) {
  return;
  }
  }
- // If we got here without exiting early, then we've reached the end of the iterator.
- // Add an error for truncated input, set the width to consume the rest of the
- // iterator, and emit a replacement character. The next time we enter this method,
- // it will detect that there's no input to consume and
+ // If we got here without exiting early, then we've reached the end of the
+ // iterator. Add an error for truncated input, set the width to consume the
+ // rest of the iterator, and emit a replacement character. The next time we
+ // enter this method, it will detect that there's no input to consume and
+ // output an EOF.
  iter->_current = kUtf8ReplacementChar;
  iter->_width = iter->_end - iter->_start;
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
data/gumbo-parser/src/vector.c CHANGED
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
  return vector->data[--vector->length];
  }

- int gumbo_vector_index_of(GumboVector* vector, void* element) {
+ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
  for (int i = 0; i < vector->length; ++i) {
  if (vector->data[i] == element) {
  return i;
data/gumbo-parser/visualc/include/strings.h CHANGED
@@ -1,3 +1,4 @@
  /*Dummy file to satisfy source file dependencies on Windows platform*/
  #define strcasecmp _stricmp
- #define strncasecmp _strnicmp
+ #define strncasecmp _strnicmp
+ #define inline __inline
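The new define mirrors the strcasecmp shims above it: older MSVC releases compile C sources with a compiler that rejects the C99 inline keyword and only understands __inline, so any vendored C that uses inline needs the alias on Windows. A small sketch of the pattern the shim enables, using a helper modeled on ensure_lowercase from the tokenizer hunk above (the function name here is illustrative, not from the gem):

#if defined(_MSC_VER) && !defined(__cplusplus)
#define inline __inline  /* same idea as the strings.h shim */
#endif

static inline int ensure_lowercase_ascii(int c) {
  return (c >= 'A' && c <= 'Z') ? c + 0x20 : c;
}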
data/test-nokogumbo.rb ADDED
@@ -0,0 +1,140 @@
+ $:.unshift('lib')
+ $:.unshift('ext/nokogumboc')
+
+ gem 'minitest'
+
+ require 'nokogumbo'
+ require 'minitest/autorun'
+
+ class TestNokogumbo < Minitest::Test
+ def test_element_text
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "content", doc.at('span').text
+ end
+
+ def test_element_cdata
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "foo<x>bar", doc.at('textarea').text.strip
+ end
+
+ def test_attr_value
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "utf-8", doc.at('meta')['charset']
+ end
+
+ def test_comment
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal " test comment ", doc.xpath('//comment()').text
+ end
+
+ def test_unknown_element
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "main", doc.at('main').name
+ end
+
+ def test_IO
+ require 'stringio'
+ doc = Nokogiri::HTML5(StringIO.new(buffer))
+ assert_equal 'textarea', doc.at('form').element_children.first.name
+ end
+
+ def test_nil
+ doc = Nokogiri::HTML5(nil)
+ assert_equal 1, doc.search('body').count
+ end
+
+ if ''.respond_to? 'encoding'
+ def test_macroman_encoding
+ mac="<span>\xCA</span>".force_encoding('macroman')
+ doc = Nokogiri::HTML5(mac)
+ assert_equal '<span>&#xA0;</span>', doc.at('span').to_xml
+ end
+
+ def test_iso8859_encoding
+ iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
+ doc = Nokogiri::HTML5(iso8859)
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
+ end
+
+ def test_charset_encoding
+ utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
+ force_encoding(Encoding::ASCII_8BIT)
+ doc = Nokogiri::HTML5(utf8)
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
+ end
+
+ def test_bogus_encoding
+ bogus="<meta charset='bogus'><span>Se\xF1or</span>".
+ force_encoding(Encoding::ASCII_8BIT)
+ doc = Nokogiri::HTML5(bogus)
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
+ end
+ end
+
+ def test_html5_doctype
+ doc = Nokogumbo.parse("<!DOCTYPE html><html></html>")
+ assert_match /<!DOCTYPE html>/, doc.to_html
+ end
+
+ def test_fragment_head
+ doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
+ assert_equal "hello world", doc.xpath('title').text
+ assert_equal "utf-8", doc.xpath('meta').first['charset']
+ end
+
+ def test_fragment_body
+ doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
+ assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
+ assert_equal " test comment ", doc.xpath('comment()').text
+ end
+
+ def test_xlink_attribute
+ source = <<-EOF.gsub(/^ {6}/, '')
+ <svg xmlns="http://www.w3.org/2000/svg">
+ <a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
+ </svg>
+ EOF
+ doc = Nokogiri::HTML5.fragment(source)
+ a = doc.at('a')
+ assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
+ end
+
+ def test_template
+ source = <<-EOF.gsub(/^ {6}/, '')
+ <template id="productrow">
+ <tr>
+ <td class="record"></td>
+ <td></td>
+ </tr>
+ </template>
+ EOF
+ doc = Nokogiri::HTML5.fragment(source)
+ template = doc.at('template')
+ assert_equal "productrow", template['id']
+ assert_equal "record", template.at('td')['class']
+ end
+
+ private
+
+ def buffer
+ <<-EOF.gsub(/^ /, '')
+ <html>
+ <head>
+ <meta charset="utf-8"/>
+ <title>hello world</title>
+ </head>
+ <body>
+ <h1>hello world</h1>
+ <main>
+ <span>content</span>
+ </main>
+ <!-- test comment -->
+ <form>
+ <textarea>foo<x>bar</textarea>
+ </form>
+ </body>
+ </html>
+ EOF
+ end
+
+ end
metadata CHANGED
@@ -1,32 +1,32 @@
  --- !ruby/object:Gem::Specification
  name: nokogumbo
  version: !ruby/object:Gem::Version
- version: 1.3.0
  prerelease:
+ version: 1.4.1
  platform: ruby
  authors:
  - Sam Ruby
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-01-02 00:00:00.000000000 Z
+ date: 2015-03-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
- name: nokogiri
- requirement: !ruby/object:Gem::Requirement
- none: false
+ version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ none: false
  type: :runtime
+ name: nokogiri
  prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- none: false
+ requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ none: false
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
  access the result as a Nokogiri parsed document.
  email: rubys@intertwingly.net
@@ -56,6 +56,11 @@ files:
  - gumbo-parser/src/string_piece.c
  - gumbo-parser/src/string_piece.h
  - gumbo-parser/src/tag.c
+ - gumbo-parser/src/tag.in
+ - gumbo-parser/src/tag_enum.h
+ - gumbo-parser/src/tag_gperf.h
+ - gumbo-parser/src/tag_sizes.h
+ - gumbo-parser/src/tag_strings.h
  - gumbo-parser/src/token_type.h
  - gumbo-parser/src/tokenizer.c
  - gumbo-parser/src/tokenizer.h
@@ -67,6 +72,7 @@ files:
  - gumbo-parser/src/vector.c
  - gumbo-parser/src/vector.h
  - gumbo-parser/visualc/include/strings.h
+ - test-nokogumbo.rb
  homepage: https://github.com/rubys/nokogumbo/#readme
  licenses:
  - Apache 2.0
@@ -75,20 +81,20 @@ rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
- required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
+ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ none: false
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.23
+ rubygems_version: 1.8.23.2
  signing_key:
  specification_version: 3
  summary: Nokogiri interface to the Gumbo HTML5 parser