nokogumbo 1.3.0 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -29,6 +29,7 @@ typedef enum {
29
29
  GUMBO_TOKEN_COMMENT,
30
30
  GUMBO_TOKEN_WHITESPACE,
31
31
  GUMBO_TOKEN_CHARACTER,
32
+ GUMBO_TOKEN_CDATA,
32
33
  GUMBO_TOKEN_NULL,
33
34
  GUMBO_TOKEN_EOF
34
35
  } GumboTokenType;
@@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState {
136
136
  // markup declaration state.
137
137
  bool _is_current_node_foreign;
138
138
 
139
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
140
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
141
+ bool _is_in_cdata;
142
+
139
143
  // Certain states (notably character references) may emit two character tokens
140
144
  // at once, but the contract for lex() fills in only one token at a time. The
141
145
  // extra character is buffered here, and then this is checked on entry to
@@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
315
319
  return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
316
320
  }
317
321
 
318
- static GumboTokenType get_char_token_type(int c) {
322
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323
+ if (is_in_cdata && c > 0) {
324
+ return GUMBO_TOKEN_CDATA;
325
+ }
326
+
319
327
  switch (c) {
320
328
  case '\t':
321
329
  case '\n':
@@ -348,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) {
348
356
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
349
357
  assert(!tokenizer->_temporary_buffer_emit);
350
358
  utf8iterator_mark(&tokenizer->_input);
351
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
352
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
359
+ gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
353
360
  // The temporary buffer and script data buffer are the same object in the
354
361
  // spec, so the script data buffer should be cleared as well.
355
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
356
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
362
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
357
363
  }
358
364
 
359
365
  // Appends a codepoint to the temporary buffer.
@@ -475,7 +481,7 @@ static void finish_doctype_system_id(GumboParser* parser) {
475
481
 
476
482
  // Writes a single specified character to the output token.
477
483
  static void emit_char(GumboParser* parser, int c, GumboToken* output) {
478
- output->type = get_char_token_type(c);
484
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
479
485
  output->v.character = c;
480
486
  finish_token(parser, output);
481
487
  }
@@ -689,7 +695,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
689
695
  gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
690
696
 
691
697
  assert(tag_state->_attributes.data == NULL);
692
- gumbo_vector_init(parser, 4, &tag_state->_attributes);
698
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
699
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
700
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
701
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
702
+ gumbo_vector_init(parser, 1, &tag_state->_attributes);
693
703
  tag_state->_drop_next_attr_value = false;
694
704
  tag_state->_is_start_tag = is_start_tag;
695
705
  tag_state->_is_self_closing = false;
@@ -743,11 +753,9 @@ static void finish_tag_name(GumboParser* parser) {
743
753
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
744
754
  GumboTagState* tag_state = &tokenizer->_tag_state;
745
755
 
746
- const char* temp;
747
- copy_over_tag_buffer(parser, &temp);
748
- tag_state->_tag = gumbo_tag_enum(temp);
756
+ tag_state->_tag = gumbo_tagn_enum(
757
+ tag_state->_buffer.data, tag_state->_buffer.length);
749
758
  reinitialize_tag_buffer(parser);
750
- gumbo_parser_deallocate(parser, (void*) temp);
751
759
  }
752
760
 
753
761
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
@@ -833,13 +841,9 @@ static void finish_attribute_value(GumboParser* parser) {
833
841
  static bool is_appropriate_end_tag(GumboParser* parser) {
834
842
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
835
843
  assert(!tag_state->_is_start_tag);
836
- // Null terminate the current string buffer, so it can be passed to
837
- // gumbo_tag_enum, but don't increment the length in case we need to dump the
838
- // buffer as character tokens.
839
- gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
840
- --tag_state->_buffer.length;
841
844
  return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
842
- tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
845
+ tag_state->_last_start_tag ==
846
+ gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
843
847
  }
844
848
 
845
849
  void gumbo_tokenizer_state_init(
@@ -850,6 +854,7 @@ void gumbo_tokenizer_state_init(
850
854
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
855
  tokenizer->_reconsume_current_input = false;
852
856
  tokenizer->_is_current_node_foreign = false;
857
+ tokenizer->_is_in_cdata = false;
853
858
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
854
859
 
855
860
  tokenizer->_buffered_emit_char = kGumboNoChar;
@@ -1588,8 +1593,7 @@ static StateResult handle_script_double_escaped_lt_state(
1588
1593
  int c, GumboToken* output) {
1589
1594
  if (c == '/') {
1590
1595
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1591
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1592
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1596
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1593
1597
  return emit_current_char(parser, output);
1594
1598
  } else {
1595
1599
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -2041,6 +2045,7 @@ static StateResult handle_markup_declaration_state(
2041
2045
  utf8iterator_maybe_consume_match(
2042
2046
  &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2043
2047
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2048
+ tokenizer->_is_in_cdata = true;
2044
2049
  tokenizer->_reconsume_current_input = true;
2045
2050
  } else {
2046
2051
  tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
@@ -2568,7 +2573,8 @@ static StateResult handle_after_doctype_public_id_state(
2568
2573
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2569
2574
  tokenizer->_reconsume_current_input = true;
2570
2575
  tokenizer->_doc_type_state.force_quirks = true;
2571
- return NEXT_CHAR;
2576
+ emit_doctype(parser, output);
2577
+ return RETURN_ERROR;
2572
2578
  default:
2573
2579
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2574
2580
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
@@ -2813,6 +2819,7 @@ static StateResult handle_cdata_state(
2813
2819
  tokenizer->_reconsume_current_input = true;
2814
2820
  reset_token_start_point(tokenizer);
2815
2821
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2822
+ tokenizer->_is_in_cdata = false;
2816
2823
  return NEXT_CHAR;
2817
2824
  } else {
2818
2825
  return emit_current_char(parser, output);
@@ -2929,7 +2936,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2929
2936
  assert(!tokenizer->_temporary_buffer_emit);
2930
2937
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2931
2938
  int c = utf8iterator_current(&tokenizer->_input);
2932
- gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
2939
+ gumbo_debug("Lexing character '%c' (%d) in state %d.\n",
2940
+ c, c, tokenizer->_state);
2933
2941
  StateResult result =
2934
2942
  dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2935
2943
  // We need to clear reconsume_current_input before returning to prevent
@@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) {
133
133
  decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
134
134
  if (state == UTF8_ACCEPT) {
135
135
  iter->_width = c - iter->_start + 1;
136
- // This is the special handling for carriage returns that is mandated by the
137
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
138
- // we operate in terms of chars and only need a check for iter overrun,
139
- // instead of having to read in a full next code point.
136
+ // This is the special handling for carriage returns that is mandated by
137
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
138
+ // characters, we operate in terms of chars and only need a check for iter
139
+ // overrun, instead of having to read in a full next code point.
140
140
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
141
141
  if (code_point == '\r') {
142
142
  assert(iter->_width == 1);
@@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) {
165
165
  return;
166
166
  }
167
167
  }
168
- // If we got here without exiting early, then we've reached the end of the iterator.
169
- // Add an error for truncated input, set the width to consume the rest of the
170
- // iterator, and emit a replacement character. The next time we enter this method,
171
- // it will detect that there's no input to consume and
168
+ // If we got here without exiting early, then we've reached the end of the
169
+ // iterator. Add an error for truncated input, set the width to consume the
170
+ // rest of the iterator, and emit a replacement character. The next time we
171
+ // enter this method, it will detect that there's no input to consume and
172
+ // output an EOF.
172
173
  iter->_current = kUtf8ReplacementChar;
173
174
  iter->_width = iter->_end - iter->_start;
174
175
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
81
81
  return vector->data[--vector->length];
82
82
  }
83
83
 
84
- int gumbo_vector_index_of(GumboVector* vector, void* element) {
84
+ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
85
85
  for (int i = 0; i < vector->length; ++i) {
86
86
  if (vector->data[i] == element) {
87
87
  return i;
@@ -1,3 +1,4 @@
1
1
  /*Dummy file to satisfy source file dependencies on Windows platform*/
2
2
  #define strcasecmp _stricmp
3
- #define strncasecmp _strnicmp
3
+ #define strncasecmp _strnicmp
4
+ #define inline __inline
data/test-nokogumbo.rb ADDED
@@ -0,0 +1,140 @@
1
+ $:.unshift('lib')
2
+ $:.unshift('ext/nokogumboc')
3
+
4
+ gem 'minitest'
5
+
6
+ require 'nokogumbo'
7
+ require 'minitest/autorun'
8
+
9
+ class TestNokogumbo < Minitest::Test
10
+ def test_element_text
11
+ doc = Nokogiri::HTML5(buffer)
12
+ assert_equal "content", doc.at('span').text
13
+ end
14
+
15
+ def test_element_cdata
16
+ doc = Nokogiri::HTML5(buffer)
17
+ assert_equal "foo<x>bar", doc.at('textarea').text.strip
18
+ end
19
+
20
+ def test_attr_value
21
+ doc = Nokogiri::HTML5(buffer)
22
+ assert_equal "utf-8", doc.at('meta')['charset']
23
+ end
24
+
25
+ def test_comment
26
+ doc = Nokogiri::HTML5(buffer)
27
+ assert_equal " test comment ", doc.xpath('//comment()').text
28
+ end
29
+
30
+ def test_unknown_element
31
+ doc = Nokogiri::HTML5(buffer)
32
+ assert_equal "main", doc.at('main').name
33
+ end
34
+
35
+ def test_IO
36
+ require 'stringio'
37
+ doc = Nokogiri::HTML5(StringIO.new(buffer))
38
+ assert_equal 'textarea', doc.at('form').element_children.first.name
39
+ end
40
+
41
+ def test_nil
42
+ doc = Nokogiri::HTML5(nil)
43
+ assert_equal 1, doc.search('body').count
44
+ end
45
+
46
+ if ''.respond_to? 'encoding'
47
+ def test_macroman_encoding
48
+ mac="<span>\xCA</span>".force_encoding('macroman')
49
+ doc = Nokogiri::HTML5(mac)
50
+ assert_equal '<span>&#xA0;</span>', doc.at('span').to_xml
51
+ end
52
+
53
+ def test_iso8859_encoding
54
+ iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
55
+ doc = Nokogiri::HTML5(iso8859)
56
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
57
+ end
58
+
59
+ def test_charset_encoding
60
+ utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
61
+ force_encoding(Encoding::ASCII_8BIT)
62
+ doc = Nokogiri::HTML5(utf8)
63
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
64
+ end
65
+
66
+ def test_bogus_encoding
67
+ bogus="<meta charset='bogus'><span>Se\xF1or</span>".
68
+ force_encoding(Encoding::ASCII_8BIT)
69
+ doc = Nokogiri::HTML5(bogus)
70
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
71
+ end
72
+ end
73
+
74
+ def test_html5_doctype
75
+ doc = Nokogumbo.parse("<!DOCTYPE html><html></html>")
76
+ assert_match /<!DOCTYPE html>/, doc.to_html
77
+ end
78
+
79
+ def test_fragment_head
80
+ doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
81
+ assert_equal "hello world", doc.xpath('title').text
82
+ assert_equal "utf-8", doc.xpath('meta').first['charset']
83
+ end
84
+
85
+ def test_fragment_body
86
+ doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
87
+ assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
88
+ assert_equal " test comment ", doc.xpath('comment()').text
89
+ end
90
+
91
+ def test_xlink_attribute
92
+ source = <<-EOF.gsub(/^ {6}/, '')
93
+ <svg xmlns="http://www.w3.org/2000/svg">
94
+ <a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
95
+ </svg>
96
+ EOF
97
+ doc = Nokogiri::HTML5.fragment(source)
98
+ a = doc.at('a')
99
+ assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
100
+ end
101
+
102
+ def test_template
103
+ source = <<-EOF.gsub(/^ {6}/, '')
104
+ <template id="productrow">
105
+ <tr>
106
+ <td class="record"></td>
107
+ <td></td>
108
+ </tr>
109
+ </template>
110
+ EOF
111
+ doc = Nokogiri::HTML5.fragment(source)
112
+ template = doc.at('template')
113
+ assert_equal "productrow", template['id']
114
+ assert_equal "record", template.at('td')['class']
115
+ end
116
+
117
+ private
118
+
119
+ def buffer
120
+ <<-EOF.gsub(/^ /, '')
121
+ <html>
122
+ <head>
123
+ <meta charset="utf-8"/>
124
+ <title>hello world</title>
125
+ </head>
126
+ <body>
127
+ <h1>hello world</h1>
128
+ <main>
129
+ <span>content</span>
130
+ </main>
131
+ <!-- test comment -->
132
+ <form>
133
+ <textarea>foo<x>bar</textarea>
134
+ </form>
135
+ </body>
136
+ </html>
137
+ EOF
138
+ end
139
+
140
+ end
metadata CHANGED
@@ -1,32 +1,32 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
5
4
  prerelease:
5
+ version: 1.4.1
6
6
  platform: ruby
7
7
  authors:
8
8
  - Sam Ruby
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-01-02 00:00:00.000000000 Z
12
+ date: 2015-03-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: nokogiri
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
15
+ version_requirements: !ruby/object:Gem::Requirement
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
20
+ none: false
22
21
  type: :runtime
22
+ name: nokogiri
23
23
  prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
24
+ requirement: !ruby/object:Gem::Requirement
26
25
  requirements:
27
26
  - - ! '>='
28
27
  - !ruby/object:Gem::Version
29
28
  version: '0'
29
+ none: false
30
30
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
31
31
  access the result as a Nokogiri parsed document.
32
32
  email: rubys@intertwingly.net
@@ -56,6 +56,11 @@ files:
56
56
  - gumbo-parser/src/string_piece.c
57
57
  - gumbo-parser/src/string_piece.h
58
58
  - gumbo-parser/src/tag.c
59
+ - gumbo-parser/src/tag.in
60
+ - gumbo-parser/src/tag_enum.h
61
+ - gumbo-parser/src/tag_gperf.h
62
+ - gumbo-parser/src/tag_sizes.h
63
+ - gumbo-parser/src/tag_strings.h
59
64
  - gumbo-parser/src/token_type.h
60
65
  - gumbo-parser/src/tokenizer.c
61
66
  - gumbo-parser/src/tokenizer.h
@@ -67,6 +72,7 @@ files:
67
72
  - gumbo-parser/src/vector.c
68
73
  - gumbo-parser/src/vector.h
69
74
  - gumbo-parser/visualc/include/strings.h
75
+ - test-nokogumbo.rb
70
76
  homepage: https://github.com/rubys/nokogumbo/#readme
71
77
  licenses:
72
78
  - Apache 2.0
@@ -75,20 +81,20 @@ rdoc_options: []
75
81
  require_paths:
76
82
  - lib
77
83
  required_ruby_version: !ruby/object:Gem::Requirement
78
- none: false
79
84
  requirements:
80
85
  - - ! '>='
81
86
  - !ruby/object:Gem::Version
82
87
  version: '0'
83
- required_rubygems_version: !ruby/object:Gem::Requirement
84
88
  none: false
89
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
90
  requirements:
86
91
  - - ! '>='
87
92
  - !ruby/object:Gem::Version
88
93
  version: '0'
94
+ none: false
89
95
  requirements: []
90
96
  rubyforge_project:
91
- rubygems_version: 1.8.23
97
+ rubygems_version: 1.8.23.2
92
98
  signing_key:
93
99
  specification_version: 3
94
100
  summary: Nokogiri interface to the Gumbo HTML5 parser