nokogumbo 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/gumbo-parser/src/token_type.h CHANGED
@@ -29,6 +29,7 @@ typedef enum {
  GUMBO_TOKEN_COMMENT,
  GUMBO_TOKEN_WHITESPACE,
  GUMBO_TOKEN_CHARACTER,
+ GUMBO_TOKEN_CDATA,
  GUMBO_TOKEN_NULL,
  GUMBO_TOKEN_EOF
  } GumboTokenType;
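The tokenizer gains a dedicated CDATA token type, so any code that switches over GumboTokenType now has one more case to consider when deciding what counts as text. A minimal sketch, assuming gumbo-parser/src is on the include path (these headers are internal to the vendored parser and are not installed by the gem):

#include <stdbool.h>
#include "token_type.h"

/* Text-bearing token types now include the new CDATA member; a switch
   written against 1.3.0 would silently fall through to its default case. */
static bool is_text_token(GumboTokenType type) {
  switch (type) {
    case GUMBO_TOKEN_WHITESPACE:
    case GUMBO_TOKEN_CHARACTER:
    case GUMBO_TOKEN_CDATA:   /* new in 1.4.x */
      return true;
    default:
      return false;
  }
}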
data/gumbo-parser/src/tokenizer.c CHANGED
@@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState {
  // markup declaration state.
  bool _is_current_node_foreign;

+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
+ bool _is_in_cdata;
+
  // Certain states (notably character references) may emit two character tokens
  // at once, but the contract for lex() fills in only one token at a time. The
  // extra character is buffered here, and then this is checked on entry to
@@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
  return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
  }

- static GumboTokenType get_char_token_type(int c) {
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
+ if (is_in_cdata && c > 0) {
+ return GUMBO_TOKEN_CDATA;
+ }
+
  switch (c) {
  case '\t':
  case '\n':
@@ -348,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  assert(!tokenizer->_temporary_buffer_emit);
  utf8iterator_mark(&tokenizer->_input);
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
+ gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
  // The temporary buffer and script data buffer are the same object in the
  // spec, so the script data buffer should be cleared as well.
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
  }

  // Appends a codepoint to the temporary buffer.
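gumbo_string_buffer_clear is new in this release and its definition is not part of this hunk, but replacing a destroy-and-reinit pair with a single clear call suggests the buffer's allocation is now reused instead of being freed and reallocated on every markup declaration. A hypothetical illustration of that pattern (not the library's actual string_buffer.c code):

#include <stddef.h>

typedef struct {
  char* data;
  size_t length;
  size_t capacity;
} StringBuffer;  /* stand-in for GumboStringBuffer */

static void string_buffer_clear(StringBuffer* buffer) {
  /* Resetting the length keeps data and capacity, so the next append writes
     into the existing allocation rather than paying for the free() + malloc()
     round trip that destroy-then-init performed. */
  buffer->length = 0;
}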
@@ -475,7 +481,7 @@ static void finish_doctype_system_id(GumboParser* parser) {

  // Writes a single specified character to the output token.
  static void emit_char(GumboParser* parser, int c, GumboToken* output) {
- output->type = get_char_token_type(c);
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
  output->v.character = c;
  finish_token(parser, output);
  }
@@ -689,7 +695,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
  gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);

  assert(tag_state->_attributes.data == NULL);
- gumbo_vector_init(parser, 4, &tag_state->_attributes);
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
+ gumbo_vector_init(parser, 1, &tag_state->_attributes);
  tag_state->_drop_next_attr_value = false;
  tag_state->_is_start_tag = is_start_tag;
  tag_state->_is_self_closing = false;
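The new comment carries the reasoning for shrinking the initial capacity from 4 to 1: almost all tags have zero attributes, so the old four-slot allocation was usually wasted. The sketch below assumes a simple doubling growth policy (Gumbo's real policy lives in vector.c and may differ) and shows that the rare multi-attribute tag costs at most a couple of reallocations while the common case allocates a single slot:

#include <stdio.h>

/* Count reallocations a doubling vector needs to hold n_attrs entries. */
static unsigned reallocs_needed(unsigned initial_capacity, unsigned n_attrs) {
  unsigned capacity = initial_capacity;
  unsigned reallocs = 0;
  while (capacity < n_attrs) {
    capacity *= 2;
    ++reallocs;
  }
  return reallocs;
}

int main(void) {
  for (unsigned n = 0; n <= 4; ++n) {
    printf("%u attribute(s): %u reallocation(s) beyond the initial slot\n",
           n, reallocs_needed(1, n));
  }
  return 0;
}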
@@ -743,11 +753,9 @@ static void finish_tag_name(GumboParser* parser) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  GumboTagState* tag_state = &tokenizer->_tag_state;

- const char* temp;
- copy_over_tag_buffer(parser, &temp);
- tag_state->_tag = gumbo_tag_enum(temp);
+ tag_state->_tag = gumbo_tagn_enum(
+ tag_state->_buffer.data, tag_state->_buffer.length);
  reinitialize_tag_buffer(parser);
- gumbo_parser_deallocate(parser, (void*) temp);
  }

  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
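gumbo_tagn_enum takes an explicit (pointer, length) pair, which is why the copy into a null-terminated temporary and the matching deallocation disappear here and in the is_appropriate_end_tag hunk below. A hypothetical length-aware lookup in the same spirit (the gem's real lookup appears to be gperf-generated, per the tag_gperf.h file added in the metadata at the end of this diff):

#include <string.h>

/* Return the index of the tag name matching an unterminated buffer slice,
   or -1 if it is not in the table. */
static int tag_index(const char* data, size_t length,
                     const char* const* names, size_t n_names) {
  for (size_t i = 0; i < n_names; ++i) {
    if (strlen(names[i]) == length && memcmp(names[i], data, length) == 0) {
      return (int) i;
    }
  }
  return -1;
}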
@@ -833,13 +841,9 @@ static void finish_attribute_value(GumboParser* parser) {
  static bool is_appropriate_end_tag(GumboParser* parser) {
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
  assert(!tag_state->_is_start_tag);
- // Null terminate the current string buffer, so it can be passed to
- // gumbo_tag_enum, but don't increment the length in case we need to dump the
- // buffer as character tokens.
- gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
- --tag_state->_buffer.length;
  return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
- tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
+ tag_state->_last_start_tag ==
+ gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
  }

  void gumbo_tokenizer_state_init(
@@ -850,6 +854,7 @@ void gumbo_tokenizer_state_init(
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
  tokenizer->_reconsume_current_input = false;
  tokenizer->_is_current_node_foreign = false;
+ tokenizer->_is_in_cdata = false;
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;

  tokenizer->_buffered_emit_char = kGumboNoChar;
@@ -1588,8 +1593,7 @@ static StateResult handle_script_double_escaped_lt_state(
  int c, GumboToken* output) {
  if (c == '/') {
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
  return emit_current_char(parser, output);
  } else {
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -2041,6 +2045,7 @@ static StateResult handle_markup_declaration_state(
  utf8iterator_maybe_consume_match(
  &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
+ tokenizer->_is_in_cdata = true;
  tokenizer->_reconsume_current_input = true;
  } else {
  tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
@@ -2568,7 +2573,8 @@ static StateResult handle_after_doctype_public_id_state(
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
  tokenizer->_reconsume_current_input = true;
  tokenizer->_doc_type_state.force_quirks = true;
- return NEXT_CHAR;
+ emit_doctype(parser, output);
+ return RETURN_ERROR;
  default:
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
@@ -2813,6 +2819,7 @@ static StateResult handle_cdata_state(
  tokenizer->_reconsume_current_input = true;
  reset_token_start_point(tokenizer);
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
+ tokenizer->_is_in_cdata = false;
  return NEXT_CHAR;
  } else {
  return emit_current_char(parser, output);
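Taken together, the CDATA changes set _is_in_cdata when the markup declaration handler consumes "<![CDATA[", clear it again when the CDATA state returns to the data state, and route every character emitted in between through the new GUMBO_TOKEN_CDATA type. A sketch of how this can surface through the public C API, assuming the vendored gumbo.h and sources are compiled in; whether a given build reports the section as a GUMBO_NODE_CDATA node or as plain text is worth verifying rather than assuming:

#include <stdio.h>
#include "gumbo.h"

/* Recursively print the text of any CDATA nodes in the parse tree. */
static void print_cdata(const GumboNode* node) {
  if (node->type == GUMBO_NODE_CDATA) {
    printf("CDATA: %s\n", node->v.text.text);
    return;
  }
  if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_DOCUMENT) {
    return;
  }
  const GumboVector* children = node->type == GUMBO_NODE_DOCUMENT
      ? &node->v.document.children
      : &node->v.element.children;
  for (unsigned int i = 0; i < children->length; ++i) {
    print_cdata((const GumboNode*) children->data[i]);
  }
}

int main(void) {
  /* CDATA sections are only tokenized as such in foreign content (SVG/MathML). */
  GumboOutput* output =
      gumbo_parse("<!DOCTYPE html><svg><![CDATA[a < b]]></svg>");
  print_cdata(output->root);
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  return 0;
}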
@@ -2929,7 +2936,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  assert(!tokenizer->_temporary_buffer_emit);
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
  int c = utf8iterator_current(&tokenizer->_input);
- gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
+ gumbo_debug("Lexing character '%c' (%d) in state %d.\n",
+ c, c, tokenizer->_state);
  StateResult result =
  dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
  // We need to clear reconsume_current_input before returning to prevent
data/gumbo-parser/src/utf8.c CHANGED
@@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) {
  decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
  if (state == UTF8_ACCEPT) {
  iter->_width = c - iter->_start + 1;
- // This is the special handling for carriage returns that is mandated by the
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
- // we operate in terms of chars and only need a check for iter overrun,
- // instead of having to read in a full next code point.
+ // This is the special handling for carriage returns that is mandated by
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
+ // characters, we operate in terms of chars and only need a check for iter
+ // overrun, instead of having to read in a full next code point.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
  if (code_point == '\r') {
  assert(iter->_width == 1);
@@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) {
  return;
  }
  }
- // If we got here without exiting early, then we've reached the end of the iterator.
- // Add an error for truncated input, set the width to consume the rest of the
- // iterator, and emit a replacement character. The next time we enter this method,
- // it will detect that there's no input to consume and
+ // If we got here without exiting early, then we've reached the end of the
+ // iterator. Add an error for truncated input, set the width to consume the
+ // rest of the iterator, and emit a replacement character. The next time we
+ // enter this method, it will detect that there's no input to consume and
+ // output an EOF.
  iter->_current = kUtf8ReplacementChar;
  iter->_width = iter->_end - iter->_start;
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
data/gumbo-parser/src/vector.c CHANGED
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
  return vector->data[--vector->length];
  }

- int gumbo_vector_index_of(GumboVector* vector, void* element) {
+ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
  for (int i = 0; i < vector->length; ++i) {
  if (vector->data[i] == element) {
  return i;
data/gumbo-parser/visualc/include/strings.h CHANGED
@@ -1,3 +1,4 @@
  /*Dummy file to satisfy source file dependencies on Windows platform*/
  #define strcasecmp _stricmp
- #define strncasecmp _strnicmp
+ #define strncasecmp _strnicmp
+ #define inline __inline
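The new define mirrors the strcasecmp shims above it: older MSVC releases compile C sources with a compiler that rejects the C99 inline keyword and only understands __inline, so any vendored C that uses inline needs the alias on Windows. A small sketch of the pattern the shim enables, using a helper modeled on ensure_lowercase from the tokenizer hunk above (the function name here is illustrative, not from the gem):

#if defined(_MSC_VER) && !defined(__cplusplus)
#define inline __inline  /* same idea as the strings.h shim */
#endif

static inline int ensure_lowercase_ascii(int c) {
  return (c >= 'A' && c <= 'Z') ? c + 0x20 : c;
}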
data/test-nokogumbo.rb ADDED
@@ -0,0 +1,140 @@
+ $:.unshift('lib')
+ $:.unshift('ext/nokogumboc')
+
+ gem 'minitest'
+
+ require 'nokogumbo'
+ require 'minitest/autorun'
+
+ class TestNokogumbo < Minitest::Test
+ def test_element_text
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "content", doc.at('span').text
+ end
+
+ def test_element_cdata
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "foo<x>bar", doc.at('textarea').text.strip
+ end
+
+ def test_attr_value
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "utf-8", doc.at('meta')['charset']
+ end
+
+ def test_comment
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal " test comment ", doc.xpath('//comment()').text
+ end
+
+ def test_unknown_element
+ doc = Nokogiri::HTML5(buffer)
+ assert_equal "main", doc.at('main').name
+ end
+
+ def test_IO
+ require 'stringio'
+ doc = Nokogiri::HTML5(StringIO.new(buffer))
+ assert_equal 'textarea', doc.at('form').element_children.first.name
+ end
+
+ def test_nil
+ doc = Nokogiri::HTML5(nil)
+ assert_equal 1, doc.search('body').count
+ end
+
+ if ''.respond_to? 'encoding'
+ def test_macroman_encoding
+ mac="<span>\xCA</span>".force_encoding('macroman')
+ doc = Nokogiri::HTML5(mac)
+ assert_equal '<span>&#xA0;</span>', doc.at('span').to_xml
+ end
+
+ def test_iso8859_encoding
+ iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
+ doc = Nokogiri::HTML5(iso8859)
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
+ end
+
+ def test_charset_encoding
+ utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
+ force_encoding(Encoding::ASCII_8BIT)
+ doc = Nokogiri::HTML5(utf8)
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
+ end
+
+ def test_bogus_encoding
+ bogus="<meta charset='bogus'><span>Se\xF1or</span>".
+ force_encoding(Encoding::ASCII_8BIT)
+ doc = Nokogiri::HTML5(bogus)
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
+ end
+ end
+
+ def test_html5_doctype
+ doc = Nokogumbo.parse("<!DOCTYPE html><html></html>")
+ assert_match /<!DOCTYPE html>/, doc.to_html
+ end
+
+ def test_fragment_head
+ doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
+ assert_equal "hello world", doc.xpath('title').text
+ assert_equal "utf-8", doc.xpath('meta').first['charset']
+ end
+
+ def test_fragment_body
+ doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
+ assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
+ assert_equal " test comment ", doc.xpath('comment()').text
+ end
+
+ def test_xlink_attribute
+ source = <<-EOF.gsub(/^ {6}/, '')
+ <svg xmlns="http://www.w3.org/2000/svg">
+ <a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
+ </svg>
+ EOF
+ doc = Nokogiri::HTML5.fragment(source)
+ a = doc.at('a')
+ assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
+ end
+
+ def test_template
+ source = <<-EOF.gsub(/^ {6}/, '')
+ <template id="productrow">
+ <tr>
+ <td class="record"></td>
+ <td></td>
+ </tr>
+ </template>
+ EOF
+ doc = Nokogiri::HTML5.fragment(source)
+ template = doc.at('template')
+ assert_equal "productrow", template['id']
+ assert_equal "record", template.at('td')['class']
+ end
+
+ private
+
+ def buffer
+ <<-EOF.gsub(/^ /, '')
+ <html>
+ <head>
+ <meta charset="utf-8"/>
+ <title>hello world</title>
+ </head>
+ <body>
+ <h1>hello world</h1>
+ <main>
+ <span>content</span>
+ </main>
+ <!-- test comment -->
+ <form>
+ <textarea>foo<x>bar</textarea>
+ </form>
+ </body>
+ </html>
+ EOF
+ end
+
+ end
metadata CHANGED
@@ -1,32 +1,32 @@
  --- !ruby/object:Gem::Specification
  name: nokogumbo
  version: !ruby/object:Gem::Version
- version: 1.3.0
  prerelease:
+ version: 1.4.1
  platform: ruby
  authors:
  - Sam Ruby
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-01-02 00:00:00.000000000 Z
+ date: 2015-03-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
- name: nokogiri
- requirement: !ruby/object:Gem::Requirement
- none: false
+ version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ none: false
  type: :runtime
+ name: nokogiri
  prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- none: false
+ requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ none: false
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
  access the result as a Nokogiri parsed document.
  email: rubys@intertwingly.net
@@ -56,6 +56,11 @@ files:
  - gumbo-parser/src/string_piece.c
  - gumbo-parser/src/string_piece.h
  - gumbo-parser/src/tag.c
+ - gumbo-parser/src/tag.in
+ - gumbo-parser/src/tag_enum.h
+ - gumbo-parser/src/tag_gperf.h
+ - gumbo-parser/src/tag_sizes.h
+ - gumbo-parser/src/tag_strings.h
  - gumbo-parser/src/token_type.h
  - gumbo-parser/src/tokenizer.c
  - gumbo-parser/src/tokenizer.h
@@ -67,6 +72,7 @@ files:
  - gumbo-parser/src/vector.c
  - gumbo-parser/src/vector.h
  - gumbo-parser/visualc/include/strings.h
+ - test-nokogumbo.rb
  homepage: https://github.com/rubys/nokogumbo/#readme
  licenses:
  - Apache 2.0
@@ -75,20 +81,20 @@ rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
- required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
+ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ none: false
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.23
+ rubygems_version: 1.8.23.2
  signing_key:
  specification_version: 3
  summary: Nokogiri interface to the Gumbo HTML5 parser