RubyGems - html_tokenizer - Versions diffs - 0.0.1 → 0.0.7 - Mend

html_tokenizer 0.0.1 → 0.0.7

Files changed (13)

checksums.yaml +5 -5
data/.gitignore +1 -0
data/Gemfile.lock +3 -3
data/ext/html_tokenizer_ext/extconf.rb +7 -1
data/ext/html_tokenizer_ext/parser.c +64 -38
data/ext/html_tokenizer_ext/parser.h +6 -0
data/ext/html_tokenizer_ext/tokenizer.c +55 -22
data/ext/html_tokenizer_ext/tokenizer.h +6 -0
data/html_tokenizer.gemspec +1 -1
data/lib/html_tokenizer.rb +3 -2
data/test/unit/parser_test.rb +48 -29
data/test/unit/tokenizer_test.rb +22 -1
metadata +13 -13

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: a3d58539284af566692b81cc4633af1137baabea
-  data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
+SHA256:
+  metadata.gz: 7f63f5699a8e9dc129392fa0d554196d9c2322c43f19cd21b353194b90d1c0f1
+  data.tar.gz: db308d4bb26d5181da91f9353e29d12d7aa822f02d1f0214959663516891781b
 SHA512:
-  metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
-  data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
+  metadata.gz: 7f70e313d9206393e094b38569584f2f5e95bf3cb8abbd840fd063652da7d66b4c4c3a29e55b900f8da4f3c672d312dedb3895cebc5727292287e60b41b3049d
+  data.tar.gz: 9999d3a9e5c51ac426cb33551f14dcba980b7ee45543c000422ea7c80fe6884b34297790f977f53b95d6d128a35d49bcfa052b8b005f52d33d1f1a675d599ec8

data/.gitignore CHANGED Viewed

@@ -30,6 +30,7 @@ tmp/
 *.i*86
 *.x86_64
 *.hex
+*.gem
 # Debug files
 *.dSYM/

data/Gemfile.lock CHANGED Viewed

@@ -1,13 +1,13 @@
 PATH
   remote: .
   specs:
-    html_tokenizer (0.0.1)
+    html_tokenizer (0.0.7)
 GEM
   remote: https://rubygems.org/
   specs:
     minitest (5.9.0)
-    rake (11.1.2)
+    rake (12.3.0)
     rake-compiler (0.9.9)
       rake
@@ -21,4 +21,4 @@ DEPENDENCIES
   rake-compiler
 BUNDLED WITH
-   1.12.3
+   1.16.1

data/ext/html_tokenizer_ext/extconf.rb CHANGED Viewed

@@ -1,6 +1,12 @@
 require 'mkmf'
 $CXXFLAGS += " -std=c++11 "
-$CXXFLAGS += " -g -Og -ggdb "
+$CXXFLAGS += " -g -O1 -ggdb "
+$CFLAGS += " -g -O1 -ggdb "
+if ENV['DEBUG']
+  $CXXFLAGS += "  -DDEBUG "
+  $CFLAGS += "  -DDEBUG "
+end
 create_makefile('html_tokenizer_ext')

data/ext/html_tokenizer_ext/parser.c CHANGED Viewed

@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/encoding.h>
 #include "html_tokenizer.h"
 #include "parser.h"
@@ -13,6 +14,7 @@ static void parser_free(void *ptr)
   size_t i;
   if(parser) {
+    tokenizer_free_members(&parser->tk);
     if(parser->doc.data) {
       DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
       xfree(parser->doc.data);
@@ -65,6 +67,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
   if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
     dest->type = src->type;
     dest->start = src->start;
+    dest->mb_start = src->mb_start;
     dest->length = src->length;
     dest->line_number = src->line_number;
     dest->column_number = src->column_number;
@@ -79,6 +82,8 @@ static void parser_add_error(struct parser_t *parser, const char *message)
 {
   REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
   parser->errors[parser->errors_count].message = strdup(message);
+  parser->errors[parser->errors_count].pos = parser->tk.scan.cursor;
+  parser->errors[parser->errors_count].mb_pos = parser->tk.scan.mb_cursor;
   parser->errors[parser->errors_count].line_number = parser->doc.line_number;
   parser->errors[parser->errors_count].column_number = parser->doc.column_number;
   parser->errors_count += 1;
@@ -362,15 +367,21 @@ static inline int rawtext_context(struct parser_t *parser)
 static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
 {
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   long unsigned int i;
+  const char *buf, *nextlf;
-  for(i = start;i < (start + length); i++) {
-    if(parser->doc.data[i] == '\n') {
+  for(i = 0; i < length;) {
+    buf = &parser->doc.data[start + i];
+    nextlf = memchr(buf, '\n', length - i);
+    if(nextlf) {
       parser->doc.column_number = 0;
       parser->doc.line_number += 1;
+      i += (nextlf - buf) + 1;
     }
     else {
-      parser->doc.column_number += 1;
+      parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
+      break;
     }
   }
@@ -383,11 +394,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
   struct token_reference_t ref = {
     .type = type,
     .start = tk->scan.cursor,
+    .mb_start = tk->scan.mb_cursor,
     .length = length,
     .line_number = parser->doc.line_number,
     .column_number = parser->doc.column_number,
   };
   int parse_again = 1;
+  long unsigned int mb_strlen;
+  rb_encoding *enc;
   while(parse_again) {
     switch(parser->context)
@@ -438,8 +452,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
   }
   if(rb_block_given_p()) {
+    enc = rb_enc_from_index(parser->doc.enc_index);
+    mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
     rb_yield_values(5, token_type_to_symbol(type),
-      INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
+      INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
       INT2NUM(ref.line_number), INT2NUM(ref.column_number));
   }
@@ -465,6 +481,8 @@ static VALUE parser_initialize_method(VALUE self)
   parser->doc.length = 0;
   parser->doc.data = NULL;
+  parser->doc.enc_index = 0;
+  parser->doc.mb_length = 0;
   parser->doc.line_number = 1;
   parser->doc.column_number = 0;
@@ -478,11 +496,17 @@ static VALUE parser_initialize_method(VALUE self)
 static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
 {
   void *old = parser->doc.data;
+  unsigned long int mb_length;
+  char *buf;
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
   DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
-    parser->doc.data,  parser->doc.length + length + 1);
-  strcpy(parser->doc.data+parser->doc.length, string);
+    parser->doc.data, parser->doc.length + length + 1);
+  buf = parser->doc.data + parser->doc.length;
+  strcpy(buf, string);
+  mb_length = rb_enc_strlen(buf, buf + length, enc);
   parser->doc.length += length;
+  parser->doc.mb_length += mb_length;
   return 1;
 }
@@ -490,7 +514,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
 {
   struct parser_t *parser = NULL;
   char *string = NULL;
-  long unsigned int length = 0, cursor = 0;
+  long unsigned int length = 0, cursor = 0, mb_cursor = 0;
   if(NIL_P(source))
     return Qnil;
@@ -502,6 +526,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
   length = strlen(string);
   cursor = parser->doc.length;
+  mb_cursor = parser->doc.mb_length;
+  if(parser->doc.data == NULL) {
+    parser->doc.enc_index = rb_enc_get_index(source);
+  }
+  else if(parser->doc.enc_index != rb_enc_get_index(source)) {
+    rb_raise(rb_eArgError, "cannot append %s string to %s document",
+      rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
+  }
   if(!parser_document_append(parser, string, length)) {
     // error
@@ -513,10 +546,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
   }
   else {
     parser->tk.scan.cursor = cursor;
-    parser->tk.scan.string = parser->doc.data;
-    parser->tk.scan.length = parser->doc.length;
+    tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
+    parser->tk.scan.enc_index = parser->doc.enc_index;
+    parser->tk.scan.mb_cursor = mb_cursor;
     tokenizer_scan_all(&parser->tk);
+    tokenizer_free_scan_string(&parser->tk);
   }
   return Qtrue;
@@ -535,17 +570,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
 static VALUE parser_document_method(VALUE self)
 {
   struct parser_t *parser = NULL;
+  rb_encoding *enc;
   Parser_Get_Struct(self, parser);
   if(!parser->doc.data)
     return Qnil;
-  return rb_str_new(parser->doc.data, parser->doc.length);
+  enc = rb_enc_from_index(parser->doc.enc_index);
+  return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
 }
 static VALUE parser_document_length_method(VALUE self)
 {
   struct parser_t *parser = NULL;
+  rb_encoding *enc;
+  const char *buf;
   Parser_Get_Struct(self, parser);
-  return ULONG2NUM(parser->doc.length);
+  if(parser->doc.data == NULL) {
+    return ULONG2NUM(0);
+  }
+  else {
+    buf = parser->doc.data;
+    enc = rb_enc_from_index(parser->doc.enc_index);
+    return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
+  }
 }
 static VALUE parser_context_method(VALUE self)
@@ -588,9 +636,10 @@ static VALUE parser_context_method(VALUE self)
 static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
 {
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
     return Qnil;
-  return rb_str_new(parser->doc.data+ref->start, ref->length);
+  return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
 }
 static VALUE parser_tag_name_method(VALUE self)
@@ -665,29 +714,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
   return ref_to_str(parser, &parser->rawtext.text);
 }
-static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
-{
-  struct parser_t *parser = NULL;
-  unsigned long int start, end;
-  struct token_reference_t ref;
-  Parser_Get_Struct(self, parser);
-  start = NUM2ULONG(start_p);
-  end = NUM2ULONG(end_p);
-  if(end < start) {
-    rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
-  }
-  if(end > parser->doc.length) {
-    rb_raise(rb_eArgError, "'end' argument not in range of document");
-  }
-  ref.type = TOKEN_TEXT; // anything not NONE
-  ref.start = start;
-  ref.length = end - start;
-  return ref_to_str(parser, &ref);
-}
 static VALUE parser_errors_count_method(VALUE self)
 {
   struct parser_t *parser = NULL;
@@ -699,12 +725,13 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
 {
   VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
   VALUE klass = rb_const_get(module, rb_intern("ParserError"));
-  VALUE args[3] = {
+  VALUE args[4] = {
     rb_str_new2(error->message),
+    ULONG2NUM(error->mb_pos),
     ULONG2NUM(error->line_number),
     ULONG2NUM(error->column_number),
   };
-  return rb_class_new_instance(3, args, klass);
+  return rb_class_new_instance(4, args, klass);
 }
 static VALUE parser_errors_method(VALUE self, VALUE error_p)
@@ -749,7 +776,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
   rb_define_method(cParser, "column_number", parser_column_number_method, 0);
   rb_define_method(cParser, "parse", parser_parse_method, 1);
   rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
-  rb_define_method(cParser, "extract", parser_extract_method, 2);
   rb_define_method(cParser, "context", parser_context_method, 0);
   rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
   rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);

data/ext/html_tokenizer_ext/parser.h CHANGED Viewed

@@ -19,6 +19,8 @@ enum parser_context {
 struct parser_document_error_t {
   char *message;
+  long unsigned int pos;
+  long unsigned int mb_pos;
   long unsigned int line_number;
   long unsigned int column_number;
 };
@@ -28,11 +30,15 @@ struct parser_document_t {
   char *data;
   long unsigned int line_number;
   long unsigned int column_number;
+  int enc_index;
+  long unsigned int mb_length;
 };
 struct token_reference_t {
   enum token_type type;
   long unsigned int start;
+  long unsigned int mb_start;
   long unsigned int length;
   long unsigned int line_number;
   long unsigned int column_number;

data/ext/html_tokenizer_ext/tokenizer.c CHANGED Viewed

@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/encoding.h>
 #include "html_tokenizer.h"
 #include "tokenizer.h"
@@ -11,16 +12,7 @@ static void tokenizer_free(void *ptr)
 {
   struct tokenizer_t *tk = ptr;
   if(tk) {
-    if(tk->current_tag) {
-      DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
-      xfree(tk->current_tag);
-      tk->current_tag = NULL;
-    }
-    if(tk->scan.string) {
-      DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
-      xfree(tk->scan.string);
-      tk->scan.string = NULL;
-    }
+    tokenizer_free_members(tk);
     DBG_PRINT("tk=%p xfree(tk)", tk);
     xfree(tk);
   }
@@ -60,6 +52,8 @@ void tokenizer_init(struct tokenizer_t *tk)
   tk->scan.string = NULL;
   tk->scan.cursor = 0;
   tk->scan.length = 0;
+  tk->scan.mb_cursor = 0;
+  tk->scan.enc_index = 0;
   tk->attribute_value_start = 0;
   tk->found_attribute = 0;
@@ -72,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
   return;
 }
+void tokenizer_free_members(struct tokenizer_t *tk)
+{
+  if(tk->current_tag) {
+    DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
+    xfree(tk->current_tag);
+    tk->current_tag = NULL;
+  }
+  if(tk->scan.string) {
+    DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
+    xfree(tk->scan.string);
+    tk->scan.string = NULL;
+  }
+  return;
+}
 VALUE token_type_to_symbol(enum token_type type)
 {
   switch(type) {
@@ -115,17 +124,27 @@ VALUE token_type_to_symbol(enum token_type type)
   return Qnil;
 }
+static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
+{
+  rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
+  const char *buf = tk->scan.string + tk->scan.cursor;
+  return rb_enc_strlen(buf, buf + length, enc);
+}
 static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
 {
+  long unsigned int mb_length = tokenizer_mblength(tk, length);
   tk->last_token = type;
-  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
+  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
 }
 static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
 {
+  long unsigned int mb_length = tokenizer_mblength(tk, length);
   if(tk->f_callback)
     tk->f_callback(tk, type, length, tk->callback_data);
   tk->scan.cursor += length;
+  tk->scan.mb_cursor += mb_length;
 }
 static VALUE tokenizer_initialize_method(VALUE self)
@@ -356,6 +375,7 @@ static int scan_open_tag(struct tokenizer_t *tk)
   else if(is_doctype(&tk->scan)) {
     tokenizer_callback(tk, TOKEN_TAG_START, 1);
     tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
+    push_context(tk, TOKENIZER_TAG_NAME);
     return 1;
   }
   else if(is_cdata_start(&tk->scan)) {
@@ -642,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
   return;
 }
+void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
+{
+  const char *old = tk->scan.string;
+  REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
+  DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
+    tk->scan.string, length + 1);
+  if(string && length > 0) {
+    strncpy(tk->scan.string, string, length);
+    tk->scan.string[length] = 0;
+  }
+  tk->scan.length = length;
+  return;
+}
+void tokenizer_free_scan_string(struct tokenizer_t *tk)
+{
+  tokenizer_set_scan_string(tk, NULL, 0);
+  return;
+}
 static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
 {
   struct tokenizer_t *tk = NULL;
   char *c_source;
-  char *old;
   if(NIL_P(source))
     return Qnil;
@@ -656,19 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
   c_source = StringValueCStr(source);
   tk->scan.cursor = 0;
-  tk->scan.length = strlen(c_source);
-  old = tk->scan.string;
-  REALLOC_N(tk->scan.string, char, tk->scan.length+1);
-  DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
-    tk->scan.string,  tk->scan.length+1);
-  strncpy(tk->scan.string, c_source, tk->scan.length);
+  tokenizer_set_scan_string(tk, c_source, strlen(c_source));
+  tk->scan.enc_index = rb_enc_get_index(source);
+  tk->scan.mb_cursor = 0;
   tokenizer_scan_all(tk);
-  DBG_PRINT("tk=%p xfree(tk->scan.string) 0x%p", tk, tk->scan.string);
-  xfree(tk->scan.string);
-  tk->scan.string = NULL;
+  tokenizer_free_scan_string(tk);
   return Qtrue;
 }

data/ext/html_tokenizer_ext/tokenizer.h CHANGED Viewed

@@ -43,6 +43,9 @@ struct scan_t {
   char *string;
   long unsigned int cursor;
   long unsigned int length;
+  int enc_index;
+  long unsigned int mb_cursor;
 };
 struct tokenizer_t
@@ -67,6 +70,9 @@ struct tokenizer_t
 void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
 void tokenizer_init(struct tokenizer_t *tk);
+void tokenizer_free_members(struct tokenizer_t *tk);
+void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
+void tokenizer_free_scan_string(struct tokenizer_t *tk);
 void tokenizer_scan_all(struct tokenizer_t *tk);
 VALUE token_type_to_symbol(enum token_type type);

data/html_tokenizer.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name    = "html_tokenizer"
-  spec.version = "0.0.1"
+  spec.version = "0.0.7"
   spec.summary = "HTML Tokenizer"
   spec.author  = "Francois Chagnon"

data/lib/html_tokenizer.rb CHANGED Viewed

@@ -2,9 +2,10 @@ require 'html_tokenizer_ext'
 module HtmlTokenizer
   class ParserError < RuntimeError
-    attr_reader :line, :column
-    def initialize(message, line, column)
+    attr_reader :position, :line, :column
+    def initialize(message, position, line, column)
       super(message)
+      @position = position
       @line = line
       @column = column
     end

data/test/unit/parser_test.rb CHANGED Viewed

@@ -431,35 +431,13 @@ class HtmlTokenizer::ParserTest < Minitest::Test
       tokens << token
     end
     assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
-    assert_equal "bar\n", @parser.extract(34, 38)
-  end
-  def test_extract_method
-    parse("abcdefg")
-    assert_equal "a", @parser.extract(0, 1)
-    assert_equal "cd", @parser.extract(2, 4)
-  end
-  def test_extract_method_raises_argument_error_end_past_length
-    parse("abcdefg")
-    e = assert_raises(ArgumentError) do
-      @parser.extract(0, 32)
-    end
-    assert_equal "'end' argument not in range of document", e.message
-  end
-  def test_extract_method_raises_argument_error_end_less_than_start
-    parse("abcdefg")
-    e = assert_raises(ArgumentError) do
-      @parser.extract(1, 0)
-    end
-    assert_equal "'end' must be greater or equal than 'start'", e.message
   end
   def test_solidus_or_tag_name_error
     parse('<>')
     assert_equal 1, @parser.errors_count
     assert_equal "expected '/' or tag name", @parser.errors.first.to_s
+    assert_equal 1, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 1, @parser.errors.first.column
   end
@@ -468,6 +446,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     parse('< ')
     assert_equal 1, @parser.errors_count
     assert_equal "expected '/' or tag name", @parser.errors.first.to_s
+    assert_equal 1, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 1, @parser.errors.first.column
   end
@@ -476,6 +455,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     parse('<foo =')
     assert_equal 1, @parser.errors_count
     assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
+    assert_equal 5, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 5, @parser.errors.first.column
   end
@@ -484,6 +464,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     parse('<foo /x')
     assert_equal 1, @parser.errors_count
     assert_equal "expected '>' after '/'", @parser.errors.first.to_s
+    assert_equal 6, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 6, @parser.errors.first.column
   end
@@ -492,6 +473,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     parse('<foo / ')
     assert_equal 1, @parser.errors_count
     assert_equal "expected '>' after '/'", @parser.errors.first.to_s
+    assert_equal 6, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 6, @parser.errors.first.column
   end
@@ -499,29 +481,33 @@ class HtmlTokenizer::ParserTest < Minitest::Test
   def test_attribute_name_error
     parse('<foo bar~')
     assert_equal 2, @parser.errors_count
-    assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
-    assert_equal 1, @parser.errors.first.line
-    assert_equal 8, @parser.errors.first.column
     assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
+    assert_equal 8, @parser.errors.first.position
     assert_equal 1, @parser.errors[0].line
     assert_equal 8, @parser.errors[0].column
+    assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
+    assert_equal 8, @parser.errors.first.position
+    assert_equal 1, @parser.errors[1].line
+    assert_equal 8, @parser.errors[1].column
   end
   def test_attribute_whitespace_or_equal_error
     parse('<foo bar ~')
     assert_equal 2, @parser.errors_count
-    assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
-    assert_equal 1, @parser.errors.first.line
-    assert_equal 9, @parser.errors.first.column
     assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
     assert_equal 1, @parser.errors[0].line
     assert_equal 9, @parser.errors[0].column
+    assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
+    assert_equal 9, @parser.errors.first.position
+    assert_equal 1, @parser.errors[1].line
+    assert_equal 9, @parser.errors[1].column
   end
   def test_attribute_whitespace_or_equal_error_2
     parse('<foo bar = >')
     assert_equal 1, @parser.errors_count
     assert_equal "expected attribute value after '='", @parser.errors.first.to_s
+    assert_equal 11, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 11, @parser.errors.first.column
   end
@@ -530,10 +516,34 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     parse('<foo bar=""x')
     assert_equal 1, @parser.errors_count
     assert_equal "expected space after attribute value", @parser.errors.first.to_s
+    assert_equal 11, @parser.errors.first.position
     assert_equal 1, @parser.errors.first.line
     assert_equal 11, @parser.errors.first.column
   end
+  def test_attribute_with_mutlibyte_characters
+    data = ["<div title", "='your store’s'>"]
+    tokens = []
+    parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
+    assert_equal "div", @parser.tag_name
+    assert_equal "title", @parser.attribute_name
+    assert_equal "your store’s", @parser.attribute_value
+    assert_equal data.join, @parser.document
+    assert_equal data.join.size, @parser.document_length
+    assert_equal data.join.size, @parser.column_number
+    assert_equal [
+      [:tag_start, 0, 1, "<"],
+      [:tag_name, 1, 4, "div"],
+      [:whitespace, 4, 5, " "],
+      [:attribute_name, 5, 10, "title"],
+      [:equal, 10, 11, "="],
+      [:attribute_quoted_value_start, 11, 12, "'"],
+      [:attribute_quoted_value, 12, 24, "your store’s"],
+      [:attribute_quoted_value_end, 24, 25, "'"],
+      [:tag_end, 25, 26, ">"],
+    ], tokens
+  end
   def test_valid_syntaxes
     parse(
       '<div>',
@@ -564,6 +574,15 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
   end
+  def test_doctype_without_space
+    parse('<!DOCTYPE')
+    assert_equal "!DOCTYPE", @parser.tag_name
+    parse('foo')
+    assert_equal "!DOCTYPEfoo", @parser.tag_name
+    assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
+  end
   private
   def parse(*parts, &block)

data/test/unit/tokenizer_test.rb CHANGED Viewed

@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
     ], result
   end
+  def test_html_with_mutlibyte_characters
+    data = "<div title='your store’s'>foo</div>"
+    result = tokenize(data)
+    assert_equal [
+      [:tag_start, "<"],
+      [:tag_name, "div"],
+      [:whitespace, " "],
+      [:attribute_name, "title"],
+      [:equal, "="],
+      [:attribute_quoted_value_start, "'"],
+      [:attribute_quoted_value, "your store’s"],
+      [:attribute_quoted_value_end, "'"],
+      [:tag_end, ">"],
+      [:text, "foo"],
+      [:tag_start, "<"],
+      [:solidus, "/"],
+      [:tag_name, "div"],
+      [:tag_end, ">"],
+    ], result
+  end
   private
   def tokenize(*parts)
     tokens = []
     @tokenizer = HtmlTokenizer::Tokenizer.new
     parts.each do |part|
-      @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start..(stop-1)]] }
+      @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
     end
     tokens
   end

metadata CHANGED Viewed

@@ -1,55 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: html_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.7
 platform: ruby
 authors:
 - Francois Chagnon
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-10-26 00:00:00.000000000 Z
+date: 2018-05-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 description:
@@ -60,8 +60,8 @@ extensions:
 - ext/html_tokenizer_ext/extconf.rb
 extra_rdoc_files: []
 files:
-- .autotest
-- .gitignore
+- ".autotest"
+- ".gitignore"
 - Gemfile
 - Gemfile.lock
 - LICENSE
@@ -90,17 +90,17 @@ require_paths:
 - ext
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14.1
+rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
 summary: HTML Tokenizer