nokogumbo 1.4.1 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,9 +26,7 @@
26
26
 
27
27
  struct GumboInternalParser;
28
28
 
29
- // Size chosen via statistical analysis of ~60K websites.
30
- // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
- static const size_t kDefaultStringBufferSize = 5;
29
+ static const size_t kDefaultStringBufferSize = 10;
32
30
 
33
31
  static void maybe_resize_string_buffer(
34
32
  struct GumboInternalParser* parser, size_t additional_chars,
@@ -102,11 +100,6 @@ char* gumbo_string_buffer_to_string(
102
100
  return buffer;
103
101
  }
104
102
 
105
- void gumbo_string_buffer_clear(
106
- struct GumboInternalParser* parser, GumboStringBuffer* input) {
107
- input->length = 0;
108
- }
109
-
110
103
  void gumbo_string_buffer_destroy(
111
104
  struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
112
105
  gumbo_parser_deallocate(parser, buffer->data);
@@ -70,11 +70,6 @@ void gumbo_string_buffer_append_string(
70
70
  char* gumbo_string_buffer_to_string(
71
71
  struct GumboInternalParser* parser, GumboStringBuffer* input);
72
72
 
73
- // Reinitialize this string buffer. This clears it by setting length=0. It
74
- // does not zero out the buffer itself.
75
- void gumbo_string_buffer_clear(
76
- struct GumboInternalParser* parser, GumboStringBuffer* input);
77
-
78
73
  // Deallocates this GumboStringBuffer.
79
74
  void gumbo_string_buffer_destroy(
80
75
  struct GumboInternalParser* parser, GumboStringBuffer* buffer);
@@ -18,25 +18,172 @@
18
18
 
19
19
  #include <assert.h>
20
20
  #include <ctype.h>
21
- #include <string.h>
21
+ #include <strings.h> // For strcasecmp.
22
22
 
23
+ // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
+ // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
+ // most common tag names first, or to putting them in alphabetical order and
26
+ // using a binary search.
23
27
  const char* kGumboTagNames[] = {
24
- # include "tag_strings.h"
28
+ "html",
29
+ "head",
30
+ "title",
31
+ "base",
32
+ "link",
33
+ "meta",
34
+ "style",
35
+ "script",
36
+ "noscript",
37
+ "template",
38
+ "body",
39
+ "article",
40
+ "section",
41
+ "nav",
42
+ "aside",
43
+ "h1",
44
+ "h2",
45
+ "h3",
46
+ "h4",
47
+ "h5",
48
+ "h6",
49
+ "hgroup",
50
+ "header",
51
+ "footer",
52
+ "address",
53
+ "p",
54
+ "hr",
55
+ "pre",
56
+ "blockquote",
57
+ "ol",
58
+ "ul",
59
+ "li",
60
+ "dl",
61
+ "dt",
62
+ "dd",
63
+ "figure",
64
+ "figcaption",
65
+ "main",
66
+ "div",
67
+ "a",
68
+ "em",
69
+ "strong",
70
+ "small",
71
+ "s",
72
+ "cite",
73
+ "q",
74
+ "dfn",
75
+ "abbr",
76
+ "data",
77
+ "time",
78
+ "code",
79
+ "var",
80
+ "samp",
81
+ "kbd",
82
+ "sub",
83
+ "sup",
84
+ "i",
85
+ "b",
86
+ "u",
87
+ "mark",
88
+ "ruby",
89
+ "rt",
90
+ "rp",
91
+ "bdi",
92
+ "bdo",
93
+ "span",
94
+ "br",
95
+ "wbr",
96
+ "ins",
97
+ "del",
98
+ "image",
99
+ "img",
100
+ "iframe",
101
+ "embed",
102
+ "object",
103
+ "param",
104
+ "video",
105
+ "audio",
106
+ "source",
107
+ "track",
108
+ "canvas",
109
+ "map",
110
+ "area",
111
+ "math",
112
+ "mi",
113
+ "mo",
114
+ "mn",
115
+ "ms",
116
+ "mtext",
117
+ "mglyph",
118
+ "malignmark",
119
+ "annotation-xml",
120
+ "svg",
121
+ "foreignobject",
122
+ "desc",
123
+ "table",
124
+ "caption",
125
+ "colgroup",
126
+ "col",
127
+ "tbody",
128
+ "thead",
129
+ "tfoot",
130
+ "tr",
131
+ "td",
132
+ "th",
133
+ "form",
134
+ "fieldset",
135
+ "legend",
136
+ "label",
137
+ "input",
138
+ "button",
139
+ "select",
140
+ "datalist",
141
+ "optgroup",
142
+ "option",
143
+ "textarea",
144
+ "keygen",
145
+ "output",
146
+ "progress",
147
+ "meter",
148
+ "details",
149
+ "summary",
150
+ "menu",
151
+ "menuitem",
152
+ "applet",
153
+ "acronym",
154
+ "bgsound",
155
+ "dir",
156
+ "frame",
157
+ "frameset",
158
+ "noframes",
159
+ "isindex",
160
+ "listing",
161
+ "xmp",
162
+ "nextid",
163
+ "noembed",
164
+ "plaintext",
165
+ "rb",
166
+ "strike",
167
+ "basefont",
168
+ "big",
169
+ "blink",
170
+ "center",
171
+ "font",
172
+ "marquee",
173
+ "multicol",
174
+ "nobr",
175
+ "spacer",
176
+ "tt",
25
177
  "", // TAG_UNKNOWN
26
178
  "", // TAG_LAST
27
179
  };
28
180
 
29
- static const unsigned char kGumboTagSizes[] = {
30
- # include "tag_sizes.h"
31
- 0, // TAG_UNKNOWN
32
- 0, // TAG_LAST
33
- };
34
-
35
181
  const char* gumbo_normalized_tagname(GumboTag tag) {
36
182
  assert(tag <= GUMBO_TAG_LAST);
37
183
  return kGumboTagNames[tag];
38
184
  }
39
185
 
186
+ // TODO(jdtang): Add test for this.
40
187
  void gumbo_tag_from_original_text(GumboStringPiece* text) {
41
188
  if (text->data == NULL) {
42
189
  return;
@@ -65,34 +212,14 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
65
212
  }
66
213
  }
67
214
 
68
- static int
69
- case_memcmp(const char *s1, const char *s2, unsigned int n)
70
- {
71
- while (n--) {
72
- unsigned char c1 = tolower(*s1++);
73
- unsigned char c2 = tolower(*s2++);
74
- if (c1 != c2)
75
- return (int)c1 - (int)c2;
76
- }
77
- return 0;
78
- }
79
-
80
- #include "tag_gperf.h"
81
- #define TAG_MAP_SIZE (sizeof(kGumboTagMap)/sizeof(kGumboTagMap[0]))
82
-
83
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
84
- if (length) {
85
- unsigned int key = tag_hash(tagname, length);
86
- if (key < TAG_MAP_SIZE) {
87
- GumboTag tag = kGumboTagMap[key];
88
- if (length == kGumboTagSizes[(int)tag] &&
89
- !case_memcmp(tagname, kGumboTagNames[(int)tag], length))
90
- return tag;
215
+ GumboTag gumbo_tag_enum(const char* tagname) {
216
+ for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
+ // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
+ // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
+ // pretty significant issues with MSVC6 anyway.
220
+ if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
+ return i;
91
222
  }
92
223
  }
93
224
  return GUMBO_TAG_UNKNOWN;
94
225
  }
95
-
96
- GumboTag gumbo_tag_enum(const char* tagname) {
97
- return gumbo_tagn_enum(tagname, strlen(tagname));
98
- }
@@ -356,10 +356,12 @@ static void clear_temporary_buffer(GumboParser* parser) {
356
356
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
357
  assert(!tokenizer->_temporary_buffer_emit);
358
358
  utf8iterator_mark(&tokenizer->_input);
359
- gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
359
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
360
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
360
361
  // The temporary buffer and script data buffer are the same object in the
361
362
  // spec, so the script data buffer should be cleared as well.
362
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
363
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
364
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
363
365
  }
364
366
 
365
367
  // Appends a codepoint to the temporary buffer.
@@ -695,11 +697,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
695
697
  gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
696
698
 
697
699
  assert(tag_state->_attributes.data == NULL);
698
- // Initial size chosen by statistical analysis of a corpus of 60k webpages.
699
- // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
700
- // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
701
- // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
702
- gumbo_vector_init(parser, 1, &tag_state->_attributes);
700
+ gumbo_vector_init(parser, 4, &tag_state->_attributes);
703
701
  tag_state->_drop_next_attr_value = false;
704
702
  tag_state->_is_start_tag = is_start_tag;
705
703
  tag_state->_is_self_closing = false;
@@ -753,9 +751,11 @@ static void finish_tag_name(GumboParser* parser) {
753
751
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
754
752
  GumboTagState* tag_state = &tokenizer->_tag_state;
755
753
 
756
- tag_state->_tag = gumbo_tagn_enum(
757
- tag_state->_buffer.data, tag_state->_buffer.length);
754
+ const char* temp;
755
+ copy_over_tag_buffer(parser, &temp);
756
+ tag_state->_tag = gumbo_tag_enum(temp);
758
757
  reinitialize_tag_buffer(parser);
758
+ gumbo_parser_deallocate(parser, (void*) temp);
759
759
  }
760
760
 
761
761
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
@@ -841,9 +841,13 @@ static void finish_attribute_value(GumboParser* parser) {
841
841
  static bool is_appropriate_end_tag(GumboParser* parser) {
842
842
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
843
843
  assert(!tag_state->_is_start_tag);
844
+ // Null terminate the current string buffer, so it can be passed to
845
+ // gumbo_tag_enum, but don't increment the length in case we need to dump the
846
+ // buffer as character tokens.
847
+ gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
848
+ --tag_state->_buffer.length;
844
849
  return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
845
- tag_state->_last_start_tag ==
846
- gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
850
+ tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
847
851
  }
848
852
 
849
853
  void gumbo_tokenizer_state_init(
@@ -1593,7 +1597,8 @@ static StateResult handle_script_double_escaped_lt_state(
1593
1597
  int c, GumboToken* output) {
1594
1598
  if (c == '/') {
1595
1599
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1596
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1600
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1601
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1597
1602
  return emit_current_char(parser, output);
1598
1603
  } else {
1599
1604
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -2819,7 +2824,7 @@ static StateResult handle_cdata_state(
2819
2824
  tokenizer->_reconsume_current_input = true;
2820
2825
  reset_token_start_point(tokenizer);
2821
2826
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2822
- tokenizer->_is_in_cdata = false;
2827
+ tokenizer->_is_in_cdata = true;
2823
2828
  return NEXT_CHAR;
2824
2829
  } else {
2825
2830
  return emit_current_char(parser, output);
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
81
81
  return vector->data[--vector->length];
82
82
  }
83
83
 
84
- int gumbo_vector_index_of(GumboVector* vector, const void* element) {
84
+ int gumbo_vector_index_of(GumboVector* vector, void* element) {
85
85
  for (int i = 0; i < vector->length; ++i) {
86
86
  if (vector->data[i] == element) {
87
87
  return i;
data/test-nokogumbo.rb CHANGED
@@ -99,7 +99,7 @@ class TestNokogumbo < Minitest::Test
99
99
  assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
100
100
  end
101
101
 
102
- def test_template
102
+ def x_test_template # future
103
103
  source = <<-EOF.gsub(/^ {6}/, '')
104
104
  <template id="productrow">
105
105
  <tr>
metadata CHANGED
@@ -1,32 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 1.4.1
4
+ version: 1.4.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Sam Ruby
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2015-03-13 00:00:00.000000000 Z
11
+ date: 2015-05-12 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
- version_requirements: !ruby/object:Gem::Requirement
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
- none: false
21
20
  type: :runtime
22
- name: nokogiri
23
21
  prerelease: false
24
- requirement: !ruby/object:Gem::Requirement
22
+ version_requirements: !ruby/object:Gem::Requirement
25
23
  requirements:
26
- - - ! '>='
24
+ - - ">="
27
25
  - !ruby/object:Gem::Version
28
26
  version: '0'
29
- none: false
30
27
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
31
28
  access the result as a Nokogiri parsed document.
32
29
  email: rubys@intertwingly.net
@@ -35,11 +32,10 @@ extensions:
35
32
  - ext/nokogumboc/extconf.rb
36
33
  extra_rdoc_files: []
37
34
  files:
38
- - ext/nokogumboc/extconf.rb
39
- - ext/nokogumboc/nokogumbo.c
40
- - lib/nokogumbo.rb
41
35
  - LICENSE.txt
42
36
  - README.md
37
+ - ext/nokogumboc/extconf.rb
38
+ - ext/nokogumboc/nokogumbo.c
43
39
  - gumbo-parser/src/attribute.c
44
40
  - gumbo-parser/src/attribute.h
45
41
  - gumbo-parser/src/char_ref.c
@@ -56,11 +52,6 @@ files:
56
52
  - gumbo-parser/src/string_piece.c
57
53
  - gumbo-parser/src/string_piece.h
58
54
  - gumbo-parser/src/tag.c
59
- - gumbo-parser/src/tag.in
60
- - gumbo-parser/src/tag_enum.h
61
- - gumbo-parser/src/tag_gperf.h
62
- - gumbo-parser/src/tag_sizes.h
63
- - gumbo-parser/src/tag_strings.h
64
55
  - gumbo-parser/src/token_type.h
65
56
  - gumbo-parser/src/tokenizer.c
66
57
  - gumbo-parser/src/tokenizer.h
@@ -72,30 +63,30 @@ files:
72
63
  - gumbo-parser/src/vector.c
73
64
  - gumbo-parser/src/vector.h
74
65
  - gumbo-parser/visualc/include/strings.h
66
+ - lib/nokogumbo.rb
75
67
  - test-nokogumbo.rb
76
68
  homepage: https://github.com/rubys/nokogumbo/#readme
77
69
  licenses:
78
70
  - Apache 2.0
71
+ metadata: {}
79
72
  post_install_message:
80
73
  rdoc_options: []
81
74
  require_paths:
82
75
  - lib
83
76
  required_ruby_version: !ruby/object:Gem::Requirement
84
77
  requirements:
85
- - - ! '>='
78
+ - - ">="
86
79
  - !ruby/object:Gem::Version
87
80
  version: '0'
88
- none: false
89
81
  required_rubygems_version: !ruby/object:Gem::Requirement
90
82
  requirements:
91
- - - ! '>='
83
+ - - ">="
92
84
  - !ruby/object:Gem::Version
93
85
  version: '0'
94
- none: false
95
86
  requirements: []
96
87
  rubyforge_project:
97
- rubygems_version: 1.8.23.2
88
+ rubygems_version: 2.4.5
98
89
  signing_key:
99
- specification_version: 3
90
+ specification_version: 4
100
91
  summary: Nokogiri interface to the Gumbo HTML5 parser
101
92
  test_files: []