nokogumbo 1.4.1 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,9 +26,7 @@
26
26
 
27
27
  struct GumboInternalParser;
28
28
 
29
- // Size chosen via statistical analysis of ~60K websites.
30
- // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
- static const size_t kDefaultStringBufferSize = 5;
29
+ static const size_t kDefaultStringBufferSize = 10;
32
30
 
33
31
  static void maybe_resize_string_buffer(
34
32
  struct GumboInternalParser* parser, size_t additional_chars,
@@ -102,11 +100,6 @@ char* gumbo_string_buffer_to_string(
102
100
  return buffer;
103
101
  }
104
102
 
105
- void gumbo_string_buffer_clear(
106
- struct GumboInternalParser* parser, GumboStringBuffer* input) {
107
- input->length = 0;
108
- }
109
-
110
103
  void gumbo_string_buffer_destroy(
111
104
  struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
112
105
  gumbo_parser_deallocate(parser, buffer->data);
@@ -70,11 +70,6 @@ void gumbo_string_buffer_append_string(
70
70
  char* gumbo_string_buffer_to_string(
71
71
  struct GumboInternalParser* parser, GumboStringBuffer* input);
72
72
 
73
- // Reinitialize this string buffer. This clears it by setting length=0. It
74
- // does not zero out the buffer itself.
75
- void gumbo_string_buffer_clear(
76
- struct GumboInternalParser* parser, GumboStringBuffer* input);
77
-
78
73
  // Deallocates this GumboStringBuffer.
79
74
  void gumbo_string_buffer_destroy(
80
75
  struct GumboInternalParser* parser, GumboStringBuffer* buffer);
@@ -18,25 +18,172 @@
18
18
 
19
19
  #include <assert.h>
20
20
  #include <ctype.h>
21
- #include <string.h>
21
+ #include <strings.h> // For strcasecmp.
22
22
 
23
+ // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
+ // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
+ // most common tag names first, or to putting them in alphabetical order and
26
+ // using a binary search.
23
27
  const char* kGumboTagNames[] = {
24
- # include "tag_strings.h"
28
+ "html",
29
+ "head",
30
+ "title",
31
+ "base",
32
+ "link",
33
+ "meta",
34
+ "style",
35
+ "script",
36
+ "noscript",
37
+ "template",
38
+ "body",
39
+ "article",
40
+ "section",
41
+ "nav",
42
+ "aside",
43
+ "h1",
44
+ "h2",
45
+ "h3",
46
+ "h4",
47
+ "h5",
48
+ "h6",
49
+ "hgroup",
50
+ "header",
51
+ "footer",
52
+ "address",
53
+ "p",
54
+ "hr",
55
+ "pre",
56
+ "blockquote",
57
+ "ol",
58
+ "ul",
59
+ "li",
60
+ "dl",
61
+ "dt",
62
+ "dd",
63
+ "figure",
64
+ "figcaption",
65
+ "main",
66
+ "div",
67
+ "a",
68
+ "em",
69
+ "strong",
70
+ "small",
71
+ "s",
72
+ "cite",
73
+ "q",
74
+ "dfn",
75
+ "abbr",
76
+ "data",
77
+ "time",
78
+ "code",
79
+ "var",
80
+ "samp",
81
+ "kbd",
82
+ "sub",
83
+ "sup",
84
+ "i",
85
+ "b",
86
+ "u",
87
+ "mark",
88
+ "ruby",
89
+ "rt",
90
+ "rp",
91
+ "bdi",
92
+ "bdo",
93
+ "span",
94
+ "br",
95
+ "wbr",
96
+ "ins",
97
+ "del",
98
+ "image",
99
+ "img",
100
+ "iframe",
101
+ "embed",
102
+ "object",
103
+ "param",
104
+ "video",
105
+ "audio",
106
+ "source",
107
+ "track",
108
+ "canvas",
109
+ "map",
110
+ "area",
111
+ "math",
112
+ "mi",
113
+ "mo",
114
+ "mn",
115
+ "ms",
116
+ "mtext",
117
+ "mglyph",
118
+ "malignmark",
119
+ "annotation-xml",
120
+ "svg",
121
+ "foreignobject",
122
+ "desc",
123
+ "table",
124
+ "caption",
125
+ "colgroup",
126
+ "col",
127
+ "tbody",
128
+ "thead",
129
+ "tfoot",
130
+ "tr",
131
+ "td",
132
+ "th",
133
+ "form",
134
+ "fieldset",
135
+ "legend",
136
+ "label",
137
+ "input",
138
+ "button",
139
+ "select",
140
+ "datalist",
141
+ "optgroup",
142
+ "option",
143
+ "textarea",
144
+ "keygen",
145
+ "output",
146
+ "progress",
147
+ "meter",
148
+ "details",
149
+ "summary",
150
+ "menu",
151
+ "menuitem",
152
+ "applet",
153
+ "acronym",
154
+ "bgsound",
155
+ "dir",
156
+ "frame",
157
+ "frameset",
158
+ "noframes",
159
+ "isindex",
160
+ "listing",
161
+ "xmp",
162
+ "nextid",
163
+ "noembed",
164
+ "plaintext",
165
+ "rb",
166
+ "strike",
167
+ "basefont",
168
+ "big",
169
+ "blink",
170
+ "center",
171
+ "font",
172
+ "marquee",
173
+ "multicol",
174
+ "nobr",
175
+ "spacer",
176
+ "tt",
25
177
  "", // TAG_UNKNOWN
26
178
  "", // TAG_LAST
27
179
  };
28
180
 
29
- static const unsigned char kGumboTagSizes[] = {
30
- # include "tag_sizes.h"
31
- 0, // TAG_UNKNOWN
32
- 0, // TAG_LAST
33
- };
34
-
35
181
  const char* gumbo_normalized_tagname(GumboTag tag) {
36
182
  assert(tag <= GUMBO_TAG_LAST);
37
183
  return kGumboTagNames[tag];
38
184
  }
39
185
 
186
+ // TODO(jdtang): Add test for this.
40
187
  void gumbo_tag_from_original_text(GumboStringPiece* text) {
41
188
  if (text->data == NULL) {
42
189
  return;
@@ -65,34 +212,14 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
65
212
  }
66
213
  }
67
214
 
68
- static int
69
- case_memcmp(const char *s1, const char *s2, unsigned int n)
70
- {
71
- while (n--) {
72
- unsigned char c1 = tolower(*s1++);
73
- unsigned char c2 = tolower(*s2++);
74
- if (c1 != c2)
75
- return (int)c1 - (int)c2;
76
- }
77
- return 0;
78
- }
79
-
80
- #include "tag_gperf.h"
81
- #define TAG_MAP_SIZE (sizeof(kGumboTagMap)/sizeof(kGumboTagMap[0]))
82
-
83
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
84
- if (length) {
85
- unsigned int key = tag_hash(tagname, length);
86
- if (key < TAG_MAP_SIZE) {
87
- GumboTag tag = kGumboTagMap[key];
88
- if (length == kGumboTagSizes[(int)tag] &&
89
- !case_memcmp(tagname, kGumboTagNames[(int)tag], length))
90
- return tag;
215
+ GumboTag gumbo_tag_enum(const char* tagname) {
216
+ for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
+ // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
+ // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
+ // pretty significant issues with MSVC6 anyway.
220
+ if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
+ return i;
91
222
  }
92
223
  }
93
224
  return GUMBO_TAG_UNKNOWN;
94
225
  }
95
-
96
- GumboTag gumbo_tag_enum(const char* tagname) {
97
- return gumbo_tagn_enum(tagname, strlen(tagname));
98
- }
@@ -356,10 +356,12 @@ static void clear_temporary_buffer(GumboParser* parser) {
356
356
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
357
  assert(!tokenizer->_temporary_buffer_emit);
358
358
  utf8iterator_mark(&tokenizer->_input);
359
- gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
359
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
360
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
360
361
  // The temporary buffer and script data buffer are the same object in the
361
362
  // spec, so the script data buffer should be cleared as well.
362
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
363
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
364
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
363
365
  }
364
366
 
365
367
  // Appends a codepoint to the temporary buffer.
@@ -695,11 +697,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
695
697
  gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
696
698
 
697
699
  assert(tag_state->_attributes.data == NULL);
698
- // Initial size chosen by statistical analysis of a corpus of 60k webpages.
699
- // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
700
- // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
701
- // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
702
- gumbo_vector_init(parser, 1, &tag_state->_attributes);
700
+ gumbo_vector_init(parser, 4, &tag_state->_attributes);
703
701
  tag_state->_drop_next_attr_value = false;
704
702
  tag_state->_is_start_tag = is_start_tag;
705
703
  tag_state->_is_self_closing = false;
@@ -753,9 +751,11 @@ static void finish_tag_name(GumboParser* parser) {
753
751
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
754
752
  GumboTagState* tag_state = &tokenizer->_tag_state;
755
753
 
756
- tag_state->_tag = gumbo_tagn_enum(
757
- tag_state->_buffer.data, tag_state->_buffer.length);
754
+ const char* temp;
755
+ copy_over_tag_buffer(parser, &temp);
756
+ tag_state->_tag = gumbo_tag_enum(temp);
758
757
  reinitialize_tag_buffer(parser);
758
+ gumbo_parser_deallocate(parser, (void*) temp);
759
759
  }
760
760
 
761
761
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
@@ -841,9 +841,13 @@ static void finish_attribute_value(GumboParser* parser) {
841
841
  static bool is_appropriate_end_tag(GumboParser* parser) {
842
842
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
843
843
  assert(!tag_state->_is_start_tag);
844
+ // Null terminate the current string buffer, so it can be passed to
845
+ // gumbo_tag_enum, but don't increment the length in case we need to dump the
846
+ // buffer as character tokens.
847
+ gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
848
+ --tag_state->_buffer.length;
844
849
  return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
845
- tag_state->_last_start_tag ==
846
- gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
850
+ tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
847
851
  }
848
852
 
849
853
  void gumbo_tokenizer_state_init(
@@ -1593,7 +1597,8 @@ static StateResult handle_script_double_escaped_lt_state(
1593
1597
  int c, GumboToken* output) {
1594
1598
  if (c == '/') {
1595
1599
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1596
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1600
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1601
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1597
1602
  return emit_current_char(parser, output);
1598
1603
  } else {
1599
1604
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
@@ -2819,7 +2824,7 @@ static StateResult handle_cdata_state(
2819
2824
  tokenizer->_reconsume_current_input = true;
2820
2825
  reset_token_start_point(tokenizer);
2821
2826
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2822
- tokenizer->_is_in_cdata = false;
2827
+ tokenizer->_is_in_cdata = true;
2823
2828
  return NEXT_CHAR;
2824
2829
  } else {
2825
2830
  return emit_current_char(parser, output);
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
81
81
  return vector->data[--vector->length];
82
82
  }
83
83
 
84
- int gumbo_vector_index_of(GumboVector* vector, const void* element) {
84
+ int gumbo_vector_index_of(GumboVector* vector, void* element) {
85
85
  for (int i = 0; i < vector->length; ++i) {
86
86
  if (vector->data[i] == element) {
87
87
  return i;
data/test-nokogumbo.rb CHANGED
@@ -99,7 +99,7 @@ class TestNokogumbo < Minitest::Test
99
99
  assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
100
100
  end
101
101
 
102
- def test_template
102
+ def x_test_template # future
103
103
  source = <<-EOF.gsub(/^ {6}/, '')
104
104
  <template id="productrow">
105
105
  <tr>
metadata CHANGED
@@ -1,32 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 1.4.1
4
+ version: 1.4.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Sam Ruby
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2015-03-13 00:00:00.000000000 Z
11
+ date: 2015-05-12 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
- version_requirements: !ruby/object:Gem::Requirement
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
- none: false
21
20
  type: :runtime
22
- name: nokogiri
23
21
  prerelease: false
24
- requirement: !ruby/object:Gem::Requirement
22
+ version_requirements: !ruby/object:Gem::Requirement
25
23
  requirements:
26
- - - ! '>='
24
+ - - ">="
27
25
  - !ruby/object:Gem::Version
28
26
  version: '0'
29
- none: false
30
27
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
31
28
  access the result as a Nokogiri parsed document.
32
29
  email: rubys@intertwingly.net
@@ -35,11 +32,10 @@ extensions:
35
32
  - ext/nokogumboc/extconf.rb
36
33
  extra_rdoc_files: []
37
34
  files:
38
- - ext/nokogumboc/extconf.rb
39
- - ext/nokogumboc/nokogumbo.c
40
- - lib/nokogumbo.rb
41
35
  - LICENSE.txt
42
36
  - README.md
37
+ - ext/nokogumboc/extconf.rb
38
+ - ext/nokogumboc/nokogumbo.c
43
39
  - gumbo-parser/src/attribute.c
44
40
  - gumbo-parser/src/attribute.h
45
41
  - gumbo-parser/src/char_ref.c
@@ -56,11 +52,6 @@ files:
56
52
  - gumbo-parser/src/string_piece.c
57
53
  - gumbo-parser/src/string_piece.h
58
54
  - gumbo-parser/src/tag.c
59
- - gumbo-parser/src/tag.in
60
- - gumbo-parser/src/tag_enum.h
61
- - gumbo-parser/src/tag_gperf.h
62
- - gumbo-parser/src/tag_sizes.h
63
- - gumbo-parser/src/tag_strings.h
64
55
  - gumbo-parser/src/token_type.h
65
56
  - gumbo-parser/src/tokenizer.c
66
57
  - gumbo-parser/src/tokenizer.h
@@ -72,30 +63,30 @@ files:
72
63
  - gumbo-parser/src/vector.c
73
64
  - gumbo-parser/src/vector.h
74
65
  - gumbo-parser/visualc/include/strings.h
66
+ - lib/nokogumbo.rb
75
67
  - test-nokogumbo.rb
76
68
  homepage: https://github.com/rubys/nokogumbo/#readme
77
69
  licenses:
78
70
  - Apache 2.0
71
+ metadata: {}
79
72
  post_install_message:
80
73
  rdoc_options: []
81
74
  require_paths:
82
75
  - lib
83
76
  required_ruby_version: !ruby/object:Gem::Requirement
84
77
  requirements:
85
- - - ! '>='
78
+ - - ">="
86
79
  - !ruby/object:Gem::Version
87
80
  version: '0'
88
- none: false
89
81
  required_rubygems_version: !ruby/object:Gem::Requirement
90
82
  requirements:
91
- - - ! '>='
83
+ - - ">="
92
84
  - !ruby/object:Gem::Version
93
85
  version: '0'
94
- none: false
95
86
  requirements: []
96
87
  rubyforge_project:
97
- rubygems_version: 1.8.23.2
88
+ rubygems_version: 2.4.5
98
89
  signing_key:
99
- specification_version: 3
90
+ specification_version: 4
100
91
  summary: Nokogiri interface to the Gumbo HTML5 parser
101
92
  test_files: []