nokogumbo 1.3.0 → 1.5.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -64,8 +64,8 @@ typedef struct GumboInternalToken {
64
64
  GumboTokenDocType doc_type;
65
65
  GumboTokenStartTag start_tag;
66
66
  GumboTag end_tag;
67
- const char* text; // For comments.
68
- int character; // For character, whitespace, null, and EOF tokens.
67
+ const char* text; // For comments.
68
+ int character; // For character, whitespace, null, and EOF tokens.
69
69
  } v;
70
70
  } GumboToken;
71
71
 
@@ -19,7 +19,7 @@
19
19
  #include <assert.h>
20
20
  #include <stdint.h>
21
21
  #include <string.h>
22
- #include <strings.h> // For strncasecmp.
22
+ #include <strings.h> // For strncasecmp.
23
23
 
24
24
  #include "error.h"
25
25
  #include "gumbo.h"
@@ -47,9 +47,11 @@ const int kUtf8ReplacementChar = 0xFFFD;
47
47
 
48
48
  // Permission is hereby granted, free of charge, to any person obtaining a copy
49
49
  // of this software and associated documentation files (the "Software"), to deal
50
- // in the Software without restriction, including without limitation the rights to
50
+ // in the Software without restriction, including without limitation the rights
51
+ // to
51
52
  // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
52
- // of the Software, and to permit persons to whom the Software is furnished to do
53
+ // of the Software, and to permit persons to whom the Software is furnished to
54
+ // do
53
55
  // so, subject to the following conditions:
54
56
 
55
57
  // The above copyright notice and this permission notice shall be included in
@@ -59,32 +61,35 @@ const int kUtf8ReplacementChar = 0xFFFD;
59
61
  #define UTF8_REJECT 12
60
62
 
61
63
  static const uint8_t utf8d[] = {
62
- // The first part of the table maps bytes to character classes that
63
- // to reduce the size of the transition table and create bitmasks.
64
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
66
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
67
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
68
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
69
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
70
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
71
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
72
-
73
- // The second part is a transition table that maps a combination
74
- // of a state of the automaton and a character class to a state.
75
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
76
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
77
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
78
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
79
- 12,36,12,12,12,12,12,12,12,12,12,12,
64
+ // The first part of the table maps bytes to character classes that
65
+ // to reduce the size of the transition table and create bitmasks.
66
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71
+ 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9,
72
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
73
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2,
74
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10,
75
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8,
76
+ 8, 8, 8, 8, 8, 8,
77
+
78
+ // The second part is a transition table that maps a combination
79
+ // of a state of the automaton and a character class to a state.
80
+ 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12,
81
+ 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12,
82
+ 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
83
+ 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
84
+ 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
85
+ 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
80
86
  };
81
87
 
82
88
  uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
83
89
  uint32_t type = utf8d[byte];
84
90
 
85
- *codep = (*state != UTF8_ACCEPT) ?
86
- (byte & 0x3fu) | (*codep << 6) :
87
- (0xff >> type) & (byte);
91
+ *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
92
+ : (0xff >> type) & (byte);
88
93
 
89
94
  *state = utf8d[256 + *state + type];
90
95
  return *state;
@@ -130,13 +135,13 @@ static void read_char(Utf8Iterator* iter) {
130
135
  uint32_t code_point = 0;
131
136
  uint32_t state = UTF8_ACCEPT;
132
137
  for (const char* c = iter->_start; c < iter->_end; ++c) {
133
- decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
138
+ decode(&state, &code_point, (uint32_t)(unsigned char) (*c));
134
139
  if (state == UTF8_ACCEPT) {
135
140
  iter->_width = c - iter->_start + 1;
136
- // This is the special handling for carriage returns that is mandated by the
137
- // HTML5 spec. Since we're looking for particular 7-bit literal characters,
138
- // we operate in terms of chars and only need a check for iter overrun,
139
- // instead of having to read in a full next code point.
141
+ // This is the special handling for carriage returns that is mandated by
142
+ // the HTML5 spec. Since we're looking for particular 7-bit literal
143
+ // characters, we operate in terms of chars and only need a check for iter
144
+ // overrun, instead of having to read in a full next code point.
140
145
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
141
146
  if (code_point == '\r') {
142
147
  assert(iter->_width == 1);
@@ -165,10 +170,11 @@ static void read_char(Utf8Iterator* iter) {
165
170
  return;
166
171
  }
167
172
  }
168
- // If we got here without exiting early, then we've reached the end of the iterator.
169
- // Add an error for truncated input, set the width to consume the rest of the
170
- // iterator, and emit a replacement character. The next time we enter this method,
171
- // it will detect that there's no input to consume and
173
+ // If we got here without exiting early, then we've reached the end of the
174
+ // iterator. Add an error for truncated input, set the width to consume the
175
+ // rest of the iterator, and emit a replacement character. The next time we
176
+ // enter this method, it will detect that there's no input to consume and
177
+ // output an EOF.
172
178
  iter->_current = kUtf8ReplacementChar;
173
179
  iter->_width = iter->_end - iter->_start;
174
180
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
@@ -179,10 +185,10 @@ static void update_position(Utf8Iterator* iter) {
179
185
  if (iter->_current == '\n') {
180
186
  ++iter->_pos.line;
181
187
  iter->_pos.column = 1;
182
- } else if(iter->_current == '\t') {
188
+ } else if (iter->_current == '\t') {
183
189
  int tab_stop = iter->_parser->_options->tab_stop;
184
190
  iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
185
- } else if(iter->_current != -1) {
191
+ } else if (iter->_current != -1) {
186
192
  ++iter->_pos.column;
187
193
  }
188
194
  }
@@ -191,13 +197,12 @@ static void update_position(Utf8Iterator* iter) {
191
197
  // forbidden by the HTML5 spec, such as undefined control chars.
192
198
  bool utf8_is_invalid_code_point(int c) {
193
199
  return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) ||
194
- (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
195
- ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
200
+ (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
201
+ ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
196
202
  }
197
203
 
198
- void utf8iterator_init(
199
- GumboParser* parser, const char* source, size_t source_length,
200
- Utf8Iterator* iter) {
204
+ void utf8iterator_init(GumboParser* parser, const char* source,
205
+ size_t source_length, Utf8Iterator* iter) {
201
206
  iter->_start = source;
202
207
  iter->_end = source + source_length;
203
208
  iter->_pos.line = 1;
@@ -215,9 +220,7 @@ void utf8iterator_next(Utf8Iterator* iter) {
215
220
  read_char(iter);
216
221
  }
217
222
 
218
- int utf8iterator_current(const Utf8Iterator* iter) {
219
- return iter->_current;
220
- }
223
+ int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
221
224
 
222
225
  void utf8iterator_get_position(
223
226
  const Utf8Iterator* iter, GumboSourcePosition* output) {
@@ -232,14 +235,13 @@ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
232
235
  return iter->_end;
233
236
  }
234
237
 
235
- bool utf8iterator_maybe_consume_match(
236
- Utf8Iterator* iter, const char* prefix, size_t length,
237
- bool case_sensitive) {
238
- bool matched = (iter->_start + length <= iter->_end) && (case_sensitive ?
239
- !strncmp(iter->_start, prefix, length) :
240
- !strncasecmp(iter->_start, prefix, length));
238
+ bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
239
+ size_t length, bool case_sensitive) {
240
+ bool matched = (iter->_start + length <= iter->_end) &&
241
+ (case_sensitive ? !strncmp(iter->_start, prefix, length)
242
+ : !strncasecmp(iter->_start, prefix, length));
241
243
  if (matched) {
242
- for (int i = 0; i < length; ++i) {
244
+ for (unsigned int i = 0; i < length; ++i) {
243
245
  utf8iterator_next(iter);
244
246
  }
245
247
  return true;
@@ -262,8 +264,7 @@ void utf8iterator_reset(Utf8Iterator* iter) {
262
264
 
263
265
  // Sets the position and original text fields of an error to the value at the
264
266
  // mark.
265
- void utf8iterator_fill_error_at_mark(
266
- Utf8Iterator* iter, GumboError* error) {
267
+ void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
267
268
  error->position = iter->_mark_pos;
268
269
  error->original_text = iter->_mark;
269
270
  }
@@ -81,8 +81,7 @@ bool utf8_is_invalid_code_point(int c);
81
81
 
82
82
  // Initializes a new Utf8Iterator from the given byte buffer. The source does
83
83
  // not have to be NUL-terminated, but the length must be passed in explicitly.
84
- void utf8iterator_init(
85
- struct GumboInternalParser* parser, const char* source,
84
+ void utf8iterator_init(struct GumboInternalParser* parser, const char* source,
86
85
  size_t source_length, Utf8Iterator* iter);
87
86
 
88
87
  // Advances the current position by one code point.
@@ -29,7 +29,7 @@
29
29
  // TODO(jdtang): This should be elsewhere, but there's no .c file for
30
30
  // SourcePositions and yet the constant needs some linkage, so this is as good
31
31
  // as any.
32
- const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 };
32
+ const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0};
33
33
 
34
34
  void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
35
35
  return parser->_options->allocator(parser->_options->userdata, num_bytes);
@@ -25,8 +25,6 @@
25
25
  #include <stdbool.h>
26
26
  #include <stddef.h>
27
27
 
28
-
29
-
30
28
  #ifdef __cplusplus
31
29
  extern "C" {
32
30
  #endif
@@ -25,21 +25,22 @@
25
25
 
26
26
  struct GumboInternalParser;
27
27
 
28
- const GumboVector kGumboEmptyVector = { NULL, 0, 0 };
28
+ const GumboVector kGumboEmptyVector = {NULL, 0, 0};
29
29
 
30
- void gumbo_vector_init(
31
- struct GumboInternalParser* parser, size_t initial_capacity, GumboVector* vector) {
30
+ void gumbo_vector_init(struct GumboInternalParser* parser,
31
+ size_t initial_capacity, GumboVector* vector) {
32
32
  vector->length = 0;
33
33
  vector->capacity = initial_capacity;
34
34
  if (initial_capacity > 0) {
35
- vector->data = gumbo_parser_allocate(
36
- parser, sizeof(void*) * initial_capacity);
35
+ vector->data =
36
+ gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
37
37
  } else {
38
38
  vector->data = NULL;
39
39
  }
40
40
  }
41
41
 
42
- void gumbo_vector_destroy(struct GumboInternalParser* parser, GumboVector* vector) {
42
+ void gumbo_vector_destroy(
43
+ struct GumboInternalParser* parser, GumboVector* vector) {
43
44
  if (vector->capacity > 0) {
44
45
  gumbo_parser_deallocate(parser, vector->data);
45
46
  }
@@ -59,8 +60,8 @@ static void enlarge_vector_if_full(
59
60
  } else {
60
61
  // 0-capacity vector; no previous array to deallocate.
61
62
  vector->capacity = 2;
62
- vector->data = gumbo_parser_allocate(
63
- parser, sizeof(void*) * vector->capacity);
63
+ vector->data =
64
+ gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
64
65
  }
65
66
  }
66
67
  }
@@ -81,8 +82,8 @@ void* gumbo_vector_pop(
81
82
  return vector->data[--vector->length];
82
83
  }
83
84
 
84
- int gumbo_vector_index_of(GumboVector* vector, void* element) {
85
- for (int i = 0; i < vector->length; ++i) {
85
+ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
86
+ for (unsigned int i = 0; i < vector->length; ++i) {
86
87
  if (vector->data[i] == element) {
87
88
  return i;
88
89
  }
@@ -90,15 +91,14 @@ int gumbo_vector_index_of(GumboVector* vector, void* element) {
90
91
  return -1;
91
92
  }
92
93
 
93
- void gumbo_vector_insert_at(
94
- struct GumboInternalParser* parser, void* element, int index,
95
- GumboVector* vector) {
94
+ void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
95
+ unsigned int index, GumboVector* vector) {
96
96
  assert(index >= 0);
97
97
  assert(index <= vector->length);
98
98
  enlarge_vector_if_full(parser, vector);
99
99
  ++vector->length;
100
100
  memmove(&vector->data[index + 1], &vector->data[index],
101
- sizeof(void*) * (vector->length - index - 1));
101
+ sizeof(void*) * (vector->length - index - 1));
102
102
  vector->data[index] = element;
103
103
  }
104
104
 
@@ -111,13 +111,13 @@ void gumbo_vector_remove(
111
111
  gumbo_vector_remove_at(parser, index, vector);
112
112
  }
113
113
 
114
- void* gumbo_vector_remove_at(
115
- struct GumboInternalParser* parser, int index, GumboVector* vector) {
114
+ void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
115
+ unsigned int index, GumboVector* vector) {
116
116
  assert(index >= 0);
117
117
  assert(index < vector->length);
118
118
  void* result = vector->data[index];
119
119
  memmove(&vector->data[index], &vector->data[index + 1],
120
- sizeof(void*) * (vector->length - index - 1));
120
+ sizeof(void*) * (vector->length - index - 1));
121
121
  --vector->length;
122
122
  return result;
123
123
  }
@@ -28,9 +28,8 @@ extern "C" {
28
28
  struct GumboInternalParser;
29
29
 
30
30
  // Initializes a new GumboVector with the specified initial capacity.
31
- void gumbo_vector_init(
32
- struct GumboInternalParser* parser, size_t initial_capacity,
33
- GumboVector* vector);
31
+ void gumbo_vector_init(struct GumboInternalParser* parser,
32
+ size_t initial_capacity, GumboVector* vector);
34
33
 
35
34
  // Frees the memory used by an GumboVector. Does not free the contained
36
35
  // pointers.
@@ -48,9 +47,8 @@ void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
48
47
 
49
48
  // Inserts an element at a specific index. This is potentially O(N) time, but
50
49
  // is necessary for some of the spec's behavior.
51
- void gumbo_vector_insert_at(
52
- struct GumboInternalParser* parser, void* element, int index,
53
- GumboVector* vector);
50
+ void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
51
+ unsigned int index, GumboVector* vector);
54
52
 
55
53
  // Removes an element from the vector, or does nothing if the element is not in
56
54
  // the vector.
@@ -59,8 +57,8 @@ void gumbo_vector_remove(
59
57
 
60
58
  // Removes and returns an element at a specific index. Note that this is
61
59
  // potentially O(N) time and should be used sparingly.
62
- void* gumbo_vector_remove_at(
63
- struct GumboInternalParser* parser, int index, GumboVector* vector);
60
+ void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
61
+ unsigned int index, GumboVector* vector);
64
62
 
65
63
  #ifdef __cplusplus
66
64
  }
@@ -1,3 +1,4 @@
1
1
  /*Dummy file to satisfy source file dependencies on Windows platform*/
2
2
  #define strcasecmp _stricmp
3
- #define strncasecmp _strnicmp
3
+ #define strncasecmp _strnicmp
4
+ #define inline __inline
@@ -4,14 +4,14 @@ require 'nokogumboc'
4
4
  module Nokogiri
5
5
  # Parse an HTML document. +string+ contains the document. +string+
6
6
  # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
7
- def self.HTML5(string)
8
- Nokogiri::HTML5.parse(string)
7
+ def self.HTML5(*args)
8
+ Nokogiri::HTML5.parse(*args)
9
9
  end
10
10
 
11
11
  module HTML5
12
12
  # Parse an HTML document. +string+ contains the document. +string+
13
13
  # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
14
- def self.parse(string)
14
+ def self.parse(string, options={})
15
15
  if string.respond_to? :read
16
16
  string = string.read
17
17
  end
@@ -21,7 +21,7 @@ module Nokogiri
21
21
  string = reencode(string)
22
22
  end
23
23
 
24
- Nokogumbo.parse(string.to_s)
24
+ Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
25
25
  end
26
26
 
27
27
  # Fetch and parse a HTML document from the web, following redirects,
@@ -67,7 +67,7 @@ module Nokogiri
67
67
 
68
68
  case response
69
69
  when Net::HTTPSuccess
70
- doc = parse(reencode(response.body, response['content-type']))
70
+ doc = parse(reencode(response.body, response['content-type']), options)
71
71
  doc.instance_variable_set('@response', response)
72
72
  doc.class.send(:attr_reader, :response)
73
73
  doc
@@ -83,8 +83,8 @@ module Nokogiri
83
83
  # while fragment is on the Gumbo TODO list, simulate it by doing
84
84
  # a full document parse and ignoring the parent <html>, <head>, and <body>
85
85
  # tags, and collecting up the children of each.
86
- def self.fragment(string)
87
- doc = parse(string)
86
+ def self.fragment(*args)
87
+ doc = parse(*args)
88
88
  fragment = Nokogiri::HTML::DocumentFragment.new(doc)
89
89
 
90
90
  if doc.children.length != 1 or doc.children.first.name != 'html'
@@ -151,7 +151,7 @@ module Nokogiri
151
151
 
152
152
  # look for a charset in a content-encoding header
153
153
  if content_type
154
- encoding ||= content_type[/charset=(.*?)($|\s|;)/i, 1]
154
+ encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
155
155
  end
156
156
 
157
157
  # look for a charset in a meta tag in the first 1024 bytes
@@ -0,0 +1,190 @@
1
+ $:.unshift('lib')
2
+ $:.unshift('ext/nokogumboc')
3
+
4
+ gem 'minitest'
5
+
6
+ require 'nokogumbo'
7
+ require 'minitest/autorun'
8
+
9
+ class TestNokogumbo < Minitest::Test
10
+ def test_element_text
11
+ doc = Nokogiri::HTML5(buffer)
12
+ assert_equal "content", doc.at('span').text
13
+ end
14
+
15
+ def test_element_cdata_textarea
16
+ doc = Nokogiri::HTML5(buffer)
17
+ assert_equal "foo<x>bar", doc.at('textarea').text.strip
18
+ end
19
+
20
+ def test_element_cdata_script
21
+ doc = Nokogiri::HTML5.fragment(buffer)
22
+ assert_equal true, doc.document.html?
23
+ assert_equal "<script> if (a < b) alert(1) </script>", doc.at('script').to_s
24
+ end
25
+
26
+ def test_attr_value
27
+ doc = Nokogiri::HTML5(buffer)
28
+ assert_equal "utf-8", doc.at('meta')['charset']
29
+ end
30
+
31
+ def test_comment
32
+ doc = Nokogiri::HTML5(buffer)
33
+ assert_equal " test comment ", doc.xpath('//comment()').text
34
+ end
35
+
36
+ def test_unknown_element
37
+ doc = Nokogiri::HTML5(buffer)
38
+ assert_equal "main", doc.at('main').name
39
+ end
40
+
41
+ def test_IO
42
+ require 'stringio'
43
+ doc = Nokogiri::HTML5(StringIO.new(buffer))
44
+ assert_equal 'textarea', doc.at('form').element_children.first.name
45
+ end
46
+
47
+ def test_nil
48
+ doc = Nokogiri::HTML5(nil)
49
+ assert_equal 1, doc.search('body').count
50
+ end
51
+
52
+ if ''.respond_to? 'encoding'
53
+ def test_macroman_encoding
54
+ mac="<span>\xCA</span>".force_encoding('macroman')
55
+ doc = Nokogiri::HTML5(mac)
56
+ assert_equal '<span>&#xA0;</span>', doc.at('span').to_xml
57
+ end
58
+
59
+ def test_iso8859_encoding
60
+ iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
61
+ doc = Nokogiri::HTML5(iso8859)
62
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
63
+ end
64
+
65
+ def test_charset_encoding
66
+ utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
67
+ force_encoding(Encoding::ASCII_8BIT)
68
+ doc = Nokogiri::HTML5(utf8)
69
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
70
+ end
71
+
72
+ def test_bogus_encoding
73
+ bogus="<meta charset='bogus'><span>Se\xF1or</span>".
74
+ force_encoding(Encoding::ASCII_8BIT)
75
+ doc = Nokogiri::HTML5(bogus)
76
+ assert_equal '<span>Se&#xF1;or</span>', doc.at('span').to_xml
77
+ end
78
+ end
79
+
80
+ def test_html5_doctype
81
+ doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
82
+ assert_match /<!DOCTYPE html>/, doc.to_html
83
+ end
84
+
85
+ def test_fragment_head
86
+ doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
87
+ assert_equal "hello world", doc.xpath('title').text
88
+ assert_equal "utf-8", doc.xpath('meta').first['charset']
89
+ end
90
+
91
+ def test_fragment_body
92
+ doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
93
+ assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
94
+ assert_equal " test comment ", doc.xpath('comment()').text
95
+ end
96
+
97
+ def test_xlink_attribute
98
+ source = <<-EOF.gsub(/^ {6}/, '')
99
+ <svg xmlns="http://www.w3.org/2000/svg">
100
+ <a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
101
+ </svg>
102
+ EOF
103
+ doc = Nokogiri::HTML5.fragment(source)
104
+ a = doc.at('a')
105
+ assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
106
+ end
107
+
108
+ def test_template
109
+ source = <<-EOF.gsub(/^ {6}/, '')
110
+ <template id="productrow">
111
+ <tr>
112
+ <td class="record"></td>
113
+ <td></td>
114
+ </tr>
115
+ </template>
116
+ EOF
117
+ doc = Nokogiri::HTML5.fragment(source)
118
+ template = doc.at('template')
119
+ assert_equal "productrow", template['id']
120
+ assert_equal "record", template.at('td')['class']
121
+ end
122
+
123
+ def test_root_comments
124
+ doc = Nokogiri::HTML5("<!DOCTYPE html><!-- start --><html></html><!-- -->")
125
+ assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
126
+ end
127
+
128
+ def test_parse_errors
129
+ doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
130
+ assert_equal doc.errors.length, 2
131
+ doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
132
+ assert_empty doc.errors
133
+ end
134
+
135
+ def test_max_parse_errors
136
+ # This document contains 2 parse errors, but we force limit to 1.
137
+ doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
138
+ assert_equal 1, doc.errors.length
139
+ doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
140
+ assert_empty doc.errors
141
+ end
142
+
143
+ def test_default_max_parse_errors
144
+ # This document contains 200 parse errors, but default limit is 0.
145
+ doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
146
+ assert_equal 0, doc.errors.length
147
+ end
148
+
149
+ def test_parse_fragment_errors
150
+ doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
151
+ refute_empty doc.errors
152
+ end
153
+
154
+ def test_fragment_max_parse_errors
155
+ # This fragment contains 3 parse errors, but we force limit to 1.
156
+ doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
157
+ assert_equal 1, doc.errors.length
158
+ end
159
+
160
+ def test_fragment_default_max_parse_errors
161
+ # This fragment contains 201 parse errors, but default limit is 0.
162
+ doc = Nokogiri::HTML5.fragment("</p>" * 200)
163
+ assert_equal 0, doc.errors.length
164
+ end
165
+
166
+ private
167
+
168
+ def buffer
169
+ <<-EOF.gsub(/^ /, '')
170
+ <html>
171
+ <head>
172
+ <meta charset="utf-8"/>
173
+ <title>hello world</title>
174
+ <script> if (a < b) alert(1) </script>
175
+ </head>
176
+ <body>
177
+ <h1>hello world</h1>
178
+ <main>
179
+ <span>content</span>
180
+ </main>
181
+ <!-- test comment -->
182
+ <form>
183
+ <textarea>foo<x>bar</textarea>
184
+ </form>
185
+ </body>
186
+ </html>
187
+ EOF
188
+ end
189
+
190
+ end