nokogumbo 1.4.2 → 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +50 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1397 -989
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/tokenizer.c +264 -360
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +44 -44
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- metadata +8 -3
@@ -64,8 +64,8 @@ typedef struct GumboInternalToken {
|
|
64
64
|
GumboTokenDocType doc_type;
|
65
65
|
GumboTokenStartTag start_tag;
|
66
66
|
GumboTag end_tag;
|
67
|
-
const char* text;
|
68
|
-
int character;
|
67
|
+
const char* text; // For comments.
|
68
|
+
int character; // For character, whitespace, null, and EOF tokens.
|
69
69
|
} v;
|
70
70
|
} GumboToken;
|
71
71
|
|
data/gumbo-parser/src/utf8.c
CHANGED
@@ -19,7 +19,7 @@
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <stdint.h>
|
21
21
|
#include <string.h>
|
22
|
-
#include <strings.h>
|
22
|
+
#include <strings.h> // For strncasecmp.
|
23
23
|
|
24
24
|
#include "error.h"
|
25
25
|
#include "gumbo.h"
|
@@ -47,9 +47,11 @@ const int kUtf8ReplacementChar = 0xFFFD;
|
|
47
47
|
|
48
48
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
49
49
|
// of this software and associated documentation files (the "Software"), to deal
|
50
|
-
// in the Software without restriction, including without limitation the rights
|
50
|
+
// in the Software without restriction, including without limitation the rights
|
51
|
+
// to
|
51
52
|
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
52
|
-
// of the Software, and to permit persons to whom the Software is furnished to
|
53
|
+
// of the Software, and to permit persons to whom the Software is furnished to
|
54
|
+
// do
|
53
55
|
// so, subject to the following conditions:
|
54
56
|
|
55
57
|
// The above copyright notice and this permission notice shall be included in
|
@@ -59,32 +61,35 @@ const int kUtf8ReplacementChar = 0xFFFD;
|
|
59
61
|
#define UTF8_REJECT 12
|
60
62
|
|
61
63
|
static const uint8_t utf8d[] = {
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
64
|
+
// The first part of the table maps bytes to character classes
|
65
|
+
// to reduce the size of the transition table and create bitmasks.
|
66
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
67
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
68
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
69
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
70
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
71
|
+
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9,
|
72
|
+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
|
73
|
+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2,
|
74
|
+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10,
|
75
|
+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8,
|
76
|
+
8, 8, 8, 8, 8, 8,
|
77
|
+
|
78
|
+
// The second part is a transition table that maps a combination
|
79
|
+
// of a state of the automaton and a character class to a state.
|
80
|
+
0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12,
|
81
|
+
12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12,
|
82
|
+
12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
|
83
|
+
12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
|
84
|
+
12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
|
85
|
+
12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
80
86
|
};
|
81
87
|
|
82
88
|
uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
83
89
|
uint32_t type = utf8d[byte];
|
84
90
|
|
85
|
-
*codep = (*state != UTF8_ACCEPT) ?
|
86
|
-
|
87
|
-
(0xff >> type) & (byte);
|
91
|
+
*codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
|
92
|
+
: (0xff >> type) & (byte);
|
88
93
|
|
89
94
|
*state = utf8d[256 + *state + type];
|
90
95
|
return *state;
|
@@ -130,7 +135,7 @@ static void read_char(Utf8Iterator* iter) {
|
|
130
135
|
uint32_t code_point = 0;
|
131
136
|
uint32_t state = UTF8_ACCEPT;
|
132
137
|
for (const char* c = iter->_start; c < iter->_end; ++c) {
|
133
|
-
decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
|
138
|
+
decode(&state, &code_point, (uint32_t)(unsigned char) (*c));
|
134
139
|
if (state == UTF8_ACCEPT) {
|
135
140
|
iter->_width = c - iter->_start + 1;
|
136
141
|
// This is the special handling for carriage returns that is mandated by
|
@@ -180,10 +185,10 @@ static void update_position(Utf8Iterator* iter) {
|
|
180
185
|
if (iter->_current == '\n') {
|
181
186
|
++iter->_pos.line;
|
182
187
|
iter->_pos.column = 1;
|
183
|
-
} else if(iter->_current == '\t') {
|
188
|
+
} else if (iter->_current == '\t') {
|
184
189
|
int tab_stop = iter->_parser->_options->tab_stop;
|
185
190
|
iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
|
186
|
-
} else if(iter->_current != -1) {
|
191
|
+
} else if (iter->_current != -1) {
|
187
192
|
++iter->_pos.column;
|
188
193
|
}
|
189
194
|
}
|
@@ -192,13 +197,12 @@ static void update_position(Utf8Iterator* iter) {
|
|
192
197
|
// forbidden by the HTML5 spec, such as undefined control chars.
|
193
198
|
bool utf8_is_invalid_code_point(int c) {
|
194
199
|
return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) ||
|
195
|
-
|
196
|
-
|
200
|
+
(c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
|
201
|
+
((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
|
197
202
|
}
|
198
203
|
|
199
|
-
void utf8iterator_init(
|
200
|
-
|
201
|
-
Utf8Iterator* iter) {
|
204
|
+
void utf8iterator_init(GumboParser* parser, const char* source,
|
205
|
+
size_t source_length, Utf8Iterator* iter) {
|
202
206
|
iter->_start = source;
|
203
207
|
iter->_end = source + source_length;
|
204
208
|
iter->_pos.line = 1;
|
@@ -216,9 +220,7 @@ void utf8iterator_next(Utf8Iterator* iter) {
|
|
216
220
|
read_char(iter);
|
217
221
|
}
|
218
222
|
|
219
|
-
int utf8iterator_current(const Utf8Iterator* iter) {
|
220
|
-
return iter->_current;
|
221
|
-
}
|
223
|
+
int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
|
222
224
|
|
223
225
|
void utf8iterator_get_position(
|
224
226
|
const Utf8Iterator* iter, GumboSourcePosition* output) {
|
@@ -233,14 +235,13 @@ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
|
233
235
|
return iter->_end;
|
234
236
|
}
|
235
237
|
|
236
|
-
bool utf8iterator_maybe_consume_match(
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
!strncasecmp(iter->_start, prefix, length));
|
238
|
+
bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
|
239
|
+
size_t length, bool case_sensitive) {
|
240
|
+
bool matched = (iter->_start + length <= iter->_end) &&
|
241
|
+
(case_sensitive ? !strncmp(iter->_start, prefix, length)
|
242
|
+
: !strncasecmp(iter->_start, prefix, length));
|
242
243
|
if (matched) {
|
243
|
-
for (int i = 0; i < length; ++i) {
|
244
|
+
for (unsigned int i = 0; i < length; ++i) {
|
244
245
|
utf8iterator_next(iter);
|
245
246
|
}
|
246
247
|
return true;
|
@@ -263,8 +264,7 @@ void utf8iterator_reset(Utf8Iterator* iter) {
|
|
263
264
|
|
264
265
|
// Sets the position and original text fields of an error to the value at the
|
265
266
|
// mark.
|
266
|
-
void utf8iterator_fill_error_at_mark(
|
267
|
-
Utf8Iterator* iter, GumboError* error) {
|
267
|
+
void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
|
268
268
|
error->position = iter->_mark_pos;
|
269
269
|
error->original_text = iter->_mark;
|
270
270
|
}
|
data/gumbo-parser/src/utf8.h
CHANGED
@@ -81,8 +81,7 @@ bool utf8_is_invalid_code_point(int c);
|
|
81
81
|
|
82
82
|
// Initializes a new Utf8Iterator from the given byte buffer. The source does
|
83
83
|
// not have to be NUL-terminated, but the length must be passed in explicitly.
|
84
|
-
void utf8iterator_init(
|
85
|
-
struct GumboInternalParser* parser, const char* source,
|
84
|
+
void utf8iterator_init(struct GumboInternalParser* parser, const char* source,
|
86
85
|
size_t source_length, Utf8Iterator* iter);
|
87
86
|
|
88
87
|
// Advances the current position by one code point.
|
data/gumbo-parser/src/util.c
CHANGED
@@ -29,7 +29,7 @@
|
|
29
29
|
// TODO(jdtang): This should be elsewhere, but there's no .c file for
|
30
30
|
// SourcePositions and yet the constant needs some linkage, so this is as good
|
31
31
|
// as any.
|
32
|
-
const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 };
|
32
|
+
const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0};
|
33
33
|
|
34
34
|
void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
|
35
35
|
return parser->_options->allocator(parser->_options->userdata, num_bytes);
|
data/gumbo-parser/src/util.h
CHANGED
data/gumbo-parser/src/vector.c
CHANGED
@@ -25,21 +25,22 @@
|
|
25
25
|
|
26
26
|
struct GumboInternalParser;
|
27
27
|
|
28
|
-
const GumboVector kGumboEmptyVector = { NULL, 0, 0 };
|
28
|
+
const GumboVector kGumboEmptyVector = {NULL, 0, 0};
|
29
29
|
|
30
|
-
void gumbo_vector_init(
|
31
|
-
|
30
|
+
void gumbo_vector_init(struct GumboInternalParser* parser,
|
31
|
+
size_t initial_capacity, GumboVector* vector) {
|
32
32
|
vector->length = 0;
|
33
33
|
vector->capacity = initial_capacity;
|
34
34
|
if (initial_capacity > 0) {
|
35
|
-
vector->data = gumbo_parser_allocate(
|
36
|
-
parser, sizeof(void*) * initial_capacity);
|
35
|
+
vector->data =
|
36
|
+
gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
|
37
37
|
} else {
|
38
38
|
vector->data = NULL;
|
39
39
|
}
|
40
40
|
}
|
41
41
|
|
42
|
-
void gumbo_vector_destroy(
|
42
|
+
void gumbo_vector_destroy(
|
43
|
+
struct GumboInternalParser* parser, GumboVector* vector) {
|
43
44
|
if (vector->capacity > 0) {
|
44
45
|
gumbo_parser_deallocate(parser, vector->data);
|
45
46
|
}
|
@@ -59,8 +60,8 @@ static void enlarge_vector_if_full(
|
|
59
60
|
} else {
|
60
61
|
// 0-capacity vector; no previous array to deallocate.
|
61
62
|
vector->capacity = 2;
|
62
|
-
vector->data = gumbo_parser_allocate(
|
63
|
-
parser, sizeof(void*) * vector->capacity);
|
63
|
+
vector->data =
|
64
|
+
gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
|
64
65
|
}
|
65
66
|
}
|
66
67
|
}
|
@@ -81,8 +82,8 @@ void* gumbo_vector_pop(
|
|
81
82
|
return vector->data[--vector->length];
|
82
83
|
}
|
83
84
|
|
84
|
-
int gumbo_vector_index_of(GumboVector* vector, void* element) {
|
85
|
-
for (int i = 0; i < vector->length; ++i) {
|
85
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element) {
|
86
|
+
for (unsigned int i = 0; i < vector->length; ++i) {
|
86
87
|
if (vector->data[i] == element) {
|
87
88
|
return i;
|
88
89
|
}
|
@@ -90,15 +91,14 @@ int gumbo_vector_index_of(GumboVector* vector, void* element) {
|
|
90
91
|
return -1;
|
91
92
|
}
|
92
93
|
|
93
|
-
void gumbo_vector_insert_at(
|
94
|
-
|
95
|
-
GumboVector* vector) {
|
94
|
+
void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
|
95
|
+
unsigned int index, GumboVector* vector) {
|
96
96
|
assert(index >= 0);
|
97
97
|
assert(index <= vector->length);
|
98
98
|
enlarge_vector_if_full(parser, vector);
|
99
99
|
++vector->length;
|
100
100
|
memmove(&vector->data[index + 1], &vector->data[index],
|
101
|
-
|
101
|
+
sizeof(void*) * (vector->length - index - 1));
|
102
102
|
vector->data[index] = element;
|
103
103
|
}
|
104
104
|
|
@@ -111,13 +111,13 @@ void gumbo_vector_remove(
|
|
111
111
|
gumbo_vector_remove_at(parser, index, vector);
|
112
112
|
}
|
113
113
|
|
114
|
-
void* gumbo_vector_remove_at(
|
115
|
-
|
114
|
+
void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
|
115
|
+
unsigned int index, GumboVector* vector) {
|
116
116
|
assert(index >= 0);
|
117
117
|
assert(index < vector->length);
|
118
118
|
void* result = vector->data[index];
|
119
119
|
memmove(&vector->data[index], &vector->data[index + 1],
|
120
|
-
|
120
|
+
sizeof(void*) * (vector->length - index - 1));
|
121
121
|
--vector->length;
|
122
122
|
return result;
|
123
123
|
}
|
data/gumbo-parser/src/vector.h
CHANGED
@@ -28,9 +28,8 @@ extern "C" {
|
|
28
28
|
struct GumboInternalParser;
|
29
29
|
|
30
30
|
// Initializes a new GumboVector with the specified initial capacity.
|
31
|
-
void gumbo_vector_init(
|
32
|
-
|
33
|
-
GumboVector* vector);
|
31
|
+
void gumbo_vector_init(struct GumboInternalParser* parser,
|
32
|
+
size_t initial_capacity, GumboVector* vector);
|
34
33
|
|
35
34
|
// Frees the memory used by an GumboVector. Does not free the contained
|
36
35
|
// pointers.
|
@@ -48,9 +47,8 @@ void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
|
|
48
47
|
|
49
48
|
// Inserts an element at a specific index. This is potentially O(N) time, but
|
50
49
|
// is necessary for some of the spec's behavior.
|
51
|
-
void gumbo_vector_insert_at(
|
52
|
-
|
53
|
-
GumboVector* vector);
|
50
|
+
void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
|
51
|
+
unsigned int index, GumboVector* vector);
|
54
52
|
|
55
53
|
// Removes an element from the vector, or does nothing if the element is not in
|
56
54
|
// the vector.
|
@@ -59,8 +57,8 @@ void gumbo_vector_remove(
|
|
59
57
|
|
60
58
|
// Removes and returns an element at a specific index. Note that this is
|
61
59
|
// potentially O(N) time and should be used sparingly.
|
62
|
-
void* gumbo_vector_remove_at(
|
63
|
-
|
60
|
+
void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
|
61
|
+
unsigned int index, GumboVector* vector);
|
64
62
|
|
65
63
|
#ifdef __cplusplus
|
66
64
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.2
|
4
|
+
version: 1.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -52,6 +52,11 @@ files:
|
|
52
52
|
- gumbo-parser/src/string_piece.c
|
53
53
|
- gumbo-parser/src/string_piece.h
|
54
54
|
- gumbo-parser/src/tag.c
|
55
|
+
- gumbo-parser/src/tag.in
|
56
|
+
- gumbo-parser/src/tag_enum.h
|
57
|
+
- gumbo-parser/src/tag_gperf.h
|
58
|
+
- gumbo-parser/src/tag_sizes.h
|
59
|
+
- gumbo-parser/src/tag_strings.h
|
55
60
|
- gumbo-parser/src/token_type.h
|
56
61
|
- gumbo-parser/src/tokenizer.c
|
57
62
|
- gumbo-parser/src/tokenizer.h
|
@@ -85,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
90
|
version: '0'
|
86
91
|
requirements: []
|
87
92
|
rubyforge_project:
|
88
|
-
rubygems_version: 2.4.5
|
93
|
+
rubygems_version: 2.4.5.1
|
89
94
|
signing_key:
|
90
95
|
specification_version: 4
|
91
96
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|