nokogumbo 1.4.8 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,57 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Contains the definition of the top-level GumboParser structure that's
18
+ // threaded through basically every internal function in the library.
19
+
20
+ #ifndef GUMBO_PARSER_H_
21
+ #define GUMBO_PARSER_H_
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ struct GumboInternalParserState;
28
+ struct GumboInternalOutput;
29
+ struct GumboInternalOptions;
30
+ struct GumboInternalTokenizerState;
31
+
32
// The top-level parse context. A pointer to this struct is handed to (nearly)
// every function in the library so that each one can reach the parse options,
// the output being produced, and whatever internal state the parse needs.
typedef struct GumboInternalParser {
  // Options in effect for this parse run.
  const struct GumboInternalOptions* _options;

  // Where the result of the parse is stored.
  struct GumboInternalOutput* _output;

  // Internal tokenizer state. Held by pointer so that this header does not
  // need html5tokenizer.h (which would create a cyclic dependency). The main
  // parse routine initializes it when a parse starts and destroys it when the
  // parse ends, so end-users never observe a meaningful value here.
  struct GumboInternalTokenizerState* _tokenizer_state;

  // Internal parser state. Initialized when a parse starts and destroyed when
  // it ends, so end-users never observe a meaningful value here.
  struct GumboInternalParserState* _parser_state;
} GumboParser;
52
+
53
+ #ifdef __cplusplus
54
+ }
55
+ #endif
56
+
57
+ #endif // GUMBO_PARSER_H_
@@ -0,0 +1,110 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_buffer.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "string_piece.h"
25
+ #include "util.h"
26
+
27
+ struct GumboInternalParser;
28
+
29
+ // Size chosen via statistical analysis of ~60K websites.
30
+ // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
+ static const size_t kDefaultStringBufferSize = 5;
32
+
33
+ static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
34
+ size_t additional_chars, GumboStringBuffer* buffer) {
35
+ size_t new_length = buffer->length + additional_chars;
36
+ size_t new_capacity = buffer->capacity;
37
+ while (new_capacity < new_length) {
38
+ new_capacity *= 2;
39
+ }
40
+ if (new_capacity != buffer->capacity) {
41
+ char* new_data = gumbo_parser_allocate(parser, new_capacity);
42
+ memcpy(new_data, buffer->data, buffer->length);
43
+ gumbo_parser_deallocate(parser, buffer->data);
44
+ buffer->data = new_data;
45
+ buffer->capacity = new_capacity;
46
+ }
47
+ }
48
+
49
+ void gumbo_string_buffer_init(
50
+ struct GumboInternalParser* parser, GumboStringBuffer* output) {
51
+ output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
52
+ output->length = 0;
53
+ output->capacity = kDefaultStringBufferSize;
54
+ }
55
+
56
+ void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
57
+ size_t min_capacity, GumboStringBuffer* output) {
58
+ maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
+ }
60
+
61
+ void gumbo_string_buffer_append_codepoint(
62
+ struct GumboInternalParser* parser, int c, GumboStringBuffer* output) {
63
+ // num_bytes is actually the number of continuation bytes, 1 less than the
64
+ // total number of bytes. This is done to keep the loop below simple and
65
+ // should probably change if we unroll it.
66
+ int num_bytes, prefix;
67
+ if (c <= 0x7f) {
68
+ num_bytes = 0;
69
+ prefix = 0;
70
+ } else if (c <= 0x7ff) {
71
+ num_bytes = 1;
72
+ prefix = 0xc0;
73
+ } else if (c <= 0xffff) {
74
+ num_bytes = 2;
75
+ prefix = 0xe0;
76
+ } else {
77
+ num_bytes = 3;
78
+ prefix = 0xf0;
79
+ }
80
+ maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
+ output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
+ for (int i = num_bytes - 1; i >= 0; --i) {
83
+ output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
+ }
85
+ }
86
+
87
+ void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
88
+ GumboStringPiece* str, GumboStringBuffer* output) {
89
+ maybe_resize_string_buffer(parser, str->length, output);
90
+ memcpy(output->data + output->length, str->data, str->length);
91
+ output->length += str->length;
92
+ }
93
+
94
+ char* gumbo_string_buffer_to_string(
95
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
96
+ char* buffer = gumbo_parser_allocate(parser, input->length + 1);
97
+ memcpy(buffer, input->data, input->length);
98
+ buffer[input->length] = '\0';
99
+ return buffer;
100
+ }
101
+
102
+ void gumbo_string_buffer_clear(
103
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
104
+ input->length = 0;
105
+ }
106
+
107
+ void gumbo_string_buffer_destroy(
108
+ struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
109
+ gumbo_parser_deallocate(parser, buffer->data);
110
+ }
@@ -0,0 +1,84 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ #ifndef GUMBO_STRING_BUFFER_H_
18
+ #define GUMBO_STRING_BUFFER_H_
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalParser;
30
+
31
// A mutable, growable string backed by a heap-allocated buffer that grows (by
// doubling) as necessary. Converting to a string allocates a new buffer that
// is only as long as it needs to be. Note that the internal buffer here is
// *not* nul-terminated, so be sure not to use ordinary string manipulation
// functions on it.
typedef struct {
  // A pointer to the beginning of the backing buffer. An empty buffer does
  // not imply a NULL pointer here: gumbo_string_buffer_init assigns this from
  // the parser's allocator while length is still 0.
  char* data;

  // The number of bytes of `data` currently in use. May be zero.
  size_t length;

  // The size of the backing buffer, in bytes.
  size_t capacity;
} GumboStringBuffer;
46
+
47
+ // Initializes a new GumboStringBuffer.
48
+ void gumbo_string_buffer_init(
49
+ struct GumboInternalParser* parser, GumboStringBuffer* output);
50
+
51
+ // Ensures that the buffer contains at least a certain amount of space. Most
52
+ // useful with snprintf and the other length-delimited string functions, which
53
+ // may want to write directly into the buffer.
54
+ void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
55
+ size_t min_capacity, GumboStringBuffer* output);
56
+
57
+ // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
58
+ // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
59
+ // value of the codepoint.
60
+ void gumbo_string_buffer_append_codepoint(
61
+ struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
62
+
63
+ // Appends a string onto the end of the GumboStringBuffer.
64
+ void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
65
+ GumboStringPiece* str, GumboStringBuffer* output);
66
+
67
+ // Converts this string buffer to a NUL-terminated char*, allocating a new buffer for it.
68
+ char* gumbo_string_buffer_to_string(
69
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
70
+
71
+ // Reinitialize this string buffer. This clears it by setting length=0. It
72
+ // does not zero out the buffer itself.
73
+ void gumbo_string_buffer_clear(
74
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
75
+
76
+ // Deallocates this GumboStringBuffer.
77
+ void gumbo_string_buffer_destroy(
78
+ struct GumboInternalParser* parser, GumboStringBuffer* buffer);
79
+
80
+ #ifdef __cplusplus
81
+ }
82
+ #endif
83
+
84
+ #endif // GUMBO_STRING_BUFFER_H_
@@ -0,0 +1,48 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_piece.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "util.h"
25
+
26
+ struct GumboInternalParser;
27
+
28
+ const GumboStringPiece kGumboEmptyString = {NULL, 0};
29
+
30
+ bool gumbo_string_equals(
31
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
+ return str1->length == str2->length &&
33
+ !memcmp(str1->data, str2->data, str1->length);
34
+ }
35
+
36
+ bool gumbo_string_equals_ignore_case(
37
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
+ return str1->length == str2->length &&
39
+ !strncasecmp(str1->data, str2->data, str1->length);
40
+ }
41
+
42
+ void gumbo_string_copy(struct GumboInternalParser* parser,
43
+ GumboStringPiece* dest, const GumboStringPiece* source) {
44
+ dest->length = source->length;
45
+ char* buffer = gumbo_parser_allocate(parser, source->length);
46
+ memcpy(buffer, source->data, source->length);
47
+ dest->data = buffer;
48
+ }
@@ -0,0 +1,38 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_STRING_PIECE_H_
18
+ #define GUMBO_STRING_PIECE_H_
19
+
20
+ #include "gumbo.h"
21
+
22
+ #ifdef __cplusplus
23
+ extern "C" {
24
+ #endif
25
+
26
+ struct GumboInternalParser;
27
+
28
+ // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
+ // destination and copying over the characters from source. Dest should be
30
+ // empty, with no buffer allocated; otherwise, this leaks it.
31
+ void gumbo_string_copy(struct GumboInternalParser* parser,
32
+ GumboStringPiece* dest, const GumboStringPiece* source);
33
+
34
+ #ifdef __cplusplus
35
+ }
36
+ #endif
37
+
38
+ #endif // GUMBO_STRING_PIECE_H_
@@ -0,0 +1,95 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "gumbo.h"
18
+
19
+ #include <assert.h>
20
+ #include <ctype.h>
21
+ #include <string.h>
22
+
23
+ const char* kGumboTagNames[] = {
24
+ #include "tag_strings.h"
25
+ "", // TAG_UNKNOWN
26
+ "", // TAG_LAST
27
+ };
28
+
29
+ static const unsigned char kGumboTagSizes[] = {
30
+ #include "tag_sizes.h"
31
+ 0, // TAG_UNKNOWN
32
+ 0, // TAG_LAST
33
+ };
34
+
35
+ const char* gumbo_normalized_tagname(GumboTag tag) {
36
+ assert(tag <= GUMBO_TAG_LAST);
37
+ return kGumboTagNames[tag];
38
+ }
39
+
40
+ void gumbo_tag_from_original_text(GumboStringPiece* text) {
41
+ if (text->data == NULL) {
42
+ return;
43
+ }
44
+
45
+ assert(text->length >= 2);
46
+ assert(text->data[0] == '<');
47
+ assert(text->data[text->length - 1] == '>');
48
+ if (text->data[1] == '/') {
49
+ // End tag.
50
+ assert(text->length >= 3);
51
+ text->data += 2; // Move past </
52
+ text->length -= 3;
53
+ } else {
54
+ // Start tag.
55
+ text->data += 1; // Move past <
56
+ text->length -= 2;
57
+ // strnchr is apparently not a standard C library function, so I loop
58
+ // explicitly looking for whitespace or other illegal tag characters.
59
+ for (const char* c = text->data; c != text->data + text->length; ++c) {
60
+ if (isspace(*c) || *c == '/') {
61
+ text->length = c - text->data;
62
+ break;
63
+ }
64
+ }
65
+ }
66
+ }
67
+
68
// Case-insensitive comparison of the first `n` bytes of `s1` and `s2`.
// Returns 0 when they match after lowercasing each byte with tolower();
// otherwise returns the difference between the first pair of lowercased
// bytes that differ (negative if s1 sorts first, positive otherwise).
static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
  while (n--) {
    // tolower() is only defined for EOF and values representable as unsigned
    // char, so convert before the call: a plain char may be negative.
    unsigned char c1 = (unsigned char) tolower((unsigned char) *s1++);
    unsigned char c2 = (unsigned char) tolower((unsigned char) *s2++);
    if (c1 != c2) {
      return (int) c1 - (int) c2;
    }
  }
  return 0;
}
76
+
77
+ #include "tag_gperf.h"
78
+ #define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
79
+
80
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
81
+ if (length) {
82
+ unsigned int key = tag_hash(tagname, length);
83
+ if (key < TAG_MAP_SIZE) {
84
+ GumboTag tag = kGumboTagMap[key];
85
+ if (length == kGumboTagSizes[(int) tag] &&
86
+ !case_memcmp(tagname, kGumboTagNames[(int) tag], length))
87
+ return tag;
88
+ }
89
+ }
90
+ return GUMBO_TAG_UNKNOWN;
91
+ }
92
+
93
+ GumboTag gumbo_tag_enum(const char* tagname) {
94
+ return gumbo_tagn_enum(tagname, strlen(tagname));
95
+ }
@@ -0,0 +1,150 @@
1
+ html
2
+ head
3
+ title
4
+ base
5
+ link
6
+ meta
7
+ style
8
+ script
9
+ noscript
10
+ template
11
+ body
12
+ article
13
+ section
14
+ nav
15
+ aside
16
+ h1
17
+ h2
18
+ h3
19
+ h4
20
+ h5
21
+ h6
22
+ hgroup
23
+ header
24
+ footer
25
+ address
26
+ p
27
+ hr
28
+ pre
29
+ blockquote
30
+ ol
31
+ ul
32
+ li
33
+ dl
34
+ dt
35
+ dd
36
+ figure
37
+ figcaption
38
+ main
39
+ div
40
+ a
41
+ em
42
+ strong
43
+ small
44
+ s
45
+ cite
46
+ q
47
+ dfn
48
+ abbr
49
+ data
50
+ time
51
+ code
52
+ var
53
+ samp
54
+ kbd
55
+ sub
56
+ sup
57
+ i
58
+ b
59
+ u
60
+ mark
61
+ ruby
62
+ rt
63
+ rp
64
+ bdi
65
+ bdo
66
+ span
67
+ br
68
+ wbr
69
+ ins
70
+ del
71
+ image
72
+ img
73
+ iframe
74
+ embed
75
+ object
76
+ param
77
+ video
78
+ audio
79
+ source
80
+ track
81
+ canvas
82
+ map
83
+ area
84
+ math
85
+ mi
86
+ mo
87
+ mn
88
+ ms
89
+ mtext
90
+ mglyph
91
+ malignmark
92
+ annotation-xml
93
+ svg
94
+ foreignobject
95
+ desc
96
+ table
97
+ caption
98
+ colgroup
99
+ col
100
+ tbody
101
+ thead
102
+ tfoot
103
+ tr
104
+ td
105
+ th
106
+ form
107
+ fieldset
108
+ legend
109
+ label
110
+ input
111
+ button
112
+ select
113
+ datalist
114
+ optgroup
115
+ option
116
+ textarea
117
+ keygen
118
+ output
119
+ progress
120
+ meter
121
+ details
122
+ summary
123
+ menu
124
+ menuitem
125
+ applet
126
+ acronym
127
+ bgsound
128
+ dir
129
+ frame
130
+ frameset
131
+ noframes
132
+ isindex
133
+ listing
134
+ xmp
135
+ nextid
136
+ noembed
137
+ plaintext
138
+ rb
139
+ strike
140
+ basefont
141
+ big
142
+ blink
143
+ center
144
+ font
145
+ marquee
146
+ multicol
147
+ nobr
148
+ spacer
149
+ tt
150
+ rtc