nokogumbo 1.4.7 → 1.4.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,57 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Contains the definition of the top-level GumboParser structure that's
18
- // threaded through basically every internal function in the library.
19
-
20
- #ifndef GUMBO_PARSER_H_
21
- #define GUMBO_PARSER_H_
22
-
23
- #ifdef __cplusplus
24
- extern "C" {
25
- #endif
26
-
27
- struct GumboInternalParserState;
28
- struct GumboInternalOutput;
29
- struct GumboInternalOptions;
30
- struct GumboInternalTokenizerState;
31
-
32
- // An overarching struct that's threaded through (nearly) all functions in the
33
- // library, OOP-style. This gives each function access to the options and
34
- // output, along with any internal state needed for the parse.
35
- typedef struct GumboInternalParser {
36
- // Settings for this parse run.
37
- const struct GumboInternalOptions* _options;
38
-
39
- // Output for the parse.
40
- struct GumboInternalOutput* _output;
41
-
42
- // The internal tokenizer state, defined as a pointer to avoid a cyclic
43
- // dependency on html5tokenizer.h. The main parse routine is responsible for
44
- // initializing this on parse start, and destroying it on parse end.
45
- // End-users will never see a non-garbage value in this pointer.
46
- struct GumboInternalTokenizerState* _tokenizer_state;
47
-
48
- // The internal parser state. Initialized on parse start and destroyed on
49
- // parse end; end-users will never see a non-garbage value in this pointer.
50
- struct GumboInternalParserState* _parser_state;
51
- } GumboParser;
52
-
53
- #ifdef __cplusplus
54
- }
55
- #endif
56
-
57
- #endif // GUMBO_PARSER_H_
@@ -1,110 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_buffer.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "string_piece.h"
25
- #include "util.h"
26
-
27
- struct GumboInternalParser;
28
-
29
- // Size chosen via statistical analysis of ~60K websites.
30
- // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
- static const size_t kDefaultStringBufferSize = 5;
32
-
33
- static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
34
- size_t additional_chars, GumboStringBuffer* buffer) {
35
- size_t new_length = buffer->length + additional_chars;
36
- size_t new_capacity = buffer->capacity;
37
- while (new_capacity < new_length) {
38
- new_capacity *= 2;
39
- }
40
- if (new_capacity != buffer->capacity) {
41
- char* new_data = gumbo_parser_allocate(parser, new_capacity);
42
- memcpy(new_data, buffer->data, buffer->length);
43
- gumbo_parser_deallocate(parser, buffer->data);
44
- buffer->data = new_data;
45
- buffer->capacity = new_capacity;
46
- }
47
- }
48
-
49
- void gumbo_string_buffer_init(
50
- struct GumboInternalParser* parser, GumboStringBuffer* output) {
51
- output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
52
- output->length = 0;
53
- output->capacity = kDefaultStringBufferSize;
54
- }
55
-
56
- void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
57
- size_t min_capacity, GumboStringBuffer* output) {
58
- maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
- }
60
-
61
- void gumbo_string_buffer_append_codepoint(
62
- struct GumboInternalParser* parser, int c, GumboStringBuffer* output) {
63
- // num_bytes is actually the number of continuation bytes, 1 less than the
64
- // total number of bytes. This is done to keep the loop below simple and
65
- // should probably change if we unroll it.
66
- int num_bytes, prefix;
67
- if (c <= 0x7f) {
68
- num_bytes = 0;
69
- prefix = 0;
70
- } else if (c <= 0x7ff) {
71
- num_bytes = 1;
72
- prefix = 0xc0;
73
- } else if (c <= 0xffff) {
74
- num_bytes = 2;
75
- prefix = 0xe0;
76
- } else {
77
- num_bytes = 3;
78
- prefix = 0xf0;
79
- }
80
- maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
- output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
- for (int i = num_bytes - 1; i >= 0; --i) {
83
- output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
- }
85
- }
86
-
87
- void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
88
- GumboStringPiece* str, GumboStringBuffer* output) {
89
- maybe_resize_string_buffer(parser, str->length, output);
90
- memcpy(output->data + output->length, str->data, str->length);
91
- output->length += str->length;
92
- }
93
-
94
- char* gumbo_string_buffer_to_string(
95
- struct GumboInternalParser* parser, GumboStringBuffer* input) {
96
- char* buffer = gumbo_parser_allocate(parser, input->length + 1);
97
- memcpy(buffer, input->data, input->length);
98
- buffer[input->length] = '\0';
99
- return buffer;
100
- }
101
-
102
- void gumbo_string_buffer_clear(
103
- struct GumboInternalParser* parser, GumboStringBuffer* input) {
104
- input->length = 0;
105
- }
106
-
107
- void gumbo_string_buffer_destroy(
108
- struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
109
- gumbo_parser_deallocate(parser, buffer->data);
110
- }
@@ -1,84 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- #ifndef GUMBO_STRING_BUFFER_H_
18
- #define GUMBO_STRING_BUFFER_H_
19
-
20
- #include <stdbool.h>
21
- #include <stddef.h>
22
-
23
- #include "gumbo.h"
24
-
25
- #ifdef __cplusplus
26
- extern "C" {
27
- #endif
28
-
29
- struct GumboInternalParser;
30
-
31
- // A struct representing a mutable, growable string. This consists of a
32
- // heap-allocated buffer that may grow (by doubling) as necessary. When
33
- // converting to a string, this allocates a new buffer that is only as long as
34
- // it needs to be. Note that the internal buffer here is *not* nul-terminated,
35
- // so be sure not to use ordinary string manipulation functions on it.
36
- typedef struct {
37
- // A pointer to the beginning of the buffer. gumbo_string_buffer_init allocates it, so it may be non-NULL even when length == 0.
38
- char* data;
39
-
40
- // The length of the string fragment, in bytes. May be zero.
41
- size_t length;
42
-
43
- // The capacity of the buffer, in bytes.
44
- size_t capacity;
45
- } GumboStringBuffer;
46
-
47
- // Initializes a new GumboStringBuffer.
48
- void gumbo_string_buffer_init(
49
- struct GumboInternalParser* parser, GumboStringBuffer* output);
50
-
51
- // Ensures that the buffer contains at least a certain amount of space. Most
52
- // useful with snprintf and the other length-delimited string functions, which
53
- // may want to write directly into the buffer.
54
- void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
55
- size_t min_capacity, GumboStringBuffer* output);
56
-
57
- // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
58
- // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
59
- // value of the codepoint.
60
- void gumbo_string_buffer_append_codepoint(
61
- struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
62
-
63
- // Appends a string onto the end of the GumboStringBuffer.
64
- void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
65
- GumboStringPiece* str, GumboStringBuffer* output);
66
-
67
- // Converts this string buffer to a nul-terminated char*, allocating a new buffer for it.
68
- char* gumbo_string_buffer_to_string(
69
- struct GumboInternalParser* parser, GumboStringBuffer* input);
70
-
71
- // Reinitialize this string buffer. This clears it by setting length=0. It
72
- // does not zero out the buffer itself.
73
- void gumbo_string_buffer_clear(
74
- struct GumboInternalParser* parser, GumboStringBuffer* input);
75
-
76
- // Deallocates this GumboStringBuffer.
77
- void gumbo_string_buffer_destroy(
78
- struct GumboInternalParser* parser, GumboStringBuffer* buffer);
79
-
80
- #ifdef __cplusplus
81
- }
82
- #endif
83
-
84
- #endif // GUMBO_STRING_BUFFER_H_
@@ -1,48 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_piece.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "util.h"
25
-
26
- struct GumboInternalParser;
27
-
28
- const GumboStringPiece kGumboEmptyString = {NULL, 0};
29
-
30
- bool gumbo_string_equals(
31
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
- return str1->length == str2->length &&
33
- !memcmp(str1->data, str2->data, str1->length);
34
- }
35
-
36
- bool gumbo_string_equals_ignore_case(
37
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
- return str1->length == str2->length &&
39
- !strncasecmp(str1->data, str2->data, str1->length);
40
- }
41
-
42
- void gumbo_string_copy(struct GumboInternalParser* parser,
43
- GumboStringPiece* dest, const GumboStringPiece* source) {
44
- dest->length = source->length;
45
- char* buffer = gumbo_parser_allocate(parser, source->length);
46
- memcpy(buffer, source->data, source->length);
47
- dest->data = buffer;
48
- }
@@ -1,38 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_STRING_PIECE_H_
18
- #define GUMBO_STRING_PIECE_H_
19
-
20
- #include "gumbo.h"
21
-
22
- #ifdef __cplusplus
23
- extern "C" {
24
- #endif
25
-
26
- struct GumboInternalParser;
27
-
28
- // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
- // destination and copying over the characters from source. Dest should be
30
- // empty, with no buffer allocated; otherwise, this leaks it.
31
- void gumbo_string_copy(struct GumboInternalParser* parser,
32
- GumboStringPiece* dest, const GumboStringPiece* source);
33
-
34
- #ifdef __cplusplus
35
- }
36
- #endif
37
-
38
- #endif // GUMBO_STRING_PIECE_H_
@@ -1,95 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "gumbo.h"
18
-
19
- #include <assert.h>
20
- #include <ctype.h>
21
- #include <string.h>
22
-
23
- const char* kGumboTagNames[] = {
24
- #include "tag_strings.h"
25
- "", // TAG_UNKNOWN
26
- "", // TAG_LAST
27
- };
28
-
29
- static const unsigned char kGumboTagSizes[] = {
30
- #include "tag_sizes.h"
31
- 0, // TAG_UNKNOWN
32
- 0, // TAG_LAST
33
- };
34
-
35
- const char* gumbo_normalized_tagname(GumboTag tag) {
36
- assert(tag <= GUMBO_TAG_LAST);
37
- return kGumboTagNames[tag];
38
- }
39
-
40
- void gumbo_tag_from_original_text(GumboStringPiece* text) {
41
- if (text->data == NULL) {
42
- return;
43
- }
44
-
45
- assert(text->length >= 2);
46
- assert(text->data[0] == '<');
47
- assert(text->data[text->length - 1] == '>');
48
- if (text->data[1] == '/') {
49
- // End tag.
50
- assert(text->length >= 3);
51
- text->data += 2; // Move past </
52
- text->length -= 3;
53
- } else {
54
- // Start tag.
55
- text->data += 1; // Move past <
56
- text->length -= 2;
57
- // strnchr is apparently not a standard C library function, so I loop
58
- // explicitly looking for whitespace or other illegal tag characters.
59
- for (const char* c = text->data; c != text->data + text->length; ++c) {
60
- if (isspace(*c) || *c == '/') {
61
- text->length = c - text->data;
62
- break;
63
- }
64
- }
65
- }
66
- }
67
-
68
// Compares the first n bytes of s1 and s2, ignoring case as defined by
// tolower. Returns zero when they match; otherwise returns the difference
// between the first pair of lowercased bytes that differ (negative when s1
// sorts first, positive when s2 does).
static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
  while (n--) {
    // Convert through unsigned char before calling tolower: passing a
    // negative plain char (any byte >= 0x80 where char is signed) is
    // undefined behavior for <ctype.h> functions.
    unsigned char c1 = (unsigned char) tolower((unsigned char) *s1++);
    unsigned char c2 = (unsigned char) tolower((unsigned char) *s2++);
    if (c1 != c2) {
      return (int) c1 - (int) c2;
    }
  }
  return 0;
}
76
-
77
- #include "tag_gperf.h"
78
- #define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
79
-
80
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
81
- if (length) {
82
- unsigned int key = tag_hash(tagname, length);
83
- if (key < TAG_MAP_SIZE) {
84
- GumboTag tag = kGumboTagMap[key];
85
- if (length == kGumboTagSizes[(int) tag] &&
86
- !case_memcmp(tagname, kGumboTagNames[(int) tag], length))
87
- return tag;
88
- }
89
- }
90
- return GUMBO_TAG_UNKNOWN;
91
- }
92
-
93
- GumboTag gumbo_tag_enum(const char* tagname) {
94
- return gumbo_tagn_enum(tagname, strlen(tagname));
95
- }
@@ -1,150 +0,0 @@
1
- html
2
- head
3
- title
4
- base
5
- link
6
- meta
7
- style
8
- script
9
- noscript
10
- template
11
- body
12
- article
13
- section
14
- nav
15
- aside
16
- h1
17
- h2
18
- h3
19
- h4
20
- h5
21
- h6
22
- hgroup
23
- header
24
- footer
25
- address
26
- p
27
- hr
28
- pre
29
- blockquote
30
- ol
31
- ul
32
- li
33
- dl
34
- dt
35
- dd
36
- figure
37
- figcaption
38
- main
39
- div
40
- a
41
- em
42
- strong
43
- small
44
- s
45
- cite
46
- q
47
- dfn
48
- abbr
49
- data
50
- time
51
- code
52
- var
53
- samp
54
- kbd
55
- sub
56
- sup
57
- i
58
- b
59
- u
60
- mark
61
- ruby
62
- rt
63
- rp
64
- bdi
65
- bdo
66
- span
67
- br
68
- wbr
69
- ins
70
- del
71
- image
72
- img
73
- iframe
74
- embed
75
- object
76
- param
77
- video
78
- audio
79
- source
80
- track
81
- canvas
82
- map
83
- area
84
- math
85
- mi
86
- mo
87
- mn
88
- ms
89
- mtext
90
- mglyph
91
- malignmark
92
- annotation-xml
93
- svg
94
- foreignobject
95
- desc
96
- table
97
- caption
98
- colgroup
99
- col
100
- tbody
101
- thead
102
- tfoot
103
- tr
104
- td
105
- th
106
- form
107
- fieldset
108
- legend
109
- label
110
- input
111
- button
112
- select
113
- datalist
114
- optgroup
115
- option
116
- textarea
117
- keygen
118
- output
119
- progress
120
- meter
121
- details
122
- summary
123
- menu
124
- menuitem
125
- applet
126
- acronym
127
- bgsound
128
- dir
129
- frame
130
- frameset
131
- noframes
132
- isindex
133
- listing
134
- xmp
135
- nextid
136
- noembed
137
- plaintext
138
- rb
139
- strike
140
- basefont
141
- big
142
- blink
143
- center
144
- font
145
- marquee
146
- multicol
147
- nobr
148
- spacer
149
- tt
150
- rtc