nokogumbo 1.4.7 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,57 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Contains the definition of the top-level GumboParser structure that's
18
- // threaded through basically every internal function in the library.
19
-
20
- #ifndef GUMBO_PARSER_H_
21
- #define GUMBO_PARSER_H_
22
-
23
- #ifdef __cplusplus
24
- extern "C" {
25
- #endif
26
-
27
- struct GumboInternalParserState;
28
- struct GumboInternalOutput;
29
- struct GumboInternalOptions;
30
- struct GumboInternalTokenizerState;
31
-
32
// GumboParser is the context object handed to (nearly) every function in the
// library, OOP-style. Through it a function reaches the options for the
// current parse, the output being produced, and whatever internal state the
// parse needs.
typedef struct GumboInternalParser {
  // Settings for this parse run (read-only).
  const struct GumboInternalOptions* _options;

  // Output for the parse.
  struct GumboInternalOutput* _output;

  // Internal tokenizer state. Kept as a pointer to an incomplete type to
  // avoid a cyclic dependency on html5tokenizer.h. The main parse routine
  // initializes it when a parse starts and destroys it when the parse ends,
  // so end-users will never see a non-garbage value in this pointer.
  struct GumboInternalTokenizerState* _tokenizer_state;

  // Internal parser state. Initialized on parse start and destroyed on parse
  // end; end-users will never see a non-garbage value in this pointer.
  struct GumboInternalParserState* _parser_state;
} GumboParser;
52
-
53
- #ifdef __cplusplus
54
- }
55
- #endif
56
-
57
- #endif // GUMBO_PARSER_H_
@@ -1,110 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_buffer.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "string_piece.h"
25
- #include "util.h"
26
-
27
- struct GumboInternalParser;
28
-
29
// Initial capacity, in bytes, of a newly initialized string buffer.
// The value was picked from a statistical analysis of ~60K websites: 99% of
// text nodes and 98% of attribute names/values fit in this initial size.
static const size_t kDefaultStringBufferSize = 5;
32
-
33
- static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
34
- size_t additional_chars, GumboStringBuffer* buffer) {
35
- size_t new_length = buffer->length + additional_chars;
36
- size_t new_capacity = buffer->capacity;
37
- while (new_capacity < new_length) {
38
- new_capacity *= 2;
39
- }
40
- if (new_capacity != buffer->capacity) {
41
- char* new_data = gumbo_parser_allocate(parser, new_capacity);
42
- memcpy(new_data, buffer->data, buffer->length);
43
- gumbo_parser_deallocate(parser, buffer->data);
44
- buffer->data = new_data;
45
- buffer->capacity = new_capacity;
46
- }
47
- }
48
-
49
- void gumbo_string_buffer_init(
50
- struct GumboInternalParser* parser, GumboStringBuffer* output) {
51
- output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
52
- output->length = 0;
53
- output->capacity = kDefaultStringBufferSize;
54
- }
55
-
56
- void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
57
- size_t min_capacity, GumboStringBuffer* output) {
58
- maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
- }
60
-
61
- void gumbo_string_buffer_append_codepoint(
62
- struct GumboInternalParser* parser, int c, GumboStringBuffer* output) {
63
- // num_bytes is actually the number of continuation bytes, 1 less than the
64
- // total number of bytes. This is done to keep the loop below simple and
65
- // should probably change if we unroll it.
66
- int num_bytes, prefix;
67
- if (c <= 0x7f) {
68
- num_bytes = 0;
69
- prefix = 0;
70
- } else if (c <= 0x7ff) {
71
- num_bytes = 1;
72
- prefix = 0xc0;
73
- } else if (c <= 0xffff) {
74
- num_bytes = 2;
75
- prefix = 0xe0;
76
- } else {
77
- num_bytes = 3;
78
- prefix = 0xf0;
79
- }
80
- maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
- output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
- for (int i = num_bytes - 1; i >= 0; --i) {
83
- output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
- }
85
- }
86
-
87
- void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
88
- GumboStringPiece* str, GumboStringBuffer* output) {
89
- maybe_resize_string_buffer(parser, str->length, output);
90
- memcpy(output->data + output->length, str->data, str->length);
91
- output->length += str->length;
92
- }
93
-
94
- char* gumbo_string_buffer_to_string(
95
- struct GumboInternalParser* parser, GumboStringBuffer* input) {
96
- char* buffer = gumbo_parser_allocate(parser, input->length + 1);
97
- memcpy(buffer, input->data, input->length);
98
- buffer[input->length] = '\0';
99
- return buffer;
100
- }
101
-
102
- void gumbo_string_buffer_clear(
103
- struct GumboInternalParser* parser, GumboStringBuffer* input) {
104
- input->length = 0;
105
- }
106
-
107
- void gumbo_string_buffer_destroy(
108
- struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
109
- gumbo_parser_deallocate(parser, buffer->data);
110
- }
@@ -1,84 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- #ifndef GUMBO_STRING_BUFFER_H_
18
- #define GUMBO_STRING_BUFFER_H_
19
-
20
- #include <stdbool.h>
21
- #include <stddef.h>
22
-
23
- #include "gumbo.h"
24
-
25
- #ifdef __cplusplus
26
- extern "C" {
27
- #endif
28
-
29
- struct GumboInternalParser;
30
-
31
// A struct representing a mutable, growable string. This consists of a
// heap-allocated buffer that may grow (by doubling) as necessary. When
// converting to a string, this allocates a new buffer that is only as long as
// it needs to be. Note that the internal buffer here is *not* nul-terminated,
// so be sure not to use ordinary string manipulation functions on it.
typedef struct {
  // A pointer to the beginning of the buffer. This is not NULL merely because
  // length is zero: gumbo_string_buffer_init allocates the buffer up front,
  // and gumbo_string_buffer_clear resets length without releasing it.
  char* data;

  // The number of bytes of data currently in use. May be zero.
  size_t length;

  // The size of the allocation behind data, in bytes.
  size_t capacity;
} GumboStringBuffer;
46
-
47
- // Initializes a new GumboStringBuffer.
48
- void gumbo_string_buffer_init(
49
- struct GumboInternalParser* parser, GumboStringBuffer* output);
50
-
51
- // Ensures that the buffer contains at least a certain amount of space. Most
52
- // useful with snprintf and the other length-delimited string functions, which
53
- // may want to write directly into the buffer.
54
- void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
55
- size_t min_capacity, GumboStringBuffer* output);
56
-
57
- // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
58
- // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
59
- // value of the codepoint.
60
- void gumbo_string_buffer_append_codepoint(
61
- struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
62
-
63
- // Appends a string onto the end of the GumboStringBuffer.
64
- void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
65
- GumboStringPiece* str, GumboStringBuffer* output);
66
-
67
- // Converts this string buffer to a NUL-terminated char*, allocating a new buffer for it.
68
- char* gumbo_string_buffer_to_string(
69
- struct GumboInternalParser* parser, GumboStringBuffer* input);
70
-
71
- // Reinitialize this string buffer. This clears it by setting length=0. It
72
- // does not zero out the buffer itself.
73
- void gumbo_string_buffer_clear(
74
- struct GumboInternalParser* parser, GumboStringBuffer* input);
75
-
76
- // Deallocates this GumboStringBuffer.
77
- void gumbo_string_buffer_destroy(
78
- struct GumboInternalParser* parser, GumboStringBuffer* buffer);
79
-
80
- #ifdef __cplusplus
81
- }
82
- #endif
83
-
84
- #endif // GUMBO_STRING_BUFFER_H_
@@ -1,48 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_piece.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "util.h"
25
-
26
- struct GumboInternalParser;
27
-
28
- const GumboStringPiece kGumboEmptyString = {NULL, 0};
29
-
30
- bool gumbo_string_equals(
31
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
- return str1->length == str2->length &&
33
- !memcmp(str1->data, str2->data, str1->length);
34
- }
35
-
36
- bool gumbo_string_equals_ignore_case(
37
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
- return str1->length == str2->length &&
39
- !strncasecmp(str1->data, str2->data, str1->length);
40
- }
41
-
42
- void gumbo_string_copy(struct GumboInternalParser* parser,
43
- GumboStringPiece* dest, const GumboStringPiece* source) {
44
- dest->length = source->length;
45
- char* buffer = gumbo_parser_allocate(parser, source->length);
46
- memcpy(buffer, source->data, source->length);
47
- dest->data = buffer;
48
- }
@@ -1,38 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_STRING_PIECE_H_
18
- #define GUMBO_STRING_PIECE_H_
19
-
20
- #include "gumbo.h"
21
-
22
- #ifdef __cplusplus
23
- extern "C" {
24
- #endif
25
-
26
- struct GumboInternalParser;
27
-
28
- // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
- // destination and copying over the characters from source. Dest should be
30
- // empty, with no buffer allocated; otherwise, this leaks it.
31
- void gumbo_string_copy(struct GumboInternalParser* parser,
32
- GumboStringPiece* dest, const GumboStringPiece* source);
33
-
34
- #ifdef __cplusplus
35
- }
36
- #endif
37
-
38
- #endif // GUMBO_STRING_PIECE_H_
@@ -1,95 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "gumbo.h"
18
-
19
- #include <assert.h>
20
- #include <ctype.h>
21
- #include <string.h>
22
-
23
- const char* kGumboTagNames[] = {
24
- #include "tag_strings.h"
25
- "", // TAG_UNKNOWN
26
- "", // TAG_LAST
27
- };
28
-
29
- static const unsigned char kGumboTagSizes[] = {
30
- #include "tag_sizes.h"
31
- 0, // TAG_UNKNOWN
32
- 0, // TAG_LAST
33
- };
34
-
35
- const char* gumbo_normalized_tagname(GumboTag tag) {
36
- assert(tag <= GUMBO_TAG_LAST);
37
- return kGumboTagNames[tag];
38
- }
39
-
40
- void gumbo_tag_from_original_text(GumboStringPiece* text) {
41
- if (text->data == NULL) {
42
- return;
43
- }
44
-
45
- assert(text->length >= 2);
46
- assert(text->data[0] == '<');
47
- assert(text->data[text->length - 1] == '>');
48
- if (text->data[1] == '/') {
49
- // End tag.
50
- assert(text->length >= 3);
51
- text->data += 2; // Move past </
52
- text->length -= 3;
53
- } else {
54
- // Start tag.
55
- text->data += 1; // Move past <
56
- text->length -= 2;
57
- // strnchr is apparently not a standard C library function, so I loop
58
- // explicitly looking for whitespace or other illegal tag characters.
59
- for (const char* c = text->data; c != text->data + text->length; ++c) {
60
- if (isspace(*c) || *c == '/') {
61
- text->length = c - text->data;
62
- break;
63
- }
64
- }
65
- }
66
- }
67
-
68
// Compares the first n bytes of s1 and s2, ignoring case as defined by
// tolower(). Returns 0 when all n bytes match; otherwise returns the
// difference between the first pair of lowered bytes that differ (negative
// when s1 sorts first, positive when s2 does). Unlike strncasecmp, a NUL byte
// does not end the comparison.
static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
  while (n--) {
    // <ctype.h> functions require a value representable as unsigned char;
    // plain char may be negative for non-ASCII bytes, hence the casts.
    unsigned char c1 = (unsigned char) tolower((unsigned char) *s1++);
    unsigned char c2 = (unsigned char) tolower((unsigned char) *s2++);
    if (c1 != c2) {
      return (int) c1 - (int) c2;
    }
  }
  return 0;
}
76
-
77
- #include "tag_gperf.h"
78
- #define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
79
-
80
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
81
- if (length) {
82
- unsigned int key = tag_hash(tagname, length);
83
- if (key < TAG_MAP_SIZE) {
84
- GumboTag tag = kGumboTagMap[key];
85
- if (length == kGumboTagSizes[(int) tag] &&
86
- !case_memcmp(tagname, kGumboTagNames[(int) tag], length))
87
- return tag;
88
- }
89
- }
90
- return GUMBO_TAG_UNKNOWN;
91
- }
92
-
93
- GumboTag gumbo_tag_enum(const char* tagname) {
94
- return gumbo_tagn_enum(tagname, strlen(tagname));
95
- }
@@ -1,150 +0,0 @@
1
- html
2
- head
3
- title
4
- base
5
- link
6
- meta
7
- style
8
- script
9
- noscript
10
- template
11
- body
12
- article
13
- section
14
- nav
15
- aside
16
- h1
17
- h2
18
- h3
19
- h4
20
- h5
21
- h6
22
- hgroup
23
- header
24
- footer
25
- address
26
- p
27
- hr
28
- pre
29
- blockquote
30
- ol
31
- ul
32
- li
33
- dl
34
- dt
35
- dd
36
- figure
37
- figcaption
38
- main
39
- div
40
- a
41
- em
42
- strong
43
- small
44
- s
45
- cite
46
- q
47
- dfn
48
- abbr
49
- data
50
- time
51
- code
52
- var
53
- samp
54
- kbd
55
- sub
56
- sup
57
- i
58
- b
59
- u
60
- mark
61
- ruby
62
- rt
63
- rp
64
- bdi
65
- bdo
66
- span
67
- br
68
- wbr
69
- ins
70
- del
71
- image
72
- img
73
- iframe
74
- embed
75
- object
76
- param
77
- video
78
- audio
79
- source
80
- track
81
- canvas
82
- map
83
- area
84
- math
85
- mi
86
- mo
87
- mn
88
- ms
89
- mtext
90
- mglyph
91
- malignmark
92
- annotation-xml
93
- svg
94
- foreignobject
95
- desc
96
- table
97
- caption
98
- colgroup
99
- col
100
- tbody
101
- thead
102
- tfoot
103
- tr
104
- td
105
- th
106
- form
107
- fieldset
108
- legend
109
- label
110
- input
111
- button
112
- select
113
- datalist
114
- optgroup
115
- option
116
- textarea
117
- keygen
118
- output
119
- progress
120
- meter
121
- details
122
- summary
123
- menu
124
- menuitem
125
- applet
126
- acronym
127
- bgsound
128
- dir
129
- frame
130
- frameset
131
- noframes
132
- isindex
133
- listing
134
- xmp
135
- nextid
136
- noembed
137
- plaintext
138
- rb
139
- strike
140
- basefont
141
- big
142
- blink
143
- center
144
- font
145
- marquee
146
- multicol
147
- nobr
148
- spacer
149
- tt
150
- rtc