nokogumbo 0.5 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/work/Makefile +213 -0
- data/work/attribute.c +44 -0
- data/work/attribute.h +37 -0
- data/work/attribute.o +0 -0
- data/work/char_ref.c +2561 -0
- data/work/char_ref.h +61 -0
- data/work/char_ref.o +0 -0
- data/work/error.c +258 -0
- data/work/error.h +225 -0
- data/work/error.o +0 -0
- data/work/gumbo.h +800 -0
- data/work/insertion_mode.h +54 -0
- data/work/mkmf.log +41 -0
- data/work/nokogumbo.c +97 -0
- data/work/nokogumbo.o +0 -0
- data/work/nokogumboc.so +0 -0
- data/work/parser.c +3893 -0
- data/work/parser.h +57 -0
- data/work/parser.o +0 -0
- data/work/string_buffer.c +106 -0
- data/work/string_buffer.h +82 -0
- data/work/string_buffer.o +0 -0
- data/work/string_piece.c +49 -0
- data/work/string_piece.h +39 -0
- data/work/string_piece.o +0 -0
- data/work/tag.c +222 -0
- data/work/tag.o +0 -0
- data/work/token_type.h +40 -0
- data/work/tokenizer.c +2978 -0
- data/work/tokenizer.h +123 -0
- data/work/tokenizer.o +0 -0
- data/work/tokenizer_states.h +103 -0
- data/work/utf8.c +268 -0
- data/work/utf8.h +127 -0
- data/work/utf8.o +0 -0
- data/work/util.c +58 -0
- data/work/util.h +57 -0
- data/work/util.o +0 -0
- data/work/vector.c +121 -0
- data/work/vector.h +66 -0
- data/work/vector.o +0 -0
- metadata +42 -2
- data/Rakefile +0 -68
data/work/parser.h
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
//
|
17
|
+
// Contains the definition of the top-level GumboParser structure that's
|
18
|
+
// threaded through basically every internal function in the library.
|
19
|
+
|
20
|
+
#ifndef GUMBO_PARSER_H_
|
21
|
+
#define GUMBO_PARSER_H_
|
22
|
+
|
23
|
+
#ifdef __cplusplus
|
24
|
+
extern "C" {
|
25
|
+
#endif
|
26
|
+
|
27
|
+
struct _GumboParserState;
|
28
|
+
struct _GumboOutput;
|
29
|
+
struct _GumboOptions;
|
30
|
+
struct _GumboTokenizerState;
|
31
|
+
|
32
|
+
// The top-level parse context. A pointer to this struct is handed to (nearly)
// every function in the library, OOP-style, so that each one can reach the
// parse options, the output, and whatever internal state the parse needs.
typedef struct _GumboParser {
  // Options in effect for this parse run; not modifiable through this pointer.
  const struct _GumboOptions* _options;

  // Where the output of the parse is stored.
  struct _GumboOutput* _output;

  // Internal tokenizer state. Only a forward-declared struct type is used
  // here, so this header does not need the tokenizer's own definitions. Per
  // the original author's note, the main parse routine initializes this when
  // a parse starts and destroys it when the parse ends, so end-users should
  // never expect a meaningful value in this pointer.
  struct _GumboTokenizerState* _tokenizer_state;

  // Internal parser state. Same lifetime note as above: set up at parse start,
  // destroyed at parse end, never meaningful to end-users.
  struct _GumboParserState* _parser_state;
} GumboParser;
|
52
|
+
|
53
|
+
#ifdef __cplusplus
|
54
|
+
}
|
55
|
+
#endif
|
56
|
+
|
57
|
+
#endif // GUMBO_PARSER_H_
|
data/work/parser.o
ADDED
Binary file
|
@@ -0,0 +1,106 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#include "string_buffer.h"
|
18
|
+
|
19
|
+
#include <assert.h>
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <string.h>
|
22
|
+
#include <strings.h>
|
23
|
+
|
24
|
+
#include "string_piece.h"
|
25
|
+
#include "util.h"
|
26
|
+
|
27
|
+
struct _GumboParser;
|
28
|
+
|
29
|
+
// Capacity, in bytes, that gumbo_string_buffer_init gives a fresh buffer.
static const size_t kDefaultStringBufferSize = 10;
|
30
|
+
|
31
|
+
static void maybe_resize_string_buffer(
|
32
|
+
struct _GumboParser* parser, size_t additional_chars,
|
33
|
+
GumboStringBuffer* buffer) {
|
34
|
+
size_t new_length = buffer->length + additional_chars;
|
35
|
+
size_t new_capacity = buffer->capacity;
|
36
|
+
while (new_capacity < new_length) {
|
37
|
+
new_capacity *= 2;
|
38
|
+
}
|
39
|
+
if (new_capacity != buffer->capacity) {
|
40
|
+
char* new_data = gumbo_parser_allocate(parser, new_capacity);
|
41
|
+
memcpy(new_data, buffer->data, buffer->length);
|
42
|
+
gumbo_parser_deallocate(parser, buffer->data);
|
43
|
+
buffer->data = new_data;
|
44
|
+
buffer->capacity = new_capacity;
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
// Puts |output| into its initial state: zero length, and a data block of
// kDefaultStringBufferSize bytes obtained from gumbo_parser_allocate.
void gumbo_string_buffer_init(
    struct _GumboParser* parser, GumboStringBuffer* output) {
  output->capacity = kDefaultStringBufferSize;
  output->length = 0;
  output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
}
|
54
|
+
|
55
|
+
void gumbo_string_buffer_reserve(
|
56
|
+
struct _GumboParser* parser, size_t min_capacity,
|
57
|
+
GumboStringBuffer* output) {
|
58
|
+
maybe_resize_string_buffer(parser, min_capacity - output->length, output);
|
59
|
+
}
|
60
|
+
|
61
|
+
// Appends the codepoint |c| to |output| as a UTF-8 byte sequence of one to
// four bytes, growing the buffer first if necessary. No range validation is
// performed on |c| here.
void gumbo_string_buffer_append_codepoint(
    struct _GumboParser* parser, int c, GumboStringBuffer* output) {
  // trailing_bytes counts only the continuation bytes, i.e. one less than the
  // full length of the encoded sequence; lead_marker holds the high bits that
  // tag the first byte for that sequence length.
  int trailing_bytes;
  int lead_marker;
  if (c > 0xffff) {
    trailing_bytes = 3;
    lead_marker = 0xf0;
  } else if (c > 0x7ff) {
    trailing_bytes = 2;
    lead_marker = 0xe0;
  } else if (c > 0x7f) {
    trailing_bytes = 1;
    lead_marker = 0xc0;
  } else {
    trailing_bytes = 0;
    lead_marker = 0;
  }

  maybe_resize_string_buffer(parser, trailing_bytes + 1, output);

  // The first byte carries the marker plus the topmost payload bits; each
  // following byte carries the next six bits down, tagged with 0x80.
  int shift = trailing_bytes * 6;
  output->data[output->length++] = lead_marker | (c >> shift);
  while (shift > 0) {
    shift -= 6;
    output->data[output->length++] = 0x80 | (0x3f & (c >> shift));
  }
}
|
86
|
+
|
87
|
+
void gumbo_string_buffer_append_string(
|
88
|
+
struct _GumboParser* parser, GumboStringPiece* str,
|
89
|
+
GumboStringBuffer* output) {
|
90
|
+
maybe_resize_string_buffer(parser, str->length, output);
|
91
|
+
memcpy(output->data + output->length, str->data, str->length);
|
92
|
+
output->length += str->length;
|
93
|
+
}
|
94
|
+
|
95
|
+
// Returns a NUL-terminated copy of the buffer's contents in a block of exactly
// length + 1 bytes obtained from gumbo_parser_allocate. |input| is not
// modified.
char* gumbo_string_buffer_to_string(
    struct _GumboParser* parser, GumboStringBuffer* input) {
  const size_t length = input->length;
  char* copy = gumbo_parser_allocate(parser, length + 1);
  memcpy(copy, input->data, length);
  copy[length] = '\0';
  return copy;
}
|
102
|
+
|
103
|
+
// Hands the buffer's data block to gumbo_parser_deallocate. The
// GumboStringBuffer struct itself is neither released nor reset.
void gumbo_string_buffer_destroy(
    struct _GumboParser* parser, GumboStringBuffer* buffer) {
  char* storage = buffer->data;
  gumbo_parser_deallocate(parser, storage);
}
|
@@ -0,0 +1,82 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
//
|
17
|
+
#ifndef GUMBO_STRING_BUFFER_H_
|
18
|
+
#define GUMBO_STRING_BUFFER_H_
|
19
|
+
|
20
|
+
#include <stdbool.h>
|
21
|
+
#include <stddef.h>
|
22
|
+
|
23
|
+
#ifdef __cplusplus
|
24
|
+
extern "C" {
|
25
|
+
#endif
|
26
|
+
|
27
|
+
// Forward declaration since it's passed into some of the functions in this
|
28
|
+
// header.
|
29
|
+
struct _GumboParser;
|
30
|
+
struct _GumboStringPiece;
|
31
|
+
|
32
|
+
// A mutable, growable string backed by a heap-allocated block that is enlarged
// (by doubling) whenever more room is needed. Converting to a plain string
// allocates a separate block that is only as long as required. The internal
// block is *not* NUL-terminated, so ordinary C string functions must not be
// used on it.
typedef struct _GumboStringBuffer {
  // Start of the character data. An initialized buffer owns an allocated block
  // even while length is zero, so a zero length does not imply NULL here.
  char* data;

  // Number of bytes currently stored. May be zero.
  size_t length;

  // Size, in bytes, of the block that data points to.
  size_t capacity;
} GumboStringBuffer;
|
47
|
+
|
48
|
+
// Initializes a new GumboStringBuffer.
|
49
|
+
void gumbo_string_buffer_init(
|
50
|
+
struct _GumboParser* parser, GumboStringBuffer* output);
|
51
|
+
|
52
|
+
// Ensures that the buffer contains at least a certain amount of space. Most
|
53
|
+
// useful with snprintf and the other length-delimited string functions, which
|
54
|
+
// may want to write directly into the buffer.
|
55
|
+
void gumbo_string_buffer_reserve(
|
56
|
+
struct _GumboParser* parser, size_t min_capacity,
|
57
|
+
GumboStringBuffer* output);
|
58
|
+
|
59
|
+
// Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
|
60
|
+
// This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
|
61
|
+
// value of the codepoint.
|
62
|
+
void gumbo_string_buffer_append_codepoint(
|
63
|
+
struct _GumboParser* parser, int c, GumboStringBuffer* output);
|
64
|
+
|
65
|
+
// Appends a string onto the end of the GumboStringBuffer.
|
66
|
+
void gumbo_string_buffer_append_string(
|
67
|
+
struct _GumboParser* parser, struct _GumboStringPiece* str,
|
68
|
+
GumboStringBuffer* output);
|
69
|
+
|
70
|
+
// Converts this string buffer to const char*, alloctaing a new buffer for it.
|
71
|
+
char* gumbo_string_buffer_to_string(
|
72
|
+
struct _GumboParser* parser, GumboStringBuffer* input);
|
73
|
+
|
74
|
+
// Deallocates this GumboStringBuffer.
|
75
|
+
void gumbo_string_buffer_destroy(
|
76
|
+
struct _GumboParser* parser, GumboStringBuffer* buffer);
|
77
|
+
|
78
|
+
#ifdef __cplusplus
|
79
|
+
}
|
80
|
+
#endif
|
81
|
+
|
82
|
+
#endif // GUMBO_STRING_BUFFER_H_
|
Binary file
|
data/work/string_piece.c
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#include "string_piece.h"
|
18
|
+
|
19
|
+
#include <assert.h>
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <string.h>
|
22
|
+
#include <strings.h>
|
23
|
+
|
24
|
+
#include "util.h"
|
25
|
+
|
26
|
+
struct _GumboParser;
|
27
|
+
|
28
|
+
// A shared constant representing the empty string piece, initialized with
// NULL and 0.
const GumboStringPiece kGumboEmptyString = { NULL, 0 };
|
29
|
+
|
30
|
+
bool gumbo_string_equals(
|
31
|
+
const GumboStringPiece* str1, const GumboStringPiece* str2) {
|
32
|
+
return str1->length == str2->length &&
|
33
|
+
!memcmp(str1->data, str2->data, str1->length);
|
34
|
+
}
|
35
|
+
|
36
|
+
bool gumbo_string_equals_ignore_case(
|
37
|
+
const GumboStringPiece* str1, const GumboStringPiece* str2) {
|
38
|
+
return str1->length == str2->length &&
|
39
|
+
!strncasecmp(str1->data, str2->data, str1->length);
|
40
|
+
}
|
41
|
+
|
42
|
+
// Deep-copies |source| into |dest|: a block of source->length bytes is
// obtained from gumbo_parser_allocate, the source bytes are copied into it,
// and dest is pointed at the copy. Whatever dest->data held before is simply
// overwritten, not released.
void gumbo_string_copy(
    struct _GumboParser* parser, GumboStringPiece* dest,
    const GumboStringPiece* source) {
  dest->length = source->length;
  char* buffer = gumbo_parser_allocate(parser, source->length);
  // An empty source (kGumboEmptyString, for one) may have a NULL data pointer,
  // and memcpy must not be given NULL even for a zero count.
  if (source->length > 0) {
    memcpy(buffer, source->data, source->length);
  }
  dest->data = buffer;
}
|
data/work/string_piece.h
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#ifndef GUMBO_STRING_PIECE_H_
|
18
|
+
#define GUMBO_STRING_PIECE_H_
|
19
|
+
|
20
|
+
#include "gumbo.h"
|
21
|
+
|
22
|
+
#ifdef __cplusplus
|
23
|
+
extern "C" {
|
24
|
+
#endif
|
25
|
+
|
26
|
+
struct _GumboParser;
|
27
|
+
|
28
|
+
// Performs a deep-copy of an GumboStringPiece, allocating a fresh buffer in the
|
29
|
+
// destination and copying over the characters from source. Dest should be
|
30
|
+
// empty, with no buffer allocated; otherwise, this leaks it.
|
31
|
+
void gumbo_string_copy(
|
32
|
+
struct _GumboParser* parser, GumboStringPiece* dest,
|
33
|
+
const GumboStringPiece* source);
|
34
|
+
|
35
|
+
#ifdef __cplusplus
|
36
|
+
}
|
37
|
+
#endif
|
38
|
+
|
39
|
+
#endif // GUMBO_STRING_PIECE_H_
|
data/work/string_piece.o
ADDED
Binary file
|
data/work/tag.c
ADDED
@@ -0,0 +1,222 @@
|
|
1
|
+
// Copyright 2011 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#include "gumbo.h"
|
18
|
+
|
19
|
+
#include <assert.h>
|
20
|
+
#include <ctype.h>
|
21
|
+
#include <strings.h> // For strcasecmp.
|
22
|
+
|
23
|
+
// Normalized tag names, indexed by GumboTag value.
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header; the
// position of every entry matters.
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
// most common tag names first, or to putting them in alphabetical order and
// using a binary search.
const char* kGumboTagNames[] = {
  "html", "head", "title", "base", "link", "meta", "style",
  "script", "noscript", "body", "section", "nav", "article", "aside",
  "h1", "h2", "h3", "h4", "h5", "h6", "hgroup", "header", "footer", "address",
  "p", "hr", "pre", "blockquote", "ol", "ul", "li", "dl", "dt", "dd",
  "figure", "figcaption", "div",
  "a", "em", "strong", "small", "s", "cite", "q", "dfn", "abbr", "time",
  "code", "var", "samp", "kbd", "sub", "sup", "i", "b", "mark",
  "ruby", "rt", "rp", "bdi", "bdo", "span", "br", "wbr",
  "ins", "del",
  "image", "img", "iframe", "embed", "object", "param",
  "video", "audio", "source", "track", "canvas", "map", "area",
  "math", "mi", "mo", "mn", "ms", "mtext", "mglyph", "malignmark",
  "annotation-xml",
  "svg", "foreignobject", "desc",
  "table", "caption", "colgroup", "col", "tbody", "thead", "tfoot",
  "tr", "td", "th",
  "form", "fieldset", "legend", "label", "input", "button", "select",
  "datalist", "optgroup", "option", "textarea", "keygen", "output",
  "progress", "meter",
  "details", "summary", "command", "menu",
  "applet", "acronym", "bgsound", "dir", "frame", "frameset", "noframes",
  "isindex", "listing", "xmp", "nextid", "noembed", "plaintext", "rb",
  "strike", "basefont", "big", "blink", "center", "font", "marquee",
  "multicol", "nobr", "spacer", "tt", "u",
  "",  // TAG_UNKNOWN
  "",  // TAG_LAST
};
|
177
|
+
|
178
|
+
// Returns the kGumboTagNames entry for |tag|. The returned pointer refers to
// the table's own string and must not be freed. Asserts that |tag| does not
// exceed GUMBO_TAG_LAST (checked only when assertions are enabled).
const char* gumbo_normalized_tagname(GumboTag tag) {
  assert(tag <= GUMBO_TAG_LAST);
  const char* name = kGumboTagNames[tag];
  return name;
}
|
182
|
+
|
183
|
+
// TODO(jdtang): Add test for this.
|
184
|
+
void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
185
|
+
if (text->data == NULL) {
|
186
|
+
return;
|
187
|
+
}
|
188
|
+
|
189
|
+
assert(text->length >= 2);
|
190
|
+
assert(text->data[0] == '<');
|
191
|
+
assert(text->data[text->length - 1] == '>');
|
192
|
+
if (text->data[1] == '/') {
|
193
|
+
// End tag.
|
194
|
+
assert(text->length >= 3);
|
195
|
+
text->data += 2; // Move past </
|
196
|
+
text->length -= 3;
|
197
|
+
} else {
|
198
|
+
// Start tag.
|
199
|
+
text->data += 1; // Move past <
|
200
|
+
text->length -= 2;
|
201
|
+
// strnchr is apparently not a standard C library function, so I loop
|
202
|
+
// explicitly looking for whitespace or other illegal tag characters.
|
203
|
+
for (const char* c = text->data; c != text->data + text->length; ++c) {
|
204
|
+
if (isspace(*c) || *c == '/') {
|
205
|
+
text->length = c - text->data;
|
206
|
+
break;
|
207
|
+
}
|
208
|
+
}
|
209
|
+
}
|
210
|
+
}
|
211
|
+
|
212
|
+
GumboTag gumbo_tag_enum(const char* tagname) {
|
213
|
+
for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
|
214
|
+
// TODO(jdtang): strcasecmp is non-portable, so if we want to support
|
215
|
+
// non-GCC compilers, we'll need some #ifdef magic. This source already has
|
216
|
+
// pretty significant issues with MSVC6 anyway.
|
217
|
+
if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
|
218
|
+
return i;
|
219
|
+
}
|
220
|
+
}
|
221
|
+
return GUMBO_TAG_UNKNOWN;
|
222
|
+
}
|
data/work/tag.o
ADDED
Binary file
|
data/work/token_type.h
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
// Copyright 2011 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#ifndef GUMBO_TOKEN_TYPE_H_
|
18
|
+
#define GUMBO_TOKEN_TYPE_H_
|
19
|
+
|
20
|
+
#ifdef __cplusplus
|
21
|
+
extern "C" {
|
22
|
+
#endif
|
23
|
+
|
24
|
+
// The kinds of token, one enumerator per kind. The numeric values are spelled
// out here; they are the same ones the compiler would assign implicitly.
typedef enum _GumboTokenType {
  GUMBO_TOKEN_DOCTYPE = 0,
  GUMBO_TOKEN_START_TAG = 1,
  GUMBO_TOKEN_END_TAG = 2,
  GUMBO_TOKEN_COMMENT = 3,
  GUMBO_TOKEN_WHITESPACE = 4,
  GUMBO_TOKEN_CHARACTER = 5,
  GUMBO_TOKEN_NULL = 6,
  GUMBO_TOKEN_EOF = 7
} GumboTokenType;
|
35
|
+
|
36
|
+
#ifdef __cplusplus
|
37
|
+
} // extern C
|
38
|
+
#endif
|
39
|
+
|
40
|
+
#endif // GUMBO_TOKEN_TYPE_H_
|