nokogumbo 0.4 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +0 -3
- data/Rakefile +10 -9
- metadata +3 -29
- data/work/attribute.c +0 -44
- data/work/attribute.h +0 -37
- data/work/char_ref.c +0 -2561
- data/work/char_ref.h +0 -61
- data/work/error.c +0 -258
- data/work/error.h +0 -225
- data/work/gumbo.h +0 -800
- data/work/insertion_mode.h +0 -54
- data/work/nokogumbo.c +0 -254
- data/work/parser.c +0 -3893
- data/work/parser.h +0 -57
- data/work/string_buffer.c +0 -106
- data/work/string_buffer.h +0 -82
- data/work/string_piece.c +0 -49
- data/work/string_piece.h +0 -39
- data/work/tag.c +0 -222
- data/work/token_type.h +0 -40
- data/work/tokenizer.c +0 -2978
- data/work/tokenizer.h +0 -123
- data/work/tokenizer_states.h +0 -103
- data/work/utf8.c +0 -268
- data/work/utf8.h +0 -127
- data/work/util.c +0 -58
- data/work/util.h +0 -57
- data/work/vector.c +0 -121
- data/work/vector.h +0 -66
data/work/utf8.h
DELETED
@@ -1,127 +0,0 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// This contains an implementation of a UTF8 iterator and decoder suitable for
|
18
|
-
// an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
|
19
|
-
// HTML5 spec specifies that:
|
20
|
-
// 1. Decoding errors are parse errors.
|
21
|
-
// 2. Certain other codepoints (eg. control characters) are parse errors.
|
22
|
-
// 3. Carriage returns and CR/LF groups are converted to line feeds.
|
23
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
|
24
|
-
//
|
25
|
-
// Also, we want to keep track of source positions for error handling. As a
|
26
|
-
// result, we fold all that functionality into this decoder, and can't use an
|
27
|
-
// off-the-shelf library.
|
28
|
-
//
|
29
|
-
// This header is internal-only, which is why we prefix functions with only
|
30
|
-
// utf8_ or utf8_iterator_ instead of gumbo_utf8_.
|
31
|
-
|
32
|
-
#ifndef GUMBO_UTF8_H_
|
33
|
-
#define GUMBO_UTF8_H_
|
34
|
-
|
35
|
-
#include <stdbool.h>
|
36
|
-
#include <stddef.h>
|
37
|
-
|
38
|
-
#include "gumbo.h"
|
39
|
-
|
40
|
-
#ifdef __cplusplus
|
41
|
-
extern "C" {
|
42
|
-
#endif
|
43
|
-
|
44
|
-
struct _GumboError;
|
45
|
-
struct _GumboParser;
|
46
|
-
|
47
|
-
// Unicode replacement char.
|
48
|
-
extern const int kUtf8ReplacementChar;
|
49
|
-
|
50
|
-
typedef struct _Utf8Iterator {
|
51
|
-
// Points at the start of the code point most recently read into 'current'.
|
52
|
-
const char* _start;
|
53
|
-
|
54
|
-
// Points at the mark. The mark is initially set to the beginning of the
|
55
|
-
// input.
|
56
|
-
const char* _mark;
|
57
|
-
|
58
|
-
// Points past the end of the iter, like a past-the-end iterator in the STL.
|
59
|
-
const char* _end;
|
60
|
-
|
61
|
-
// The code point under the cursor.
|
62
|
-
int _current;
|
63
|
-
|
64
|
-
// The width in bytes of the current code point.
|
65
|
-
int _width;
|
66
|
-
|
67
|
-
// The SourcePosition for the current location.
|
68
|
-
GumboSourcePosition _pos;
|
69
|
-
|
70
|
-
// The SourcePosition for the mark.
|
71
|
-
GumboSourcePosition _mark_pos;
|
72
|
-
|
73
|
-
// Pointer back to the GumboParser instance, for configuration options and
|
74
|
-
// error recording.
|
75
|
-
struct _GumboParser* _parser;
|
76
|
-
} Utf8Iterator;
|
77
|
-
|
78
|
-
// Returns true if this Unicode code point is in the list of characters
|
79
|
-
// forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
|
80
|
-
bool utf8_is_invalid_code_point(int c);
|
81
|
-
|
82
|
-
// Initializes a new Utf8Iterator from the given byte buffer. The source does
|
83
|
-
// not have to be NUL-terminated, but the length must be passed in explicitly.
|
84
|
-
void utf8iterator_init(
|
85
|
-
struct _GumboParser* parser, const char* source, size_t source_length,
|
86
|
-
Utf8Iterator* iter);
|
87
|
-
|
88
|
-
// Advances the current position by one code point.
|
89
|
-
void utf8iterator_next(Utf8Iterator* iter);
|
90
|
-
|
91
|
-
// Returns the current code point as an integer.
|
92
|
-
int utf8iterator_current(const Utf8Iterator* iter);
|
93
|
-
|
94
|
-
// Retrieves and fills the output parameter with the current source position.
|
95
|
-
void utf8iterator_get_position(
|
96
|
-
const Utf8Iterator* iter, GumboSourcePosition* output);
|
97
|
-
|
98
|
-
// Retrieves a character pointer to the start of the current character.
|
99
|
-
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
|
100
|
-
|
101
|
-
// If the upcoming text in the buffer matches the specified prefix (which has
|
102
|
-
// length 'length'), consume it and return true. Otherwise, return false with
|
103
|
-
// no other effects. If the length of the string would overflow the buffer,
|
104
|
-
// this returns false. Note that prefix should not contain null bytes because
|
105
|
-
// of the use of strncmp/strncasecmp internally. All existing use-cases adhere
|
106
|
-
// to this.
|
107
|
-
bool utf8iterator_maybe_consume_match(
|
108
|
-
Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive);
|
109
|
-
|
110
|
-
// "Marks" a particular location of interest in the input stream, so that it can
|
111
|
-
// later be reset() to. There's also the ability to record an error at the
|
112
|
-
// point that was marked, as oftentimes that's more useful than the last
|
113
|
-
// character before the error was detected.
|
114
|
-
void utf8iterator_mark(Utf8Iterator* iter);
|
115
|
-
|
116
|
-
// Returns the current input stream position to the mark.
|
117
|
-
void utf8iterator_reset(Utf8Iterator* iter);
|
118
|
-
|
119
|
-
// Sets the position and original text fields of an error to the value at the
|
120
|
-
// mark.
|
121
|
-
void utf8iterator_fill_error_at_mark(
|
122
|
-
Utf8Iterator* iter, struct _GumboError* error);
|
123
|
-
|
124
|
-
#ifdef __cplusplus
|
125
|
-
}
|
126
|
-
#endif
|
127
|
-
#endif // GUMBO_UTF8_H_
|
data/work/util.c
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
|
17
|
-
#include "util.h"
|
18
|
-
|
19
|
-
#include <assert.h>
|
20
|
-
#include <stdlib.h>
|
21
|
-
#include <string.h>
|
22
|
-
#include <strings.h>
|
23
|
-
#include <stdarg.h>
|
24
|
-
#include <stdio.h>
|
25
|
-
|
26
|
-
#include "gumbo.h"
|
27
|
-
#include "parser.h"
|
28
|
-
|
29
|
-
// TODO(jdtang): This should be elsewhere, but there's no .c file for
|
30
|
-
// SourcePositions and yet the constant needs some linkage, so this is as good
|
31
|
-
// as any.
|
32
|
-
// Source position constant whose three members are all initialized to zero.
const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 };
|
33
|
-
|
34
|
-
// Requests num_bytes from the allocator callback stored in the parser's
// options, passing the options' userdata pointer as the callback's first
// argument. Returns whatever the callback returns.
void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
  void* block =
      parser->_options->allocator(parser->_options->userdata, num_bytes);
  return block;
}
|
37
|
-
|
38
|
-
// Releases ptr through the deallocator callback stored in the parser's
// options, passing the options' userdata pointer as the callback's first
// argument.
void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {
  // Plain call instead of `return f(...)`: ISO C does not allow a return
  // statement with an expression in a function whose return type is void.
  parser->_options->deallocator(parser->_options->userdata, ptr);
}
|
41
|
-
|
42
|
-
// Copies the NUL-terminated string str into a buffer obtained from
// gumbo_parser_allocate and returns that buffer. The caller owns the result.
// Returns NULL if the allocation fails. str must not be NULL.
char* gumbo_copy_stringz(GumboParser* parser, const char* str) {
  // Measure once; the + 1 accounts for the terminating NUL.
  const size_t num_bytes = strlen(str) + 1;
  char* buffer = gumbo_parser_allocate(parser, num_bytes);
  if (buffer == NULL) {
    // Do not write through a failed allocation.
    return NULL;
  }
  memcpy(buffer, str, num_bytes);
  return buffer;
}
|
47
|
-
|
48
|
-
// Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG
|
49
|
-
// to use.
|
50
|
-
// printf-style tracing hook. When GUMBO_DEBUG is defined, the formatted
// message is written to stdout and stdout is flushed; otherwise the function
// body is empty.
void gumbo_debug(const char* format, ...) {
#ifdef GUMBO_DEBUG
  va_list ap;
  va_start(ap, format);
  vprintf(format, ap);
  va_end(ap);
  // Flush so the trace output is not left sitting in the stdio buffer.
  fflush(stdout);
#endif
}
|
data/work/util.h
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// This contains some utility functions that didn't fit into any of the other
|
18
|
-
// headers.
|
19
|
-
|
20
|
-
#ifndef GUMBO_UTIL_H_
|
21
|
-
#define GUMBO_UTIL_H_
|
22
|
-
|
23
|
-
#include <stdbool.h>
|
24
|
-
#include <stddef.h>
|
25
|
-
|
26
|
-
#ifdef __cplusplus
|
27
|
-
extern "C" {
|
28
|
-
#endif
|
29
|
-
|
30
|
-
// Forward declaration since it's passed into some of the functions in this
|
31
|
-
// header.
|
32
|
-
struct _GumboParser;
|
33
|
-
|
34
|
-
// Utility function for allocating & copying a null-terminated string into a
|
35
|
-
// freshly-allocated buffer. This is necessary for proper memory management; we
|
36
|
-
// have the convention that all const char* in parse tree structures are
|
37
|
-
// freshly-allocated, so if we didn't copy, we'd try to delete a literal string
|
38
|
-
// when the parse tree is destroyed.
|
39
|
-
char* gumbo_copy_stringz(struct _GumboParser* parser, const char* str);
|
40
|
-
|
41
|
-
// Allocate a chunk of memory, using the allocator specified in the Parser's
|
42
|
-
// config options.
|
43
|
-
void* gumbo_parser_allocate(struct _GumboParser* parser, size_t num_bytes);
|
44
|
-
|
45
|
-
// Deallocate a chunk of memory, using the deallocator specified in the Parser's
|
46
|
-
// config options.
|
47
|
-
void gumbo_parser_deallocate(struct _GumboParser* parser, void* ptr);
|
48
|
-
|
49
|
-
// Debug wrapper for printf, to make it easier to turn off debugging info when
|
50
|
-
// required.
|
51
|
-
void gumbo_debug(const char* format, ...);
|
52
|
-
|
53
|
-
#ifdef __cplusplus
|
54
|
-
}
|
55
|
-
#endif
|
56
|
-
|
57
|
-
#endif // GUMBO_UTIL_H_
|
data/work/vector.c
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
|
17
|
-
#include "vector.h"
|
18
|
-
|
19
|
-
#include <assert.h>
|
20
|
-
#include <stdlib.h>
|
21
|
-
#include <string.h>
|
22
|
-
#include <strings.h>
|
23
|
-
|
24
|
-
#include "util.h"
|
25
|
-
|
26
|
-
struct _GumboParser;
|
27
|
-
|
28
|
-
// Empty vector constant: the first member is NULL and the two members that
// follow are zero.
const GumboVector kGumboEmptyVector = { NULL, 0, 0 };
|
29
|
-
|
30
|
-
// Sets up vector as empty with room for initial_capacity pointers. Storage is
// requested from gumbo_parser_allocate only when initial_capacity is nonzero;
// otherwise data is set to NULL.
// NOTE(review): the allocation result is stored without being checked.
void gumbo_vector_init(
    struct _GumboParser* parser, size_t initial_capacity, GumboVector* vector) {
  vector->length = 0;
  vector->capacity = initial_capacity;
  vector->data = (initial_capacity > 0)
      ? gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity)
      : NULL;
}
|
41
|
-
|
42
|
-
// Hands the vector's backing array to gumbo_parser_deallocate. The pointers
// stored in the array are not touched. A vector whose capacity is not
// positive has nothing released.
void gumbo_vector_destroy(struct _GumboParser* parser, GumboVector* vector) {
  if (vector->capacity <= 0) {
    return;
  }
  gumbo_parser_deallocate(parser, vector->data);
}
|
47
|
-
|
48
|
-
// Ensures there is room for at least one more element. A full vector has its
// capacity doubled and its contents copied into a new array; a zero-capacity
// vector gets an initial array of two slots.
// NOTE(review): results of gumbo_parser_allocate are used without a NULL
// check.
static void enlarge_vector_if_full(
    struct _GumboParser* parser, GumboVector* vector) {
  if (vector->length < vector->capacity) {
    // Still room; nothing to do.
    return;
  }

  if (!vector->capacity) {
    // 0-capacity vector; there is no previous array to copy or release.
    vector->capacity = 2;
    vector->data = gumbo_parser_allocate(
        parser, sizeof(void*) * vector->capacity);
    return;
  }

  size_t old_num_bytes = sizeof(void*) * vector->capacity;
  vector->capacity *= 2;
  size_t new_num_bytes = sizeof(void*) * vector->capacity;

  // Move the existing elements into the larger array, then release the old
  // one.
  void** grown = gumbo_parser_allocate(parser, new_num_bytes);
  memcpy(grown, vector->data, old_num_bytes);
  gumbo_parser_deallocate(parser, vector->data);
  vector->data = grown;
}
|
67
|
-
|
68
|
-
// Appends element to the end of vector, growing the backing array first if
// it is full.
void gumbo_vector_add(
    struct _GumboParser* parser, void* element, GumboVector* vector) {
  enlarge_vector_if_full(parser, vector);
  assert(vector->data);
  assert(vector->length < vector->capacity);
  vector->data[vector->length] = element;
  vector->length++;
}
|
75
|
-
|
76
|
-
// Removes the last element of vector and returns it, or returns NULL when the
// vector is empty. The capacity is left unchanged; parser is not used.
void* gumbo_vector_pop(struct _GumboParser* parser, GumboVector* vector) {
  if (vector->length == 0) {
    return NULL;
  }
  vector->length -= 1;
  return vector->data[vector->length];
}
|
82
|
-
|
83
|
-
// Returns the index of the first slot whose stored pointer equals element, or
// -1 if no slot matches. Comparison is by pointer identity.
int gumbo_vector_index_of(GumboVector* vector, void* element) {
  int i = 0;
  while (i < vector->length) {
    if (vector->data[i] == element) {
      return i;
    }
    ++i;
  }
  return -1;
}
|
91
|
-
|
92
|
-
// Inserts element at position index, shifting the elements from index onward
// up by one slot. index must satisfy 0 <= index <= length (asserted).
void gumbo_vector_insert_at(
    struct _GumboParser* parser, void* element, int index, GumboVector* vector) {
  assert(index >= 0);
  assert(index <= vector->length);
  enlarge_vector_if_full(parser, vector);

  // Take the slot address only after the possible reallocation above.
  void** slot = &vector->data[index];
  // Everything from index up to the old end moves up by one; memmove is used
  // because source and destination overlap.
  memmove(slot + 1, slot, sizeof(void*) * (vector->length - index));
  *slot = element;
  ++vector->length;
}
|
102
|
-
|
103
|
-
// Removes the first occurrence of node from vector. Does nothing when node is
// not present.
void gumbo_vector_remove(
    struct _GumboParser* parser, void* node, GumboVector* vector) {
  int index = gumbo_vector_index_of(vector, node);
  if (index != -1) {
    gumbo_vector_remove_at(parser, index, vector);
  }
}
|
111
|
-
|
112
|
-
// Removes the element at position index and returns it, shifting the later
// elements down by one slot. index must satisfy 0 <= index < length
// (asserted). parser is not used.
void* gumbo_vector_remove_at(
    struct _GumboParser* parser, int index, GumboVector* vector) {
  assert(index >= 0);
  assert(index < vector->length);

  void** slot = &vector->data[index];
  void* removed = *slot;
  --vector->length;
  // Close the gap: the elements that followed the removed one slide down.
  memmove(slot, slot + 1, sizeof(void*) * (vector->length - index));
  return removed;
}
|
data/work/vector.h
DELETED
@@ -1,66 +0,0 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
|
17
|
-
#ifndef GUMBO_VECTOR_H_
|
18
|
-
#define GUMBO_VECTOR_H_
|
19
|
-
|
20
|
-
#include "gumbo.h"
|
21
|
-
|
22
|
-
#ifdef __cplusplus
|
23
|
-
extern "C" {
|
24
|
-
#endif
|
25
|
-
|
26
|
-
// Forward declaration since it's passed into some of the functions in this
|
27
|
-
// header.
|
28
|
-
struct _GumboParser;
|
29
|
-
|
30
|
-
// Initializes a new GumboVector with the specified initial capacity.
|
31
|
-
void gumbo_vector_init(
|
32
|
-
struct _GumboParser* parser, size_t initial_capacity, GumboVector* vector);
|
33
|
-
|
34
|
-
// Frees the memory used by a GumboVector. Does not free the contained
|
35
|
-
// pointers.
|
36
|
-
void gumbo_vector_destroy(struct _GumboParser* parser, GumboVector* vector);
|
37
|
-
|
38
|
-
// Adds a new element to a GumboVector.
|
39
|
-
void gumbo_vector_add(
|
40
|
-
struct _GumboParser* parser, void* element, GumboVector* vector);
|
41
|
-
|
42
|
-
// Removes and returns the element most recently added to the GumboVector.
|
43
|
-
// Ownership is transferred to caller. Capacity is unchanged. If the vector is
|
44
|
-
// empty, NULL is returned.
|
45
|
-
void* gumbo_vector_pop(struct _GumboParser* parser, GumboVector* vector);
|
46
|
-
|
47
|
-
// Inserts an element at a specific index. This is potentially O(N) time, but
|
48
|
-
// is necessary for some of the spec's behavior.
|
49
|
-
void gumbo_vector_insert_at(
|
50
|
-
struct _GumboParser* parser, void* element, int index, GumboVector* vector);
|
51
|
-
|
52
|
-
// Removes an element from the vector, or does nothing if the element is not in
|
53
|
-
// the vector.
|
54
|
-
void gumbo_vector_remove(
|
55
|
-
struct _GumboParser* parser, void* element, GumboVector* vector);
|
56
|
-
|
57
|
-
// Removes and returns an element at a specific index. Note that this is
|
58
|
-
// potentially O(N) time and should be used sparingly.
|
59
|
-
void* gumbo_vector_remove_at(
|
60
|
-
struct _GumboParser* parser, int index, GumboVector* vector);
|
61
|
-
|
62
|
-
#ifdef __cplusplus
|
63
|
-
}
|
64
|
-
#endif
|
65
|
-
|
66
|
-
#endif // GUMBO_VECTOR_H_
|