ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,127 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains an implementation of a UTF8 iterator and decoder suitable for
18
+ // an HTML5 parser. This does a bit more than straight UTF-8 decoding. The
19
+ // HTML5 spec specifies that:
20
+ // 1. Decoding errors are parse errors.
21
+ // 2. Certain other codepoints (e.g. control characters) are parse errors.
22
+ // 3. Carriage returns and CR/LF groups are converted to line feeds.
23
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling
24
+ //
25
+ // Also, we want to keep track of source positions for error handling. As a
26
+ // result, we fold all that functionality into this decoder, and can't use an
27
+ // off-the-shelf library.
28
+ //
29
+ // This header is internal-only, which is why we prefix functions with only
30
+ // utf8_ or utf8_iterator_ instead of gumbo_utf8_.
31
+
32
+ #ifndef GUMBO_UTF8_H_
33
+ #define GUMBO_UTF8_H_
34
+
35
+ #include <stdbool.h>
36
+ #include <stddef.h>
37
+
38
+ #include "gumbo.h"
39
+
40
+ #ifdef __cplusplus
41
+ extern "C" {
42
+ #endif
43
+
44
+ struct GumboInternalError;
45
+ struct GumboInternalParser;
46
+
47
+ // Unicode replacement char.
48
+ extern const int kUtf8ReplacementChar;
49
+
50
+ typedef struct GumboInternalUtf8Iterator {
51
+ // Points at the start of the code point most recently read into 'current'.
52
+ const char* _start;
53
+
54
+ // Points at the mark. The mark is initially set to the beginning of the
55
+ // input.
56
+ const char* _mark;
57
+
58
+ // Points past the end of the iter, like a past-the-end iterator in the STL.
59
+ const char* _end;
60
+
61
+ // The code point under the cursor.
62
+ int _current;
63
+
64
+ // The width in bytes of the current code point.
65
+ int _width;
66
+
67
+ // The SourcePosition for the current location.
68
+ GumboSourcePosition _pos;
69
+
70
+ // The SourcePosition for the mark.
71
+ GumboSourcePosition _mark_pos;
72
+
73
+ // Pointer back to the GumboParser instance, for configuration options and
74
+ // error recording.
75
+ struct GumboInternalParser* _parser;
76
+ } Utf8Iterator;
77
+
78
+ // Returns true if this Unicode code point is in the list of characters
79
+ // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
80
+ bool utf8_is_invalid_code_point(int c);
81
+
82
+ // Initializes a new Utf8Iterator from the given byte buffer. The source does
83
+ // not have to be NUL-terminated, but the length must be passed in explicitly.
84
+ void utf8iterator_init(
85
+ struct GumboInternalParser* parser, const char* source,
86
+ size_t source_length, Utf8Iterator* iter);
87
+
88
+ // Advances the current position by one code point.
89
+ void utf8iterator_next(Utf8Iterator* iter);
90
+
91
+ // Returns the current code point as an integer.
92
+ int utf8iterator_current(const Utf8Iterator* iter);
93
+
94
+ // Retrieves and fills the output parameter with the current source position.
95
+ void utf8iterator_get_position(
96
+ const Utf8Iterator* iter, GumboSourcePosition* output);
97
+
98
+ // Retrieves a character pointer to the start of the current character.
99
+ const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
100
+
101
+ // If the upcoming text in the buffer matches the specified prefix (which has
102
+ // length 'length'), consume it and return true. Otherwise, return false with
103
+ // no other effects. If the length of the string would overflow the buffer,
104
+ // this returns false. Note that prefix should not contain null bytes because
105
+ // of the use of strncmp/strncasecmp internally. All existing use-cases adhere
106
+ // to this.
107
+ bool utf8iterator_maybe_consume_match(
108
+ Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive);
109
+
110
+ // "Marks" a particular location of interest in the input stream, so that it can
111
+ // later be reset() to. There's also the ability to record an error at the
112
+ // point that was marked, as oftentimes that's more useful than the last
113
+ // character before the error was detected.
114
+ void utf8iterator_mark(Utf8Iterator* iter);
115
+
116
+ // Returns the current input stream position to the mark.
117
+ void utf8iterator_reset(Utf8Iterator* iter);
118
+
119
+ // Sets the position and original text fields of an error to the value at the
120
+ // mark.
121
+ void utf8iterator_fill_error_at_mark(
122
+ Utf8Iterator* iter, struct GumboInternalError* error);
123
+
124
+ #ifdef __cplusplus
125
+ }
126
+ #endif
127
+ #endif // GUMBO_UTF8_H_
@@ -0,0 +1,58 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "util.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+ #include <stdarg.h>
24
+ #include <stdio.h>
25
+
26
+ #include "gumbo.h"
27
+ #include "parser.h"
28
+
29
+ // TODO(jdtang): This should be elsewhere, but there's no .c file for
30
+ // SourcePositions and yet the constant needs some linkage, so this is as good
31
+ // as any.
32
+ const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 };
33
+
34
+ void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
35
+ return parser->_options->allocator(parser->_options->userdata, num_bytes);
36
+ }
37
+
38
+ void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {
39
+ parser->_options->deallocator(parser->_options->userdata, ptr);
40
+ }
41
+
42
+ char* gumbo_copy_stringz(GumboParser* parser, const char* str) {
43
+ char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1);
44
+ strcpy(buffer, str);
45
+ return buffer;
46
+ }
47
+
48
+ // Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG
49
+ // to use.
50
+ void gumbo_debug(const char* format, ...) {
51
+ #ifdef GUMBO_DEBUG
52
+ va_list args;
53
+ va_start(args, format);
54
+ vprintf(format, args);
55
+ va_end(args);
56
+ fflush(stdout);
57
+ #endif
58
+ }
@@ -0,0 +1,62 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // This contains some utility functions that didn't fit into any of the other
18
+ // headers.
19
+
20
+ #ifndef GUMBO_UTIL_H_
21
+ #define GUMBO_UTIL_H_
22
+ #ifdef _MSC_VER
23
+ #define _CRT_SECURE_NO_WARNINGS
24
+ #endif
25
+ #include <stdbool.h>
26
+ #include <stddef.h>
27
+
28
+
29
+
30
+ #ifdef __cplusplus
31
+ extern "C" {
32
+ #endif
33
+
34
+ // Forward declaration since it's passed into some of the functions in this
35
+ // header.
36
+ struct GumboInternalParser;
37
+
38
+ // Utility function for allocating & copying a null-terminated string into a
39
+ // freshly-allocated buffer. This is necessary for proper memory management; we
40
+ // have the convention that all const char* in parse tree structures are
41
+ // freshly-allocated, so if we didn't copy, we'd try to delete a literal string
42
+ // when the parse tree is destroyed.
43
+ char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str);
44
+
45
+ // Allocate a chunk of memory, using the allocator specified in the Parser's
46
+ // config options.
47
+ void* gumbo_parser_allocate(
48
+ struct GumboInternalParser* parser, size_t num_bytes);
49
+
50
+ // Deallocate a chunk of memory, using the deallocator specified in the Parser's
51
+ // config options.
52
+ void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr);
53
+
54
+ // Debug wrapper for printf, to make it easier to turn off debugging info when
55
+ // required.
56
+ void gumbo_debug(const char* format, ...);
57
+
58
+ #ifdef __cplusplus
59
+ }
60
+ #endif
61
+
62
+ #endif // GUMBO_UTIL_H_
@@ -0,0 +1,123 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "vector.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "util.h"
25
+
26
+ struct GumboInternalParser;
27
+
28
+ const GumboVector kGumboEmptyVector = { NULL, 0, 0 };
29
+
30
+ void gumbo_vector_init(
31
+ struct GumboInternalParser* parser, size_t initial_capacity, GumboVector* vector) {
32
+ vector->length = 0;
33
+ vector->capacity = initial_capacity;
34
+ if (initial_capacity > 0) {
35
+ vector->data = gumbo_parser_allocate(
36
+ parser, sizeof(void*) * initial_capacity);
37
+ } else {
38
+ vector->data = NULL;
39
+ }
40
+ }
41
+
42
+ void gumbo_vector_destroy(struct GumboInternalParser* parser, GumboVector* vector) {
43
+ if (vector->capacity > 0) {
44
+ gumbo_parser_deallocate(parser, vector->data);
45
+ }
46
+ }
47
+
48
+ static void enlarge_vector_if_full(
49
+ struct GumboInternalParser* parser, GumboVector* vector) {
50
+ if (vector->length >= vector->capacity) {
51
+ if (vector->capacity) {
52
+ size_t old_num_bytes = sizeof(void*) * vector->capacity;
53
+ vector->capacity *= 2;
54
+ size_t num_bytes = sizeof(void*) * vector->capacity;
55
+ void** temp = gumbo_parser_allocate(parser, num_bytes);
56
+ memcpy(temp, vector->data, old_num_bytes);
57
+ gumbo_parser_deallocate(parser, vector->data);
58
+ vector->data = temp;
59
+ } else {
60
+ // 0-capacity vector; no previous array to deallocate.
61
+ vector->capacity = 2;
62
+ vector->data = gumbo_parser_allocate(
63
+ parser, sizeof(void*) * vector->capacity);
64
+ }
65
+ }
66
+ }
67
+
68
+ void gumbo_vector_add(
69
+ struct GumboInternalParser* parser, void* element, GumboVector* vector) {
70
+ enlarge_vector_if_full(parser, vector);
71
+ assert(vector->data);
72
+ assert(vector->length < vector->capacity);
73
+ vector->data[vector->length++] = element;
74
+ }
75
+
76
+ void* gumbo_vector_pop(
77
+ struct GumboInternalParser* parser, GumboVector* vector) {
78
+ if (vector->length == 0) {
79
+ return NULL;
80
+ }
81
+ return vector->data[--vector->length];
82
+ }
83
+
84
+ int gumbo_vector_index_of(GumboVector* vector, void* element) {
85
+ for (int i = 0; i < vector->length; ++i) {
86
+ if (vector->data[i] == element) {
87
+ return i;
88
+ }
89
+ }
90
+ return -1;
91
+ }
92
+
93
+ void gumbo_vector_insert_at(
94
+ struct GumboInternalParser* parser, void* element, int index,
95
+ GumboVector* vector) {
96
+ assert(index >= 0);
97
+ assert(index <= vector->length);
98
+ enlarge_vector_if_full(parser, vector);
99
+ ++vector->length;
100
+ memmove(&vector->data[index + 1], &vector->data[index],
101
+ sizeof(void*) * (vector->length - index - 1));
102
+ vector->data[index] = element;
103
+ }
104
+
105
+ void gumbo_vector_remove(
106
+ struct GumboInternalParser* parser, void* node, GumboVector* vector) {
107
+ int index = gumbo_vector_index_of(vector, node);
108
+ if (index == -1) {
109
+ return;
110
+ }
111
+ gumbo_vector_remove_at(parser, index, vector);
112
+ }
113
+
114
+ void* gumbo_vector_remove_at(
115
+ struct GumboInternalParser* parser, int index, GumboVector* vector) {
116
+ assert(index >= 0);
117
+ assert(index < vector->length);
118
+ void* result = vector->data[index];
119
+ memmove(&vector->data[index], &vector->data[index + 1],
120
+ sizeof(void*) * (vector->length - index - 1));
121
+ --vector->length;
122
+ return result;
123
+ }
@@ -0,0 +1,69 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_VECTOR_H_
18
+ #define GUMBO_VECTOR_H_
19
+
20
+ #include "gumbo.h"
21
+
22
+ #ifdef __cplusplus
23
+ extern "C" {
24
+ #endif
25
+
26
+ // Forward declaration since it's passed into some of the functions in this
27
+ // header.
28
+ struct GumboInternalParser;
29
+
30
+ // Initializes a new GumboVector with the specified initial capacity.
31
+ void gumbo_vector_init(
32
+ struct GumboInternalParser* parser, size_t initial_capacity,
33
+ GumboVector* vector);
34
+
35
+ // Frees the memory used by a GumboVector. Does not free the contained
36
+ // pointers.
37
+ void gumbo_vector_destroy(
38
+ struct GumboInternalParser* parser, GumboVector* vector);
39
+
40
+ // Adds a new element to a GumboVector.
41
+ void gumbo_vector_add(
42
+ struct GumboInternalParser* parser, void* element, GumboVector* vector);
43
+
44
+ // Removes and returns the element most recently added to the GumboVector.
45
+ // Ownership is transferred to caller. Capacity is unchanged. If the vector is
46
+ // empty, NULL is returned.
47
+ void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
48
+
49
+ // Inserts an element at a specific index. This is potentially O(N) time, but
50
+ // is necessary for some of the spec's behavior.
51
+ void gumbo_vector_insert_at(
52
+ struct GumboInternalParser* parser, void* element, int index,
53
+ GumboVector* vector);
54
+
55
+ // Removes an element from the vector, or does nothing if the element is not in
56
+ // the vector.
57
+ void gumbo_vector_remove(
58
+ struct GumboInternalParser* parser, void* element, GumboVector* vector);
59
+
60
+ // Removes and returns an element at a specific index. Note that this is
61
+ // potentially O(N) time and should be used sparingly.
62
+ void* gumbo_vector_remove_at(
63
+ struct GumboInternalParser* parser, int index, GumboVector* vector);
64
+
65
+ #ifdef __cplusplus
66
+ }
67
+ #endif
68
+
69
+ #endif // GUMBO_VECTOR_H_
metadata CHANGED
@@ -1,29 +1,59 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-gumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nicolas Martyanoff
8
+ - Ian MacLeod
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-09-17 00:00:00.000000000 Z
12
+ date: 2014-06-23 00:00:00.000000000 Z
12
13
  dependencies: []
13
14
  description:
14
- email: khaelin@gmail.com
15
+ email:
16
+ - khaelin@gmail.com
17
+ - ian@nevir.net
15
18
  executables: []
16
19
  extensions:
17
20
  - ext/extconf.rb
18
21
  extra_rdoc_files: []
19
22
  files:
20
- - Rakefile
21
23
  - LICENSE
22
24
  - README.mkd
23
- - lib/gumbo/extra.rb
25
+ - Rakefile
24
26
  - ext/extconf.rb
25
- - ext/extconf.h
26
- - ext/gumbo.c
27
+ - ext/ruby_gumbo_ext.c
28
+ - lib/gumbo.rb
29
+ - lib/gumbo/element.rb
30
+ - lib/gumbo/node.rb
31
+ - lib/gumbo/text.rb
32
+ - vendor/gumbo-parser/src/attribute.c
33
+ - vendor/gumbo-parser/src/attribute.h
34
+ - vendor/gumbo-parser/src/char_ref.c
35
+ - vendor/gumbo-parser/src/char_ref.h
36
+ - vendor/gumbo-parser/src/error.c
37
+ - vendor/gumbo-parser/src/error.h
38
+ - vendor/gumbo-parser/src/gumbo.h
39
+ - vendor/gumbo-parser/src/insertion_mode.h
40
+ - vendor/gumbo-parser/src/parser.c
41
+ - vendor/gumbo-parser/src/parser.h
42
+ - vendor/gumbo-parser/src/string_buffer.c
43
+ - vendor/gumbo-parser/src/string_buffer.h
44
+ - vendor/gumbo-parser/src/string_piece.c
45
+ - vendor/gumbo-parser/src/string_piece.h
46
+ - vendor/gumbo-parser/src/tag.c
47
+ - vendor/gumbo-parser/src/token_type.h
48
+ - vendor/gumbo-parser/src/tokenizer.c
49
+ - vendor/gumbo-parser/src/tokenizer.h
50
+ - vendor/gumbo-parser/src/tokenizer_states.h
51
+ - vendor/gumbo-parser/src/utf8.c
52
+ - vendor/gumbo-parser/src/utf8.h
53
+ - vendor/gumbo-parser/src/util.c
54
+ - vendor/gumbo-parser/src/util.h
55
+ - vendor/gumbo-parser/src/vector.c
56
+ - vendor/gumbo-parser/src/vector.h
27
57
  homepage:
28
58
  licenses:
29
59
  - ISC
@@ -34,17 +64,17 @@ require_paths:
34
64
  - lib
35
65
  required_ruby_version: !ruby/object:Gem::Requirement
36
66
  requirements:
37
- - - '>='
67
+ - - ">="
38
68
  - !ruby/object:Gem::Version
39
69
  version: 1.9.3
40
70
  required_rubygems_version: !ruby/object:Gem::Requirement
41
71
  requirements:
42
- - - '>='
72
+ - - ">="
43
73
  - !ruby/object:Gem::Version
44
74
  version: '0'
45
75
  requirements: []
46
76
  rubyforge_project:
47
- rubygems_version: 2.0.3
77
+ rubygems_version: 2.2.2
48
78
  signing_key:
49
79
  specification_version: 4
50
80
  summary: Ruby bindings for the gumbo html5 parser