nokogumbo 1.5.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +121 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2127 -1561
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +11 -173
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +43 -24
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
data/gumbo-parser/src/util.c
CHANGED
@@ -1,58 +1,68 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
1
|
+
/*
|
2
|
+
Copyright 2017-2018 Craig Barnes.
|
3
|
+
Copyright 2010 Google Inc.
|
16
4
|
|
17
|
-
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
you may not use this file except in compliance with the License.
|
7
|
+
You may obtain a copy of the License at
|
8
|
+
|
9
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
18
10
|
|
19
|
-
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
*/
|
17
|
+
|
18
|
+
#include <stdio.h>
|
20
19
|
#include <stdlib.h>
|
21
20
|
#include <string.h>
|
22
|
-
#include
|
23
|
-
#include <stdarg.h>
|
24
|
-
#include <stdio.h>
|
25
|
-
|
21
|
+
#include "util.h"
|
26
22
|
#include "gumbo.h"
|
27
|
-
#include "parser.h"
|
28
23
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
24
|
+
void* gumbo_alloc(size_t size) {
|
25
|
+
void* ptr = malloc(size);
|
26
|
+
if (unlikely(ptr == NULL)) {
|
27
|
+
perror(__func__);
|
28
|
+
abort();
|
29
|
+
}
|
30
|
+
return ptr;
|
31
|
+
}
|
33
32
|
|
34
|
-
void*
|
35
|
-
|
33
|
+
void* gumbo_realloc(void* ptr, size_t size) {
|
34
|
+
ptr = realloc(ptr, size);
|
35
|
+
if (unlikely(ptr == NULL)) {
|
36
|
+
perror(__func__);
|
37
|
+
abort();
|
38
|
+
}
|
39
|
+
return ptr;
|
36
40
|
}
|
37
41
|
|
38
|
-
void
|
39
|
-
|
42
|
+
void gumbo_free(void* ptr) {
|
43
|
+
free(ptr);
|
40
44
|
}
|
41
45
|
|
42
|
-
char*
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
+
char* gumbo_strdup(const char* str) {
|
47
|
+
const size_t size = strlen(str) + 1;
|
48
|
+
// The strdup(3) function isn't available in strict "-std=c99" mode
|
49
|
+
// (it's part of POSIX, not C99), so use malloc(3) and memcpy(3)
|
50
|
+
// instead:
|
51
|
+
char* buffer = gumbo_alloc(size);
|
52
|
+
return memcpy(buffer, str, size);
|
46
53
|
}
|
47
54
|
|
48
|
-
// Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG
|
49
|
-
// to use.
|
50
|
-
void gumbo_debug(const char* format, ...) {
|
51
55
|
#ifdef GUMBO_DEBUG
|
56
|
+
#include <stdarg.h>
|
57
|
+
// Debug function to trace operation of the parser
|
58
|
+
// (define GUMBO_DEBUG to use).
|
59
|
+
void gumbo_debug(const char* format, ...) {
|
52
60
|
va_list args;
|
53
61
|
va_start(args, format);
|
54
62
|
vprintf(format, args);
|
55
63
|
va_end(args);
|
56
64
|
fflush(stdout);
|
57
|
-
#endif
|
58
65
|
}
|
66
|
+
#else
|
67
|
+
void gumbo_debug(const char* UNUSED_ARG(format), ...) {}
|
68
|
+
#endif
|
data/gumbo-parser/src/util.h
CHANGED
@@ -1,60 +1,30 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// This contains some utility functions that didn't fit into any of the other
|
18
|
-
// headers.
|
19
|
-
|
20
1
|
#ifndef GUMBO_UTIL_H_
|
21
2
|
#define GUMBO_UTIL_H_
|
22
|
-
|
23
|
-
#define _CRT_SECURE_NO_WARNINGS
|
24
|
-
#endif
|
3
|
+
|
25
4
|
#include <stdbool.h>
|
26
5
|
#include <stddef.h>
|
6
|
+
#include "macros.h"
|
27
7
|
|
28
8
|
#ifdef __cplusplus
|
29
9
|
extern "C" {
|
30
10
|
#endif
|
31
11
|
|
32
|
-
// Forward declaration since it's passed into some of the functions in this
|
33
|
-
// header.
|
34
|
-
struct GumboInternalParser;
|
35
|
-
|
36
12
|
// Utility function for allocating & copying a null-terminated string into a
|
37
|
-
// freshly-allocated buffer.
|
13
|
+
// freshly-allocated buffer. This is necessary for proper memory management; we
|
38
14
|
// have the convention that all const char* in parse tree structures are
|
39
15
|
// freshly-allocated, so if we didn't copy, we'd try to delete a literal string
|
40
16
|
// when the parse tree is destroyed.
|
41
|
-
char*
|
42
|
-
|
43
|
-
// Allocate a chunk of memory, using the allocator specified in the Parser's
|
44
|
-
// config options.
|
45
|
-
void* gumbo_parser_allocate(
|
46
|
-
struct GumboInternalParser* parser, size_t num_bytes);
|
17
|
+
char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;
|
47
18
|
|
48
|
-
|
49
|
-
|
50
|
-
void
|
19
|
+
void* gumbo_alloc(size_t size) XMALLOC;
|
20
|
+
void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
|
21
|
+
void gumbo_free(void* ptr);
|
51
22
|
|
52
|
-
// Debug wrapper for printf
|
53
|
-
|
54
|
-
void gumbo_debug(const char* format, ...);
|
23
|
+
// Debug wrapper for printf
|
24
|
+
void gumbo_debug(const char* format, ...) PRINTF(1);
|
55
25
|
|
56
26
|
#ifdef __cplusplus
|
57
27
|
}
|
58
28
|
#endif
|
59
29
|
|
60
|
-
#endif
|
30
|
+
#endif // GUMBO_UTIL_H_
|
data/gumbo-parser/src/vector.c
CHANGED
@@ -1,81 +1,64 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
1
|
+
/*
|
2
|
+
Copyright 2018 Craig Barnes.
|
3
|
+
Copyright 2010 Google Inc.
|
16
4
|
|
17
|
-
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
you may not use this file except in compliance with the License.
|
7
|
+
You may obtain a copy of the License at
|
8
|
+
|
9
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
*/
|
18
17
|
|
19
18
|
#include <assert.h>
|
20
19
|
#include <stdlib.h>
|
21
20
|
#include <string.h>
|
22
|
-
#include
|
23
|
-
|
21
|
+
#include "vector.h"
|
24
22
|
#include "util.h"
|
25
23
|
|
26
|
-
|
27
|
-
|
28
|
-
const GumboVector kGumboEmptyVector = {NULL, 0, 0};
|
29
|
-
|
30
|
-
void gumbo_vector_init(struct GumboInternalParser* parser,
|
31
|
-
size_t initial_capacity, GumboVector* vector) {
|
24
|
+
void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector) {
|
32
25
|
vector->length = 0;
|
33
26
|
vector->capacity = initial_capacity;
|
34
27
|
if (initial_capacity > 0) {
|
35
|
-
vector->data =
|
36
|
-
gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
|
28
|
+
vector->data = gumbo_alloc(sizeof(void*) * initial_capacity);
|
37
29
|
} else {
|
38
30
|
vector->data = NULL;
|
39
31
|
}
|
40
32
|
}
|
41
33
|
|
42
|
-
void gumbo_vector_destroy(
|
43
|
-
struct GumboInternalParser* parser, GumboVector* vector) {
|
34
|
+
void gumbo_vector_destroy(GumboVector* vector) {
|
44
35
|
if (vector->capacity > 0) {
|
45
|
-
|
36
|
+
gumbo_free(vector->data);
|
46
37
|
}
|
47
38
|
}
|
48
39
|
|
49
|
-
static void enlarge_vector_if_full(
|
50
|
-
struct GumboInternalParser* parser, GumboVector* vector) {
|
40
|
+
static void enlarge_vector_if_full(GumboVector* vector) {
|
51
41
|
if (vector->length >= vector->capacity) {
|
52
42
|
if (vector->capacity) {
|
53
|
-
size_t old_num_bytes = sizeof(void*) * vector->capacity;
|
54
43
|
vector->capacity *= 2;
|
55
44
|
size_t num_bytes = sizeof(void*) * vector->capacity;
|
56
|
-
|
57
|
-
memcpy(temp, vector->data, old_num_bytes);
|
58
|
-
gumbo_parser_deallocate(parser, vector->data);
|
59
|
-
vector->data = temp;
|
45
|
+
vector->data = gumbo_realloc(vector->data, num_bytes);
|
60
46
|
} else {
|
61
47
|
// 0-capacity vector; no previous array to deallocate.
|
62
48
|
vector->capacity = 2;
|
63
|
-
vector->data =
|
64
|
-
gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
|
49
|
+
vector->data = gumbo_alloc(sizeof(void*) * vector->capacity);
|
65
50
|
}
|
66
51
|
}
|
67
52
|
}
|
68
53
|
|
69
|
-
void gumbo_vector_add(
|
70
|
-
|
71
|
-
enlarge_vector_if_full(parser, vector);
|
54
|
+
void gumbo_vector_add(void* element, GumboVector* vector) {
|
55
|
+
enlarge_vector_if_full(vector);
|
72
56
|
assert(vector->data);
|
73
57
|
assert(vector->length < vector->capacity);
|
74
58
|
vector->data[vector->length++] = element;
|
75
59
|
}
|
76
60
|
|
77
|
-
void* gumbo_vector_pop(
|
78
|
-
struct GumboInternalParser* parser, GumboVector* vector) {
|
61
|
+
void* gumbo_vector_pop(GumboVector* vector) {
|
79
62
|
if (vector->length == 0) {
|
80
63
|
return NULL;
|
81
64
|
}
|
@@ -91,33 +74,38 @@ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
|
|
91
74
|
return -1;
|
92
75
|
}
|
93
76
|
|
94
|
-
void gumbo_vector_insert_at(
|
95
|
-
|
96
|
-
|
77
|
+
void gumbo_vector_insert_at (
|
78
|
+
void* element,
|
79
|
+
unsigned int index,
|
80
|
+
GumboVector* vector
|
81
|
+
) {
|
97
82
|
assert(index <= vector->length);
|
98
|
-
enlarge_vector_if_full(
|
83
|
+
enlarge_vector_if_full(vector);
|
99
84
|
++vector->length;
|
100
|
-
memmove(
|
101
|
-
|
85
|
+
memmove (
|
86
|
+
&vector->data[index + 1],
|
87
|
+
&vector->data[index],
|
88
|
+
sizeof(void*) * (vector->length - index - 1)
|
89
|
+
);
|
102
90
|
vector->data[index] = element;
|
103
91
|
}
|
104
92
|
|
105
|
-
void gumbo_vector_remove(
|
106
|
-
struct GumboInternalParser* parser, void* node, GumboVector* vector) {
|
93
|
+
void gumbo_vector_remove(void* node, GumboVector* vector) {
|
107
94
|
int index = gumbo_vector_index_of(vector, node);
|
108
95
|
if (index == -1) {
|
109
96
|
return;
|
110
97
|
}
|
111
|
-
gumbo_vector_remove_at(
|
98
|
+
gumbo_vector_remove_at(index, vector);
|
112
99
|
}
|
113
100
|
|
114
|
-
void* gumbo_vector_remove_at(
|
115
|
-
unsigned int index, GumboVector* vector) {
|
116
|
-
assert(index >= 0);
|
101
|
+
void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector) {
|
117
102
|
assert(index < vector->length);
|
118
103
|
void* result = vector->data[index];
|
119
|
-
memmove(
|
120
|
-
|
104
|
+
memmove (
|
105
|
+
&vector->data[index],
|
106
|
+
&vector->data[index + 1],
|
107
|
+
sizeof(void*) * (vector->length - index - 1)
|
108
|
+
);
|
121
109
|
--vector->length;
|
122
110
|
return result;
|
123
111
|
}
|
data/gumbo-parser/src/vector.h
CHANGED
@@ -1,19 +1,3 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
|
17
1
|
#ifndef GUMBO_VECTOR_H_
|
18
2
|
#define GUMBO_VECTOR_H_
|
19
3
|
|
@@ -23,45 +7,39 @@
|
|
23
7
|
extern "C" {
|
24
8
|
#endif
|
25
9
|
|
26
|
-
// Forward declaration since it's passed into some of the functions in this
|
27
|
-
// header.
|
28
|
-
struct GumboInternalParser;
|
29
|
-
|
30
10
|
// Initializes a new GumboVector with the specified initial capacity.
|
31
|
-
void gumbo_vector_init(
|
32
|
-
size_t initial_capacity, GumboVector* vector);
|
11
|
+
void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector);
|
33
12
|
|
34
|
-
// Frees the memory used by
|
13
|
+
// Frees the memory used by a GumboVector. Does not free the contained
|
35
14
|
// pointers.
|
36
|
-
void gumbo_vector_destroy(
|
37
|
-
struct GumboInternalParser* parser, GumboVector* vector);
|
15
|
+
void gumbo_vector_destroy(GumboVector* vector);
|
38
16
|
|
39
|
-
// Adds a new element to
|
40
|
-
void gumbo_vector_add(
|
41
|
-
struct GumboInternalParser* parser, void* element, GumboVector* vector);
|
17
|
+
// Adds a new element to a GumboVector.
|
18
|
+
void gumbo_vector_add(void* element, GumboVector* vector);
|
42
19
|
|
43
20
|
// Removes and returns the element most recently added to the GumboVector.
|
44
|
-
// Ownership is transferred to caller.
|
21
|
+
// Ownership is transferred to caller. Capacity is unchanged. If the vector is
|
45
22
|
// empty, NULL is returned.
|
46
|
-
void* gumbo_vector_pop(
|
23
|
+
void* gumbo_vector_pop(GumboVector* vector);
|
47
24
|
|
48
|
-
// Inserts an element at a specific index.
|
25
|
+
// Inserts an element at a specific index. This is potentially O(N) time, but
|
49
26
|
// is necessary for some of the spec's behavior.
|
50
|
-
void gumbo_vector_insert_at(
|
51
|
-
|
27
|
+
void gumbo_vector_insert_at (
|
28
|
+
void* element,
|
29
|
+
unsigned int index,
|
30
|
+
GumboVector* vector
|
31
|
+
);
|
52
32
|
|
53
33
|
// Removes an element from the vector, or does nothing if the element is not in
|
54
34
|
// the vector.
|
55
|
-
void gumbo_vector_remove(
|
56
|
-
struct GumboInternalParser* parser, void* element, GumboVector* vector);
|
35
|
+
void gumbo_vector_remove(void* element, GumboVector* vector);
|
57
36
|
|
58
|
-
// Removes and returns an element at a specific index.
|
37
|
+
// Removes and returns an element at a specific index. Note that this is
|
59
38
|
// potentially O(N) time and should be used sparingly.
|
60
|
-
void* gumbo_vector_remove_at(
|
61
|
-
unsigned int index, GumboVector* vector);
|
39
|
+
void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector);
|
62
40
|
|
63
41
|
#ifdef __cplusplus
|
64
42
|
}
|
65
43
|
#endif
|
66
44
|
|
67
|
-
#endif
|
45
|
+
#endif // GUMBO_VECTOR_H_
|
data/lib/nokogumbo.rb
CHANGED
@@ -1,179 +1,17 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
-
require '
|
2
|
+
require 'nokogumbo/version'
|
3
|
+
require 'nokogumbo/html5'
|
3
4
|
|
4
|
-
|
5
|
-
# Parse an HTML document. +string+ contains the document. +string+
|
6
|
-
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
7
|
-
def self.HTML5(*args)
|
8
|
-
Nokogiri::HTML5.parse(*args)
|
9
|
-
end
|
5
|
+
require 'nokogumbo/nokogumbo'
|
10
6
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def self.parse(string, options={})
|
15
|
-
if string.respond_to? :read
|
16
|
-
string = string.read
|
17
|
-
end
|
7
|
+
module Nokogumbo
|
8
|
+
# The default maximum number of attributes per element.
|
9
|
+
DEFAULT_MAX_ATTRIBUTES = 400
|
18
10
|
|
19
|
-
|
20
|
-
|
21
|
-
string = reencode(string)
|
22
|
-
end
|
11
|
+
# The default maximum number of errors for parsing a document or a fragment.
|
12
|
+
DEFAULT_MAX_ERRORS = 0
|
23
13
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# Fetch and parse a HTML document from the web, following redirects,
|
28
|
-
# handling https, and determining the character encoding using HTML5
|
29
|
-
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
|
30
|
-
# http headers and special options. Everything which is not a
|
31
|
-
# special option is considered a header. Special options include:
|
32
|
-
# * :follow_limit => number of redirects which are followed
|
33
|
-
# * :basic_auth => [username, password]
|
34
|
-
def self.get(uri, options={})
|
35
|
-
headers = options.clone
|
36
|
-
headers = {:follow_limit => headers} if Numeric === headers # deprecated
|
37
|
-
limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
38
|
-
|
39
|
-
require 'net/http'
|
40
|
-
uri = URI(uri) unless URI === uri
|
41
|
-
|
42
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
43
|
-
|
44
|
-
# TLS / SSL support
|
45
|
-
http.use_ssl = true if uri.scheme == 'https'
|
46
|
-
|
47
|
-
# Pass through Net::HTTP override values, which currently include:
|
48
|
-
# :ca_file, :ca_path, :cert, :cert_store, :ciphers,
|
49
|
-
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
50
|
-
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
51
|
-
# :verify_callback, :verify_depth, :verify_mode
|
52
|
-
options.each do |key, value|
|
53
|
-
http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
|
54
|
-
end
|
55
|
-
|
56
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
57
|
-
|
58
|
-
# basic authentication
|
59
|
-
auth = headers.delete(:basic_auth)
|
60
|
-
auth ||= [uri.user, uri.password] if uri.user and uri.password
|
61
|
-
request.basic_auth auth.first, auth.last if auth
|
62
|
-
|
63
|
-
# remaining options are treated as headers
|
64
|
-
headers.each {|key, value| request[key.to_s] = value.to_s}
|
65
|
-
|
66
|
-
response = http.request(request)
|
67
|
-
|
68
|
-
case response
|
69
|
-
when Net::HTTPSuccess
|
70
|
-
doc = parse(reencode(response.body, response['content-type']), options)
|
71
|
-
doc.instance_variable_set('@response', response)
|
72
|
-
doc.class.send(:attr_reader, :response)
|
73
|
-
doc
|
74
|
-
when Net::HTTPRedirection
|
75
|
-
response.value if limit <= 1
|
76
|
-
location = URI.join(uri, response['location'])
|
77
|
-
get(location, options.merge(:follow_limit => limit-1))
|
78
|
-
else
|
79
|
-
response.value
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
# while fragment is on the Gumbo TODO list, simulate it by doing
|
84
|
-
# a full document parse and ignoring the parent <html>, <head>, and <body>
|
85
|
-
# tags, and collecting up the children of each.
|
86
|
-
def self.fragment(*args)
|
87
|
-
doc = parse(*args)
|
88
|
-
fragment = Nokogiri::HTML::DocumentFragment.new(doc)
|
89
|
-
|
90
|
-
if doc.children.length != 1 or doc.children.first.name != 'html'
|
91
|
-
# no HTML? Return document as is
|
92
|
-
fragment = doc
|
93
|
-
else
|
94
|
-
# examine children of HTML element
|
95
|
-
children = doc.children.first.children
|
96
|
-
|
97
|
-
# head is always first. If present, take children but otherwise
|
98
|
-
# ignore the head element
|
99
|
-
if children.length > 0 and doc.children.first.name = 'head'
|
100
|
-
fragment << children.shift.children
|
101
|
-
end
|
102
|
-
|
103
|
-
# body may be next, or last. If found, take children but otherwise
|
104
|
-
# ignore the body element. Also take any remaining elements, taking
|
105
|
-
# care to preserve order.
|
106
|
-
if children.length > 0 and doc.children.first.name = 'body'
|
107
|
-
fragment << children.shift.children
|
108
|
-
fragment << children
|
109
|
-
elsif children.length > 0 and doc.children.last.name = 'body'
|
110
|
-
body = children.pop
|
111
|
-
fragment << children
|
112
|
-
fragment << body.children
|
113
|
-
else
|
114
|
-
fragment << children
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
# return result
|
119
|
-
fragment
|
120
|
-
end
|
121
|
-
|
122
|
-
private
|
123
|
-
|
124
|
-
# Charset sniffing is a complex and controversial topic that understandably
|
125
|
-
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
|
126
|
-
# it is a very real problem for consumers of HTML as the default for HTML
|
127
|
-
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
|
128
|
-
# *only* supports utf-8.
|
129
|
-
#
|
130
|
-
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
|
131
|
-
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
|
132
|
-
# while attempting to more closely follow the HTML5 standard.
|
133
|
-
#
|
134
|
-
# http://bugs.ruby-lang.org/issues/2567
|
135
|
-
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
136
|
-
#
|
137
|
-
def self.reencode(body, content_type=nil)
|
138
|
-
return body unless body.respond_to? :encoding
|
139
|
-
|
140
|
-
if body.encoding == Encoding::ASCII_8BIT
|
141
|
-
encoding = nil
|
142
|
-
|
143
|
-
# look for a Byte Order Mark (BOM)
|
144
|
-
if body[0..1] == "\xFE\xFF"
|
145
|
-
encoding = 'utf-16be'
|
146
|
-
elsif body[0..1] == "\xFF\xFE"
|
147
|
-
encoding = 'utf-16le'
|
148
|
-
elsif body[0..2] == "\xEF\xBB\xBF"
|
149
|
-
encoding = 'utf-8'
|
150
|
-
end
|
151
|
-
|
152
|
-
# look for a charset in a content-encoding header
|
153
|
-
if content_type
|
154
|
-
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
155
|
-
end
|
156
|
-
|
157
|
-
# look for a charset in a meta tag in the first 1024 bytes
|
158
|
-
if not encoding
|
159
|
-
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
|
160
|
-
data.scan(/<meta.*?>/m).each do |meta|
|
161
|
-
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
# if all else fails, default to the official default encoding for HTML
|
166
|
-
encoding ||= Encoding::ISO_8859_1
|
167
|
-
|
168
|
-
# change the encoding to match the detected or inferred encoding
|
169
|
-
begin
|
170
|
-
body.force_encoding(encoding)
|
171
|
-
rescue ArgumentError
|
172
|
-
body.force_encoding(Encoding::ISO_8859_1)
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
body.encode(Encoding::UTF_8)
|
177
|
-
end
|
178
|
-
end
|
14
|
+
# The default maximum depth of the DOM tree produced by parsing a document
|
15
|
+
# or fragment.
|
16
|
+
DEFAULT_MAX_TREE_DEPTH = 400
|
179
17
|
end
|