nokogumbo 1.5.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,58 +1,68 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2017-2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
16
4
 
17
- #include "util.h"
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
18
10
 
19
- #include <assert.h>
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
17
+
18
+ #include <stdio.h>
20
19
  #include <stdlib.h>
21
20
  #include <string.h>
22
- #include <strings.h>
23
- #include <stdarg.h>
24
- #include <stdio.h>
25
-
21
+ #include "util.h"
26
22
  #include "gumbo.h"
27
- #include "parser.h"
28
23
 
29
- // TODO(jdtang): This should be elsewhere, but there's no .c file for
30
- // SourcePositions and yet the constant needs some linkage, so this is as good
31
- // as any.
32
- const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0};
24
+ void* gumbo_alloc(size_t size) {
25
+ void* ptr = malloc(size);
26
+ if (unlikely(ptr == NULL)) {
27
+ perror(__func__);
28
+ abort();
29
+ }
30
+ return ptr;
31
+ }
33
32
 
34
- void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
35
- return parser->_options->allocator(parser->_options->userdata, num_bytes);
33
+ void* gumbo_realloc(void* ptr, size_t size) {
34
+ ptr = realloc(ptr, size);
35
+ if (unlikely(ptr == NULL)) {
36
+ perror(__func__);
37
+ abort();
38
+ }
39
+ return ptr;
36
40
  }
37
41
 
38
- void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {
39
- parser->_options->deallocator(parser->_options->userdata, ptr);
42
+ void gumbo_free(void* ptr) {
43
+ free(ptr);
40
44
  }
41
45
 
42
- char* gumbo_copy_stringz(GumboParser* parser, const char* str) {
43
- char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1);
44
- strcpy(buffer, str);
45
- return buffer;
46
+ char* gumbo_strdup(const char* str) {
47
+ const size_t size = strlen(str) + 1;
48
+ // The strdup(3) function isn't available in strict "-std=c99" mode
49
+ // (it's part of POSIX, not C99), so use malloc(3) and memcpy(3)
50
+ // instead:
51
+ char* buffer = gumbo_alloc(size);
52
+ return memcpy(buffer, str, size);
46
53
  }
47
54
 
48
- // Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG
49
- // to use.
50
- void gumbo_debug(const char* format, ...) {
51
55
  #ifdef GUMBO_DEBUG
56
+ #include <stdarg.h>
57
+ // Debug function to trace operation of the parser
58
+ // (define GUMBO_DEBUG to use).
59
+ void gumbo_debug(const char* format, ...) {
52
60
  va_list args;
53
61
  va_start(args, format);
54
62
  vprintf(format, args);
55
63
  va_end(args);
56
64
  fflush(stdout);
57
- #endif
58
65
  }
66
+ #else
67
+ void gumbo_debug(const char* UNUSED_ARG(format), ...) {}
68
+ #endif
@@ -1,60 +1,30 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains some utility functions that didn't fit into any of the other
18
- // headers.
19
-
20
1
  #ifndef GUMBO_UTIL_H_
21
2
  #define GUMBO_UTIL_H_
22
- #ifdef _MSC_VER
23
- #define _CRT_SECURE_NO_WARNINGS
24
- #endif
3
+
25
4
  #include <stdbool.h>
26
5
  #include <stddef.h>
6
+ #include "macros.h"
27
7
 
28
8
  #ifdef __cplusplus
29
9
  extern "C" {
30
10
  #endif
31
11
 
32
- // Forward declaration since it's passed into some of the functions in this
33
- // header.
34
- struct GumboInternalParser;
35
-
36
12
  // Utility function for allocating & copying a null-terminated string into a
37
- // freshly-allocated buffer. This is necessary for proper memory management; we
13
+ // freshly-allocated buffer. This is necessary for proper memory management; we
38
14
  // have the convention that all const char* in parse tree structures are
39
15
  // freshly-allocated, so if we didn't copy, we'd try to delete a literal string
40
16
  // when the parse tree is destroyed.
41
- char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str);
42
-
43
- // Allocate a chunk of memory, using the allocator specified in the Parser's
44
- // config options.
45
- void* gumbo_parser_allocate(
46
- struct GumboInternalParser* parser, size_t num_bytes);
17
+ char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;
47
18
 
48
- // Deallocate a chunk of memory, using the deallocator specified in the Parser's
49
- // config options.
50
- void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr);
19
+ void* gumbo_alloc(size_t size) XMALLOC;
20
+ void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
21
+ void gumbo_free(void* ptr);
51
22
 
52
- // Debug wrapper for printf, to make it easier to turn off debugging info when
53
- // required.
54
- void gumbo_debug(const char* format, ...);
23
+ // Debug wrapper for printf
24
+ void gumbo_debug(const char* format, ...) PRINTF(1);
55
25
 
56
26
  #ifdef __cplusplus
57
27
  }
58
28
  #endif
59
29
 
60
- #endif // GUMBO_UTIL_H_
30
+ #endif // GUMBO_UTIL_H_
@@ -1,81 +1,64 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
16
4
 
17
- #include "vector.h"
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
18
17
 
19
18
  #include <assert.h>
20
19
  #include <stdlib.h>
21
20
  #include <string.h>
22
- #include <strings.h>
23
-
21
+ #include "vector.h"
24
22
  #include "util.h"
25
23
 
26
- struct GumboInternalParser;
27
-
28
- const GumboVector kGumboEmptyVector = {NULL, 0, 0};
29
-
30
- void gumbo_vector_init(struct GumboInternalParser* parser,
31
- size_t initial_capacity, GumboVector* vector) {
24
+ void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector) {
32
25
  vector->length = 0;
33
26
  vector->capacity = initial_capacity;
34
27
  if (initial_capacity > 0) {
35
- vector->data =
36
- gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
28
+ vector->data = gumbo_alloc(sizeof(void*) * initial_capacity);
37
29
  } else {
38
30
  vector->data = NULL;
39
31
  }
40
32
  }
41
33
 
42
- void gumbo_vector_destroy(
43
- struct GumboInternalParser* parser, GumboVector* vector) {
34
+ void gumbo_vector_destroy(GumboVector* vector) {
44
35
  if (vector->capacity > 0) {
45
- gumbo_parser_deallocate(parser, vector->data);
36
+ gumbo_free(vector->data);
46
37
  }
47
38
  }
48
39
 
49
- static void enlarge_vector_if_full(
50
- struct GumboInternalParser* parser, GumboVector* vector) {
40
+ static void enlarge_vector_if_full(GumboVector* vector) {
51
41
  if (vector->length >= vector->capacity) {
52
42
  if (vector->capacity) {
53
- size_t old_num_bytes = sizeof(void*) * vector->capacity;
54
43
  vector->capacity *= 2;
55
44
  size_t num_bytes = sizeof(void*) * vector->capacity;
56
- void** temp = gumbo_parser_allocate(parser, num_bytes);
57
- memcpy(temp, vector->data, old_num_bytes);
58
- gumbo_parser_deallocate(parser, vector->data);
59
- vector->data = temp;
45
+ vector->data = gumbo_realloc(vector->data, num_bytes);
60
46
  } else {
61
47
  // 0-capacity vector; no previous array to deallocate.
62
48
  vector->capacity = 2;
63
- vector->data =
64
- gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
49
+ vector->data = gumbo_alloc(sizeof(void*) * vector->capacity);
65
50
  }
66
51
  }
67
52
  }
68
53
 
69
- void gumbo_vector_add(
70
- struct GumboInternalParser* parser, void* element, GumboVector* vector) {
71
- enlarge_vector_if_full(parser, vector);
54
+ void gumbo_vector_add(void* element, GumboVector* vector) {
55
+ enlarge_vector_if_full(vector);
72
56
  assert(vector->data);
73
57
  assert(vector->length < vector->capacity);
74
58
  vector->data[vector->length++] = element;
75
59
  }
76
60
 
77
- void* gumbo_vector_pop(
78
- struct GumboInternalParser* parser, GumboVector* vector) {
61
+ void* gumbo_vector_pop(GumboVector* vector) {
79
62
  if (vector->length == 0) {
80
63
  return NULL;
81
64
  }
@@ -91,33 +74,38 @@ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
91
74
  return -1;
92
75
  }
93
76
 
94
- void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
95
- unsigned int index, GumboVector* vector) {
96
- assert(index >= 0);
77
+ void gumbo_vector_insert_at (
78
+ void* element,
79
+ unsigned int index,
80
+ GumboVector* vector
81
+ ) {
97
82
  assert(index <= vector->length);
98
- enlarge_vector_if_full(parser, vector);
83
+ enlarge_vector_if_full(vector);
99
84
  ++vector->length;
100
- memmove(&vector->data[index + 1], &vector->data[index],
101
- sizeof(void*) * (vector->length - index - 1));
85
+ memmove (
86
+ &vector->data[index + 1],
87
+ &vector->data[index],
88
+ sizeof(void*) * (vector->length - index - 1)
89
+ );
102
90
  vector->data[index] = element;
103
91
  }
104
92
 
105
- void gumbo_vector_remove(
106
- struct GumboInternalParser* parser, void* node, GumboVector* vector) {
93
+ void gumbo_vector_remove(void* node, GumboVector* vector) {
107
94
  int index = gumbo_vector_index_of(vector, node);
108
95
  if (index == -1) {
109
96
  return;
110
97
  }
111
- gumbo_vector_remove_at(parser, index, vector);
98
+ gumbo_vector_remove_at(index, vector);
112
99
  }
113
100
 
114
- void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
115
- unsigned int index, GumboVector* vector) {
116
- assert(index >= 0);
101
+ void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector) {
117
102
  assert(index < vector->length);
118
103
  void* result = vector->data[index];
119
- memmove(&vector->data[index], &vector->data[index + 1],
120
- sizeof(void*) * (vector->length - index - 1));
104
+ memmove (
105
+ &vector->data[index],
106
+ &vector->data[index + 1],
107
+ sizeof(void*) * (vector->length - index - 1)
108
+ );
121
109
  --vector->length;
122
110
  return result;
123
111
  }
@@ -1,19 +1,3 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
1
  #ifndef GUMBO_VECTOR_H_
18
2
  #define GUMBO_VECTOR_H_
19
3
 
@@ -23,45 +7,39 @@
23
7
  extern "C" {
24
8
  #endif
25
9
 
26
- // Forward declaration since it's passed into some of the functions in this
27
- // header.
28
- struct GumboInternalParser;
29
-
30
10
  // Initializes a new GumboVector with the specified initial capacity.
31
- void gumbo_vector_init(struct GumboInternalParser* parser,
32
- size_t initial_capacity, GumboVector* vector);
11
+ void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector);
33
12
 
34
- // Frees the memory used by an GumboVector. Does not free the contained
13
+ // Frees the memory used by a GumboVector. Does not free the contained
35
14
  // pointers.
36
- void gumbo_vector_destroy(
37
- struct GumboInternalParser* parser, GumboVector* vector);
15
+ void gumbo_vector_destroy(GumboVector* vector);
38
16
 
39
- // Adds a new element to an GumboVector.
40
- void gumbo_vector_add(
41
- struct GumboInternalParser* parser, void* element, GumboVector* vector);
17
+ // Adds a new element to a GumboVector.
18
+ void gumbo_vector_add(void* element, GumboVector* vector);
42
19
 
43
20
  // Removes and returns the element most recently added to the GumboVector.
44
- // Ownership is transferred to caller. Capacity is unchanged. If the vector is
21
+ // Ownership is transferred to caller. Capacity is unchanged. If the vector is
45
22
  // empty, NULL is returned.
46
- void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
23
+ void* gumbo_vector_pop(GumboVector* vector);
47
24
 
48
- // Inserts an element at a specific index. This is potentially O(N) time, but
25
+ // Inserts an element at a specific index. This is potentially O(N) time, but
49
26
  // is necessary for some of the spec's behavior.
50
- void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
51
- unsigned int index, GumboVector* vector);
27
+ void gumbo_vector_insert_at (
28
+ void* element,
29
+ unsigned int index,
30
+ GumboVector* vector
31
+ );
52
32
 
53
33
  // Removes an element from the vector, or does nothing if the element is not in
54
34
  // the vector.
55
- void gumbo_vector_remove(
56
- struct GumboInternalParser* parser, void* element, GumboVector* vector);
35
+ void gumbo_vector_remove(void* element, GumboVector* vector);
57
36
 
58
- // Removes and returns an element at a specific index. Note that this is
37
+ // Removes and returns an element at a specific index. Note that this is
59
38
  // potentially O(N) time and should be used sparingly.
60
- void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
61
- unsigned int index, GumboVector* vector);
39
+ void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector);
62
40
 
63
41
  #ifdef __cplusplus
64
42
  }
65
43
  #endif
66
44
 
67
- #endif // GUMBO_VECTOR_H_
45
+ #endif // GUMBO_VECTOR_H_
@@ -1,179 +1,17 @@
1
1
  require 'nokogiri'
2
- require 'nokogumboc'
2
+ require 'nokogumbo/version'
3
+ require 'nokogumbo/html5'
3
4
 
4
- module Nokogiri
5
- # Parse an HTML document. +string+ contains the document. +string+
6
- # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
7
- def self.HTML5(*args)
8
- Nokogiri::HTML5.parse(*args)
9
- end
5
+ require 'nokogumbo/nokogumbo'
10
6
 
11
- module HTML5
12
- # Parse an HTML document. +string+ contains the document. +string+
13
- # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
14
- def self.parse(string, options={})
15
- if string.respond_to? :read
16
- string = string.read
17
- end
7
+ module Nokogumbo
8
+ # The default maximum number of attributes per element.
9
+ DEFAULT_MAX_ATTRIBUTES = 400
18
10
 
19
- # convert to UTF-8 (Ruby 1.9+)
20
- if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8
21
- string = reencode(string)
22
- end
11
+ # The default maximum number of errors for parsing a document or a fragment.
12
+ DEFAULT_MAX_ERRORS = 0
23
13
 
24
- Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
25
- end
26
-
27
- # Fetch and parse a HTML document from the web, following redirects,
28
- # handling https, and determining the character encoding using HTML5
29
- # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
30
- # http headers and special options. Everything which is not a
31
- # special option is considered a header. Special options include:
32
- # * :follow_limit => number of redirects which are followed
33
- # * :basic_auth => [username, password]
34
- def self.get(uri, options={})
35
- headers = options.clone
36
- headers = {:follow_limit => headers} if Numeric === headers # deprecated
37
- limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
38
-
39
- require 'net/http'
40
- uri = URI(uri) unless URI === uri
41
-
42
- http = Net::HTTP.new(uri.host, uri.port)
43
-
44
- # TLS / SSL support
45
- http.use_ssl = true if uri.scheme == 'https'
46
-
47
- # Pass through Net::HTTP override values, which currently include:
48
- # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
49
- # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
50
- # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
51
- # :verify_callback, :verify_depth, :verify_mode
52
- options.each do |key, value|
53
- http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
54
- end
55
-
56
- request = Net::HTTP::Get.new(uri.request_uri)
57
-
58
- # basic authentication
59
- auth = headers.delete(:basic_auth)
60
- auth ||= [uri.user, uri.password] if uri.user and uri.password
61
- request.basic_auth auth.first, auth.last if auth
62
-
63
- # remaining options are treated as headers
64
- headers.each {|key, value| request[key.to_s] = value.to_s}
65
-
66
- response = http.request(request)
67
-
68
- case response
69
- when Net::HTTPSuccess
70
- doc = parse(reencode(response.body, response['content-type']), options)
71
- doc.instance_variable_set('@response', response)
72
- doc.class.send(:attr_reader, :response)
73
- doc
74
- when Net::HTTPRedirection
75
- response.value if limit <= 1
76
- location = URI.join(uri, response['location'])
77
- get(location, options.merge(:follow_limit => limit-1))
78
- else
79
- response.value
80
- end
81
- end
82
-
83
- # while fragment is on the Gumbo TODO list, simulate it by doing
84
- # a full document parse and ignoring the parent <html>, <head>, and <body>
85
- # tags, and collecting up the children of each.
86
- def self.fragment(*args)
87
- doc = parse(*args)
88
- fragment = Nokogiri::HTML::DocumentFragment.new(doc)
89
-
90
- if doc.children.length != 1 or doc.children.first.name != 'html'
91
- # no HTML? Return document as is
92
- fragment = doc
93
- else
94
- # examine children of HTML element
95
- children = doc.children.first.children
96
-
97
- # head is always first. If present, take children but otherwise
98
- # ignore the head element
99
- if children.length > 0 and doc.children.first.name = 'head'
100
- fragment << children.shift.children
101
- end
102
-
103
- # body may be next, or last. If found, take children but otherwise
104
- # ignore the body element. Also take any remaining elements, taking
105
- # care to preserve order.
106
- if children.length > 0 and doc.children.first.name = 'body'
107
- fragment << children.shift.children
108
- fragment << children
109
- elsif children.length > 0 and doc.children.last.name = 'body'
110
- body = children.pop
111
- fragment << children
112
- fragment << body.children
113
- else
114
- fragment << children
115
- end
116
- end
117
-
118
- # return result
119
- fragment
120
- end
121
-
122
- private
123
-
124
- # Charset sniffing is a complex and controversial topic that understandably
125
- # isn't done _by default_ by the Ruby Net::HTTP library. This being said,
126
- # it is a very real problem for consumers of HTML as the default for HTML
127
- # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
128
- # *only* supports utf-8.
129
- #
130
- # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
131
- # detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
132
- # while attempting to more closely follow the HTML5 standard.
133
- #
134
- # http://bugs.ruby-lang.org/issues/2567
135
- # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
136
- #
137
- def self.reencode(body, content_type=nil)
138
- return body unless body.respond_to? :encoding
139
-
140
- if body.encoding == Encoding::ASCII_8BIT
141
- encoding = nil
142
-
143
- # look for a Byte Order Mark (BOM)
144
- if body[0..1] == "\xFE\xFF"
145
- encoding = 'utf-16be'
146
- elsif body[0..1] == "\xFF\xFE"
147
- encoding = 'utf-16le'
148
- elsif body[0..2] == "\xEF\xBB\xBF"
149
- encoding = 'utf-8'
150
- end
151
-
152
- # look for a charset in a content-encoding header
153
- if content_type
154
- encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
155
- end
156
-
157
- # look for a charset in a meta tag in the first 1024 bytes
158
- if not encoding
159
- data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
160
- data.scan(/<meta.*?>/m).each do |meta|
161
- encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
162
- end
163
- end
164
-
165
- # if all else fails, default to the official default encoding for HTML
166
- encoding ||= Encoding::ISO_8859_1
167
-
168
- # change the encoding to match the detected or inferred encoding
169
- begin
170
- body.force_encoding(encoding)
171
- rescue ArgumentError
172
- body.force_encoding(Encoding::ISO_8859_1)
173
- end
174
- end
175
-
176
- body.encode(Encoding::UTF_8)
177
- end
178
- end
14
+ # The default maximum depth of the DOM tree produced by parsing a document
15
+ # or fragment.
16
+ DEFAULT_MAX_TREE_DEPTH = 400
179
17
  end