nokogumbo 1.5.0 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,58 +1,68 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2017-2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
16
4
 
17
- #include "util.h"
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
18
10
 
19
- #include <assert.h>
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
17
+
18
+ #include <stdio.h>
20
19
  #include <stdlib.h>
21
20
  #include <string.h>
22
- #include <strings.h>
23
- #include <stdarg.h>
24
- #include <stdio.h>
25
-
21
+ #include "util.h"
26
22
  #include "gumbo.h"
27
- #include "parser.h"
28
23
 
29
- // TODO(jdtang): This should be elsewhere, but there's no .c file for
30
- // SourcePositions and yet the constant needs some linkage, so this is as good
31
- // as any.
32
- const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0};
24
+ void* gumbo_alloc(size_t size) {
25
+ void* ptr = malloc(size);
26
+ if (unlikely(ptr == NULL)) {
27
+ perror(__func__);
28
+ abort();
29
+ }
30
+ return ptr;
31
+ }
33
32
 
34
- void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
35
- return parser->_options->allocator(parser->_options->userdata, num_bytes);
33
+ void* gumbo_realloc(void* ptr, size_t size) {
34
+ ptr = realloc(ptr, size);
35
+ if (unlikely(ptr == NULL)) {
36
+ perror(__func__);
37
+ abort();
38
+ }
39
+ return ptr;
36
40
  }
37
41
 
38
- void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {
39
- parser->_options->deallocator(parser->_options->userdata, ptr);
42
+ void gumbo_free(void* ptr) {
43
+ free(ptr);
40
44
  }
41
45
 
42
- char* gumbo_copy_stringz(GumboParser* parser, const char* str) {
43
- char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1);
44
- strcpy(buffer, str);
45
- return buffer;
46
+ char* gumbo_strdup(const char* str) {
47
+ const size_t size = strlen(str) + 1;
48
+ // The strdup(3) function isn't available in strict "-std=c99" mode
49
+ // (it's part of POSIX, not C99), so use malloc(3) and memcpy(3)
50
+ // instead:
51
+ char* buffer = gumbo_alloc(size);
52
+ return memcpy(buffer, str, size);
46
53
  }
47
54
 
48
- // Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG
49
- // to use.
50
- void gumbo_debug(const char* format, ...) {
51
55
  #ifdef GUMBO_DEBUG
56
+ #include <stdarg.h>
57
+ // Debug function to trace operation of the parser
58
+ // (define GUMBO_DEBUG to use).
59
+ void gumbo_debug(const char* format, ...) {
52
60
  va_list args;
53
61
  va_start(args, format);
54
62
  vprintf(format, args);
55
63
  va_end(args);
56
64
  fflush(stdout);
57
- #endif
58
65
  }
66
+ #else
67
+ void gumbo_debug(const char* UNUSED_ARG(format), ...) {}
68
+ #endif
@@ -1,60 +1,30 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // This contains some utility functions that didn't fit into any of the other
18
- // headers.
19
-
20
1
  #ifndef GUMBO_UTIL_H_
21
2
  #define GUMBO_UTIL_H_
22
- #ifdef _MSC_VER
23
- #define _CRT_SECURE_NO_WARNINGS
24
- #endif
3
+
25
4
  #include <stdbool.h>
26
5
  #include <stddef.h>
6
+ #include "macros.h"
27
7
 
28
8
  #ifdef __cplusplus
29
9
  extern "C" {
30
10
  #endif
31
11
 
32
- // Forward declaration since it's passed into some of the functions in this
33
- // header.
34
- struct GumboInternalParser;
35
-
36
12
  // Utility function for allocating & copying a null-terminated string into a
37
- // freshly-allocated buffer. This is necessary for proper memory management; we
13
+ // freshly-allocated buffer. This is necessary for proper memory management; we
38
14
  // have the convention that all const char* in parse tree structures are
39
15
  // freshly-allocated, so if we didn't copy, we'd try to delete a literal string
40
16
  // when the parse tree is destroyed.
41
- char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str);
42
-
43
- // Allocate a chunk of memory, using the allocator specified in the Parser's
44
- // config options.
45
- void* gumbo_parser_allocate(
46
- struct GumboInternalParser* parser, size_t num_bytes);
17
+ char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;
47
18
 
48
- // Deallocate a chunk of memory, using the deallocator specified in the Parser's
49
- // config options.
50
- void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr);
19
+ void* gumbo_alloc(size_t size) XMALLOC;
20
+ void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
21
+ void gumbo_free(void* ptr);
51
22
 
52
- // Debug wrapper for printf, to make it easier to turn off debugging info when
53
- // required.
54
- void gumbo_debug(const char* format, ...);
23
+ // Debug wrapper for printf
24
+ void gumbo_debug(const char* format, ...) PRINTF(1);
55
25
 
56
26
  #ifdef __cplusplus
57
27
  }
58
28
  #endif
59
29
 
60
- #endif // GUMBO_UTIL_H_
30
+ #endif // GUMBO_UTIL_H_
@@ -1,81 +1,64 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
16
4
 
17
- #include "vector.h"
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
18
17
 
19
18
  #include <assert.h>
20
19
  #include <stdlib.h>
21
20
  #include <string.h>
22
- #include <strings.h>
23
-
21
+ #include "vector.h"
24
22
  #include "util.h"
25
23
 
26
- struct GumboInternalParser;
27
-
28
- const GumboVector kGumboEmptyVector = {NULL, 0, 0};
29
-
30
- void gumbo_vector_init(struct GumboInternalParser* parser,
31
- size_t initial_capacity, GumboVector* vector) {
24
+ void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector) {
32
25
  vector->length = 0;
33
26
  vector->capacity = initial_capacity;
34
27
  if (initial_capacity > 0) {
35
- vector->data =
36
- gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
28
+ vector->data = gumbo_alloc(sizeof(void*) * initial_capacity);
37
29
  } else {
38
30
  vector->data = NULL;
39
31
  }
40
32
  }
41
33
 
42
- void gumbo_vector_destroy(
43
- struct GumboInternalParser* parser, GumboVector* vector) {
34
+ void gumbo_vector_destroy(GumboVector* vector) {
44
35
  if (vector->capacity > 0) {
45
- gumbo_parser_deallocate(parser, vector->data);
36
+ gumbo_free(vector->data);
46
37
  }
47
38
  }
48
39
 
49
- static void enlarge_vector_if_full(
50
- struct GumboInternalParser* parser, GumboVector* vector) {
40
+ static void enlarge_vector_if_full(GumboVector* vector) {
51
41
  if (vector->length >= vector->capacity) {
52
42
  if (vector->capacity) {
53
- size_t old_num_bytes = sizeof(void*) * vector->capacity;
54
43
  vector->capacity *= 2;
55
44
  size_t num_bytes = sizeof(void*) * vector->capacity;
56
- void** temp = gumbo_parser_allocate(parser, num_bytes);
57
- memcpy(temp, vector->data, old_num_bytes);
58
- gumbo_parser_deallocate(parser, vector->data);
59
- vector->data = temp;
45
+ vector->data = gumbo_realloc(vector->data, num_bytes);
60
46
  } else {
61
47
  // 0-capacity vector; no previous array to deallocate.
62
48
  vector->capacity = 2;
63
- vector->data =
64
- gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
49
+ vector->data = gumbo_alloc(sizeof(void*) * vector->capacity);
65
50
  }
66
51
  }
67
52
  }
68
53
 
69
- void gumbo_vector_add(
70
- struct GumboInternalParser* parser, void* element, GumboVector* vector) {
71
- enlarge_vector_if_full(parser, vector);
54
+ void gumbo_vector_add(void* element, GumboVector* vector) {
55
+ enlarge_vector_if_full(vector);
72
56
  assert(vector->data);
73
57
  assert(vector->length < vector->capacity);
74
58
  vector->data[vector->length++] = element;
75
59
  }
76
60
 
77
- void* gumbo_vector_pop(
78
- struct GumboInternalParser* parser, GumboVector* vector) {
61
+ void* gumbo_vector_pop(GumboVector* vector) {
79
62
  if (vector->length == 0) {
80
63
  return NULL;
81
64
  }
@@ -91,33 +74,38 @@ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
91
74
  return -1;
92
75
  }
93
76
 
94
- void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
95
- unsigned int index, GumboVector* vector) {
96
- assert(index >= 0);
77
+ void gumbo_vector_insert_at (
78
+ void* element,
79
+ unsigned int index,
80
+ GumboVector* vector
81
+ ) {
97
82
  assert(index <= vector->length);
98
- enlarge_vector_if_full(parser, vector);
83
+ enlarge_vector_if_full(vector);
99
84
  ++vector->length;
100
- memmove(&vector->data[index + 1], &vector->data[index],
101
- sizeof(void*) * (vector->length - index - 1));
85
+ memmove (
86
+ &vector->data[index + 1],
87
+ &vector->data[index],
88
+ sizeof(void*) * (vector->length - index - 1)
89
+ );
102
90
  vector->data[index] = element;
103
91
  }
104
92
 
105
- void gumbo_vector_remove(
106
- struct GumboInternalParser* parser, void* node, GumboVector* vector) {
93
+ void gumbo_vector_remove(void* node, GumboVector* vector) {
107
94
  int index = gumbo_vector_index_of(vector, node);
108
95
  if (index == -1) {
109
96
  return;
110
97
  }
111
- gumbo_vector_remove_at(parser, index, vector);
98
+ gumbo_vector_remove_at(index, vector);
112
99
  }
113
100
 
114
- void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
115
- unsigned int index, GumboVector* vector) {
116
- assert(index >= 0);
101
+ void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector) {
117
102
  assert(index < vector->length);
118
103
  void* result = vector->data[index];
119
- memmove(&vector->data[index], &vector->data[index + 1],
120
- sizeof(void*) * (vector->length - index - 1));
104
+ memmove (
105
+ &vector->data[index],
106
+ &vector->data[index + 1],
107
+ sizeof(void*) * (vector->length - index - 1)
108
+ );
121
109
  --vector->length;
122
110
  return result;
123
111
  }
@@ -1,19 +1,3 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
1
  #ifndef GUMBO_VECTOR_H_
18
2
  #define GUMBO_VECTOR_H_
19
3
 
@@ -23,45 +7,39 @@
23
7
  extern "C" {
24
8
  #endif
25
9
 
26
- // Forward declaration since it's passed into some of the functions in this
27
- // header.
28
- struct GumboInternalParser;
29
-
30
10
  // Initializes a new GumboVector with the specified initial capacity.
31
- void gumbo_vector_init(struct GumboInternalParser* parser,
32
- size_t initial_capacity, GumboVector* vector);
11
+ void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector);
33
12
 
34
- // Frees the memory used by an GumboVector. Does not free the contained
13
+ // Frees the memory used by a GumboVector. Does not free the contained
35
14
  // pointers.
36
- void gumbo_vector_destroy(
37
- struct GumboInternalParser* parser, GumboVector* vector);
15
+ void gumbo_vector_destroy(GumboVector* vector);
38
16
 
39
- // Adds a new element to an GumboVector.
40
- void gumbo_vector_add(
41
- struct GumboInternalParser* parser, void* element, GumboVector* vector);
17
+ // Adds a new element to a GumboVector.
18
+ void gumbo_vector_add(void* element, GumboVector* vector);
42
19
 
43
20
  // Removes and returns the element most recently added to the GumboVector.
44
- // Ownership is transferred to caller. Capacity is unchanged. If the vector is
21
+ // Ownership is transferred to caller. Capacity is unchanged. If the vector is
45
22
  // empty, NULL is returned.
46
- void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
23
+ void* gumbo_vector_pop(GumboVector* vector);
47
24
 
48
- // Inserts an element at a specific index. This is potentially O(N) time, but
25
+ // Inserts an element at a specific index. This is potentially O(N) time, but
49
26
  // is necessary for some of the spec's behavior.
50
- void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
51
- unsigned int index, GumboVector* vector);
27
+ void gumbo_vector_insert_at (
28
+ void* element,
29
+ unsigned int index,
30
+ GumboVector* vector
31
+ );
52
32
 
53
33
  // Removes an element from the vector, or does nothing if the element is not in
54
34
  // the vector.
55
- void gumbo_vector_remove(
56
- struct GumboInternalParser* parser, void* element, GumboVector* vector);
35
+ void gumbo_vector_remove(void* element, GumboVector* vector);
57
36
 
58
- // Removes and returns an element at a specific index. Note that this is
37
+ // Removes and returns an element at a specific index. Note that this is
59
38
  // potentially O(N) time and should be used sparingly.
60
- void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
61
- unsigned int index, GumboVector* vector);
39
+ void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector);
62
40
 
63
41
  #ifdef __cplusplus
64
42
  }
65
43
  #endif
66
44
 
67
- #endif // GUMBO_VECTOR_H_
45
+ #endif // GUMBO_VECTOR_H_
@@ -1,179 +1,17 @@
1
1
  require 'nokogiri'
2
- require 'nokogumboc'
2
+ require 'nokogumbo/version'
3
+ require 'nokogumbo/html5'
3
4
 
4
- module Nokogiri
5
- # Parse an HTML document. +string+ contains the document. +string+
6
- # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
7
- def self.HTML5(*args)
8
- Nokogiri::HTML5.parse(*args)
9
- end
5
+ require 'nokogumbo/nokogumbo'
10
6
 
11
- module HTML5
12
- # Parse an HTML document. +string+ contains the document. +string+
13
- # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
14
- def self.parse(string, options={})
15
- if string.respond_to? :read
16
- string = string.read
17
- end
7
+ module Nokogumbo
8
+ # The default maximum number of attributes per element.
9
+ DEFAULT_MAX_ATTRIBUTES = 400
18
10
 
19
- # convert to UTF-8 (Ruby 1.9+)
20
- if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8
21
- string = reencode(string)
22
- end
11
+ # The default maximum number of errors for parsing a document or a fragment.
12
+ DEFAULT_MAX_ERRORS = 0
23
13
 
24
- Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
25
- end
26
-
27
- # Fetch and parse a HTML document from the web, following redirects,
28
- # handling https, and determining the character encoding using HTML5
29
- # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
30
- # http headers and special options. Everything which is not a
31
- # special option is considered a header. Special options include:
32
- # * :follow_limit => number of redirects which are followed
33
- # * :basic_auth => [username, password]
34
- def self.get(uri, options={})
35
- headers = options.clone
36
- headers = {:follow_limit => headers} if Numeric === headers # deprecated
37
- limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
38
-
39
- require 'net/http'
40
- uri = URI(uri) unless URI === uri
41
-
42
- http = Net::HTTP.new(uri.host, uri.port)
43
-
44
- # TLS / SSL support
45
- http.use_ssl = true if uri.scheme == 'https'
46
-
47
- # Pass through Net::HTTP override values, which currently include:
48
- # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
49
- # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
50
- # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
51
- # :verify_callback, :verify_depth, :verify_mode
52
- options.each do |key, value|
53
- http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
54
- end
55
-
56
- request = Net::HTTP::Get.new(uri.request_uri)
57
-
58
- # basic authentication
59
- auth = headers.delete(:basic_auth)
60
- auth ||= [uri.user, uri.password] if uri.user and uri.password
61
- request.basic_auth auth.first, auth.last if auth
62
-
63
- # remaining options are treated as headers
64
- headers.each {|key, value| request[key.to_s] = value.to_s}
65
-
66
- response = http.request(request)
67
-
68
- case response
69
- when Net::HTTPSuccess
70
- doc = parse(reencode(response.body, response['content-type']), options)
71
- doc.instance_variable_set('@response', response)
72
- doc.class.send(:attr_reader, :response)
73
- doc
74
- when Net::HTTPRedirection
75
- response.value if limit <= 1
76
- location = URI.join(uri, response['location'])
77
- get(location, options.merge(:follow_limit => limit-1))
78
- else
79
- response.value
80
- end
81
- end
82
-
83
- # while fragment is on the Gumbo TODO list, simulate it by doing
84
- # a full document parse and ignoring the parent <html>, <head>, and <body>
85
- # tags, and collecting up the children of each.
86
- def self.fragment(*args)
87
- doc = parse(*args)
88
- fragment = Nokogiri::HTML::DocumentFragment.new(doc)
89
-
90
- if doc.children.length != 1 or doc.children.first.name != 'html'
91
- # no HTML? Return document as is
92
- fragment = doc
93
- else
94
- # examine children of HTML element
95
- children = doc.children.first.children
96
-
97
- # head is always first. If present, take children but otherwise
98
- # ignore the head element
99
- if children.length > 0 and doc.children.first.name = 'head'
100
- fragment << children.shift.children
101
- end
102
-
103
- # body may be next, or last. If found, take children but otherwise
104
- # ignore the body element. Also take any remaining elements, taking
105
- # care to preserve order.
106
- if children.length > 0 and doc.children.first.name = 'body'
107
- fragment << children.shift.children
108
- fragment << children
109
- elsif children.length > 0 and doc.children.last.name = 'body'
110
- body = children.pop
111
- fragment << children
112
- fragment << body.children
113
- else
114
- fragment << children
115
- end
116
- end
117
-
118
- # return result
119
- fragment
120
- end
121
-
122
- private
123
-
124
- # Charset sniffing is a complex and controversial topic that understandably
125
- # isn't done _by default_ by the Ruby Net::HTTP library. This being said,
126
- # it is a very real problem for consumers of HTML as the default for HTML
127
- # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
128
- # *only* supports utf-8.
129
- #
130
- # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
131
- # detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
132
- # while attempting to more closely follow the HTML5 standard.
133
- #
134
- # http://bugs.ruby-lang.org/issues/2567
135
- # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
136
- #
137
- def self.reencode(body, content_type=nil)
138
- return body unless body.respond_to? :encoding
139
-
140
- if body.encoding == Encoding::ASCII_8BIT
141
- encoding = nil
142
-
143
- # look for a Byte Order Mark (BOM)
144
- if body[0..1] == "\xFE\xFF"
145
- encoding = 'utf-16be'
146
- elsif body[0..1] == "\xFF\xFE"
147
- encoding = 'utf-16le'
148
- elsif body[0..2] == "\xEF\xBB\xBF"
149
- encoding = 'utf-8'
150
- end
151
-
152
- # look for a charset in a content-encoding header
153
- if content_type
154
- encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
155
- end
156
-
157
- # look for a charset in a meta tag in the first 1024 bytes
158
- if not encoding
159
- data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
160
- data.scan(/<meta.*?>/m).each do |meta|
161
- encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
162
- end
163
- end
164
-
165
- # if all else fails, default to the official default encoding for HTML
166
- encoding ||= Encoding::ISO_8859_1
167
-
168
- # change the encoding to match the detected or inferred encoding
169
- begin
170
- body.force_encoding(encoding)
171
- rescue ArgumentError
172
- body.force_encoding(Encoding::ISO_8859_1)
173
- end
174
- end
175
-
176
- body.encode(Encoding::UTF_8)
177
- end
178
- end
14
+ # The default maximum depth of the DOM tree produced by parsing a document
15
+ # or fragment.
16
+ DEFAULT_MAX_TREE_DEPTH = 400
179
17
  end