nokogumbo 0.5 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/work/Makefile +213 -0
- data/work/attribute.c +44 -0
- data/work/attribute.h +37 -0
- data/work/attribute.o +0 -0
- data/work/char_ref.c +2561 -0
- data/work/char_ref.h +61 -0
- data/work/char_ref.o +0 -0
- data/work/error.c +258 -0
- data/work/error.h +225 -0
- data/work/error.o +0 -0
- data/work/gumbo.h +800 -0
- data/work/insertion_mode.h +54 -0
- data/work/mkmf.log +41 -0
- data/work/nokogumbo.c +97 -0
- data/work/nokogumbo.o +0 -0
- data/work/nokogumboc.so +0 -0
- data/work/parser.c +3893 -0
- data/work/parser.h +57 -0
- data/work/parser.o +0 -0
- data/work/string_buffer.c +106 -0
- data/work/string_buffer.h +82 -0
- data/work/string_buffer.o +0 -0
- data/work/string_piece.c +49 -0
- data/work/string_piece.h +39 -0
- data/work/string_piece.o +0 -0
- data/work/tag.c +222 -0
- data/work/tag.o +0 -0
- data/work/token_type.h +40 -0
- data/work/tokenizer.c +2978 -0
- data/work/tokenizer.h +123 -0
- data/work/tokenizer.o +0 -0
- data/work/tokenizer_states.h +103 -0
- data/work/utf8.c +268 -0
- data/work/utf8.h +127 -0
- data/work/utf8.o +0 -0
- data/work/util.c +58 -0
- data/work/util.h +57 -0
- data/work/util.o +0 -0
- data/work/vector.c +121 -0
- data/work/vector.h +66 -0
- data/work/vector.o +0 -0
- metadata +42 -2
- data/Rakefile +0 -68
data/work/util.h
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
//
|
17
|
+
// This contains some utility functions that didn't fit into any of the other
|
18
|
+
// headers.
|
19
|
+
|
20
|
+
#ifndef GUMBO_UTIL_H_
|
21
|
+
#define GUMBO_UTIL_H_
|
22
|
+
|
23
|
+
#include <stdbool.h>
|
24
|
+
#include <stddef.h>
|
25
|
+
|
26
|
+
#ifdef __cplusplus
|
27
|
+
extern "C" {
|
28
|
+
#endif
|
29
|
+
|
30
|
+
// Forward declaration since it's passed into some of the functions in this
|
31
|
+
// header.
|
32
|
+
struct _GumboParser;
|
33
|
+
|
34
|
+
// Utility function for allocating & copying a null-terminated string into a
|
35
|
+
// freshly-allocated buffer. This is necessary for proper memory management; we
|
36
|
+
// have the convention that all const char* in parse tree structures are
|
37
|
+
// freshly-allocated, so if we didn't copy, we'd try to delete a literal string
|
38
|
+
// when the parse tree is destroyed.
|
39
|
+
char* gumbo_copy_stringz(struct _GumboParser* parser, const char* str);
|
40
|
+
|
41
|
+
// Allocate a chunk of memory, using the allocator specified in the Parser's
|
42
|
+
// config options.
|
43
|
+
void* gumbo_parser_allocate(struct _GumboParser* parser, size_t num_bytes);
|
44
|
+
|
45
|
+
// Deallocate a chunk of memory, using the deallocator specified in the Parser's
|
46
|
+
// config options.
|
47
|
+
void gumbo_parser_deallocate(struct _GumboParser* parser, void* ptr);
|
48
|
+
|
49
|
+
// Debug wrapper for printf, to make it easier to turn off debugging info when
|
50
|
+
// required.
|
51
|
+
void gumbo_debug(const char* format, ...);
|
52
|
+
|
53
|
+
#ifdef __cplusplus
|
54
|
+
}
|
55
|
+
#endif
|
56
|
+
|
57
|
+
#endif // GUMBO_UTIL_H_
|
data/work/util.o
ADDED
Binary file
|
data/work/vector.c
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#include "vector.h"
|
18
|
+
|
19
|
+
#include <assert.h>
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <string.h>
|
22
|
+
#include <strings.h>
|
23
|
+
|
24
|
+
#include "util.h"
|
25
|
+
|
26
|
+
struct _GumboParser;
|
27
|
+
|
28
|
+
// Shared constant for a vector with no backing storage. The initializer sets
// the first member to NULL and the following two members to 0 (presumably
// data, length and capacity, in that order -- confirm against gumbo.h).
const GumboVector kGumboEmptyVector = { NULL, 0, 0 };
|
29
|
+
|
30
|
+
// Prepares `vector` for use: it starts with zero elements and room for
// `initial_capacity` pointers. Storage is requested through
// gumbo_parser_allocate only when the requested capacity is non-zero;
// otherwise the data pointer is left as NULL.
void gumbo_vector_init(
    struct _GumboParser* parser, size_t initial_capacity, GumboVector* vector) {
  vector->length = 0;
  vector->capacity = initial_capacity;
  vector->data = (initial_capacity > 0)
      ? gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity)
      : NULL;
}
|
41
|
+
|
42
|
+
// Releases the pointer array owned by `vector` through
// gumbo_parser_deallocate. The elements the array pointed at are not touched.
void gumbo_vector_destroy(struct _GumboParser* parser, GumboVector* vector) {
  // A zero-capacity vector is skipped: nothing is handed to the deallocator.
  if (vector->capacity == 0) {
    return;
  }
  gumbo_parser_deallocate(parser, vector->data);
}
|
47
|
+
|
48
|
+
// Makes sure there is room for at least one more element. When the vector is
// full its capacity is doubled (or set to 2 if it was 0) and the existing
// pointers are copied into a newly allocated array.
static void enlarge_vector_if_full(
    struct _GumboParser* parser, GumboVector* vector) {
  // Spare slots remain; nothing to do.
  if (vector->length < vector->capacity) {
    return;
  }

  if (vector->capacity == 0) {
    // 0-capacity vector; there is no previous array to copy or deallocate.
    vector->capacity = 2;
    vector->data = gumbo_parser_allocate(
        parser, sizeof(void*) * vector->capacity);
    return;
  }

  // Record the size of the live contents before the capacity changes.
  size_t bytes_in_use = sizeof(void*) * vector->capacity;
  vector->capacity *= 2;
  void** grown = gumbo_parser_allocate(
      parser, sizeof(void*) * vector->capacity);
  memcpy(grown, vector->data, bytes_in_use);
  gumbo_parser_deallocate(parser, vector->data);
  vector->data = grown;
}
|
67
|
+
|
68
|
+
// Appends `element` after the current last element, growing the backing
// array first if it is already full.
void gumbo_vector_add(
    struct _GumboParser* parser, void* element, GumboVector* vector) {
  enlarge_vector_if_full(parser, vector);
  // After the growth step there must be storage and at least one free slot.
  assert(vector->data);
  assert(vector->length < vector->capacity);
  vector->data[vector->length] = element;
  vector->length += 1;
}
|
75
|
+
|
76
|
+
// Removes the last element and returns it, or returns NULL when the vector
// holds no elements. The backing array is neither shrunk nor released.
// `parser` is not used by this function.
void* gumbo_vector_pop(struct _GumboParser* parser, GumboVector* vector) {
  if (vector->length > 0) {
    vector->length -= 1;
    return vector->data[vector->length];
  }
  return NULL;
}
|
82
|
+
|
83
|
+
// Linear search by pointer identity. Returns the position of the first slot
// whose stored pointer equals `element`, or -1 if no slot matches.
int gumbo_vector_index_of(GumboVector* vector, void* element) {
  int position = 0;
  while (position < vector->length) {
    if (vector->data[position] == element) {
      return position;
    }
    ++position;
  }
  return -1;
}
|
91
|
+
|
92
|
+
// Places `element` at position `index`, shifting every element from `index`
// onward one slot towards the end. `index` may equal the current length, in
// which case nothing is shifted and the element lands at the end.
void gumbo_vector_insert_at(
    struct _GumboParser* parser, void* element, int index, GumboVector* vector) {
  assert(index >= 0);
  assert(index <= vector->length);
  enlarge_vector_if_full(parser, vector);
  // Take the slot address only after the growth step, since that step may
  // replace the backing array.
  void** slot = vector->data + index;
  // The source and destination ranges overlap, hence memmove.
  memmove(slot + 1, slot, sizeof(void*) * (vector->length - index));
  *slot = element;
  vector->length += 1;
}
|
102
|
+
|
103
|
+
// Removes the first occurrence of the pointer `node` from the vector. If the
// pointer is not present the vector is left unchanged.
void gumbo_vector_remove(
    struct _GumboParser* parser, void* node, GumboVector* vector) {
  int position = gumbo_vector_index_of(vector, node);
  if (position != -1) {
    gumbo_vector_remove_at(parser, position, vector);
  }
}
|
111
|
+
|
112
|
+
// Takes the element at `index` out of the vector and returns it. Elements
// after `index` move one slot towards the front; the length drops by one.
// `parser` is not used by this function.
void* gumbo_vector_remove_at(
    struct _GumboParser* parser, int index, GumboVector* vector) {
  assert(index >= 0);
  assert(index < vector->length);
  void** slot = vector->data + index;
  void* removed = *slot;
  vector->length -= 1;
  // With the length already reduced, (length - index) is the number of
  // trailing elements that must slide down over the vacated slot.
  memmove(slot, slot + 1, sizeof(void*) * (vector->length - index));
  return removed;
}
|
data/work/vector.h
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#ifndef GUMBO_VECTOR_H_
|
18
|
+
#define GUMBO_VECTOR_H_
|
19
|
+
|
20
|
+
#include "gumbo.h"
|
21
|
+
|
22
|
+
#ifdef __cplusplus
|
23
|
+
extern "C" {
|
24
|
+
#endif
|
25
|
+
|
26
|
+
// Forward declaration since it's passed into some of the functions in this
|
27
|
+
// header.
|
28
|
+
struct _GumboParser;
|
29
|
+
|
30
|
+
// Initializes a new GumboVector with the specified initial capacity.
|
31
|
+
void gumbo_vector_init(
|
32
|
+
struct _GumboParser* parser, size_t initial_capacity, GumboVector* vector);
|
33
|
+
|
34
|
+
// Frees the memory used by a GumboVector. Does not free the contained
|
35
|
+
// pointers.
|
36
|
+
void gumbo_vector_destroy(struct _GumboParser* parser, GumboVector* vector);
|
37
|
+
|
38
|
+
// Adds a new element to a GumboVector.
|
39
|
+
void gumbo_vector_add(
|
40
|
+
struct _GumboParser* parser, void* element, GumboVector* vector);
|
41
|
+
|
42
|
+
// Removes and returns the element most recently added to the GumboVector.
|
43
|
+
// Ownership is transferred to caller. Capacity is unchanged. If the vector is
|
44
|
+
// empty, NULL is returned.
|
45
|
+
void* gumbo_vector_pop(struct _GumboParser* parser, GumboVector* vector);
|
46
|
+
|
47
|
+
// Inserts an element at a specific index. This is potentially O(N) time, but
|
48
|
+
// is necessary for some of the spec's behavior.
|
49
|
+
void gumbo_vector_insert_at(
|
50
|
+
struct _GumboParser* parser, void* element, int index, GumboVector* vector);
|
51
|
+
|
52
|
+
// Removes an element from the vector, or does nothing if the element is not in
|
53
|
+
// the vector.
|
54
|
+
void gumbo_vector_remove(
|
55
|
+
struct _GumboParser* parser, void* element, GumboVector* vector);
|
56
|
+
|
57
|
+
// Removes and returns an element at a specific index. Note that this is
|
58
|
+
// potentially O(N) time and should be used sparingly.
|
59
|
+
void* gumbo_vector_remove_at(
|
60
|
+
struct _GumboParser* parser, int index, GumboVector* vector);
|
61
|
+
|
62
|
+
#ifdef __cplusplus
|
63
|
+
}
|
64
|
+
#endif
|
65
|
+
|
66
|
+
#endif // GUMBO_VECTOR_H_
|
data/work/vector.o
ADDED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.5.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -37,9 +37,49 @@ extra_rdoc_files: []
|
|
37
37
|
files:
|
38
38
|
- lib/nokogumbo.rb
|
39
39
|
- LICENSE.txt
|
40
|
-
- Rakefile
|
41
40
|
- README.md
|
42
41
|
- work/extconf.rb
|
42
|
+
- work/utf8.h
|
43
|
+
- work/string_piece.o
|
44
|
+
- work/string_buffer.o
|
45
|
+
- work/nokogumbo.c
|
46
|
+
- work/token_type.h
|
47
|
+
- work/util.h
|
48
|
+
- work/nokogumbo.o
|
49
|
+
- work/parser.o
|
50
|
+
- work/Makefile
|
51
|
+
- work/utf8.o
|
52
|
+
- work/vector.c
|
53
|
+
- work/string_buffer.c
|
54
|
+
- work/tokenizer_states.h
|
55
|
+
- work/error.h
|
56
|
+
- work/parser.h
|
57
|
+
- work/error.c
|
58
|
+
- work/tokenizer.h
|
59
|
+
- work/nokogumboc.so
|
60
|
+
- work/string_buffer.h
|
61
|
+
- work/vector.o
|
62
|
+
- work/vector.h
|
63
|
+
- work/tag.o
|
64
|
+
- work/tokenizer.o
|
65
|
+
- work/string_piece.h
|
66
|
+
- work/attribute.c
|
67
|
+
- work/mkmf.log
|
68
|
+
- work/char_ref.c
|
69
|
+
- work/string_piece.c
|
70
|
+
- work/error.o
|
71
|
+
- work/gumbo.h
|
72
|
+
- work/tag.c
|
73
|
+
- work/util.c
|
74
|
+
- work/parser.c
|
75
|
+
- work/utf8.c
|
76
|
+
- work/attribute.h
|
77
|
+
- work/char_ref.h
|
78
|
+
- work/char_ref.o
|
79
|
+
- work/insertion_mode.h
|
80
|
+
- work/tokenizer.c
|
81
|
+
- work/util.o
|
82
|
+
- work/attribute.o
|
43
83
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
44
84
|
licenses:
|
45
85
|
- Apache 2.0
|
data/Rakefile
DELETED
@@ -1,68 +0,0 @@
|
|
1
|
-
require 'rubygems/package_task'
|
2
|
-
require 'rake/clean'
|
3
|
-
|
4
|
-
task 'default' => 'test'
|
5
|
-
|
6
|
-
file 'gumbo-parser' do
|
7
|
-
sh 'git clone https://github.com/google/gumbo-parser.git'
|
8
|
-
end
|
9
|
-
|
10
|
-
file 'work/extconf.rb' => ['ext/extconf.rb', 'gumbo-parser'] do
|
11
|
-
mkdir_p 'work'
|
12
|
-
rm_f 'work/Makefile'
|
13
|
-
cp Dir['gumbo-parser/src/*'], 'work', :preserve => true
|
14
|
-
cp Dir['ext/*'], 'work'
|
15
|
-
end
|
16
|
-
|
17
|
-
file 'work/Makefile' => 'work/extconf.rb' do
|
18
|
-
Dir.chdir 'work' do
|
19
|
-
ruby 'extconf.rb'
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
file 'work/nokogumbo.c' => 'ext/nokogumbo.c' do
|
24
|
-
cp 'ext/nokogumbo.c', 'work/nokogumbo.c'
|
25
|
-
end
|
26
|
-
|
27
|
-
task 'compile' => ['work/Makefile', 'work/nokogumbo.c'] do
|
28
|
-
Dir.chdir 'work' do
|
29
|
-
sh 'make -s'
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
task 'test' => 'compile' do
|
34
|
-
ruby 'test-nokogumbo.rb'
|
35
|
-
end
|
36
|
-
|
37
|
-
CLEAN.include 'pkg', 'gumbo-parser', 'work'
|
38
|
-
|
39
|
-
SPEC = Gem::Specification.new do |gem|
|
40
|
-
gem.name = 'nokogumbo'
|
41
|
-
gem.version = '0.5'
|
42
|
-
gem.email = 'rubys@intertwingly.net'
|
43
|
-
gem.homepage = 'https://github.com/rubys/nokogumbo/#readme'
|
44
|
-
gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
|
45
|
-
gem.extensions = 'work/extconf.rb'
|
46
|
-
gem.author = 'Sam Ruby'
|
47
|
-
gem.add_dependency 'nokogiri'
|
48
|
-
gem.license = 'Apache 2.0'
|
49
|
-
gem.description = %q(
|
50
|
-
Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
51
|
-
access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ')
|
52
|
-
gem.files = FileList[
|
53
|
-
'lib/nokogumbo.rb',
|
54
|
-
'LICENSE.txt',
|
55
|
-
'Rakefile',
|
56
|
-
'README.md'
|
57
|
-
]
|
58
|
-
end
|
59
|
-
|
60
|
-
task 'package_workfiles' => 'work/extconf.rb' do
|
61
|
-
PKG.package_files += FileList['work/*.rb', 'work/*.c', 'work/*.h']
|
62
|
-
end
|
63
|
-
|
64
|
-
task 'gem' => ['test', 'package_workfiles']
|
65
|
-
PKG = Gem::PackageTask.new(SPEC) do |pkg|
|
66
|
-
pkg.need_tar = true
|
67
|
-
pkg.need_zip = true
|
68
|
-
end
|