nokogiri 1.10.7 → 1.16.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +42 -0
- data/LICENSE-DEPENDENCIES.md +1632 -1022
- data/LICENSE.md +1 -1
- data/README.md +188 -96
- data/bin/nokogiri +63 -50
- data/dependencies.yml +34 -66
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +862 -421
- data/ext/nokogiri/gumbo.c +594 -0
- data/ext/nokogiri/html4_document.c +165 -0
- data/ext/nokogiri/html4_element_description.c +299 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +108 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +251 -105
- data/ext/nokogiri/nokogiri.h +222 -90
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +17 -17
- data/ext/nokogiri/xml_attribute_decl.c +22 -22
- data/ext/nokogiri/xml_cdata.c +39 -31
- data/ext/nokogiri/xml_comment.c +20 -27
- data/ext/nokogiri/xml_document.c +408 -243
- data/ext/nokogiri/xml_document_fragment.c +13 -17
- data/ext/nokogiri/xml_dtd.c +64 -58
- data/ext/nokogiri/xml_element_content.c +63 -55
- data/ext/nokogiri/xml_element_decl.c +31 -31
- data/ext/nokogiri/xml_encoding_handler.c +54 -21
- data/ext/nokogiri/xml_entity_decl.c +37 -35
- data/ext/nokogiri/xml_entity_reference.c +17 -19
- data/ext/nokogiri/xml_namespace.c +131 -61
- data/ext/nokogiri/xml_node.c +1343 -674
- data/ext/nokogiri/xml_node_set.c +246 -216
- data/ext/nokogiri/xml_processing_instruction.c +18 -20
- data/ext/nokogiri/xml_reader.c +305 -213
- data/ext/nokogiri/xml_relax_ng.c +87 -78
- data/ext/nokogiri/xml_sax_parser.c +149 -124
- data/ext/nokogiri/xml_sax_parser_context.c +149 -103
- data/ext/nokogiri/xml_sax_push_parser.c +65 -37
- data/ext/nokogiri/xml_schema.c +138 -82
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +35 -26
- data/ext/nokogiri/xml_xpath_context.c +363 -178
- data/ext/nokogiri/xslt_stylesheet.c +335 -189
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +126 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +630 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +103 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
- data/gumbo-parser/src/parser.c +4891 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +223 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +170 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3464 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +66 -0
- data/gumbo-parser/src/util.h +34 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +10 -8
- data/lib/nokogiri/css/parser.rb +397 -377
- data/lib/nokogiri/css/parser.y +250 -245
- data/lib/nokogiri/css/parser_extras.rb +54 -49
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +5 -3
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +205 -96
- data/lib/nokogiri/css.rb +56 -17
- data/lib/nokogiri/decorators/slop.rb +9 -7
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +32 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/html4/document.rb +214 -0
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
- data/lib/nokogiri/html4.rb +47 -0
- data/lib/nokogiri/html5/document.rb +168 -0
- data/lib/nokogiri/html5/document_fragment.rb +90 -0
- data/lib/nokogiri/html5/node.rb +103 -0
- data/lib/nokogiri/html5.rb +326 -0
- data/lib/nokogiri/jruby/dependencies.rb +3 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +224 -0
- data/lib/nokogiri/version.rb +3 -108
- data/lib/nokogiri/xml/attr.rb +55 -3
- data/lib/nokogiri/xml/attribute_decl.rb +6 -2
- data/lib/nokogiri/xml/builder.rb +75 -34
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +312 -127
- data/lib/nokogiri/xml/document_fragment.rb +93 -48
- data/lib/nokogiri/xml/dtd.rb +4 -2
- data/lib/nokogiri/xml/element_content.rb +12 -2
- data/lib/nokogiri/xml/element_decl.rb +6 -2
- data/lib/nokogiri/xml/entity_decl.rb +7 -3
- data/lib/nokogiri/xml/entity_reference.rb +2 -0
- data/lib/nokogiri/xml/namespace.rb +44 -0
- data/lib/nokogiri/xml/node/save_options.rb +23 -8
- data/lib/nokogiri/xml/node.rb +1096 -419
- data/lib/nokogiri/xml/node_set.rb +137 -61
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +145 -52
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +42 -30
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +4 -1
- data/lib/nokogiri/xml/reader.rb +21 -28
- data/lib/nokogiri/xml/relax_ng.rb +8 -2
- data/lib/nokogiri/xml/sax/document.rb +45 -49
- data/lib/nokogiri/xml/sax/parser.rb +39 -36
- data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
- data/lib/nokogiri/xml/sax.rb +6 -4
- data/lib/nokogiri/xml/schema.rb +19 -9
- data/lib/nokogiri/xml/searchable.rb +120 -72
- data/lib/nokogiri/xml/syntax_error.rb +7 -5
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +3 -3
- data/lib/nokogiri/xml.rb +39 -38
- data/lib/nokogiri/xslt/stylesheet.rb +3 -1
- data/lib/nokogiri/xslt.rb +101 -22
- data/lib/nokogiri.rb +59 -75
- data/lib/xsd/xmlparser/nokogiri.rb +29 -25
- data/patches/libxml2/{0004-libxml2.la-is-in-top_builddir.patch → 0003-libxml2.la-is-in-top_builddir.patch} +1 -1
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
- data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
- data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
- data/ports/archives/libxml2-2.12.3.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
- metadata +121 -291
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document.rb +0 -335
- data/lib/nokogiri/html/document_fragment.rb +0 -49
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
- /data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
- /data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -0,0 +1,126 @@
|
|
1
|
+
.PHONY: all clean check coverage
|
2
|
+
|
3
|
+
gumbo_objs := $(patsubst %.c,build/%.o,$(wildcard src/*.c))
|
4
|
+
test_objs := $(patsubst %.cc,build/%.o,$(wildcard test/*.cc))
|
5
|
+
gtest_lib := googletest/make/gtest_main.a
|
6
|
+
|
7
|
+
# make SANITIZEFLAGS='-fsanitize=undefined -fsanitize=address'
|
8
|
+
SANITIZEFLAGS :=
|
9
|
+
CPPFLAGS := -Isrc
|
10
|
+
CFLAGS := -std=c99 -Os -Wall
|
11
|
+
CXXFLAGS := -isystem googletest/include -std=c++11 -Os -Wall
|
12
|
+
LDFLAGS := -pthread
|
13
|
+
|
14
|
+
all: check
|
15
|
+
|
16
|
+
fuzzers: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan
|
17
|
+
|
18
|
+
fuzzer-normal:
|
19
|
+
./fuzzer/build.sh
|
20
|
+
|
21
|
+
fuzzer-asan:
|
22
|
+
SANITIZER=asan ./fuzzer/build.sh
|
23
|
+
|
24
|
+
fuzzer-ubsan:
|
25
|
+
SANITIZER=ubsan ./fuzzer/build.sh
|
26
|
+
|
27
|
+
fuzzer-msan:
|
28
|
+
SANITIZER=msan ./fuzzer/build.sh
|
29
|
+
|
30
|
+
# don't try to regenerate ragel or gperf files in CI, that should be a development-only action and
|
31
|
+
# the generated files should be committed to SCM
|
32
|
+
ifneq ($(CI),true)
|
33
|
+
src/foreign_attrs.c: src/foreign_attrs.gperf
|
34
|
+
gperf -m100 -n $< | ./gperf-filter.sed > $@
|
35
|
+
|
36
|
+
src/%.c: src/%.gperf
|
37
|
+
gperf -m100 $< | ./gperf-filter.sed > $@
|
38
|
+
|
39
|
+
src/%.c: src/%.rl
|
40
|
+
ragel -F1 -o $@ $<
|
41
|
+
endif
|
42
|
+
|
43
|
+
build/src:
|
44
|
+
mkdir -p $@
|
45
|
+
|
46
|
+
build/test:
|
47
|
+
mkdir -p $@
|
48
|
+
|
49
|
+
build/src/%.o: src/%.c build/src/flags | build/src
|
50
|
+
$(CC) -MMD $(CPPFLAGS) $(CFLAGS) $(SANITIZEFLAGS) -c -o $@ $<
|
51
|
+
|
52
|
+
build/test/%.o: test/%.cc build/test/flags | build/test
|
53
|
+
$(CXX) -MMD $(CPPFLAGS) $(CXXFLAGS) $(SANITIZEFLAGS) -c -o $@ $<
|
54
|
+
|
55
|
+
build/run_tests: $(gumbo_objs) $(test_objs) $(gtest_lib)
|
56
|
+
$(CXX) -o $@ $+ $(LDFLAGS) $(SANITIZEFLAGS)
|
57
|
+
|
58
|
+
check: build/run_tests
|
59
|
+
./build/run_tests
|
60
|
+
|
61
|
+
# Build an instrumented test binary, run it, and write an HTML line/branch
# coverage report to build/coverage. Requires lcov and genhtml.
#
# Steps:
#   1. Remove stale .gcda counters and .info tracefiles from earlier runs.
#   2. Rebuild build/run_tests with --coverage and optimization disabled.
#   3. Capture an --initial (all-zero) baseline; the awk filter drops
#      zero-count DA records whose line number also appears in an FN record
#      of the same SF section.
#   4. Run the tests, capture the real counters, merge them with the
#      baseline, strip googletest sources, and render the HTML report.
#
# The .gcda globs are spelled out one directory at a time: recipes run under
# /bin/sh by default, which is not guaranteed to support brace expansion, so
# 'build/{src,test}/*.gcda' could be passed to rm as a literal path and
# silently leave old counters behind.
coverage:
	$(RM) build/src/*.gcda build/test/*.gcda
	$(RM) build/*.info
	$(MAKE) CPPFLAGS='-Isrc -DNDEBUG=1' \
	  CFLAGS='-std=c99 --coverage -g -O0' \
	  CXXFLAGS='-isystem googletest/include -std=c++11 --coverage -g -O0' \
	  LDFLAGS='--coverage' \
	  build/run_tests
	lcov --no-external \
	  --initial \
	  --capture \
	  --base-directory . \
	  --directory build \
	  --output-file build/coverage-pre.info
	awk -F '[:,]' \
	  '/^SF:/ { delete defs } /^FN:/ { defs[$$2]=1 } /^DA:/ { if ($$3 == 0 && $$2 in defs) next } { print }' \
	  build/coverage-pre.info > build/coverage-initial.info
	./build/run_tests
	lcov --no-external \
	  --capture \
	  --base-directory . \
	  --directory build \
	  --rc lcov_branch_coverage=1 \
	  --output-file build/coverage-test.info
	lcov --add-tracefile build/coverage-initial.info \
	  --add-tracefile build/coverage-test.info \
	  --rc lcov_branch_coverage=1 \
	  --output-file build/coverage.info
	lcov --remove build/coverage.info '$(CURDIR)/googletest/*' \
	  --rc lcov_branch_coverage=1 \
	  --output-file build/coverage.info
	genhtml --branch-coverage \
	  --output-directory build/coverage \
	  build/coverage.info
|
95
|
+
|
96
|
+
clean:
|
97
|
+
$(RM) -r build
|
98
|
+
$(RM) -r fuzzer/build fuzzer/src-* fuzzer/gumbo_corpus
|
99
|
+
|
100
|
+
build/src/flags: | build/src
|
101
|
+
@echo 'old_CC := $(CC)' > $@
|
102
|
+
@echo 'old_CPPFLAGS := $(CPPFLAGS)' >> $@
|
103
|
+
@echo 'old_CFLAGS := $(CFLAGS)' >>$@
|
104
|
+
@echo 'old_SANITIZEFLAGS := $(SANITIZEFLAGS)' >> $@
|
105
|
+
@echo 'old_LDFLAGS := $(LDFLAGS)' >> $@
|
106
|
+
|
107
|
+
build/test/flags: | build/test
|
108
|
+
@echo 'old_CXX := $(CXX)' > $@
|
109
|
+
@echo 'old_CPPFLAGS := $(CPPFLAGS)' >> $@
|
110
|
+
@echo 'old_CXXFLAGS := $(CXXFLAGS)' >> $@
|
111
|
+
@echo 'old_SANITIZEFLAGS := $(SANITIZEFLAGS)' >> $@
|
112
|
+
@echo 'old_LDFLAGS := $(LDFLAGS)' >> $@
|
113
|
+
|
114
|
+
ifeq (,$(filter clean coverage,$(MAKECMDGOALS)))
|
115
|
+
# Ensure that the flags are up to date.
|
116
|
+
-include build/src/flags build/test/flags
|
117
|
+
ifneq ($(old_CC) | $(old_CPPFLAGS) | $(old_CFLAGS) | $(old_SANITIZEFLAGS) | $(old_LDFLAGS),$(CC) | $(CPPFLAGS) | $(CFLAGS) | $(SANITIZEFLAGS) | $(LDFLAGS))
|
118
|
+
.PHONY: build/src/flags
|
119
|
+
endif
|
120
|
+
ifneq ($(old_CXX) | $(old_CPPFLAGS) | $(old_CXXFLAGS) | $(old_SANITIZEFLAGS) | $(old_LDFLAGS),$(CXX) | $(CPPFLAGS) | $(CXXFLAGS) | $(SANITIZEFLAGS) | $(LDFLAGS))
|
121
|
+
.PHONY: build/test/flags
|
122
|
+
endif
|
123
|
+
|
124
|
+
# Include dependencies.
|
125
|
+
-include $(test_objs:.o=.d) $(gumbo_objs:.o=.d)
|
126
|
+
endif
|
data/gumbo-parser/THANKS
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
Gumbo HTML parser THANKS file
|
2
|
+
|
3
|
+
Gumbo was originally written by Jonathan Tang, but many people helped out through suggestions, question-answering, code reviews, bugfixes, and organizational support. Here is a list of these people. Help me keep it complete and free of errors.
|
4
|
+
|
5
|
+
Adam Barth
|
6
|
+
Adam Roben
|
7
|
+
Ben Noordhuis
|
8
|
+
Bowen Han
|
9
|
+
Constantinos Michael
|
10
|
+
Craig Barnes
|
11
|
+
Geoffrey Sneddon
|
12
|
+
Ian Hickson
|
13
|
+
Jack Deng
|
14
|
+
Joel Low
|
15
|
+
Jonathan Shneier
|
16
|
+
Kevin Hendricks
|
17
|
+
Mason Tang
|
18
|
+
Maxim Zakharov
|
19
|
+
Michal Zalewski
|
20
|
+
Neal Norwitz
|
21
|
+
Othar Hansson
|
22
|
+
Ryan Grove
|
23
|
+
Stefan Haustein
|
24
|
+
Steffen Meschkat
|
25
|
+
Steven Kabbes
|
26
|
+
Thiago Farina
|
27
|
+
Vicent Marti
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# this Makefile is used by ext/nokogiri/extconf.rb
|
2
|
+
# to enable a mini_portile2 recipe to build the gumbo parser
|
3
|
+
.PHONY: clean
|
4
|
+
|
5
|
+
CFLAGS += -std=c99 -Wall
|
6
|
+
|
7
|
+
# allow the ENV var to override this
|
8
|
+
RANLIB ?= ranlib
|
9
|
+
|
10
|
+
gumbo_objs := \
|
11
|
+
ascii.o \
|
12
|
+
attribute.o \
|
13
|
+
char_ref.o \
|
14
|
+
error.o \
|
15
|
+
foreign_attrs.o \
|
16
|
+
parser.o \
|
17
|
+
string_buffer.o \
|
18
|
+
string_piece.o \
|
19
|
+
svg_attrs.o \
|
20
|
+
svg_tags.o \
|
21
|
+
tag.o \
|
22
|
+
tag_lookup.o \
|
23
|
+
token_buffer.o \
|
24
|
+
tokenizer.o \
|
25
|
+
utf8.o \
|
26
|
+
util.o \
|
27
|
+
vector.o
|
28
|
+
|
29
|
+
libgumbo.a: $(gumbo_objs)
|
30
|
+
$(AR) $(ARFLAGS) $@ $(gumbo_objs)
|
31
|
+
- ($(RANLIB) $@ || true) >/dev/null 2>&1
|
32
|
+
|
33
|
+
clean:
|
34
|
+
rm -f $(gumbo_objs) libgumbo.a
|
@@ -0,0 +1,41 @@
|
|
1
|
+
libgumbo
|
2
|
+
========
|
3
|
+
|
4
|
+
This is an internal fork of the [libgumbo] library, which was copied and
|
5
|
+
later modified under the terms of the Apache 2.0 [license]. See `lua-gumbo`
|
6
|
+
commit [`0a04728`] for details of the original import.
|
7
|
+
|
8
|
+
Since importing the code, the following notable fixes and improvements
|
9
|
+
have been made:
|
10
|
+
|
11
|
+
* `91cef89`: Re-implement `adjust_foreign_attributes()` with a gperf hash
|
12
|
+
* `b11abe7`: Pass `TagSet` arrays into functions by reference instead of value
|
13
|
+
* `b73dc03`: Simplify `maybe_replace_codepoint()` function
|
14
|
+
* `d5d0bb3`: Remove special handling of `<menuitem>` tag
|
15
|
+
* `7bd5162`: Remove special handling of `<isindex>` tag
|
16
|
+
* `a5c1b0e`: Use `realloc(3)` instead of `malloc(3)` in `enlarge_vector_if_full()`
|
17
|
+
* `dcbebd7`: Use `realloc(3)` instead of `malloc(3)` in `maybe_resize_string_buffer()`
|
18
|
+
* `df15262`: Make `destroy_node()` function non-recursive
|
19
|
+
* `2df37f5`: Fix signedness of some format specifiers
|
20
|
+
* `176553e`: Add maximum element nesting limit
|
21
|
+
* `bed0f4a`: Annotate `gumbo_debug()` with `PRINTF` macro and fix warnings
|
22
|
+
* `7ffc218`: Annotate `print_message()` with `PRINTF` macro and fix warnings
|
23
|
+
* `1bd8ab5`, `9136507`, `53a1f9a`: Deduplicate some identical `TagSet` arrays
|
24
|
+
* `a7a9065`: Add some GCC/Clang function attributes
|
25
|
+
* `8d3d4e4`: Remove custom allocator support
|
26
|
+
* `8d3b006`: Fix recording of source positions for `</form>` end tags
|
27
|
+
* `1a8d763`: Replace linear search in `maybe_replace_codepoint()` with a lookup table
|
28
|
+
* `6dca79e`: Replace `strcasecmp()` and `strncasecmp()` with ascii-only equivalents
|
29
|
+
* `17ab1d2`: Fix `TAGSET_INCLUDES` macro to work properly with multiple bit flags
|
30
|
+
* `7e56d45`: Re-implement `gumbo_normalize_svg_tagname()` with a gperf hash
|
31
|
+
* `a518d35`: Replace linear array search in `adjust_svg_attributes()` with a gperf hash
|
32
|
+
* `a4a7433`: Fix duplicate `TagSet` initializer being ignored in `is_special_node()`
|
33
|
+
* `8137fcd`: Add support for `<dialog>` tag
|
34
|
+
* `4b35471`: Add missing `static` qualifiers to hide symbols that shouldn't be extern
|
35
|
+
* `df57c59`, `03101f3`, `ea62330`: Replace use of locale-dependent `ctype.h` functions
|
36
|
+
with custom, ASCII-only equivalents
|
37
|
+
|
38
|
+
|
39
|
+
[libgumbo]: https://github.com/google/gumbo-parser/tree/aa91b27b02c0c80c482e24348a457ed7c3c088e0/src
|
40
|
+
[license]: https://github.com/google/gumbo-parser/blob/aa91b27b02c0c80c482e24348a457ed7c3c088e0/COPYING
|
41
|
+
[`0a04728`]: https://gitlab.com/craigbarnes/lua-gumbo/commit/0a047282815af86f3367a7d95fefcfe5723ece48
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#include "ascii.h"
|
2
|
+
|
3
|
+
/*
 * Compare two NUL-terminated strings, ignoring the case of ASCII letters.
 *
 * Bytes are folded with gumbo_ascii_tolower() and compared as unsigned char
 * values. Returns 0 when the strings match, otherwise the difference between
 * the first pair of (folded) bytes that differ; when one string is a prefix
 * of the other, that is the difference against the terminating NUL.
 */
int gumbo_ascii_strcasecmp(const char *s1, const char *s2) {
  for (; *s1 != '\0' && *s2 != '\0'; ++s1, ++s2) {
    const int lhs = (int)(unsigned char) gumbo_ascii_tolower(*s1);
    const int rhs = (int)(unsigned char) gumbo_ascii_tolower(*s2);
    if (lhs != rhs) {
      return lhs - rhs;
    }
  }
  // At least one string has ended here, so the raw bytes decide the result.
  return (int)(unsigned char) *s1 - (int)(unsigned char) *s2;
}
|
16
|
+
|
17
|
+
/*
 * Compare at most n bytes of two NUL-terminated strings, ignoring the case
 * of ASCII letters.
 *
 * Bytes are folded with gumbo_ascii_tolower() and compared as unsigned char
 * values. Returns 0 when the first n bytes match (or both strings end
 * together before that), otherwise the difference between the first pair of
 * (folded) bytes that differ.
 */
int gumbo_ascii_strncasecmp(const char *s1, const char *s2, size_t n) {
  for (; n != 0 && *s1 != '\0' && *s2 != '\0'; --n, ++s1, ++s2) {
    const int lhs = (int)(unsigned char) gumbo_ascii_tolower(*s1);
    const int rhs = (int)(unsigned char) gumbo_ascii_tolower(*s2);
    if (lhs != rhs) {
      return lhs - rhs;
    }
  }
  if (n == 0) {
    // All n requested bytes compared equal.
    return 0;
  }
  // A string ended before n bytes were consumed; the raw bytes decide.
  return (int)(unsigned char) *s1 - (int)(unsigned char) *s2;
}
|
34
|
+
|
35
|
+
/*
 * Classification flags for every ASCII code point (0x00-0x7f), indexed by the
 * code point. Each entry is a bitwise OR of the GUMBO_ASCII_* flag values:
 * CNTRL=0x01, SPACE=0x02, DIGIT=0x04, UPPER_XDIGIT=0x08, LOWER_XDIGIT=0x10,
 * UPPER_ALPHA=0x20, LOWER_ALPHA=0x40.
 *
 * This table is generated (see the GUMBO_GEN_TABLE code in this file); do not
 * edit it by hand.
 */
const unsigned char _gumbo_ascii_table[0x80] = {
  /* 0x00-0x0f: controls; TAB, LF, FF, CR are also space */
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x03,0x03,0x01,0x03,0x03,0x01,0x01,
  /* 0x10-0x1f: controls */
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
  /* 0x20-0x2f: SPACE, then punctuation */
  0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30-0x3f: '0'-'9' (digit + both hex-digit flags), then punctuation */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40-0x4f: '@', 'A'-'F' (upper alpha + upper hex), 'G'-'O' */
  0x00,0x28,0x28,0x28,0x28,0x28,0x28,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
  /* 0x50-0x5f: 'P'-'Z', then punctuation */
  0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x00,0x00,0x00,0x00,0x00,
  /* 0x60-0x6f: '`', 'a'-'f' (lower alpha + lower hex), 'g'-'o' */
  0x00,0x50,0x50,0x50,0x50,0x50,0x50,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
  /* 0x70-0x7f: 'p'-'z', then punctuation and DEL (no flags) */
  0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x00,0x00,0x00,0x00,0x00,
};
|
45
|
+
|
46
|
+
// Table generation code.
// clang -DGUMBO_GEN_TABLE=1 ascii.c && ./a.out && rm a.out
//
// Prints the C source of _gumbo_ascii_table to stdout: one flag byte per
// ASCII code point, 16 entries per row. The flag for each class is set
// according to the ranges tested below.
#if GUMBO_GEN_TABLE
#include <stdio.h>

int main(void) {
  printf("const unsigned char _gumbo_ascii_table[0x80] = {");
  for (int c = 0; c < 0x80; ++c) {
    unsigned int x = 0;
    // https://infra.spec.whatwg.org/#ascii-code-point
    if (c <= 0x1f) {
      x |= GUMBO_ASCII_CNTRL;
    }
    if (c == 0x09 || c == 0x0a || c == 0x0c || c == 0x0d || c == 0x20) {
      x |= GUMBO_ASCII_SPACE;
    }
    if (c >= 0x30 && c <= 0x39) {
      x |= GUMBO_ASCII_DIGIT;
    }
    // Decimal digits count as both upper- and lower-case hex digits.
    if ((c >= 0x30 && c <= 0x39) || (c >= 0x41 && c <= 0x46)) {
      x |= GUMBO_ASCII_UPPER_XDIGIT;
    }
    if ((c >= 0x30 && c <= 0x39) || (c >= 0x61 && c <= 0x66)) {
      x |= GUMBO_ASCII_LOWER_XDIGIT;
    }
    if (c >= 0x41 && c <= 0x5a) {
      x |= GUMBO_ASCII_UPPER_ALPHA;
    }
    if (c >= 0x61 && c <= 0x7a) {
      x |= GUMBO_ASCII_LOWER_ALPHA;
    }
    // Start a new, indented row every 16 entries.
    printf("%s0x%02x,", (c % 16 == 0? "\n  " : ""), x);
  }
  printf("\n};\n");
  return 0;
}
#endif
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#ifndef GUMBO_ASCII_H_
|
2
|
+
#define GUMBO_ASCII_H_
|
3
|
+
|
4
|
+
#include <stddef.h>
|
5
|
+
#include "macros.h"
|
6
|
+
|
7
|
+
#ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
#endif
|
10
|
+
|
11
|
+
PURE NONNULL_ARGS
|
12
|
+
int gumbo_ascii_strcasecmp(const char *s1, const char *s2);
|
13
|
+
|
14
|
+
PURE NONNULL_ARGS
|
15
|
+
int gumbo_ascii_strncasecmp(const char *s1, const char *s2, size_t n);
|
16
|
+
|
17
|
+
// If these values change, then _gumbo_ascii_table needs to be regenerated.
|
18
|
+
#define GUMBO_ASCII_CNTRL 1
|
19
|
+
#define GUMBO_ASCII_SPACE 2
|
20
|
+
#define GUMBO_ASCII_DIGIT 4
|
21
|
+
#define GUMBO_ASCII_UPPER_XDIGIT 8
|
22
|
+
#define GUMBO_ASCII_LOWER_XDIGIT 16
|
23
|
+
#define GUMBO_ASCII_UPPER_ALPHA 32
|
24
|
+
#define GUMBO_ASCII_LOWER_ALPHA 64
|
25
|
+
#define GUMBO_ASCII_XDIGIT (GUMBO_ASCII_LOWER_XDIGIT | GUMBO_ASCII_UPPER_XDIGIT)
|
26
|
+
#define GUMBO_ASCII_ALPHA (GUMBO_ASCII_UPPER_ALPHA | GUMBO_ASCII_LOWER_ALPHA)
|
27
|
+
#define GUMBO_ASCII_ALNUM (GUMBO_ASCII_DIGIT | GUMBO_ASCII_ALPHA)
|
28
|
+
|
29
|
+
extern const unsigned char _gumbo_ascii_table[0x80];
|
30
|
+
|
31
|
+
CONST_FN
|
32
|
+
static inline int gumbo_ascii_isascii(int c) {
  // Returns 1 when c is in 0x00-0x7f, else 0. Going through unsigned makes
  // negative inputs compare as large values, so they are rejected as well.
  return (unsigned int)c <= 0x7fu;
}
|
35
|
+
|
36
|
+
// 0x00 -- 0x1F (A C0 control)
|
37
|
+
CONST_FN
|
38
|
+
static inline int gumbo_ascii_iscntrl(int c) {
|
39
|
+
return gumbo_ascii_isascii(c)
|
40
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_CNTRL);
|
41
|
+
}
|
42
|
+
|
43
|
+
// 0x09, 0x0a, 0x0c, 0x0d, 0x20
|
44
|
+
CONST_FN
|
45
|
+
static inline int gumbo_ascii_isspace(int c) {
|
46
|
+
return gumbo_ascii_isascii(c)
|
47
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_SPACE);
|
48
|
+
}
|
49
|
+
|
50
|
+
CONST_FN
|
51
|
+
static inline int gumbo_ascii_istab_or_newline(int c) {
  // Returns 1 for TAB (0x09), LF (0x0a) or CR (0x0d); 0 for anything else.
  switch (c) {
    case 0x09:
    case 0x0a:
    case 0x0d:
      return 1;
    default:
      return 0;
  }
}
|
54
|
+
|
55
|
+
|
56
|
+
CONST_FN
|
57
|
+
static inline int gumbo_ascii_isdigit(int c) {
  // Returns 1 for '0'-'9' (0x30-0x39), else 0. The unsigned subtraction wraps
  // for anything below 0x30 (including negatives), so one compare suffices.
  return ((unsigned int)c - 0x30u) <= 9u;
}
|
60
|
+
|
61
|
+
CONST_FN
|
62
|
+
static inline int gumbo_ascii_isalpha(int c) {
|
63
|
+
return gumbo_ascii_isascii(c)
|
64
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_ALPHA);
|
65
|
+
}
|
66
|
+
|
67
|
+
CONST_FN
|
68
|
+
static inline int gumbo_ascii_isxdigit(int c) {
|
69
|
+
return gumbo_ascii_isascii(c)
|
70
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_XDIGIT);
|
71
|
+
}
|
72
|
+
|
73
|
+
CONST_FN
|
74
|
+
static inline int gumbo_ascii_isupper_xdigit(int c) {
|
75
|
+
return gumbo_ascii_isascii(c)
|
76
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_UPPER_XDIGIT);
|
77
|
+
}
|
78
|
+
|
79
|
+
CONST_FN
|
80
|
+
static inline int gumbo_ascii_islower_xdigit(int c) {
|
81
|
+
return gumbo_ascii_isascii(c)
|
82
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_LOWER_XDIGIT);
|
83
|
+
}
|
84
|
+
|
85
|
+
CONST_FN
|
86
|
+
static inline int gumbo_ascii_isupper(int c) {
  // Returns 1 for the 26 upper-case ASCII letters 'A'-'Z', else 0.
  return c >= 'A' && c <= 'Z';
}
|
89
|
+
|
90
|
+
CONST_FN
|
91
|
+
static inline int gumbo_ascii_islower(int c) {
|
92
|
+
return gumbo_ascii_isascii(c)
|
93
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_LOWER_ALPHA);
|
94
|
+
}
|
95
|
+
|
96
|
+
CONST_FN
|
97
|
+
static inline int gumbo_ascii_isalnum(int c) {
|
98
|
+
return gumbo_ascii_isascii(c)
|
99
|
+
&& (_gumbo_ascii_table[c] & GUMBO_ASCII_ALNUM);
|
100
|
+
}
|
101
|
+
|
102
|
+
|
103
|
+
CONST_FN
|
104
|
+
static inline int gumbo_ascii_tolower(int c) {
|
105
|
+
if (gumbo_ascii_isupper(c)) {
|
106
|
+
return c | 32;
|
107
|
+
}
|
108
|
+
return c;
|
109
|
+
}
|
110
|
+
|
111
|
+
#ifdef __cplusplus
|
112
|
+
}
|
113
|
+
#endif
|
114
|
+
|
115
|
+
#endif // GUMBO_ASCII_H_
|
@@ -0,0 +1,42 @@
|
|
1
|
+
/*
|
2
|
+
Copyright 2018 Craig Barnes.
|
3
|
+
Copyright 2010 Google Inc.
|
4
|
+
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
you may not use this file except in compliance with the License.
|
7
|
+
You may obtain a copy of the License at
|
8
|
+
|
9
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
*/
|
17
|
+
|
18
|
+
#include <assert.h>
|
19
|
+
#include <stdlib.h>
|
20
|
+
#include <string.h>
|
21
|
+
#include "attribute.h"
|
22
|
+
#include "ascii.h"
|
23
|
+
#include "util.h"
|
24
|
+
|
25
|
+
GumboAttribute* gumbo_get_attribute (
|
26
|
+
const GumboVector* attributes,
|
27
|
+
const char* name
|
28
|
+
) {
|
29
|
+
for (unsigned int i = 0; i < attributes->length; ++i) {
|
30
|
+
GumboAttribute* attr = attributes->data[i];
|
31
|
+
if (!gumbo_ascii_strcasecmp(attr->name, name)) {
|
32
|
+
return attr;
|
33
|
+
}
|
34
|
+
}
|
35
|
+
return NULL;
|
36
|
+
}
|
37
|
+
|
38
|
+
/*
 * Release an attribute: frees its name string, its value string, and finally
 * the GumboAttribute object itself, all via gumbo_free(). The pointer must
 * not be used after this call.
 */
void gumbo_destroy_attribute(GumboAttribute* attribute) {
  // The casts drop any qualifiers on the string members so they can be
  // handed to gumbo_free().
  void* owned_name = (void*) attribute->name;
  void* owned_value = (void*) attribute->value;

  gumbo_free(owned_name);
  gumbo_free(owned_value);
  gumbo_free((void*) attribute);
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef GUMBO_ATTRIBUTE_H_
|
2
|
+
#define GUMBO_ATTRIBUTE_H_
|
3
|
+
|
4
|
+
#include "nokogiri_gumbo.h"
|
5
|
+
|
6
|
+
#ifdef __cplusplus
|
7
|
+
extern "C" {
|
8
|
+
#endif
|
9
|
+
|
10
|
+
// Release the memory used for a GumboAttribute, including the attribute itself
|
11
|
+
void gumbo_destroy_attribute(GumboAttribute* attribute);
|
12
|
+
|
13
|
+
#ifdef __cplusplus
|
14
|
+
}
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#endif // GUMBO_ATTRIBUTE_H_
|