feedtools 0.2.26 → 0.2.27
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +232 -216
- data/db/migration.rb +2 -0
- data/db/schema.mysql.sql +2 -0
- data/db/schema.postgresql.sql +3 -1
- data/db/schema.sqlite.sql +3 -1
- data/lib/feed_tools.rb +37 -14
- data/lib/feed_tools/database_feed_cache.rb +13 -2
- data/lib/feed_tools/feed.rb +430 -104
- data/lib/feed_tools/feed_item.rb +533 -268
- data/lib/feed_tools/helpers/generic_helper.rb +1 -1
- data/lib/feed_tools/helpers/html_helper.rb +78 -116
- data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
- data/lib/feed_tools/helpers/uri_helper.rb +46 -54
- data/lib/feed_tools/monkey_patch.rb +27 -1
- data/lib/feed_tools/vendor/html5/History.txt +10 -0
- data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
- data/lib/feed_tools/vendor/html5/README +45 -0
- data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
- data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
- data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
- data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
- data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
- data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
- data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
- data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
- data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
- data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
- data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
- data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
- data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
- data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
- data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
- data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
- data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
- data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
- data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
- data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
- data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
- data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
- data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
- data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
- data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
- data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
- data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
- data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
- data/lib/feed_tools/vendor/uri.rb +781 -0
- data/lib/feed_tools/version.rb +1 -1
- data/rakefile +27 -6
- data/test/unit/atom_test.rb +298 -210
- data/test/unit/helper_test.rb +7 -12
- data/test/unit/rdf_test.rb +51 -1
- data/test/unit/rss_test.rb +13 -3
- metadata +239 -116
- data/lib/feed_tools/vendor/htree.rb +0 -97
- data/lib/feed_tools/vendor/htree/container.rb +0 -10
- data/lib/feed_tools/vendor/htree/context.rb +0 -67
- data/lib/feed_tools/vendor/htree/display.rb +0 -27
- data/lib/feed_tools/vendor/htree/doc.rb +0 -149
- data/lib/feed_tools/vendor/htree/elem.rb +0 -262
- data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
- data/lib/feed_tools/vendor/htree/equality.rb +0 -218
- data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
- data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
- data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
- data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
- data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
- data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
- data/lib/feed_tools/vendor/htree/loc.rb +0 -367
- data/lib/feed_tools/vendor/htree/modules.rb +0 -48
- data/lib/feed_tools/vendor/htree/name.rb +0 -124
- data/lib/feed_tools/vendor/htree/output.rb +0 -207
- data/lib/feed_tools/vendor/htree/parse.rb +0 -409
- data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
- data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
- data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
- data/lib/feed_tools/vendor/htree/scan.rb +0 -166
- data/lib/feed_tools/vendor/htree/tag.rb +0 -111
- data/lib/feed_tools/vendor/htree/template.rb +0 -909
- data/lib/feed_tools/vendor/htree/text.rb +0 -115
- data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
require 'html5/filters/base'
|
3
|
+
|
4
|
+
module HTML5
  module Filters
    # Token-stream filter that collapses whitespace outside of elements whose
    # whitespace is significant.
    #
    # The filter iterates the wrapped token stream (obtained via __getobj__)
    # and yields every token, rewriting the :data of whitespace-bearing tokens
    # on the way through.
    class WhitespaceFilter < Base

      # Elements inside which whitespace is left untouched.
      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
      # One or more consecutive space characters.
      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m

      # Yields each token of the wrapped stream.
      #
      # `preserve` counts how many start tags have been seen since entering a
      # whitespace-preserving element; while it is non-zero, data is passed
      # through unchanged.
      def each
        preserve = 0
        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            # Nested start tags inside a preserving element also bump the
            # counter so that the matching end tags unwind it correctly.
            if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
              preserve += 1
            end

          when :EndTag
            preserve -= 1 if preserve > 0

          when :SpaceCharacters
            token[:data] = " " if preserve == 0 && token[:data]

          when :Characters
            # gsub (not sub): every run of whitespace in the text must be
            # collapsed, not just the first one.
            token[:data] = token[:data].gsub(SPACES, ' ') if preserve == 0
          end

          yield token
        end
      end
    end
  end
end
|
36
|
+
|
@@ -0,0 +1,248 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
require 'html5/tokenizer'
|
3
|
+
require 'html5/treebuilders/rexml'
|
4
|
+
|
5
|
+
# Load every parser phase found in the html5parser/ directory that sits next
# to this file.
phase_pattern = File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')
Dir.glob(phase_pattern).each do |phase_file|
  require 'html5/html5parser/' + File.basename(phase_file)
end
|
8
|
+
|
9
|
+
module HTML5

  # Error in parsed document. Raised by HTMLParser#parse_error when the parser
  # is in strict mode.
  # NOTE(review): inherits from Exception, so a bare `rescue` does not catch it.
  class ParseError < Exception; end
  class AssertionError < Exception; end

  # HTML parser. Generates a tree structure from a stream of (possibly
  # malformed) HTML.
  class HTMLParser

    attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table

    attr_reader :phases, :tokenizer, :tree, :errors

    # Convenience wrapper: builds a parser from +options+ and parses +stream+
    # as a complete document.
    #
    # The :encoding option is forwarded to #parse; all remaining options go to
    # #initialize. The caller's hash is not modified.
    def self.parse(stream, options = {})
      options = options.dup
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Convenience wrapper: builds a parser from +options+ and parses +stream+
    # as a fragment.
    #
    # The :container (default 'div') and :encoding options are forwarded to
    # #parse_fragment; all remaining options go to #initialize. The caller's
    # hash is not modified.
    def self.parse_fragment(stream, options = {})
      options = options.dup
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parse_fragment(stream, container, encoding)
    end

    # Names of all parser phases. Each name maps to a class called
    # "<Name>Phase" in the HTML5 module (see #initialize).
    @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
    # HTML5::TreeBuilders[treeType]
    #
    # Every option is stored in the instance variable of the same name, so
    # :tokenizer, :lowercase_attr_name and :lowercase_element_name may be
    # supplied as well.
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # Default these to nil only when the caller did not supply them.
      # instance_variable_defined? is used because instance_variables returns
      # symbols on Ruby >= 1.9, so comparing against a string name never
      # matches there and would discard the caller's value.
      @lowercase_attr_name = nil unless instance_variable_defined?("@lowercase_attr_name")
      @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

      @tree = @tree.new

      # Instantiate one phase object per name, keyed by the name as a symbol.
      @phases = @@phases.inject({}) do |phases, phase_name|
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Shared implementation behind #parse and #parse_fragment.
    #
    # stream     - passed to the tokenizer unchanged
    # inner_html - true when parsing a fragment
    # encoding   - passed to the tokenizer as :encoding
    # container  - name of the context element for fragment parsing
    def _parse(stream, inner_html, encoding, container = 'div')
      @tree.reset
      @first_start_tag = false
      @errors = []

      # After a previous parse @tokenizer holds an instance; go back to its
      # class so the parser can be reused.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream, :encoding => encoding,
        :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)

      if inner_html
        # The context element decides the tokenizer's initial content model.
        case @inner_html = container.downcase
        when 'title', 'textarea'
          @tokenizer.content_model_flag = :RCDATA
        when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
          @tokenizer.content_model_flag = :CDATA
        when 'plaintext'
          @tokenizer.content_model_flag = :PLAINTEXT
        else
          # content_model_flag already is PCDATA
          @tokenizer.content_model_flag = :PCDATA
        end

        @phase = @phases[:rootElement]
        @phase.insert_html_element
        reset_insertion_mode
      else
        @inner_html = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @last_phase = nil

      # XXX This is temporary for the moment so there isn't any other
      # changes needed for the parser to work with the iterable tokenizer
      @tokenizer.each do |token|
        token = normalize_token(token)

        # Dispatch to process<Type> on the current phase.
        method = 'process%s' % token[:type]

        case token[:type]
        when :Characters, :SpaceCharacters, :Comment
          @phase.send method, token[:data]
        when :StartTag
          @phase.send method, token[:name], token[:data]
        when :EndTag
          @phase.send method, token[:name]
        when :Doctype
          @phase.send method, token[:name], token[:publicId],
            token[:systemId], token[:correct]
        else
          # Any other token type is recorded as a parse error.
          parse_error(token[:data], token[:datavars])
        end
      end

      # When the loop finishes it's EOF
      @phase.process_eof
    end

    # Parse a HTML document into a well-formed tree
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse(stream, encoding=nil)
      _parse(stream, false, encoding)
      @tree.get_document
    end

    # Parse a HTML fragment into a well-formed tree fragment
    #
    # container - name of the element we're setting the inner_html property;
    # defaults to 'div'
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse_fragment(stream, container='div', encoding=nil)
      _parse(stream, true, encoding, container)
      @tree.get_fragment
    end

    # Records a parse error as [stream position, code, data] and raises
    # ParseError when the parser is in strict mode.
    def parse_error(code = 'XXX-undefined-error', data = {})
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, code, data])
      raise ParseError if @strict
    end

    # HTML5 specific normalizations to the token stream
    def normalize_token(token)

      if token[:type] == :EmptyTag
        # A self-closing solidus (/) is only acceptable on void elements; on
        # any other element it is reported as a parse error. Either way the
        # token is then handled as an ordinary start tag.
        unless VOID_ELEMENTS.include?(token[:name])
          parse_error("incorrectly-placed-solidus")
        end

        token[:type] = :StartTag
      end

      if token[:type] == :StartTag
        token[:name] = token[:name].downcase

        # We need to remove the duplicate attributes and convert attributes
        # to a hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
        # The list is reversed first so that the earliest occurrence of a
        # name is the one that survives.
        unless token[:data].empty?
          data = token[:data].reverse.map { |attr, value| [attr.downcase, value] }
          token[:data] = Hash[*data.flatten]
        end

      elsif token[:type] == :EndTag
        parse_error("attributes-in-end-tag") unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      token
    end

    # Element name => phase to switch to in #reset_insertion_mode.
    @@new_modes = {
      'select' => :inSelect,
      'td' => :inCell,
      'th' => :inCell,
      'tr' => :inRow,
      'tbody' => :inTableBody,
      'thead' => :inTableBody,
      'tfoot' => :inTableBody,
      'caption' => :inCaption,
      'colgroup' => :inColumnGroup,
      'table' => :inTable,
      'head' => :inBody,
      'body' => :inBody,
      'frameset' => :inFrameset
    }

    # Picks the current phase from the stack of open elements, walking from
    # the most recently opened element towards the first one.
    def reset_insertion_mode
      # The name of this method is mostly historical. (It's also used in the
      # specification.)
      last = false

      @tree.open_elements.reverse.each do |node|
        node_name = node.name

        if node == @tree.open_elements.first
          last = true
          unless ['td', 'th'].include?(node_name)
            # XXX
            # assert @inner_html
            # For the first open element the fragment's container name
            # stands in for the node's own name.
            node_name = @inner_html
          end
        end

        # XXX 'select', 'colgroup', 'head' and 'frameset' should only be
        # reachable here in the inner_html case (assert @inner_html).

        if @@new_modes.has_key?(node_name)
          @phase = @phases[@@new_modes[node_name]]
        elsif node_name == 'html'
          @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
        elsif last
          @phase = @phases[:inBody]
        else
          next
        end

        break
      end
    end

    # Returns +string+ unchanged.
    # NOTE(review): presumably a placeholder for message translation - confirm.
    def _(string); string; end
  end

end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # Handler for the parser's :afterBody phase.
  #
  # Apart from comments and the </html> end tag, every token is reported as a
  # parse error and then re-processed by the :inBody phase.
  class AfterBodyPhase < Phase

    handle_end 'html'

    def processComment(data)
      # This is needed because data is to be appended to the <html> element
      # here and not to whatever is currently open.
      @tree.insert_comment(data, @tree.open_elements.first)
    end

    def processCharacters(data)
      parse_error("unexpected-char-after-body")
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      parse_error("unexpected-start-tag-after-body", {"name" => name})
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processStartTag(name, attributes)
    end

    # Handles </html>. When parsing a fragment the tag is only reported as a
    # parse error; otherwise the parser moves on to the :trailingEnd phase.
    def endTagHtml(name)
      if @parser.inner_html
        # Report a specific code rather than the placeholder default.
        parse_error("unexpected-end-tag-after-body-innerhtml")
      else
        # XXX: Open question - it may be better to store the inBody phase in
        # last_phase rather than the current phase, to avoid extra parse
        # errors when something follows </html>.
        # Try "<!doctype html>X</html>X" for instance.
        @parser.last_phase = @parser.phase
        @parser.phase = @parser.phases[:trailingEnd]
      end
    end

    def endTagOther(name)
      parse_error("unexpected-end-tag-after-body", {"name" => name})
      @parser.phase = @parser.phases[:inBody]
      @parser.phase.processEndTag(name)
    end

  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # Handler for the parser's :afterFrameset phase.
  #
  # Character data and unrecognised tags are only reported as parse errors;
  # <noframes> is handed to the :inBody phase and </html> moves the parser to
  # the :trailingEnd phase.
  class AfterFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    handle_start 'html', 'noframes'

    handle_end 'html'

    # Character data is not allowed here: record the error and drop it.
    def processCharacters(data)
      parse_error('unexpected-char-after-frameset')
    end

    # <noframes> is processed by the :inBody phase without switching phases.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      parse_error('unexpected-start-tag-after-frameset', 'name' => name)
    end

    # </html>: remember the current phase and continue in :trailingEnd.
    def endTagHtml(name)
      @parser.last_phase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      parse_error('unexpected-end-tag-after-frameset', 'name' => name)
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # Handler for the parser's :afterHead phase.
  #
  # <body> and <frameset> open their element and switch phase directly. Most
  # other tokens first insert an implied <body> (see #anything_else) and are
  # then re-processed by whichever phase is current afterwards.
  class AfterHeadPhase < Phase

    handle_start 'html', 'body', 'frameset', %w[base link meta script style title] => 'FromHead'

    # Inserts an implied <body> element and switches the parser to :inBody.
    def anything_else
      @tree.insert_element('body', {})
      @parser.phase = @parser.phases[:inBody]
    end

    # End of input: imply <body>, then let the new phase finish up.
    def process_eof
      anything_else
      @parser.phase.process_eof
    end

    # Character data: imply <body>, then re-process in the new phase.
    def processCharacters(data)
      anything_else
      @parser.phase.processCharacters(data)
    end

    # Any end tag: imply <body>, then re-process in the new phase.
    def processEndTag(name)
      anything_else
      @parser.phase.processEndTag(name)
    end

    # An explicit <body> start tag.
    def startTagBody(name, attributes)
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inBody]
    end

    # An explicit <frameset> start tag.
    def startTagFrameset(name, attributes)
      @tree.insert_element(name, attributes)
      @parser.phase = @parser.phases[:inFrameset]
    end

    # base/link/meta/script/style/title seen after the head: report the
    # error, then re-process the tag in the :inHead phase.
    def startTagFromHead(name, attributes)
      parse_error('unexpected-start-tag-out-of-my-head', 'name' => name)
      @parser.phase = @parser.phases[:inHead]
      @parser.phase.processStartTag(name, attributes)
    end

    # Any other start tag: imply <body>, then re-process in the new phase.
    def startTagOther(name, attributes)
      anything_else
      @parser.phase.processStartTag(name, attributes)
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'html5/html5parser/phase'
|
2
|
+
|
3
|
+
module HTML5
  # Handler for the parser's :beforeHead phase.
  #
  # An explicit <head> opens the head element. Most other tokens first insert
  # an implied, attribute-less <head> and are then re-processed by the phase
  # that is current afterwards.
  class BeforeHeadPhase < Phase

    handle_start 'html', 'head'

    handle_end %w[html head body br p] => 'ImplyHead'

    # Inserts the head element, records it as the tree's head pointer and
    # switches the parser to :inHead.
    def startTagHead(name, attributes)
      @tree.insert_element(name, attributes)
      @tree.head_pointer = @tree.open_elements.last
      @parser.phase = @parser.phases[:inHead]
    end

    # End of input: imply <head>, then let the new phase finish up.
    def process_eof
      startTagHead('head', {})
      @parser.phase.process_eof
    end

    # Character data: imply <head>, then re-process in the new phase.
    def processCharacters(data)
      startTagHead('head', {})
      @parser.phase.processCharacters(data)
    end

    # Any other start tag: imply <head>, then re-process in the new phase.
    def startTagOther(name, attributes)
      startTagHead('head', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # </html>, </head>, </body>, </br>, </p>: imply <head>, then re-process
    # the end tag in the new phase.
    def endTagImplyHead(name)
      startTagHead('head', {})
      @parser.phase.processEndTag(name)
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      parse_error('end-tag-after-implied-root', 'name' => name)
    end
  end
end
|