feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,36 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
module HTML5
  module Filters
    # Token-stream filter that collapses whitespace to a single space, except
    # while inside an element listed in SPACE_PRESERVE_ELEMENTS.
    #
    # The filter iterates the wrapped token source (obtained via __getobj__,
    # presumably supplied by the delegating Base class -- confirm in
    # html5/filters/base) and yields every token, rewriting token[:data] in
    # place for whitespace-bearing tokens.
    class WhitespaceFilter < Base

      # Element names inside which whitespace is passed through untouched.
      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS

      # Matches one or more consecutive characters from SPACE_CHARACTERS.
      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m

      # Yields each token of the wrapped stream after whitespace normalisation.
      #
      # `preserve` counts how deep we are below a whitespace-preserving
      # element: it starts counting at the first such start tag, is bumped by
      # every nested start tag, and is decremented by every end tag until it
      # returns to zero.
      def each
        preserve = 0
        __getobj__.each do |token|
          case token[:type]
          when :StartTag
            if preserve > 0 || SPACE_PRESERVE_ELEMENTS.include?(token[:name])
              preserve += 1
            end

          when :EndTag
            preserve -= 1 if preserve > 0

          when :SpaceCharacters
            # A pure-whitespace token becomes exactly one space.
            token[:data] = " " if preserve == 0 && token[:data]

          when :Characters
            # gsub (not sub): every whitespace run in the text must be
            # collapsed, not just the first one.
            token[:data] = token[:data].gsub(SPACES, ' ') if preserve == 0
          end

          yield token
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,248 @@
1
+ require 'html5/constants'
2
+ require 'html5/tokenizer'
3
+ require 'html5/treebuilders/rexml'
4
+
5
+ Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
6
+ require 'html5/html5parser/' + File.basename(path)
7
+ end
8
+
9
module HTML5

  # Raised by HTMLParser#parse_error when the parser was created with
  # :strict => true.
  class ParseError < Exception; end

  # Declared here for use by the parser; not raised by the code in this file.
  class AssertionError < Exception; end

  # HTML parser. Generates a tree structure from a stream of (possibly
  # malformed) HTML.
  #
  # Tokens produced by the tokenizer are normalised and dispatched to the
  # current phase object (@phase); the phases drive the tree builder (@tree).
  class HTMLParser

    attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table

    attr_reader :phases, :tokenizer, :tree, :errors

    # Parses a complete document and returns the tree builder's document.
    #
    # options - passed on to #initialize, except :encoding, which is removed
    #           from the hash (the caller's hash is modified) and handed to
    #           #parse.
    def self.parse(stream, options = {})
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Parses a fragment and returns the tree builder's fragment.
    #
    # options - passed on to #initialize, except :container (default 'div')
    #           and :encoding, which are removed from the hash and handed to
    #           #parse_fragment.
    def self.parse_fragment(stream, options = {})
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parse_fragment(stream, container, encoding)
    end

    # Phase names; each maps to the class HTML5::<Name>Phase with the first
    # letter upper-cased (e.g. 'inBody' => HTML5::InBodyPhase).
    @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    #         returned. Built in treebuilders can be accessed through
    #         HTML5::TreeBuilders[treeType]
    #
    # Every option is stored in the instance variable of the same name, so
    # :tokenizer, :lowercase_attr_name and :lowercase_element_name can be
    # supplied the same way.
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # Default these to nil only when the caller did not supply them.
      # instance_variable_defined? is used because instance_variables returns
      # Symbols on Ruby >= 1.9, so an include?("@name") String test never
      # matches and would clobber the caller's value.
      @lowercase_attr_name = nil unless instance_variable_defined?(:@lowercase_attr_name)
      @lowercase_element_name = nil unless instance_variable_defined?(:@lowercase_element_name)

      @tree = @tree.new

      @phases = @@phases.inject({}) do |phases, phase_name|
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Shared driver for #parse and #parse_fragment.
    #
    # stream     - input handed to the tokenizer
    # inner_html - true when parsing a fragment
    # encoding   - encoding handed to the tokenizer
    # container  - element name providing the fragment context
    def _parse(stream, inner_html, encoding, container = 'div')
      @tree.reset
      @first_start_tag = false
      @errors = []

      # After a previous run @tokenizer holds an instance; go back to its
      # class so a fresh tokenizer can be created for this stream.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream, :encoding => encoding,
        :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)

      if inner_html
        case @inner_html = container.downcase
        when 'title', 'textarea'
          @tokenizer.content_model_flag = :RCDATA
        when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
          @tokenizer.content_model_flag = :CDATA
        when 'plaintext'
          @tokenizer.content_model_flag = :PLAINTEXT
        else
          # content_model_flag already is PCDATA
          @tokenizer.content_model_flag = :PCDATA
        end

        @phase = @phases[:rootElement]
        @phase.insert_html_element
        reset_insertion_mode
      else
        @inner_html = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @last_phase = nil

      # XXX This is temporary for the moment so there isn't any other
      # changes needed for the parser to work with the iterable tokenizer
      @tokenizer.each do |token|
        token = normalize_token(token)

        # Name of the phase method handling this token type, e.g.
        # 'processStartTag'.
        handler = 'process%s' % token[:type]

        case token[:type]
        when :Characters, :SpaceCharacters, :Comment
          @phase.send handler, token[:data]
        when :StartTag
          @phase.send handler, token[:name], token[:data]
        when :EndTag
          @phase.send handler, token[:name]
        when :Doctype
          @phase.send handler, token[:name], token[:publicId],
            token[:systemId], token[:correct]
        else
          # Any other token type is reported as a parse error.
          parse_error(token[:data], token[:datavars])
        end
      end

      # When the loop finishes it's EOF
      @phase.process_eof
    end

    # Parse a HTML document into a well-formed tree
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse(stream, encoding = nil)
      _parse(stream, false, encoding)
      @tree.get_document
    end

    # Parse a HTML fragment into a well-formed tree fragment
    #
    # container - name of the element we're setting the inner_html property
    #             on; defaults to 'div'
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse_fragment(stream, container = 'div', encoding = nil)
      _parse(stream, true, encoding, container)
      @tree.get_fragment
    end

    # Records a parse error as [stream position, code, data] in #errors and
    # raises ParseError when the parser is strict.
    def parse_error(code = 'XXX-undefined-error', data = {})
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, code, data])
      raise ParseError if @strict
    end

    # HTML5 specific normalizations to the token stream. Modifies and returns
    # the given token.
    def normalize_token(token)

      if token[:type] == :EmptyTag
        # A self-closing solidus (/) is only accepted on void elements; on
        # any other element it is reported as a parse error. Either way the
        # token is then treated as an ordinary start tag.
        unless VOID_ELEMENTS.include?(token[:name])
          parse_error("incorrectly-placed-solidus")
        end

        token[:type] = :StartTag
      end

      if token[:type] == :StartTag
        token[:name] = token[:name].downcase

        # We need to remove the duplicate attributes and convert attributes
        # to a hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
        # Reversing first makes the earliest occurrence win.
        unless token[:data].empty?
          data = token[:data].reverse.map { |attr, value| [attr.downcase, value] }
          token[:data] = Hash[*data.flatten]
        end

      elsif token[:type] == :EndTag
        parse_error("attributes-in-end-tag") unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      token
    end

    # Element name => phase to switch to in #reset_insertion_mode.
    @@new_modes = {
      'select' => :inSelect,
      'td' => :inCell,
      'th' => :inCell,
      'tr' => :inRow,
      'tbody' => :inTableBody,
      'thead' => :inTableBody,
      'tfoot' => :inTableBody,
      'caption' => :inCaption,
      'colgroup' => :inColumnGroup,
      'table' => :inTable,
      'head' => :inBody,
      'body' => :inBody,
      'frameset' => :inFrameset
    }

    # Picks the phase matching the current stack of open elements, walking
    # from the most recently opened element towards the root.
    def reset_insertion_mode
      # The name of this method is mostly historical. (It's also used in the
      # specification.)
      last = false

      @tree.open_elements.reverse.each do |node|
        node_name = node.name

        if node == @tree.open_elements.first
          last = true
          unless ['td', 'th'].include?(node_name)
            # XXX
            # assert @inner_html
            # For the root of the stack the fragment's container name is
            # used in place of the node's own name.
            node_name = @inner_html
          end
        end

        # XXX 'select', 'colgroup', 'head' and 'frameset' should only be
        # seen here in the inner_html case (assert @inner_html).

        if @@new_modes.has_key?(node_name)
          @phase = @phases[@@new_modes[node_name]]
        elsif node_name == 'html'
          @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
        elsif last
          @phase = @phases[:inBody]
        else
          next
        end

        break
      end
    end

    # Identity helper; returns the string unchanged.
    def _(string); string; end
  end

end
@@ -0,0 +1,46 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase active once the body has been closed. Comments are attached to the
  # root element; any other content is reported as a parse error and handed
  # back to the inBody phase.
  class AfterBodyPhase < Phase

    handle_end 'html'

    # Inserts the comment under the first open element rather than under
    # whatever is currently open, because the data is to be appended to the
    # <html> element here.
    def processComment(data)
      root = @tree.open_elements.first
      @tree.insert_comment(data, root)
    end

    # Character data after the body: report it, then reprocess in inBody.
    def processCharacters(data)
      parse_error("unexpected-char-after-body")
      in_body = @parser.phases[:inBody]
      @parser.phase = in_body
      in_body.processCharacters(data)
    end

    # A start tag after the body: report it, then reprocess in inBody.
    def processStartTag(name, attributes)
      parse_error("unexpected-start-tag-after-body", {"name" => name})
      in_body = @parser.phases[:inBody]
      @parser.phase = in_body
      in_body.processStartTag(name, attributes)
    end

    # </html>: a parse error when parsing a fragment, otherwise the parser
    # moves on to the trailingEnd phase.
    def endTagHtml(name)
      return parse_error if @parser.inner_html

      # XXX: This may need to be done, not sure
      # Don't set last_phase to the current phase but to the inBody phase
      # instead. No need for extra parse errors if there's something after </html>.
      # Try "<!doctype html>X</html>X" for instance.
      @parser.last_phase = @parser.phase
      @parser.phase = @parser.phases[:trailingEnd]
    end

    # Any other end tag: report it, then reprocess in inBody.
    def endTagOther(name)
      parse_error("unexpected-end-tag-after-body", {"name" => name})
      in_body = @parser.phases[:inBody]
      @parser.phase = in_body
      in_body.processEndTag(name)
    end

  end
end
@@ -0,0 +1,33 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase active after a frameset has been closed. Apart from <noframes> and
  # </html>, the tokens handled below are only reported as parse errors.
  class AfterFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    handle_start 'html', 'noframes'

    handle_end 'html'

    # Character data is not expected here; it is reported and dropped.
    def processCharacters(data)
      parse_error("unexpected-char-after-frameset")
    end

    # <noframes> is processed by the inBody phase without switching phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported and dropped.
    def startTagOther(name, attributes)
      parse_error("unexpected-start-tag-after-frameset", {"name" => name})
    end

    # </html>: remember the current phase and move to trailingEnd.
    def endTagHtml(name)
      parser = @parser
      parser.last_phase = parser.phase
      parser.phase = parser.phases[:trailingEnd]
    end

    # Any other end tag is reported and dropped.
    def endTagOther(name)
      parse_error("unexpected-end-tag-after-frameset", {"name" => name})
    end
  end
end
@@ -0,0 +1,50 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase active after the head has been closed and before a body or
  # frameset has been opened. Content that is not handled explicitly opens an
  # implied <body> (see #anything_else) and is reprocessed by the new phase.
  class AfterHeadPhase < Phase

    handle_start 'html', 'body', 'frameset', %w[base link meta script style title] => 'FromHead'

    # End of input: imply a body, then let the new phase finish up.
    def process_eof
      anything_else
      @parser.phase.process_eof
    end

    # Character data: imply a body, then reprocess the data there.
    def processCharacters(data)
      anything_else
      @parser.phase.processCharacters(data)
    end

    # <body>: insert the element and continue in the inBody phase.
    def startTagBody(name, attributes)
      @tree.insert_element(name, attributes)
      in_body = @parser.phases[:inBody]
      @parser.phase = in_body
    end

    # <frameset>: insert the element and continue in the inFrameset phase.
    def startTagFrameset(name, attributes)
      @tree.insert_element(name, attributes)
      in_frameset = @parser.phases[:inFrameset]
      @parser.phase = in_frameset
    end

    # A head-only element after the head: report it and reprocess the tag in
    # the inHead phase.
    def startTagFromHead(name, attributes)
      parse_error("unexpected-start-tag-out-of-my-head", {"name" => name})
      in_head = @parser.phases[:inHead]
      @parser.phase = in_head
      in_head.processStartTag(name, attributes)
    end

    # Any other start tag: imply a body, then reprocess the tag there.
    def startTagOther(name, attributes)
      anything_else
      @parser.phase.processStartTag(name, attributes)
    end

    # Any end tag: imply a body, then reprocess the tag there.
    def processEndTag(name)
      anything_else
      @parser.phase.processEndTag(name)
    end

    # Inserts an implied, attribute-less <body> and switches to inBody.
    def anything_else
      @tree.insert_element('body', {})
      @parser.phase = @parser.phases[:inBody]
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require 'html5/html5parser/phase'
2
+
3
module HTML5
  # Phase active before the head element has been opened. Anything other
  # than an explicit <head> opens an implied one via #startTagHead and is
  # then reprocessed by the phase that call selects.
  class BeforeHeadPhase < Phase

    handle_start 'html', 'head'

    handle_end %w[html head body br p] => 'ImplyHead'

    # End of input: imply a head, then let the new phase finish up.
    def process_eof
      startTagHead('head', {})
      @parser.phase.process_eof
    end

    # Character data: imply a head, then reprocess the data.
    def processCharacters(data)
      startTagHead('head', {})
      @parser.phase.processCharacters(data)
    end

    # Inserts the head element, records it as the tree's head pointer and
    # switches the parser to the inHead phase.
    def startTagHead(name, attributes)
      tree = @tree
      tree.insert_element(name, attributes)
      tree.head_pointer = tree.open_elements[-1]
      @parser.phase = @parser.phases[:inHead]
    end

    # Any other start tag: imply a head, then reprocess the tag.
    def startTagOther(name, attributes)
      startTagHead('head', {})
      @parser.phase.processStartTag(name, attributes)
    end

    # </html>, </head>, </body>, </br>, </p>: imply a head, then reprocess
    # the end tag.
    def endTagImplyHead(name)
      startTagHead('head', {})
      @parser.phase.processEndTag(name)
    end

    # Any other end tag is reported and dropped.
    def endTagOther(name)
      parse_error("end-tag-after-implied-root", {"name" => name})
    end

  end
end