spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,135 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/treewalkers'
5
+ require 'html5/treebuilders'
6
+
7
# Tree implementations exercised by the walker tests. Each entry maps an
# implementation name to the builder and walker registered under that name
# in HTML5::TreeBuilders and HTML5::TreeWalkers.
$tree_types_to_test = %w(simpletree rexml hpricot).inject({}) do |types, tree_name|
  types[tree_name] = {
    :builder => HTML5::TreeBuilders[tree_name],
    :walker => HTML5::TreeWalkers[tree_name]
  }
  types
end

# Announce which implementations are about to be tested.
puts "Testing tree walkers: #{$tree_types_to_test.keys.join(', ')}"
20
+
21
# Tree-walker tests.
#
# For every 'tree-construction' fixture and every entry in
# $tree_types_to_test, a test method is generated that parses the fixture
# input with that entry's builder, walks the resulting document with the
# matching walker, renders the token stream as indented text, and compares
# it with the fixture's expected document.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5::TestSupport

  # Yields each token of +tokens+ in order, except that every run of
  # adjacent :Characters / :SpaceCharacters tokens is merged into a single
  # :Characters token whose :data is the concatenation of the run.
  def concatenateCharacterTokens(tokens)
    pending = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if pending
          pending[:data] += token[:data]
        else
          pending = {:type => :Characters, :data => token[:data]}
        end
      else
        # A non-character token ends the current run: flush it first.
        if pending
          yield pending
          pending = nil
        end
        yield token
      end
    end
    yield pending if pending
  end

  # Renders a walker token stream as newline-joined text, one line per
  # element, attribute, comment, doctype or text run, indented by nesting
  # depth. Attributes are emitted sorted, and 'xmlns' is skipped.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name == 'xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # Empty tags have no matching :EndTag, so undo the indent here.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        # String#any? does not exist on Ruby 1.9+ (String is no longer
        # Enumerable), so test for a non-empty name explicitly.
        if token[:name] && !token[:name].empty?
          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
        else
          output << "#{' '*indent}<!DOCTYPE >"
        end
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      end
    end
    output.join("\n")
  end

  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')
    next if test_name == 'tests5' # TODO

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      # Drop the "| " prefix from every line after the first, then the
      # first two characters of the whole string.
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name, tree_class|

        define_method "test_#{test_name}_#{index}_#{tree_name}" do

          parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          document = parser.tree.get_document

          begin
            output = sortattrs(convertTokens(tree_class[:walker].new(document)))
            # Use a local: `expected` is captured by the generated tests of
            # every tree type, so it must not be reassigned here.
            wanted = sortattrs(expected)
            assert_equal wanted, output, [
              '', 'Input:', input,
              '', 'Expected:', wanted,
              '', 'Received:', output
            ].join("\n")
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end

  # Checks the exact token sequence each walker produces for a minimal
  # document. NOTE(review): zip only walks the length of `expected`, so
  # extra trailing tokens from a walker would go unnoticed.
  def test_all_tokens
    expected = [
      {:data => [], :type => :StartTag, :name => 'html'},
      {:data => [], :type => :StartTag, :name => 'head'},
      {:data => [], :type => :EndTag, :name => 'head'},
      {:data => [], :type => :StartTag, :name => 'body'},
      {:data => [], :type => :EndTag, :name => 'body'},
      {:data => [], :type => :EndTag, :name => 'html'}]
    $tree_types_to_test.each do |_tree_name, tree_class|
      parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
      document = parser.parse("<html></html>")
      output = tree_class[:walker].new(document)
      expected.zip(output) do |expected_token, output_token|
        assert_equal(expected_token, output_token)
      end
    end
  end

end
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
4
+
5
+ require 'html5'
6
+ require 'html5/filters/validator'
7
+
8
# Validator tests.
#
# One test method is generated per case found under the 'tests' key of each
# 'validator' fixture file. Each case supplies an 'input' document and may
# name an error code that must be absent ('fail-if') or present
# ('fail-unless') among the parser's reported errors.
# NOTE(review): JSON is not required in this file; presumably the preamble
# loads it - confirm.
class TestValidator < Test::Unit::TestCase
  # Parses test['input'] with HTMLConformanceChecker as the tokenizer and
  # asserts the case's 'fail-if' / 'fail-unless' expectations against the
  # second element of each reported error.
  def run_validator_test(test)
    parser = HTML5::HTMLParser.new(:tokenizer => HTMLConformanceChecker)
    parser.parse(test['input'])
    error_codes = parser.errors.collect { |error| error[1] }
    if test.has_key?('fail-if')
      assert !error_codes.include?(test['fail-if'])
    end
    if test.has_key?('fail-unless')
      assert error_codes.include?(test['fail-unless'])
    end
  end

  html5_test_files('validator').each do |filename|
    # Block-form File.open closes the handle once the fixture is read, and
    # unlike Kernel#open it never treats a leading "|" as a command.
    tests = File.open(filename) { |io| JSON.load(io) }
    test_name = File.basename(filename).sub(".test", "")
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index}" do
        run_validator_test(test)
      end
    end
  end
end
31
+
@@ -0,0 +1,67 @@
1
+ require 'html5/constants'
2
+
3
# Flattens a tokenizer's token stream into the plain array form built by the
# process* methods below: arrays such as ["StartTag", name, attributes] and
# the bare string "ParseError".
class TokenizerTestParser
  # tokenizer - any object whose #each yields token hashes carrying a :type
  #             key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes the tokenizer and returns the array of converted tokens.
  # Each token is dispatched to the method named "process" + token[:type];
  # a type without such a method raises NoMethodError.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  # A truthy :self_closing flag is appended as a fourth element; otherwise
  # the entry has three elements.
  def processStartTag(token)
    if token[:self_closing]
      @outputTokens.push(["StartTag", token[:name], token[:data], token[:self_closing]])
    else
      @outputTokens.push(["StartTag", token[:name], token[:data]])
    end
  end

  # Recorded as a start tag, preceded by a parse error when the element
  # name is not listed in HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include?(token[:name])
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # A non-empty :data on an end tag is recorded as a parse error before the
  # tag itself. A missing (nil) :data is treated as empty.
  def processEndTag(token)
    data = token[:data]
    processParseError(token) if data && !data.empty?
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Whitespace-only runs are reported exactly like other character data.
  alias processSpaceCharacters processCharacters

  # NOTE(review): parse dispatches on "process" + type, so it never reaches
  # this snake_case name; kept for any external caller - confirm.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
@@ -0,0 +1,38 @@
1
+ #
2
+ # This temporary test driver tracks progress on getting HTML5lib working
3
+ # on Ruby 1.9. Prereqs of Hoe, Hpricot, and UniversalDetector will be
4
+ # required to complete this.
5
+ #
6
+ # Once all the tests pass, this file should be deleted
7
+ #
8
+
9
+ require 'test/test_cli'
10
+
11
+ # requires UniversalDetector
12
+ # require 'test/test_encoding'
13
+
14
+ require 'test/test_input_stream'
15
+
16
+ require 'test/test_lxp'
17
+
18
+ require 'test/test_parser'
19
+
20
+ # warning: method redefined; discarding old test
21
+ # warning: instance variable @expanded_name not initialized
22
+ # SimpleDelegator.class
23
+ # require 'test/test_sanitizer'
24
+
25
+ require 'test/test_serializer'
26
+
27
+ require 'test/test_sniffer'
28
+
29
+ require 'test/test_stream'
30
+
31
+ # warning: shadowing outer local variable - tokens
32
+ # require 'test/test_tokenizer'
33
+
34
+ # requires hpricot
35
+ # require 'test/test_treewalkers'
36
+
37
+ # warning: instance variable @delegate_sd_obj not initialized
38
+ # require 'test/test_validator'
metadata ADDED
@@ -0,0 +1,198 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spk-html5
3
+ version: !ruby/object:Gem::Version
4
+ hash: 53
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 10
9
+ - 1
10
+ version: 0.10.1
11
+ platform: ruby
12
+ authors:
13
+ - Ryan King
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-05 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rchardet
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 9
30
+ segments:
31
+ - 1
32
+ - 3
33
+ version: "1.3"
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: rubyforge
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 7
45
+ segments:
46
+ - 2
47
+ - 0
48
+ - 4
49
+ version: 2.0.4
50
+ type: :development
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: hoe
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 19
61
+ segments:
62
+ - 2
63
+ - 6
64
+ - 2
65
+ version: 2.6.2
66
+ type: :development
67
+ version_requirements: *id003
68
+ description: A ruby implementation of the parsing algorithm in HTML5.
69
+ email: ryan@theryanking.com
70
+ executables:
71
+ - html5
72
+ extensions: []
73
+
74
+ extra_rdoc_files:
75
+ - History.txt
76
+ - Manifest.txt
77
+ files:
78
+ - History.txt
79
+ - Manifest.txt
80
+ - README
81
+ - Rakefile.rb
82
+ - bin/html5
83
+ - lib/html5.rb
84
+ - lib/html5/cli.rb
85
+ - lib/html5/constants.rb
86
+ - lib/html5/filters/base.rb
87
+ - lib/html5/filters/inject_meta_charset.rb
88
+ - lib/html5/filters/iso639codes.rb
89
+ - lib/html5/filters/optionaltags.rb
90
+ - lib/html5/filters/rfc2046.rb
91
+ - lib/html5/filters/rfc3987.rb
92
+ - lib/html5/filters/sanitizer.rb
93
+ - lib/html5/filters/validator.rb
94
+ - lib/html5/filters/whitespace.rb
95
+ - lib/html5/html5parser.rb
96
+ - lib/html5/html5parser/after_after_body_phase.rb
97
+ - lib/html5/html5parser/after_after_frameset_phase.rb
98
+ - lib/html5/html5parser/after_body_phase.rb
99
+ - lib/html5/html5parser/after_frameset_phase.rb
100
+ - lib/html5/html5parser/after_head_phase.rb
101
+ - lib/html5/html5parser/before_head_phase.rb
102
+ - lib/html5/html5parser/before_html_phase.rb
103
+ - lib/html5/html5parser/in_body_phase.rb
104
+ - lib/html5/html5parser/in_caption_phase.rb
105
+ - lib/html5/html5parser/in_cell_phase.rb
106
+ - lib/html5/html5parser/in_column_group_phase.rb
107
+ - lib/html5/html5parser/in_foreign_content_phase.rb
108
+ - lib/html5/html5parser/in_frameset_phase.rb
109
+ - lib/html5/html5parser/in_head_phase.rb
110
+ - lib/html5/html5parser/in_row_phase.rb
111
+ - lib/html5/html5parser/in_select_phase.rb
112
+ - lib/html5/html5parser/in_select_table_phase.rb
113
+ - lib/html5/html5parser/in_table_body_phase.rb
114
+ - lib/html5/html5parser/in_table_phase.rb
115
+ - lib/html5/html5parser/initial_phase.rb
116
+ - lib/html5/html5parser/phase.rb
117
+ - lib/html5/inputstream.rb
118
+ - lib/html5/liberalxmlparser.rb
119
+ - lib/html5/sanitizer.rb
120
+ - lib/html5/serializer.rb
121
+ - lib/html5/serializer/htmlserializer.rb
122
+ - lib/html5/serializer/xhtmlserializer.rb
123
+ - lib/html5/sniffer.rb
124
+ - lib/html5/tokenizer.rb
125
+ - lib/html5/treebuilders.rb
126
+ - lib/html5/treebuilders/base.rb
127
+ - lib/html5/treebuilders/hpricot.rb
128
+ - lib/html5/treebuilders/rexml.rb
129
+ - lib/html5/treebuilders/simpletree.rb
130
+ - lib/html5/treewalkers.rb
131
+ - lib/html5/treewalkers/base.rb
132
+ - lib/html5/treewalkers/hpricot.rb
133
+ - lib/html5/treewalkers/rexml.rb
134
+ - lib/html5/treewalkers/simpletree.rb
135
+ - lib/html5/version.rb
136
+ - test/preamble.rb
137
+ - test/test_cli.rb
138
+ - test/test_encoding.rb
139
+ - test/test_input_stream.rb
140
+ - test/test_lxp.rb
141
+ - test/test_parser.rb
142
+ - test/test_sanitizer.rb
143
+ - test/test_serializer.rb
144
+ - test/test_sniffer.rb
145
+ - test/test_stream.rb
146
+ - test/test_tokenizer.rb
147
+ - test/test_treewalkers.rb
148
+ - test/test_validator.rb
149
+ - test/tokenizer_test_parser.rb
150
+ - test19.rb
151
+ has_rdoc: true
152
+ homepage: http://code.google.com/p/html5lib
153
+ licenses: []
154
+
155
+ post_install_message:
156
+ rdoc_options:
157
+ - --main
158
+ - README.txt
159
+ require_paths:
160
+ - lib
161
+ required_ruby_version: !ruby/object:Gem::Requirement
162
+ none: false
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ hash: 3
167
+ segments:
168
+ - 0
169
+ version: "0"
170
+ required_rubygems_version: !ruby/object:Gem::Requirement
171
+ none: false
172
+ requirements:
173
+ - - ">="
174
+ - !ruby/object:Gem::Version
175
+ hash: 3
176
+ segments:
177
+ - 0
178
+ version: "0"
179
+ requirements: []
180
+
181
+ rubyforge_project: spk-html5
182
+ rubygems_version: 1.3.7
183
+ signing_key:
184
+ specification_version: 3
185
+ summary: HTML5 parser/tokenizer.
186
+ test_files:
187
+ - test/test_sniffer.rb
188
+ - test/test_treewalkers.rb
189
+ - test/test_input_stream.rb
190
+ - test/test_stream.rb
191
+ - test/test_encoding.rb
192
+ - test/test_serializer.rb
193
+ - test/test_validator.rb
194
+ - test/test_tokenizer.rb
195
+ - test/test_sanitizer.rb
196
+ - test/test_parser.rb
197
+ - test/test_cli.rb
198
+ - test/test_lxp.rb