spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,135 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/treewalkers'
5
+ require 'html5/treebuilders'
6
+
# Registry of the tree implementations exercised by this file. Each entry maps
# a tree name to the builder used to parse a document and the walker used to
# traverse the result, both looked up by that same name.
$tree_types_to_test = {}
%w(simpletree rexml hpricot).each do |tree_name|
  $tree_types_to_test[tree_name] = {
    :builder => HTML5::TreeBuilders[tree_name],
    :walker  => HTML5::TreeWalkers[tree_name]
  }
end

# Announce which implementations are about to be tested.
puts "Testing tree walkers: #{$tree_types_to_test.keys.join(', ')}"
20
+
21
+ class TestTreeWalkers < Test::Unit::TestCase
22
+ include HTML5::TestSupport
23
+
# Yields each token from +tokens+, except that every run of consecutive
# :Characters / :SpaceCharacters tokens is collapsed into a single
# :Characters token whose :data is the concatenation of the run.
#
# The merged token is a fresh hash and its data is built with +=, so the
# tokens passed in are never modified.
def concatenateCharacterTokens(tokens)
  pending = nil
  tokens.each do |token|
    if token[:type] == :Characters || token[:type] == :SpaceCharacters
      if pending
        pending[:data] += token[:data]
      else
        pending = {:type => :Characters, :data => token[:data]}
      end
      next
    end

    # A non-character token ends the current run: flush it first.
    if pending
      yield pending
      pending = nil
    end
    yield token
  end
  # Flush a run that reaches the end of the stream.
  yield pending if pending
end
44
+
# Renders a token stream as an indented, line-per-node text dump and returns
# it as a single newline-joined string.
#
# Adjacent character tokens are merged first (via concatenateCharacterTokens).
# Start/empty tags print as "<name>" followed by their attributes, sorted and
# indented two further spaces; the "xmlns" attribute is skipped. End tags
# print nothing and only reduce the indentation.
def convertTokens(tokens)
  output = []
  indent = 0
  concatenateCharacterTokens(tokens) do |token|
    case token[:type]
    when :StartTag, :EmptyTag
      output << "#{' ' * indent}<#{token[:name]}>"
      indent += 2
      token[:data].to_a.sort.each do |name, value|
        next if name == 'xmlns'
        output << "#{' ' * indent}#{name}=\"#{value}\""
      end
      # An empty tag has no children, so undo the indent immediately.
      indent -= 2 if token[:type] == :EmptyTag
    when :EndTag
      indent -= 2
    when :Comment
      output << "#{' ' * indent}<!-- #{token[:data]} -->"
    when :Doctype
      doctype_name = token[:name]
      # String#any? only exists on Ruby 1.8 (String stopped being Enumerable
      # in 1.9), so test for emptiness explicitly.
      if doctype_name && !doctype_name.empty?
        output << "#{' ' * indent}<!DOCTYPE #{doctype_name}>"
      else
        output << "#{' ' * indent}<!DOCTYPE >"
      end
    when :Characters, :SpaceCharacters
      output << "#{' ' * indent}\"#{token[:data]}\""
    end
  end
  output.join("\n")
end
74
+
# Defines one test method per (data file, test case, tree implementation).
# Each generated test parses the case's input, walks the resulting tree and
# compares the text dump with the expected document from the data file.
html5_test_files('tree-construction').each do |test_file|
  test_name = File.basename(test_file).sub('.dat', '')
  next if test_name == 'tests5' # TODO

  sections = %w(data errors document-fragment document)
  TestData.new(test_file, sections).each_with_index do |(input, errors, inner_html, expected), index|
    # Strip the "| " prefix that follows each newline, then the first two
    # characters of the whole string.
    expected = expected.gsub("\n| ", "\n")[2..-1]

    $tree_types_to_test.each do |tree_name, tree_class|
      define_method "test_#{test_name}_#{index}_#{tree_name}" do
        parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

        # A "document-fragment" section selects fragment parsing.
        if inner_html
          parser.parse_fragment(input, inner_html)
        else
          parser.parse(input)
        end

        document = parser.tree.get_document

        begin
          walker = tree_class[:walker].new(document)
          output = sortattrs(convertTokens(walker))
          # NOTE(review): this reassigns the captured `expected`, which is
          # shared by the tests generated for every tree type of this case;
          # presumably sortattrs is idempotent -- confirm.
          expected = sortattrs(expected)
          message = "\nInput:\n#{input}\n\nExpected:\n#{expected}\n\nReceived:\n#{output}"
          assert_equal expected, output, message
        rescue NotImplementedError
          # Amnesty for those that confess...
        end
      end
    end
  end
end
114
+
# Parses a minimal document with every registered tree implementation and
# checks that the walker emits the expected start/end tag tokens for html,
# head and body, in order.
def test_all_tokens
  start_tag = lambda { |name| {:data => [], :type => :StartTag, :name => name} }
  end_tag   = lambda { |name| {:data => [], :type => :EndTag, :name => name} }
  expected = [
    start_tag.call('html'),
    start_tag.call('head'),
    end_tag.call('head'),
    start_tag.call('body'),
    end_tag.call('body'),
    end_tag.call('html')
  ]

  $tree_types_to_test.each_value do |tree_class|
    parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
    document = parser.parse("<html></html>")
    walker = tree_class[:walker].new(document)
    # Compare pairwise; a walker that stops early is paired with nil and fails.
    expected.zip(walker) do |expected_token, output_token|
      assert_equal(expected_token, output_token)
    end
  end
end
133
+
134
+
135
+ end
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
4
+
5
+ require 'html5'
6
+ require 'html5/filters/validator'
7
+
# Runs the JSON-described validator cases: each case is parsed with the
# conformance checker installed as the tokenizer, and the collected error
# codes are checked against the case's 'fail-if' / 'fail-unless' entries.
class TestValidator < Test::Unit::TestCase
  # Parses test['input'] and asserts on the resulting error codes.
  #
  # test - Hash with an 'input' key and optional 'fail-if' / 'fail-unless'
  #        keys naming an error code that must be absent / present.
  def run_validator_test(test)
    parser = HTML5::HTMLParser.new(:tokenizer => HTMLConformanceChecker)
    parser.parse(test['input'])
    # Presumably each error is a tuple whose second element is the error
    # code -- confirm against the parser.
    error_codes = parser.errors.collect { |e| e[1] }
    if test.has_key?('fail-if')
      assert !error_codes.include?(test['fail-if']),
             "unexpected error code #{test['fail-if'].inspect}"
    end
    if test.has_key?('fail-unless')
      assert error_codes.include?(test['fail-unless']),
             "missing error code #{test['fail-unless'].inspect}"
    end
  end

  # Define one test method per case in every validator data file.
  html5_test_files('validator').each do |filename|
    # Block form closes the file handle; a bare open(filename) left it open.
    tests = File.open(filename) { |io| JSON.load(io) }
    test_name = File.basename(filename).sub(".test", "")
    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index}" do
        run_validator_test(test)
      end
    end
  end
end
31
+
@@ -0,0 +1,67 @@
1
+ require 'html5/constants'
2
+
# Flattens a tokenizer's token stream into a plain array of entries, one per
# token: arrays such as ["StartTag", name, data] for ordinary tokens and the
# bare string "ParseError" for errors.
#
# Each token is dispatched to the method named "process" + its :type, so the
# handler methods below must stay public and keep their names.
class TokenizerTestParser
  # tokenizer - any object whose #each yields token hashes with a :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes the whole token stream and returns the accumulated entries.
  def parse
    @outputTokens = []
    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end
    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
                        token[:systemId], token[:correct]])
  end

  # The self-closing flag is appended only when it is set.
  def processStartTag(token)
    entry = ["StartTag", token[:name], token[:data]]
    entry.push(token[:self_closing]) if token[:self_closing]
    @outputTokens.push(entry)
  end

  # An empty tag on an element that is not in HTML5::VOID_ELEMENTS is
  # recorded as a parse error before the start tag itself.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include?(token[:name])
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag that carries data is recorded as a parse error first.
  def processEndTag(token)
    processParseError(token) if token[:data].length > 0
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Character and space-character tokens produce the same entry.
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  # Intentionally a no-op.
  # NOTE(review): parse dispatches on "process" + type, so this method is only
  # reachable if a token's type stringifies to "_eof" -- confirm whether it is
  # still used.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
@@ -0,0 +1,38 @@
1
+ #
2
+ # This temporary test driver tracks progress on getting HTML5lib working
3
+ # on Ruby 1.9. Prereqs of Hoe, Hpricot, and UniversalDetector will be
4
+ # required to complete this.
5
+ #
6
+ # Once all the tests pass, this file should be deleted
7
+ #
8
+
9
+ require 'test/test_cli'
10
+
11
+ # requires UniversalDetector
12
+ # require 'test/test_encoding'
13
+
14
+ require 'test/test_input_stream'
15
+
16
+ require 'test/test_lxp'
17
+
18
+ require 'test/test_parser'
19
+
20
+ # warning: method redefined; discarding old test
21
+ # warning: instance variable @expanded_name not initialized
22
+ # SimpleDelegator.class
23
+ # require 'test/test_sanitizer'
24
+
25
+ require 'test/test_serializer'
26
+
27
+ require 'test/test_sniffer'
28
+
29
+ require 'test/test_stream'
30
+
31
+ # warning: shadowing outer local variable - tokens
32
+ # require 'test/test_tokenizer'
33
+
34
+ # requires hpricot
35
+ # require 'test/test_treewalkers'
36
+
37
+ # warning: instance variable @delegate_sd_obj not initialized
38
+ # require 'test/test_validator'
metadata ADDED
@@ -0,0 +1,198 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spk-html5
3
+ version: !ruby/object:Gem::Version
4
+ hash: 53
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 10
9
+ - 1
10
+ version: 0.10.1
11
+ platform: ruby
12
+ authors:
13
+ - Ryan King
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-05 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rchardet
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 9
30
+ segments:
31
+ - 1
32
+ - 3
33
+ version: "1.3"
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ - !ruby/object:Gem::Dependency
37
+ name: rubyforge
38
+ prerelease: false
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 7
45
+ segments:
46
+ - 2
47
+ - 0
48
+ - 4
49
+ version: 2.0.4
50
+ type: :development
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: hoe
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 19
61
+ segments:
62
+ - 2
63
+ - 6
64
+ - 2
65
+ version: 2.6.2
66
+ type: :development
67
+ version_requirements: *id003
68
+ description: A ruby implementation of the parsing algorithm in HTML5.
69
+ email: ryan@theryanking.com
70
+ executables:
71
+ - html5
72
+ extensions: []
73
+
74
+ extra_rdoc_files:
75
+ - History.txt
76
+ - Manifest.txt
77
+ files:
78
+ - History.txt
79
+ - Manifest.txt
80
+ - README
81
+ - Rakefile.rb
82
+ - bin/html5
83
+ - lib/html5.rb
84
+ - lib/html5/cli.rb
85
+ - lib/html5/constants.rb
86
+ - lib/html5/filters/base.rb
87
+ - lib/html5/filters/inject_meta_charset.rb
88
+ - lib/html5/filters/iso639codes.rb
89
+ - lib/html5/filters/optionaltags.rb
90
+ - lib/html5/filters/rfc2046.rb
91
+ - lib/html5/filters/rfc3987.rb
92
+ - lib/html5/filters/sanitizer.rb
93
+ - lib/html5/filters/validator.rb
94
+ - lib/html5/filters/whitespace.rb
95
+ - lib/html5/html5parser.rb
96
+ - lib/html5/html5parser/after_after_body_phase.rb
97
+ - lib/html5/html5parser/after_after_frameset_phase.rb
98
+ - lib/html5/html5parser/after_body_phase.rb
99
+ - lib/html5/html5parser/after_frameset_phase.rb
100
+ - lib/html5/html5parser/after_head_phase.rb
101
+ - lib/html5/html5parser/before_head_phase.rb
102
+ - lib/html5/html5parser/before_html_phase.rb
103
+ - lib/html5/html5parser/in_body_phase.rb
104
+ - lib/html5/html5parser/in_caption_phase.rb
105
+ - lib/html5/html5parser/in_cell_phase.rb
106
+ - lib/html5/html5parser/in_column_group_phase.rb
107
+ - lib/html5/html5parser/in_foreign_content_phase.rb
108
+ - lib/html5/html5parser/in_frameset_phase.rb
109
+ - lib/html5/html5parser/in_head_phase.rb
110
+ - lib/html5/html5parser/in_row_phase.rb
111
+ - lib/html5/html5parser/in_select_phase.rb
112
+ - lib/html5/html5parser/in_select_table_phase.rb
113
+ - lib/html5/html5parser/in_table_body_phase.rb
114
+ - lib/html5/html5parser/in_table_phase.rb
115
+ - lib/html5/html5parser/initial_phase.rb
116
+ - lib/html5/html5parser/phase.rb
117
+ - lib/html5/inputstream.rb
118
+ - lib/html5/liberalxmlparser.rb
119
+ - lib/html5/sanitizer.rb
120
+ - lib/html5/serializer.rb
121
+ - lib/html5/serializer/htmlserializer.rb
122
+ - lib/html5/serializer/xhtmlserializer.rb
123
+ - lib/html5/sniffer.rb
124
+ - lib/html5/tokenizer.rb
125
+ - lib/html5/treebuilders.rb
126
+ - lib/html5/treebuilders/base.rb
127
+ - lib/html5/treebuilders/hpricot.rb
128
+ - lib/html5/treebuilders/rexml.rb
129
+ - lib/html5/treebuilders/simpletree.rb
130
+ - lib/html5/treewalkers.rb
131
+ - lib/html5/treewalkers/base.rb
132
+ - lib/html5/treewalkers/hpricot.rb
133
+ - lib/html5/treewalkers/rexml.rb
134
+ - lib/html5/treewalkers/simpletree.rb
135
+ - lib/html5/version.rb
136
+ - test/preamble.rb
137
+ - test/test_cli.rb
138
+ - test/test_encoding.rb
139
+ - test/test_input_stream.rb
140
+ - test/test_lxp.rb
141
+ - test/test_parser.rb
142
+ - test/test_sanitizer.rb
143
+ - test/test_serializer.rb
144
+ - test/test_sniffer.rb
145
+ - test/test_stream.rb
146
+ - test/test_tokenizer.rb
147
+ - test/test_treewalkers.rb
148
+ - test/test_validator.rb
149
+ - test/tokenizer_test_parser.rb
150
+ - test19.rb
151
+ has_rdoc: true
152
+ homepage: http://code.google.com/p/html5lib
153
+ licenses: []
154
+
155
+ post_install_message:
156
+ rdoc_options:
157
+ - --main
158
+ - README.txt
159
+ require_paths:
160
+ - lib
161
+ required_ruby_version: !ruby/object:Gem::Requirement
162
+ none: false
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ hash: 3
167
+ segments:
168
+ - 0
169
+ version: "0"
170
+ required_rubygems_version: !ruby/object:Gem::Requirement
171
+ none: false
172
+ requirements:
173
+ - - ">="
174
+ - !ruby/object:Gem::Version
175
+ hash: 3
176
+ segments:
177
+ - 0
178
+ version: "0"
179
+ requirements: []
180
+
181
+ rubyforge_project: spk-html5
182
+ rubygems_version: 1.3.7
183
+ signing_key:
184
+ specification_version: 3
185
+ summary: HTML5 parser/tokenizer.
186
+ test_files:
187
+ - test/test_sniffer.rb
188
+ - test/test_treewalkers.rb
189
+ - test/test_input_stream.rb
190
+ - test/test_stream.rb
191
+ - test/test_encoding.rb
192
+ - test/test_serializer.rb
193
+ - test/test_validator.rb
194
+ - test/test_tokenizer.rb
195
+ - test/test_sanitizer.rb
196
+ - test/test_parser.rb
197
+ - test/test_cli.rb
198
+ - test/test_lxp.rb