gammo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,46 @@
module Gammo
  class Parser
    # Section 12.2.6.4.17.
    #
    # Insertion mode for a <select> that is open inside a table. Table
    # structure tags close the open <select>; every other token is delegated
    # to InSelect.
    class InSelectInTable < InsertionMode
      # Handles a start tag.
      #
      # A table structure start tag closes the open <select>, resets the
      # insertion mode and halts with +false+. No in-scope check is made for
      # start tags. Any other tag returns without halting.
      #
      # @param token the start tag token being processed
      def start_tag_token(token)
        case token.tag
        when Tags::Caption, Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr, Tags::Td, Tags::Th
          pop_through_select_and_reset
        end
      end

      # Handles an end tag.
      #
      # A table structure end tag is ignored unless a matching element is in
      # table scope; otherwise it closes the open <select> exactly like the
      # start tag case. Any other tag returns without halting.
      #
      # @param token the end tag token being processed
      def end_tag_token(token)
        case token.tag
        when Tags::Caption, Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr, Tags::Td, Tags::Th
          # ignore the token
          halt true unless parser.element_in_scope?(TABLE_SCOPE, token.tag)
          pop_through_select_and_reset
        end
      end

      # Everything not handled above is processed by the InSelect rules.
      def default(_)
        halt InSelect.new(parser).process
      end

      private

      # Truncates the stack of open elements just below the nearest <select>
      # (searching from the top), resets the insertion mode and halts with
      # +false+ (presumably so the token is reprocessed in the new mode).
      # Only the tag is compared, so a <select> in any namespace matches.
      def pop_through_select_and_reset
        parser.open_elements.reverse_each_with_index do |elm, i|
          if elm.tag == Tags::Select
            parser.open_elements = parser.open_elements.slice(0, i)
            break
          end
        end
        parser.reset_insertion_mode
        halt false
      end
    end
  end
end
@@ -0,0 +1,114 @@
module Gammo
  class Parser
    # Section 12.2.6.4.9.
    #
    # "in table" insertion mode. Handlers call +halt+ to finish processing of
    # the current token; judging by the "ignore the token" call sites,
    # +halt true+ means the token is done with, and +halt false+ presumably
    # asks for it to be reprocessed in the (possibly changed) insertion mode.
    class InTable < InsertionMode
      # Strips NUL characters from the text. Whitespace-only text directly
      # inside a table structure element is added as text; anything else
      # returns without halting.
      def text_token(token)
        token.data = token.data.delete("\x00")
        case parser.open_elements.last.tag
        when Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr
          if token.data.strip.empty?
            parser.add_text token.data
            halt true
          end
        end
      end

      # Dispatches start tags that have table-specific handling. Tags without
      # a branch (and the non-halting paths of <input> and <form>) return
      # without halting.
      def start_tag_token(token)
        case token.tag
        when Tags::Caption
          parser.clear_stack_to_context(TABLE_SCOPE)
          # A marker is pushed onto the active formatting elements before the
          # caption element is added.
          parser.active_formatting_elements << Node::DEFAULT_SCOPE_MARKER
          parser.add_element
          parser.insertion_mode = InCaption
          halt true
        when Tags::Colgroup
          parser.clear_stack_to_context(TABLE_SCOPE)
          parser.add_element
          parser.insertion_mode = InColumnGroup
          halt true
        when Tags::Col
          # A bare <col> implies an enclosing <colgroup>.
          parser.parse_implied_token(Tokenizer::StartTagToken, Tags::Colgroup, Tags::Colgroup.to_s)
          halt false
        when Tags::Tbody, Tags::Tfoot, Tags::Thead
          parser.clear_stack_to_context(TABLE_SCOPE)
          parser.add_element
          parser.insertion_mode = InTableBody
          halt true
        when Tags::Td, Tags::Th, Tags::Tr
          # Rows and cells imply an enclosing <tbody>.
          parser.parse_implied_token(Tokenizer::StartTagToken, Tags::Tbody, Tags::Tbody.to_s)
          halt false
        when Tags::Table
          # A nested <table> start tag closes the current table first.
          if parser.pop_until(TABLE_SCOPE, Tags::Table)
            parser.reset_insertion_mode
            halt false
          end
          # ignore the token
          halt true
        when Tags::Style, Tags::Script, Tags::Template
          halt InHead.new(parser).process
        when Tags::Input
          token.attributes.each do |attr|
            # skip setting frameset_ok = false
            if attr.key == 'type' && attr.value.downcase == 'hidden'
              parser.add_element
              parser.open_elements.pop
              halt true
            end
          end
          # Any other <input> returns without halting.
        when Tags::Form
          # ignore the token
          halt true if parser.form || parser.open_elements.any? { |open_element| open_element.tag == Tags::Template }
          parser.add_element
          parser.form = parser.open_elements.pop
          # NOTE(review): no halt here, so the token is presumably handed to
          # #default afterwards - confirm this is intended.
        when Tags::Select
          parser.reconstruct_active_formatting_elements
          case parser.top.tag
          when Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr
            parser.foster_parenting = true
          end
          parser.add_element
          parser.foster_parenting = false
          # The HTML Standard sets the frameset-ok flag to "not ok" when a
          # <select> start tag is inserted.
          parser.frameset_ok = false
          parser.insertion_mode = InSelectInTable
          halt true
        end
      end

      # Closes the table on </table>; ignores end tags of other table
      # structure elements and of <body>/<html>; delegates </template> to
      # InHead. Other end tags return without halting.
      def end_tag_token(token)
        case token.tag
        when Tags::Table
          parser.reset_insertion_mode if parser.pop_until(TABLE_SCOPE, Tags::Table)
          # Ignore the token
          halt true
        when Tags::Body, Tags::Caption, Tags::Col, Tags::Colgroup, Tags::Html,
             Tags::Tbody, Tags::Td, Tags::Tfoot, Tags::Th, Tags::Thead, Tags::Tr
          # Ignore the token
          halt true
        when Tags::Template
          halt InHead.new(parser).process
        end
      end

      # Comments are attached via parser.add_child.
      def comment_token(token)
        parser.add_child(Node::Comment.new(data: token.data))
        halt true
      end

      def doctype_token(token)
        # Ignore the token
        halt true
      end

      # Error tokens are processed with the InBody rules. The result is
      # propagated with +halt+, like the other delegating handlers.
      def error_token(token)
        halt InBody.new(parser).process
      end

      # Anything else is processed with the InBody rules while foster
      # parenting is switched on. The flag is reset in +ensure+ so it cannot
      # stay set if processing exits abnormally.
      def default(_)
        parser.foster_parenting = true
        result =
          begin
            InBody.new(parser).process
          ensure
            parser.foster_parenting = false
          end
        halt result
      end
    end
  end
end
@@ -0,0 +1,55 @@
module Gammo
  class Parser
    # Section 12.2.6.4.13.
    #
    # "in table body" insertion mode: active while a <tbody>, <thead> or
    # <tfoot> is the current table section.
    class InTableBody < InsertionMode
      # Opens rows, implies a <tr> for stray cells, and closes the current
      # section when another table-level start tag arrives. Other start tags
      # return without halting.
      def start_tag_token(token)
        case token.tag
        when Tags::Tr
          parser.clear_stack_to_context(TABLE_BODY_SCOPE)
          parser.add_element
          parser.insertion_mode = InRow
          halt true
        when Tags::Td, Tags::Th
          # A cell outside a row implies a <tr> start tag.
          parser.parse_implied_token(Tokenizer::StartTagToken, Tags::Tr, Tags::Tr.to_s)
          halt false
        when Tags::Caption, Tags::Col, Tags::Colgroup, Tags::Tbody, Tags::Tfoot, Tags::Thead
          section_closed = parser.pop_until(TABLE_SCOPE, Tags::Tbody, Tags::Thead, Tags::Tfoot)
          if section_closed
            parser.insertion_mode = InTable
            halt false
          end
          # ignore the token
          halt true
        end
      end

      # Closes the current section on its own end tag or on </table>, and
      # ignores end tags that cannot apply here. Other end tags return
      # without halting.
      def end_tag_token(token)
        case token.tag
        when Tags::Tbody, Tags::Tfoot, Tags::Thead
          # Without a matching element in table scope the token is ignored.
          halt true unless parser.element_in_scope?(TABLE_SCOPE, token.tag)
          parser.clear_stack_to_context(TABLE_BODY_SCOPE)
          parser.open_elements.pop
          parser.insertion_mode = InTable
          halt true
        when Tags::Table
          section_closed = parser.pop_until(TABLE_SCOPE, Tags::Tbody, Tags::Thead, Tags::Tfoot)
          # ignore the token
          halt true unless section_closed
          parser.insertion_mode = InTable
          halt false
        when Tags::Body, Tags::Caption, Tags::Colgroup, Tags::Html, Tags::Td, Tags::Th, Tags::Tr
          # ignore the token
          halt true
        end
      end

      # Comments are attached via parser.add_child.
      def comment_token(token)
        comment = Node::Comment.new(data: token.data)
        parser.add_child(comment)
        halt true
      end

      # Everything else is processed with the InTable rules.
      def default(_)
        halt InTable.new(parser).process
      end
    end
  end
end
@@ -0,0 +1,80 @@
module Gammo
  class Parser
    # Section 12.2.6.4.18.
    #
    # "in template" insertion mode. Most start tags replace the top entry of
    # the template insertion-mode stack with the mode that fits the tag and
    # then halt with +false+.
    class InTemplate < InsertionMode
      # Text is processed with the InBody rules.
      def text_token(token)
        halt InBody.new(parser).process
      end

      # Comments are processed with the InBody rules.
      def comment_token(token)
        halt InBody.new(parser).process
      end

      # Doctypes are processed with the InBody rules.
      def doctype_token(token)
        halt InBody.new(parser).process
      end

      # Head-level tags are processed with the InHead rules; every other
      # start tag selects the insertion mode that matches it.
      def start_tag_token(token)
        case token.tag
        when Tags::Base, Tags::Basefont, Tags::Bgsound, Tags::Link, Tags::Meta, Tags::Noframes, Tags::Script, Tags::Style, Tags::Template, Tags::Title
          halt InHead.new(parser).process
        when Tags::Caption, Tags::Colgroup, Tags::Tbody, Tags::Tfoot, Tags::Thead
          switch_template_mode InTable
        when Tags::Col
          switch_template_mode InColumnGroup
        when Tags::Tr
          switch_template_mode InTableBody
        when Tags::Td, Tags::Th
          switch_template_mode InRow
        else
          switch_template_mode InBody
        end
      end

      # Only </template> is meaningful here; it is processed with the InHead
      # rules. Every other end tag is ignored.
      def end_tag_token(token)
        halt InHead.new(parser).process if token.tag == Tags::Template
        # ignore the token
        halt true
      end

      # On an error token with no open <template> the token is ignored.
      # Otherwise the open template is closed: the stack of open elements is
      # cut just below the nearest non-namespaced <template>, active
      # formatting elements are cleared, the template mode is popped and the
      # insertion mode is reset.
      def error_token(token)
        template_open = parser.open_elements.any? { |node| node.tag == Tags::Template }
        halt true unless template_open
        # TODO: remove this divergence from the html5 spec
        parser.generate_implied_end_tags
        parser.open_elements.reverse_each_with_index do |node, position|
          next unless !node.namespace && node.tag == Tags::Template
          parser.open_elements = parser.open_elements.slice(0, position)
          break
        end
        parser.clear_active_formatting_elements
        parser.template_stack.pop
        parser.reset_insertion_mode
        halt false
      end

      def default(_)
        halt false
      end

      private

      # Replaces the top of the template insertion-mode stack with +mode+,
      # makes it the current insertion mode and halts with +false+.
      def switch_template_mode(mode)
        stack = parser.template_stack
        stack.pop
        stack << mode
        parser.insertion_mode = mode
        halt false
      end
    end
  end
end
@@ -0,0 +1,152 @@
1
+ require 'gammo/attribute'
2
+
module Gammo
  class Parser
    # Section 12.2.6.4.1
    #
    # "initial" insertion mode: handles whatever precedes the root element
    # and decides, from the doctype, whether the document is in quirks mode.
    class Initial < InsertionMode
      # Leading whitespace is dropped; whitespace-only text is ignored.
      # Remaining text returns without halting.
      def text_token(token)
        token.data = token.data.lstrip
        # it's all whitespace so ignore it.
        halt true if token.data.empty?
      end

      # Comments at this stage are appended directly to the document.
      def comment_token(token)
        parser.document.append_child(Node::Comment.new(data: token.data))
        halt true
      end

      # Appends the doctype node, records the quirks decision and moves on
      # to BeforeHTML.
      def doctype_token(token)
        n, quirks = parse_doctype(token.data)
        parser.document.append_child(n)
        parser.quirks = quirks
        parser.insertion_mode = BeforeHTML
        halt true
      end

      # Any other token means there is no doctype: quirks mode is switched
      # on and the insertion mode becomes BeforeHTML.
      def default(_)
        parser.quirks = true
        parser.insertion_mode = BeforeHTML
        halt false
      end

      # Lower-cased public identifier prefixes that select quirks mode.
      QUIRKY_IDS = [
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//"
      ].freeze

      # Parses the text of a doctype token.
      #
      # @param s [String] doctype text: a name, optionally followed by
      #   PUBLIC/SYSTEM keywords and their quoted identifiers
      # @return [Array] a two-element array: the Node::Doctype (lower-cased
      #   name in +data+, identifiers as +public+/+system+ attributes) and
      #   +true+ when the doctype selects quirks mode
      def parse_doctype(s)
        node = Node::Doctype.new

        # The name ends at the first space, tab, CR, LF or FF.
        pos = s.index(/[ \t\r\n\f]/) || s.length
        node.data = s.slice(0, pos)
        # The comparison with "html" is case-sensitive on purpose.
        quirks = node.data != 'html'
        node.data = node.data.downcase
        s = s.slice(pos..-1).lstrip

        # Too short to start with PUBLIC or SYSTEM: any leftover is junk.
        return [node, quirks || s != ''] if s.length < 6

        key = s.slice(0, 6).downcase
        s = s.slice(6..-1)
        # Read the quoted identifier for each keyword. After a public
        # identifier a system identifier is looked for next.
        while key == 'public' || key == 'system'
          s = s.lstrip
          break if s.empty?
          quote = s[0]
          break if quote != '"' && quote != "'"
          s = s.slice(1..-1)
          closing = s.index(quote)
          if closing
            id = s.slice(0, closing)
            s = s.slice((closing + 1)..-1)
          else
            # Unterminated quote: the rest of the string is the identifier.
            id = s
            s = ''
          end
          node.attributes << Attribute.new(key: key, value: id)
          key = key == 'public' ? 'system' : ''
        end

        # The verdict is taken once, after all identifiers have been read.
        # NOTE(review): +key+ is still 'system' when a public identifier is
        # not followed by a system identifier, so that case reports quirks -
        # confirm this is intended.
        if key != '' || s != ''
          # Unknown keyword, missing identifier, or trailing junk.
          quirks = true
        elsif !node.attributes.empty?
          quirks = true if quirky_identifiers?(node.attributes)
        end
        [node, quirks]
      end

      # Reports whether the parsed identifiers select quirks mode.
      #
      # @param attributes the non-empty list of public/system attributes
      def quirky_identifiers?(attributes)
        first = attributes.first
        if first.key == 'public'
          pub = first.value.downcase
          return true if quirky_public_id?(pub)
          # These two public ids are quirky only without a system id.
          if attributes.length == 1 &&
             (pub.start_with?('-//w3c//dtd html 4.01 frameset//') ||
              pub.start_with?('-//w3c//dtd html 4.01 transitional//'))
            return true
          end
        end
        last = attributes.last
        last.key == 'system' &&
          last.value.downcase == 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
      end

      # Reports whether a lower-cased public id is one of the exact quirky
      # ids or starts with one of the QUIRKY_IDS prefixes.
      def quirky_public_id?(pub)
        case pub
        when '-//w3o//dtd w3 html strict 3.0//en//', '-/w3d/dtd html 4.0 transitional/en', 'html'
          true
        else
          QUIRKY_IDS.any? { |prefix| pub.start_with?(prefix) }
        end
      end

      private :parse_doctype, :quirky_identifiers?, :quirky_public_id?
    end
  end
end