gammo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,46 @@
module Gammo
  class Parser
    # Section 12.2.6.4.17.
    #
    # Handlers for the "in select in table" insertion mode. Table-structure
    # tags close the currently open <select> and hand the token back for
    # reprocessing; everything else is delegated to InSelect.
    #
    # NOTE(review): `halt` is inherited from InsertionMode (not visible here);
    # it is presumably called with true when the token is consumed and false
    # when it must be reprocessed -- confirm against InsertionMode.
    class InSelectInTable < InsertionMode
      # A table-structure start tag always closes the open <select>.
      #
      # The previous implementation guarded this with a check for
      # Tokenizer::EndTagToken, which can never match inside the start-tag
      # handler, so the guard has been dropped.
      def start_tag_token(token)
        case token.tag
        when Tags::Caption, Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr, Tags::Td, Tags::Th
          close_select_and_reprocess
        end
      end

      # A table-structure end tag closes the open <select> only when the
      # matching element is in table scope; otherwise the token is ignored.
      def end_tag_token(token)
        case token.tag
        when Tags::Caption, Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr, Tags::Td, Tags::Th
          # ignore the token
          halt true unless parser.element_in_scope?(TABLE_SCOPE, token.tag)
          close_select_and_reprocess
        end
      end

      # Anything not handled above is processed by the InSelect rules.
      def default(_)
        halt InSelect.new(parser).process
      end

      private

      # Truncates the stack of open elements just below the most recently
      # opened element tagged as <select> (matched on tag only), resets the
      # insertion mode and halts with false.
      def close_select_and_reprocess
        parser.open_elements.reverse_each_with_index do |elm, i|
          if elm.tag == Tags::Select
            parser.open_elements = parser.open_elements.slice(0, i)
            break
          end
        end
        parser.reset_insertion_mode
        halt false
      end
    end
  end
end
@@ -0,0 +1,114 @@
module Gammo
  class Parser
    # Section 12.2.6.4.9.
    #
    # Handlers for the "in table" insertion mode.
    #
    # NOTE(review): `halt` is inherited from InsertionMode (not visible here);
    # it is presumably called with true when the token is consumed and false
    # when it must be reprocessed. Handlers that return without halting
    # presumably fall through to #default -- confirm against InsertionMode.
    class InTable < InsertionMode
      # Strips NUL characters from the text. Whitespace-only text directly
      # inside a table-structure element is inserted as-is; any other text is
      # left for the fallback handling.
      def text_token(token)
        token.data = token.data.gsub("\x00", "")
        case parser.open_elements.last.tag
        when Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr
          if token.data.strip == ""
            parser.add_text token.data
            halt true
          end
        end
      end

      def start_tag_token(token)
        case token.tag
        when Tags::Caption
          parser.clear_stack_to_context(TABLE_SCOPE)
          parser.active_formatting_elements << Node::DEFAULT_SCOPE_MARKER
          parser.add_element
          parser.insertion_mode = InCaption
          halt true
        when Tags::Colgroup
          parser.clear_stack_to_context(TABLE_SCOPE)
          parser.add_element
          parser.insertion_mode = InColumnGroup
          halt true
        when Tags::Col
          # A bare <col> implies an enclosing <colgroup>; reprocess afterwards.
          parser.parse_implied_token(Tokenizer::StartTagToken, Tags::Colgroup, Tags::Colgroup.to_s)
          halt false
        when Tags::Tbody, Tags::Tfoot, Tags::Thead
          parser.clear_stack_to_context(TABLE_SCOPE)
          parser.add_element
          parser.insertion_mode = InTableBody
          halt true
        when Tags::Td, Tags::Th, Tags::Tr
          # Cells and rows imply an enclosing <tbody>; reprocess afterwards.
          parser.parse_implied_token(Tokenizer::StartTagToken, Tags::Tbody, Tags::Tbody.to_s)
          halt false
        when Tags::Table
          # A nested <table> start tag closes the current table first.
          if parser.pop_until(TABLE_SCOPE, Tags::Table)
            parser.reset_insertion_mode
            halt false
          end
          # ignore the token
          halt true
        when Tags::Style, Tags::Script, Tags::Template
          halt InHead.new(parser).process
        when Tags::Input
          token.attributes.each do |attr|
            # skip setting frameset_ok = false
            if attr.key == 'type' && attr.value.downcase == 'hidden'
              parser.add_element
              parser.open_elements.pop
              halt true
            end
          end
          # Non-hidden inputs are not halted here and are left to the
          # fallback handling.
        when Tags::Form
          # ignore the token
          halt true if parser.form || parser.open_elements.any? { |open_element| open_element.tag == Tags::Template }
          parser.add_element
          parser.form = parser.open_elements.pop
        when Tags::Select
          parser.reconstruct_active_formatting_elements
          case parser.top.tag
          when Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr
            parser.foster_parenting = true
          end
          parser.add_element
          parser.foster_parenting = false
          # Inserting a <select> sets the frameset-ok flag to "not ok".
          parser.frameset_ok = false
          parser.insertion_mode = InSelectInTable
          halt true
        end
      end

      def end_tag_token(token)
        case token.tag
        when Tags::Table
          parser.reset_insertion_mode if parser.pop_until(TABLE_SCOPE, Tags::Table)
          # Ignore the token
          halt true
        when Tags::Body, Tags::Caption, Tags::Col, Tags::Colgroup, Tags::Html,
             Tags::Tbody, Tags::Td, Tags::Tfoot, Tags::Th, Tags::Thead, Tags::Tr
          # Ignore the token
          halt true
        when Tags::Template
          halt InHead.new(parser).process
        end
      end

      # Comments are appended at the current insertion point.
      def comment_token(token)
        parser.add_child(Node::Comment.new(data: token.data))
        halt true
      end

      def doctype_token(token)
        # Ignore the token
        halt true
      end

      # Error tokens are handled by the InBody rules. The result is passed to
      # `halt`, matching the other delegating handlers, so the InBody result is
      # not discarded.
      def error_token(token)
        halt InBody.new(parser).process
      end

      # Anything else is processed with the InBody rules while foster
      # parenting is enabled. The flag is reset in `ensure` so that it cannot
      # stay enabled if InBody raises.
      def default(_)
        parser.foster_parenting = true
        result =
          begin
            InBody.new(parser).process
          ensure
            parser.foster_parenting = false
          end
        halt result
      end
    end
  end
end
@@ -0,0 +1,55 @@
module Gammo
  class Parser
    # Section 12.2.6.4.13.
    #
    # Handlers for the "in table body" insertion mode. Tokens that no handler
    # halts on are delegated to InTable via #default.
    class InTableBody < InsertionMode
      def start_tag_token(token)
        tag = token.tag
        case tag
        when Tags::Tr
          parser.clear_stack_to_context(TABLE_BODY_SCOPE)
          parser.add_element
          parser.insertion_mode = InRow
          halt true
        when Tags::Td, Tags::Th
          # A cell outside a row implies a <tr>; reprocess afterwards.
          parser.parse_implied_token(Tokenizer::StartTagToken, Tags::Tr, Tags::Tr.to_s)
          halt false
        when Tags::Caption, Tags::Col, Tags::Colgroup, Tags::Tbody, Tags::Tfoot, Tags::Thead
          close_section_or_ignore
        end
      end

      def end_tag_token(token)
        tag = token.tag
        case tag
        when Tags::Tbody, Tags::Tfoot, Tags::Thead
          # Nothing to close when the section is not in table scope.
          halt true unless parser.element_in_scope?(TABLE_SCOPE, tag)
          parser.clear_stack_to_context(TABLE_BODY_SCOPE)
          parser.open_elements.pop
          parser.insertion_mode = InTable
          halt true
        when Tags::Table
          close_section_or_ignore
        when Tags::Body, Tags::Caption, Tags::Colgroup, Tags::Html, Tags::Td, Tags::Th, Tags::Tr
          # ignore the token
          halt true
        end
      end

      # Comments are appended at the current insertion point.
      def comment_token(token)
        comment = Node::Comment.new(data: token.data)
        parser.add_child(comment)
        halt true
      end

      # Everything else follows the InTable rules.
      def default(_)
        halt InTable.new(parser).process
      end

      private

      # Pops the open table section (tbody/thead/tfoot) if one is in table
      # scope, switches to InTable and halts with false; when no section can
      # be popped the token is ignored and true is halted instead.
      def close_section_or_ignore
        if parser.pop_until(TABLE_SCOPE, Tags::Tbody, Tags::Thead, Tags::Tfoot)
          parser.insertion_mode = InTable
          halt false
        end
        # ignore the token
        halt true
      end
    end
  end
end
@@ -0,0 +1,80 @@
module Gammo
  class Parser
    # Section 12.2.6.4.18.
    #
    # Handlers for the "in template" insertion mode.
    class InTemplate < InsertionMode
      # Text, comments and doctypes are all handled by the InBody rules.
      def text_token(token)
        delegate_to_in_body
      end

      def comment_token(token)
        delegate_to_in_body
      end

      def doctype_token(token)
        delegate_to_in_body
      end

      # Head-level tags are handled by InHead. Any other start tag selects the
      # insertion mode that fits the tag, replaces the top of the template
      # stack with it, makes it the current mode and halts with false.
      def start_tag_token(token)
        tag = token.tag
        case tag
        when Tags::Base, Tags::Basefont, Tags::Bgsound, Tags::Link, Tags::Meta, Tags::Noframes, Tags::Script, Tags::Style, Tags::Template, Tags::Title
          halt InHead.new(parser).process
        end

        next_mode =
          case tag
          when Tags::Caption, Tags::Colgroup, Tags::Tbody, Tags::Tfoot, Tags::Thead then InTable
          when Tags::Col then InColumnGroup
          when Tags::Tr then InTableBody
          when Tags::Td, Tags::Th then InRow
          else InBody
          end

        parser.template_stack.pop
        parser.template_stack << next_mode
        parser.insertion_mode = next_mode
        halt false
      end

      # Only </template> is meaningful here; every other end tag is ignored.
      def end_tag_token(token)
        halt InHead.new(parser).process if token.tag == Tags::Template
        # ignore the token
        halt true
      end

      # With no <template> on the stack of open elements the token is ignored.
      # Otherwise the innermost non-namespaced <template> and everything above
      # it is dropped from the stack, and the insertion mode is reset.
      def error_token(token)
        template_open = parser.open_elements.any? { |node| node.tag == Tags::Template }
        halt true unless template_open
        # remove this divergence from the html5 spec
        parser.generate_implied_end_tags
        parser.open_elements.reverse_each_with_index do |node, position|
          next unless !node.namespace && node.tag == Tags::Template
          parser.open_elements = parser.open_elements.slice(0, position)
          break
        end
        parser.clear_active_formatting_elements
        parser.template_stack.pop
        parser.reset_insertion_mode
        halt false
      end

      def default(_)
        halt false
      end

      private

      # Processes the current token with the InBody rules and halts with the
      # result.
      def delegate_to_in_body
        halt InBody.new(parser).process
      end
    end
  end
end
@@ -0,0 +1,152 @@
1
+ require 'gammo/attribute'
2
+
module Gammo
  class Parser
    # Section 12.2.6.4.1
    #
    # Handlers for the "initial" insertion mode, including DOCTYPE parsing and
    # quirks-mode detection.
    #
    # NOTE(review): `halt` is inherited from InsertionMode (not visible here);
    # it is presumably called with true when the token is consumed and false
    # when it must be reprocessed -- confirm against InsertionMode.
    class Initial < InsertionMode
      # Drops leading whitespace from the text; text that was only whitespace
      # is ignored entirely.
      def text_token(token)
        token.data = token.data.lstrip
        # it's all whitespace so ignore it.
        halt true if token.data.length.zero?
      end

      # Comments seen before the doctype are attached to the document itself.
      def comment_token(token)
        parser.document.append_child(Node::Comment.new(data: token.data))
        halt true
      end

      # Appends the parsed doctype node to the document, records the quirks
      # flag and moves on to BeforeHTML.
      def doctype_token(token)
        n, quirks = parse_doctype(token.data)
        parser.document.append_child(n)
        parser.quirks = quirks
        parser.insertion_mode = BeforeHTML
        halt true
      end

      # Any other token means there is no doctype: quirks mode is enabled and
      # the token is handed on after switching to BeforeHTML.
      def default(_)
        parser.quirks = true
        parser.insertion_mode = BeforeHTML
        halt false
      end

      # Lower-cased public identifier prefixes that trigger quirks mode.
      QUIRKY_IDS = [
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//"
      ].freeze

      # Parses the text of a DOCTYPE token.
      #
      # @param s [String] doctype text: a name optionally followed by
      #   PUBLIC/SYSTEM keywords and quoted identifiers.
      # @return [Array] two elements: the Node::Doctype (data holds the
      #   lower-cased name, attributes hold the "public"/"system" identifiers)
      #   and a boolean telling whether the doctype triggers quirks mode.
      def parse_doctype(s)
        node = Node::Doctype.new

        # The name ends at the first HTML whitespace character (space, tab,
        # CR, LF or form feed), or at the end of the string.
        pos = s.index(/[ \t\r\n\f]/) || s.length
        node.data = s.slice(0, pos)
        # The comparison to "html" is deliberately case-sensitive.
        quirks = node.data != 'html'
        node.data = node.data.downcase
        s = s.slice(pos..-1).lstrip

        # Too short to start with PUBLIC or SYSTEM: any leftover text means
        # quirks mode.
        return [node, quirks || s != ''] if s.length < 6

        key = s.slice(0, 6).downcase
        s = s.slice(6..-1)
        while key == 'public' || key == 'system'
          s = s.lstrip
          break if s.empty?
          quote = s[0]
          break unless quote == '"' || quote == "'"
          s = s.slice(1..-1)
          q = s.index(quote)
          if q
            id = s.slice(0, q)
            s = s.slice((q + 1)..-1)
          else
            # Unterminated identifier: take the rest of the string.
            id = s
            s = ''
          end
          node.attributes << Attribute.new(key: key, value: id)
          # A public identifier may be followed by a system identifier;
          # nothing may follow a system identifier.
          key = key == 'public' ? 'system' : ''
        end

        # Quirks are evaluated once, after all identifiers have been read.
        # A leftover keyword or trailing text means the doctype was malformed.
        quirks = true if key != '' || s != '' || quirky_identifiers?(node.attributes)
        [node, quirks]
      end

      # Tells whether the collected public/system identifiers trigger quirks
      # mode.
      #
      # @param attributes [#first, #last, #length] identifier attributes in
      #   the order they were parsed.
      # @return [Boolean]
      def quirky_identifiers?(attributes)
        return false unless attributes.length > 0

        first = attributes.first
        if first.key == 'public'
          pub = first.value.downcase
          case pub
          when '-//w3o//dtd w3 html strict 3.0//en//', '-/w3d/dtd html 4.0 transitional/en', 'html'
            return true
          end
          return true if QUIRKY_IDS.any? { |quirky| pub.start_with?(quirky) }
          # These two public identifiers only cause quirks mode when there is
          # no system identifier.
          if attributes.length == 1 &&
             pub.start_with?('-//w3c//dtd html 4.01 frameset//', '-//w3c//dtd html 4.01 transitional//')
            return true
          end
        end

        last = attributes.last
        last.key == 'system' && last.value.downcase == 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
      end

      private :parse_doctype, :quirky_identifiers?
    end
  end
end