gammo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,94 @@
1
module Gammo
  class Parser
    # Lookup tables used by the parser that do not belong to one specific
    # stage of the parsing algorithm.
    module Constants
      # Tag names of the elements that have varying levels of special parsing
      # rules. The table maps each name to +true+ so that membership can be
      # tested with a plain hash lookup; any other name yields +nil+.
      # Section 12.2.4.2.
      # @see https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
      SPECIAL_ELEMENTS = %w[
        address applet area article aside
        base basefont bgsound blockquote body br button
        caption center col colgroup
        dd details dir div dl dt
        embed
        fieldset figcaption figure footer form frame frameset
        h1 h2 h3 h4 h5 h6 head header hgroup hr html
        iframe img input
        keygen
        li link listing
        main marquee menu meta
        nav noembed noframes noscript
        object ol
        p param plaintext pre
        script section select source style summary
        table tbody td template textarea tfoot th thead title tr track
        ul
        wbr
        xmp
      ].each_with_object({}) { |name, table| table[name] = true }.freeze
    end
  end
end
@@ -0,0 +1,307 @@
1
+ require 'gammo/node'
2
+ require 'gammo/tags'
3
+
4
+ module Gammo
5
+ class Parser
6
+ # A set of methods and constants for parsing foreign content.
7
+ # Section 12.2.6.5.
8
+ # @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
9
+ module Foreign
10
# Start tag names that break out of foreign content. Each name maps to
# +true+; looking up any other name yields +nil+.
BREAKOUT = %w[
  b big blockquote body br
  center code
  dd div dl dt
  em embed
  h1 h2 h3 h4 h5 h6 head hr
  i img
  li listing
  menu meta
  nobr
  ol
  p pre
  ruby
  s small span strong strike sub sup
  table tt
  u ul
  var
].each_with_object({}) { |name, table| table[name] = true }.freeze
57
+
58
# Maps the lower-cased spelling of an SVG tag name to its mixed-case
# spelling. When a start tag parsed as foreign content in the "svg"
# namespace has a name found among the keys, the name is replaced with the
# corresponding value. Every key is simply its value downcased, so the
# table is derived from the list of mixed-case names.
SVG_TAG_NAME_ADJUSTMENTS = %w[
  altGlyph altGlyphDef altGlyphItem
  animateColor animateMotion animateTransform
  clipPath
  feBlend feColorMatrix feComponentTransfer feComposite feConvolveMatrix
  feDiffuseLighting feDisplacementMap feDistantLight feFlood
  feFuncA feFuncB feFuncG feFuncR
  feGaussianBlur feImage feMerge feMergeNode feMorphology feOffset
  fePointLight feSpecularLighting feSpotLight feTile feTurbulence
  foreignObject
  glyphRef
  linearGradient
  radialGradient
  textPath
].each_with_object({}) { |name, table| table[name.downcase] = name }.freeze
99
+
100
# Attribute-name corrections for tokens parsed as foreign content in the
# "math" namespace: an attribute whose key appears on the left is renamed
# to the spelling on the right.
# Section 12.2.6.1.
# https://html.spec.whatwg.org/multipage/parsing.html#creating-and-inserting-nodes
MATH_ML_ATTRIBUTE_ADJUSTMENTS = { "definitionurl" => "definitionURL" }.freeze
108
+
109
# Attribute-name corrections for tokens parsed as foreign content in the
# "svg" namespace: an attribute whose key appears among the keys is renamed
# to the corresponding mixed-case value. Every key is simply its value
# downcased, so the table is derived from the list of mixed-case names.
# Section 12.2.6.1.
# https://html.spec.whatwg.org/multipage/parsing.html#creating-and-inserting-nodes
SVG_ATTRIBUTE_ADJUSTMENTS = %w[
  attributeName attributeType
  baseFrequency baseProfile
  calcMode clipPathUnits contentScriptType contentStyleType
  diffuseConstant
  edgeMode externalResourcesRequired
  filterUnits
  glyphRef gradientTransform gradientUnits
  kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
  lengthAdjust limitingConeAngle
  markerHeight markerUnits markerWidth maskContentUnits maskUnits
  numOctaves
  pathLength patternContentUnits patternTransform patternUnits
  pointsAtX pointsAtY pointsAtZ
  preserveAlpha preserveAspectRatio primitiveUnits
  refX refY repeatCount repeatDur requiredExtensions requiredFeatures
  specularConstant specularExponent spreadMethod startOffset
  stdDeviation stitchTiles surfaceScale systemLanguage
  tableValues targetX targetY textLength
  viewBox viewTarget
  xChannelSelector
  yChannelSelector
  zoomAndPan
].each_with_object({}) { |name, table| table[name.downcase] = name }.freeze
177
+
178
# Handles the parser's current token by the rules for foreign content.
# Section 12.2.6.5.
# @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
#
# @raise [ParseError] when a start tag is inserted while the adjusted current
#   node is in neither the "math" nor the "svg" namespace.
# @return [Object] +false+ when a breakout start tag was seen (the stack of
#   open elements is trimmed and the tag is not inserted here); for an end
#   tag that reaches an open element without a namespace, whatever the
#   current insertion mode's +process+ returns; +true+ in every other case.
def parse_foreign_content
  case token
  when Tokenizer::TextToken
    raw = token.data
    if frameset_ok
      # The flag survives only if nothing is left of the text once leading
      # whitespace and NUL characters have been trimmed away.
      self.frameset_ok = raw.lstrip.sub(/\A\x00*/, '').lstrip.empty?
    end
    # NUL characters are replaced with U+FFFD before the text is added.
    token.data = raw.gsub(/\x00/, "\ufffd")
    add_text token.data
  when Tokenizer::CommentToken
    add_child Node::Comment.new(data: token.data)
  when Tokenizer::StartTagToken
    unless fragment?
      leaves_foreign = BREAKOUT[token.data]
      # A <font> start tag breaks out only when it carries one of these
      # attributes.
      if token.tag == Tags::Font &&
         token.attributes.any? { |attribute| ['color', 'face', 'size'].include?(attribute.key) }
        leaves_foreign = true
      end
      if leaves_foreign
        # Walk the stack from the top and cut it just above the first
        # element that is non-namespaced or an integration point.
        open_elements.reverse_each_with_index do |element, position|
          next if element.namespace &&
                  !html_integration_point?(element) &&
                  !math_ml_text_integration_point?(element)
          self.open_elements = open_elements.slice(0, position + 1)
          break
        end
        return false
      end
    end
    namespace = adjusted_current_node.namespace
    case namespace
    when 'math'
      adjust_attribute_names(token.attributes, MATH_ML_ATTRIBUTE_ADJUSTMENTS)
    when 'svg'
      adjusted_name = SVG_TAG_NAME_ADJUSTMENTS[token.data]
      if adjusted_name
        token.tag = Tags.lookup(adjusted_name)
        token.data = adjusted_name
      end
      adjust_attribute_names(token.attributes, SVG_ATTRIBUTE_ADJUSTMENTS)
    else
      raise ParseError, 'bad parser state: unexpected namespace'
    end
    adjust_foreign_attributes(token.attributes)
    add_element
    # The inserted element inherits the namespace of the adjusted current node.
    top.namespace = namespace
    tokenizer.next_is_not_raw_text! if namespace
    if has_self_closing_token
      open_elements.pop
      acknowledge_self_closing_tag
    end
  when Tokenizer::EndTagToken
    open_elements.reverse_each_with_index do |element, position|
      # Reaching an element without a namespace hands the token over to the
      # current insertion mode.
      return insertion_mode.new(self).process unless element.namespace
      next unless element.data.downcase == token.data.downcase
      # Case-insensitive name match: drop it and everything above it.
      self.open_elements = open_elements.slice(0, position)
      break
    end
  end
  # Every remaining path, including ignored token types, reports true.
  true
end
244
+
245
# Tells whether the current token must be handled by the foreign-content
# rules instead of the current insertion mode.
#
# @return [Boolean] +false+ when the stack of open elements is empty, when
#   the adjusted current node has no namespace, when that node is an
#   integration point that hands the token back to the HTML rules, or when
#   the token is an error token; +true+ otherwise.
def in_foreign_content?
  return false if open_elements.length.zero?

  node = adjusted_current_node
  return false unless node.namespace

  start_tag = token.instance_of?(Tokenizer::StartTagToken)
  text = token.instance_of?(Tokenizer::TextToken)

  if math_ml_text_integration_point?(node)
    return false if text
    # Only <mglyph> and <malignmark> start tags stay in foreign content here.
    return false if start_tag && token.tag != Tags::Mglyph && token.tag != Tags::Malignmark
  end

  # An <svg> start tag directly inside MathML annotation-xml.
  if start_tag && node.namespace == 'math' && node.tag == Tags::AnnotationXml && token.tag == Tags::Svg
    return false
  end

  return false if html_integration_point?(node) && (start_tag || text)

  !token.instance_of?(Tokenizer::ErrorToken)
end
260
+
261
# Tells whether +node+ is a MathML text integration point.
#
# @param node [#namespace, #data] the element to inspect.
# @return [Boolean] +true+ only when the node is in the "math" namespace and
#   its name is one of mi, mo, mn, ms or mtext.
def math_ml_text_integration_point?(node)
  node.namespace == 'math' && %w[mi mo mn ms mtext].include?(node.data)
end
268
+
269
# Tells whether +node+ is an HTML integration point.
#
# @param node [Object] the node to inspect; anything that is not exactly a
#   Node::Element is never an integration point.
# @return [Boolean] +true+ for a MathML annotation-xml element carrying an
#   +encoding+ attribute equal (case-insensitively) to "text/html" or
#   "application/xhtml+xml", and for SVG desc, foreignObject and title
#   elements; +false+ otherwise.
def html_integration_point?(node)
  return false unless node.instance_of?(Node::Element)

  if node.namespace == 'math'
    return false unless node.data == 'annotation-xml'

    node.attributes.any? do |attribute|
      attribute.key == 'encoding' &&
        ['text/html', 'application/xhtml+xml'].include?(attribute.value.downcase)
    end
  elsif node.namespace == 'svg'
    ['desc', 'foreignObject', 'title'].include?(node.data)
  else
    false
  end
end
287
+
288
# Renames, in place, every attribute whose key is listed in +map+.
#
# @param attrs [Enumerable] attributes responding to +key+ and +key=+.
# @param map [Hash{String => String}] current key => replacement key.
# @return [Enumerable] +attrs+ itself, as returned by +each+.
def adjust_attribute_names(attrs, map)
  attrs.each do |attribute|
    next unless map.key?(attribute.key)

    attribute.key = map[attribute.key]
  end
end
291
+
292
# Splits the known prefixed attribute names ("xlink:*", "xml:*" and
# "xmlns:xlink") into a namespace and a local name, in place: the part
# before the colon is stored as the attribute's namespace and the part
# after it becomes the key. All other attributes are left untouched.
#
# The +case+ below is the only filter needed: it already rejects empty keys
# and keys that do not start with "x", and a nil key is skipped instead of
# raising NoMethodError. The yielded attribute is mutated directly rather
# than being looked up again by index.
#
# @param attrs [Enumerable] attributes responding to +key+, +key=+ and
#   +namespace=+.
# @return [Enumerable] +attrs+ itself, as returned by +each+.
def adjust_foreign_attributes(attrs)
  attrs.each do |attr|
    case attr.key
    when "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role",
         "xlink:show", "xlink:title", "xlink:type", "xml:base", "xml:lang",
         "xml:space", "xmlns:xlink"
      prefix, _colon, local_name = attr.key.partition(":")
      attr.namespace = prefix
      attr.key = local_name
    end
  end
end
305
+ end
306
+ end
307
+ end
@@ -0,0 +1,74 @@
1
+ require 'gammo/tokenizer/tokens'
2
+
3
module Gammo
  class Parser
    # Base class for the parser's insertion modes.
    #
    # A concrete mode is expected to define +default(token)+ and may define
    # any of the per-token handlers +error_token+, +text_token+,
    # +start_tag_token+, +end_tag_token+, +self_closing_tag_token+,
    # +comment_token+ and +doctype_token+, each taking the token. A handler
    # finishes processing early by calling +halt+ with the result; when it
    # returns normally, +default+ runs next. Handlers are looked up with
    # +respond_to?+, so only public ones are invoked.
    class InsertionMode
      attr_reader :parser

      # @param parser [Object] the parser whose current token (+parser.token+)
      #   this mode processes.
      def initialize(parser)
        @parser = parser
      end

      # Dispatches the parser's current token to the matching handler and
      # then to +default+.
      #
      # Tokens of an unrecognized type go straight to +default+. That call
      # also runs inside +catch :halt+, so a +default+ that finishes through
      # +halt+ returns its value instead of raising UncaughtThrowError.
      #
      # @return [Object] the value passed to +halt+, or the return value of
      #   +default+ when nothing halted.
      def process
        consume(handler_for(parser.token))
      end

      private

      # Maps a token to the name of its handler method.
      #
      # @param token [Object] the token to classify.
      # @return [Symbol, nil] the handler name, or +nil+ for an unrecognized
      #   token type.
      def handler_for(token)
        case token
        when Tokenizer::ErrorToken then :error_token
        when Tokenizer::TextToken then :text_token
        when Tokenizer::StartTagToken then :start_tag_token
        when Tokenizer::EndTagToken then :end_tag_token
        when Tokenizer::SelfClosingTagToken then :self_closing_tag_token
        when Tokenizer::CommentToken then :comment_token
        when Tokenizer::DoctypeToken then :doctype_token
        end
      end

      # Stops processing of the current token.
      #
      # @param consumed [Object] the value +process+ should return.
      def halt(consumed)
        throw :halt, consumed
      end

      # Runs the handler called +name+ (when given and publicly defined) and
      # then +default+, returning early if either of them calls +halt+.
      #
      # @param name [Symbol, nil] handler method name; +nil+ skips straight
      #   to +default+.
      # @return [Object] the halted value or the result of +default+.
      def consume(name)
        catch :halt do
          token = parser.token
          __send__(name, token) if name && respond_to?(name)
          default token
        end
      end

      # Appends to +dst+ every attribute of +src+ whose key +dst+ does not
      # have yet. Attributes already on +dst+ keep their values, and a key
      # repeated within +src+ is copied only once.
      #
      # @param dst [#attributes] receiver of the missing attributes.
      # @param src [#attributes] provider of the attributes.
      def copy_attributes(dst, src)
        return if src.attributes.empty?

        known_keys = {}
        dst.attributes.each { |attribute| known_keys[attribute.key] = true }
        src.attributes.each do |attribute|
          next if known_keys.key?(attribute.key)

          dst.attributes << attribute
          known_keys[attribute.key] = true
        end
      end
    end
  end
end
53
+
54
+ require 'gammo/parser/insertion_mode/in_table'
55
+ require 'gammo/parser/insertion_mode/after_head'
56
+ require 'gammo/parser/insertion_mode/in_template'
57
+ require 'gammo/parser/insertion_mode/in_cell'
58
+ require 'gammo/parser/insertion_mode/in_column_group'
59
+ require 'gammo/parser/insertion_mode/text'
60
+ require 'gammo/parser/insertion_mode/in_body'
61
+ require 'gammo/parser/insertion_mode/in_row'
62
+ require 'gammo/parser/insertion_mode/initial'
63
+ require 'gammo/parser/insertion_mode/before_html'
64
+ require 'gammo/parser/insertion_mode/in_table_body'
65
+ require 'gammo/parser/insertion_mode/before_head'
66
+ require 'gammo/parser/insertion_mode/in_frameset'
67
+ require 'gammo/parser/insertion_mode/after_body'
68
+ require 'gammo/parser/insertion_mode/after_frameset'
69
+ require 'gammo/parser/insertion_mode/in_caption'
70
+ require 'gammo/parser/insertion_mode/after_after_body'
71
+ require 'gammo/parser/insertion_mode/in_head'
72
+ require 'gammo/parser/insertion_mode/in_head_noscript'
73
+ require 'gammo/parser/insertion_mode/in_select_in_table'
74
+ require 'gammo/parser/insertion_mode/in_select'