gammo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,94 @@
1
module Gammo
  class Parser
    # Constants that do not belong to one particular concept of the parser.
    module Constants
      # Lookup table of the element names that have varying levels of special
      # parsing rules. Every listed name maps to +true+; any other name yields
      # +nil+, so the hash can be used directly as a membership test.
      # Section 12.2.4.2.
      # @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
      SPECIAL_ELEMENTS = %w[
        address applet area article aside
        base basefont bgsound blockquote body br button
        caption center col colgroup
        dd details dir div dl dt
        embed
        fieldset figcaption figure footer form frame frameset
        h1 h2 h3 h4 h5 h6 head header hgroup hr html
        iframe img input
        keygen
        li link listing
        main marquee menu meta
        nav noembed noframes noscript
        object ol
        p param plaintext pre
        script section select source style summary
        table tbody td template textarea tfoot th thead title tr track
        ul
        wbr
        xmp
      ].each_with_object({}) { |name, table| table[name] = true }.freeze
    end
  end
end
@@ -0,0 +1,307 @@
1
+ require 'gammo/node'
2
+ require 'gammo/tags'
3
+
4
module Gammo
  class Parser
    # A set of methods and constants for parsing foreign content.
    # Section 12.2.6.5.
    # @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
    module Foreign
      # Start tag names that break out of foreign content. Each listed name
      # maps to +true+; other names yield +nil+.
      BREAKOUT = %w[
        b big blockquote body br center code dd div dl dt em embed
        h1 h2 h3 h4 h5 h6 head hr i img li listing menu meta nobr ol p pre
        ruby s small span strong strike sub sup table tt u ul var
      ].each_with_object({}) { |name, table| table[name] = true }.freeze

      # Maps a lower-cased tag name to the mixed-case spelling used for
      # elements in the "svg" namespace. Every key is simply the downcased
      # form of its value.
      SVG_TAG_NAME_ADJUSTMENTS = %w[
        altGlyph altGlyphDef altGlyphItem
        animateColor animateMotion animateTransform
        clipPath
        feBlend feColorMatrix feComponentTransfer feComposite feConvolveMatrix
        feDiffuseLighting feDisplacementMap feDistantLight feFlood
        feFuncA feFuncB feFuncG feFuncR
        feGaussianBlur feImage feMerge feMergeNode feMorphology feOffset
        fePointLight feSpecularLighting feSpotLight feTile feTurbulence
        foreignObject glyphRef linearGradient radialGradient textPath
      ].each_with_object({}) { |name, table| table[name.downcase] = name }.freeze

      # Maps a lower-cased attribute key to the spelling used for elements in
      # the "math" namespace.
      # Section 12.2.6.1.
      # https://html.spec.whatwg.org/multipage/parsing.html#creating-and-inserting-nodes
      MATH_ML_ATTRIBUTE_ADJUSTMENTS = { "definitionurl" => "definitionURL" }.freeze

      # Maps a lower-cased attribute key to the mixed-case spelling used for
      # elements in the "svg" namespace. Every key is simply the downcased
      # form of its value.
      # Section 12.2.6.1.
      # https://html.spec.whatwg.org/multipage/parsing.html#creating-and-inserting-nodes
      SVG_ATTRIBUTE_ADJUSTMENTS = %w[
        attributeName attributeType
        baseFrequency baseProfile
        calcMode clipPathUnits contentScriptType contentStyleType
        diffuseConstant
        edgeMode externalResourcesRequired
        filterUnits
        glyphRef gradientTransform gradientUnits
        kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
        lengthAdjust limitingConeAngle
        markerHeight markerUnits markerWidth maskContentUnits maskUnits
        numOctaves
        pathLength patternContentUnits patternTransform patternUnits
        pointsAtX pointsAtY pointsAtZ
        preserveAlpha preserveAspectRatio primitiveUnits
        refX refY repeatCount repeatDur requiredExtensions requiredFeatures
        specularConstant specularExponent spreadMethod startOffset
        stdDeviation stitchTiles surfaceScale systemLanguage
        tableValues targetX targetY textLength
        viewBox viewTarget
        xChannelSelector
        yChannelSelector
        zoomAndPan
      ].each_with_object({}) { |name, table| table[name.downcase] = name }.freeze

      # Handles the current token according to the foreign content rules.
      #
      # - Text: clears +frameset_ok+ when the text is more than leading
      #   whitespace/NUL, replaces NUL with U+FFFD and appends the text.
      # - Comment: appends a comment node.
      # - Start tag: outside fragment parsing, a breakout tag (or a +font+ tag
      #   carrying a color/face/size attribute) trims the stack of open
      #   elements and returns +false+. Otherwise tag and attribute names are
      #   adjusted for the namespace of the adjusted current node and the
      #   element is inserted in that namespace.
      # - End tag: walks the open elements from the top; an element without a
      #   namespace hands the token to the current insertion mode, an element
      #   whose name matches case-insensitively is removed together with
      #   everything above it.
      #
      # @return [Boolean, Object] +false+ for a breakout start tag, the result
      #   of the insertion mode's +process+ when an end tag is delegated, and
      #   +true+ otherwise.
      # @raise [ParseError] if the adjusted current node is neither in the
      #   "math" nor in the "svg" namespace when a start tag is inserted.
      def parse_foreign_content
        case token
        when Tokenizer::TextToken
          if frameset_ok
            self.frameset_ok = token.data.lstrip.sub(/\A\x00*/, '').lstrip.empty?
          end
          token.data = token.data.gsub(/\x00/, "\ufffd")
          add_text(token.data)
        when Tokenizer::CommentToken
          add_child(Node::Comment.new(data: token.data))
        when Tokenizer::StartTagToken
          unless fragment?
            breakout = BREAKOUT[token.data]
            # A <font> tag only breaks out when it carries a presentational attribute.
            if token.tag == Tags::Font &&
               token.attributes.any? { |attr| %w[color face size].include?(attr.key) }
              breakout = true
            end
            if breakout
              # Cut the stack back to the nearest HTML element or integration point.
              open_elements.reverse_each_with_index do |node, position|
                boundary = !node.namespace ||
                           html_integration_point?(node) ||
                           math_ml_text_integration_point?(node)
                next unless boundary
                self.open_elements = open_elements.slice(0, position + 1)
                break
              end
              return false
            end
          end

          current = adjusted_current_node
          case current.namespace
          when 'math'
            adjust_attribute_names(token.attributes, MATH_ML_ATTRIBUTE_ADJUSTMENTS)
          when 'svg'
            adjusted_name = SVG_TAG_NAME_ADJUSTMENTS[token.data]
            if adjusted_name
              token.tag = Tags.lookup(adjusted_name)
              token.data = adjusted_name
            end
            adjust_attribute_names(token.attributes, SVG_ATTRIBUTE_ADJUSTMENTS)
          else
            raise ParseError, 'bad parser state: unexpected namespace'
          end
          adjust_foreign_attributes(token.attributes)

          namespace = current.namespace
          add_element
          top.namespace = namespace
          tokenizer.next_is_not_raw_text! if namespace
          if has_self_closing_token
            open_elements.pop
            acknowledge_self_closing_tag
          end
        when Tokenizer::EndTagToken
          open_elements.reverse_each_with_index do |node, position|
            return insertion_mode.new(self).process unless node.namespace
            next unless node.data.downcase == token.data.downcase
            self.open_elements = open_elements.slice(0, position)
            break
          end
          return true
        end
        # Any other kind of token is ignored.
        true
      end

      # Tells whether the current token has to be handled by
      # {#parse_foreign_content}.
      #
      # @return [Boolean] +false+ when there are no open elements, when the
      #   adjusted current node has no namespace, when an integration point
      #   applies to the current token, or when the token is an error token;
      #   +true+ otherwise.
      def in_foreign_content?
        return false if open_elements.length.zero?

        node = adjusted_current_node
        return false unless node.namespace

        start_tag = token.instance_of?(Tokenizer::StartTagToken)
        text = token.instance_of?(Tokenizer::TextToken)

        if math_ml_text_integration_point?(node)
          # token.tag is only consulted once the token is known to be a start tag.
          return false if start_tag && token.tag != Tags::Mglyph && token.tag != Tags::Malignmark
          return false if text
        end
        if node.namespace == 'math' && node.tag == Tags::AnnotationXml &&
           start_tag && token.tag == Tags::Svg
          return false
        end
        return false if html_integration_point?(node) && (start_tag || text)
        return false if token.instance_of?(Tokenizer::ErrorToken)

        true
      end

      # @param node [#namespace, #data] node to inspect
      # @return [Boolean] +true+ if the node is in the "math" namespace and is
      #   named mi, mo, mn, ms or mtext.
      def math_ml_text_integration_point?(node)
        node.namespace == 'math' && %w[mi mo mn ms mtext].include?(node.data)
      end

      # @param node [Object] node to inspect; anything that is not exactly a
      #   Node::Element is rejected.
      # @return [Boolean] +true+ for a "math" annotation-xml element whose
      #   +encoding+ attribute is (case-insensitively) text/html or
      #   application/xhtml+xml, and for an "svg" desc, foreignObject or title
      #   element; +false+ otherwise.
      def html_integration_point?(node)
        return false unless node.instance_of?(Node::Element)

        case node.namespace
        when 'math'
          return false unless node.data == 'annotation-xml'
          node.attributes.any? do |attr|
            attr.key == 'encoding' &&
              ['text/html', 'application/xhtml+xml'].include?(attr.value.downcase)
          end
        when 'svg'
          %w[desc foreignObject title].include?(node.data)
        else
          false
        end
      end

      # Renames, in place, every attribute whose key is present in +map+.
      #
      # @param attrs [Enumerable] attributes responding to +key+ and +key=+
      # @param map [Hash{String => String}] original key => adjusted key
      # @return [Enumerable] +attrs+ itself
      def adjust_attribute_names(attrs, map)
        attrs.each do |attr|
          next unless map.key?(attr.key)
          attr.key = map[attr.key]
        end
      end

      # Splits the known "prefix:local" attribute keys in place: the prefix is
      # stored as the attribute's namespace and the local part becomes the key.
      # All other attributes are left untouched.
      #
      # @param attrs [Array] attributes responding to +key+, +key=+ and +namespace=+
      # @return [Array] +attrs+ itself
      def adjust_foreign_attributes(attrs)
        qualified = [
          "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role",
          "xlink:show", "xlink:title", "xlink:type",
          "xml:base", "xml:lang", "xml:space",
          "xmlns:xlink"
        ]
        attrs.each_with_index do |attr, position|
          name = attr.key
          # Cheap pre-filter: every qualified name starts with "x" (an empty
          # key never does).
          next unless name.start_with?("x")
          next unless qualified.include?(name)
          prefix, local = name.split(":", 2)
          attrs[position].namespace = prefix
          attrs[position].key = local
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'gammo/tokenizer/tokens'
2
+
3
module Gammo
  class Parser
    # Base class for insertion modes. It dispatches the parser's current token
    # to a handler method named after the token kind. This class calls a
    # +default+ method that it does not define itself.
    class InsertionMode
      # @return [Object] the parser this insertion mode works for
      attr_reader :parser

      # @param parser [Object] object exposing the current token via +token+
      def initialize(parser)
        @parser = parser
      end

      # Routes the parser's current token to the handler for its kind; a token
      # of no known kind goes straight to +default+.
      #
      # @return [Object] whatever {#consume} or +default+ returns
      def process
        token = parser.token
        handler =
          case token
          when Tokenizer::ErrorToken          then :error_token
          when Tokenizer::TextToken           then :text_token
          when Tokenizer::StartTagToken       then :start_tag_token
          when Tokenizer::EndTagToken         then :end_tag_token
          when Tokenizer::SelfClosingTagToken then :self_closing_tag_token
          when Tokenizer::CommentToken        then :comment_token
          when Tokenizer::DoctypeToken        then :doctype_token
          end
        handler ? consume(handler) : default(token)
      end

      private

      # Stops the handling started by {#consume} and makes +consumed+ its result.
      #
      # @param consumed [Object] value returned from {#consume}
      def halt(consumed)
        throw(:halt, consumed)
      end

      # Invokes the handler +name+ with the current token if this object
      # responds to it, then falls through to +default+. A handler skips
      # +default+ by calling {#halt}.
      #
      # @param name [Symbol] handler method name
      # @return [Object] the value passed to {#halt}, or the result of +default+
      def consume(name)
        catch(:halt) do
          current = parser.token
          __send__(name, current) if respond_to?(name)
          default(current)
        end
      end

      # Appends to +dst+ every attribute of +src+ whose key +dst+ does not
      # have yet. Attributes already on +dst+ are left as they are.
      #
      # @param dst [#attributes] receiver of the missing attributes
      # @param src [#attributes] provider of attributes
      def copy_attributes(dst, src)
        return if src.attributes.length.zero?

        known = {}
        dst.attributes.each { |existing| known[existing.key] = existing.value }
        src.attributes.each do |incoming|
          next if known.key?(incoming.key)
          dst.attributes << incoming
          known[incoming.key] = incoming.value
        end
      end
    end
  end
end
53
+
54
+ require 'gammo/parser/insertion_mode/in_table'
55
+ require 'gammo/parser/insertion_mode/after_head'
56
+ require 'gammo/parser/insertion_mode/in_template'
57
+ require 'gammo/parser/insertion_mode/in_cell'
58
+ require 'gammo/parser/insertion_mode/in_column_group'
59
+ require 'gammo/parser/insertion_mode/text'
60
+ require 'gammo/parser/insertion_mode/in_body'
61
+ require 'gammo/parser/insertion_mode/in_row'
62
+ require 'gammo/parser/insertion_mode/initial'
63
+ require 'gammo/parser/insertion_mode/before_html'
64
+ require 'gammo/parser/insertion_mode/in_table_body'
65
+ require 'gammo/parser/insertion_mode/before_head'
66
+ require 'gammo/parser/insertion_mode/in_frameset'
67
+ require 'gammo/parser/insertion_mode/after_body'
68
+ require 'gammo/parser/insertion_mode/after_frameset'
69
+ require 'gammo/parser/insertion_mode/in_caption'
70
+ require 'gammo/parser/insertion_mode/after_after_body'
71
+ require 'gammo/parser/insertion_mode/in_head'
72
+ require 'gammo/parser/insertion_mode/in_head_noscript'
73
+ require 'gammo/parser/insertion_mode/in_select_in_table'
74
+ require 'gammo/parser/insertion_mode/in_select'