gammo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,524 @@
1
+ require 'delegate'
2
+ require 'gammo/node'
3
+ require 'gammo/tags'
4
+ require 'gammo/tokenizer'
5
+ require 'gammo/parser/node_stack'
6
+ require 'gammo/parser/foreign'
7
+ require 'gammo/parser/constants'
8
+ require 'gammo/parser/insertion_mode_stack'
9
+
10
+ module Gammo
11
+ # Class for parsing an HTML input and building an HTML tree.
12
+ class Parser
13
+ require 'gammo/parser/insertion_mode'
14
+
15
+ include Foreign
16
+ include Constants
17
+
18
+ # Raised if anything goes wrong while parsing an HTML.
19
+ ParseError = Class.new(ArgumentError)
20
+
21
+ # Default scope stop tags defined in 12.2.4.2.
22
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
23
+ # @!visibility private
24
+ DEFAULT_SCOPE_STOP_TAGS = {
25
+ nil => [Tags::Applet, Tags::Caption, Tags::Html, Tags::Table, Tags::Td,
26
+ Tags::Th, Tags::Marquee, Tags::Object, Tags::Template],
27
+ 'math' => [Tags::AnnotationXml, Tags::Mi, Tags::Mn, Tags::Mo, Tags::Ms,
28
+ Tags::Mtext],
29
+ 'svg' => [Tags::Desc, Tags::ForeignObject, Tags::Title]
30
+ }.freeze
31
+
32
+ # Scope constants defined in 12.2.4.2.
33
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
34
+
35
+ # @!visibility private
36
+ DEFAULT_SCOPE = 0
37
+
38
+ # @!visibility private
39
+ LIST_ITEM_SCOPE = 1
40
+
41
+ # @!visibility private
42
+ BUTTON_SCOPE = 2
43
+
44
+ # @!visibility private
45
+ TABLE_SCOPE = 3
46
+
47
+ # @!visibility private
48
+ TABLE_ROW_SCOPE = 4
49
+
50
+ # @!visibility private
51
+ TABLE_BODY_SCOPE = 5
52
+
53
+ # @!visibility private
54
+ SELECT_SCOPE = 6
55
+
56
+ # Tokenizer for parsing each token.
57
+ # @!visibility private
58
+ attr_accessor :tokenizer, :token
59
+
60
+ # The insertion mode is a state variable that controls the primary operation
61
+ # of the tree construction stage.
62
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
63
+ # @!visibility private
64
+ attr_accessor :insertion_mode
65
+
66
+ # The original insertion mode is set to this accessor, defined in 12.2.4.1.
67
+ # When the insertion mode is switched to "text" or "in table text",
68
+ # the original insertion mode is also set. This is the insertion mode to
69
+ # which the tree construction stage will return.
70
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
71
+ # @!visibility private
72
+ attr_accessor :original_insertion_mode
73
+
74
+ # `template_stack` represents the stack of template insertion modes.
75
+ # Defined in 12.2.4.1.
76
+ # @!visibility private
77
+ attr_accessor :template_stack
78
+
79
+ # The stack of open elements, defined in 12.2.4.2.
80
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
81
+ # @!visibility private
82
+ attr_accessor :open_elements
83
+
84
+ # The list of active formatting elements defined in 12.2.4.3.
85
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
86
+ # @!visibility private
87
+ attr_accessor :active_formatting_elements
88
+
89
+ # The element pointers defined in 12.2.4.4.
90
+ # https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers
91
+ attr_accessor :head, :form
92
+
93
+ # Other parsing state flags defined in 12.2.4.5.
94
+ # https://html.spec.whatwg.org/multipage/parsing.html#other-parsing-state-flags
95
+ attr_accessor :scripting, :frameset_ok
96
+ alias_method :scripting?, :scripting
97
+ alias_method :frameset_ok?, :frameset_ok
98
+
99
+ # Document root element
100
+ attr_accessor :document
101
+
102
+ # Self-closing flag defined in 12.2.5.
103
+ # Self-closing tags like <img /> are treated as start tag token, except
104
+ # `has_self_closing_token` is set while they are being processed.
105
+ # @!visibility private
106
+ attr_accessor :has_self_closing_token
107
+
108
+ # Quirks flag is defined in 12.2.5.
109
+ # The quirks flag indicates whether the parser is operating in "force-quirks" mode.
110
+ # @!visibility private
111
+ attr_accessor :quirks
112
+
113
+ # `foster_parenting` is set to true if a new element should be inserted
114
+ # according to the foster parenting rule defined in 12.2.6.1.
115
+ # https://html.spec.whatwg.org/multipage/parsing.html#creating-and-inserting-nodes
116
+ # @!visibility private
117
+ attr_accessor :foster_parenting
118
+
119
+ # The context element is for use in parsing an HTML fragment, defined in
120
+ # 12.2.4.2.
121
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
122
+ attr_accessor :context
123
+
124
+ # `input` is the original input text.
125
+ # @!visibility private
126
+ attr_reader :input
127
+
128
# Builds a parser over the given HTML input, starting with an empty
# document and empty parsing stacks.
# @param [String] input HTML source; it is also handed to the tokenizer
# @param [TrueClass, FalseClass] scripting initial value of the scripting flag
# @param [TrueClass, FalseClass] frameset_ok initial value of the frameset-ok flag
# @param [InsertionMode] insertion_mode insertion mode the parser starts in
# @param [Gammo::Node] context context element, if any
# @return [Gammo::Parser]
def initialize(input, scripting: true, frameset_ok: true, insertion_mode: Initial, context: nil)
  # Caller-supplied configuration.
  @input, @context = input, context
  @scripting, @frameset_ok = scripting, frameset_ok
  @insertion_mode = insertion_mode

  # Token source and the tree being built.
  @tokenizer = Tokenizer.new(input)
  @document = Node::Document.new

  # Every stack starts out empty.
  @open_elements = Parser::NodeStack.new([])
  @active_formatting_elements = Parser::NodeStack.new([])
  @template_stack = InsertionModeStack.new([])

  # Flags start cleared; no token or element pointer is set yet.
  @foster_parenting = @has_self_closing_token = @quirks = false
  @token = @form = @head = nil
end
153
+
154
# Pulls tokens from the tokenizer one at a time and dispatches each of them
# through `parse_current_token` until the end-of-stream token is seen.
# @raise [Gammo::Parser::ParseError] may propagate from the tree construction
#   helpers invoked while processing a token.
# @return [Gammo::Node::Document, nil] the document built so far, or nil when
#   the tokenizer yields an error token other than the end-of-stream marker.
def parse
  until token == Tokenizer::EOS
    # CDATA sections are allowed only in foreign content, so the tokenizer is
    # told whether the current node carries a namespace.
    current = open_elements.last
    tokenizer.allow_cdata!(current && current.namespace)

    self.token = tokenizer.next_token
    failed = token.instance_of?(Tokenizer::ErrorToken) && token != Tokenizer::EOS
    return if failed

    parse_current_token
    break if token == Tokenizer::EOS
  end
  document
end
169
+
170
# Tells whether this parser is parsing an HTML fragment. This
# implementation unconditionally answers false.
# @return [FalseClass]
# @!visibility private
def fragment?
  false
end
176
+
177
# Checks whether the given node counts as a "special" element, as listed in
# 12.2.4.2.
# https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
#
# Nodes without a namespace (or in the 'html' namespace) are looked up in
# SPECIAL_ELEMENTS by name; MathML and SVG nodes are checked against fixed
# name lists. Anything else yields nil.
#
# @param [Gammo::Node] node
# @return [TrueClass, FalseClass, nil] the SPECIAL_ELEMENTS entry for HTML
#   nodes; true or nil for 'math' and 'svg' nodes; nil otherwise
# @see Gammo::Parser::Constants::SPECIAL_ELEMENTS
# @!visibility private
def special_element?(node)
  case node.namespace
  when nil, 'html'
    SPECIAL_ELEMENTS[node.data]
  when 'math'
    true if %w[mi mo mn ms mtext annotation-xml].include?(node.data)
  when 'svg'
    true if %w[foreignObject desc title].include?(node.data)
  end
end
201
+
202
# Processes a synthetic token as if it had come from the tokenizer, then
# puts the real current token and self-closing flag back.
# @param [Class] tok token class; instantiated as `tok.new(data, tag: tag)`
# @param tag tag passed to the token constructor
# @param data data passed to the token constructor
# @!visibility private
def parse_implied_token(tok, tag, data)
  saved_token = token
  saved_flag = has_self_closing_token

  @token = tok.new(data, tag: tag)
  @has_self_closing_token = false
  parse_current_token

  @token, @has_self_closing_token = saved_token, saved_flag
end
210
+
211
# Looks for one of `match_tags` in the given scope and, when found, truncates
# the stack of open elements so that the matched element and everything
# above it are removed.
# @param [Integer] scope scope constant forwarded to `index_of_element_in_scope`
# @param [Array] match_tags tags to look for
# @return [TrueClass, FalseClass] true when a match was found and popped
# @!visibility private
def pop_until(scope, *match_tags)
  found_at = index_of_element_in_scope(scope, *match_tags)
  return false if found_at == -1

  @open_elements = open_elements.slice(0, found_at)
  true
end
220
+
221
# Walks the stack of open elements from the most recently pushed element
# downwards, looking for an element without a namespace whose tag is one of
# `match_tags`. The walk gives up as soon as it meets an element that
# terminates the requested scope.
#
# NOTE(review): relies on `open_elements.reverse_each_with_index` yielding
# each element together with its real index in the stack; that helper is
# defined outside this file.
#
# @param [Integer] scope one of DEFAULT_SCOPE, LIST_ITEM_SCOPE, BUTTON_SCOPE,
#   TABLE_SCOPE or SELECT_SCOPE
# @param [Array] match_tags tags to look for
# @raise [ParseError] when an un-namespaced, non-matching element is inspected
#   with any other scope value
# @return [Integer] index of the match in `open_elements`, or -1 if none is
#   in scope
# @!visibility private
def index_of_element_in_scope(scope, *match_tags)
  open_elements.reverse_each_with_index do |open_element, index|
    tag = open_element.tag
    unless open_element.namespace
      return index if match_tags.include?(tag)

      case scope
      when DEFAULT_SCOPE
        # No scope-specific terminators; only the shared list below applies.
      when LIST_ITEM_SCOPE
        return -1 if tag == Tags::Ol || tag == Tags::Ul
      when BUTTON_SCOPE
        return -1 if tag == Tags::Button
      when TABLE_SCOPE
        return -1 if tag == Tags::Html || tag == Tags::Table || tag == Tags::Template
      when SELECT_SCOPE
        # Select scope is terminated by every element EXCEPT optgroup and
        # option. The previous `==` / `&&` test could never be true, so the
        # search used to run through arbitrary elements.
        return -1 if tag != Tags::Optgroup && tag != Tags::Option
      else
        raise ParseError, 'unreachable parsing error, please report to github'
      end
    end

    # Default, list-item and button scopes share the per-namespace stop list.
    case scope
    when DEFAULT_SCOPE, LIST_ITEM_SCOPE, BUTTON_SCOPE
      return -1 if DEFAULT_SCOPE_STOP_TAGS[open_element.namespace].include?(tag)
    end
  end
  -1
end
249
+
250
# Inserts an element for the current token, remembers the current insertion
# mode as the original insertion mode, and switches to the Text mode.
# @!visibility private
def parse_generic_raw_text_element
  add_element
  @original_insertion_mode = insertion_mode
  self.insertion_mode = Text
end
256
+
257
# Adjusted current node, 12.2.4.2: the context element when parsing a
# fragment with exactly one open element, otherwise the last open element.
# @return [Gammo::Node, nil]
# @!visibility private
def adjusted_current_node
  if open_elements.length == 1 && fragment? && context
    context
  else
    open_elements.last
  end
end
263
+
264
# Tells whether an element with one of `match_tags` is in the given scope.
# @param [Integer] scope scope constant forwarded to `index_of_element_in_scope`
# @param [Array] match_tags tags to look for
# @return [TrueClass, FalseClass]
# @!visibility private
def element_in_scope?(scope, *match_tags)
  found_at = index_of_element_in_scope(scope, *match_tags)
  found_at != -1
end
268
+
269
# Pops open elements until the topmost one is a boundary element for the
# given table-related scope. The boundary element itself stays on the stack.
# @param [Integer] scope TABLE_SCOPE, TABLE_ROW_SCOPE or TABLE_BODY_SCOPE
# @raise [ParseError] if an element is inspected with any other scope
# @!visibility private
def clear_stack_to_context(scope)
  open_elements.reverse_each_with_index do |open_element, index|
    tag = open_element.tag
    boundaries =
      case scope
      when TABLE_SCOPE
        [Tags::Html, Tags::Table, Tags::Template]
      when TABLE_ROW_SCOPE
        [Tags::Html, Tags::Tr, Tags::Template]
      when TABLE_BODY_SCOPE
        [Tags::Html, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Template]
      else
        raise ParseError, 'unreachable parsing error, please report to github'
      end
    next unless boundaries.any? { |boundary| tag == boundary }

    # Keep everything up to and including the boundary element.
    @open_elements = open_elements.slice(0, index + 1)
    return
  end
end
294
+
295
# Pops elements whose end tag may be implied (dd, dt, optgroup, option, p,
# rb, rp, rt, rtc) off the top of the stack of open elements. Popping stops
# at the first node that is not a Node::Element, is not one of those tags,
# or whose name is listed in `exceptions`.
#
# The index of the last node to keep is tracked explicitly and defaults to
# -1, so the result no longer depends on the block's own (still unassigned)
# result variable or on the iterator's return value. A stack made up
# entirely of implied-end-tag elements is therefore emptied instead of
# raising.
#
# @param [Array<String>] exceptions element names (compared with `node.data`)
#   that must not be popped
# @!visibility private
def generate_implied_end_tags(*exceptions)
  last_kept = -1
  open_elements.reverse_each_with_index do |node, i|
    impliable =
      if node.instance_of?(Node::Element)
        case node.tag
        when Tags::Dd, Tags::Dt, Tags::Optgroup, Tags::Option, Tags::P, Tags::Rb, Tags::Rp, Tags::Rt, Tags::Rtc
          !exceptions.include?(node.data)
        else
          false
        end
      else
        false
      end
    next if impliable

    last_kept = i
    break
  end
  @open_elements = open_elements.slice(0, last_kept + 1)
end
308
+
309
# Attaches `node` to the tree, by foster parenting when that applies and
# otherwise as the last child of the current top node. Element nodes are
# also pushed onto the stack of open elements.
# @param [Gammo::Node] node
# @!visibility private
def add_child(node)
  if should_foster_parent?
    foster_parent(node)
  else
    top.append_child(node)
  end
  open_elements << node if node.instance_of?(Node::Element)
end
314
+
315
# Current insertion target: the last open element, or the document when
# the stack of open elements has no last element.
# @return [Gammo::Node]
# @!visibility private
def top
  current = open_elements.last
  return current if current

  document
end
319
+
320
# Creates an element from the current token (tag, data and attributes) and
# adds it to the tree through `add_child`.
# @!visibility private
def add_element
  element = Node::Element.new(
    tag: token.tag,
    data: token.data,
    attributes: token.attributes
  )
  add_child(element)
end
324
+
325
# Foster parenting applies only when the `foster_parenting` flag is set and
# the current top node is a table, tbody, tfoot, thead or tr.
# @return [TrueClass, FalseClass]
# @!visibility private
def should_foster_parent?
  return false unless foster_parenting

  case top.tag
  when Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr then true
  else false
  end
end
334
+
335
# Inserts `node` according to the foster parenting rule (12.2.6.1).
#
# The topmost table and the topmost template on the stack of open elements
# are located first:
# * if a template exists and sits above the table (or there is no table),
#   the node is appended to that template;
# * otherwise the node goes into the table's parent immediately before the
#   table. With no table, the first open element is the parent and the node
#   is added at the end. With a parentless table, the element below the
#   table on the stack is the parent.
# A text node is merged into a directly preceding text node instead of
# being inserted.
#
# `table` and `template` are assigned inside the search blocks, so they are
# reliably nil when nothing matches, whatever `reverse_each_with_index`
# returns on completion.
#
# @param [Gammo::Node] node node to insert
# @!visibility private
def foster_parent(node)
  table = nil
  table_index = 0
  open_elements.reverse_each_with_index do |open_element, index|
    next unless open_element.tag == Tags::Table

    table = open_element
    table_index = index
    break
  end

  template = nil
  template_index = 0
  open_elements.reverse_each_with_index do |open_element, index|
    next unless open_element.tag == Tags::Template

    template = open_element
    template_index = index
    break
  end

  return template.append_child(node) if template && (!table || template_index > table_index)

  parent = table ? table.parent : open_elements[0]
  # A table that is not attached to the tree: fall back to the element just
  # below it on the stack.
  parent ||= open_elements[table_index - 1]
  previous = table ? table.previous_sibling : parent.last_child
  if previous && previous.instance_of?(Node::Text) && node.instance_of?(Node::Text)
    previous.data += node.data
    return
  end
  parent.insert_before(node, table)
end
361
+
362
# Adds character data to the tree. Empty text is ignored; text is foster
# parented when that applies; otherwise it is merged into a trailing text
# child of the current top node or added as a new text node.
# @param [String] text
# @!visibility private
def add_text(text)
  return if text.empty?

  if should_foster_parent?
    foster_parent(Node::Text.new(data: text))
  elsif (tail = top.last_child) && tail.instance_of?(Node::Text)
    # Extend the existing text node rather than creating a sibling.
    tail.data += text
    nil
  else
    add_child(Node::Text.new(data: text))
  end
end
374
+
375
# Inserts an element for the current token and records it in the list of
# active formatting elements. Before recording, entries after the last
# scope marker are scanned from newest to oldest. Entries that are
# un-namespaced elements with the same tag and an identical attribute set
# are counted, and from the third such entry on they are removed from the
# list.
# @!visibility private
def add_formatting_element
  tag = token.tag
  attrs = token.attributes
  add_element

  identical = 0
  active_formatting_elements.reverse_each_with_index do |candidate, _index|
    break if candidate.instance_of?(Node::ScopeMarker)
    next unless candidate.instance_of?(Node::Element)
    next if candidate.namespace || candidate.tag != tag || candidate.attributes.length != attrs.length

    # Same attribute count was checked above, so it is enough that every
    # attribute of the candidate has an equal counterpart on the token.
    same_attributes = candidate.attributes.all? do |a|
      attrs.any? { |b| a.key == b.key && a.namespace == b.namespace && a.value == b.value }
    end
    next unless same_attributes

    identical += 1
    active_formatting_elements.delete(candidate) if identical >= 3
  end

  active_formatting_elements << open_elements.last
end
407
+
408
# Pops entries off the list of active formatting elements until a scope
# marker has been popped or the list is empty.
# @!visibility private
def clear_active_formatting_elements
  loop do
    popped = active_formatting_elements.pop
    break if active_formatting_elements.empty? || popped.instance_of?(Node::ScopeMarker)
  end
end
415
+
416
# Re-opens formatting elements that are in the list of active formatting
# elements but no longer on the stack of open elements. Nothing happens when
# the list is empty or its last entry is a scope marker or still open.
# @!visibility private
def reconstruct_active_formatting_elements
  entry = active_formatting_elements.last
  return unless entry
  return if entry.instance_of?(Node::ScopeMarker) || open_elements.index(entry)

  # Rewind to the nearest entry that is a scope marker or still open; if the
  # start of the list is reached first, position ends up at -1.
  position = active_formatting_elements.length - 1
  until entry.is_a?(Node::ScopeMarker) || open_elements.index(entry)
    if position.zero?
      position = -1
      break
    end
    position -= 1
    entry = active_formatting_elements[position]
  end

  # Clone every later entry, insert the clone into the tree and let it
  # replace the original entry in the list.
  (position + 1).upto(active_formatting_elements.length - 1) do |i|
    replacement = active_formatting_elements[i].clone
    add_child(replacement)
    active_formatting_elements[i] = replacement
  end
  nil
end
437
+
438
# Clears the self-closing flag of the token being processed.
# @!visibility private
def acknowledge_self_closing_tag
  self.has_self_closing_token = false
end
442
+
443
# Stores the current insertion mode as the original insertion mode.
# @raise [RuntimeError] if an original insertion mode is already stored
# @!visibility private
def set_original_insertion_mode
  if original_insertion_mode
    raise 'bad parser state: original im was set twice'
  end

  @original_insertion_mode = @insertion_mode
end
448
+
449
# Picks the insertion mode that matches the current stack of open elements.
# The stack is walked from the top; the first element whose tag maps to an
# insertion mode decides the result. For the bottom-most element the context
# element, when present, is inspected instead.
# @!visibility private
def reset_insertion_mode
  open_elements.reverse_each_with_index do |open_element, index|
    last = index.zero?
    node = last && context ? context : open_element

    case node.tag
    when Tags::Select
      unless last
        # A template ancestor keeps "in select"; a table ancestor switches to
        # "in select in table".
        ancestor = node
        first = open_elements[0]
        while ancestor != first
          ancestor = open_elements[open_elements.index(ancestor) - 1]
          case ancestor.tag
          when Tags::Template
            @insertion_mode = InSelect
            return
          when Tags::Table
            @insertion_mode = InSelectInTable
            return
          end
        end
      end
      @insertion_mode = InSelect
    when Tags::Td, Tags::Th
      # TODO: remove this divergence from the HTML5 spec.
      @insertion_mode = InCell
    when Tags::Tr then @insertion_mode = InRow
    when Tags::Tbody, Tags::Thead, Tags::Tfoot then @insertion_mode = InTableBody
    when Tags::Caption then @insertion_mode = InCaption
    when Tags::Colgroup then @insertion_mode = InColumnGroup
    when Tags::Table then @insertion_mode = InTable
    when Tags::Template
      # TODO: remove this divergence from the HTML5 spec.
      next if node.namespace

      @insertion_mode = template_stack.last
    when Tags::Head
      # TODO: remove this divergence from the HTML5 spec.
      @insertion_mode = InHead
    when Tags::Body then @insertion_mode = InBody
    when Tags::Frameset then @insertion_mode = InFrameset
    when Tags::Html
      @insertion_mode = @head ? AfterHead : BeforeHead
    else
      # Unmapped tag: keep walking unless this was the bottom of the stack.
      next unless last

      @insertion_mode = InBody
    end
    return
  end
end
509
+
510
# Runs the current token through tree construction. A self-closing tag
# token is first converted into a start tag token with the self-closing
# flag raised. The token is then offered to the foreign-content rules or to
# the current insertion mode, again and again until one of them reports it
# consumed. The self-closing flag is lowered afterwards.
# @!visibility private
def parse_current_token
  if token.instance_of?(Tokenizer::SelfClosingTagToken)
    self.has_self_closing_token = true
    self.token = Tokenizer::StartTagToken.new(token.data, tag: token.tag, attributes: token.attributes)
  end

  loop do
    consumed =
      if in_foreign_content?
        parse_foreign_content
      else
        insertion_mode.new(self).process
      end
    break if consumed
  end

  self.has_self_closing_token = false if has_self_closing_token
end
523
+ end
524
+ end